Compare commits


5 Commits

Author              SHA1        Message                                      Date
Ettore Di Giacinto  2a03905920  Merge branch 'master' into cleanup_deps      2024-08-21 13:10:46 +02:00
Ettore Di Giacinto  35297ebc14  Drop also ttf files                          2024-08-21 13:03:26 +02:00
                                Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto  b303805df9  fix marked                                   2024-08-21 13:02:19 +02:00
Ettore Di Giacinto  32d51797d9  fix alpine.js                                2024-08-21 13:02:19 +02:00
Ettore Di Giacinto  af09b019ed  fix(assets): generate assets on build time   2024-08-21 13:02:19 +02:00
                                Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
47 changed files with 82 additions and 1627 deletions

View File

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=fc54ef0d1c138133a01933296d50a36a1ab64735
+CPPLLAMA_VERSION?=2f3c1466ff46a2413b0e363a5005c46538186ee6
# go-rwkv version
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=9e3c5345cd46ea718209db53464e426c3fe7a25e
+WHISPER_CPP_VERSION?=d65786ea540a5aef21f67cacfa6f134097727780
# bert.cpp version
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -338,7 +338,7 @@ rebuild: ## Rebuilds the project
$(MAKE) -C sources/go-tiny-dream clean
$(MAKE) build
-prepare: prepare-sources $(OPTIONAL_TARGETS)
+prepare: prepare-sources gen-assets $(OPTIONAL_TARGETS)
clean: ## Remove build related file
$(GOCMD) clean -cache

View File

@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
-grpcio==1.66.0
+grpcio==1.65.4
protobuf
certifi
transformers

View File

@@ -1,4 +1,4 @@
bark==0.1.5
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi

View File

@@ -1,2 +1,2 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf

View File

@@ -1,4 +1,4 @@
TTS==0.22.0
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi

View File

@@ -1,5 +1,5 @@
setuptools
-grpcio==1.66.0
+grpcio==1.65.4
pillow
protobuf
certifi

View File

@@ -1,4 +1,4 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi
setuptools

View File

@@ -1,4 +1,4 @@
-grpcio==1.66.0
+grpcio==1.65.4
protobuf
certifi
wheel

View File

@@ -1,3 +1,3 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi

View File

@@ -2,7 +2,7 @@
intel-extension-for-pytorch
torch
optimum[openvino]
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
librosa==0.9.1
faster-whisper==1.0.3

View File

@@ -1,4 +1,4 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
librosa
faster-whisper

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.3.0+rocm6.0
-torchaudio==2.3.0+rocm6.0
+torch
+torchaudio
transformers
accelerate
accelerate

View File

@@ -1,4 +1,4 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi
llvmlite==0.43.0

View File

@@ -1,3 +1,3 @@
-grpcio==1.66.0
+grpcio==1.65.4
protobuf
certifi

View File

@@ -1,3 +1,3 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi

View File

@@ -1,4 +1,4 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
scipy==1.14.0
certifi

View File

@@ -1,4 +1,4 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,3 +1,3 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi

View File

@@ -1,4 +1,4 @@
-grpcio==1.66.0
+grpcio==1.65.5
protobuf
certifi
setuptools

View File

@@ -1,80 +0,0 @@
package cli_api
import (
"context"
"fmt"
"net"
"os"
"strings"
"github.com/mudler/LocalAI/core/p2p"
"github.com/mudler/edgevpn/pkg/node"
"github.com/rs/zerolog/log"
)
func StartP2PStack(ctx context.Context, address, token, networkID string, federated bool) error {
var n *node.Node
// Here we are avoiding creating multiple nodes:
// - if the federated mode is enabled, we create a federated node and expose a service
// - exposing a service creates a node with specific options, and we don't want to create another node
// If the federated mode is enabled, we expose a service to the local instance running
// at r.Address
if federated {
_, port, err := net.SplitHostPort(address)
if err != nil {
return err
}
// Here a new node is created and started
// and a service is exposed by the node
node, err := p2p.ExposeService(ctx, "localhost", port, token, p2p.NetworkID(networkID, p2p.FederatedID))
if err != nil {
return err
}
if err := p2p.ServiceDiscoverer(ctx, node, token, p2p.NetworkID(networkID, p2p.FederatedID), nil, false); err != nil {
return err
}
n = node
}
// If the p2p mode is enabled, we start the service discovery
if token != "" {
// If a node wasn't created previously, create it
if n == nil {
node, err := p2p.NewNode(token)
if err != nil {
return err
}
err = node.Start(ctx)
if err != nil {
return fmt.Errorf("starting new node: %w", err)
}
n = node
}
// Attach a ServiceDiscoverer to the p2p node
log.Info().Msg("Starting P2P server discovery...")
if err := p2p.ServiceDiscoverer(ctx, n, token, p2p.NetworkID(networkID, p2p.WorkerID), func(serviceID string, node p2p.NodeData) {
var tunnelAddresses []string
for _, v := range p2p.GetAvailableNodes(p2p.NetworkID(networkID, p2p.WorkerID)) {
if v.IsOnline() {
tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
} else {
log.Info().Msgf("Node %s is offline", v.ID)
}
}
tunnelEnvVar := strings.Join(tunnelAddresses, ",")
os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
}, true); err != nil {
return err
}
}
return nil
}
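StartP2PStack (shown as removed in this compare) bundles the federated-service exposure and the worker discovery behind a single entry point. A minimal usage sketch based only on the signature shown above; the address, token and network-ID values are placeholders:

```go
package main

import (
	"context"
	"log"

	cli_api "github.com/mudler/LocalAI/core/cli/api"
)

func main() {
	// Placeholder values: the LocalAI listen address, a p2p token generated
	// elsewhere (e.g. with p2p.GenerateToken), and an arbitrary network ID.
	ctx := context.Background()
	address := "127.0.0.1:8080"
	token := "<p2p-token>"
	networkID := "my-network"

	// federated=true also exposes the local instance as a federated service;
	// the helper reuses a single node for that and for worker discovery.
	if err := cli_api.StartP2PStack(ctx, address, token, networkID, true); err != nil {
		log.Fatal(err)
	}
}
```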

View File

@@ -3,10 +3,11 @@ package cli
import (
"context"
"fmt"
"net"
"os"
"strings"
"time"
cli_api "github.com/mudler/LocalAI/core/cli/api"
cliContext "github.com/mudler/LocalAI/core/cli/context"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http"
@@ -52,8 +53,6 @@ type RunCMD struct {
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
Peer2Peer bool `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
-Peer2PeerDHTInterval int `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
-Peer2PeerOTPInterval int `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
@@ -108,7 +107,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
// IF no token is provided, and p2p is enabled,
// we generate one and wait for the user to pick up the token (this is for interactive)
log.Info().Msg("No token provided, generating one")
-token = p2p.GenerateToken(r.Peer2PeerDHTInterval, r.Peer2PeerOTPInterval)
+token = p2p.GenerateToken()
log.Info().Msg("Generated Token:")
fmt.Println(token)
@@ -116,12 +115,52 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
fmt.Printf("export TOKEN=\"%s\"\nlocal-ai worker p2p-llama-cpp-rpc\n", token)
}
opts = append(opts, config.WithP2PToken(token))
node, err := p2p.NewNode(token)
if err != nil {
return err
}
nodeContext := context.Background()
err = node.Start(nodeContext)
if err != nil {
return fmt.Errorf("starting new node: %w", err)
}
log.Info().Msg("Starting P2P server discovery...")
if err := p2p.ServiceDiscoverer(nodeContext, node, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.WorkerID), func(serviceID string, node p2p.NodeData) {
var tunnelAddresses []string
for _, v := range p2p.GetAvailableNodes(p2p.NetworkID(r.Peer2PeerNetworkID, p2p.WorkerID)) {
if v.IsOnline() {
tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
} else {
log.Info().Msgf("Node %s is offline", v.ID)
}
}
tunnelEnvVar := strings.Join(tunnelAddresses, ",")
os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
}, true); err != nil {
return err
}
}
backgroundCtx := context.Background()
if r.Federated {
_, port, err := net.SplitHostPort(r.Address)
if err != nil {
return err
}
fedCtx := context.Background()
if err := cli_api.StartP2PStack(backgroundCtx, r.Address, token, r.Peer2PeerNetworkID, r.Federated); err != nil {
return err
node, err := p2p.ExposeService(fedCtx, "localhost", port, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.FederatedID))
if err != nil {
return err
}
if err := p2p.ServiceDiscoverer(fedCtx, node, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.FederatedID), nil, false); err != nil {
return err
}
}
idleWatchDog := r.EnableWatchdogIdle

View File

@@ -25,8 +25,9 @@ import (
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/chat/completions [post]
func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
-var id, textContentToReturn string
-var created int
+textContentToReturn := ""
+id := uuid.New().String()
+created := int(time.Now().Unix())
process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
initialMessage := schema.OpenAIResponse{
@@ -158,10 +159,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
}
return func(c *fiber.Ctx) error {
-textContentToReturn = ""
-id = uuid.New().String()
-created = int(time.Now().Unix())
modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,9 +0,0 @@
/*!
Theme: Default
Description: Original highlight.js style
Author: (c) Ivan Sagalaev <maniac@softwaremaniacs.org>
Maintainer: @highlightjs/core-team
Website: https://highlightjs.org/
License: see project LICENSE
Touched: 2021
*/pre code.hljs{display:block;overflow-x:auto;padding:1em}code.hljs{padding:3px 5px}.hljs{background:#f3f3f3;color:#444}.hljs-comment{color:#697070}.hljs-punctuation,.hljs-tag{color:#444a}.hljs-tag .hljs-attr,.hljs-tag .hljs-name{color:#444}.hljs-attribute,.hljs-doctag,.hljs-keyword,.hljs-meta .hljs-keyword,.hljs-name,.hljs-selector-tag{font-weight:700}.hljs-deletion,.hljs-number,.hljs-quote,.hljs-selector-class,.hljs-selector-id,.hljs-string,.hljs-template-tag,.hljs-type{color:#800}.hljs-section,.hljs-title{color:#800;font-weight:700}.hljs-link,.hljs-operator,.hljs-regexp,.hljs-selector-attr,.hljs-selector-pseudo,.hljs-symbol,.hljs-template-variable,.hljs-variable{color:#ab5656}.hljs-literal{color:#695}.hljs-addition,.hljs-built_in,.hljs-bullet,.hljs-code{color:#397300}.hljs-meta{color:#1f7199}.hljs-meta .hljs-string{color:#38a}.hljs-emphasis{font-style:italic}.hljs-strong{font-weight:700}

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

@@ -28,15 +28,9 @@ import (
"github.com/mudler/edgevpn/pkg/logger"
)
-func generateNewConnectionData(DHTInterval, OTPInterval int) *node.YAMLConnectionConfig {
+func generateNewConnectionData() *node.YAMLConnectionConfig {
maxMessSize := 20 << 20 // 20MB
keyLength := 43
-if DHTInterval == 0 {
-DHTInterval = 360
-}
-if OTPInterval == 0 {
-OTPInterval = 9000
-}
return &node.YAMLConnectionConfig{
MaxMessageSize: maxMessSize,
@@ -46,21 +40,21 @@ func generateNewConnectionData(DHTInterval, OTPInterval int) *node.YAMLConnectio
OTP: node.OTP{
DHT: node.OTPConfig{
Key: eutils.RandStringRunes(keyLength),
-Interval: DHTInterval,
+Interval: 120,
Length: keyLength,
},
Crypto: node.OTPConfig{
Key: eutils.RandStringRunes(keyLength),
-Interval: OTPInterval,
+Interval: 9000,
Length: keyLength,
},
},
}
}
-func GenerateToken(DHTInterval, OTPInterval int) string {
+func GenerateToken() string {
// Generates a new config and exit
-return generateNewConnectionData(DHTInterval, OTPInterval).Base64()
+return generateNewConnectionData().Base64()
}
func IsP2PEnabled() bool {
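With the parameterless variant shown above, the OTP settings baked into a generated token are the fixed values from the hunk (DHT interval 120, crypto interval 9000) rather than caller-supplied ones. A small sketch of generating and handing out a token, assuming a build where the real p2p implementation is compiled in (the stub in the next file just returns "not implemented"):

```go
package main

import (
	"fmt"

	"github.com/mudler/LocalAI/core/p2p"
)

func main() {
	// The token encodes the connection config (including the OTP settings
	// shown in the hunk above) as base64.
	token := p2p.GenerateToken()

	// A worker joins the same p2p network by exporting the token, e.g.
	// before running `local-ai worker p2p-llama-cpp-rpc`.
	fmt.Printf("export TOKEN=%q\n", token)
}
```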

View File

@@ -10,7 +10,7 @@ import (
"github.com/mudler/edgevpn/pkg/node"
)
-func GenerateToken(DHTInterval, OTPInterval int) string {
+func GenerateToken() string {
return "not implemented"
}

View File

@@ -5,11 +5,11 @@
url: "https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/highlight.min.js"
sha: "4499ff936d4fd562adca5a5cbe512dc19eb80942eee8618dafbcebc4f7974bdb"
- filename: "alpine.js"
-url: "https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
-sha: "fb9b146b7fbd1bbf251fb3ef464f2e7c5d33a4a83aeb0fcf21e92ca6a9558c4b"
+url: "https://cdn.jsdelivr.net/npm/alpinejs@3.14.1/dist/cdn.min.js"
+sha: "358d9afbb1ab5befa2f48061a30776e5bcd7707f410a606ba985f98bc3b1c034"
- filename: "marked.js"
-url: "https://cdn.jsdelivr.net/npm/marked/marked.min.js"
-sha: "15fabce5b65898b32b03f5ed25e9f891a729ad4c0d6d877110a7744aa847a894"
+url: "https://cdn.jsdelivr.net/npm/marked@14.0.0/lib/marked.umd.min.js"
+sha: "0996c58f732096b6aed537916589c0786dd3332bf4612cc9c206bc44a031b13d"
- filename: "purify.js"
url: "https://cdn.jsdelivr.net/npm/dompurify@3.0.6/dist/purify.min.js"
sha: "ea4b09082ca4ba0ae71be6431a097678751d0453b9c52a4d2c7c39a2166ed9fc"
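Each entry above pairs a CDN URL with the expected SHA-256, which is what makes fetching these assets at build time verifiable. A rough sketch of a downloader driven by such entries; the Asset struct and fetchAsset helper are illustrative, not the project's actual gen-assets implementation:

```go
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
)

// Asset mirrors one entry of the YAML list above: a target filename,
// a CDN URL and the expected SHA-256 of the downloaded content.
type Asset struct {
	Filename string
	URL      string
	SHA      string
}

func fetchAsset(a Asset, destDir string) error {
	resp, err := http.Get(a.URL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	// Refuse to write anything whose checksum does not match the manifest.
	sum := sha256.Sum256(data)
	if hex.EncodeToString(sum[:]) != a.SHA {
		return fmt.Errorf("checksum mismatch for %s", a.Filename)
	}
	return os.WriteFile(filepath.Join(destDir, a.Filename), data, 0o644)
}

func main() {
	a := Asset{
		Filename: "alpine.js",
		URL:      "https://cdn.jsdelivr.net/npm/alpinejs@3.14.1/dist/cdn.min.js",
		SHA:      "358d9afbb1ab5befa2f48061a30776e5bcd7707f410a606ba985f98bc3b1c034",
	}
	if err := fetchAsset(a, "."); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```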

View File

@@ -1,91 +0,0 @@
---
name: "hermes-vllm"
config_file: |
backend: vllm
context_size: 8192
stopwords:
- "<|im_end|>"
- "<dummy32000>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
disable_no_action: true
grammar:
# Uncomment the line below to enable grammar matching for JSON results if the model is breaking
# the output. This will make the model more accurate and won't break the JSON output.
# This however, will make parallel_calls not functional (it is a known bug)
# mixed_mode: true
disable: true
parallel_calls: true
expect_strings_after_json: true
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*)"
capture_llm_results:
- (?s)<scratchpad>(.*?)</scratchpad>
replace_llm_results:
- key: (?s)<scratchpad>(.*?)</scratchpad>
value: ""
template:
use_tokenizer_template: true
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: |
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
# Uncomment to specify a quantization method (optional)
# quantization: "awq"
# Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
# gpu_memory_utilization: 0.5
# Uncomment to trust remote code from huggingface
# trust_remote_code: true
# Uncomment to enable eager execution
# enforce_eager: true
# Uncomment to specify the size of the CPU swap space per GPU (in GiB)
# swap_space: 2
# Uncomment to specify the maximum length of a sequence (including prompt and output)
# max_model_len: 32768
# Uncomment and specify the number of Tensor divisions.
# Allows you to partition and run large models. Performance gains are limited.
# https://github.com/vllm-project/vllm/issues/1435
# tensor_parallel_size: 2

View File

@@ -4752,38 +4752,6 @@
- filename: Hermes-3-Llama-3.1-70B.Q4_K_M.gguf
sha256: 955c2f42caade4278f3c9dbffa32bb74572652b20e49e5340e782de3585bbe3f
uri: huggingface://NousResearch/Hermes-3-Llama-3.1-70B-GGUF/Hermes-3-Llama-3.1-70B.Q4_K_M.gguf
- &hermes-vllm
url: "github:mudler/LocalAI/gallery/hermes-vllm.yaml@master"
name: "hermes-3-llama-3.1-8b:vllm"
icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/vG6j5WxHX09yj32vgjJlI.jpeg
tags:
- llm
- vllm
- gpu
- function-calling
license: llama-3
urls:
- https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-8B
description: |
Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the board. It is designed to focus on aligning LLMs to the user, with powerful steering capabilities and control given to the end user. The model uses ChatML as the prompt format, opening up a much more structured system for engaging the LLM in multi-turn chat dialogue. It also supports function calling and structured output capabilities, generalist assistant capabilities, and improved code generation skills.
overrides:
parameters:
model: NousResearch/Hermes-3-Llama-3.1-8B
- !!merge <<: *hermes-vllm
name: "hermes-3-llama-3.1-70b:vllm"
urls:
- https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B
overrides:
parameters:
model: NousResearch/Hermes-3-Llama-3.1-70B
- !!merge <<: *hermes-vllm
name: "hermes-3-llama-3.1-405b:vllm"
icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/-kj_KflXsdpcZoTQsvx7W.jpeg
urls:
- https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-405B
overrides:
parameters:
model: NousResearch/Hermes-3-Llama-3.1-405B
- !!merge <<: *hermes-2-pro-mistral
name: "biomistral-7b"
description: |

View File

@@ -1,29 +0,0 @@
---
name: "vllm"
config_file: |
backend: vllm
function:
disable_no_action: true
grammar:
disable: true
parallel_calls: true
expect_strings_after_json: true
template:
use_tokenizer_template: true
# Uncomment to specify a quantization method (optional)
# quantization: "awq"
# Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
# gpu_memory_utilization: 0.5
# Uncomment to trust remote code from huggingface
# trust_remote_code: true
# Uncomment to enable eager execution
# enforce_eager: true
# Uncomment to specify the size of the CPU swap space per GPU (in GiB)
# swap_space: 2
# Uncomment to specify the maximum length of a sequence (including prompt and output)
# max_model_len: 32768
# Uncomment and specify the number of Tensor divisions.
# Allows you to partition and run large models. Performance gains are limited.
# https://github.com/vllm-project/vllm/issues/1435
# tensor_parallel_size: 2