mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-01 05:36:49 -04:00
* fix: Automatically disable mmap for Intel SYCL backends Fixes issue #9012, where Qwen3.5 models fail to load on Intel Arc GPUs with an RPC EOF error. The Intel SYCL backend has a known issue where enabling mmap causes the backend to hang. This change automatically disables mmap when an Intel or SYCL backend is detected. References: - https://github.com/mudler/LocalAI/issues/9012 - Documentation mentions: SYCL hangs when mmap: true is set * feat: Add logging for mmap auto-disable on Intel SYCL backends As requested in PR review, add an xlog.Info call to log when mmap is automatically disabled for Intel SYCL backends. This helps with debugging and confirms the auto-disable logic is working. --------- Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>
282 lines
7.6 KiB
Go
282 lines
7.6 KiB
Go
package backend
|
|
|
|
import (
|
|
"strings"
|
|
"math/rand"
|
|
"os"
|
|
"path/filepath"
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
|
|
name := c.Name
|
|
if name == "" {
|
|
name = c.Model
|
|
}
|
|
|
|
defOpts := []model.Option{
|
|
model.WithBackendString(c.Backend),
|
|
model.WithModel(c.Model),
|
|
model.WithContext(so.Context),
|
|
model.WithModelID(name),
|
|
}
|
|
|
|
threads := 1
|
|
|
|
if c.Threads != nil {
|
|
threads = *c.Threads
|
|
}
|
|
|
|
if so.Threads != 0 {
|
|
threads = so.Threads
|
|
}
|
|
|
|
c.Threads = &threads
|
|
|
|
grpcOpts := grpcModelOpts(c, so.SystemState.Model.ModelsPath)
|
|
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
|
|
|
|
if so.ParallelBackendRequests {
|
|
defOpts = append(defOpts, model.EnableParallelRequests)
|
|
}
|
|
|
|
if c.GRPC.Attempts != 0 {
|
|
defOpts = append(defOpts, model.WithGRPCAttempts(c.GRPC.Attempts))
|
|
}
|
|
|
|
if c.GRPC.AttemptsSleepTime != 0 {
|
|
defOpts = append(defOpts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
|
|
}
|
|
|
|
for k, v := range so.ExternalGRPCBackends {
|
|
defOpts = append(defOpts, model.WithExternalBackend(k, v))
|
|
}
|
|
|
|
return append(defOpts, opts...)
|
|
}
|
|
|
|
func getSeed(c config.ModelConfig) int32 {
|
|
var seed int32 = config.RAND_SEED
|
|
|
|
if c.Seed != nil {
|
|
seed = int32(*c.Seed)
|
|
}
|
|
|
|
if seed == config.RAND_SEED {
|
|
seed = rand.Int31()
|
|
}
|
|
|
|
return seed
|
|
}
|
|
|
|
func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
|
|
b := 512
|
|
if c.Batch != 0 {
|
|
b = c.Batch
|
|
}
|
|
|
|
flashAttention := "auto"
|
|
|
|
if c.FlashAttention != nil {
|
|
flashAttention = *c.FlashAttention
|
|
}
|
|
|
|
f16 := false
|
|
if c.F16 != nil {
|
|
f16 = *c.F16
|
|
}
|
|
|
|
embeddings := false
|
|
if c.Embeddings != nil {
|
|
embeddings = *c.Embeddings
|
|
}
|
|
|
|
lowVRAM := false
|
|
if c.LowVRAM != nil {
|
|
lowVRAM = *c.LowVRAM
|
|
}
|
|
|
|
reranking := false
|
|
if c.Reranking != nil {
|
|
reranking = *c.Reranking
|
|
}
|
|
|
|
mmap := false
|
|
if c.MMap != nil {
|
|
mmap = *c.MMap
|
|
}
|
|
|
|
// Intel SYCL backend has issues with mmap enabled
|
|
// See: https://github.com/mudler/LocalAI/issues/9012
|
|
// Automatically disable mmap for Intel SYCL backends
|
|
if c.Backend != "" {
|
|
if strings.Contains(strings.ToLower(c.Backend), "intel") || strings.Contains(strings.ToLower(c.Backend), "sycl") {
|
|
mmap = false
|
|
xlog.Info("Auto-disabling mmap for Intel SYCL backend", "backend", c.Backend)
|
|
}
|
|
}
|
|
|
|
ctxSize := 4096
|
|
if c.ContextSize != nil {
|
|
ctxSize = *c.ContextSize
|
|
}
|
|
|
|
mmlock := false
|
|
if c.MMlock != nil {
|
|
mmlock = *c.MMlock
|
|
}
|
|
|
|
nGPULayers := 9999999
|
|
if c.NGPULayers != nil {
|
|
nGPULayers = *c.NGPULayers
|
|
}
|
|
|
|
triggers := make([]*pb.GrammarTrigger, 0)
|
|
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
|
|
triggers = append(triggers, &pb.GrammarTrigger{
|
|
Word: t.Word,
|
|
})
|
|
}
|
|
|
|
opts := &pb.ModelOptions{
|
|
CUDA: c.CUDA || c.Diffusers.CUDA,
|
|
SchedulerType: c.Diffusers.SchedulerType,
|
|
GrammarTriggers: triggers,
|
|
PipelineType: c.Diffusers.PipelineType,
|
|
CFGScale: c.CFGScale,
|
|
LoraAdapter: c.LoraAdapter,
|
|
LoraScale: c.LoraScale,
|
|
LoraAdapters: c.LoraAdapters,
|
|
LoraScales: c.LoraScales,
|
|
F16Memory: f16,
|
|
LoraBase: c.LoraBase,
|
|
IMG2IMG: c.Diffusers.IMG2IMG,
|
|
CLIPModel: c.Diffusers.ClipModel,
|
|
CLIPSubfolder: c.Diffusers.ClipSubFolder,
|
|
Options: c.Options,
|
|
Overrides: c.Overrides,
|
|
CLIPSkip: int32(c.Diffusers.ClipSkip),
|
|
ControlNet: c.Diffusers.ControlNet,
|
|
ContextSize: int32(ctxSize),
|
|
Seed: getSeed(c),
|
|
NBatch: int32(b),
|
|
NoMulMatQ: c.NoMulMatQ,
|
|
DraftModel: c.DraftModel,
|
|
AudioPath: c.AudioPath,
|
|
Quantization: c.Quantization,
|
|
LoadFormat: c.LoadFormat,
|
|
GPUMemoryUtilization: c.GPUMemoryUtilization,
|
|
TrustRemoteCode: c.TrustRemoteCode,
|
|
EnforceEager: c.EnforceEager,
|
|
SwapSpace: int32(c.SwapSpace),
|
|
MaxModelLen: int32(c.MaxModelLen),
|
|
TensorParallelSize: int32(c.TensorParallelSize),
|
|
DisableLogStatus: c.DisableLogStatus,
|
|
DType: c.DType,
|
|
// LimitMMPerPrompt vLLM
|
|
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
|
|
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
|
|
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
|
|
FlashAttention: flashAttention,
|
|
CacheTypeKey: c.CacheTypeK,
|
|
CacheTypeValue: c.CacheTypeV,
|
|
NoKVOffload: c.NoKVOffloading,
|
|
YarnExtFactor: c.YarnExtFactor,
|
|
YarnAttnFactor: c.YarnAttnFactor,
|
|
YarnBetaFast: c.YarnBetaFast,
|
|
YarnBetaSlow: c.YarnBetaSlow,
|
|
NGQA: c.NGQA,
|
|
RMSNormEps: c.RMSNormEps,
|
|
MLock: mmlock,
|
|
RopeFreqBase: c.RopeFreqBase,
|
|
RopeScaling: c.RopeScaling,
|
|
Type: c.ModelType,
|
|
RopeFreqScale: c.RopeFreqScale,
|
|
NUMA: c.NUMA,
|
|
Embeddings: embeddings,
|
|
Reranking: reranking,
|
|
LowVRAM: lowVRAM,
|
|
NGPULayers: int32(nGPULayers),
|
|
MMap: mmap,
|
|
MainGPU: c.MainGPU,
|
|
Threads: int32(*c.Threads),
|
|
TensorSplit: c.TensorSplit,
|
|
// RWKV
|
|
Tokenizer: c.Tokenizer,
|
|
}
|
|
|
|
if c.MMProj != "" {
|
|
opts.MMProj = filepath.Join(modelPath, c.MMProj)
|
|
}
|
|
|
|
return opts
|
|
}
|
|
|
|
func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions {
|
|
promptCachePath := ""
|
|
if c.PromptCachePath != "" {
|
|
p := filepath.Join(modelPath, c.PromptCachePath)
|
|
err := os.MkdirAll(filepath.Dir(p), 0750)
|
|
if err == nil {
|
|
promptCachePath = p
|
|
} else {
|
|
xlog.Error("error creating prompt cache folder", "error", err, "promptCachePath", promptCachePath)
|
|
}
|
|
}
|
|
|
|
pbOpts := &pb.PredictOptions{
|
|
Temperature: float32(*c.Temperature),
|
|
TopP: float32(*c.TopP),
|
|
NDraft: c.NDraft,
|
|
TopK: int32(*c.TopK),
|
|
Tokens: int32(*c.Maxtokens),
|
|
Threads: int32(*c.Threads),
|
|
PromptCacheAll: c.PromptCacheAll,
|
|
PromptCacheRO: c.PromptCacheRO,
|
|
PromptCachePath: promptCachePath,
|
|
F16KV: *c.F16,
|
|
DebugMode: *c.Debug,
|
|
Grammar: c.Grammar,
|
|
NegativePromptScale: c.NegativePromptScale,
|
|
RopeFreqBase: c.RopeFreqBase,
|
|
RopeFreqScale: c.RopeFreqScale,
|
|
NegativePrompt: c.NegativePrompt,
|
|
Mirostat: int32(*c.LLMConfig.Mirostat),
|
|
MirostatETA: float32(*c.LLMConfig.MirostatETA),
|
|
MirostatTAU: float32(*c.LLMConfig.MirostatTAU),
|
|
Debug: *c.Debug,
|
|
StopPrompts: c.StopWords,
|
|
Repeat: int32(c.RepeatLastN),
|
|
FrequencyPenalty: float32(c.FrequencyPenalty),
|
|
PresencePenalty: float32(c.PresencePenalty),
|
|
Penalty: float32(c.RepeatPenalty),
|
|
NKeep: int32(c.Keep),
|
|
Batch: int32(c.Batch),
|
|
IgnoreEOS: c.IgnoreEOS,
|
|
Seed: getSeed(c),
|
|
MLock: *c.MMlock,
|
|
MMap: *c.MMap,
|
|
MainGPU: c.MainGPU,
|
|
TensorSplit: c.TensorSplit,
|
|
TailFreeSamplingZ: float32(*c.TFZ),
|
|
TypicalP: float32(*c.TypicalP),
|
|
}
|
|
|
|
metadata := map[string]string{}
|
|
if c.ReasoningConfig.DisableReasoning != nil {
|
|
if *c.ReasoningConfig.DisableReasoning {
|
|
metadata["enable_thinking"] = "false"
|
|
} else {
|
|
metadata["enable_thinking"] = "true"
|
|
}
|
|
}
|
|
pbOpts.Metadata = metadata
|
|
|
|
// Logprobs and TopLogprobs are set by the caller if provided
|
|
return pbOpts
|
|
}
|