Compare commits


11 Commits

Author SHA1 Message Date
Ettore Di Giacinto
5e155fb081 fix(python): pin exllama2 (#1711)
fix(python): pin python deps

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-02-14 21:44:12 +01:00
Ettore Di Giacinto
39a6b562cf fix(llama.cpp): downgrade to a known working version (#1706)
sycl support is broken otherwise.

See upstream issue: https://github.com/ggerganov/llama.cpp/issues/5469

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-14 10:28:06 +01:00
Ettore Di Giacinto
c56b6ddb1c fix(llama.cpp): disable infinite context shifting (#1704)
Infinite context can trigger an endless loop of context shifting if the
model hallucinates and does not stop answering. This has the unpleasant
effect that the prediction never terminates, which happens especially
with small models, which tend to hallucinate.

Works around https://github.com/mudler/LocalAI/issues/1333 by removing
context shifting.

See also upstream issue: https://github.com/ggerganov/llama.cpp/issues/3969
2024-02-13 21:17:21 +01:00
Sertaç Özercan
2e61ff32ad ci: add cuda builds to release (#1702)
Signed-off-by: Sertac Ozercan <sozercan@gmail.com>
2024-02-13 08:35:39 +00:00
LocalAI [bot]
02f6e18adc ⬆️ Update ggerganov/llama.cpp (#1700)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-12 21:43:33 +00:00
LocalAI [bot]
4436e62cf1 ⬆️ Update ggerganov/llama.cpp (#1698)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-12 09:56:04 +01:00
Ettore Di Giacinto
6e0eb96c61 fix: drop unused code (#1697)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-02-11 11:28:59 +01:00
Ettore Di Giacinto
fd68bf7084 fix(vall-e-x): Fix voice cloning (#1696)
2024-02-11 11:20:00 +01:00
LocalAI [bot]
58cdf97361 ⬆️ Update ggerganov/llama.cpp (#1694)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-11 10:01:11 +01:00
Ettore Di Giacinto
53dbe36f32 feat(tts): respect YAMLs config file, add sycl docs/examples (#1692)
* feat(refactor): refactor config and input reading

* feat(tts): read config file for TTS

* examples(kubernetes): Add simple deployment example

* examples(kubernetes): Add simple deployment for intel arc

* docs(sycl): add sycl example

* feat(tts): do not always pick a first model

* fixups to run vall-e-x on container

* Correctly resolve backend
2024-02-10 21:37:03 +01:00
LocalAI [bot]
081bd07fd1 ⬆️ Update docs version mudler/LocalAI (#1693)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-10 21:33:14 +01:00
32 changed files with 383 additions and 253 deletions


@@ -20,6 +20,10 @@ jobs:
defines: '-DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_AVX512=ON'
- build: 'cuda12'
defines: ''
- build: 'cuda11'
defines: ''
runs-on: ubuntu-latest
steps:
- name: Clone
@@ -33,7 +37,18 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
- name: Install CUDA Dependencies
if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
run: |
if [ "${{ matrix.build }}" == "cuda12" ]; then
export CUDA_VERSION=12-3
else
export CUDA_VERSION=11-7
fi
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v3
@@ -50,14 +65,19 @@ jobs:
- name: Install gRPC
run: |
cd grpc && cd cmake/build && sudo make -j12 install
- name: Build
id: build
env:
CMAKE_ARGS: "${{ matrix.defines }}"
BUILD_ID: "${{ matrix.build }}"
run: |
STATIC=true make dist
if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
export BUILD_TYPE=cublas
export PATH=/usr/local/cuda/bin:$PATH
make dist
else
STATIC=true make dist
fi
- uses: actions/upload-artifact@v3
with:
name: ${{ matrix.build }}
@@ -109,4 +129,4 @@ jobs:
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
release/*


@@ -39,7 +39,7 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && apt-get clean \
apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && apt-get clean \
; fi
ENV PATH /usr/local/cuda/bin:${PATH}


@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=4b7b38bef5addbd31f453871d79647fbae6bec8a
CPPLLAMA_VERSION?=f026f8120f97090d34a52b3dc023c82e0ede3f7d
# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -550,4 +550,4 @@ docker-image-intel:
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg BUILD_TYPE=sycl_f16 -t $(DOCKER_IMAGE) .
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .


@@ -7,6 +7,7 @@ import (
"path/filepath"
api_config "github.com/go-skynet/LocalAI/api/config"
config "github.com/go-skynet/LocalAI/api/config"
"github.com/go-skynet/LocalAI/api/options"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
model "github.com/go-skynet/LocalAI/pkg/model"
@@ -29,16 +30,20 @@ func generateUniqueFileName(dir, baseName, ext string) string {
}
}
func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option) (string, *proto.Result, error) {
func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option, c config.Config) (string, *proto.Result, error) {
bb := backend
if bb == "" {
bb = model.PiperBackend
}
grpcOpts := gRPCModelOpts(c)
opts := modelOpts(api_config.Config{}, o, []model.Option{
model.WithBackendString(bb),
model.WithModel(modelFile),
model.WithContext(o.Context),
model.WithAssetDir(o.AssetsDestination),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
})
piperModel, err := o.Loader.BackendLoader(opts...)
if err != nil {


@@ -183,6 +183,60 @@ func (c *Config) FunctionToCall() string {
return c.functionCallNameString
}
// Load a config file for a model
func Load(modelName, modelPath string, cm *ConfigLoader, debug bool, threads, ctx int, f16 bool) (*Config, error) {
// Load a config file if present after the model name
modelConfig := filepath.Join(modelPath, modelName+".yaml")
var cfg *Config
defaults := func() {
cfg = DefaultConfig(modelName)
cfg.ContextSize = ctx
cfg.Threads = threads
cfg.F16 = f16
cfg.Debug = debug
}
cfgExisting, exists := cm.GetConfig(modelName)
if !exists {
if _, err := os.Stat(modelConfig); err == nil {
if err := cm.LoadConfig(modelConfig); err != nil {
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
}
cfgExisting, exists = cm.GetConfig(modelName)
if exists {
cfg = &cfgExisting
} else {
defaults()
}
} else {
defaults()
}
} else {
cfg = &cfgExisting
}
// Set the parameters for the language model prediction
//updateConfig(cfg, input)
// Don't allow 0 as setting
if cfg.Threads == 0 {
if threads != 0 {
cfg.Threads = threads
} else {
cfg.Threads = 4
}
}
// Enforce debug flag if passed from CLI
if debug {
cfg.Debug = true
}
return cfg, nil
}
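
As a hedged illustration (not part of this diff), here is a minimal sketch of how the new `Load` helper above might be called from other Go code; the model name, thread count, and context size below are assumptions made purely for the example:

```go
// Hypothetical caller of the new config.Load helper shown above.
// The model name ("phi-2") and the numeric defaults are illustrative
// assumptions, not values taken from this changeset.
package example

import (
	"fmt"

	config "github.com/go-skynet/LocalAI/api/config"
)

func loadModelConfig(cm *config.ConfigLoader, modelPath string) (*config.Config, error) {
	// Load looks for <modelPath>/phi-2.yaml; when no YAML file exists it
	// falls back to DefaultConfig with the supplied debug/threads/ctx/f16.
	cfg, err := config.Load("phi-2", modelPath, cm, false, 4, 512, false)
	if err != nil {
		return nil, fmt.Errorf("loading model config: %w", err)
	}
	return cfg, nil
}
```

This is essentially what the refactored `mergeRequestWithConfig` in the OpenAI endpoints (further down in this changeset) now does before merging the request parameters.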
func defaultPredictOptions(modelFile string) PredictionOptions {
return PredictionOptions{
TopP: 0.7,

api/ctx/fiber.go (new file, +43 lines)

@@ -0,0 +1,43 @@
package fiberContext
import (
"fmt"
"strings"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
)
// ModelFromContext returns the model from the context
// If no model is specified, it will take the first available
// Takes a model string as input which should be the one received from the user request.
// It returns the model name resolved from the context and an error if any.
func ModelFromContext(ctx *fiber.Ctx, loader *model.ModelLoader, modelInput string, firstModel bool) (string, error) {
if ctx.Params("model") != "" {
modelInput = ctx.Params("model")
}
// Set model from bearer token, if available
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
// If no model was specified, take the first available
if modelInput == "" && !bearerExists && firstModel {
models, _ := loader.ListModels()
if len(models) > 0 {
modelInput = models[0]
log.Debug().Msgf("No model specified, using: %s", modelInput)
} else {
log.Debug().Msgf("No model specified, returning error")
return "", fmt.Errorf("no model specified")
}
}
// If a model is found in bearer token takes precedence
if bearerExists {
log.Debug().Msgf("Using model from bearer token: %s", bearer)
modelInput = bearer
}
return modelInput, nil
}
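
For illustration only, a minimal sketch of how the new `ModelFromContext` helper might be wired into a Fiber handler; the endpoint name, request struct, and option plumbing are assumptions for the example, not code from this changeset:

```go
// Hypothetical endpoint using fiberContext.ModelFromContext; everything
// except the ModelFromContext signature is an assumption for this sketch.
package example

import (
	fiberContext "github.com/go-skynet/LocalAI/api/ctx"
	"github.com/go-skynet/LocalAI/api/options"
	"github.com/gofiber/fiber/v2"
)

type modelRequest struct {
	Model string `json:"model"` // model name sent by the client; may be empty
}

func ResolveModelEndpoint(o *options.Option) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {
		input := new(modelRequest)
		if err := c.BodyParser(input); err != nil {
			return err
		}
		// Resolve the model from the :model path parameter, the bearer
		// token, or (firstModel=true) the first model found on disk.
		modelFile, err := fiberContext.ModelFromContext(c, o.Loader, input.Model, true)
		if err != nil {
			return err
		}
		return c.JSON(fiber.Map{"model": modelFile})
	}
}
```

The refactored OpenAI `readRequest` helper later in this changeset follows the same pattern.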


@@ -3,6 +3,8 @@ package localai
import (
"github.com/go-skynet/LocalAI/api/backend"
config "github.com/go-skynet/LocalAI/api/config"
fiberContext "github.com/go-skynet/LocalAI/api/ctx"
"github.com/rs/zerolog/log"
"github.com/go-skynet/LocalAI/api/options"
"github.com/gofiber/fiber/v2"
@@ -18,12 +20,31 @@ func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
return func(c *fiber.Ctx) error {
input := new(TTSRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
filePath, _, err := backend.ModelTTS(input.Backend, input.Input, input.Model, o.Loader, o)
modelFile, err := fiberContext.ModelFromContext(c, o.Loader, input.Model, false)
if err != nil {
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
}
cfg, err := config.Load(modelFile, o.Loader.ModelPath, cm, false, 0, 0, false)
if err != nil {
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
} else {
modelFile = cfg.Model
}
log.Debug().Msgf("Request for model: %s", modelFile)
if input.Backend != "" {
cfg.Backend = input.Input
}
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, o.Loader, o, *cfg)
if err != nil {
return err
}


@@ -58,12 +58,12 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
return func(c *fiber.Ctx) error {
processFunctions := false
funcs := grammar.Functions{}
modelFile, input, err := readInput(c, o, true)
modelFile, input, err := readRequest(c, o, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
config, input, err := mergeRequestWithConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}


@@ -53,14 +53,14 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
}
return func(c *fiber.Ctx) error {
modelFile, input, err := readInput(c, o, true)
modelFile, input, err := readRequest(c, o, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
log.Debug().Msgf("`input`: %+v", input)
config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
config, input, err := mergeRequestWithConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}


@@ -18,12 +18,12 @@ import (
func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
modelFile, input, err := readInput(c, o, true)
modelFile, input, err := readRequest(c, o, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
config, input, err := mergeRequestWithConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}


@@ -18,12 +18,12 @@ import (
// https://platform.openai.com/docs/api-reference/embeddings
func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
model, input, err := readInput(c, o, true)
model, input, err := readRequest(c, o, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
config, input, err := readConfig(model, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
config, input, err := mergeRequestWithConfig(model, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}


@@ -61,7 +61,7 @@ func downloadFile(url string) (string, error) {
*/
func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
m, input, err := readInput(c, o, false)
m, input, err := readRequest(c, o, false)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
@@ -71,7 +71,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
}
log.Debug().Msgf("Loading model: %+v", m)
config, input, err := readConfig(m, input, cm, o.Loader, o.Debug, 0, 0, false)
config, input, err := mergeRequestWithConfig(m, input, cm, o.Loader, o.Debug, 0, 0, false)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}


@@ -7,11 +7,10 @@ import (
"fmt"
"io/ioutil"
"net/http"
"os"
"path/filepath"
"strings"
config "github.com/go-skynet/LocalAI/api/config"
fiberContext "github.com/go-skynet/LocalAI/api/ctx"
options "github.com/go-skynet/LocalAI/api/options"
"github.com/go-skynet/LocalAI/api/schema"
model "github.com/go-skynet/LocalAI/pkg/model"
@@ -19,8 +18,7 @@ import (
"github.com/rs/zerolog/log"
)
func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *schema.OpenAIRequest, error) {
loader := o.Loader
func readRequest(c *fiber.Ctx, o *options.Option, firstModel bool) (string, *schema.OpenAIRequest, error) {
input := new(schema.OpenAIRequest)
ctx, cancel := context.WithCancel(o.Context)
input.Context = ctx
@@ -30,38 +28,13 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *sche
return "", nil, fmt.Errorf("failed parsing request body: %w", err)
}
modelFile := input.Model
if c.Params("model") != "" {
modelFile = c.Params("model")
}
received, _ := json.Marshal(input)
log.Debug().Msgf("Request received: %s", string(received))
// Set model from bearer token, if available
bearer := strings.TrimLeft(c.Get("authorization"), "Bearer ")
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
modelFile, err := fiberContext.ModelFromContext(c, o.Loader, input.Model, firstModel)
// If no model was specified, take the first available
if modelFile == "" && !bearerExists && randomModel {
models, _ := loader.ListModels()
if len(models) > 0 {
modelFile = models[0]
log.Debug().Msgf("No model specified, using: %s", modelFile)
} else {
log.Debug().Msgf("No model specified, returning error")
return "", nil, fmt.Errorf("no model specified")
}
}
// If a model is found in bearer token takes precedence
if bearerExists {
log.Debug().Msgf("Using model from bearer token: %s", bearer)
modelFile = bearer
}
return modelFile, input, nil
return modelFile, input, err
}
// this function check if the string is an URL, if it's an URL downloads the image in memory
@@ -95,7 +68,7 @@ func getBase64Image(s string) (string, error) {
return "", fmt.Errorf("not valid string")
}
func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
func updateRequestConfig(config *config.Config, input *schema.OpenAIRequest) {
if input.Echo {
config.Echo = input.Echo
}
@@ -282,55 +255,11 @@ func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
}
}
func readConfig(modelFile string, input *schema.OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *schema.OpenAIRequest, error) {
// Load a config file if present after the model name
modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
var cfg *config.Config
defaults := func() {
cfg = config.DefaultConfig(modelFile)
cfg.ContextSize = ctx
cfg.Threads = threads
cfg.F16 = f16
cfg.Debug = debug
}
cfgExisting, exists := cm.GetConfig(modelFile)
if !exists {
if _, err := os.Stat(modelConfig); err == nil {
if err := cm.LoadConfig(modelConfig); err != nil {
return nil, nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
}
cfgExisting, exists = cm.GetConfig(modelFile)
if exists {
cfg = &cfgExisting
} else {
defaults()
}
} else {
defaults()
}
} else {
cfg = &cfgExisting
}
func mergeRequestWithConfig(modelFile string, input *schema.OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *schema.OpenAIRequest, error) {
cfg, err := config.Load(modelFile, loader.ModelPath, cm, debug, threads, ctx, f16)
// Set the parameters for the language model prediction
updateConfig(cfg, input)
updateRequestConfig(cfg, input)
// Don't allow 0 as setting
if cfg.Threads == 0 {
if threads != 0 {
cfg.Threads = threads
} else {
cfg.Threads = 4
}
}
// Enforce debug flag if passed from CLI
if debug {
cfg.Debug = true
}
return cfg, input, nil
return cfg, input, err
}


@@ -19,12 +19,12 @@ import (
// https://platform.openai.com/docs/api-reference/audio/create
func TranscriptEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
m, input, err := readInput(c, o, false)
m, input, err := readRequest(c, o, false)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
config, input, err := readConfig(m, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
config, input, err := mergeRequestWithConfig(m, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}


@@ -1387,30 +1387,20 @@ struct llama_server_context
{
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
{
// Shift context
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
const int n_discard = n_left / 2;
// START LOCALAI changes
// Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
// See: https://github.com/mudler/LocalAI/issues/1333
// Context is exhausted, release the slot
slot.release();
send_final_response(slot);
slot.cache_tokens.clear();
slot.n_past = 0;
slot.truncated = false;
slot.has_next_token = true;
LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
{
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
}
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
slot.n_past -= n_discard;
slot.truncated = true;
LOG_VERBOSE("context shift", {
{ "n_ctx", n_ctx },
{ "n_keep", params.n_keep },
{ "n_left", n_left },
});
continue;
// END LOCALAI changes
}
}
}


@@ -1,23 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &transformers.Dolly{}); err != nil {
panic(err)
}
}


@@ -1,23 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &transformers.GPTJ{}); err != nil {
panic(err)
}
}


@@ -1,23 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &transformers.GPTNeoX{}); err != nil {
panic(err)
}
}


@@ -1,23 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &transformers.MPT{}); err != nil {
panic(err)
}
}


@@ -1,23 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &transformers.Replit{}); err != nil {
panic(err)
}
}


@@ -1,8 +1,9 @@
CONDA_ENV_PATH = "diffusers.yml"
.PHONY: diffusers
diffusers:
@echo "Creating virtual environment..."
@conda env create --name diffusers --file diffusers.yml
@echo "Virtual environment created."
@echo "Installing $(CONDA_ENV_PATH)..."
bash install.sh $(CONDA_ENV_PATH)
.PHONY: run
run:


@@ -0,0 +1,24 @@
#!/bin/bash
set -ex
# Check if environment exist
conda_env_exists(){
! conda list --name "${@}" >/dev/null 2>/dev/null
}
if conda_env_exists "diffusers" ; then
echo "Creating virtual environment..."
conda env create --name diffusers --file $1
echo "Virtual environment created."
else
echo "Virtual environment already exists."
fi
if [ "$PIP_CACHE_PURGE" = true ] ; then
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate diffusers
pip cache purge
fi


@@ -1,15 +1,25 @@
#!/bin/bash
set -e
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin
export SHA=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f
# Activate conda environment
source activate transformers
echo $CONDA_PREFIX
git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2 && pushd $CONDA_PREFIX/exllamav2 && pip install -r requirements.txt && popd
git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2
pushd $CONDA_PREFIX/exllamav2
git checkout -b build $SHA
# TODO: this needs to be pinned within the conda environments
pip install -r requirements.txt
popd
cp -rfv $CONDA_PREFIX/exllamav2/* ./

backend/python/mamba/install.sh (Normal file → Executable file, 2 lines changed)

@@ -1,5 +1,5 @@
#!/bin/bash
set -e
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin


@@ -10,7 +10,7 @@ source activate transformers
echo $CONDA_PREFIX
git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && pip install -r requirements.txt && popd
git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && popd
cp -rfv $CONDA_PREFIX/vall-e-x/* ./


@@ -55,6 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
print("Preparing models, please wait", file=sys.stderr)
# download and load all models
preload_models()
self.clonedVoice = False
# Assume directory from request.ModelFile.
# Only if request.LoraAdapter it's not an absolute path
if request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
@@ -65,6 +66,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.AudioPath != "":
print("Generating model", file=sys.stderr)
make_prompt(name=model_name, audio_prompt_path=request.AudioPath)
self.clonedVoice = True
### Use given transcript
##make_prompt(name=model_name, audio_prompt_path="paimon_prompt.wav",
## transcript="Just, what was that? Paimon thought we were gonna get eaten.")
@@ -91,6 +93,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
try:
audio_array = None
if model != "":
if self.clonedVoice:
model = os.path.basename(request.model)
audio_array = generate_audio(request.text, prompt=model)
else:
audio_array = generate_audio(request.text)


@@ -112,14 +112,24 @@ llama_init_from_file: kv self size = 512.00 MB
## Intel acceleration (sycl)
#### Requirements
### Requirements
Requirement: [Intel oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit/download.html)
If building from source, you need to install [Intel oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit/download.html) and have the Intel drivers available in the system.
### Container images
To use SYCL, use the images with the `sycl-f16` or `sycl-f32` tag, for example `{{< version >}}-sycl-f32-core`, `{{< version >}}-sycl-f16-ffmpeg-core`, ...
The image list is on [quay](https://quay.io/repository/go-skynet/local-ai?tab=tags).
#### Example
To run LocalAI with Docker and sycl starting `phi-2`, you can use the following command as an example:
```bash
docker run -e DEBUG=true --privileged -ti -v $PWD/models:/build/models -p 8080:8080 -v /dev/dri:/dev/dri --rm quay.io/go-skynet/local-ai:master-sycl-f32-ffmpeg-core phi-2
```
### Notes
In addition to the commands to run LocalAI normally, you need to specify `--device /dev/dri` to docker, for example:
@@ -128,3 +138,4 @@ In addition to the commands to run LocalAI normally, you need to specify `--devi
docker run --rm -ti --device /dev/dri -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -v $PWD/models:/models quay.io/go-skynet/local-ai:{{< version >}}-sycl-f16-ffmpeg-core
```
Note also that sycl does have a known issue to hang with `mmap: true`. You have to disable it in the model configuration if explicitly enabled.


@@ -144,15 +144,15 @@ parameters:
model: "cloned-voice"
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
audio_path: "path-to-wav-source.wav"
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
```
Then you can specify the model name in the requests:
```
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "vall-e-x",
"model": "cloned-voice",
"input":"Hello!"
}' | aplay


@@ -1,3 +1,3 @@
{
"version": "v2.7.0"
"version": "v2.8.0"
}


@@ -0,0 +1,68 @@
apiVersion: v1
kind: Namespace
metadata:
  name: local-ai
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: models-pvc
  namespace: local-ai
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 20Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: local-ai
  namespace: local-ai
  labels:
    app: local-ai
spec:
  selector:
    matchLabels:
      app: local-ai
  replicas: 1
  template:
    metadata:
      labels:
        app: local-ai
      name: local-ai
    spec:
      containers:
        - args:
            - phi-2
          env:
            - name: DEBUG
              value: "true"
          name: local-ai
          image: quay.io/go-skynet/local-ai:master-sycl-f32-ffmpeg-core
          imagePullPolicy: Always
          resources:
            limits:
              gpu.intel.com/i915: 1
          volumeMounts:
            - name: models-volume
              mountPath: /build/models
      volumes:
        - name: models-volume
          persistentVolumeClaim:
            claimName: models-pvc
---
apiVersion: v1
kind: Service
metadata:
  name: local-ai
  namespace: local-ai
spec:
  selector:
    app: local-ai
  type: LoadBalancer
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080


@@ -0,0 +1,65 @@
apiVersion: v1
kind: Namespace
metadata:
  name: local-ai
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: models-pvc
  namespace: local-ai
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: local-ai
  namespace: local-ai
  labels:
    app: local-ai
spec:
  selector:
    matchLabels:
      app: local-ai
  replicas: 1
  template:
    metadata:
      labels:
        app: local-ai
      name: local-ai
    spec:
      containers:
        - args:
            - phi-2
          env:
            - name: DEBUG
              value: "true"
          name: local-ai
          image: quay.io/go-skynet/local-ai:master-ffmpeg-core
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - name: models-volume
              mountPath: /build/models
      volumes:
        - name: models-volume
          persistentVolumeClaim:
            claimName: models-pvc
---
apiVersion: v1
kind: Service
metadata:
  name: local-ai
  namespace: local-ai
spec:
  selector:
    app: local-ai
  type: LoadBalancer
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080


@@ -404,7 +404,7 @@ For a list of compatible model, check out: https://localai.io/model-compatibilit
defer opts.Loader.StopAllGRPC()
filePath, _, err := backend.ModelTTS(backendOption, text, modelOption, opts.Loader, opts)
filePath, _, err := backend.ModelTTS(backendOption, text, modelOption, opts.Loader, opts, config.Config{})
if err != nil {
return err
}