fix(deps): update module github.com/otiai10/openaigo to v1.6.0 (#960 )

[![Mend Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com) This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [github.com/otiai10/openaigo](https://togithub.com/otiai10/openaigo) | require | minor | `v1.5.2` -> `v1.6.0` | --- ### Release Notes <details> <summary>otiai10/openaigo (github.com/otiai10/openaigo)</summary> ### [`v1.6.0`](https://togithub.com/otiai10/openaigo/compare/v1.5.2...v1.6.0) [Compare Source](https://togithub.com/otiai10/openaigo/compare/v1.5.2...v1.6.0) </details> --- ### Configuration 📅 **Schedule**: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Mend Renovate](https://www.mend.io/free-developer-tools/renovate/). View repository job log [here](https://developer.mend.io/github/go-skynet/LocalAI).  Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
fix(deps): update github.com/tmc/langchaingo digest to 1e2a401 (#948 )
2026-02-03 03:02:38 -05:00 · 2023-08-26 14:18:06 +02:00 · 2023-08-26 14:17:48 +02:00 · 2023-08-26 14:17:22 +02:00 · 2023-08-25 21:58:46 +02:00 · 2023-08-25 18:45:40 +02:00
106 changed files with 3082 additions and 654 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -22,6 +22,9 @@ jobs:
        uses: actions/checkout@v3
        with:
          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
      - name: Dependencies
        run: |
          sudo apt-get update
@@ -60,6 +63,9 @@ jobs:
        uses: actions/checkout@v3
        with:
          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
      - name: Build
        id: build
        env:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,12 +16,21 @@ concurrency:
 jobs:
  ubuntu-latest:
    runs-on: ubuntu-latest
-
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
    steps:
      - name: Clone
        uses: actions/checkout@v3
        with: 
          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
      - name: Dependencies
        run: |
          sudo apt-get update
@@ -52,13 +61,21 @@ jobs:

  macOS-latest:
    runs-on: macOS-latest
-
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
    steps:
      - name: Clone
        uses: actions/checkout@v3
        with: 
          submodules: true
-
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
      - name: Test
        run: |
          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # go-llama build artifacts
 go-llama
+go-llama-stable
 /gpt4all
 go-stable-diffusion
 go-piper
@@ -22,6 +23,8 @@ LocalAI
 local-ai
 # prevent above rules from omitting the helm chart
 !charts/*
+# prevent above rules from omitting the api/localai folder
+!api/localai

 # Ignore models
 models/*
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
-ARG GO_VERSION=1.20-bullseye
+ARG GO_VERSION=1.21-bullseye

 FROM golang:$GO_VERSION as requirements

--- a/37
+++ b/37
@@ -4,11 +4,13 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
+GOLLAMA_VERSION?=bf63302a2be787674e6ca4227a8aaeb95a8eb6b1
+
+GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
-GPT4ALL_VERSION?=0f2bb506a8ee752afc06cbb832773bf85b97eef3
+GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8

 # go-ggml-transformers version
 GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a
@@ -101,6 +103,8 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

+GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+
 .PHONY: all test build vendor

 all: help
@@ -192,17 +196,23 @@ go-llama:
 	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
 	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

+go-llama-stable:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
+	cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
+
 go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

+go-llama-stable/libbinding.a: go-llama-stable
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+
 go-piper/libpiper_binding.a:
 	$(MAKE) -C go-piper libpiper_binding.a example/main

-get-sources: go-llama go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
 	touch $@

 replace:
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
@@ -220,6 +230,7 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C go-llama clean
+	$(MAKE) -C go-llama-stable clean
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C go-ggml-transformers clean
 	$(MAKE) -C go-rwkv clean
@@ -238,7 +249,8 @@ clean: ## Remove build related file
 	$(GOCMD) clean -cache
 	rm -f prepare
 	rm -rf ./go-llama
-	rm -rf ./gpt4all
+	rm -rf ./gpt4all	
+	rm -rf ./go-llama-stable
 	rm -rf ./go-gpt2
 	rm -rf ./go-stable-diffusion
 	rm -rf ./go-ggml-transformers
@@ -290,9 +302,10 @@ test: prepare test-models/testmodel grpcs
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
+	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

@@ -304,6 +317,10 @@ test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg

+test-llama-gguf: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
@@ -351,6 +368,7 @@ backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/

 backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
 # TODO: every binary should have its own folder instead, so can have different metal implementations
@@ -358,6 +376,11 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

+backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/
+
 backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
@@ -421,4 +444,4 @@ backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/

-grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+grpcs: prepare $(GRPC_BACKENDS)
--- a/api/api.go
+++ b/api/api.go
@@ -2,12 +2,14 @@ package api

 import (
 	"errors"
+	"fmt"
 	"strings"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/localai"
 	"github.com/go-skynet/LocalAI/api/openai"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/internal"
 	"github.com/go-skynet/LocalAI/pkg/assets"

@@ -19,7 +21,7 @@ import (
 	"github.com/rs/zerolog/log"
 )

-func App(opts ...options.AppOption) (*fiber.App, error) {
+func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader, error) {
 	options := options.NewOptions(opts...)

 	zerolog.SetGlobalLevel(zerolog.InfoLevel)
@@ -27,6 +29,65 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		zerolog.SetGlobalLevel(zerolog.DebugLevel)
 	}

+	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
+	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
+
+	cl := config.NewConfigLoader()
+	if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
+		log.Error().Msgf("error loading config files: %s", err.Error())
+	}
+
+	if options.ConfigFile != "" {
+		if err := cl.LoadConfigFile(options.ConfigFile); err != nil {
+			log.Error().Msgf("error loading config file: %s", err.Error())
+		}
+	}
+
+	if options.Debug {
+		for _, v := range cl.ListConfigs() {
+			cfg, _ := cl.GetConfig(v)
+			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
+		}
+	}
+
+	if options.AssetsDestination != "" {
+		// Extract files from the embedded FS
+		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
+		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
+		if err != nil {
+			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
+		}
+	}
+
+	if options.PreloadJSONModels != "" {
+		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	if options.PreloadModelsFromPath != "" {
+		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	// turn off any process that was started by GRPC if the context is canceled
+	go func() {
+		<-options.Context.Done()
+		log.Debug().Msgf("Context canceled, shutting down")
+		options.Loader.StopAllGRPC()
+	}()
+
+	return options, cl, nil
+}
+
+func App(opts ...options.AppOption) (*fiber.App, error) {
+
+	options, cl, err := Startup(opts...)
+	if err != nil {
+		return nil, fmt.Errorf("failed basic startup tasks with error %s", err.Error())
+	}
+
 	// Return errors as JSON responses
 	app := fiber.New(fiber.Config{
 		BodyLimit:             options.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
@@ -44,8 +105,8 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 			// Send custom error page
 			return ctx.Status(code).JSON(
-				openai.ErrorResponse{
-					Error: &openai.APIError{Message: err.Error(), Code: code},
+				schema.ErrorResponse{
+					Error: &schema.APIError{Message: err.Error(), Code: code},
 				},
 			)
 		},
@@ -57,36 +118,6 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		}))
 	}

-	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
-	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
-
-	cm := config.NewConfigLoader()
-	if err := cm.LoadConfigs(options.Loader.ModelPath); err != nil {
-		log.Error().Msgf("error loading config files: %s", err.Error())
-	}
-
-	if options.ConfigFile != "" {
-		if err := cm.LoadConfigFile(options.ConfigFile); err != nil {
-			log.Error().Msgf("error loading config file: %s", err.Error())
-		}
-	}
-
-	if options.Debug {
-		for _, v := range cm.ListConfigs() {
-			cfg, _ := cm.GetConfig(v)
-			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
-		}
-	}
-
-	if options.AssetsDestination != "" {
-		// Extract files from the embedded FS
-		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
-		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
-		if err != nil {
-			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
-		}
-	}
-
 	// Default middleware config
 	app.Use(recover.New())

@@ -116,18 +147,6 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		return c.Next()
 	}

-	if options.PreloadJSONModels != "" {
-		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cm, options.Galleries); err != nil {
-			return nil, err
-		}
-	}
-
-	if options.PreloadModelsFromPath != "" {
-		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cm, options.Galleries); err != nil {
-			return nil, err
-		}
-	}
-
 	if options.CORS {
 		var c func(ctx *fiber.Ctx) error
 		if options.CORSAllowOrigins == "" {
@@ -141,7 +160,7 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 	// LocalAI API endpoints
 	galleryService := localai.NewGalleryService(options.Loader.ModelPath)
-	galleryService.Start(options.Context, cm)
+	galleryService.Start(options.Context, cl)

 	app.Get("/version", auth, func(c *fiber.Ctx) error {
 		return c.JSON(struct {
@@ -149,36 +168,36 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		}{Version: internal.PrintableVersion()})
 	})

-	app.Post("/models/apply", auth, localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cm, galleryService.C, options.Galleries))
+	app.Post("/models/apply", auth, localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cl, galleryService.C, options.Galleries))
 	app.Get("/models/available", auth, localai.ListModelFromGalleryEndpoint(options.Galleries, options.Loader.ModelPath))
 	app.Get("/models/jobs/:uuid", auth, localai.GetOpStatusEndpoint(galleryService))

 	// openAI compatible API endpoint

 	// chat
-	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cm, options))
-	app.Post("/chat/completions", auth, openai.ChatEndpoint(cm, options))
+	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, options))
+	app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, options))

 	// edit
-	app.Post("/v1/edits", auth, openai.EditEndpoint(cm, options))
-	app.Post("/edits", auth, openai.EditEndpoint(cm, options))
+	app.Post("/v1/edits", auth, openai.EditEndpoint(cl, options))
+	app.Post("/edits", auth, openai.EditEndpoint(cl, options))

 	// completion
-	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cm, options))
-	app.Post("/completions", auth, openai.CompletionEndpoint(cm, options))
-	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cm, options))
+	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, options))
+	app.Post("/completions", auth, openai.CompletionEndpoint(cl, options))
+	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, options))

 	// embeddings
-	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))
-	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))
-	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))
+	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
+	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
+	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))

 	// audio
-	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cm, options))
-	app.Post("/tts", auth, localai.TTSEndpoint(cm, options))
+	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, options))
+	app.Post("/tts", auth, localai.TTSEndpoint(cl, options))

 	// images
-	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cm, options))
+	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, options))

 	if options.ImageDir != "" {
 		app.Static("/generated-images", options.ImageDir)
@@ -196,16 +215,14 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	app.Get("/healthz", ok)
 	app.Get("/readyz", ok)

-	// models
-	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cm))
-	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cm))
+	// Experimental Backend Statistics Module
+	backendMonitor := localai.NewBackendMonitor(cl, options) // Split out for now
+	app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitor))
+	app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitor))

-	// turn off any process that was started by GRPC if the context is canceled
-	go func() {
-		<-options.Context.Done()
-		log.Debug().Msgf("Context canceled, shutting down")
-		options.Loader.StopGRPC()
-	}()
+	// models
+	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
+	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))

 	return app, nil
 }
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -296,7 +296,7 @@ var _ = Describe("API test", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 					Name:      "openllama_3b",
-					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+					Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -359,6 +359,76 @@ var _ = Describe("API test", func() {
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 			})

+			It("runs openllama gguf", Label("llama-gguf"), func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					URL:       "github:go-skynet/model-gallery/openllama-3b-gguf.yaml",
+					Name:      "openllama_3b_gguf",
+					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+
+				Eventually(func() bool {
+					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					return response["processed"].(bool)
+				}, "360s", "10s").Should(Equal(true))
+
+				By("testing completion")
+				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b_gguf", Prompt: "Count up to five: one, two, three, four, "})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices)).To(Equal(1))
+				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+
+				By("testing functions")
+				resp2, err := client.CreateChatCompletion(
+					context.TODO(),
+					openai.ChatCompletionRequest{
+						Model: "openllama_3b_gguf",
+						Messages: []openai.ChatCompletionMessage{
+							{
+								Role:    "user",
+								Content: "What is the weather like in San Francisco (celsius)?",
+							},
+						},
+						Functions: []openai.FunctionDefinition{
+							openai.FunctionDefinition{
+								Name:        "get_current_weather",
+								Description: "Get the current weather",
+								Parameters: jsonschema.Definition{
+									Type: jsonschema.Object,
+									Properties: map[string]jsonschema.Definition{
+										"location": {
+											Type:        jsonschema.String,
+											Description: "The city and state, e.g. San Francisco, CA",
+										},
+										"unit": {
+											Type: jsonschema.String,
+											Enum: []string{"celcius", "fahrenheit"},
+										},
+									},
+									Required: []string{"location"},
+								},
+							},
+						},
+					})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp2.Choices)).To(Equal(1))
+				Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+				Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
+				var res map[string]string
+				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(res["location"]).To(Equal("San Francisco, California"), fmt.Sprint(res))
+				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+			})
+
 			It("runs gpt4all", Label("gpt4all"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
--- a/api/backend/embeddings.go
+++ b/api/backend/embeddings.go
@@ -2,7 +2,6 @@ package backend

 import (
 	"fmt"
-	"sync"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
@@ -22,17 +21,13 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
 	var inferenceModel interface{}
 	var err error

-	opts := []model.Option{
+	opts := modelOpts(c, o, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(c.Threads)),
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
+	})

 	if c.Backend == "" {
 		inferenceModel, err = loader.GreedyLoader(opts...)
@@ -80,18 +75,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
 	}

 	return func() ([]float32, error) {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[modelFile]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[modelFile] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
 		embeds, err := fn()
 		if err != nil {
 			return embeds, err
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -1,17 +1,15 @@
 package backend

 import (
-	"sync"
-
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
+func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {

-	opts := []model.Option{
+	opts := modelOpts(c, o, []model.Option{
 		model.WithBackendString(c.Backend),
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithThreads(uint32(c.Threads)),
@@ -21,12 +19,13 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 			CUDA:          c.Diffusers.CUDA,
 			SchedulerType: c.Diffusers.SchedulerType,
 			PipelineType:  c.Diffusers.PipelineType,
+			CFGScale:      c.Diffusers.CFGScale,
+			IMG2IMG:       c.Diffusers.IMG2IMG,
+			CLIPModel:     c.Diffusers.ClipModel,
+			CLIPSubfolder: c.Diffusers.ClipSubFolder,
+			CLIPSkip:      int32(c.Diffusers.ClipSkip),
 		}),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
+	})

 	inferenceModel, err := loader.BackendLoader(
 		opts...,
@@ -39,31 +38,20 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 		_, err := inferenceModel.GenerateImage(
 			o.Context,
 			&proto.GenerateImageRequest{
-				Height:         int32(height),
-				Width:          int32(width),
-				Mode:           int32(mode),
-				Step:           int32(step),
-				Seed:           int32(seed),
-				PositivePrompt: positive_prompt,
-				NegativePrompt: negative_prompt,
-				Dst:            dst,
+				Height:           int32(height),
+				Width:            int32(width),
+				Mode:             int32(mode),
+				Step:             int32(step),
+				Seed:             int32(seed),
+				CLIPSkip:         int32(c.Diffusers.ClipSkip),
+				PositivePrompt:   positive_prompt,
+				NegativePrompt:   negative_prompt,
+				Dst:              dst,
+				Src:              src,
+				EnableParameters: c.Diffusers.EnableParameters,
 			})
 		return err
 	}

-	return func() error {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[c.Backend]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[c.Backend] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
-		return fn()
-	}, nil
+	return fn, nil
 }
--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -15,7 +15,17 @@ import (
 	"github.com/go-skynet/LocalAI/pkg/utils"
 )

-func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string) bool) (func() (string, error), error) {
+type LLMResponse struct {
+	Response string // should this be []byte?
+	Usage    TokenUsage
+}
+
+type TokenUsage struct {
+	Prompt     int
+	Completion int
+}
+
+func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model

 	grpcOpts := gRPCModelOpts(c)
@@ -23,17 +33,13 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c
 	var inferenceModel *grpc.Client
 	var err error

-	opts := []model.Option{
+	opts := modelOpts(c, o, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
+	})

 	if c.Backend != "" {
 		opts = append(opts, model.WithBackendString(c.Backend))
@@ -62,40 +68,57 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c
 	}

 	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
-	fn := func() (string, error) {
+	fn := func() (LLMResponse, error) {
 		opts := gRPCPredictOpts(c, loader.ModelPath)
 		opts.Prompt = s
+
+		tokenUsage := TokenUsage{}
+
+		// check the per-model feature flag for usage, since tokenCallback may have a cost.
+		// Defaults to off as for now it is still experimental
+		if c.FeatureFlag.Enabled("usage") {
+			userTokenCallback := tokenCallback
+			if userTokenCallback == nil {
+				userTokenCallback = func(token string, usage TokenUsage) bool {
+					return true
+				}
+			}
+
+			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
+			if pErr == nil && promptInfo.Length > 0 {
+				tokenUsage.Prompt = int(promptInfo.Length)
+			}
+
+			tokenCallback = func(token string, usage TokenUsage) bool {
+				tokenUsage.Completion++
+				return userTokenCallback(token, tokenUsage)
+			}
+		}
+
 		if tokenCallback != nil {
 			ss := ""
 			err := inferenceModel.PredictStream(ctx, opts, func(s []byte) {
-				tokenCallback(string(s))
+				tokenCallback(string(s), tokenUsage)
 				ss += string(s)
 			})
-			return ss, err
+			return LLMResponse{
+				Response: ss,
+				Usage:    tokenUsage,
+			}, err
 		} else {
+			// TODO: Is the chicken bit the only way to get here? is that acceptable?
 			reply, err := inferenceModel.Predict(ctx, opts)
 			if err != nil {
-				return "", err
+				return LLMResponse{}, err
 			}
-			return string(reply.Message), err
+			return LLMResponse{
+				Response: string(reply.Message),
+				Usage:    tokenUsage,
+			}, err
 		}
 	}

-	return func() (string, error) {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[modelFile]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[modelFile] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
-		return fn()
-	}, nil
+	return fn, nil
 }

 var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
--- a/api/backend/lock.go
+++ b/api/backend/lock.go
@@ -1,22 +0,0 @@
-package backend
-
-import "sync"
-
-// mutex still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-var mutexMap sync.Mutex
-var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)
-
-func Lock(s string) *sync.Mutex {
-	// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-	mutexMap.Lock()
-	l, ok := mutexes[s]
-	if !ok {
-		m := &sync.Mutex{}
-		mutexes[s] = m
-		l = m
-	}
-	mutexMap.Unlock()
-	l.Lock()
-
-	return l
-}
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -5,21 +5,46 @@ import (
 	"path/filepath"

 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	model "github.com/go-skynet/LocalAI/pkg/model"

 	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
 )

+func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.Option {
+	if o.SingleBackend {
+		opts = append(opts, model.WithSingleActiveBackend())
+	}
+
+	if c.GRPC.Attempts != 0 {
+		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
+	}
+
+	if c.GRPC.AttemptsSleepTime != 0 {
+		opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
+	}
+
+	for k, v := range o.ExternalGRPCBackends {
+		opts = append(opts, model.WithExternalBackend(k, v))
+	}
+
+	return opts
+}
+
 func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 	b := 512
 	if c.Batch != 0 {
 		b = c.Batch
 	}
-	return &pb.ModelOptions{
-		ContextSize: int32(c.ContextSize),
-		Seed:        int32(c.Seed),
-		NBatch:      int32(b),
-		NGQA:        c.NGQA,

+	return &pb.ModelOptions{
+		ContextSize:   int32(c.ContextSize),
+		Seed:          int32(c.Seed),
+		NBatch:        int32(b),
+		NoMulMatQ:     c.NoMulMatQ,
+		LoraAdapter:   c.LoraAdapter,
+		LoraBase:      c.LoraBase,
+		NGQA:          c.NGQA,
 		RMSNormEps:    c.RMSNormEps,
 		F16Memory:     c.F16,
 		MLock:         c.MMlock,
@@ -38,6 +63,8 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		Device:           c.AutoGPTQ.Device,
 		UseTriton:        c.AutoGPTQ.Triton,
 		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
+		// RWKV
+		Tokenizer: c.Tokenizer,
 	}
 }

--- a/api/backend/transcript.go
+++ b/api/backend/transcript.go
@@ -5,25 +5,22 @@ import (
 	"fmt"

 	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"

 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*api.Result, error) {
-	opts := []model.Option{
+func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*schema.Result, error) {
+
+	opts := modelOpts(c, o, []model.Option{
 		model.WithBackendString(model.WhisperBackend),
 		model.WithModel(c.Model),
 		model.WithContext(o.Context),
 		model.WithThreads(uint32(c.Threads)),
 		model.WithAssetDir(o.AssetsDestination),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
+	})

 	whisperModel, err := o.Loader.BackendLoader(opts...)
 	if err != nil {
--- a/api/backend/tts.go
+++ b/api/backend/tts.go
@@ -6,6 +6,7 @@ import (
 	"os"
 	"path/filepath"

+	api_config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
@@ -33,17 +34,12 @@ func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *opt
 	if bb == "" {
 		bb = model.PiperBackend
 	}
-	opts := []model.Option{
+	opts := modelOpts(api_config.Config{}, o, []model.Option{
 		model.WithBackendString(bb),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
 		model.WithAssetDir(o.AssetsDestination),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
-
+	})
 	piperModel, err := o.Loader.BackendLoader(opts...)
 	if err != nil {
 		return "", nil, err
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -29,6 +29,7 @@ type Config struct {

 	FunctionsConfig Functions `yaml:"function"`

+	FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
 	// LLM configs (GPT4ALL, Llama.cpp, ...)
 	LLMConfig `yaml:",inline"`

@@ -39,12 +40,33 @@ type Config struct {
 	Diffusers Diffusers `yaml:"diffusers"`

 	Step int `yaml:"step"`
+
+	// GRPC Options
+	GRPC GRPC `yaml:"grpc"`
+}
+
+type FeatureFlag map[string]*bool
+
+func (ff FeatureFlag) Enabled(s string) bool {
+	v, exist := ff[s]
+	return exist && v != nil && *v
+}
+
+type GRPC struct {
+	Attempts          int `yaml:"attempts"`
+	AttemptsSleepTime int `yaml:"attempts_sleep_time"`
 }

 type Diffusers struct {
-	PipelineType  string `yaml:"pipeline_type"`
-	SchedulerType string `yaml:"scheduler_type"`
-	CUDA          bool   `yaml:"cuda"`
+	PipelineType     string  `yaml:"pipeline_type"`
+	SchedulerType    string  `yaml:"scheduler_type"`
+	CUDA             bool    `yaml:"cuda"`
+	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
+	CFGScale         float32 `yaml:"cfg_scale"`         // Classifier-Free Guidance Scale
+	IMG2IMG          bool    `yaml:"img2img"`           // Image to Image Diffuser
+	ClipSkip         int     `yaml:"clip_skip"`         // Skip every N frames
+	ClipModel        string  `yaml:"clip_model"`        // Clip model to use
+	ClipSubFolder    string  `yaml:"clip_subfolder"`    // Subfolder to use for clip model
 }

 type LLMConfig struct {
@@ -69,6 +91,9 @@ type LLMConfig struct {
 	TrimSpace       []string `yaml:"trimspace"`
 	ContextSize     int      `yaml:"context_size"`
 	NUMA            bool     `yaml:"numa"`
+	LoraAdapter     string   `yaml:"lora_adapter"`
+	LoraBase        string   `yaml:"lora_base"`
+	NoMulMatQ       bool     `yaml:"no_mulmatq"`
 }

 type AutoGPTQ struct {
--- a/api/config/prediction.go
+++ b/api/config/prediction.go
@@ -41,4 +41,10 @@ type PredictionOptions struct {
 	NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
 	// AutoGPTQ
 	UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
+
+	// Diffusers
+	ClipSkip int `json:"clip_skip" yaml:"clip_skip"`
+
+	// RWKV (?)
+	Tokenizer string `json:"tokenizer" yaml:"tokenizer"`
 }
--- a/api/localai/backend_monitor.go
+++ b/api/localai/backend_monitor.go
@@ -0,0 +1,163 @@
+package localai
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+
+	gopsutil "github.com/shirou/gopsutil/v3/process"
+)
+
+type BackendMonitorRequest struct {
+	Model string `json:"model" yaml:"model"`
+}
+
+type BackendMonitorResponse struct {
+	MemoryInfo    *gopsutil.MemoryInfoStat
+	MemoryPercent float32
+	CPUPercent    float64
+}
+
+type BackendMonitor struct {
+	configLoader *config.ConfigLoader
+	options      *options.Option // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
+}
+
+func NewBackendMonitor(configLoader *config.ConfigLoader, options *options.Option) BackendMonitor {
+	return BackendMonitor{
+		configLoader: configLoader,
+		options:      options,
+	}
+}
+
+func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*BackendMonitorResponse, error) {
+	config, exists := bm.configLoader.GetConfig(model)
+	var backend string
+	if exists {
+		backend = config.Model
+	} else {
+		// Last ditch effort: use it raw, see if a backend happens to match.
+		backend = model
+	}
+
+	if !strings.HasSuffix(backend, ".bin") {
+		backend = fmt.Sprintf("%s.bin", backend)
+	}
+
+	pid, err := bm.options.Loader.GetGRPCPID(backend)
+
+	if err != nil {
+		log.Error().Msgf("model %s : failed to find pid %+v", model, err)
+		return nil, err
+	}
+
+	// Name is slightly frightening but this does _not_ create a new process, rather it looks up an existing process by PID.
+	backendProcess, err := gopsutil.NewProcess(int32(pid))
+
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
+		return nil, err
+	}
+
+	memInfo, err := backendProcess.MemoryInfo()
+
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
+		return nil, err
+	}
+
+	memPercent, err := backendProcess.MemoryPercent()
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
+		return nil, err
+	}
+
+	cpuPercent, err := backendProcess.CPUPercent()
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
+		return nil, err
+	}
+
+	return &BackendMonitorResponse{
+		MemoryInfo:    memInfo,
+		MemoryPercent: memPercent,
+		CPUPercent:    cpuPercent,
+	}, nil
+}
+
+func (bm BackendMonitor) getModelLoaderIDFromCtx(c *fiber.Ctx) (string, error) {
+	input := new(BackendMonitorRequest)
+	// Get input data from the request body
+	if err := c.BodyParser(input); err != nil {
+		return "", err
+	}
+
+	config, exists := bm.configLoader.GetConfig(input.Model)
+	var backendId string
+	if exists {
+		backendId = config.Model
+	} else {
+		// Last ditch effort: use it raw, see if a backend happens to match.
+		backendId = input.Model
+	}
+
+	if !strings.HasSuffix(backendId, ".bin") {
+		backendId = fmt.Sprintf("%s.bin", backendId)
+	}
+
+	return backendId, nil
+}
+
+func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+
+		backendId, err := bm.getModelLoaderIDFromCtx(c)
+		if err != nil {
+			return err
+		}
+
+		client := bm.options.Loader.CheckIsLoaded(backendId)
+
+		if client == nil {
+			return fmt.Errorf("backend %s is not currently loaded", backendId)
+		}
+
+		status, rpcErr := client.Status(context.TODO())
+		if rpcErr != nil {
+			log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
+			val, slbErr := bm.SampleLocalBackendProcess(backendId)
+			if slbErr != nil {
+				return fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
+			}
+			return c.JSON(proto.StatusResponse{
+				State: proto.StatusResponse_ERROR,
+				Memory: &proto.MemoryUsageData{
+					Total: val.MemoryInfo.VMS,
+					Breakdown: map[string]uint64{
+						"gopsutil-RSS": val.MemoryInfo.RSS,
+					},
+				},
+			})
+		}
+
+		return c.JSON(status)
+	}
+}
+
+func BackendShutdownEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		backendId, err := bm.getModelLoaderIDFromCtx(c)
+		if err != nil {
+			return err
+		}
+
+		return bm.options.Loader.ShutdownModel(backendId)
+	}
+}
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -10,6 +10,7 @@ import (
 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/pkg/grammar"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
@@ -21,19 +22,24 @@ import (
 func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	emptyMessage := ""

-	process := func(s string, req *OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
-		initialMessage := OpenAIResponse{
+	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+		initialMessage := schema.OpenAIResponse{
 			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: []Choice{{Delta: &Message{Role: "assistant", Content: &emptyMessage}}},
+			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
 			Object:  "chat.completion.chunk",
 		}
 		responses <- initialMessage

-		ComputeChoices(req, s, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
-			resp := OpenAIResponse{
+		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+			resp := schema.OpenAIResponse{
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []Choice{{Delta: &Message{Content: &s}, Index: 0}},
+				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
 				Object:  "chat.completion.chunk",
+				Usage: schema.OpenAIUsage{
+					PromptTokens:     usage.Prompt,
+					CompletionTokens: usage.Completion,
+					TotalTokens:      usage.Prompt + usage.Completion,
+				},
 			}

 			responses <- resp
@@ -231,17 +237,19 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		}

 		if toStream {
-			responses := make(chan OpenAIResponse)
+			responses := make(chan schema.OpenAIResponse)

 			go process(predInput, input, config, o.Loader, responses)

 			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {

+				usage := &schema.OpenAIUsage{}
+
 				for ev := range responses {
+					usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
 					var buf bytes.Buffer
 					enc := json.NewEncoder(&buf)
 					enc.Encode(ev)
-
 					log.Debug().Msgf("Sending chunk: %s", buf.String())
 					_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
 					if err != nil {
@@ -252,15 +260,16 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					w.Flush()
 				}

-				resp := &OpenAIResponse{
+				resp := &schema.OpenAIResponse{
 					Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []Choice{
+					Choices: []schema.Choice{
 						{
 							FinishReason: "stop",
 							Index:        0,
-							Delta:        &Message{Content: &emptyMessage},
+							Delta:        &schema.Message{Content: &emptyMessage},
 						}},
 					Object: "chat.completion.chunk",
+					Usage:  *usage,
 				}
 				respData, _ := json.Marshal(resp)

@@ -271,7 +280,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			return nil
 		}

-		result, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]Choice) {
+		result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) {
 			if processFunctions {
 				// As we have to change the result before processing, we can't stream the answer (yet?)
 				ss := map[string]interface{}{}
@@ -305,7 +314,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 								message = backend.Finetune(*config, predInput, message)
 								log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)

-								*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: &message}})
+								*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}})
 								return
 							}
 						}
@@ -327,28 +336,33 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 						return
 					}

-					prediction = backend.Finetune(*config, predInput, prediction)
-					*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: &prediction}})
+					fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response)
+					*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}})
 				} else {
 					// otherwise reply with the function call
-					*c = append(*c, Choice{
+					*c = append(*c, schema.Choice{
 						FinishReason: "function_call",
-						Message:      &Message{Role: "assistant", FunctionCall: ss},
+						Message:      &schema.Message{Role: "assistant", FunctionCall: ss},
 					})
 				}

 				return
 			}
-			*c = append(*c, Choice{FinishReason: "stop", Index: 0, Message: &Message{Role: "assistant", Content: &s}})
+			*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
 		}, nil)
 		if err != nil {
 			return err
 		}

-		resp := &OpenAIResponse{
+		resp := &schema.OpenAIResponse{
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "chat.completion",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     tokenUsage.Prompt,
+				CompletionTokens: tokenUsage.Completion,
+				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
+			},
 		}
 		respData, _ := json.Marshal(resp)
 		log.Debug().Msgf("Response: %s", respData)
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -7,8 +7,10 @@ import (
 	"errors"
 	"fmt"

+	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
@@ -17,17 +19,22 @@ import (

 // https://platform.openai.com/docs/api-reference/completions
 func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
-	process := func(s string, req *OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
-		ComputeChoices(req, s, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
-			resp := OpenAIResponse{
+	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+			resp := schema.OpenAIResponse{
 				Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []Choice{
+				Choices: []schema.Choice{
 					{
 						Index: 0,
 						Text:  s,
 					},
 				},
 				Object: "text_completion",
+				Usage: schema.OpenAIUsage{
+					PromptTokens:     usage.Prompt,
+					CompletionTokens: usage.Completion,
+					TotalTokens:      usage.Prompt + usage.Completion,
+				},
 			}
 			log.Debug().Msgf("Sending goroutine: %s", s)

@@ -84,7 +91,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 				log.Debug().Msgf("Template found, input modified to: %s", predInput)
 			}

-			responses := make(chan OpenAIResponse)
+			responses := make(chan schema.OpenAIResponse)

 			go process(predInput, input, config, o.Loader, responses)

@@ -100,9 +107,9 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 					w.Flush()
 				}

-				resp := &OpenAIResponse{
+				resp := &schema.OpenAIResponse{
 					Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []Choice{
+					Choices: []schema.Choice{
 						{
 							Index:        0,
 							FinishReason: "stop",
@@ -119,7 +126,10 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			return nil
 		}

-		var result []Choice
+		var result []schema.Choice
+
+		totalTokenUsage := backend.TokenUsage{}
+
 		for k, i := range config.PromptStrings {
 			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
@@ -131,20 +141,29 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 				log.Debug().Msgf("Template found, input modified to: %s", i)
 			}

-			r, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]Choice) {
-				*c = append(*c, Choice{Text: s, FinishReason: "stop", Index: k})
-			}, nil)
+			r, tokenUsage, err := ComputeChoices(
+				input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+					*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
+				}, nil)
 			if err != nil {
 				return err
 			}

+			totalTokenUsage.Prompt += tokenUsage.Prompt
+			totalTokenUsage.Completion += tokenUsage.Completion
+
 			result = append(result, r...)
 		}

-		resp := &OpenAIResponse{
+		resp := &schema.OpenAIResponse{
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "text_completion",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     totalTokenUsage.Prompt,
+				CompletionTokens: totalTokenUsage.Completion,
+				TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
+			},
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -4,10 +4,13 @@ import (
 	"encoding/json"
 	"fmt"

+	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
+
 	"github.com/rs/zerolog/log"
 )

@@ -31,7 +34,9 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			templateFile = config.TemplateConfig.Edit
 		}

-		var result []Choice
+		var result []schema.Choice
+		totalTokenUsage := backend.TokenUsage{}
+
 		for _, i := range config.InputStrings {
 			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
@@ -44,20 +49,28 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 				log.Debug().Msgf("Template found, input modified to: %s", i)
 			}

-			r, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]Choice) {
-				*c = append(*c, Choice{Text: s})
+			r, tokenUsage, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+				*c = append(*c, schema.Choice{Text: s})
 			}, nil)
 			if err != nil {
 				return err
 			}

+			totalTokenUsage.Prompt += tokenUsage.Prompt
+			totalTokenUsage.Completion += tokenUsage.Completion
+
 			result = append(result, r...)
 		}

-		resp := &OpenAIResponse{
+		resp := &schema.OpenAIResponse{
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "edit",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     totalTokenUsage.Prompt,
+				CompletionTokens: totalTokenUsage.Completion,
+				TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
+			},
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/embeddings.go
+++ b/api/openai/embeddings.go
@@ -6,6 +6,8 @@ import (

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"
+
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
@@ -25,7 +27,7 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		}

 		log.Debug().Msgf("Parameter Config: %+v", config)
-		items := []Item{}
+		items := []schema.Item{}

 		for i, s := range config.InputToken {
 			// get the model function to call for the result
@@ -38,7 +40,7 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			if err != nil {
 				return err
 			}
-			items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
+			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
 		}

 		for i, s := range config.InputStrings {
@@ -52,10 +54,10 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			if err != nil {
 				return err
 			}
-			items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
+			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
 		}

-		resp := &OpenAIResponse{
+		resp := &schema.OpenAIResponse{
 			Model:  input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Data:   items,
 			Object: "list",
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -1,9 +1,11 @@
 package openai

 import (
+	"bufio"
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
+	"github.com/go-skynet/LocalAI/api/schema"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -49,6 +51,31 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}

+		src := ""
+		if input.File != "" {
+			//base 64 decode the file and write it somewhere
+			// that we will cleanup
+			decoded, err := base64.StdEncoding.DecodeString(input.File)
+			if err != nil {
+				return err
+			}
+			// Create a temporary file
+			outputFile, err := os.CreateTemp(o.ImageDir, "b64")
+			if err != nil {
+				return err
+			}
+			// write the base64 result
+			writer := bufio.NewWriter(outputFile)
+			_, err = writer.Write(decoded)
+			if err != nil {
+				outputFile.Close()
+				return err
+			}
+			outputFile.Close()
+			src = outputFile.Name()
+			defer os.RemoveAll(src)
+		}
+
 		log.Debug().Msgf("Parameter Config: %+v", config)

 		// XXX: Only stablediffusion is supported for now
@@ -73,8 +100,8 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 		if input.ResponseFormat == "b64_json" {
 			b64JSON = true
 		}
-
-		var result []Item
+		// src and clip_skip
+		var result []schema.Item
 		for _, i := range config.PromptStrings {
 			n := input.N
 			if input.N == 0 {
@@ -121,7 +148,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx

 				baseURL := c.BaseURL()

-				fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, output, o.Loader, *config, o)
+				fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, src, output, o.Loader, *config, o)
 				if err != nil {
 					return err
 				}
@@ -129,7 +156,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 					return err
 				}

-				item := &Item{}
+				item := &schema.Item{}

 				if b64JSON {
 					defer os.RemoveAll(output)
@@ -147,7 +174,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			}
 		}

-		resp := &OpenAIResponse{
+		resp := &schema.OpenAIResponse{
 			Data: result,
 		}

--- a/api/openai/inference.go
+++ b/api/openai/inference.go
@@ -4,12 +4,20 @@ import (
 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ComputeChoices(req *OpenAIRequest, predInput string, config *config.Config, o *options.Option, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
-	n := req.N
-	result := []Choice{}
+func ComputeChoices(
+	req *schema.OpenAIRequest,
+	predInput string,
+	config *config.Config,
+	o *options.Option,
+	loader *model.ModelLoader,
+	cb func(string, *[]schema.Choice),
+	tokenCallback func(string, backend.TokenUsage) bool) ([]schema.Choice, backend.TokenUsage, error) {
+	n := req.N // number of completions to return
+	result := []schema.Choice{}

 	if n == 0 {
 		n = 1
@@ -18,20 +26,25 @@ func ComputeChoices(req *OpenAIRequest, predInput string, config *config.Config,
 	// get the model function to call for the result
 	predFunc, err := backend.ModelInference(req.Context, predInput, loader, *config, o, tokenCallback)
 	if err != nil {
-		return result, err
+		return result, backend.TokenUsage{}, err
 	}

+	tokenUsage := backend.TokenUsage{}
+
 	for i := 0; i < n; i++ {
 		prediction, err := predFunc()
 		if err != nil {
-			return result, err
+			return result, backend.TokenUsage{}, err
 		}

-		prediction = backend.Finetune(*config, predInput, prediction)
-		cb(prediction, &result)
+		tokenUsage.Prompt += prediction.Usage.Prompt
+		tokenUsage.Completion += prediction.Usage.Completion
+
+		finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
+		cb(finetunedResponse, &result)

 		//result = append(result, Choice{Text: prediction})

 	}
-	return result, err
+	return result, tokenUsage, err
 }
--- a/api/openai/list.go
+++ b/api/openai/list.go
@@ -4,6 +4,7 @@ import (
 	"regexp"

 	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 )
@@ -16,7 +17,7 @@ func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func
 		}
 		var mm map[string]interface{} = map[string]interface{}{}

-		dataModels := []OpenAIModel{}
+		dataModels := []schema.OpenAIModel{}

 		var filterFn func(name string) bool
 		filter := c.Query("filter")
@@ -45,7 +46,7 @@ func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func
 			}

 			if filterFn(c.Name) {
-				dataModels = append(dataModels, OpenAIModel{ID: c.Name, Object: "model"})
+				dataModels = append(dataModels, schema.OpenAIModel{ID: c.Name, Object: "model"})
 			}
 		}

@@ -53,13 +54,13 @@ func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func
 		for _, m := range models {
 			// And only adds them if they shouldn't be skipped.
 			if _, exists := mm[m]; !exists && filterFn(m) {
-				dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
+				dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
 			}
 		}

 		return c.JSON(struct {
-			Object string        `json:"object"`
-			Data   []OpenAIModel `json:"data"`
+			Object string               `json:"object"`
+			Data   []schema.OpenAIModel `json:"data"`
 		}{
 			Object: "list",
 			Data:   dataModels,
--- a/api/openai/request.go
+++ b/api/openai/request.go
@@ -10,14 +10,15 @@ import (

 	config "github.com/go-skynet/LocalAI/api/config"
 	options "github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
 )

-func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *OpenAIRequest, error) {
+func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *schema.OpenAIRequest, error) {
 	loader := o.Loader
-	input := new(OpenAIRequest)
+	input := new(schema.OpenAIRequest)
 	ctx, cancel := context.WithCancel(o.Context)
 	input.Context = ctx
 	input.Cancel = cancel
@@ -60,7 +61,7 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *Open
 	return modelFile, input, nil
 }

-func updateConfig(config *config.Config, input *OpenAIRequest) {
+func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
 	if input.Echo {
 		config.Echo = input.Echo
 	}
@@ -75,6 +76,10 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 		config.Backend = input.Backend
 	}

+	if input.ClipSkip != 0 {
+		config.Diffusers.ClipSkip = input.ClipSkip
+	}
+
 	if input.ModelBaseName != "" {
 		config.AutoGPTQ.ModelBaseName = input.ModelBaseName
 	}
@@ -214,7 +219,7 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 	}
 }

-func readConfig(modelFile string, input *OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *OpenAIRequest, error) {
+func readConfig(modelFile string, input *schema.OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *schema.OpenAIRequest, error) {
 	// Load a config file if present after the model name
 	modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")

--- a/api/options/options.go
+++ b/api/options/options.go
@@ -33,6 +33,8 @@ type Option struct {
 	ExternalGRPCBackends map[string]string

 	AutoloadGalleries bool
+
+	SingleBackend bool
 }

 type AppOption func(*Option)
@@ -58,6 +60,10 @@ func WithCors(b bool) AppOption {
 	}
 }

+var EnableSingleBackend = func(o *Option) {
+	o.SingleBackend = true
+}
+
 var EnableGalleriesAutoload = func(o *Option) {
 	o.AutoloadGalleries = true
 }
--- a/api/schema/openai.go
+++ b/api/schema/openai.go
@@ -1,4 +1,4 @@
-package openai
+package schema

 import (
 	"context"
--- a/pkg/grpc/whisper/api/api.go
+++ b/pkg/grpc/whisper/api/api.go
@@ -1,4 +1,4 @@
-package api
+package schema

 import "time"

--- a/cmd/grpc/bert-embeddings/main.go
+++ b/cmd/grpc/bert-embeddings/main.go
@@ -5,8 +5,8 @@ package main
 import (
 	"flag"

+	bert "github.com/go-skynet/LocalAI/pkg/backend/llm/bert"
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-	bert "github.com/go-skynet/LocalAI/pkg/grpc/llm/bert"
 )

 var (
--- a/cmd/grpc/bloomz/main.go
+++ b/cmd/grpc/bloomz/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	bloomz "github.com/go-skynet/LocalAI/pkg/grpc/llm/bloomz"
+	bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/dolly/main.go
+++ b/cmd/grpc/dolly/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/falcon-ggml/main.go
+++ b/cmd/grpc/falcon-ggml/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/falcon/main.go
+++ b/cmd/grpc/falcon/main.go
@@ -7,7 +7,7 @@ package main
 import (
 	"flag"

-	falcon "github.com/go-skynet/LocalAI/pkg/grpc/llm/falcon"
+	falcon "github.com/go-skynet/LocalAI/pkg/backend/llm/falcon"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/gpt2/main.go
+++ b/cmd/grpc/gpt2/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/gpt4all/main.go
+++ b/cmd/grpc/gpt4all/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	gpt4all "github.com/go-skynet/LocalAI/pkg/grpc/llm/gpt4all"
+	gpt4all "github.com/go-skynet/LocalAI/pkg/backend/llm/gpt4all"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/gptj/main.go
+++ b/cmd/grpc/gptj/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/gptneox/main.go
+++ b/cmd/grpc/gptneox/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/langchain-huggingface/main.go
+++ b/cmd/grpc/langchain-huggingface/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	langchain "github.com/go-skynet/LocalAI/pkg/grpc/llm/langchain"
+	langchain "github.com/go-skynet/LocalAI/pkg/backend/llm/langchain"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/llama-stable/main.go
+++ b/cmd/grpc/llama-stable/main.go
@@ -0,0 +1,21 @@
+package main
+
+import (
+	"flag"
+
+	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/llama/main.go
+++ b/cmd/grpc/llama/main.go
@@ -7,7 +7,7 @@ package main
 import (
 	"flag"

-	llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama"
+	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/mpt/main.go
+++ b/cmd/grpc/mpt/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/piper/main.go
+++ b/cmd/grpc/piper/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	tts "github.com/go-skynet/LocalAI/pkg/grpc/tts"
+	tts "github.com/go-skynet/LocalAI/pkg/backend/tts"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/replit/main.go
+++ b/cmd/grpc/replit/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/rwkv/main.go
+++ b/cmd/grpc/rwkv/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	rwkv "github.com/go-skynet/LocalAI/pkg/grpc/llm/rwkv"
+	rwkv "github.com/go-skynet/LocalAI/pkg/backend/llm/rwkv"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/stablediffusion/main.go
+++ b/cmd/grpc/stablediffusion/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	image "github.com/go-skynet/LocalAI/pkg/grpc/image"
+	image "github.com/go-skynet/LocalAI/pkg/backend/image"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/starcoder/main.go
+++ b/cmd/grpc/starcoder/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/whisper/main.go
+++ b/cmd/grpc/whisper/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transcribe "github.com/go-skynet/LocalAI/pkg/grpc/transcribe"
+	transcribe "github.com/go-skynet/LocalAI/pkg/backend/transcribe"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/examples/README.md
+++ b/examples/README.md
@@ -157,6 +157,16 @@ Allows to run any LocalAI-compatible model as a backend on the servers of https:

 [Check it out here](https://runpod.io/gsc?template=uv9mtqnrd0&ref=984wlcra)

+### Continue
+
+<img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">
+
+_by [@gruberdev](https://github.com/gruberdev)_
+
+Demonstrates how to integrate an open-source copilot alternative that enhances code analysis, completion, and improvements. This approach seamlessly integrates with any LocalAI model, offering a more user-friendly experience.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/continue/)
+
 ## Want to contribute?

 Create an issue, and put `Example: <description>` in the title! We will post your examples here.
--- a/examples/continue/README.md
+++ b/examples/continue/README.md
@@ -0,0 +1,56 @@
+# Continue
+
+![logo](https://continue.dev/docs/assets/images/continue-cover-logo-aa135cc83fe8a14af480d1633ed74eb5.png)
+
+This document presents an example of integration with [continuedev/continue](https://github.com/continuedev/continue).
+
+![Screenshot](https://continue.dev/docs/assets/images/continue-screenshot-1f36b99467817f755739d7f4c4c08fe3.png)
+
+For a live demonstration, please click on the link below:
+
+- [How it works (Video demonstration)](https://www.youtube.com/watch?v=3Ocrc-WX4iQ)
+
+## Integration Setup Walkthrough
+
+1. [As outlined in `continue`'s documentation](https://continue.dev/docs/getting-started), install the [Visual Studio Code extension from the marketplace](https://marketplace.visualstudio.com/items?itemName=Continue.continue) and open it.
+2. In this example, LocalAI will download the gpt4all model and set it up as "gpt-3.5-turbo". Refer to the `docker-compose.yaml` file for details.
+
+    ```bash
+    # Clone LocalAI
+    git clone https://github.com/go-skynet/LocalAI
+
+    cd LocalAI/examples/continue
+
+    # Start with docker-compose
+    docker-compose up --build -d
+    ```
+
+3. Type `/config` within Continue's VSCode extension, or edit the file located at `~/.continue/config.py` on your system with the following configuration:
+
+    ```py
+    from continuedev.src.continuedev.libs.llm.openai import OpenAI, OpenAIServerInfo
+
+    config = ContinueConfig(
+       ...
+       models=Models(
+            default=OpenAI(
+               api_key="my-api-key",
+               model="gpt-3.5-turbo",
+               openai_server_info=OpenAIServerInfo(
+                  api_base="http://localhost:8080",
+                  model="gpt-3.5-turbo"
+               )
+            )
+       ),
+    )
+    ```
+
+This setup enables you to make queries directly to your model running in the Docker container. Note that the `api_key` does not need to be properly set up; it is included here as a placeholder.
+
+If editing the configuration seems confusing, you may copy and paste the provided default `config.py` file over the existing one in `~/.continue/config.py` after initializing the extension in the VSCode IDE.
+
+## Additional Resources
+
+- [Official Continue documentation](https://continue.dev/docs/intro)
+- [Documentation page on using self-hosted models](https://continue.dev/docs/customization#self-hosting-an-open-source-model)
+- [Official extension link](https://marketplace.visualstudio.com/items?itemName=Continue.continue)
--- a/examples/continue/config.py
+++ b/examples/continue/config.py
@@ -0,0 +1,148 @@
+"""
+This is the Continue configuration file.
+
+See https://continue.dev/docs/customization to learn more.
+"""
+
+import subprocess
+
+from continuedev.src.continuedev.core.main import Step
+from continuedev.src.continuedev.core.sdk import ContinueSDK
+from continuedev.src.continuedev.core.models import Models
+from continuedev.src.continuedev.core.config import CustomCommand, SlashCommand, ContinueConfig
+from continuedev.src.continuedev.plugins.context_providers.github import GitHubIssuesContextProvider
+from continuedev.src.continuedev.plugins.context_providers.google import GoogleContextProvider
+from continuedev.src.continuedev.plugins.policies.default import DefaultPolicy
+from continuedev.src.continuedev.libs.llm.openai import OpenAI, OpenAIServerInfo
+from continuedev.src.continuedev.libs.llm.ggml import GGML
+
+from continuedev.src.continuedev.plugins.steps.open_config import OpenConfigStep
+from continuedev.src.continuedev.plugins.steps.clear_history import ClearHistoryStep
+from continuedev.src.continuedev.plugins.steps.feedback import FeedbackStep
+from continuedev.src.continuedev.plugins.steps.comment_code import CommentCodeStep
+from continuedev.src.continuedev.plugins.steps.share_session import ShareSessionStep
+from continuedev.src.continuedev.plugins.steps.main import EditHighlightedCodeStep
+from continuedev.src.continuedev.plugins.context_providers.search import SearchContextProvider
+from continuedev.src.continuedev.plugins.context_providers.diff import DiffContextProvider
+from continuedev.src.continuedev.plugins.context_providers.url import URLContextProvider
+
+class CommitMessageStep(Step):
+    """
+    This is a Step, the building block of Continue.
+    It can be used below as a slash command, so that
+    run will be called when you type '/commit'.
+    """
+    async def run(self, sdk: ContinueSDK):
+
+        # Get the root directory of the workspace
+        dir = sdk.ide.workspace_directory
+
+        # Run git diff in that directory
+        diff = subprocess.check_output(
+            ["git", "diff"], cwd=dir).decode("utf-8")
+
+        # Ask the LLM to write a commit message,
+        # and set it as the description of this step
+        self.description = await sdk.models.default.complete(
+            f"{diff}\n\nWrite a short, specific (less than 50 chars) commit message about the above changes:")
+
+
+config = ContinueConfig(
+
+    # If set to False, we will not collect any usage data
+    # See here to learn what anonymous data we collect: https://continue.dev/docs/telemetry
+    allow_anonymous_telemetry=True,
+
+    models = Models(
+        default = OpenAI(
+            api_key = "my-api-key",
+            model = "gpt-3.5-turbo",
+            openai_server_info = OpenAIServerInfo(
+                api_base = "http://localhost:8080",
+                model = "gpt-3.5-turbo"
+            )
+        )
+    ),
+    # Set a system message with information that the LLM should always keep in mind
+    # E.g. "Please give concise answers. Always respond in Spanish."
+    system_message=None,
+
+    # Set temperature to any value between 0 and 1. Higher values will make the LLM
+    # more creative, while lower values will make it more predictable.
+    temperature=0.5,
+
+    # Custom commands let you map a prompt to a shortened slash command
+    # They are like slash commands, but more easily defined - write just a prompt instead of a Step class
+    # Their output will always be in chat form
+    custom_commands=[
+        # CustomCommand(
+        #     name="test",
+        #     description="Write unit tests for the higlighted code",
+        #     prompt="Write a comprehensive set of unit tests for the selected code. It should setup, run tests that check for correctness including important edge cases, and teardown. Ensure that the tests are complete and sophisticated. Give the tests just as chat output, don't edit any file.",
+        # )
+    ],
+
+    # Slash commands let you run a Step from a slash command
+    slash_commands=[
+        # SlashCommand(
+        #     name="commit",
+        #     description="This is an example slash command. Use /config to edit it and create more",
+        #     step=CommitMessageStep,
+        # )
+        SlashCommand(
+            name="edit",
+            description="Edit code in the current file or the highlighted code",
+            step=EditHighlightedCodeStep,
+        ),
+        SlashCommand(
+            name="config",
+            description="Customize Continue - slash commands, LLMs, system message, etc.",
+            step=OpenConfigStep,
+        ),
+        SlashCommand(
+            name="comment",
+            description="Write comments for the current file or highlighted code",
+            step=CommentCodeStep,
+        ),
+        SlashCommand(
+            name="feedback",
+            description="Send feedback to improve Continue",
+            step=FeedbackStep,
+        ),
+        SlashCommand(
+            name="clear",
+            description="Clear step history",
+            step=ClearHistoryStep,
+        ),
+        SlashCommand(
+            name="share",
+            description="Download and share the session transcript",
+            step=ShareSessionStep,
+        )
+    ],
+
+    # Context providers let you quickly select context by typing '@'
+    # Uncomment the following to
+    # - quickly reference GitHub issues
+    # - show Google search results to the LLM
+    context_providers=[
+        # GitHubIssuesContextProvider(
+        #     repo_name="<your github username or organization>/<your repo name>",
+        #     auth_token="<your github auth token>"
+        # ),
+        # GoogleContextProvider(
+        #     serper_api_key="<your serper.dev api key>"
+        # )
+        SearchContextProvider(),
+        DiffContextProvider(),
+        URLContextProvider(
+            preset_urls = [
+                # Add any common urls you reference here so they appear in autocomplete
+            ]
+        )
+    ],
+
+    # Policies hold the main logic that decides which Step to take next
+    # You can use them to design agents, or deeply customize Continue
+    policy=DefaultPolicy()
+)
--- a/examples/continue/docker-compose.yml
+++ b/examples/continue/docker-compose.yml
@@ -0,0 +1,27 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    # As initially LocalAI will download the models defined in PRELOAD_MODELS
+    # you might need to tweak the healthcheck values here according to your network connection.
+    # Here we give a timespan of 20m to download all the required files.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
+      interval: 1m
+      timeout: 20m
+      retries: 20
+    build:
+      context: ../../
+      dockerfile: Dockerfile
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+      # You can preload different models here as well.
+      # See: https://github.com/go-skynet/model-gallery
+      - 'PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}]'
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
--- a/examples/continue/img/screen.png
+++ b/examples/continue/img/screen.png
--- a/examples/insomnia/Insomnia_LocalAI.json
+++ b/examples/insomnia/Insomnia_LocalAI.json
--- a/examples/insomnia/README.md
+++ b/examples/insomnia/README.md
@@ -0,0 +1,17 @@
+# Insomnia
+
+Developer Testing Request Collection for [Insomnia](https://insomnia.rest/), an open-source REST client
+
+## Instructions
+
+* Install Insomnia as normal
+* [Import](https://docs.insomnia.rest/insomnia/import-export-data) `Insomnia_LocalAI.json`
+* Control + E opens the environment settings - 
+
+| **Parameter Name** | **Default Value** | **Description**                          |
+|--------------------|-------------------|------------------------------------------|
+| HOST               | localhost         | LocalAI base URL                         |
+| PORT               | 8080              | LocalAI port                             |
+| DEFAULT_MODEL      | gpt-3.5-turbo     | Name of the model used on most requests. |
+
+** you may want to duplicate localhost into a "Private" environment to avoid saving private settings back to this file **
--- a/extra/grpc/autogptq/autogptq.py
+++ b/extra/grpc/autogptq/autogptq.py
@@ -77,7 +77,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/extra/grpc/autogptq/backend_pb2.py
+++ b/extra/grpc/autogptq/backend_pb2.py
--- a/extra/grpc/autogptq/backend_pb2_grpc.py
+++ b/extra/grpc/autogptq/backend_pb2_grpc.py
@@ -54,6 +54,16 @@ class BackendStub(object):
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
+        self.TokenizeString = channel.unary_unary(
+                '/backend.Backend/TokenizeString',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.TokenizationResponse.FromString,
+                )
+        self.Status = channel.unary_unary(
+                '/backend.Backend/Status',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.StatusResponse.FromString,
+                )


 class BackendServicer(object):
@@ -107,6 +117,18 @@ class BackendServicer(object):
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

+    def TokenizeString(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Status(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+

 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
@@ -150,6 +172,16 @@ def add_BackendServicer_to_server(servicer, server):
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
+            'TokenizeString': grpc.unary_unary_rpc_method_handler(
+                    servicer.TokenizeString,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
+            ),
+            'Status': grpc.unary_unary_rpc_method_handler(
+                    servicer.Status,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
+            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
@@ -295,3 +327,37 @@ class Backend(object):
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TokenizeString(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.TokenizationResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Status(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.StatusResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/bark/backend_pb2.py
+++ b/extra/grpc/bark/backend_pb2.py
--- a/extra/grpc/bark/backend_pb2_grpc.py
+++ b/extra/grpc/bark/backend_pb2_grpc.py
@@ -54,6 +54,16 @@ class BackendStub(object):
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
+        self.TokenizeString = channel.unary_unary(
+                '/backend.Backend/TokenizeString',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.TokenizationResponse.FromString,
+                )
+        self.Status = channel.unary_unary(
+                '/backend.Backend/Status',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.StatusResponse.FromString,
+                )


 class BackendServicer(object):
@@ -107,6 +117,18 @@ class BackendServicer(object):
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

+    def TokenizeString(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Status(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+

 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
@@ -150,6 +172,16 @@ def add_BackendServicer_to_server(servicer, server):
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
+            'TokenizeString': grpc.unary_unary_rpc_method_handler(
+                    servicer.TokenizeString,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
+            ),
+            'Status': grpc.unary_unary_rpc_method_handler(
+                    servicer.Status,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
+            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
@@ -295,3 +327,37 @@ class Backend(object):
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TokenizeString(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.TokenizationResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Status(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.StatusResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/bark/ttsbark.py
+++ b/extra/grpc/bark/ttsbark.py
@@ -51,7 +51,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/extra/grpc/diffusers/backend_diffusers.py
+++ b/extra/grpc/diffusers/backend_diffusers.py
@@ -12,9 +12,110 @@ import os
 # import diffusers
 import torch
 from torch import autocast
-from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
+from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
+from diffusers.pipelines.stable_diffusion import safety_checker
+from compel import Compel
+from PIL import Image
+from io import BytesIO
+from diffusers import StableDiffusionImg2ImgPipeline
+from transformers import CLIPTextModel
+from enum import Enum

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
+COMPEL=os.environ.get("COMPEL", "1") == "1"
+CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1"
+
+# https://github.com/CompVis/stable-diffusion/issues/239#issuecomment-1627615287
+def sc(self, clip_input, images) : return images, [False for i in images]
+# edit the StableDiffusionSafetyChecker class so that, when called, it just returns the images and an array of True values
+safety_checker.StableDiffusionSafetyChecker.forward = sc
+
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    DPMSolverSinglestepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    KDPM2AncestralDiscreteScheduler,
+    KDPM2DiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UniPCMultistepScheduler,
+)
+# The scheduler list mapping was taken from here: https://github.com/neggles/animatediff-cli/blob/6f336f5f4b5e38e85d7f06f1744ef42d0a45f2a7/src/animatediff/schedulers.py#L39
+# Credits to https://github.com/neggles
+# See https://github.com/huggingface/diffusers/issues/4167 for more details on sched mapping from A1111
+class DiffusionScheduler(str, Enum):
+    ddim = "ddim"  # DDIM
+    pndm = "pndm"  # PNDM
+    heun = "heun"  # Heun
+    unipc = "unipc"  # UniPC
+    euler = "euler"  # Euler
+    euler_a = "euler_a"  # Euler a
+
+    lms = "lms"  # LMS
+    k_lms = "k_lms"  # LMS Karras
+
+    dpm_2 = "dpm_2"  # DPM2
+    k_dpm_2 = "k_dpm_2"  # DPM2 Karras
+
+    dpm_2_a = "dpm_2_a"  # DPM2 a
+    k_dpm_2_a = "k_dpm_2_a"  # DPM2 a Karras
+
+    dpmpp_2m = "dpmpp_2m"  # DPM++ 2M
+    k_dpmpp_2m = "k_dpmpp_2m"  # DPM++ 2M Karras
+
+    dpmpp_sde = "dpmpp_sde"  # DPM++ SDE
+    k_dpmpp_sde = "k_dpmpp_sde"  # DPM++ SDE Karras
+
+    dpmpp_2m_sde = "dpmpp_2m_sde"  # DPM++ 2M SDE
+    k_dpmpp_2m_sde = "k_dpmpp_2m_sde"  # DPM++ 2M SDE Karras
+
+
+def get_scheduler(name: str, config: dict = {}):
+    is_karras = name.startswith("k_")
+    if is_karras:
+        # strip the k_ prefix and add the karras sigma flag to config
+        name = name.lstrip("k_")
+        config["use_karras_sigmas"] = True
+
+    if name == DiffusionScheduler.ddim:
+        sched_class = DDIMScheduler
+    elif name == DiffusionScheduler.pndm:
+        sched_class = PNDMScheduler
+    elif name == DiffusionScheduler.heun:
+        sched_class = HeunDiscreteScheduler
+    elif name == DiffusionScheduler.unipc:
+        sched_class = UniPCMultistepScheduler
+    elif name == DiffusionScheduler.euler:
+        sched_class = EulerDiscreteScheduler
+    elif name == DiffusionScheduler.euler_a:
+        sched_class = EulerAncestralDiscreteScheduler
+    elif name == DiffusionScheduler.lms:
+        sched_class = LMSDiscreteScheduler
+    elif name == DiffusionScheduler.dpm_2:
+        # Equivalent to DPM2 in K-Diffusion
+        sched_class = KDPM2DiscreteScheduler
+    elif name == DiffusionScheduler.dpm_2_a:
+        # Equivalent to `DPM2 a`` in K-Diffusion
+        sched_class = KDPM2AncestralDiscreteScheduler
+    elif name == DiffusionScheduler.dpmpp_2m:
+        # Equivalent to `DPM++ 2M` in K-Diffusion
+        sched_class = DPMSolverMultistepScheduler
+        config["algorithm_type"] = "dpmsolver++"
+        config["solver_order"] = 2
+    elif name == DiffusionScheduler.dpmpp_sde:
+        # Equivalent to `DPM++ SDE` in K-Diffusion
+        sched_class = DPMSolverSinglestepScheduler
+    elif name == DiffusionScheduler.dpmpp_2m_sde:
+        # Equivalent to `DPM++ 2M SDE` in K-Diffusion
+        sched_class = DPMSolverMultistepScheduler
+        config["algorithm_type"] = "sde-dpmsolver++"
+    else:
+        raise ValueError(f"Invalid scheduler '{'k_' if is_karras else ''}{name}'")
+
+    return sched_class.from_config(config)

 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
@@ -28,32 +129,88 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            if request.F16Memory:
                torchType = torch.float16

+            local = False
+            modelFile = request.Model
+
+            cfg_scale = 7
+            if request.CFGScale != 0:
+                cfg_scale = request.CFGScale
+            
+            clipmodel = "runwayml/stable-diffusion-v1-5"
+            if request.CLIPModel != "":
+                clipmodel = request.CLIPModel
+            clipsubfolder = "text_encoder"
+            if request.CLIPSubfolder != "":
+                clipsubfolder = request.CLIPSubfolder
+            
+            # Check if ModelFile exists
+            if request.ModelFile != "":
+                if os.path.exists(request.ModelFile):
+                    local = True
+                    modelFile = request.ModelFile
+            
+            fromSingleFile = request.Model.startswith("http") or request.Model.startswith("/") or local
+            
+            if request.IMG2IMG and request.PipelineType == "":
+                request.PipelineType == "StableDiffusionImg2ImgPipeline"
+                
            if request.PipelineType == "":
                request.PipelineType == "StableDiffusionPipeline"

+            ## img2img
+            if request.PipelineType == "StableDiffusionImg2ImgPipeline":
+                if fromSingleFile:
+                    self.pipe = StableDiffusionImg2ImgPipeline.from_single_file(modelFile,
+                                torch_dtype=torchType,
+                                guidance_scale=cfg_scale)
+                else:
+                    self.pipe = StableDiffusionImg2ImgPipeline.from_pretrained(request.Model,
+                                torch_dtype=torchType,
+                                guidance_scale=cfg_scale)
+
+            if request.PipelineType == "StableDiffusionDepth2ImgPipeline":
+                self.pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(request.Model,
+                            torch_dtype=torchType,
+                            guidance_scale=cfg_scale)
+            ## text2img
            if request.PipelineType == "StableDiffusionPipeline":
-                self.pipe = StableDiffusionPipeline.from_pretrained(request.Model,
-                                                               torch_dtype=torchType)
+                if fromSingleFile:
+                    self.pipe = StableDiffusionPipeline.from_single_file(modelFile,
+                                                        torch_dtype=torchType,
+                                                        guidance_scale=cfg_scale)
+                else:
+                    self.pipe = StableDiffusionPipeline.from_pretrained(request.Model,
+                                                        torch_dtype=torchType,
+                                                        guidance_scale=cfg_scale)

            if request.PipelineType == "DiffusionPipeline":
                self.pipe = DiffusionPipeline.from_pretrained(request.Model,
-                                                               torch_dtype=torchType)
+                                                        torch_dtype=torchType,
+                                                        guidance_scale=cfg_scale)

            if request.PipelineType == "StableDiffusionXLPipeline":
-                self.pipe = StableDiffusionXLPipeline.from_pretrained(
-                    request.Model, 
-                    torch_dtype=torchType, 
-                    use_safetensors=True, 
-             #       variant="fp16"
-                    )
-          
+                if fromSingleFile:
+                    self.pipe = StableDiffusionXLPipeline.from_single_file(modelFile,
+                                                               torch_dtype=torchType, use_safetensors=True,
+                                                               guidance_scale=cfg_scale)
+                else:
+                    self.pipe = StableDiffusionXLPipeline.from_pretrained(
+                        request.Model, 
+                        torch_dtype=torchType, 
+                        use_safetensors=True, 
+                #       variant="fp16"
+                        guidance_scale=cfg_scale)
+            # https://github.com/huggingface/diffusers/issues/4446
+            # do not use text_encoder in the constructor since then
+            # https://github.com/huggingface/diffusers/issues/3212#issuecomment-1521841481
+            if CLIPSKIP and request.CLIPSkip != 0:
+                text_encoder = CLIPTextModel.from_pretrained(clipmodel, num_hidden_layers=request.CLIPSkip,  subfolder=clipsubfolder, torch_dtype=torchType)
+                self.pipe.text_encoder=text_encoder
            # torch_dtype needs to be customized. float16 for GPU, float32 for CPU
            # TODO: this needs to be customized
-            if request.SchedulerType == "EulerAncestralDiscreteScheduler":
-                self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
-            if request.SchedulerType == "DPMSolverMultistepScheduler":
-                self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config)
-
+            if request.SchedulerType != "":
+                self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config)
+            self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
            if request.CUDA:
                self.pipe.to('cuda')
        except Exception as err:
@@ -64,25 +221,53 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    def GenerateImage(self, request, context):

        prompt = request.positive_prompt
-        negative_prompt = request.negative_prompt

-        image = self.pipe(
-            prompt, 
-            negative_prompt=negative_prompt, 
-            width=request.width,
-            height=request.height,
-      #      guidance_scale=12,
-            target_size=(request.width,request.height),
-            original_size=(4096,4096),
-            num_inference_steps=request.step
-            ).images[0]
+        # create a dictionary of values for the parameters
+        options = {
+            "negative_prompt":     request.negative_prompt, 
+            "width":               request.width, 
+            "height":              request.height,
+            "num_inference_steps": request.step,
+        }

+        if request.src != "":
+            image = Image.open(request.src)
+            options["image"] = image
+
+        # Get the keys that we will build the args for our pipe for
+        keys = options.keys()
+
+        if request.EnableParameters != "":
+            keys = request.EnableParameters.split(",")
+
+        if request.EnableParameters == "none":
+            keys = []
+
+        # create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
+        kwargs = {key: options[key] for key in keys}
+
+        image = {}
+        if COMPEL:
+            conditioning = self.compel.build_conditioning_tensor(prompt)
+            kwargs["prompt_embeds"]= conditioning
+            # pass the kwargs dictionary to the self.pipe method
+            image = self.pipe( 
+                **kwargs
+                ).images[0] 
+        else:
+            # pass the kwargs dictionary to the self.pipe method
+            image = self.pipe(
+                prompt, 
+                **kwargs
+                ).images[0]
+
+        # save the result
        image.save(request.dst)

        return backend_pb2.Result(message="Model loaded successfully", success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/extra/grpc/diffusers/backend_pb2.py
+++ b/extra/grpc/diffusers/backend_pb2.py
--- a/extra/grpc/diffusers/backend_pb2_grpc.py
+++ b/extra/grpc/diffusers/backend_pb2_grpc.py
@@ -54,6 +54,16 @@ class BackendStub(object):
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
+        self.TokenizeString = channel.unary_unary(
+                '/backend.Backend/TokenizeString',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.TokenizationResponse.FromString,
+                )
+        self.Status = channel.unary_unary(
+                '/backend.Backend/Status',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.StatusResponse.FromString,
+                )


 class BackendServicer(object):
@@ -107,6 +117,18 @@ class BackendServicer(object):
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

+    def TokenizeString(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Status(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+

 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
@@ -150,6 +172,16 @@ def add_BackendServicer_to_server(servicer, server):
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
+            'TokenizeString': grpc.unary_unary_rpc_method_handler(
+                    servicer.TokenizeString,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
+            ),
+            'Status': grpc.unary_unary_rpc_method_handler(
+                    servicer.Status,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
+            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
@@ -295,3 +327,37 @@ class Backend(object):
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TokenizeString(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.TokenizationResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Status(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.StatusResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/exllama/backend_pb2.py
+++ b/extra/grpc/exllama/backend_pb2.py
--- a/extra/grpc/exllama/backend_pb2_grpc.py
+++ b/extra/grpc/exllama/backend_pb2_grpc.py
@@ -54,6 +54,16 @@ class BackendStub(object):
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
+        self.TokenizeString = channel.unary_unary(
+                '/backend.Backend/TokenizeString',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.TokenizationResponse.FromString,
+                )
+        self.Status = channel.unary_unary(
+                '/backend.Backend/Status',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.StatusResponse.FromString,
+                )


 class BackendServicer(object):
@@ -107,6 +117,18 @@ class BackendServicer(object):
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

+    def TokenizeString(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Status(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+

 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
@@ -150,6 +172,16 @@ def add_BackendServicer_to_server(servicer, server):
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
+            'TokenizeString': grpc.unary_unary_rpc_method_handler(
+                    servicer.TokenizeString,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
+            ),
+            'Status': grpc.unary_unary_rpc_method_handler(
+                    servicer.Status,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
+            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
@@ -295,3 +327,37 @@ class Backend(object):
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TokenizeString(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.TokenizationResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Status(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.StatusResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/exllama/exllama.py
+++ b/extra/grpc/exllama/exllama.py
@@ -110,7 +110,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/extra/grpc/huggingface/backend_pb2.py
+++ b/extra/grpc/huggingface/backend_pb2.py
--- a/extra/grpc/huggingface/backend_pb2_grpc.py
+++ b/extra/grpc/huggingface/backend_pb2_grpc.py
@@ -54,6 +54,16 @@ class BackendStub(object):
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
+        self.TokenizeString = channel.unary_unary(
+                '/backend.Backend/TokenizeString',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.TokenizationResponse.FromString,
+                )
+        self.Status = channel.unary_unary(
+                '/backend.Backend/Status',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.StatusResponse.FromString,
+                )


 class BackendServicer(object):
@@ -107,6 +117,18 @@ class BackendServicer(object):
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

+    def TokenizeString(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Status(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+

 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
@@ -150,6 +172,16 @@ def add_BackendServicer_to_server(servicer, server):
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
+            'TokenizeString': grpc.unary_unary_rpc_method_handler(
+                    servicer.TokenizeString,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
+            ),
+            'Status': grpc.unary_unary_rpc_method_handler(
+                    servicer.Status,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
+            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
@@ -295,3 +327,37 @@ class Backend(object):
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TokenizeString(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.TokenizationResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Status(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.StatusResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/huggingface/huggingface.py
+++ b/extra/grpc/huggingface/huggingface.py
@@ -34,7 +34,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/extra/requirements.txt
+++ b/extra/requirements.txt
@@ -1,4 +1,7 @@
 sentence_transformers
 grpcio
 google
-protobuf
+protobuf
+six
+omegaconf
+compel
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/go-skynet/LocalAI

-go 1.20
+go 1.21

 require (
 	github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df
@@ -9,25 +9,25 @@ require (
 	github.com/go-skynet/bloomz.cpp v0.0.0-20230529155654-1834e77b83fa
 	github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1
 	github.com/go-skynet/go-ggml-transformers.cpp v0.0.0-20230714203132-ffb09d7dd71e
-	github.com/go-skynet/go-llama.cpp v0.0.0-20230802220037-50cee7712066
+	github.com/go-skynet/go-llama.cpp v0.0.0-20230825165210-bf63302a2be7
 	github.com/gofiber/fiber/v2 v2.48.0
-	github.com/google/uuid v1.3.0
+	github.com/google/uuid v1.3.1
 	github.com/hashicorp/go-multierror v1.1.1
 	github.com/hpcloud/tail v1.0.0
 	github.com/imdario/mergo v0.3.16
 	github.com/json-iterator/go v1.1.12
 	github.com/mholt/archiver/v3 v3.5.1
 	github.com/mudler/go-ggllm.cpp v0.0.0-20230709223052-862477d16eef
-	github.com/mudler/go-processmanager v0.0.0-20220724164624-c45b5c61312d
+	github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c
 	github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af
-	github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230811181453-4d855afe973a
-	github.com/onsi/ginkgo/v2 v2.11.0
+	github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230823205330-27a8b020c36b
+	github.com/onsi/ginkgo/v2 v2.12.0
 	github.com/onsi/gomega v1.27.10
-	github.com/otiai10/openaigo v1.5.2
+	github.com/otiai10/openaigo v1.6.0
 	github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
 	github.com/rs/zerolog v1.30.0
-	github.com/sashabaranov/go-openai v1.14.1
-	github.com/tmc/langchaingo v0.0.0-20230811231558-fd8b7f099537
+	github.com/sashabaranov/go-openai v1.14.2
+	github.com/tmc/langchaingo v0.0.0-20230826022619-1e2a401097d6
 	github.com/urfave/cli/v2 v2.25.7
 	github.com/valyala/fasthttp v1.48.0
 	google.golang.org/grpc v1.57.0
@@ -36,6 +36,17 @@ require (
 	gopkg.in/yaml.v3 v3.0.1
 )

+require (
+	github.com/go-ole/go-ole v1.2.6 // indirect
+	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
+	github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
+	github.com/shirou/gopsutil/v3 v3.23.7
+	github.com/shoenig/go-m1cpu v0.1.6 // indirect
+	github.com/tklauser/go-sysconf v0.3.11 // indirect
+	github.com/tklauser/numcpus v0.6.0 // indirect
+	github.com/yusufpapurcu/wmi v1.2.3 // indirect
+)
+
 require (
 	github.com/dlclark/regexp2 v1.8.1 // indirect
 	github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect
@@ -50,7 +61,6 @@ require (
 	github.com/pkoukk/tiktoken-go v0.1.2 // indirect
 	github.com/ulikunitz/xz v0.5.9 // indirect
 	github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
-	google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 // indirect
 	google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 // indirect
 	gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
 	gopkg.in/fsnotify.v1 v1.4.7 // indirect
@@ -77,8 +87,8 @@ require (
 	github.com/valyala/bytebufferpool v1.0.0 // indirect
 	github.com/valyala/tcplisten v1.0.0 // indirect
 	github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
-	golang.org/x/net v0.12.0 // indirect
-	golang.org/x/sys v0.10.0 // indirect
-	golang.org/x/text v0.11.0 // indirect
-	golang.org/x/tools v0.9.3 // indirect
+	golang.org/x/net v0.14.0 // indirect
+	golang.org/x/sys v0.11.0 // indirect
+	golang.org/x/text v0.12.0 // indirect
+	golang.org/x/tools v0.12.0 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -13,8 +13,6 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dlclark/regexp2 v1.8.1 h1:6Lcdwya6GjPUNsBct8Lg/yRPwMhABj269AAzdGSiR+0=
 github.com/dlclark/regexp2 v1.8.1/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
-github.com/donomii/go-rwkv.cpp v0.0.0-20230619005719-f5a8c4539674 h1:G70Yf/QOCEL1v24idWnGd6rJsbqiGkJAJnMaWaolzEg=
-github.com/donomii/go-rwkv.cpp v0.0.0-20230619005719-f5a8c4539674/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM=
 github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df h1:qVcBEZlvp5A1gGWNJj02xyDtbsUI2hohlQMSB1fgER4=
 github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM=
 github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L7HYpRu/0lE3e0BaElwnNO1qkNQxBY=
@@ -33,22 +31,18 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
 github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
 github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
+github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
 github.com/go-skynet/bloomz.cpp v0.0.0-20230529155654-1834e77b83fa h1:gxr68r/6EWroay4iI81jxqGCDbKotY4+CiwdUkBz2NQ=
 github.com/go-skynet/bloomz.cpp v0.0.0-20230529155654-1834e77b83fa/go.mod h1:wc0fJ9V04yiYTfgKvE5RUUSRQ5Kzi0Bo4I+U3nNOUuA=
 github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1 h1:yXvc7QfGtoZ51tUW/YVjoTwAfh8HG88XU7UOrbNlz5Y=
 github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1/go.mod h1:fYjkCDRzC+oRLHSjQoajmYK6AmeJnmEanV27CClAcDc=
 github.com/go-skynet/go-ggml-transformers.cpp v0.0.0-20230714203132-ffb09d7dd71e h1:4reMY29i1eOZaRaSTMPNyXI7X8RMNxCTfDDBXYzrbr0=
 github.com/go-skynet/go-ggml-transformers.cpp v0.0.0-20230714203132-ffb09d7dd71e/go.mod h1:31j1odgFXP8hDSUVfH0zErKI5aYVP18ddYnPkwCso2A=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230709163512-6c97625cca76 h1:NRdxo2MKi8qhWZXxu6CIZOkdH+LBERFz1kk22U1FD3k=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230709163512-6c97625cca76/go.mod h1:tzi97YvT1bVQ+iTG39LvpDkKG1WbizgtljC+orSoM40=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230724222459-562d2b5a7119 h1:FeUSk5yMHT7J7jeCQKAOs4x5LRNSYH0SR6djM/i1jcc=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230724222459-562d2b5a7119/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230727163958-6ba16de8e965 h1:2MO/rABKpkXnnKQ3Ar90aqhnlMEejE9gnKG6bafv+ow=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230727163958-6ba16de8e965/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230729200103-8c51308e42d7 h1:1uBwholTaJ8Lva8ySJjT4jNaCDAh+MJXtsbZBbQq9lA=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230729200103-8c51308e42d7/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230802220037-50cee7712066 h1:v4Js+yEdgY9IV7n35M+5MELLxlOMp3qC5whZm5YTLjI=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230802220037-50cee7712066/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230815201253-f03869d188b7 h1:d/FXe1a55gCLf124uRYYtlYg6KvI7OI33xaFejQUAws=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230815201253-f03869d188b7/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230825165210-bf63302a2be7 h1:tWzT53oxcuw1xKCd058IX19eItt96WPdQPeqi574CHo=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230825165210-bf63302a2be7/go.mod h1:yEf77qqgs/YqcrFmgzLiPIvaeteeII/y/YfcQ66b1rM=
 github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls=
@@ -72,6 +66,7 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw
 github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
 github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
@@ -79,6 +74,8 @@ github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE
 github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
 github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
 github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4=
+github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
 github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
 github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
@@ -103,6 +100,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
+github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
 github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
@@ -124,22 +123,16 @@ github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760 h1:OFVkSxR7CRSRSNm
 github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760/go.mod h1:O7SwdSWMilAWhBZMK9N9Y/oBDyMMzshE3ju8Xkexwig=
 github.com/mudler/go-processmanager v0.0.0-20220724164624-c45b5c61312d h1:/lAg9vPAAU+s35cDMCx1IyeMn+4OYfCBPqi08Q8vXDg=
 github.com/mudler/go-processmanager v0.0.0-20220724164624-c45b5c61312d/go.mod h1:HGGAOJhipApckwNV8ZTliRJqxctUv3xRY+zbQEwuytc=
+github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c h1:CI5uGwqBpN8N7BrSKC+nmdfw+9nPQIDyjHHlaIiitZI=
+github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c/go.mod h1:gY3wyrhkRySJtmtI/JPt4a2mKv48h/M9pEZIW+SjeC0=
 github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af h1:XFq6OUqsWQam0OrEr05okXsJK/TQur3zoZTHbiZD3Ks=
 github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af/go.mod h1:8ufRkpz/S/9ahkaxzZ5i4WMgO9w4InEhuRoT7vK5Rnw=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230714185456-cfd70b69fcf5 h1:bmQnxyKiqCu8i2y/N/Sf0coWoG2/Ed12YGQeb7lTnjo=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230714185456-cfd70b69fcf5/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230725212419-9100b2ef6fb9 h1:/oRwZhulKTU8LpPD2fXi2o2kdlTutQjYWDVMkrv14po=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230725212419-9100b2ef6fb9/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230727161923-39acbc837816 h1:hRi7hpDUuaO0dB4NZ8eyaeD2fRar6CPyNAARsO5DhzA=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230727161923-39acbc837816/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230731161838-cbdcde8b7586 h1:WVEMSZMyHFe68PN204c3Fdk5g2lZouPvbU9/2zkPpWc=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230731161838-cbdcde8b7586/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230802145814-c449b71b56de h1:E5EGczxEAcbaO8yqj074MQxU609QbtB6in3qTOW1EFo=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230802145814-c449b71b56de/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230807175413-0f2bb506a8ee h1:Y/j+GNytyncmDnAEuDZwzkYC9nzUPvXJPF+nntQG0VU=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230807175413-0f2bb506a8ee/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230811181453-4d855afe973a h1:bX26Zfwh72ug2aZTEwFISTMEJ56Wa/4KqboidD+g92A=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230811181453-4d855afe973a/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230815171941-a63093554fb5 h1:b4EeYDaGxOLNlNm5LOVEmrUhaw1v6xq/V79ZwWVlY6I=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230815171941-a63093554fb5/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230817205628-36f7fb584824 h1:Q8CV2o6XPucrhu3jFhQPZj5OzJT3Mgtv8JdoCpQVEgg=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230817205628-36f7fb584824/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230823205330-27a8b020c36b h1:M6rq7NEbunl3NecCDkPcp7U27PTRdKS6ujtZxcstDjs=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230823205330-27a8b020c36b/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
 github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ=
 github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
 github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
@@ -151,16 +144,18 @@ github.com/onsi/ginkgo v1.16.4 h1:29JGrr5oVBm5ulCWet69zQkzWipVXIol6ygQUe/EzNc=
 github.com/onsi/ginkgo v1.16.4/go.mod h1:dX+/inL/fNMqNlz0e9LfyB9TswhZpCVdJM/Z6Vvnwo0=
 github.com/onsi/ginkgo/v2 v2.11.0 h1:WgqUCUt/lT6yXoQ8Wef0fsNn5cAuMK7+KT9UFRz2tcU=
 github.com/onsi/ginkgo/v2 v2.11.0/go.mod h1:ZhrRA5XmEE3x3rhlzamx/JJvujdZoJ2uvgI7kR0iZvM=
+github.com/onsi/ginkgo/v2 v2.12.0 h1:UIVDowFPwpg6yMUpPjGkYvf06K3RAiJXUhCxEwQVHRI=
+github.com/onsi/ginkgo/v2 v2.12.0/go.mod h1:ZNEzXISYlqpb8S36iN71ifqLi3vVD1rVJGvWRCJOUpQ=
 github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY=
 github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo=
 github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY=
-github.com/onsi/gomega v1.27.8 h1:gegWiwZjBsf2DgiSbf5hpokZ98JVDMcWkUiigk6/KXc=
-github.com/onsi/gomega v1.27.8/go.mod h1:2J8vzI/s+2shY9XHRApDkdgPo1TKT7P2u6fXeJKFnNQ=
 github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI=
 github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M=
 github.com/otiai10/mint v1.6.1 h1:kgbTJmOpp/0ce7hk3H8jiSuR0MXmpwWRfqUdKww17qg=
 github.com/otiai10/openaigo v1.5.2 h1:YnNDisZmA4syArF3IxMCIrfgZOq30PLV219gPY7n2z8=
 github.com/otiai10/openaigo v1.5.2/go.mod h1:kIaXc3V+Xy5JLplcBxehVyGYDtufHp3PFPy04jOwOAI=
+github.com/otiai10/openaigo v1.6.0 h1:YTQEbtDSvawETOB/Kmb/6JvuHdHH/eIpSQfHVufiwY8=
+github.com/otiai10/openaigo v1.6.0/go.mod h1:kIaXc3V+Xy5JLplcBxehVyGYDtufHp3PFPy04jOwOAI=
 github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI=
 github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5/go.mod h1:iIss55rKnNBTvrwdmkUpLnDpZoAHvWaiq5+iMmen4AE=
 github.com/pierrec/lz4/v4 v4.1.2 h1:qvY3YFXRQE/XB8MlLzJH7mSzBs74eA2gg52YTk6jUPM=
@@ -170,37 +165,45 @@ github.com/pkoukk/tiktoken-go v0.1.2 h1:u7PCSBiWJ3nJYoTGShyM9iHXz4dNyYkurwwp+GHt
 github.com/pkoukk/tiktoken-go v0.1.2/go.mod h1:boMWvk9pQCOTx11pgu0DrIdrAKgQzzJKUP6vLXaz7Rw=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
+github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
 github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
-github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
 github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
-github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
-github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
 github.com/rs/zerolog v1.30.0 h1:SymVODrcRsaRaSInD9yQtKbtWqwsfoPcRff/oRXLj4c=
 github.com/rs/zerolog v1.30.0/go.mod h1:/tk+P47gFdPXq4QYjvCmT5/Gsug2nagsFWBWhAiSi1w=
 github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/sashabaranov/go-openai v1.14.0 h1:D1yAB+DHElgbJFdYyjxfTWMFzhddn+PwZmkQ039L7mQ=
-github.com/sashabaranov/go-openai v1.14.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
-github.com/sashabaranov/go-openai v1.14.1 h1:jqfkdj8XHnBF84oi2aNtT8Ktp3EJ0MfuVjvcMkfI0LA=
-github.com/sashabaranov/go-openai v1.14.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
+github.com/sashabaranov/go-openai v1.14.2 h1:5DPTtR9JBjKPJS008/A409I5ntFhUPPGCmaAihcPRyo=
+github.com/sashabaranov/go-openai v1.14.2/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
+github.com/shirou/gopsutil/v3 v3.23.6 h1:5y46WPI9QBKBbK7EEccUPNXpJpNrvPuTD0O2zHEHT08=
+github.com/shirou/gopsutil/v3 v3.23.6/go.mod h1:j7QX50DrXYggrpN30W0Mo+I4/8U2UUIQrnrhqUeWrAU=
+github.com/shirou/gopsutil/v3 v3.23.7 h1:C+fHO8hfIppoJ1WdsVm1RoI0RwXoNdfTK7yWXV0wVj4=
+github.com/shirou/gopsutil/v3 v3.23.7/go.mod h1:c4gnmoRC0hQuaLqvxnx1//VXQ0Ms/X9UnJF8pddY5z4=
+github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM=
+github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ=
+github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU=
+github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
-github.com/tmc/langchaingo v0.0.0-20230713201705-dcf7ecdc8ac8 h1:wdJigYmmIRCuXhCkADDr53Oa1fp/WlxCPoVXR2r7GrU=
-github.com/tmc/langchaingo v0.0.0-20230713201705-dcf7ecdc8ac8/go.mod h1:mTzgQfAGwmBz2hhQELZfu2bwsbHwyKHA6IHOa+9LDFg=
-github.com/tmc/langchaingo v0.0.0-20230726025230-7d5f9fd5e90a h1:I/2JSuYXkWaVVLSZmrPfrgbvvvPR0IaulZcB0Iu8oVI=
-github.com/tmc/langchaingo v0.0.0-20230726025230-7d5f9fd5e90a/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
-github.com/tmc/langchaingo v0.0.0-20230729232647-7df4fe5fb8fe h1:+XVrCjh3rPibfISkUFG2Ck5NLKODQ9cFdmraFye1bGA=
-github.com/tmc/langchaingo v0.0.0-20230729232647-7df4fe5fb8fe/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
-github.com/tmc/langchaingo v0.0.0-20230731024823-8f101609f600 h1:SABuIthjhIXEsxnokuA16CZOxxdW9XohIHQqd/go8Nc=
-github.com/tmc/langchaingo v0.0.0-20230731024823-8f101609f600/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
-github.com/tmc/langchaingo v0.0.0-20230802030916-271e9bd7e7c5 h1:js7vYDJGzUGVSt0YlIusUc5BXYVECu3LUI/asby5Ggo=
-github.com/tmc/langchaingo v0.0.0-20230802030916-271e9bd7e7c5/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
-github.com/tmc/langchaingo v0.0.0-20230811231558-fd8b7f099537 h1:vkeNjlW+0Xiw2XizMHoQuLG8pg6AN1hU8zJuMV9GQBc=
-github.com/tmc/langchaingo v0.0.0-20230811231558-fd8b7f099537/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM=
+github.com/tklauser/go-sysconf v0.3.11/go.mod h1:GqXfhXY3kiPa0nAXPDIQIWzJbMCB7AmcWpGR8lSZfqI=
+github.com/tklauser/numcpus v0.6.0 h1:kebhY2Qt+3U6RNK7UqpYNA+tJ23IBEGKkB7JQBfDYms=
+github.com/tklauser/numcpus v0.6.0/go.mod h1:FEZLMke0lhOUG6w2JadTzp0a+Nl8PF/GFkQ5UVIcaL4=
+github.com/tmc/langchaingo v0.0.0-20230815194031-eb0cbd31327d h1:RBu2wOoyzxNxYTitUKVNDtU1H6T4Tu5skOwvZabnPFc=
+github.com/tmc/langchaingo v0.0.0-20230815194031-eb0cbd31327d/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
+github.com/tmc/langchaingo v0.0.0-20230818002435-fef08210d974 h1:nlthMb2uN8S1v0XAE5bzFwz6v+Wb4wWEkTb7tdt+pOY=
+github.com/tmc/langchaingo v0.0.0-20230818002435-fef08210d974/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
+github.com/tmc/langchaingo v0.0.0-20230826022619-1e2a401097d6 h1:G5ZejMgwwIYVlBVcZTGdewJPYdAljMrN/+1PH6P+Jtw=
+github.com/tmc/langchaingo v0.0.0-20230826022619-1e2a401097d6/go.mod h1:vCdA1t5qnS5YPkDsznowOziBHFn0Ul11ZqfJ2GOAi0s=
 github.com/ulikunitz/xz v0.5.8/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
 github.com/ulikunitz/xz v0.5.9 h1:RsKRIA2MO8x56wkkcd3LbtcE/uMszhb6DpRf+3uwa3I=
 github.com/ulikunitz/xz v0.5.9/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
@@ -217,6 +220,8 @@ github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMx
 github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU=
 github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw=
+github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
@@ -228,10 +233,10 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
 golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk=
-golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
-golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
 golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50=
 golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA=
+golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14=
+golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -239,45 +244,47 @@ golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5h
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA=
 golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM=
+golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
-golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/text v0.11.0 h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4=
 golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc=
+golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
 golang.org/x/tools v0.9.3 h1:Gn1I8+64MsuTb/HpH+LmQtNas23LhUVr3rYZ0eKuaMM=
 golang.org/x/tools v0.9.3/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc=
+golang.org/x/tools v0.12.0 h1:YW6HUoUmYBpwSgyaGaZq1fHjrBjX1rlpZ54T6mu2kss=
+golang.org/x/tools v0.12.0/go.mod h1:Sc0INKfu04TlqNoRA1hgpFZbhYXHPr4V5DzpSBTPqQM=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A=
-google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU=
-google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 h1:9NWlQfY2ePejTmfwUH1OWwmznFa+0kKcHGPDvcPza9M=
-google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54/go.mod h1:zqTuNwFlFRsw5zIts5VnzLQxSRqh+CGOTVMlYbY0Eyk=
 google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 h1:0nDDozoAU19Qb2HwhXadU8OcsiO/09cnTqhUtq2MEOM=
 google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA=
-google.golang.org/grpc v1.56.2 h1:fVRFRnXvU+x6C4IlHZewvJOVHoOv1TUuQyoRsYnB4bI=
-google.golang.org/grpc v1.56.2/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s=
 google.golang.org/grpc v1.57.0 h1:kfzNeI/klCGD2YPMUlaGNT3pxvYfga7smW3Vth8Zsiw=
 google.golang.org/grpc v1.57.0/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo=
 google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
--- a/main.go
+++ b/main.go
@@ -49,6 +49,11 @@ func main() {
 				Name:    "debug",
 				EnvVars: []string{"DEBUG"},
 			},
+			&cli.BoolFlag{
+				Name:    "single-active-backend",
+				EnvVars: []string{"SINGLE_ACTIVE_BACKEND"},
+				Usage:   "Allow only one backend to be running.",
+			},
 			&cli.BoolFlag{
 				Name:    "cors",
 				EnvVars: []string{"CORS"},
@@ -135,6 +140,12 @@ func main() {
 				Usage:   "List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys.",
 				EnvVars: []string{"API_KEY"},
 			},
+			&cli.BoolFlag{
+				Name:    "preload-backend-only",
+				Usage:   "If set, the api is NOT launched, and only the preloaded models / backends are started. This is intended for multi-node setups.",
+				EnvVars: []string{"PRELOAD_BACKEND_ONLY"},
+				Value:   false,
+			},
 		},
 		Description: `
 LocalAI is a drop-in replacement OpenAI API which runs inference locally.
@@ -175,6 +186,10 @@ For a list of compatible model, check out: https://localai.io/model-compatibilit
 				options.WithApiKeys(ctx.StringSlice("api-keys")),
 			}

+			if ctx.Bool("single-active-backend") {
+				opts = append(opts, options.EnableSingleBackend)
+			}
+
 			externalgRPC := ctx.StringSlice("external-grpc-backends")
 			// split ":" to get backend name and the uri
 			for _, v := range externalgRPC {
@@ -187,6 +202,11 @@ For a list of compatible model, check out: https://localai.io/model-compatibilit
 				opts = append(opts, options.EnableGalleriesAutoload)
 			}

+			if ctx.Bool("preload-backend-only") {
+				_, _, err := api.Startup(opts...)
+				return err
+			}
+
 			app, err := api.App(opts...)
 			if err != nil {
 				return err
--- a/pkg/backend/image/stablediffusion.go
+++ b/pkg/backend/image/stablediffusion.go
@@ -9,7 +9,7 @@ import (
 )

 type StableDiffusion struct {
-	base.Base
+	base.SingleThread
 	stablediffusion *stablediffusion.StableDiffusion
 }

--- a/pkg/backend/llm/bert/bert.go
+++ b/pkg/backend/llm/bert/bert.go
@@ -10,7 +10,7 @@ import (
 )

 type Embeddings struct {
-	base.Base
+	base.SingleThread
 	bert *bert.Bert
 }

@@ -21,6 +21,7 @@ func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
 }

 func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+
 	if len(opts.EmbeddingTokens) > 0 {
 		tokens := []int{}
 		for _, t := range opts.EmbeddingTokens {
--- a/pkg/backend/llm/bloomz/bloomz.go
+++ b/pkg/backend/llm/bloomz/bloomz.go
@@ -12,7 +12,7 @@ import (
 )

 type LLM struct {
-	base.Base
+	base.SingleThread

 	bloomz *bloomz.Bloomz
 }
--- a/pkg/backend/llm/falcon/falcon.go
+++ b/pkg/backend/llm/falcon/falcon.go
@@ -12,7 +12,7 @@ import (
 )

 type LLM struct {
-	base.Base
+	base.SingleThread

 	falcon *ggllm.Falcon
 }
@@ -122,6 +122,7 @@ func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 }

 func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+
 	predictOptions := buildPredictOptions(opts)

 	predictOptions = append(predictOptions, ggllm.SetTokenCallback(func(token string) bool {
--- a/pkg/backend/llm/gpt4all/gpt4all.go
+++ b/pkg/backend/llm/gpt4all/gpt4all.go
@@ -11,7 +11,7 @@ import (
 )

 type LLM struct {
-	base.Base
+	base.SingleThread

 	gpt4all *gpt4all.Model
 }
--- a/pkg/backend/llm/langchain/langchain.go
+++ b/pkg/backend/llm/langchain/langchain.go
--- a/pkg/backend/llm/llama-stable/llama.go
+++ b/pkg/backend/llm/llama-stable/llama.go
@@ -11,13 +11,12 @@ import (
 )

 type LLM struct {
-	base.Base
+	base.SingleThread

 	llama *llama.LLama
 }

 func (llm *LLM) Load(opts *pb.ModelOptions) error {
-
 	ropeFreqBase := float32(10000)
 	ropeFreqScale := float32(1)

@@ -73,6 +72,7 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {

 	model, err := llama.New(opts.ModelFile, llamaOpts...)
 	llm.llama = model
+
 	return err
 }

--- a/pkg/backend/llm/llama/llama.go
+++ b/pkg/backend/llm/llama/llama.go
@@ -0,0 +1,220 @@
+package llama
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/go-llama.cpp"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	llama *llama.LLama
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+
+	llamaOpts := []llama.ModelOption{
+		llama.WithRopeFreqBase(ropeFreqBase),
+		llama.WithRopeFreqScale(ropeFreqScale),
+	}
+
+	if opts.NoMulMatQ {
+		llamaOpts = append(llamaOpts, llama.SetMulMatQ(false))
+	}
+
+	if opts.LoraAdapter != "" {
+		llamaOpts = append(llamaOpts, llama.SetLoraAdapter(opts.LoraAdapter))
+	}
+
+	if opts.LoraBase != "" {
+		llamaOpts = append(llamaOpts, llama.SetLoraBase(opts.LoraBase))
+	}
+
+	if opts.ContextSize != 0 {
+		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
+	}
+	if opts.F16Memory {
+		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
+	}
+	if opts.Embeddings {
+		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
+	}
+	if opts.NGPULayers != 0 {
+		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
+	}
+
+	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
+	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
+	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
+	if opts.NBatch != 0 {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
+	} else {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
+	}
+
+	if opts.NUMA {
+		llamaOpts = append(llamaOpts, llama.EnableNUMA)
+	}
+
+	if opts.LowVRAM {
+		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
+	}
+
+	model, err := llama.New(opts.ModelFile, llamaOpts...)
+	llm.llama = model
+
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+	predictOptions := []llama.PredictOption{
+		llama.SetTemperature(opts.Temperature),
+		llama.SetTopP(opts.TopP),
+		llama.SetTopK(int(opts.TopK)),
+		llama.SetTokens(int(opts.Tokens)),
+		llama.SetThreads(int(opts.Threads)),
+		llama.WithGrammar(opts.Grammar),
+		llama.SetRopeFreqBase(ropeFreqBase),
+		llama.SetRopeFreqScale(ropeFreqScale),
+		llama.SetNegativePromptScale(opts.NegativePromptScale),
+		llama.SetNegativePrompt(opts.NegativePrompt),
+	}
+
+	if opts.PromptCacheAll {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
+	}
+
+	if opts.PromptCacheRO {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
+	}
+
+	// Expected absolute path
+	if opts.PromptCachePath != "" {
+		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
+	}
+
+	if opts.Mirostat != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
+	}
+
+	if opts.MirostatETA != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
+	}
+
+	if opts.MirostatTAU != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
+	}
+
+	if opts.Debug {
+		predictOptions = append(predictOptions, llama.Debug)
+	}
+
+	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
+
+	if opts.PresencePenalty != 0 {
+		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
+	}
+
+	if opts.NKeep != 0 {
+		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
+	}
+
+	if opts.F16KV {
+		predictOptions = append(predictOptions, llama.EnableF16KV)
+	}
+
+	if opts.IgnoreEOS {
+		predictOptions = append(predictOptions, llama.IgnoreEOS)
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
+	}
+
+	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
+
+	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
+	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
+	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
+	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
+	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
+	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
+	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
+		results <- token
+		return true
+	}))
+
+	go func() {
+		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		close(results)
+	}()
+
+	return nil
+}
+
+func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+	predictOptions := buildPredictOptions(opts)
+
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
+	}
+
+	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
+}
+
+func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
+	predictOptions := buildPredictOptions(opts)
+	l, tokens, err := llm.llama.TokenizeString(opts.Prompt, predictOptions...)
+	if err != nil {
+		return pb.TokenizationResponse{}, err
+	}
+	return pb.TokenizationResponse{
+		Length: l,
+		Tokens: tokens,
+	}, nil
+}
--- a/pkg/backend/llm/rwkv/rwkv.go
+++ b/pkg/backend/llm/rwkv/rwkv.go
@@ -14,15 +14,21 @@ import (
 const tokenizerSuffix = ".tokenizer.json"

 type LLM struct {
-	base.Base
+	base.SingleThread

 	rwkv *rwkv.RwkvState
 }

 func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	tokenizerFile := opts.Tokenizer
+	if tokenizerFile == "" {
+		modelFile := filepath.Base(opts.ModelFile)
+		tokenizerFile = modelFile + tokenizerSuffix
+	}
 	modelPath := filepath.Dir(opts.ModelFile)
-	modelFile := filepath.Base(opts.ModelFile)
-	model := rwkv.LoadFiles(opts.ModelFile, filepath.Join(modelPath, modelFile+tokenizerSuffix), uint32(opts.GetThreads()))
+	tokenizerPath := filepath.Join(modelPath, tokenizerFile)
+
+	model := rwkv.LoadFiles(opts.ModelFile, tokenizerPath, uint32(opts.GetThreads()))

 	if model == nil {
 		return fmt.Errorf("could not load model")
@@ -32,7 +38,6 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 }

 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
-
 	stopWord := "\n"
 	if len(opts.StopPrompts) > 0 {
 		stopWord = opts.StopPrompts[0]
@@ -69,3 +74,22 @@ func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) erro

 	return nil
 }
+
+func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
+	tokens, err := llm.rwkv.Tokenizer.Encode(opts.Prompt)
+	if err != nil {
+		return pb.TokenizationResponse{}, err
+	}
+
+	l := len(tokens)
+	i32Tokens := make([]int32, l)
+
+	for i, t := range tokens {
+		i32Tokens[i] = int32(t.ID)
+	}
+
+	return pb.TokenizationResponse{
+		Length: int32(l),
+		Tokens: i32Tokens,
+	}, nil
+}
--- a/pkg/backend/llm/transformers/dolly.go
+++ b/pkg/backend/llm/transformers/dolly.go
@@ -12,7 +12,7 @@ import (
 )

 type Dolly struct {
-	base.Base
+	base.SingleThread

 	dolly *transformers.Dolly
 }
@@ -29,6 +29,7 @@ func (llm *Dolly) Predict(opts *pb.PredictOptions) (string, error) {

 // fallback to Predict
 func (llm *Dolly) PredictStream(opts *pb.PredictOptions, results chan string) error {
+
 	go func() {
 		res, err := llm.dolly.Predict(opts.Prompt, buildPredictOptions(opts)...)

--- a/pkg/backend/llm/transformers/falcon.go
+++ b/pkg/backend/llm/transformers/falcon.go
@@ -12,7 +12,7 @@ import (
 )

 type Falcon struct {
-	base.Base
+	base.SingleThread

 	falcon *transformers.Falcon
 }
--- a/pkg/backend/llm/transformers/gpt2.go
+++ b/pkg/backend/llm/transformers/gpt2.go
@@ -12,7 +12,7 @@ import (
 )

 type GPT2 struct {
-	base.Base
+	base.SingleThread

 	gpt2 *transformers.GPT2
 }
--- a/pkg/backend/llm/transformers/gptj.go
+++ b/pkg/backend/llm/transformers/gptj.go
@@ -12,7 +12,7 @@ import (
 )

 type GPTJ struct {
-	base.Base
+	base.SingleThread

 	gptj *transformers.GPTJ
 }
--- a/pkg/backend/llm/transformers/gptneox.go
+++ b/pkg/backend/llm/transformers/gptneox.go
@@ -12,7 +12,7 @@ import (
 )

 type GPTNeoX struct {
-	base.Base
+	base.SingleThread

 	gptneox *transformers.GPTNeoX
 }
--- a/pkg/backend/llm/transformers/mpt.go
+++ b/pkg/backend/llm/transformers/mpt.go
@@ -12,7 +12,7 @@ import (
 )

 type MPT struct {
-	base.Base
+	base.SingleThread

 	mpt *transformers.MPT
 }
--- a/pkg/backend/llm/transformers/predict.go
+++ b/pkg/backend/llm/transformers/predict.go
--- a/pkg/backend/llm/transformers/replit.go
+++ b/pkg/backend/llm/transformers/replit.go
@@ -12,7 +12,7 @@ import (
 )

 type Replit struct {
-	base.Base
+	base.SingleThread

 	replit *transformers.Replit
 }
--- a/pkg/backend/llm/transformers/starcoder.go
+++ b/pkg/backend/llm/transformers/starcoder.go
@@ -12,7 +12,7 @@ import (
 )

 type Starcoder struct {
-	base.Base
+	base.SingleThread

 	starcoder *transformers.Starcoder
 }
--- a/pkg/backend/transcribe/transcript.go
+++ b/pkg/backend/transcribe/transcript.go
@@ -1,4 +1,4 @@
-package whisper
+package transcribe

 import (
 	"fmt"
@@ -7,8 +7,8 @@ import (
 	"path/filepath"

 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
-	wav "github.com/go-audio/wav"
-	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
+	"github.com/go-audio/wav"
+	"github.com/go-skynet/LocalAI/api/schema"
 )

 func sh(c string) (string, error) {
@@ -29,8 +29,8 @@ func audioToWav(src, dst string) error {
 	return nil
 }

-func Transcript(model whisper.Model, audiopath, language string, threads uint) (api.Result, error) {
-	res := api.Result{}
+func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.Result, error) {
+	res := schema.Result{}

 	dir, err := os.MkdirTemp("", "whisper")
 	if err != nil {
@@ -90,7 +90,7 @@ func Transcript(model whisper.Model, audiopath, language string, threads uint) (
 			tokens = append(tokens, t.Id)
 		}

-		segment := api.Segment{Id: s.Num, Text: s.Text, Start: s.Start, End: s.End, Tokens: tokens}
+		segment := schema.Segment{Id: s.Num, Text: s.Text, Start: s.Start, End: s.End, Tokens: tokens}
 		res.Segments = append(res.Segments, segment)

 		res.Text += s.Text
--- a/pkg/backend/transcribe/whisper.go
+++ b/pkg/backend/transcribe/whisper.go
@@ -4,14 +4,13 @@ package transcribe
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	whisperutil "github.com/go-skynet/LocalAI/pkg/grpc/whisper"
-	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
 )

 type Whisper struct {
-	base.Base
+	base.SingleThread
 	whisper whisper.Model
 }

@@ -22,6 +21,6 @@ func (sd *Whisper) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (api.Result, error) {
-	return whisperutil.Transcript(sd.whisper, opts.Dst, opts.Language, uint(opts.Threads))
+func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.Result, error) {
+	return Transcript(sd.whisper, opts.Dst, opts.Language, uint(opts.Threads))
 }
--- a/pkg/backend/tts/piper.go
+++ b/pkg/backend/tts/piper.go
@@ -13,7 +13,7 @@ import (
 )

 type Piper struct {
-	base.Base
+	base.SingleThread
 	piper *PiperB
 }

--- a/pkg/grpc/base/base.go
+++ b/pkg/grpc/base/base.go
@@ -4,17 +4,37 @@ package base
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
+	"os"

+	"github.com/go-skynet/LocalAI/api/schema"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
+	gopsutil "github.com/shirou/gopsutil/v3/process"
 )

+// Base is a base class for all backends to implement
+// Note: the backends that does not support multiple requests
+// should use SingleThread instead
 type Base struct {
 }

+func (llm *Base) Locking() bool {
+	return false
+}
+
+func (llm *Base) Lock() {
+	panic("not implemented")
+}
+
+func (llm *Base) Unlock() {
+	panic("not implemented")
+}
+
+func (llm *Base) Busy() bool {
+	return false
+}
+
 func (llm *Base) Load(opts *pb.ModelOptions) error {
 	return fmt.Errorf("unimplemented")
-
 }

 func (llm *Base) Predict(opts *pb.PredictOptions) (string, error) {
@@ -33,10 +53,40 @@ func (llm *Base) GenerateImage(*pb.GenerateImageRequest) error {
 	return fmt.Errorf("unimplemented")
 }

-func (llm *Base) AudioTranscription(*pb.TranscriptRequest) (api.Result, error) {
-	return api.Result{}, fmt.Errorf("unimplemented")
+func (llm *Base) AudioTranscription(*pb.TranscriptRequest) (schema.Result, error) {
+	return schema.Result{}, fmt.Errorf("unimplemented")
 }

 func (llm *Base) TTS(*pb.TTSRequest) error {
 	return fmt.Errorf("unimplemented")
 }
+
+func (llm *Base) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
+	return pb.TokenizationResponse{}, fmt.Errorf("unimplemented")
+}
+
+// backends may wish to call this to capture the gopsutil info, then enhance with additional memory usage details?
+func (llm *Base) Status() (pb.StatusResponse, error) {
+	return pb.StatusResponse{
+		Memory: memoryUsage(),
+	}, nil
+}
+
+func memoryUsage() *pb.MemoryUsageData {
+	mud := pb.MemoryUsageData{
+		Breakdown: make(map[string]uint64),
+	}
+
+	pid := int32(os.Getpid())
+
+	backendProcess, err := gopsutil.NewProcess(pid)
+
+	if err == nil {
+		memInfo, err := backendProcess.MemoryInfo()
+		if err == nil {
+			mud.Total = memInfo.VMS // TEST, but rss seems reasonable first guess. Does include swap, but we might care about that.
+			mud.Breakdown["gopsutil-RSS"] = memInfo.RSS
+		}
+	}
+	return &mud
+}
--- a/pkg/grpc/base/singlethread.go
+++ b/pkg/grpc/base/singlethread.go
@@ -0,0 +1,52 @@
+package base
+
+import (
+	"sync"
+
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+)
+
+// SingleThread are backends that does not support multiple requests.
+// There will be only one request being served at the time.
+// This is useful for models that are not thread safe and cannot run
+// multiple requests at the same time.
+type SingleThread struct {
+	Base
+	backendBusy sync.Mutex
+}
+
+// Locking returns true if the backend needs to lock resources
+func (llm *SingleThread) Locking() bool {
+	return true
+}
+
+func (llm *SingleThread) Lock() {
+	llm.backendBusy.Lock()
+}
+
+func (llm *SingleThread) Unlock() {
+	llm.backendBusy.Unlock()
+}
+
+func (llm *SingleThread) Busy() bool {
+	r := llm.backendBusy.TryLock()
+	if r {
+		llm.backendBusy.Unlock()
+	}
+	return r
+}
+
+// backends may wish to call this to capture the gopsutil info, then enhance with additional memory usage details?
+func (llm *SingleThread) Status() (pb.StatusResponse, error) {
+	mud := memoryUsage()
+
+	state := pb.StatusResponse_READY
+	if llm.Busy() {
+		state = pb.StatusResponse_BUSY
+	}
+
+	return pb.StatusResponse{
+		State:  state,
+		Memory: mud,
+	}, nil
+}
--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@@ -4,16 +4,19 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"sync"
 	"time"

+	"github.com/go-skynet/LocalAI/api/schema"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials/insecure"
 )

 type Client struct {
 	address string
+	busy    bool
+	sync.Mutex
 }

 func NewClient(address string) *Client {
@@ -22,7 +25,21 @@ func NewClient(address string) *Client {
 	}
 }

+func (c *Client) IsBusy() bool {
+	c.Lock()
+	defer c.Unlock()
+	return c.busy
+}
+
+func (c *Client) setBusy(v bool) {
+	c.Lock()
+	c.busy = v
+	c.Unlock()
+}
+
 func (c *Client) HealthCheck(ctx context.Context) bool {
+	c.setBusy(true)
+	defer c.setBusy(false)
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		fmt.Println(err)
@@ -49,6 +66,8 @@ func (c *Client) HealthCheck(ctx context.Context) bool {
 }

 func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error) {
+	c.setBusy(true)
+	defer c.setBusy(false)
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		return nil, err
@@ -60,6 +79,8 @@ func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...
 }

 func (c *Client) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.Reply, error) {
+	c.setBusy(true)
+	defer c.setBusy(false)
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		return nil, err
@@ -71,6 +92,8 @@ func (c *Client) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grp
 }

 func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error) {
+	c.setBusy(true)
+	defer c.setBusy(false)
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		return nil, err
@@ -81,6 +104,8 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp
 }

 func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s []byte), opts ...grpc.CallOption) error {
+	c.setBusy(true)
+	defer c.setBusy(false)
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		return err
@@ -110,6 +135,8 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun
 }

 func (c *Client) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...grpc.CallOption) (*pb.Result, error) {
+	c.setBusy(true)
+	defer c.setBusy(false)
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		return nil, err
@@ -120,6 +147,8 @@ func (c *Client) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest,
 }

 func (c *Client) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error) {
+	c.setBusy(true)
+	defer c.setBusy(false)
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		return nil, err
@@ -129,7 +158,9 @@ func (c *Client) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOp
 	return client.TTS(ctx, in, opts...)
 }

-func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*api.Result, error) {
+func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*schema.Result, error) {
+	c.setBusy(true)
+	defer c.setBusy(false)
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		return nil, err
@@ -140,14 +171,14 @@ func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptReques
 	if err != nil {
 		return nil, err
 	}
-	tresult := &api.Result{}
+	tresult := &schema.Result{}
 	for _, s := range res.Segments {
 		tks := []int{}
 		for _, t := range s.Tokens {
 			tks = append(tks, int(t))
 		}
 		tresult.Segments = append(tresult.Segments,
-			api.Segment{
+			schema.Segment{
 				Text:   s.Text,
 				Id:     int(s.Id),
 				Start:  time.Duration(s.Start),
@@ -158,3 +189,33 @@ func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptReques
 	tresult.Text = res.Text
 	return tresult, err
 }
+
+func (c *Client) TokenizeString(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.TokenizationResponse, error) {
+	c.setBusy(true)
+	defer c.setBusy(false)
+	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
+	if err != nil {
+		return nil, err
+	}
+	defer conn.Close()
+	client := pb.NewBackendClient(conn)
+
+	res, err := client.TokenizeString(ctx, in, opts...)
+
+	if err != nil {
+		return nil, err
+	}
+	return res, nil
+}
+
+func (c *Client) Status(ctx context.Context) (*pb.StatusResponse, error) {
+	c.setBusy(true)
+	defer c.setBusy(false)
+	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
+	if err != nil {
+		return nil, err
+	}
+	defer conn.Close()
+	client := pb.NewBackendClient(conn)
+	return client.Status(ctx, &pb.HealthMessage{})
+}
--- a/pkg/grpc/interface.go
+++ b/pkg/grpc/interface.go
@@ -1,18 +1,24 @@
 package grpc

 import (
+	"github.com/go-skynet/LocalAI/api/schema"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
 )

 type LLM interface {
+	Busy() bool
+	Lock()
+	Unlock()
+	Locking() bool
 	Predict(*pb.PredictOptions) (string, error)
 	PredictStream(*pb.PredictOptions, chan string) error
 	Load(*pb.ModelOptions) error
 	Embeddings(*pb.PredictOptions) ([]float32, error)
 	GenerateImage(*pb.GenerateImageRequest) error
-	AudioTranscription(*pb.TranscriptRequest) (api.Result, error)
+	AudioTranscription(*pb.TranscriptRequest) (schema.Result, error)
 	TTS(*pb.TTSRequest) error
+	TokenizeString(*pb.PredictOptions) (pb.TokenizationResponse, error)
+	Status() (pb.StatusResponse, error)
 }

 func newReply(s string) *pb.Reply {
--- a/pkg/grpc/proto/backend.pb.go
+++ b/pkg/grpc/proto/backend.pb.go
@@ -20,6 +20,58 @@ const (
 	_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20)
 )

+type StatusResponse_State int32
+
+const (
+	StatusResponse_UNINITIALIZED StatusResponse_State = 0
+	StatusResponse_BUSY          StatusResponse_State = 1
+	StatusResponse_READY         StatusResponse_State = 2
+	StatusResponse_ERROR         StatusResponse_State = -1
+)
+
+// Enum value maps for StatusResponse_State.
+var (
+	StatusResponse_State_name = map[int32]string{
+		0:  "UNINITIALIZED",
+		1:  "BUSY",
+		2:  "READY",
+		-1: "ERROR",
+	}
+	StatusResponse_State_value = map[string]int32{
+		"UNINITIALIZED": 0,
+		"BUSY":          1,
+		"READY":         2,
+		"ERROR":         -1,
+	}
+)
+
+func (x StatusResponse_State) Enum() *StatusResponse_State {
+	p := new(StatusResponse_State)
+	*p = x
+	return p
+}
+
+func (x StatusResponse_State) String() string {
+	return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x))
+}
+
+func (StatusResponse_State) Descriptor() protoreflect.EnumDescriptor {
+	return file_pkg_grpc_proto_backend_proto_enumTypes[0].Descriptor()
+}
+
+func (StatusResponse_State) Type() protoreflect.EnumType {
+	return &file_pkg_grpc_proto_backend_proto_enumTypes[0]
+}
+
+func (x StatusResponse_State) Number() protoreflect.EnumNumber {
+	return protoreflect.EnumNumber(x)
+}
+
+// Deprecated: Use StatusResponse_State.Descriptor instead.
+func (StatusResponse_State) EnumDescriptor() ([]byte, []int) {
+	return file_pkg_grpc_proto_backend_proto_rawDescGZIP(), []int{13, 0}
+}
+
 type HealthMessage struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
@@ -490,9 +542,20 @@ type ModelOptions struct {
 	ModelBaseName    string `protobuf:"bytes,24,opt,name=ModelBaseName,proto3" json:"ModelBaseName,omitempty"`
 	UseFastTokenizer bool   `protobuf:"varint,25,opt,name=UseFastTokenizer,proto3" json:"UseFastTokenizer,omitempty"`
 	// Diffusers
-	PipelineType  string `protobuf:"bytes,26,opt,name=PipelineType,proto3" json:"PipelineType,omitempty"`
-	SchedulerType string `protobuf:"bytes,27,opt,name=SchedulerType,proto3" json:"SchedulerType,omitempty"`
-	CUDA          bool   `protobuf:"varint,28,opt,name=CUDA,proto3" json:"CUDA,omitempty"`
+	PipelineType  string  `protobuf:"bytes,26,opt,name=PipelineType,proto3" json:"PipelineType,omitempty"`
+	SchedulerType string  `protobuf:"bytes,27,opt,name=SchedulerType,proto3" json:"SchedulerType,omitempty"`
+	CUDA          bool    `protobuf:"varint,28,opt,name=CUDA,proto3" json:"CUDA,omitempty"`
+	CFGScale      float32 `protobuf:"fixed32,29,opt,name=CFGScale,proto3" json:"CFGScale,omitempty"`
+	IMG2IMG       bool    `protobuf:"varint,30,opt,name=IMG2IMG,proto3" json:"IMG2IMG,omitempty"`
+	CLIPModel     string  `protobuf:"bytes,31,opt,name=CLIPModel,proto3" json:"CLIPModel,omitempty"`
+	CLIPSubfolder string  `protobuf:"bytes,32,opt,name=CLIPSubfolder,proto3" json:"CLIPSubfolder,omitempty"`
+	CLIPSkip      int32   `protobuf:"varint,33,opt,name=CLIPSkip,proto3" json:"CLIPSkip,omitempty"`
+	// RWKV
+	Tokenizer string `protobuf:"bytes,34,opt,name=Tokenizer,proto3" json:"Tokenizer,omitempty"`
+	// LLM (llama.cpp)
+	LoraBase    string `protobuf:"bytes,35,opt,name=LoraBase,proto3" json:"LoraBase,omitempty"`
+	LoraAdapter string `protobuf:"bytes,36,opt,name=LoraAdapter,proto3" json:"LoraAdapter,omitempty"`
+	NoMulMatQ   bool   `protobuf:"varint,37,opt,name=NoMulMatQ,proto3" json:"NoMulMatQ,omitempty"`
 }

 func (x *ModelOptions) Reset() {
@@ -723,6 +786,69 @@ func (x *ModelOptions) GetCUDA() bool {
 	return false
 }

+func (x *ModelOptions) GetCFGScale() float32 {
+	if x != nil {
+		return x.CFGScale
+	}
+	return 0
+}
+
+func (x *ModelOptions) GetIMG2IMG() bool {
+	if x != nil {
+		return x.IMG2IMG
+	}
+	return false
+}
+
+func (x *ModelOptions) GetCLIPModel() string {
+	if x != nil {
+		return x.CLIPModel
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetCLIPSubfolder() string {
+	if x != nil {
+		return x.CLIPSubfolder
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetCLIPSkip() int32 {
+	if x != nil {
+		return x.CLIPSkip
+	}
+	return 0
+}
+
+func (x *ModelOptions) GetTokenizer() string {
+	if x != nil {
+		return x.Tokenizer
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetLoraBase() string {
+	if x != nil {
+		return x.LoraBase
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetLoraAdapter() string {
+	if x != nil {
+		return x.LoraAdapter
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetNoMulMatQ() bool {
+	if x != nil {
+		return x.NoMulMatQ
+	}
+	return false
+}
+
 type Result struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
@@ -1035,6 +1161,10 @@ type GenerateImageRequest struct {
 	PositivePrompt string `protobuf:"bytes,6,opt,name=positive_prompt,json=positivePrompt,proto3" json:"positive_prompt,omitempty"`
 	NegativePrompt string `protobuf:"bytes,7,opt,name=negative_prompt,json=negativePrompt,proto3" json:"negative_prompt,omitempty"`
 	Dst            string `protobuf:"bytes,8,opt,name=dst,proto3" json:"dst,omitempty"`
+	Src            string `protobuf:"bytes,9,opt,name=src,proto3" json:"src,omitempty"`
+	// Diffusers
+	EnableParameters string `protobuf:"bytes,10,opt,name=EnableParameters,proto3" json:"EnableParameters,omitempty"`
+	CLIPSkip         int32  `protobuf:"varint,11,opt,name=CLIPSkip,proto3" json:"CLIPSkip,omitempty"`
 }

 func (x *GenerateImageRequest) Reset() {
@@ -1125,6 +1255,27 @@ func (x *GenerateImageRequest) GetDst() string {
 	return ""
 }

+func (x *GenerateImageRequest) GetSrc() string {
+	if x != nil {
+		return x.Src
+	}
+	return ""
+}
+
+func (x *GenerateImageRequest) GetEnableParameters() string {
+	if x != nil {
+		return x.EnableParameters
+	}
+	return ""
+}
+
+func (x *GenerateImageRequest) GetCLIPSkip() int32 {
+	if x != nil {
+		return x.CLIPSkip
+	}
+	return 0
+}
+
 type TTSRequest struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
@@ -1188,6 +1339,171 @@ func (x *TTSRequest) GetDst() string {
 	return ""
 }

+type TokenizationResponse struct {
+	state         protoimpl.MessageState
+	sizeCache     protoimpl.SizeCache
+	unknownFields protoimpl.UnknownFields
+
+	Length int32   `protobuf:"varint,1,opt,name=length,proto3" json:"length,omitempty"`
+	Tokens []int32 `protobuf:"varint,2,rep,packed,name=tokens,proto3" json:"tokens,omitempty"`
+}
+
+func (x *TokenizationResponse) Reset() {
+	*x = TokenizationResponse{}
+	if protoimpl.UnsafeEnabled {
+		mi := &file_pkg_grpc_proto_backend_proto_msgTypes[11]
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		ms.StoreMessageInfo(mi)
+	}
+}
+
+func (x *TokenizationResponse) String() string {
+	return protoimpl.X.MessageStringOf(x)
+}
+
+func (*TokenizationResponse) ProtoMessage() {}
+
+func (x *TokenizationResponse) ProtoReflect() protoreflect.Message {
+	mi := &file_pkg_grpc_proto_backend_proto_msgTypes[11]
+	if protoimpl.UnsafeEnabled && x != nil {
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		if ms.LoadMessageInfo() == nil {
+			ms.StoreMessageInfo(mi)
+		}
+		return ms
+	}
+	return mi.MessageOf(x)
+}
+
+// Deprecated: Use TokenizationResponse.ProtoReflect.Descriptor instead.
+func (*TokenizationResponse) Descriptor() ([]byte, []int) {
+	return file_pkg_grpc_proto_backend_proto_rawDescGZIP(), []int{11}
+}
+
+func (x *TokenizationResponse) GetLength() int32 {
+	if x != nil {
+		return x.Length
+	}
+	return 0
+}
+
+func (x *TokenizationResponse) GetTokens() []int32 {
+	if x != nil {
+		return x.Tokens
+	}
+	return nil
+}
+
+type MemoryUsageData struct {
+	state         protoimpl.MessageState
+	sizeCache     protoimpl.SizeCache
+	unknownFields protoimpl.UnknownFields
+
+	Total     uint64            `protobuf:"varint,1,opt,name=total,proto3" json:"total,omitempty"`
+	Breakdown map[string]uint64 `protobuf:"bytes,2,rep,name=breakdown,proto3" json:"breakdown,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"varint,2,opt,name=value,proto3"`
+}
+
+func (x *MemoryUsageData) Reset() {
+	*x = MemoryUsageData{}
+	if protoimpl.UnsafeEnabled {
+		mi := &file_pkg_grpc_proto_backend_proto_msgTypes[12]
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		ms.StoreMessageInfo(mi)
+	}
+}
+
+func (x *MemoryUsageData) String() string {
+	return protoimpl.X.MessageStringOf(x)
+}
+
+func (*MemoryUsageData) ProtoMessage() {}
+
+func (x *MemoryUsageData) ProtoReflect() protoreflect.Message {
+	mi := &file_pkg_grpc_proto_backend_proto_msgTypes[12]
+	if protoimpl.UnsafeEnabled && x != nil {
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		if ms.LoadMessageInfo() == nil {
+			ms.StoreMessageInfo(mi)
+		}
+		return ms
+	}
+	return mi.MessageOf(x)
+}
+
+// Deprecated: Use MemoryUsageData.ProtoReflect.Descriptor instead.
+func (*MemoryUsageData) Descriptor() ([]byte, []int) {
+	return file_pkg_grpc_proto_backend_proto_rawDescGZIP(), []int{12}
+}
+
+func (x *MemoryUsageData) GetTotal() uint64 {
+	if x != nil {
+		return x.Total
+	}
+	return 0
+}
+
+func (x *MemoryUsageData) GetBreakdown() map[string]uint64 {
+	if x != nil {
+		return x.Breakdown
+	}
+	return nil
+}
+
+type StatusResponse struct {
+	state         protoimpl.MessageState
+	sizeCache     protoimpl.SizeCache
+	unknownFields protoimpl.UnknownFields
+
+	State  StatusResponse_State `protobuf:"varint,1,opt,name=state,proto3,enum=backend.StatusResponse_State" json:"state,omitempty"`
+	Memory *MemoryUsageData     `protobuf:"bytes,2,opt,name=memory,proto3" json:"memory,omitempty"`
+}
+
+func (x *StatusResponse) Reset() {
+	*x = StatusResponse{}
+	if protoimpl.UnsafeEnabled {
+		mi := &file_pkg_grpc_proto_backend_proto_msgTypes[13]
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		ms.StoreMessageInfo(mi)
+	}
+}
+
+func (x *StatusResponse) String() string {
+	return protoimpl.X.MessageStringOf(x)
+}
+
+func (*StatusResponse) ProtoMessage() {}
+
+func (x *StatusResponse) ProtoReflect() protoreflect.Message {
+	mi := &file_pkg_grpc_proto_backend_proto_msgTypes[13]
+	if protoimpl.UnsafeEnabled && x != nil {
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		if ms.LoadMessageInfo() == nil {
+			ms.StoreMessageInfo(mi)
+		}
+		return ms
+	}
+	return mi.MessageOf(x)
+}
+
+// Deprecated: Use StatusResponse.ProtoReflect.Descriptor instead.
+func (*StatusResponse) Descriptor() ([]byte, []int) {
+	return file_pkg_grpc_proto_backend_proto_rawDescGZIP(), []int{13}
+}
+
+func (x *StatusResponse) GetState() StatusResponse_State {
+	if x != nil {
+		return x.State
+	}
+	return StatusResponse_UNINITIALIZED
+}
+
+func (x *StatusResponse) GetMemory() *MemoryUsageData {
+	if x != nil {
+		return x.Memory
+	}
+	return nil
+}
+
 var File_pkg_grpc_proto_backend_proto protoreflect.FileDescriptor

 var file_pkg_grpc_proto_backend_proto_rawDesc = []byte{
@@ -1273,7 +1589,7 @@ var file_pkg_grpc_proto_backend_proto_rawDesc = []byte{
 	0x0e, 0x4e, 0x65, 0x67, 0x61, 0x74, 0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22,
 	0x21, 0x0a, 0x05, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73,
 	0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61,
-	0x67, 0x65, 0x22, 0xcc, 0x06, 0x0a, 0x0c, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x4f, 0x70, 0x74, 0x69,
+	0x67, 0x65, 0x22, 0xdc, 0x08, 0x0a, 0x0c, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x4f, 0x70, 0x74, 0x69,
 	0x6f, 0x6e, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x18, 0x01, 0x20, 0x01,
 	0x28, 0x09, 0x52, 0x05, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x20, 0x0a, 0x0b, 0x43, 0x6f, 0x6e,
 	0x74, 0x65, 0x78, 0x74, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b,
@@ -1326,7 +1642,24 @@ var file_pkg_grpc_proto_backend_proto_rawDesc = []byte{
 	0x75, 0x6c, 0x65, 0x72, 0x54, 0x79, 0x70, 0x65, 0x18, 0x1b, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d,
 	0x53, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x54, 0x79, 0x70, 0x65, 0x12, 0x12, 0x0a,
 	0x04, 0x43, 0x55, 0x44, 0x41, 0x18, 0x1c, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, 0x43, 0x55, 0x44,
-	0x41, 0x22, 0x3c, 0x0a, 0x06, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x6d,
+	0x41, 0x12, 0x1a, 0x0a, 0x08, 0x43, 0x46, 0x47, 0x53, 0x63, 0x61, 0x6c, 0x65, 0x18, 0x1d, 0x20,
+	0x01, 0x28, 0x02, 0x52, 0x08, 0x43, 0x46, 0x47, 0x53, 0x63, 0x61, 0x6c, 0x65, 0x12, 0x18, 0x0a,
+	0x07, 0x49, 0x4d, 0x47, 0x32, 0x49, 0x4d, 0x47, 0x18, 0x1e, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07,
+	0x49, 0x4d, 0x47, 0x32, 0x49, 0x4d, 0x47, 0x12, 0x1c, 0x0a, 0x09, 0x43, 0x4c, 0x49, 0x50, 0x4d,
+	0x6f, 0x64, 0x65, 0x6c, 0x18, 0x1f, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x43, 0x4c, 0x49, 0x50,
+	0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x24, 0x0a, 0x0d, 0x43, 0x4c, 0x49, 0x50, 0x53, 0x75, 0x62,
+	0x66, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x18, 0x20, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x43, 0x4c,
+	0x49, 0x50, 0x53, 0x75, 0x62, 0x66, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x12, 0x1a, 0x0a, 0x08, 0x43,
+	0x4c, 0x49, 0x50, 0x53, 0x6b, 0x69, 0x70, 0x18, 0x21, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x43,
+	0x4c, 0x49, 0x50, 0x53, 0x6b, 0x69, 0x70, 0x12, 0x1c, 0x0a, 0x09, 0x54, 0x6f, 0x6b, 0x65, 0x6e,
+	0x69, 0x7a, 0x65, 0x72, 0x18, 0x22, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x54, 0x6f, 0x6b, 0x65,
+	0x6e, 0x69, 0x7a, 0x65, 0x72, 0x12, 0x1a, 0x0a, 0x08, 0x4c, 0x6f, 0x72, 0x61, 0x42, 0x61, 0x73,
+	0x65, 0x18, 0x23, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x4c, 0x6f, 0x72, 0x61, 0x42, 0x61, 0x73,
+	0x65, 0x12, 0x20, 0x0a, 0x0b, 0x4c, 0x6f, 0x72, 0x61, 0x41, 0x64, 0x61, 0x70, 0x74, 0x65, 0x72,
+	0x18, 0x24, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x4c, 0x6f, 0x72, 0x61, 0x41, 0x64, 0x61, 0x70,
+	0x74, 0x65, 0x72, 0x12, 0x1c, 0x0a, 0x09, 0x4e, 0x6f, 0x4d, 0x75, 0x6c, 0x4d, 0x61, 0x74, 0x51,
+	0x18, 0x25, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x4e, 0x6f, 0x4d, 0x75, 0x6c, 0x4d, 0x61, 0x74,
+	0x51, 0x22, 0x3c, 0x0a, 0x06, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x6d,
 	0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65,
 	0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73,
 	0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x22,
@@ -1352,7 +1685,7 @@ var file_pkg_grpc_proto_backend_proto_rawDesc = []byte{
 	0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x03, 0x52, 0x03, 0x65, 0x6e, 0x64, 0x12, 0x12, 0x0a, 0x04,
 	0x74, 0x65, 0x78, 0x74, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78, 0x74,
 	0x12, 0x16, 0x0a, 0x06, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x05,
-	0x52, 0x06, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x22, 0xe4, 0x01, 0x0a, 0x14, 0x47, 0x65, 0x6e,
+	0x52, 0x06, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x22, 0xbe, 0x02, 0x0a, 0x14, 0x47, 0x65, 0x6e,
 	0x65, 0x72, 0x61, 0x74, 0x65, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73,
 	0x74, 0x12, 0x16, 0x0a, 0x06, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28,
 	0x05, 0x52, 0x06, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x77, 0x69, 0x64,
@@ -1366,12 +1699,45 @@ var file_pkg_grpc_proto_backend_proto_rawDesc = []byte{
 	0x6f, 0x6d, 0x70, 0x74, 0x12, 0x27, 0x0a, 0x0f, 0x6e, 0x65, 0x67, 0x61, 0x74, 0x69, 0x76, 0x65,
 	0x5f, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0e, 0x6e,
 	0x65, 0x67, 0x61, 0x74, 0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x12, 0x10, 0x0a,
-	0x03, 0x64, 0x73, 0x74, 0x18, 0x08, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x64, 0x73, 0x74, 0x22,
-	0x48, 0x0a, 0x0a, 0x54, 0x54, 0x53, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x12, 0x0a,
-	0x04, 0x74, 0x65, 0x78, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78,
-	0x74, 0x12, 0x14, 0x0a, 0x05, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09,
-	0x52, 0x05, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x10, 0x0a, 0x03, 0x64, 0x73, 0x74, 0x18, 0x03,
-	0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x64, 0x73, 0x74, 0x32, 0xeb, 0x03, 0x0a, 0x07, 0x42, 0x61,
+	0x03, 0x64, 0x73, 0x74, 0x18, 0x08, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x64, 0x73, 0x74, 0x12,
+	0x10, 0x0a, 0x03, 0x73, 0x72, 0x63, 0x18, 0x09, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x72,
+	0x63, 0x12, 0x2a, 0x0a, 0x10, 0x45, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d,
+	0x65, 0x74, 0x65, 0x72, 0x73, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x09, 0x52, 0x10, 0x45, 0x6e, 0x61,
+	0x62, 0x6c, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x12, 0x1a, 0x0a,
+	0x08, 0x43, 0x4c, 0x49, 0x50, 0x53, 0x6b, 0x69, 0x70, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x05, 0x52,
+	0x08, 0x43, 0x4c, 0x49, 0x50, 0x53, 0x6b, 0x69, 0x70, 0x22, 0x48, 0x0a, 0x0a, 0x54, 0x54, 0x53,
+	0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x74, 0x65, 0x78, 0x74, 0x18,
+	0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x6d,
+	0x6f, 0x64, 0x65, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6d, 0x6f, 0x64, 0x65,
+	0x6c, 0x12, 0x10, 0x0a, 0x03, 0x64, 0x73, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03,
+	0x64, 0x73, 0x74, 0x22, 0x46, 0x0a, 0x14, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74,
+	0x69, 0x6f, 0x6e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x6c,
+	0x65, 0x6e, 0x67, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x6c, 0x65, 0x6e,
+	0x67, 0x74, 0x68, 0x12, 0x16, 0x0a, 0x06, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x18, 0x02, 0x20,
+	0x03, 0x28, 0x05, 0x52, 0x06, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x22, 0xac, 0x01, 0x0a, 0x0f,
+	0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x55, 0x73, 0x61, 0x67, 0x65, 0x44, 0x61, 0x74, 0x61, 0x12,
+	0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05,
+	0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x45, 0x0a, 0x09, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x64, 0x6f,
+	0x77, 0x6e, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65,
+	0x6e, 0x64, 0x2e, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x55, 0x73, 0x61, 0x67, 0x65, 0x44, 0x61,
+	0x74, 0x61, 0x2e, 0x42, 0x72, 0x65, 0x61, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x45, 0x6e, 0x74, 0x72,
+	0x79, 0x52, 0x09, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x1a, 0x3c, 0x0a, 0x0e,
+	0x42, 0x72, 0x65, 0x61, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10,
+	0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79,
+	0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52,
+	0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0xbc, 0x01, 0x0a, 0x0e, 0x53,
+	0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x33, 0x0a,
+	0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1d, 0x2e, 0x62,
+	0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73,
+	0x70, 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61,
+	0x74, 0x65, 0x12, 0x30, 0x0a, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x18, 0x02, 0x20, 0x01,
+	0x28, 0x0b, 0x32, 0x18, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x4d, 0x65, 0x6d,
+	0x6f, 0x72, 0x79, 0x55, 0x73, 0x61, 0x67, 0x65, 0x44, 0x61, 0x74, 0x61, 0x52, 0x06, 0x6d, 0x65,
+	0x6d, 0x6f, 0x72, 0x79, 0x22, 0x43, 0x0a, 0x05, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x11, 0x0a,
+	0x0d, 0x55, 0x4e, 0x49, 0x4e, 0x49, 0x54, 0x49, 0x41, 0x4c, 0x49, 0x5a, 0x45, 0x44, 0x10, 0x00,
+	0x12, 0x08, 0x0a, 0x04, 0x42, 0x55, 0x53, 0x59, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x45,
+	0x41, 0x44, 0x59, 0x10, 0x02, 0x12, 0x12, 0x0a, 0x05, 0x45, 0x52, 0x52, 0x4f, 0x52, 0x10, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x32, 0xf4, 0x04, 0x0a, 0x07, 0x42, 0x61,
 	0x63, 0x6b, 0x65, 0x6e, 0x64, 0x12, 0x32, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12,
 	0x16, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68,
 	0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x1a, 0x0e, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e,
@@ -1402,13 +1768,22 @@ var file_pkg_grpc_proto_backend_proto_rawDesc = []byte{
 	0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, 0x2d, 0x0a, 0x03, 0x54, 0x54, 0x53, 0x12,
 	0x13, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x54, 0x54, 0x53, 0x52, 0x65, 0x71,
 	0x75, 0x65, 0x73, 0x74, 0x1a, 0x0f, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52,
-	0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x42, 0x5a, 0x0a, 0x19, 0x69, 0x6f, 0x2e, 0x73, 0x6b,
-	0x79, 0x6e, 0x65, 0x74, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x61, 0x69, 0x2e, 0x62, 0x61, 0x63,
-	0x6b, 0x65, 0x6e, 0x64, 0x42, 0x0e, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x41, 0x49, 0x42, 0x61, 0x63,
-	0x6b, 0x65, 0x6e, 0x64, 0x50, 0x01, 0x5a, 0x2b, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63,
-	0x6f, 0x6d, 0x2f, 0x67, 0x6f, 0x2d, 0x73, 0x6b, 0x79, 0x6e, 0x65, 0x74, 0x2f, 0x4c, 0x6f, 0x63,
-	0x61, 0x6c, 0x41, 0x49, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x70, 0x72,
-	0x6f, 0x74, 0x6f, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
+	0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, 0x4a, 0x0a, 0x0e, 0x54, 0x6f, 0x6b, 0x65, 0x6e,
+	0x69, 0x7a, 0x65, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x12, 0x17, 0x2e, 0x62, 0x61, 0x63, 0x6b,
+	0x65, 0x6e, 0x64, 0x2e, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f,
+	0x6e, 0x73, 0x1a, 0x1d, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x54, 0x6f, 0x6b,
+	0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73,
+	0x65, 0x22, 0x00, 0x12, 0x3b, 0x0a, 0x06, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x16, 0x2e,
+	0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x4d, 0x65,
+	0x73, 0x73, 0x61, 0x67, 0x65, 0x1a, 0x17, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e,
+	0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00,
+	0x42, 0x5a, 0x0a, 0x19, 0x69, 0x6f, 0x2e, 0x73, 0x6b, 0x79, 0x6e, 0x65, 0x74, 0x2e, 0x6c, 0x6f,
+	0x63, 0x61, 0x6c, 0x61, 0x69, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x42, 0x0e, 0x4c,
+	0x6f, 0x63, 0x61, 0x6c, 0x41, 0x49, 0x42, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x50, 0x01, 0x5a,
+	0x2b, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x6f, 0x2d, 0x73,
+	0x6b, 0x79, 0x6e, 0x65, 0x74, 0x2f, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x41, 0x49, 0x2f, 0x70, 0x6b,
+	0x67, 0x2f, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x06, 0x70, 0x72,
+	0x6f, 0x74, 0x6f, 0x33,
 }

 var (
@@ -1423,43 +1798,56 @@ func file_pkg_grpc_proto_backend_proto_rawDescGZIP() []byte {
 	return file_pkg_grpc_proto_backend_proto_rawDescData
 }

-var file_pkg_grpc_proto_backend_proto_msgTypes = make([]protoimpl.MessageInfo, 11)
+var file_pkg_grpc_proto_backend_proto_enumTypes = make([]protoimpl.EnumInfo, 1)
+var file_pkg_grpc_proto_backend_proto_msgTypes = make([]protoimpl.MessageInfo, 15)
 var file_pkg_grpc_proto_backend_proto_goTypes = []interface{}{
-	(*HealthMessage)(nil),        // 0: backend.HealthMessage
-	(*PredictOptions)(nil),       // 1: backend.PredictOptions
-	(*Reply)(nil),                // 2: backend.Reply
-	(*ModelOptions)(nil),         // 3: backend.ModelOptions
-	(*Result)(nil),               // 4: backend.Result
-	(*EmbeddingResult)(nil),      // 5: backend.EmbeddingResult
-	(*TranscriptRequest)(nil),    // 6: backend.TranscriptRequest
-	(*TranscriptResult)(nil),     // 7: backend.TranscriptResult
-	(*TranscriptSegment)(nil),    // 8: backend.TranscriptSegment
-	(*GenerateImageRequest)(nil), // 9: backend.GenerateImageRequest
-	(*TTSRequest)(nil),           // 10: backend.TTSRequest
+	(StatusResponse_State)(0),    // 0: backend.StatusResponse.State
+	(*HealthMessage)(nil),        // 1: backend.HealthMessage
+	(*PredictOptions)(nil),       // 2: backend.PredictOptions
+	(*Reply)(nil),                // 3: backend.Reply
+	(*ModelOptions)(nil),         // 4: backend.ModelOptions
+	(*Result)(nil),               // 5: backend.Result
+	(*EmbeddingResult)(nil),      // 6: backend.EmbeddingResult
+	(*TranscriptRequest)(nil),    // 7: backend.TranscriptRequest
+	(*TranscriptResult)(nil),     // 8: backend.TranscriptResult
+	(*TranscriptSegment)(nil),    // 9: backend.TranscriptSegment
+	(*GenerateImageRequest)(nil), // 10: backend.GenerateImageRequest
+	(*TTSRequest)(nil),           // 11: backend.TTSRequest
+	(*TokenizationResponse)(nil), // 12: backend.TokenizationResponse
+	(*MemoryUsageData)(nil),      // 13: backend.MemoryUsageData
+	(*StatusResponse)(nil),       // 14: backend.StatusResponse
+	nil,                          // 15: backend.MemoryUsageData.BreakdownEntry
 }
 var file_pkg_grpc_proto_backend_proto_depIdxs = []int32{
-	8,  // 0: backend.TranscriptResult.segments:type_name -> backend.TranscriptSegment
-	0,  // 1: backend.Backend.Health:input_type -> backend.HealthMessage
-	1,  // 2: backend.Backend.Predict:input_type -> backend.PredictOptions
-	3,  // 3: backend.Backend.LoadModel:input_type -> backend.ModelOptions
-	1,  // 4: backend.Backend.PredictStream:input_type -> backend.PredictOptions
-	1,  // 5: backend.Backend.Embedding:input_type -> backend.PredictOptions
-	9,  // 6: backend.Backend.GenerateImage:input_type -> backend.GenerateImageRequest
-	6,  // 7: backend.Backend.AudioTranscription:input_type -> backend.TranscriptRequest
-	10, // 8: backend.Backend.TTS:input_type -> backend.TTSRequest
-	2,  // 9: backend.Backend.Health:output_type -> backend.Reply
-	2,  // 10: backend.Backend.Predict:output_type -> backend.Reply
-	4,  // 11: backend.Backend.LoadModel:output_type -> backend.Result
-	2,  // 12: backend.Backend.PredictStream:output_type -> backend.Reply
-	5,  // 13: backend.Backend.Embedding:output_type -> backend.EmbeddingResult
-	4,  // 14: backend.Backend.GenerateImage:output_type -> backend.Result
-	7,  // 15: backend.Backend.AudioTranscription:output_type -> backend.TranscriptResult
-	4,  // 16: backend.Backend.TTS:output_type -> backend.Result
-	9,  // [9:17] is the sub-list for method output_type
-	1,  // [1:9] is the sub-list for method input_type
-	1,  // [1:1] is the sub-list for extension type_name
-	1,  // [1:1] is the sub-list for extension extendee
-	0,  // [0:1] is the sub-list for field type_name
+	9,  // 0: backend.TranscriptResult.segments:type_name -> backend.TranscriptSegment
+	15, // 1: backend.MemoryUsageData.breakdown:type_name -> backend.MemoryUsageData.BreakdownEntry
+	0,  // 2: backend.StatusResponse.state:type_name -> backend.StatusResponse.State
+	13, // 3: backend.StatusResponse.memory:type_name -> backend.MemoryUsageData
+	1,  // 4: backend.Backend.Health:input_type -> backend.HealthMessage
+	2,  // 5: backend.Backend.Predict:input_type -> backend.PredictOptions
+	4,  // 6: backend.Backend.LoadModel:input_type -> backend.ModelOptions
+	2,  // 7: backend.Backend.PredictStream:input_type -> backend.PredictOptions
+	2,  // 8: backend.Backend.Embedding:input_type -> backend.PredictOptions
+	10, // 9: backend.Backend.GenerateImage:input_type -> backend.GenerateImageRequest
+	7,  // 10: backend.Backend.AudioTranscription:input_type -> backend.TranscriptRequest
+	11, // 11: backend.Backend.TTS:input_type -> backend.TTSRequest
+	2,  // 12: backend.Backend.TokenizeString:input_type -> backend.PredictOptions
+	1,  // 13: backend.Backend.Status:input_type -> backend.HealthMessage
+	3,  // 14: backend.Backend.Health:output_type -> backend.Reply
+	3,  // 15: backend.Backend.Predict:output_type -> backend.Reply
+	5,  // 16: backend.Backend.LoadModel:output_type -> backend.Result
+	3,  // 17: backend.Backend.PredictStream:output_type -> backend.Reply
+	6,  // 18: backend.Backend.Embedding:output_type -> backend.EmbeddingResult
+	5,  // 19: backend.Backend.GenerateImage:output_type -> backend.Result
+	8,  // 20: backend.Backend.AudioTranscription:output_type -> backend.TranscriptResult
+	5,  // 21: backend.Backend.TTS:output_type -> backend.Result
+	12, // 22: backend.Backend.TokenizeString:output_type -> backend.TokenizationResponse
+	14, // 23: backend.Backend.Status:output_type -> backend.StatusResponse
+	14, // [14:24] is the sub-list for method output_type
+	4,  // [4:14] is the sub-list for method input_type
+	4,  // [4:4] is the sub-list for extension type_name
+	4,  // [4:4] is the sub-list for extension extendee
+	0,  // [0:4] is the sub-list for field type_name
 }

 func init() { file_pkg_grpc_proto_backend_proto_init() }
@@ -1600,19 +1988,56 @@ func file_pkg_grpc_proto_backend_proto_init() {
 				return nil
 			}
 		}
+		file_pkg_grpc_proto_backend_proto_msgTypes[11].Exporter = func(v interface{}, i int) interface{} {
+			switch v := v.(*TokenizationResponse); i {
+			case 0:
+				return &v.state
+			case 1:
+				return &v.sizeCache
+			case 2:
+				return &v.unknownFields
+			default:
+				return nil
+			}
+		}
+		file_pkg_grpc_proto_backend_proto_msgTypes[12].Exporter = func(v interface{}, i int) interface{} {
+			switch v := v.(*MemoryUsageData); i {
+			case 0:
+				return &v.state
+			case 1:
+				return &v.sizeCache
+			case 2:
+				return &v.unknownFields
+			default:
+				return nil
+			}
+		}
+		file_pkg_grpc_proto_backend_proto_msgTypes[13].Exporter = func(v interface{}, i int) interface{} {
+			switch v := v.(*StatusResponse); i {
+			case 0:
+				return &v.state
+			case 1:
+				return &v.sizeCache
+			case 2:
+				return &v.unknownFields
+			default:
+				return nil
+			}
+		}
 	}
 	type x struct{}
 	out := protoimpl.TypeBuilder{
 		File: protoimpl.DescBuilder{
 			GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
 			RawDescriptor: file_pkg_grpc_proto_backend_proto_rawDesc,
-			NumEnums:      0,
-			NumMessages:   11,
+			NumEnums:      1,
+			NumMessages:   15,
 			NumExtensions: 0,
 			NumServices:   1,
 		},
 		GoTypes:           file_pkg_grpc_proto_backend_proto_goTypes,
 		DependencyIndexes: file_pkg_grpc_proto_backend_proto_depIdxs,
+		EnumInfos:         file_pkg_grpc_proto_backend_proto_enumTypes,
 		MessageInfos:      file_pkg_grpc_proto_backend_proto_msgTypes,
 	}.Build()
 	File_pkg_grpc_proto_backend_proto = out.File
--- a/pkg/grpc/proto/backend.proto
+++ b/pkg/grpc/proto/backend.proto
@@ -16,6 +16,8 @@ service Backend {
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
+  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
+  rpc Status(HealthMessage) returns (StatusResponse) {}
 }

 message HealthMessage {}
@@ -101,6 +103,19 @@ message ModelOptions {
  string PipelineType = 26;
  string SchedulerType = 27;
  bool CUDA = 28;
+  float CFGScale = 29;
+  bool IMG2IMG = 30;
+  string CLIPModel = 31;
+  string CLIPSubfolder = 32;
+  int32 CLIPSkip = 33;
+
+  // RWKV
+  string Tokenizer = 34;
+
+  // LLM (llama.cpp)
+  string LoraBase = 35;
+  string LoraAdapter = 36;
+  bool NoMulMatQ = 37;
 }

 message Result {
@@ -140,6 +155,11 @@ message GenerateImageRequest {
  string positive_prompt = 6;
  string negative_prompt = 7;
  string dst = 8;
+  string src = 9;
+
+  // Diffusers
+  string EnableParameters = 10;
+  int32 CLIPSkip = 11;
 }

 message TTSRequest {
@@ -147,3 +167,24 @@ message TTSRequest {
  string model = 2;
  string dst = 3;
 }
+
+message TokenizationResponse {
+  int32 length = 1;
+  repeated int32 tokens = 2;
+}
+
+message MemoryUsageData {
+  uint64 total = 1;
+  map<string, uint64> breakdown = 2;
+}
+
+message StatusResponse {
+  enum State {
+    UNINITIALIZED = 0;
+    BUSY = 1;
+    READY = 2;
+    ERROR = -1;
+  }
+  State state = 1;
+  MemoryUsageData memory = 2;
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
renovate[bot]	9e5fb29965	fix(deps): update module github.com/otiai10/openaigo to v1.6.0 (#960 ) [![Mend Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com) This PR contains the following updates: \| Package \| Type \| Update \| Change \| \|---\|---\|---\|---\| \| [github.com/otiai10/openaigo](https://togithub.com/otiai10/openaigo) \| require \| minor \| `v1.5.2` -> `v1.6.0` \| --- ### Release Notes <details> <summary>otiai10/openaigo (github.com/otiai10/openaigo)</summary> ### [`v1.6.0`](https://togithub.com/otiai10/openaigo/compare/v1.5.2...v1.6.0) [Compare Source](https://togithub.com/otiai10/openaigo/compare/v1.5.2...v1.6.0) </details> --- ### Configuration 📅 Schedule: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined). 🚦 Automerge: Disabled by config. Please merge this manually once you are satisfied. ♻ Rebasing: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 Ignore: Close this PR and you won't be reminded about this update again. --- - [ ] <!-- rebase-check -->If you want to rebase/retry this PR, check this box --- This PR has been generated by [Mend Renovate](https://www.mend.io/free-developer-tools/renovate/). View repository job log [here](https://developer.mend.io/github/go-skynet/LocalAI). <!--renovate-debug:eyJjcmVhdGVkSW5WZXIiOiIzNi41Ni4wIiwidXBkYXRlZEluVmVyIjoiMzYuNTYuMCIsInRhcmdldEJyYW5jaCI6Im1hc3RlciJ9--> Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-26 14:18:06 +02:00
renovate[bot]	7dba131d5f	fix(deps): update github.com/tmc/langchaingo digest to 1e2a401 (#948 ) [![Mend Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com) This PR contains the following updates: \| Package \| Type \| Update \| Change \| \|---\|---\|---\|---\| \| [github.com/tmc/langchaingo](https://togithub.com/tmc/langchaingo) \| require \| digest \| `fef0821` -> `1e2a401` \| --- ### Configuration 📅 Schedule: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined). 🚦 Automerge: Disabled by config. Please merge this manually once you are satisfied. ♻ Rebasing: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 Ignore: Close this PR and you won't be reminded about this update again. --- - [ ] <!-- rebase-check -->If you want to rebase/retry this PR, check this box --- This PR has been generated by [Mend Renovate](https://www.mend.io/free-developer-tools/renovate/). View repository job log [here](https://developer.mend.io/github/go-skynet/LocalAI). <!--renovate-debug:eyJjcmVhdGVkSW5WZXIiOiIzNi41Ni4wIiwidXBkYXRlZEluVmVyIjoiMzYuNTYuMCIsInRhcmdldEJyYW5jaCI6Im1hc3RlciJ9--> Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-26 14:17:48 +02:00
renovate[bot]	ce0b771217	fix(deps): update github.com/go-skynet/go-llama.cpp digest to bf63302 (#930 ) [![Mend Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com) This PR contains the following updates: \| Package \| Type \| Update \| Change \| \|---\|---\|---\|---\| \| [github.com/go-skynet/go-llama.cpp](https://togithub.com/go-skynet/go-llama.cpp) \| require \| digest \| `f03869d` -> `bf63302` \| --- ### Configuration 📅 Schedule: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined). 🚦 Automerge: Disabled by config. Please merge this manually once you are satisfied. ♻ Rebasing: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 Ignore: Close this PR and you won't be reminded about this update again. --- - [ ] <!-- rebase-check -->If you want to rebase/retry this PR, check this box --- This PR has been generated by [Mend Renovate](https://www.mend.io/free-developer-tools/renovate/). View repository job log [here](https://developer.mend.io/github/go-skynet/LocalAI). <!--renovate-debug:eyJjcmVhdGVkSW5WZXIiOiIzNi40My4yIiwidXBkYXRlZEluVmVyIjoiMzYuNTYuMCIsInRhcmdldEJyYW5jaCI6Im1hc3RlciJ9--> Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-26 14:17:22 +02:00
Ettore Di Giacinto	44bc7aa3d0	feat: Allow to load lora adapters for llama.cpp (#955 ) Description This PR fixes # Notes for Reviewers [Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin) - [ ] Yes, I signed my commits. <!-- Thank you for contributing to LocalAI! Contributing Conventions: 1. Include descriptive PR titles with [<component-name>] prepended. 2. Build and test your changes before submitting a PR. 3. Sign your commits By following the community's contribution conventions upfront, the review process will be accelerated and your PR merged more quickly. --> Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-25 21:58:46 +02:00
ci-robbot [bot]	7f0c88ed3e	⬆️ Update go-skynet/go-llama.cpp (#954 ) Bump of go-skynet/go-llama.cpp version Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-08-25 18:45:40 +02:00
ci-robbot [bot]	d15508f52c	⬆️ Update nomic-ai/gpt4all (#953 ) Bump of nomic-ai/gpt4all version Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-08-25 01:19:48 +02:00
renovate[bot]	b111423b9c	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to 27a8b02 (#947 ) [![Mend Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com) This PR contains the following updates: \| Package \| Type \| Update \| Change \| \|---\|---\|---\|---\| \| [github.com/nomic-ai/gpt4all/gpt4all-bindings/golang](https://togithub.com/nomic-ai/gpt4all) \| require \| digest \| `36f7fb5` -> `27a8b02` \| --- ### Configuration 📅 Schedule: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined). 🚦 Automerge: Disabled by config. Please merge this manually once you are satisfied. ♻ Rebasing: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 Ignore: Close this PR and you won't be reminded about this update again. --- - [ ] <!-- rebase-check -->If you want to rebase/retry this PR, check this box --- This PR has been generated by [Mend Renovate](https://www.mend.io/free-developer-tools/renovate/). View repository job log [here](https://developer.mend.io/github/go-skynet/LocalAI). <!--renovate-debug:eyJjcmVhdGVkSW5WZXIiOiIzNi41Ni4wIiwidXBkYXRlZEluVmVyIjoiMzYuNTYuMCIsInRhcmdldEJyYW5jaCI6Im1hc3RlciJ9--> Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-24 23:52:49 +02:00
renovate[bot]	215a51c4c1	fix(deps): update module github.com/onsi/ginkgo/v2 to v2.12.0 (#949 ) [![Mend Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com) This PR contains the following updates: \| Package \| Type \| Update \| Change \| \|---\|---\|---\|---\| \| [github.com/onsi/ginkgo/v2](https://togithub.com/onsi/ginkgo) \| require \| minor \| `v2.11.0` -> `v2.12.0` \| --- ### Release Notes <details> <summary>onsi/ginkgo (github.com/onsi/ginkgo/v2)</summary> ### [`v2.12.0`](https://togithub.com/onsi/ginkgo/releases/tag/v2.12.0) [Compare Source](https://togithub.com/onsi/ginkgo/compare/v2.11.0...v2.12.0) #### 2.12.0 ##### Features - feat: allow MustPassRepeatedly decorator to be set at suite level ([#1266](https://togithub.com/onsi/ginkgo/issues/1266)) \[[`05de518`](https://togithub.com/onsi/ginkgo/commit/05de518)] ##### Fixes - fix-errors-in-readme ([#1244](https://togithub.com/onsi/ginkgo/issues/1244)) \[[`27c2f5d`](https://togithub.com/onsi/ginkgo/commit/27c2f5d)] ##### Maintenance Various chores/dependency bumps. </details> --- ### Configuration 📅 Schedule: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined). 🚦 Automerge: Disabled by config. Please merge this manually once you are satisfied. ♻ Rebasing: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 Ignore: Close this PR and you won't be reminded about this update again. --- - [ ] <!-- rebase-check -->If you want to rebase/retry this PR, check this box --- This PR has been generated by [Mend Renovate](https://www.mend.io/free-developer-tools/renovate/). View repository job log [here](https://developer.mend.io/github/go-skynet/LocalAI). <!--renovate-debug:eyJjcmVhdGVkSW5WZXIiOiIzNi41Ni4wIiwidXBkYXRlZEluVmVyIjoiMzYuNTYuMCIsInRhcmdldEJyYW5jaCI6Im1hc3RlciJ9--> Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-24 18:58:06 +02:00
Ettore Di Giacinto	1120847f72	feat: bump llama.cpp, add gguf support (#943 ) Description This PR syncs up the `llama` backend to use `gguf` (https://github.com/go-skynet/go-llama.cpp/pull/180). It also adds `llama-stable` to the targets so we can still load ggml. It adapts the current tests to use the `llama-backend` for ggml and uses a `gguf` model to run tests on the new backend. In order to consume the new version of go-llama.cpp, it also bump go to 1.21 (images, pipelines, etc) --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-24 01:18:58 +02:00
Dave	704323b805	initial draft of an importable Insomnia profile for developers (#942 ) This is a starting point for developers to easily import a collection of requests to hit LocalAI. Insomnia was chosen as it's open source, has a graphical user interface for users desiring that, and has the ability to easily export requests as cURL commands for our documentation site.	2023-08-23 18:39:27 +02:00
Dave	10b0e13882	feat: backend monitor shutdown endpoint, process based (#938 ) This PR adds a new endpoint to the backend monitor section `/backend/shutdown` which terminates the grpc process for the related model.	2023-08-23 18:38:37 +02:00
Dave	901f0709c5	Feat: rwkv improvements: (#937 )	2023-08-22 18:48:06 +02:00
Gruber	0d6165e481	Example: Continue (dev) (#940 )	2023-08-22 18:46:45 +02:00
renovate[bot]	6583eed6b2	fix(deps): update module github.com/google/uuid to v1.3.1 (#936 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-22 10:51:04 +02:00
Dave	a9ca70ad4a	infra: add setup-go@4, test against 1.20.x (go.mod) and stable (1.21) (#935 )	2023-08-21 22:16:47 +02:00
Ettore Di Giacinto	ab5b75eb01	feat: add llama-stable backend (#932 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-20 16:35:42 +02:00
Ettore Di Giacinto	cc060a283d	fix: drop racy code, refactor and group API schema (#931 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-20 14:04:45 +02:00
Ettore Di Giacinto	28db83e17b	fix: disable usage by default (still experimental) (#929 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-19 16:15:22 +02:00
ci-robbot [bot]	dbb1f86455	⬆️ Update nomic-ai/gpt4all (#911 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-08-19 10:17:41 +02:00
renovate[bot]	02f7c555af	fix(deps): update github.com/tmc/langchaingo digest to fef0821 (#922 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-19 01:50:04 +02:00
renovate[bot]	d982b38f76	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to 36f7fb5 (#908 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-19 01:49:51 +02:00
renovate[bot]	bc2e4b952e	fix(deps): update module github.com/shirou/gopsutil/v3 to v3.23.7 (#924 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-19 01:49:43 +02:00
Ettore Di Giacinto	afdc0ebfd7	feat: add --single-active-backend to allow only one backend active at the time (#925 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-19 01:49:33 +02:00
Ettore Di Giacinto	1079b18ff7	feat(diffusers): be consistent with pipelines, support also depthimg2img (#926 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-18 22:06:24 +02:00
Dave	8cb1061c11	Usage Features (#863 )	2023-08-18 21:23:14 +02:00
Ettore Di Giacinto	2bacd0180d	feat(diffusers): add img2img and clip_skip, support more kernels schedulers (#906 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-17 23:38:59 +02:00
renovate[bot]	ddf9bc2335	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to a630935 (#898 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-16 22:25:28 +02:00
renovate[bot]	a1afd940e3	fix(deps): update github.com/go-skynet/go-llama.cpp digest to f03869d (#901 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-16 22:25:14 +02:00
renovate[bot]	8bb76201c0	fix(deps): update github.com/tmc/langchaingo digest to eb0cbd3 (#902 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-16 22:25:02 +02:00
Ettore Di Giacinto	ede71d398c	feat(diffusers): overcome prompt limit (#904 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-16 22:24:52 +02:00
ci-robbot [bot]	0c73a637f1	⬆️ Update nomic-ai/gpt4all (#899 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-08-16 01:11:54 +02:00
Ettore Di Giacinto	37700f2d98	feat(diffusers): add DPMSolverMultistepScheduler++, DPMSolverMultistepSchedulerSDE++, guidance_scale (#903 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-16 01:11:42 +02:00
Ettore Di Giacinto	0ec695f9e4	feat: make initializer accept gRPC delay times (#900 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-16 01:11:32 +02:00
renovate[bot]	7ffd21dbc8	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 18f25c2 (#894 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-15 09:25:40 +02:00
renovate[bot]	48b3920656	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to 4e55940 (#893 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-15 09:25:27 +02:00
ci-robbot [bot]	63d91af555	⬆️ Update nomic-ai/gpt4all (#878 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-08-15 09:25:10 +02:00
Ettore Di Giacinto	a96c3bc885	feat(diffusers): various enhancements (#895 )	2023-08-14 23:12:00 +02:00
Ettore Di Giacinto	77e1ae3d70	feat(Makefile): allow to restrict backend builds (#890 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-13 20:04:08 +02:00
renovate[bot]	9cc8d90865	fix(deps): update module github.com/sashabaranov/go-openai to v1.14.2 (#884 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-12 22:41:12 +02:00