feat: bert.cpp token embeddings (#241)

tests: enable whisper (#239)
Add support for gptneox/replit (#238)
2026-02-04 03:32:40 -05:00 · 2023-05-12 17:16:49 +02:00 · 2023-05-12 14:10:18 +02:00 · 2023-05-12 11:36:35 +02:00 · 2023-05-12 10:04:20 +02:00 · 2023-05-11 23:43:55 +02:00
12 changed files with 166 additions and 28 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential ffmpeg
      - name: Test
        run: |
          make test
@@ -38,7 +38,7 @@ jobs:
      - name: Dependencies
        run: |
          brew update
-          brew install sdl2
+          brew install sdl2 ffmpeg
      - name: Test
        run: |
          make test
--- a/Makefile
+++ b/Makefile
@@ -3,14 +3,14 @@ GOTEST=$(GOCMD) test
 GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

-GOLLAMA_VERSION?=c03e8adbc45c866e0f6d876af1887d6b01d57eb4
+GOLLAMA_VERSION?=70593fccbe4b01dedaab805b0f25cb58192c7b38
 GPT4ALL_REPO?=https://github.com/go-skynet/gpt4all
 GPT4ALL_VERSION?=3657f9417e17edf378c27d0a9274a1bf41caa914
-GOGPT2_VERSION?=6a10572
+GOGPT2_VERSION?=92421a8cf61ed6e03babd9067af292b094cb1307
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=07166da10cb2a9e8854395a4f210464dcea76e47
 WHISPER_CPP_VERSION?=bf2449dfae35a46b2cd92ab22661ce81a48d4993
-BERT_VERSION?=ec771ec715576ac050263bb7bb74bfd616a5ba13
+BERT_VERSION?=ac22f8f74aec5e31bc46242c17e7d511f127856b
 BLOOMZ_VERSION?=e9366e82abdfe70565644fbfae9651976714efd1


@@ -179,12 +179,16 @@ run: prepare ## run local-ai

 test-models/testmodel:
 	mkdir test-models
+	mkdir test-dir
 	wget https://huggingface.co/concedo/cerebras-111M-ggml/resolve/main/cerberas-111m-q4_0.bin -O test-models/testmodel
+	wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
+	wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O test-models/bert
+	wget https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
 	cp tests/fixtures/* test-models

 test: prepare test-models/testmodel
 	cp tests/fixtures/* test-models
-	@C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo -v -r ./api
+	@C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} TEST_DIR=$(abspath ./)/test-dir/ CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo -v -r ./api

 ## Help:
 help: ## Show this help.
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ It should also be compatible with StableLM and GPTNeoX ggml models (untested).
 Depending on the model you are attempting to run might need more RAM or CPU resources. Check out also [here](https://github.com/ggerganov/llama.cpp#memorydisk-requirements) for `ggml` based backends. `rwkv` is less expensive on resources.


-### Feature support matrix
+### Model compatibility table

 <details>

@@ -106,6 +106,8 @@ Depending on the model you are attempting to run might need more RAM or CPU reso
 | dolly           | Dolly                 | yes                      | no                  | no                                | no                   | https://github.com/ggerganov/ggml          | https://github.com/go-skynet/go-gpt2.cpp  |
 | redpajama       | RedPajama             | yes                      | no                  | no                                | no                   | https://github.com/ggerganov/ggml          | https://github.com/go-skynet/go-gpt2.cpp  |
 | stableLM        | StableLM GPT/NeoX     | yes                      | no                  | no                                | no                   | https://github.com/ggerganov/ggml          | https://github.com/go-skynet/go-gpt2.cpp  |
+| replit          | Replit                | yes                      | no                  | no                                | no                   | https://github.com/ggerganov/ggml          | https://github.com/go-skynet/go-gpt2.cpp  |
+| gptneox         | GPT NeoX              | yes                      | no                  | no                                | no                   | https://github.com/ggerganov/ggml          | https://github.com/go-skynet/go-gpt2.cpp  |
 | starcoder       | Starcoder             | yes                      | no                  | no                                | no                   | https://github.com/ggerganov/ggml          | https://github.com/go-skynet/go-gpt2.cpp  |
 | bloomz          | Bloom                 | yes                      | no                  | no                                | no                   | https://github.com/NouamaneTazi/bloomz.cpp | https://github.com/go-skynet/bloomz.cpp   |
 | rwkv            | RWKV                  | yes                      | no                  | no                                | yes                  | https://github.com/saharNooby/rwkv.cpp     | https://github.com/donomii/go-rwkv.cpp    |
--- a/api/api.go
+++ b/api/api.go
@@ -12,7 +12,7 @@ import (
 	"github.com/rs/zerolog/log"
 )

-func App(configFile string, loader *model.ModelLoader, threads, ctxSize int, f16 bool, debug, disableMessage bool) *fiber.App {
+func App(configFile string, loader *model.ModelLoader, uploadLimitMB, threads, ctxSize int, f16 bool, debug, disableMessage bool) *fiber.App {
 	zerolog.SetGlobalLevel(zerolog.InfoLevel)
 	if debug {
 		zerolog.SetGlobalLevel(zerolog.DebugLevel)
@@ -20,6 +20,7 @@ func App(configFile string, loader *model.ModelLoader, threads, ctxSize int, f16

 	// Return errors as JSON responses
 	app := fiber.New(fiber.Config{
+		BodyLimit:             uploadLimitMB * 1024 * 1024, // upload limit converted from MB to bytes (replaces fiber's 4MB default)
 		DisableStartupMessage: disableMessage,
 		// Override default error handler
 		ErrorHandler: func(ctx *fiber.Ctx, err error) error {
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -3,6 +3,8 @@ package api_test
 import (
 	"context"
 	"os"
+	"path/filepath"
+	"runtime"

 	. "github.com/go-skynet/LocalAI/api"
 	"github.com/go-skynet/LocalAI/pkg/model"
@@ -23,7 +25,7 @@ var _ = Describe("API test", func() {
 	Context("API query", func() {
 		BeforeEach(func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
-			app = App("", modelLoader, 1, 512, false, true, true)
+			app = App("", modelLoader, 15, 1, 512, false, true, true)
 			go app.Listen("127.0.0.1:9090")

 			defaultConfig := openai.DefaultConfig("")
@@ -45,8 +47,7 @@ var _ = Describe("API test", func() {
 		It("returns the models list", func() {
 			models, err := client.ListModels(context.TODO())
 			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(3))
-			Expect(models.Models[0].ID).To(Equal("testmodel"))
+			Expect(len(models.Models)).To(Equal(7))
 		})
 		It("can generate completions", func() {
 			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: "abcdedfghikl"})
@@ -79,15 +80,55 @@ var _ = Describe("API test", func() {
 		It("returns errors", func() {
 			_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
 			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 10 errors occurred:"))
+			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 12 errors occurred:"))
+		})
+		It("transcribes audio", func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+			resp, err := client.CreateTranscription(
+				context.Background(),
+				openai.AudioRequest{
+					Model:    openai.Whisper1,
+					FilePath: filepath.Join(os.Getenv("TEST_DIR"), "audio.wav"),
+				},
+			)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(resp.Text).To(ContainSubstring("This is the Micro Machine Man presenting"))
 		})

+		It("calculate embeddings", func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+			resp, err := client.CreateEmbeddings(
+				context.Background(),
+				openai.EmbeddingRequest{
+					Model: openai.AdaEmbeddingV2,
+					Input: []string{"sun", "cat"},
+				},
+			)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
+			Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))
+
+			sunEmbedding := resp.Data[0].Embedding
+			resp2, err := client.CreateEmbeddings(
+				context.Background(),
+				openai.EmbeddingRequest{
+					Model: openai.AdaEmbeddingV2,
+					Input: []string{"sun"},
+				},
+			)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(resp2.Data[0].Embedding).To(Equal(sunEmbedding))
+		})
 	})

 	Context("Config file", func() {
 		BeforeEach(func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
-			app = App(os.Getenv("CONFIG_FILE"), modelLoader, 1, 512, false, true, true)
+			app = App(os.Getenv("CONFIG_FILE"), modelLoader, 5, 1, 512, false, true, true)
 			go app.Listen("127.0.0.1:9090")

 			defaultConfig := openai.DefaultConfig("")
@@ -108,8 +149,7 @@ var _ = Describe("API test", func() {

 			models, err := client.ListModels(context.TODO())
 			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(5))
-			Expect(models.Models[0].ID).To(Equal("testmodel"))
+			Expect(len(models.Models)).To(Equal(9))
 		})
 		It("can generate chat completions from config file", func() {
 			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
@@ -134,5 +174,6 @@ var _ = Describe("API test", func() {
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
 		})
+
 	})
 })
--- a/api/openai.go
+++ b/api/openai.go
@@ -409,14 +409,13 @@ func transcriptEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader,
 		// retrieve the file data from the request
 		file, err := c.FormFile("file")
 		if err != nil {
-			return c.Status(http.StatusBadRequest).JSON(fiber.Map{"error": err.Error()})
+			return err
 		}
 		f, err := file.Open()
 		if err != nil {
-			return c.Status(http.StatusBadRequest).JSON(fiber.Map{"error": err.Error()})
+			return err
 		}
 		defer f.Close()
-		log.Debug().Msgf("Audio file: %+v", file)

 		dir, err := os.MkdirTemp("", "whisper")

@@ -428,26 +427,33 @@ func transcriptEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader,
 		dst := filepath.Join(dir, path.Base(file.Filename))
 		dstFile, err := os.Create(dst)
 		if err != nil {
-			return c.Status(http.StatusBadRequest).JSON(fiber.Map{"error": err.Error()})
+			return err
 		}

 		if _, err := io.Copy(dstFile, f); err != nil {
-			log.Debug().Msgf("Audio file %+v - %+v - err %+v", file.Filename, dst, err)
+			log.Debug().Msgf("Audio file copying error %+v - %+v - err %+v", file.Filename, dst, err)
 			return err
 		}

 		log.Debug().Msgf("Audio file copied to: %+v", dst)

-		whisperModel, err := loader.BackendLoader("whisper", config.Model, []llama.ModelOption{}, uint32(config.Threads))
+		whisperModel, err := loader.BackendLoader(model.WhisperBackend, config.Model, []llama.ModelOption{}, uint32(config.Threads))
 		if err != nil {
-			return c.Status(http.StatusBadRequest).JSON(fiber.Map{"error": err.Error()})
+			return err
 		}

-		w := whisperModel.(whisper.Model)
+		if whisperModel == nil {
+			return fmt.Errorf("could not load whisper model")
+		}

-		tr, err := whisperutil.Transcript(w, dst, input.Language)
+		w, ok := whisperModel.(whisper.Model)
+		if !ok {
+			return fmt.Errorf("loader returned non-whisper object")
+		}
+
+		tr, err := whisperutil.Transcript(w, dst, input.Language, uint(config.Threads))
 		if err != nil {
-			return c.Status(http.StatusBadRequest).JSON(fiber.Map{"error": err.Error()})
+			return err
 		}

 		log.Debug().Msgf("Trascribed: %+v", tr)
--- a/api/prediction.go
+++ b/api/prediction.go
@@ -68,7 +68,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c Config)
 	case *bert.Bert:
 		fn = func() ([]float32, error) {
 			if len(tokens) > 0 {
-				return nil, fmt.Errorf("embeddings endpoint for this model supports only string")
+				return model.TokenEmbeddings(tokens, bert.SetThreads(c.Threads))
 			}
 			return model.Embeddings(s, bert.SetThreads(c.Threads))
 		}
@@ -199,6 +199,54 @@ func ModelInference(s string, loader *model.ModelLoader, c Config, tokenCallback

 			return response, nil
 		}
+	case *gpt2.GPTNeoX:
+		fn = func() (string, error) {
+			// Generate the prediction using the language model
+			predictOptions := []gpt2.PredictOption{
+				gpt2.SetTemperature(c.Temperature),
+				gpt2.SetTopP(c.TopP),
+				gpt2.SetTopK(c.TopK),
+				gpt2.SetTokens(c.Maxtokens),
+				gpt2.SetThreads(c.Threads),
+			}
+
+			if c.Batch != 0 {
+				predictOptions = append(predictOptions, gpt2.SetBatch(c.Batch))
+			}
+
+			if c.Seed != 0 {
+				predictOptions = append(predictOptions, gpt2.SetSeed(c.Seed))
+			}
+
+			return model.Predict(
+				s,
+				predictOptions...,
+			)
+		}
+	case *gpt2.Replit:
+		fn = func() (string, error) {
+			// Generate the prediction using the language model
+			predictOptions := []gpt2.PredictOption{
+				gpt2.SetTemperature(c.Temperature),
+				gpt2.SetTopP(c.TopP),
+				gpt2.SetTopK(c.TopK),
+				gpt2.SetTokens(c.Maxtokens),
+				gpt2.SetThreads(c.Threads),
+			}
+
+			if c.Batch != 0 {
+				predictOptions = append(predictOptions, gpt2.SetBatch(c.Batch))
+			}
+
+			if c.Seed != 0 {
+				predictOptions = append(predictOptions, gpt2.SetSeed(c.Seed))
+			}
+
+			return model.Predict(
+				s,
+				predictOptions...,
+			)
+		}
 	case *gpt2.Starcoder:
 		fn = func() (string, error) {
 			// Generate the prediction using the language model
--- a/main.go
+++ b/main.go
@@ -62,6 +62,12 @@ func main() {
 				EnvVars:     []string{"CONTEXT_SIZE"},
 				Value:       512,
 			},
+			&cli.IntFlag{
+				Name:        "upload-limit",
+				DefaultText: "Default upload-limit. MB",
+				EnvVars:     []string{"UPLOAD_LIMIT"},
+				Value:       15,
+			},
 		},
 		Description: `
 LocalAI is a drop-in replacement OpenAI API which runs inference locally.
@@ -81,7 +87,7 @@ It uses llama.cpp, ggml and gpt4all as backend with golang c bindings.
 		Copyright: "go-skynet authors",
 		Action: func(ctx *cli.Context) error {
 			fmt.Printf("Starting LocalAI using %d threads, with models path: %s\n", ctx.Int("threads"), ctx.String("models-path"))
-			return api.App(ctx.String("config-file"), model.NewModelLoader(ctx.String("models-path")), ctx.Int("threads"), ctx.Int("context-size"), ctx.Bool("f16"), ctx.Bool("debug"), false).Listen(ctx.String("address"))
+			return api.App(ctx.String("config-file"), model.NewModelLoader(ctx.String("models-path")), ctx.Int("upload-limit"), ctx.Int("threads"), ctx.Int("context-size"), ctx.Bool("f16"), ctx.Bool("debug"), false).Listen(ctx.String("address"))
 		},
 	}

--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -24,6 +24,8 @@ const (
 	StableLMBackend       = "stablelm"
 	DollyBackend          = "dolly"
 	RedPajamaBackend      = "redpajama"
+	GPTNeoXBackend        = "gptneox"
+	ReplitBackend         = "replit"
 	Gpt2Backend           = "gpt2"
 	Gpt4AllLlamaBackend   = "gpt4all-llama"
 	Gpt4AllMptBackend     = "gpt4all-mpt"
@@ -45,6 +47,8 @@ var backends []string = []string{
 	StableLMBackend,
 	DollyBackend,
 	RedPajamaBackend,
+	GPTNeoXBackend,
+	ReplitBackend,
 	BertEmbeddingsBackend,
 	StarcoderBackend,
 }
@@ -61,6 +65,14 @@ var dolly = func(modelFile string) (interface{}, error) {
 	return gpt2.NewDolly(modelFile)
 }

+var gptNeoX = func(modelFile string) (interface{}, error) {
+	return gpt2.NewGPTNeoX(modelFile)
+}
+
+var replit = func(modelFile string) (interface{}, error) {
+	return gpt2.NewReplit(modelFile)
+}
+
 var stableLM = func(modelFile string) (interface{}, error) {
 	return gpt2.NewStableLM(modelFile)
 }
@@ -116,6 +128,10 @@ func (ml *ModelLoader) BackendLoader(backendString string, modelFile string, lla
 		return ml.LoadModel(modelFile, redPajama)
 	case Gpt2Backend:
 		return ml.LoadModel(modelFile, gpt2LM)
+	case GPTNeoXBackend:
+		return ml.LoadModel(modelFile, gptNeoX)
+	case ReplitBackend:
+		return ml.LoadModel(modelFile, replit)
 	case StarcoderBackend:
 		return ml.LoadModel(modelFile, starCoder)
 	case Gpt4AllLlamaBackend:
--- a/pkg/whisper/whisper.go
+++ b/pkg/whisper/whisper.go
@@ -28,7 +28,7 @@ func audioToWav(src, dst string) error {
 	return nil
 }

-func Transcript(model whisper.Model, audiopath, language string) (string, error) {
+func Transcript(model whisper.Model, audiopath, language string, threads uint) (string, error) {

 	dir, err := os.MkdirTemp("", "whisper")
 	if err != nil {
@@ -65,8 +65,12 @@ func Transcript(model whisper.Model, audiopath, language string) (string, error)

 	}

+	context.SetThreads(threads)
+
 	if language != "" {
 		context.SetLanguage(language)
+	} else {
+		context.SetLanguage("auto")
 	}

 	if err := context.Process(data, nil); err != nil {
--- a/tests/fixtures/embeddings.yaml
+++ b/tests/fixtures/embeddings.yaml
@@ -0,0 +1,6 @@
+name: text-embedding-ada-002
+parameters:
+  model: bert
+threads: 14
+backend: bert-embeddings
+embeddings: true
--- a/tests/fixtures/whisper.yaml
+++ b/tests/fixtures/whisper.yaml
@@ -0,0 +1,4 @@
+name: whisper-1
+backend: whisper
+parameters:
+  model: whisper-en
Author	SHA1	Message	Date
Ettore Di Giacinto	2488c445b6	feat: bert.cpp token embeddings (#241 )	2023-05-12 17:16:49 +02:00
Ettore Di Giacinto	b4241d0a0d	tests: enable whisper (#239 )	2023-05-12 14:10:18 +02:00
Ettore Di Giacinto	8250391e49	Add support for gptneox/replit (#238 )	2023-05-12 11:36:35 +02:00
Ettore Di Giacinto	fd1df4e971	whisper: add tests and allow to set upload size (#237 )	2023-05-12 10:04:20 +02:00
ci-robbot [bot]	5115b2faa3	⬆️ Update go-skynet/go-llama.cpp (#219 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-05-11 23:43:55 +02:00
ci-robbot [bot]	93e82a8bf4	⬆️ Update go-skynet/go-gpt2.cpp (#220 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-05-11 23:43:44 +02:00