Compare commits

7 Commits
v1.0 ... v1.2.0

Author SHA1 Message Date
Ettore Di Giacinto
ed954d66c3 Do not take all CPU by default (#50) 2023-04-21 00:55:19 +02:00
Ettore Di Giacinto
f816dfae65 Add support for stablelm (#48)
Signed-off-by: mudler <mudler@mocaccino.org>
2023-04-21 00:06:55 +02:00
Ettore Di Giacinto
142bcd66ca Cleanup makefile, fix dep versions (#46)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-20 19:49:06 +02:00
Ettore Di Giacinto
1c4fbaae20 Add support for cerebras (#45)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-20 19:33:36 +02:00
Ettore Di Giacinto
d517a54e28 Major API enhancements (#44) 2023-04-20 18:33:02 +02:00
Tyler Gillson
c905512bb0 Update example K8s manifests (#40) 2023-04-20 18:31:11 +02:00
Ettore Di Giacinto
1254951fab Add logo (#37)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-19 19:03:12 +02:00
11 changed files with 494 additions and 138 deletions

.env

@@ -1,3 +1,4 @@
THREADS=14
CONTEXT_SIZE=512
MODELS_PATH=/models
# DEBUG=true

Makefile

@@ -3,6 +3,8 @@ GOTEST=$(GOCMD) test
GOVET=$(GOCMD) vet
BINARY_NAME=local-ai
GOLLAMA_VERSION?=llama.cpp-5ecff35
GOGPT4ALLJ_VERSION?=1f548782d80d48b9a0fac33aae6f129358787bc0
GOGPT2_VERSION?=1c24f5b86ac428cd5e81dae1f1427b1463bd2b06
GREEN := $(shell tput -Txterm setaf 2)
YELLOW := $(shell tput -Txterm setaf 3)
@@ -17,19 +19,23 @@ all: help
## Build:
build: prepare ## Build the project
C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j LIBRARY_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j $(GOCMD) build -o $(BINARY_NAME) ./
C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2.cpp LIBRARY_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2.cpp $(GOCMD) build -o $(BINARY_NAME) ./
buildgeneric: prepare-generic ## Build the project
C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j LIBRARY_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j $(GOCMD) build -o $(BINARY_NAME) ./
C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2.cpp LIBRARY_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2.cpp $(GOCMD) build -o $(BINARY_NAME) ./
## GPT4ALL-J
go-gpt4all-j:
git clone --recurse-submodules https://github.com/go-skynet/go-gpt4all-j.cpp go-gpt4all-j
git clone --recurse-submodules https://github.com/go-skynet/go-gpt4all-j.cpp go-gpt4all-j && cd go-gpt4all-j && git checkout -b build $(GOGPT4ALLJ_VERSION)
# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
@find ./go-gpt4all-j -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/void replace/void json_gptj_replace/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/::replace/::json_gptj_replace/g' {} +
go-gpt4all-j/libgptj.a: go-gpt4all-j
$(MAKE) -C go-gpt4all-j libgptj.a
@@ -37,6 +43,23 @@ go-gpt4all-j/libgptj.a: go-gpt4all-j
go-gpt4all-j/libgptj.a-generic: go-gpt4all-j
$(MAKE) -C go-gpt4all-j generic-libgptj.a
# CEREBRAS GPT
go-gpt2.cpp:
git clone --recurse-submodules https://github.com/go-skynet/go-gpt2.cpp go-gpt2.cpp && cd go-gpt2.cpp && git checkout -b build $(GOGPT2_VERSION)
# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
@find ./go-gpt2.cpp -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gpt2_/g' {} +
go-gpt2.cpp/libgpt2.a: go-gpt2.cpp
$(MAKE) -C go-gpt2.cpp libgpt2.a
go-gpt2.cpp/libgpt2.a-generic: go-gpt2.cpp
$(MAKE) -C go-gpt2.cpp generic-libgpt2.a
go-llama:
git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
$(MAKE) -C go-llama libbinding.a
@@ -45,17 +68,19 @@ go-llama-generic:
git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
$(MAKE) -C go-llama generic-libbinding.a
prepare: go-llama go-gpt4all-j/libgptj.a
replace:
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt2.cpp=$(shell pwd)/go-gpt2.cpp
prepare: go-llama go-gpt4all-j/libgptj.a go-gpt2.cpp/libgpt2.a replace
prepare-generic: go-llama-generic go-gpt4all-j/libgptj.a-generic go-gpt2.cpp/libgpt2.a-generic replace
prepare-generic: go-llama-generic go-gpt4all-j/libgptj.a-generic
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
clean: ## Remove build related file
rm -fr ./go-llama
rm -rf ./go-gpt4all-j
rm -rf ./go-gpt2.cpp
rm -rf $(BINARY_NAME)
## Run:
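
For context: the new `replace` target above rewires the module graph so that builds link against the locally cloned, symbol-prefixed bindings instead of the upstream modules. A minimal sketch of the `go.mod` directives it ends up producing (paths shown relative for brevity; `go mod edit -replace` records the absolute `$(shell pwd)` paths):

```
replace github.com/go-skynet/go-llama.cpp => ./go-llama

replace github.com/go-skynet/go-gpt4all-j.cpp => ./go-gpt4all-j

replace github.com/go-skynet/go-gpt2.cpp => ./go-gpt2.cpp
```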

README.md

@@ -1,4 +1,9 @@
## :camel: LocalAI
<h1 align="center">
<br>
<img height="300" src="https://user-images.githubusercontent.com/2420543/233147843-88697415-6dbf-4368-a862-ab217f9f7342.jpeg"> <br>
LocalAI
<br>
</h1>
> :warning: This project has been renamed from `llama-cli` to `LocalAI` to reflect the fact that we are focusing on a fast drop-in OpenAI API rather than on the CLI interface. We think there are already many projects that can be used as a CLI interface, for instance [llama.cpp](https://github.com/ggerganov/llama.cpp) and [gpt4all](https://github.com/nomic-ai/gpt4all). If you were using `llama-cli` for CLI interactions and want to keep using it, use older versions or please open up an issue - contributions are welcome!
@@ -7,13 +12,22 @@ LocalAI is a straightforward, drop-in replacement API compatible with OpenAI for
- OpenAI compatible API
- Supports multiple-models
- Once loaded the first time, it keeps models loaded in memory for faster inference
- Provides a simple command line interface that allows text generation directly from the terminal
- Support for prompt templates
- Doesn't shell out, but uses C bindings for faster inference and better performance. Uses [go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) and [go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp).
## Model compatibility
It is compatible with the models supported by [llama.cpp](https://github.com/ggerganov/llama.cpp) and also [GPT4ALL-J](https://github.com/nomic-ai/gpt4all).
It is compatible with the models supported by [llama.cpp](https://github.com/ggerganov/llama.cpp), and also supports [GPT4ALL-J](https://github.com/nomic-ai/gpt4all) and [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml).
Tested with:
- Vicuna
- Alpaca
- [GPT4ALL](https://github.com/nomic-ai/gpt4all)
- [GPT4ALL-J](https://gpt4all.io/models/ggml-gpt4all-j.bin)
- Koala
- [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml)
It should also be compatible with StableLM and GPTNeoX ggml models (untested).
Note: You might need to convert older models to the new format; see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for an example of running `gpt4all`.
@@ -77,7 +91,7 @@ See the [prompt-templates](https://github.com/go-skynet/LocalAI/tree/master/prom
Example of starting the API with `docker`:
```bash
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-api:latest --models-path /path/to/models --context-size 700 --threads 4
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:latest --models-path /path/to/models --context-size 700 --threads 4
```
And you'll see:
@@ -92,8 +106,6 @@ And you'll see:
└───────────────────────────────────────────────────┘
```
Note: Models have to end up with `.bin` so can be listed by the `/models` endpoint.
You can control the API server options with command line arguments:
```
@@ -105,7 +117,7 @@ The API takes the following parameters:
| Parameter | Environment Variable | Default Value | Description |
| ------------ | -------------------- | ------------- | -------------------------------------- |
| models-path | MODELS_PATH | | The path where you have models (ending with `.bin`). |
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
| threads | THREADS | Number of physical cores | The number of threads to use for text generation. |
| address | ADDRESS | :8080 | The address and port to listen on. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
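
For reference, a minimal Go client for the completion endpoint described above. This is a sketch, not project code: the model file name is a placeholder for any model under `models-path`, and the request fields follow the OpenAI completion schema used by the handler.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

func main() {
	// OpenAI-style completion request; the model name is a placeholder
	// for a file that exists inside MODELS_PATH.
	body, _ := json.Marshal(map[string]interface{}{
		"model":       "ggml-gpt4all-j.bin",
		"prompt":      "What is an alpaca?",
		"temperature": 0.7,
	})

	resp, err := http.Post("http://localhost:8080/v1/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer resp.Body.Close()

	// The response carries the echoed model name and the choices array.
	var out map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println(out)
}
```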

api/api.go

@@ -1,16 +1,20 @@
package api
import (
"encoding/json"
"errors"
"fmt"
"strings"
"sync"
model "github.com/go-skynet/LocalAI/pkg/model"
gpt2 "github.com/go-skynet/go-gpt2.cpp"
gptj "github.com/go-skynet/go-gpt4all-j.cpp"
llama "github.com/go-skynet/go-llama.cpp"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
"github.com/gofiber/fiber/v2/middleware/recover"
"github.com/rs/zerolog/log"
)
type OpenAIResponse struct {
@@ -65,56 +69,72 @@ type OpenAIRequest struct {
}
// https://platform.openai.com/docs/api-reference/completions
func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16 bool, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16 bool, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
var err error
var model *llama.LLama
var gptModel *gptj.GPTJ
var gpt2Model *gpt2.GPT2
var stableLMModel *gpt2.StableLM
input := new(OpenAIRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
modelFile := input.Model
received, _ := json.Marshal(input)
if input.Model == "" {
log.Debug().Msgf("Request received: %s", string(received))
// Set model from bearer token, if available
bearer := strings.TrimLeft(c.Get("authorization"), "Bearer ")
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
if modelFile == "" && !bearerExists {
return fmt.Errorf("no model specified")
} else {
// Try to load the model with both
var llamaerr error
llamaOpts := []llama.ModelOption{}
if ctx != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(ctx))
}
if f16 {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
}
model, llamaerr = loader.LoadLLaMAModel(input.Model, llamaOpts...)
if llamaerr != nil {
gptModel, err = loader.LoadGPTJModel(input.Model)
if err != nil {
return fmt.Errorf("llama: %s gpt: %s", llamaerr.Error(), err.Error()) // llama failed first, so we want to catch both errors
if bearerExists { // model specified in bearer token takes precedence
log.Debug().Msgf("Using model from bearer token: %s", bearer)
modelFile = bearer
}
// Try to load the model with all the available backends
var llamaerr, gpt2err, gptjerr, stableerr error
llamaOpts := []llama.ModelOption{}
if ctx != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(ctx))
}
if f16 {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
// TODO: this is ugly, better identifying the model somehow! however, it is a good stab for a first implementation..
model, llamaerr = loader.LoadLLaMAModel(modelFile, llamaOpts...)
if llamaerr != nil {
gptModel, gptjerr = loader.LoadGPTJModel(modelFile)
if gptjerr != nil {
gpt2Model, gpt2err = loader.LoadGPT2Model(modelFile)
if gpt2err != nil {
stableLMModel, stableerr = loader.LoadStableLMModel(modelFile)
if stableerr != nil {
return fmt.Errorf("llama: %s gpt: %s gpt2: %s stableLM: %s", llamaerr.Error(), gptjerr.Error(), gpt2err.Error(), stableerr.Error()) // llama failed first, so we want to catch both errors
}
}
}
}
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
if input.Model != "" {
mutexMap.Lock()
l, ok := mutexes[input.Model]
if !ok {
m := &sync.Mutex{}
mutexes[input.Model] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
} else {
defaultMutex.Lock()
defer defaultMutex.Unlock()
mutexMap.Lock()
l, ok := mutexes[modelFile]
if !ok {
m := &sync.Mutex{}
mutexes[modelFile] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
// Set the parameters for the language model prediction
topP := input.TopP
@@ -139,6 +159,7 @@ func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16
predInput := input.Prompt
if chat {
mess := []string{}
// TODO: encode roles
for _, i := range input.Messages {
mess = append(mess, i.Content)
}
@@ -147,11 +168,12 @@ func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16
}
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
templatedInput, err := loader.TemplatePrefix(input.Model, struct {
templatedInput, err := loader.TemplatePrefix(modelFile, struct {
Input string
}{Input: predInput})
if err == nil {
predInput = templatedInput
log.Debug().Msgf("Template found, input modified to: %s", predInput)
}
result := []Choice{}
@@ -164,6 +186,54 @@ func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16
var predFunc func() (string, error)
switch {
case stableLMModel != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
predictOptions := []gpt2.PredictOption{
gpt2.SetTemperature(temperature),
gpt2.SetTopP(topP),
gpt2.SetTopK(topK),
gpt2.SetTokens(tokens),
gpt2.SetThreads(threads),
}
if input.Batch != 0 {
predictOptions = append(predictOptions, gpt2.SetBatch(input.Batch))
}
if input.Seed != 0 {
predictOptions = append(predictOptions, gpt2.SetSeed(input.Seed))
}
return stableLMModel.Predict(
predInput,
predictOptions...,
)
}
case gpt2Model != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
predictOptions := []gpt2.PredictOption{
gpt2.SetTemperature(temperature),
gpt2.SetTopP(topP),
gpt2.SetTopK(topK),
gpt2.SetTokens(tokens),
gpt2.SetThreads(threads),
}
if input.Batch != 0 {
predictOptions = append(predictOptions, gpt2.SetBatch(input.Batch))
}
if input.Seed != 0 {
predictOptions = append(predictOptions, gpt2.SetSeed(input.Seed))
}
return gpt2Model.Predict(
predInput,
predictOptions...,
)
}
case gptModel != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
@@ -223,8 +293,6 @@ func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16
}
for i := 0; i < n; i++ {
var prediction string
prediction, err := predFunc()
if err != nil {
return err
@@ -241,30 +309,19 @@ func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16
}
}
jsonResult, _ := json.Marshal(result)
log.Debug().Msgf("Response: %s", jsonResult)
// Return the prediction in the response body
return c.JSON(OpenAIResponse{
Model: input.Model,
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
})
}
}
func Start(loader *model.ModelLoader, listenAddr string, threads, ctxSize int, f16 bool) error {
app := fiber.New()
// Default middleware config
app.Use(recover.New())
app.Use(cors.New())
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
var mutex = &sync.Mutex{}
mu := map[string]*sync.Mutex{}
var mumutex = &sync.Mutex{}
// openAI compatible API endpoint
app.Post("/v1/chat/completions", openAIEndpoint(true, loader, threads, ctxSize, f16, mutex, mumutex, mu))
app.Post("/v1/completions", openAIEndpoint(false, loader, threads, ctxSize, f16, mutex, mumutex, mu))
app.Get("/v1/models", func(c *fiber.Ctx) error {
func listModels(loader *model.ModelLoader) func(ctx *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
models, err := loader.ListModels()
if err != nil {
return err
@@ -281,8 +338,48 @@ func Start(loader *model.ModelLoader, listenAddr string, threads, ctxSize int, f
Object: "list",
Data: dataModels,
})
}
}
func Start(loader *model.ModelLoader, listenAddr string, threads, ctxSize int, f16 bool) error {
// Return errors as JSON responses
app := fiber.New(fiber.Config{
// Override default error handler
ErrorHandler: func(ctx *fiber.Ctx, err error) error {
// Status code defaults to 500
code := fiber.StatusInternalServerError
// Retrieve the custom status code if it's a *fiber.Error
var e *fiber.Error
if errors.As(err, &e) {
code = e.Code
}
// Send custom error page
return ctx.Status(code).JSON(struct {
Error string `json:"error"`
}{Error: err.Error()})
},
})
// Default middleware config
app.Use(recover.New())
app.Use(cors.New())
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
mu := map[string]*sync.Mutex{}
var mumutex = &sync.Mutex{}
// openAI compatible API endpoint
app.Post("/v1/chat/completions", openAIEndpoint(true, loader, threads, ctxSize, f16, mumutex, mu))
app.Post("/chat/completions", openAIEndpoint(true, loader, threads, ctxSize, f16, mumutex, mu))
app.Post("/v1/completions", openAIEndpoint(false, loader, threads, ctxSize, f16, mumutex, mu))
app.Post("/completions", openAIEndpoint(false, loader, threads, ctxSize, f16, mumutex, mu))
app.Get("/v1/models", listModels(loader))
app.Get("/models", listModels(loader))
// Start the server
app.Listen(listenAddr)
return nil
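
Two additions in this file are easy to miss in the diff: the model can now be picked via the `Authorization` bearer token when the request body omits `model`, and errors come back as a JSON `{"error": ...}` object from the custom fiber error handler. A hypothetical client exercising both; the model file name is a placeholder, and the message shape is assumed to be OpenAI-compatible (`role`/`content`):

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// No "model" field in the body: the handler falls back to the bearer token.
	payload := `{"messages": [{"role": "user", "content": "Hello"}]}`
	req, err := http.NewRequest(http.MethodPost,
		"http://localhost:8080/v1/chat/completions", strings.NewReader(payload))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	// The bearer value must match a file name in the models path.
	req.Header.Set("Authorization", "Bearer ggml-gpt4all-j.bin")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// The custom ErrorHandler serializes failures as {"error": "..."}.
		var apiErr struct {
			Error string `json:"error"`
		}
		json.NewDecoder(resp.Body).Decode(&apiErr)
		fmt.Println("request failed:", apiErr.Error)
		return
	}
	fmt.Println("request ok:", resp.Status)
}
```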

docker-compose.yaml

@@ -14,5 +14,6 @@ services:
- MODELS_PATH=$MODELS_PATH
- CONTEXT_SIZE=$CONTEXT_SIZE
- THREADS=$THREADS
- DEBUG=$DEBUG
volumes:
- ./models:/models:cached

go.mod

@@ -3,21 +3,31 @@ module github.com/go-skynet/LocalAI
go 1.19
require (
github.com/go-skynet/go-gpt2.cpp v0.0.0-20230420213900-1c24f5b86ac4
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640
github.com/gofiber/fiber/v2 v2.42.0
github.com/jaypipes/ghw v0.10.0
github.com/rs/zerolog v1.29.1
github.com/urfave/cli/v2 v2.25.0
)
require (
github.com/StackExchange/wmi v1.2.1 // indirect
github.com/andybalholm/brotli v1.0.4 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94 // indirect
github.com/ghodss/yaml v1.0.0 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/jaypipes/pcidb v1.0.0 // indirect
github.com/klauspost/compress v1.15.9 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.17 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/philhofer/fwd v1.1.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 // indirect
@@ -28,4 +38,6 @@ require (
github.com/valyala/tcplisten v1.0.0 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
golang.org/x/sys v0.6.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
howett.net/plist v1.0.0 // indirect
)

go.sum

@@ -1,34 +1,63 @@
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8=
github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-skynet/go-gpt2.cpp v0.0.0-20230420213900-1c24f5b86ac4 h1:GkGuqnhDFKlCsT6Bo8sdY00A7rFXCzfU1nBOSS4ZnYM=
github.com/go-skynet/go-gpt2.cpp v0.0.0-20230420213900-1c24f5b86ac4/go.mod h1:1Wj/xbkMfwQSOrhNYK178IzqQHstZbRfhx4s8p1M5VM=
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94 h1:rtrrMvlIq+g0/ltXjDdLeNtz0uc4wJ4Qs15GFU4ba4c=
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94/go.mod h1:5VZ9XbcINI0XcHhkcX8GPK8TplFGAzu1Hrg4tNiMCtI=
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640 h1:8SSVbQ3yvq7JnfLCLF4USV0PkQnnduUkaNCv/hHDa3E=
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gofiber/fiber/v2 v2.42.0 h1:Fnp7ybWvS+sjNQsFvkhf4G8OhXswvB6Vee8hM/LyS+8=
github.com/gofiber/fiber/v2 v2.42.0/go.mod h1:3+SGNjqMh5VQH5Vz2Wdi43zTIV16ktlFd3x3R6O1Zlc=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/jaypipes/ghw v0.10.0 h1:UHu9UX08Py315iPojADFPOkmjTsNzHj4g4adsNKKteY=
github.com/jaypipes/ghw v0.10.0/go.mod h1:jeJGbkRB2lL3/gxYzNYzEDETV1ZJ56OKr+CSeSEym+g=
github.com/jaypipes/pcidb v1.0.0 h1:vtZIfkiCUE42oYbJS0TAq9XSfSmcsgo9IdxSm9qzYU8=
github.com/jaypipes/pcidb v1.0.0/go.mod h1:TnYUvqhPBzCKnH34KrIX22kAeEbDCSRJ9cqLRCuNDfk=
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng=
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ=
github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 h1:rmMl4fXJhKMNWl+K+r/fq4FbbKI+Ia2m9hYBLm2h4G4=
@@ -63,10 +92,13 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
@@ -85,4 +117,11 @@ golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM=
howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g=

Kubernetes DataVolume manifest (new file)

@@ -0,0 +1,28 @@
# Create a PVC containing a model binary, sourced from an arbitrary HTTP server
# (requires https://github.com/kubevirt/containerized-data-importer)
apiVersion: cdi.kubevirt.io/v1beta1
kind: DataVolume
metadata:
  name: models
  namespace: local-ai
spec:
  contentType: archive
  source:
    http:
      url: http://<model_server>/koala-7B-4bit-128g.GGML.tar
      secretRef: model-secret
  pvc:
    accessModes:
      - ReadWriteOnce
    resources:
      requests:
        storage: 5Gi
---
apiVersion: v1
kind: Secret
metadata:
  name: model-secret
  namespace: local-ai
data:
  accessKeyId: <model_server_username_base64_encoded>
  secretKey: <model_server_password_base64_encoded>

Kubernetes deployment manifest

@@ -1,38 +1,55 @@
apiVersion: v1
kind: Namespace
metadata:
name: llama
name: local-ai
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama
namespace: llama
name: local-ai
namespace: local-ai
labels:
app: llama
app: local-ai
spec:
selector:
matchLabels:
app: llama
app: local-ai
replicas: 1
template:
metadata:
labels:
app: llama
name: llama
app: local-ai
name: local-ai
spec:
containers:
- name: llama
- name: local-ai
image: quay.io/go-skynet/local-ai:latest
env:
- name: THREADS
value: "14"
- name: CONTEXT_SIZE
value: "512"
- name: MODELS_PATH
value: /models
volumeMounts:
- mountPath: /models
name: models
volumes:
- name: models
persistentVolumeClaim:
claimName: models
---
apiVersion: v1
kind: Service
metadata:
name: llama
namespace: llama
name: local-ai
namespace: local-ai
# If using AWS, you'll need to override the default 60s load balancer idle timeout
# annotations:
# service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
spec:
selector:
app: llama
app: local-ai
type: LoadBalancer
ports:
- protocol: TCP

main.go

@@ -1,23 +1,31 @@
package main
import (
"fmt"
"os"
"runtime"
api "github.com/go-skynet/LocalAI/api"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/jaypipes/ghw"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
)
func main() {
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
path, err := os.Getwd()
if err != nil {
fmt.Println(err)
log.Error().Msgf("error: %s", err.Error())
os.Exit(1)
}
threads := 4
cpu, err := ghw.CPU()
if err == nil {
threads = int(cpu.TotalCores)
}
app := &cli.App{
Name: "LocalAI",
Usage: "OpenAI compatible API for running LLaMA/GPT models locally on CPU with consumer grade hardware.",
@@ -26,11 +34,15 @@ func main() {
Name: "f16",
EnvVars: []string{"F16"},
},
&cli.BoolFlag{
Name: "debug",
EnvVars: []string{"DEBUG"},
},
&cli.IntFlag{
Name: "threads",
DefaultText: "Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested.",
EnvVars: []string{"THREADS"},
Value: runtime.NumCPU(),
Value: threads,
},
&cli.StringFlag{
Name: "models-path",
@@ -59,20 +71,26 @@ Some of the models compatible are:
- Koala
- GPT4ALL
- GPT4ALL-J
- Cerebras
- Alpaca
- StableLM (ggml quantized)
It uses llama.cpp and gpt4all as backend, supporting all the models supported by both.
It uses llama.cpp, ggml and gpt4all as backends, with Golang C bindings.
`,
UsageText: `local-ai [options]`,
Copyright: "go-skynet authors",
Action: func(ctx *cli.Context) error {
zerolog.SetGlobalLevel(zerolog.InfoLevel)
if ctx.Bool("debug") {
zerolog.SetGlobalLevel(zerolog.DebugLevel)
}
return api.Start(model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"), ctx.Int("context-size"), ctx.Bool("f16"))
},
}
err = app.Run(os.Args)
if err != nil {
fmt.Println(err)
log.Error().Msgf("error: %s", err.Error())
os.Exit(1)
}
}
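
A standalone sketch of the new thread default from #50: instead of `runtime.NumCPU()`, which counts hardware threads, the physical core count reported by `ghw` is used, with a fixed fallback when detection fails. The logic mirrors the `main.go` change above:

```go
package main

import (
	"fmt"

	"github.com/jaypipes/ghw"
)

// defaultThreads prefers the physical core count: scheduling one worker
// per hyper-thread tends to oversubscribe the CPU during inference.
func defaultThreads() int {
	threads := 4 // conservative fallback when CPU topology cannot be read
	if cpu, err := ghw.CPU(); err == nil {
		threads = int(cpu.TotalCores) // physical cores, not hyper-threads
	}
	return threads
}

func main() {
	fmt.Println("default threads:", defaultThreads())
}
```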

pkg/model/loader.go

@@ -10,20 +10,39 @@ import (
"sync"
"text/template"
"github.com/rs/zerolog/log"
gpt2 "github.com/go-skynet/go-gpt2.cpp"
gptj "github.com/go-skynet/go-gpt4all-j.cpp"
llama "github.com/go-skynet/go-llama.cpp"
)
type ModelLoader struct {
modelPath string
mu sync.Mutex
models map[string]*llama.LLama
gptmodels map[string]*gptj.GPTJ
modelPath string
mu sync.Mutex
models map[string]*llama.LLama
gptmodels map[string]*gptj.GPTJ
gpt2models map[string]*gpt2.GPT2
gptstablelmmodels map[string]*gpt2.StableLM
promptsTemplates map[string]*template.Template
}
func NewModelLoader(modelPath string) *ModelLoader {
return &ModelLoader{modelPath: modelPath, gptmodels: make(map[string]*gptj.GPTJ), models: make(map[string]*llama.LLama), promptsTemplates: make(map[string]*template.Template)}
return &ModelLoader{
modelPath: modelPath,
gpt2models: make(map[string]*gpt2.GPT2),
gptmodels: make(map[string]*gptj.GPTJ),
gptstablelmmodels: make(map[string]*gpt2.StableLM),
models: make(map[string]*llama.LLama),
promptsTemplates: make(map[string]*template.Template),
}
}
func (ml *ModelLoader) ExistsInModelPath(s string) bool {
_, err := os.Stat(filepath.Join(ml.modelPath, s))
return err == nil
}
func (ml *ModelLoader) ListModels() ([]string, error) {
@@ -34,9 +53,12 @@ func (ml *ModelLoader) ListModels() ([]string, error) {
models := []string{}
for _, file := range files {
if strings.HasSuffix(file.Name(), ".bin") {
models = append(models, strings.TrimRight(file.Name(), ".bin"))
// Skip templates, YAML and .keep files
if strings.HasSuffix(file.Name(), ".tmpl") || strings.HasSuffix(file.Name(), ".keep") || strings.HasSuffix(file.Name(), ".yaml") || strings.HasSuffix(file.Name(), ".yml") {
continue
}
models = append(models, file.Name())
}
return models, nil
@@ -48,12 +70,7 @@ func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string,
m, ok := ml.promptsTemplates[modelName]
if !ok {
// try to find a s.bin
modelBin := fmt.Sprintf("%s.bin", modelName)
m, ok = ml.promptsTemplates[modelBin]
if !ok {
return "", fmt.Errorf("no prompt template available")
}
return "", fmt.Errorf("no prompt template available")
}
var buf bytes.Buffer
@@ -64,15 +81,21 @@ func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string,
return buf.String(), nil
}
func (ml *ModelLoader) loadTemplate(modelName, modelFile string) error {
modelTemplateFile := fmt.Sprintf("%s.tmpl", modelFile)
// Check if the model path exists
if _, err := os.Stat(modelTemplateFile); err != nil {
func (ml *ModelLoader) loadTemplateIfExists(modelName, modelFile string) error {
// Check if the template was already loaded
if _, ok := ml.promptsTemplates[modelName]; ok {
return nil
}
dat, err := os.ReadFile(modelTemplateFile)
// Check if a template file exists alongside the model;
// skip any error here - we run anyway if a template does not exist
modelTemplateFile := fmt.Sprintf("%s.tmpl", modelName)
if !ml.ExistsInModelPath(modelTemplateFile) {
return nil
}
dat, err := os.ReadFile(filepath.Join(ml.modelPath, modelTemplateFile))
if err != nil {
return err
}
@@ -87,41 +110,117 @@ func (ml *ModelLoader) loadTemplate(modelName, modelFile string) error {
return nil
}
func (ml *ModelLoader) LoadStableLMModel(modelName string) (*gpt2.StableLM, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
if !ml.ExistsInModelPath(modelName) {
return nil, fmt.Errorf("model does not exist")
}
if m, ok := ml.gptstablelmmodels[modelName]; ok {
log.Debug().Msgf("Model already loaded in memory: %s", modelName)
return m, nil
}
// Load the model and keep it in memory for later use
modelFile := filepath.Join(ml.modelPath, modelName)
log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
model, err := gpt2.NewStableLM(modelFile)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
return nil, err
}
ml.gptstablelmmodels[modelName] = model
return model, err
}
func (ml *ModelLoader) LoadGPT2Model(modelName string) (*gpt2.GPT2, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
if !ml.ExistsInModelPath(modelName) {
return nil, fmt.Errorf("model does not exist")
}
if m, ok := ml.gpt2models[modelName]; ok {
log.Debug().Msgf("Model already loaded in memory: %s", modelName)
return m, nil
}
// TODO: This needs refactoring, it's really bad to have it in here
// Check if we have a GPTStable model loaded instead - if we do we return an error so the API tries with StableLM
if _, ok := ml.gptstablelmmodels[modelName]; ok {
log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
return nil, fmt.Errorf("this model is a GPTStableLM one")
}
// Load the model and keep it in memory for later use
modelFile := filepath.Join(ml.modelPath, modelName)
log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
model, err := gpt2.New(modelFile)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
return nil, err
}
ml.gpt2models[modelName] = model
return model, err
}
func (ml *ModelLoader) LoadGPTJModel(modelName string) (*gptj.GPTJ, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
modelFile := filepath.Join(ml.modelPath, modelName)
if !ml.ExistsInModelPath(modelName) {
return nil, fmt.Errorf("model does not exist")
}
if m, ok := ml.gptmodels[modelFile]; ok {
if m, ok := ml.gptmodels[modelName]; ok {
log.Debug().Msgf("Model already loaded in memory: %s", modelName)
return m, nil
}
// Check if the model path exists
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
// try to find a s.bin
modelBin := fmt.Sprintf("%s.bin", modelFile)
if _, err := os.Stat(modelBin); os.IsNotExist(err) {
return nil, err
} else {
modelName = fmt.Sprintf("%s.bin", modelName)
modelFile = modelBin
}
// TODO: This needs refactoring, it's really bad to have it in here
// Check if we have a GPT2 model loaded instead - if we do we return an error so the API tries with GPT2
if _, ok := ml.gpt2models[modelName]; ok {
log.Debug().Msgf("Model is GPT2: %s", modelName)
return nil, fmt.Errorf("this model is a GPT2 one")
}
if _, ok := ml.gptstablelmmodels[modelName]; ok {
log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
return nil, fmt.Errorf("this model is a GPTStableLM one")
}
// Load the model and keep it in memory for later use
modelFile := filepath.Join(ml.modelPath, modelName)
log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
model, err := gptj.New(modelFile)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplate(modelName, modelFile); err != nil {
if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
return nil, err
}
ml.gptmodels[modelFile] = model
ml.gptmodels[modelName] = model
return model, err
}
@@ -129,40 +228,47 @@ func (ml *ModelLoader) LoadLLaMAModel(modelName string, opts ...llama.ModelOptio
ml.mu.Lock()
defer ml.mu.Unlock()
log.Debug().Msgf("Loading model name: %s", modelName)
// Check if we already have a loaded model
modelFile := filepath.Join(ml.modelPath, modelName)
if m, ok := ml.models[modelFile]; ok {
return m, nil
}
// TODO: This needs refactoring, it's really bad to have it in here
// Check if we have a GPTJ model loaded instead
if _, ok := ml.gptmodels[modelFile]; ok {
return nil, fmt.Errorf("this model is a GPTJ one")
if !ml.ExistsInModelPath(modelName) {
return nil, fmt.Errorf("model does not exist")
}
// Check if the model path exists
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
// try to find a s.bin
modelBin := fmt.Sprintf("%s.bin", modelFile)
if _, err := os.Stat(modelBin); os.IsNotExist(err) {
return nil, err
} else {
modelName = fmt.Sprintf("%s.bin", modelName)
modelFile = modelBin
}
if m, ok := ml.models[modelName]; ok {
log.Debug().Msgf("Model already loaded in memory: %s", modelName)
return m, nil
}
// TODO: This needs refactoring, it's really bad to have it in here
// Check if we have a GPTJ model loaded instead - if we do we return an error so the API tries with GPTJ
if _, ok := ml.gptmodels[modelName]; ok {
log.Debug().Msgf("Model is GPTJ: %s", modelName)
return nil, fmt.Errorf("this model is a GPTJ one")
}
if _, ok := ml.gpt2models[modelName]; ok {
log.Debug().Msgf("Model is GPT2: %s", modelName)
return nil, fmt.Errorf("this model is a GPT2 one")
}
if _, ok := ml.gptstablelmmodels[modelName]; ok {
log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
return nil, fmt.Errorf("this model is a GPTStableLM one")
}
// Load the model and keep it in memory for later use
modelFile := filepath.Join(ml.modelPath, modelName)
log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
model, err := llama.New(modelFile, opts...)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplate(modelName, modelFile); err != nil {
if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
return nil, err
}
ml.models[modelFile] = model
ml.models[modelName] = model
return model, err
}
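
To tie the loader changes together, a usage sketch: templates now resolve strictly by the full model file name (the old `.bin` suffix fallbacks are gone), so a model `ggml-gpt4all-j.bin` pairs with `ggml-gpt4all-j.bin.tmpl` in the same models directory. The file names below are placeholders:

```go
package main

import (
	"fmt"
	"log"

	model "github.com/go-skynet/LocalAI/pkg/model"
)

func main() {
	loader := model.NewModelLoader("/models")

	// Loading the model also registers /models/ggml-gpt4all-j.bin.tmpl,
	// if such a template file exists.
	if _, err := loader.LoadGPTJModel("ggml-gpt4all-j.bin"); err != nil {
		log.Fatal(err)
	}

	// A TemplatePrefix error just means no template was registered; the
	// API falls back to the raw prompt in that case.
	templated, err := loader.TemplatePrefix("ggml-gpt4all-j.bin",
		struct{ Input string }{Input: "What is an alpaca?"})
	if err != nil {
		fmt.Println("no template, using raw prompt")
		return
	}
	fmt.Println(templated)
}
```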