Compare commits

..

2 Commits

Author SHA1 Message Date
Bruce MacDonald
938be81c45 Add TODO 2024-05-10 10:15:22 -07:00
Bruce MacDonald
19ce10e49e add a /tokenize endpoint 2024-05-10 10:13:50 -07:00
7 changed files with 124 additions and 30 deletions

View File

@@ -195,19 +195,28 @@ type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
type TokenizeRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Options map[string]interface{} `json:"options"`
}
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}
// CreateRequest is the request passed to [Client.Create].
type CreateRequest struct {
Model string `json:"model"`
Path string `json:"path"`
Modelfile string `json:"modelfile"`
Stream *bool `json:"stream,omitempty"`
Quantize string `json:"quantize,omitempty"`
Model string `json:"model"`
Path string `json:"path"`
Modelfile string `json:"modelfile"`
Stream *bool `json:"stream,omitempty"`
Quantization string `json:"quantization,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
// Quantization is deprecated, see Quantize
Quantization string `json:"quantization,omitempty"`
}
// DeleteRequest is the request passed to [Client.Delete].

View File

@@ -142,9 +142,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return nil
}
quantize, _ := cmd.Flags().GetString("quantize")
quantization, _ := cmd.Flags().GetString("quantization")
request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantize: quantize}
request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
if err := client.Create(cmd.Context(), &request, fn); err != nil {
return err
}
@@ -1051,7 +1051,7 @@ func NewCLI() *cobra.Command {
}
createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
createCmd.Flags().StringP("quantization", "q", "", "Quantization level.")
showCmd := &cobra.Command{
Use: "show MODEL",

View File

@@ -329,10 +329,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(1+4*embedding+context+context*heads),
)
partialOffload = max(
4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
4*batch*(2+3*embedding+context+context*heads),
)
partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
case "stablelm":
fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
partialOffload = max(

View File

@@ -12,8 +12,17 @@ import (
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
// Split up the GPUs by type and try them
for _, gpus := range allGpus.ByLibrary() {
var layerCount int
layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)

View File

@@ -77,7 +77,15 @@ func LoadModel(model string) (*GGML, error) {
// The gpu list must be a single family.
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
var err error
var cpuRunner string
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
cpuRunner := ""
var estimatedVRAM uint64
var estimatedTotal uint64
var systemMemory uint64

View File

@@ -407,6 +407,85 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
c.JSON(http.StatusOK, resp)
}
func (s *Server) TokenizeHandler(c *gin.Context) {
var req api.TokenizeRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if req.Model == "" {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
return
}
model, err := GetModel(req.Model)
if err != nil {
var pErr *fs.PathError
if errors.As(err, &pErr) {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
opts, err := modelOptions(model, req.Options)
if err != nil {
// TODO: handle specific errors
// if errors.Is(err, api.ErrInvalidOpts) {
// c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
// return
// }
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
var sessionDuration time.Duration
if req.KeepAlive == nil {
sessionDuration = getDefaultSessionDuration()
} else {
sessionDuration = req.KeepAlive.Duration
}
rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
var runner *runnerRef
select {
case runner = <-rCh:
case err = <-eCh:
if errors.Is(err, context.Canceled) {
c.JSON(499, gin.H{"error": "request canceled"})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
// an empty request loads the model
if req.Prompt == "" {
c.JSON(http.StatusOK, api.TokenizeResponse{Tokens: []int{}})
return
}
tokens, err := runner.llama.Tokenize(c.Request.Context(), req.Prompt)
if err != nil {
slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
return
}
resp := api.TokenizeResponse{
Tokens: tokens,
}
c.JSON(http.StatusOK, resp)
}
func (s *Server) PullModelHandler(c *gin.Context) {
var req api.PullRequest
err := c.ShouldBindJSON(&req)
@@ -554,12 +633,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
quantization := req.Quantization
if req.Quantize != "" {
quantization = req.Quantize
}
if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(quantization), modelfile, fn); err != nil {
if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(req.Quantization), modelfile, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -972,6 +1046,7 @@ func (s *Server) GenerateRoutes() http.Handler {
r.POST("/api/generate", s.GenerateHandler)
r.POST("/api/chat", s.ChatHandler)
r.POST("/api/embeddings", s.EmbeddingsHandler)
r.POST("/api/tokenize", s.TokenizeHandler)
r.POST("/api/create", s.CreateModelHandler)
r.POST("/api/push", s.PushModelHandler)
r.POST("/api/copy", s.CopyModelHandler)

View File

@@ -61,10 +61,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
// allocate a large enough kv cache for all parallel requests
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
opts.NumCtx = opts.NumCtx * envconfig.NumParallel
req := &LlmRequest{
@@ -567,9 +563,9 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUs
if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
return sgl
if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
return gl
}
}
return nil