Add TODO

add a /tokenize endpoint
2026-01-21 05:48:35 -05:00 · 2024-05-10 10:15:22 -07:00 · 2024-05-10 10:13:50 -07:00
7 changed files with 124 additions and 30 deletions
--- a/api/types.go
+++ b/api/types.go
@@ -195,19 +195,28 @@ type EmbeddingResponse struct {
 	Embedding []float64 `json:"embedding"`
 }

+type TokenizeRequest struct {
+	Model     string    `json:"model"`
+	Prompt    string    `json:"prompt"`
+	KeepAlive *Duration `json:"keep_alive,omitempty"`
+
+	Options map[string]interface{} `json:"options"`
+}
+
+type TokenizeResponse struct {
+	Tokens []int `json:"tokens"`
+}
+
 // CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
-	Model     string `json:"model"`
-	Path      string `json:"path"`
-	Modelfile string `json:"modelfile"`
-	Stream    *bool  `json:"stream,omitempty"`
-	Quantize  string `json:"quantize,omitempty"`
+	Model        string `json:"model"`
+	Path         string `json:"path"`
+	Modelfile    string `json:"modelfile"`
+	Stream       *bool  `json:"stream,omitempty"`
+	Quantization string `json:"quantization,omitempty"`

 	// Name is deprecated, see Model
 	Name string `json:"name"`
-
-	// Quantization is deprecated, see Quantize
-	Quantization string `json:"quantization,omitempty"`
 }

 // DeleteRequest is the request passed to [Client.Delete].
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -142,9 +142,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	quantize, _ := cmd.Flags().GetString("quantize")
+	quantization, _ := cmd.Flags().GetString("quantization")

-	request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantize: quantize}
+	request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
 	if err := client.Create(cmd.Context(), &request, fn); err != nil {
 		return err
 	}
@@ -1051,7 +1051,7 @@ func NewCLI() *cobra.Command {
 	}

 	createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
-	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
+	createCmd.Flags().StringP("quantization", "q", "", "Quantization level.")

 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -329,10 +329,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(1+4*embedding+context+context*heads),
 		)

-		partialOffload = max(
-			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(2+3*embedding+context+context*heads),
-		)
+		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
 	case "stablelm":
 		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
 		partialOffload = max(
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -12,8 +12,17 @@ import (

 // This algorithm looks for a complete fit to determine if we need to unload other models
 func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
-	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
+	if opts.NumCtx > int(ggml.KV().ContextLength()) {
+		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
+		opts.NumCtx = int(ggml.KV().ContextLength())
+	}
+
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
+	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
 		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
--- a/llm/server.go
+++ b/llm/server.go
@@ -77,7 +77,15 @@ func LoadModel(model string) (*GGML, error) {
 // The gpu list must be a single family.
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
-	var cpuRunner string
+	if opts.NumCtx > int(ggml.KV().ContextLength()) {
+		slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
+	}
+
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
+	cpuRunner := ""
 	var estimatedVRAM uint64
 	var estimatedTotal uint64
 	var systemMemory uint64
--- a/server/routes.go
+++ b/server/routes.go
@@ -407,6 +407,85 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, resp)
 }

+func (s *Server) TokenizeHandler(c *gin.Context) {
+	var req api.TokenizeRequest
+	err := c.ShouldBindJSON(&req)
+	switch {
+	case errors.Is(err, io.EOF):
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
+		return
+	case err != nil:
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	if req.Model == "" {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
+		return
+	}
+
+	model, err := GetModel(req.Model)
+	if err != nil {
+		var pErr *fs.PathError
+		if errors.As(err, &pErr) {
+			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
+			return
+		}
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	opts, err := modelOptions(model, req.Options)
+	if err != nil {
+		// TODO: handle specific errors
+		// if errors.Is(err, api.ErrInvalidOpts) {
+		// 	c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		// 	return
+		// }
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	var sessionDuration time.Duration
+	if req.KeepAlive == nil {
+		sessionDuration = getDefaultSessionDuration()
+	} else {
+		sessionDuration = req.KeepAlive.Duration
+	}
+
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	var runner *runnerRef
+	select {
+	case runner = <-rCh:
+	case err = <-eCh:
+		if errors.Is(err, context.Canceled) {
+			c.JSON(499, gin.H{"error": "request canceled"})
+			return
+		}
+
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	// an empty request loads the model
+	if req.Prompt == "" {
+		c.JSON(http.StatusOK, api.TokenizeResponse{Tokens: []int{}})
+		return
+	}
+
+	tokens, err := runner.llama.Tokenize(c.Request.Context(), req.Prompt)
+	if err != nil {
+		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
+		return
+	}
+
+	resp := api.TokenizeResponse{
+		Tokens: tokens,
+	}
+	c.JSON(http.StatusOK, resp)
+}
+
 func (s *Server) PullModelHandler(c *gin.Context) {
 	var req api.PullRequest
 	err := c.ShouldBindJSON(&req)
@@ -554,12 +633,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 		ctx, cancel := context.WithCancel(c.Request.Context())
 		defer cancel()

-		quantization := req.Quantization
-		if req.Quantize != "" {
-			quantization = req.Quantize
-		}
-
-		if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(quantization), modelfile, fn); err != nil {
+		if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(req.Quantization), modelfile, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -972,6 +1046,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r.POST("/api/generate", s.GenerateHandler)
 	r.POST("/api/chat", s.ChatHandler)
 	r.POST("/api/embeddings", s.EmbeddingsHandler)
+	r.POST("/api/tokenize", s.TokenizeHandler)
 	r.POST("/api/create", s.CreateModelHandler)
 	r.POST("/api/push", s.PushModelHandler)
 	r.POST("/api/copy", s.CopyModelHandler)
--- a/server/sched.go
+++ b/server/sched.go
@@ -61,10 +61,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
 	// allocate a large enough kv cache for all parallel requests
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
 	opts.NumCtx = opts.NumCtx * envconfig.NumParallel

 	req := &LlmRequest{
@@ -567,9 +563,9 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 		// - try subsets of GPUs instead of just falling back to 1 or all in a family

 		// Now try all the GPUs
-		if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-			slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
-			return sgl
+		if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+			slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
+			return gl
 		}
 	}
 	return nil
Author	SHA1	Message	Date
Bruce MacDonald	938be81c45	Add TODO	2024-05-10 10:15:22 -07:00
Bruce MacDonald	19ce10e49e	add a /tokenize endpoint	2024-05-10 10:13:50 -07:00