Compare commits

1 commit

Author        SHA1        Message              Date
Michael Yang  cdc9aa14ed  template extra args  2023-09-03 18:30:04 -04:00
22 changed files with 258 additions and 544 deletions


@@ -1,4 +1,8 @@
build
llama/build
.venv
.vscode
ollama
app
llm/llama.cpp/ggml
web
.env


@@ -1,21 +1,15 @@
FROM golang:alpine
FROM golang:1.20
WORKDIR /go/src/github.com/jmorganca/ollama
RUN apk add --no-cache git build-base cmake
COPY . .
RUN go generate ./... && go build -ldflags '-linkmode external -extldflags "-static"' .
RUN CGO_ENABLED=1 go build -ldflags '-linkmode external -extldflags "-static"' .
FROM alpine
ENV OLLAMA_HOST 0.0.0.0
RUN apk add --no-cache libstdc++
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
EXPOSE 11434
ARG USER=ollama
ARG GROUP=ollama
RUN addgroup $GROUP && adduser -D -G $GROUP $USER
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
RUN addgroup -g 1000 $GROUP && adduser -u 1000 -DG $GROUP $USER
USER $USER:$GROUP
ENTRYPOINT ["/bin/ollama"]
ENV OLLAMA_HOST 0.0.0.0
CMD ["serve"]


@@ -165,11 +165,10 @@ Ollama bundles model weights, configurations, and data into a single package, de
## Building
Install `cmake` and `go`:
Install `cmake`:
```
brew install cmake
brew install go
```
Then generate dependencies and build:


@@ -255,14 +255,6 @@ func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
return nil
}
func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, error) {
var resp ShowResponse
if err := c.do(ctx, http.MethodPost, "/api/show", req, &resp); err != nil {
return nil, err
}
return &resp, nil
}
func (c *Client) Heartbeat(ctx context.Context) error {
if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
return err


@@ -32,11 +32,12 @@ func (e StatusError) Error() string {
}
type GenerateRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
System string `json:"system"`
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Model string `json:"model"`
Prompt string `json:"prompt"`
System string `json:"system"`
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Args map[string]any `json:"args,omitempty"`
Options map[string]interface{} `json:"options"`
}
@@ -61,18 +62,6 @@ type DeleteRequest struct {
Name string `json:"name"`
}
type ShowRequest struct {
Name string `json:"name"`
}
type ShowResponse struct {
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
}
type CopyRequest struct {
Source string `json:"source"`
Destination string `json:"destination"`
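
The new `Args` field is this commit's headline change: it carries arbitrary key/value pairs from a generate request through to template rendering. Below is a minimal sketch of a caller populating it, assuming the `jmorganca/ollama` module path and a hypothetical `tone` key that a model's template would reference:

```go
package main

import (
	"fmt"

	"github.com/jmorganca/ollama/api"
)

func main() {
	// Args is a free-form bag of template inputs; "tone" is a hypothetical
	// key, meaningful only if the model's template references {{ .Args.tone }}.
	req := api.GenerateRequest{
		Model:  "llama2",
		Prompt: "Summarize the release notes.",
		Args:   map[string]any{"tone": "concise"},
	}
	fmt.Printf("%+v\n", req)
}
```

Nothing in the diff validates these keys; an entry the template never looks up is simply ignored at render time.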


@@ -230,84 +230,6 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
return nil
}
func ShowHandler(cmd *cobra.Command, args []string) error {
client, err := api.FromEnv()
if err != nil {
return err
}
if len(args) != 1 {
return errors.New("missing model name")
}
license, errLicense := cmd.Flags().GetBool("license")
modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
parameters, errParams := cmd.Flags().GetBool("parameters")
system, errSystem := cmd.Flags().GetBool("system")
template, errTemplate := cmd.Flags().GetBool("template")
for _, boolErr := range []error{errLicense, errModelfile, errParams, errSystem, errTemplate} {
if boolErr != nil {
return errors.New("error retrieving flags")
}
}
flagsSet := 0
showType := ""
if license {
flagsSet++
showType = "license"
}
if modelfile {
flagsSet++
showType = "modelfile"
}
if parameters {
flagsSet++
showType = "parameters"
}
if system {
flagsSet++
showType = "system"
}
if template {
flagsSet++
showType = "template"
}
if flagsSet > 1 {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
} else if flagsSet == 0 {
return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
}
req := api.ShowRequest{Name: args[0]}
resp, err := client.Show(context.Background(), &req)
if err != nil {
return err
}
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
}
return nil
}
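
The removed handler's flag handling is a small pattern worth keeping in mind: count the mutually exclusive booleans, then reject anything other than exactly one. Here is the same counting logic in isolation, detached from cobra:

```go
package main

import (
	"errors"
	"fmt"
)

// exactlyOne returns the single flag that is set, or an error if zero or
// several are, mirroring the --license/--modelfile/--parameters/--system/
// --template exclusivity check in the removed ShowHandler.
func exactlyOne(flags map[string]bool) (string, error) {
	name, count := "", 0
	for k, on := range flags {
		if on {
			name = k
			count++
		}
	}
	switch {
	case count > 1:
		return "", errors.New("only one flag can be specified")
	case count == 0:
		return "", errors.New("one flag must be specified")
	}
	return name, nil
}

func main() {
	name, err := exactlyOne(map[string]bool{"license": false, "template": true})
	fmt.Println(name, err) // template <nil>
}
```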
func CopyHandler(cmd *cobra.Command, args []string) error {
client, err := api.FromEnv()
if err != nil {
@@ -455,6 +377,20 @@ func generate(cmd *cobra.Command, model, prompt string) error {
return nil
}
func showLayer(l *server.Layer) {
filename, err := server.GetBlobsPath(l.Digest)
if err != nil {
fmt.Println("Couldn't get layer's path")
return
}
bts, err := os.ReadFile(filename)
if err != nil {
fmt.Println("Couldn't read layer")
return
}
fmt.Println(string(bts))
}
func generateInteractive(cmd *cobra.Command, model string) error {
home, err := os.UserHomeDir()
if err != nil {
@@ -477,8 +413,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
),
readline.PcItem("/show",
readline.PcItem("license"),
readline.PcItem("modelfile"),
readline.PcItem("parameters"),
readline.PcItem("system"),
readline.PcItem("template"),
),
@@ -588,28 +522,42 @@ func generateInteractive(cmd *cobra.Command, model string) error {
case strings.HasPrefix(line, "/show"):
args := strings.Fields(line)
if len(args) > 1 {
resp, err := server.GetModelInfo(model)
mp := server.ParseModelPath(model)
if err != nil {
fmt.Println("error: couldn't get model")
return err
}
manifest, _, err := server.GetManifest(mp)
if err != nil {
fmt.Println("error: couldn't get a manifest for this model")
continue
}
switch args[1] {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
for _, l := range manifest.Layers {
if l.MediaType == "application/vnd.ollama.image.license" {
showLayer(l)
}
}
continue
case "system":
fmt.Println(resp.System)
for _, l := range manifest.Layers {
if l.MediaType == "application/vnd.ollama.image.system" {
showLayer(l)
}
}
continue
case "template":
fmt.Println(resp.Template)
for _, l := range manifest.Layers {
if l.MediaType == "application/vnd.ollama.image.template" {
showLayer(l)
}
}
continue
default:
fmt.Println("error: unknown command")
usage()
continue
}
continue
} else {
usage()
continue
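
With the `/api/show` round trip gone, the interactive `/show` command now answers from the manifest directly: scan the layers, match on media type, and print the matching blobs via `showLayer`. A self-contained sketch of that filter, using a simplified stand-in for `server.Layer`:

```go
package main

import "fmt"

// Layer is a trimmed stand-in for server.Layer, carrying only the fields the
// media-type scan needs.
type Layer struct {
	MediaType string
	Digest    string
}

// layersOfType collects the layers whose media type matches, the same scan
// the interactive /show command performs for license, system, and template.
func layersOfType(layers []Layer, mediaType string) []Layer {
	var out []Layer
	for _, l := range layers {
		if l.MediaType == mediaType {
			out = append(out, l)
		}
	}
	return out
}

func main() {
	manifest := []Layer{
		{MediaType: "application/vnd.ollama.image.model", Digest: "sha256:aaa"},
		{MediaType: "application/vnd.ollama.image.template", Digest: "sha256:bbb"},
	}
	for _, l := range layersOfType(manifest, "application/vnd.ollama.image.template") {
		fmt.Println(l.Digest)
	}
}
```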
@@ -801,20 +749,6 @@ func NewCLI() *cobra.Command {
createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
showCmd := &cobra.Command{
Use: "show MODEL",
Short: "Show information for a model",
Args: cobra.MinimumNArgs(1),
PreRunE: checkServerHeartbeat,
RunE: ShowHandler,
}
showCmd.Flags().Bool("license", false, "Show license of a model")
showCmd.Flags().Bool("modelfile", false, "Show Modelfile of a model")
showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
showCmd.Flags().Bool("template", false, "Show template of a model")
showCmd.Flags().Bool("system", false, "Show system prompt of a model")
runCmd := &cobra.Command{
Use: "run MODEL [PROMPT]",
Short: "Run a model",
@@ -880,7 +814,6 @@ func NewCLI() *cobra.Command {
rootCmd.AddCommand(
serveCmd,
createCmd,
showCmd,
runCmd,
pullCmd,
pushCmd,


@@ -238,10 +238,6 @@ Generate embeddings from a model
- `model`: name of model to generate embeddings from
- `prompt`: text to generate embeddings for
Advanced parameters:
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
### Request
```


@@ -10,11 +10,15 @@ package format
import (
"crypto"
"crypto/ecdsa"
"crypto/ed25519"
"crypto/elliptic"
"crypto/rand"
"crypto/rsa"
"encoding/binary"
"encoding/pem"
"fmt"
"math/big"
"golang.org/x/crypto/ssh"
)
@@ -37,6 +41,25 @@ type openSSHPrivateKey struct {
Rest []byte `ssh:"rest"`
}
type openSSHRSAPrivateKey struct {
N *big.Int
E *big.Int
D *big.Int
Iqmp *big.Int
P *big.Int
Q *big.Int
Comment string
Pad []byte `ssh:"rest"`
}
type openSSHECDSAPrivateKey struct {
Curve string
Pub []byte
D *big.Int
Comment string
Pad []byte `ssh:"rest"`
}
type openSSHEd25519PrivateKey struct {
Pub []byte
Priv []byte
@@ -62,6 +85,64 @@ func OpenSSHPrivateKey(key crypto.PrivateKey, comment string) (*pem.Block, error
}
switch k := key.(type) {
case *rsa.PrivateKey:
e := new(big.Int).SetInt64(int64(k.E))
key := openSSHRSAPrivateKey{
N: k.N,
E: e,
D: k.D,
Iqmp: k.Precomputed.Qinv,
P: k.Primes[0],
Q: k.Primes[1],
Comment: comment,
}
pk1.Keytype = ssh.KeyAlgoRSA
pk1.Rest = ssh.Marshal(key)
w.PubKey = ssh.Marshal(struct {
KeyType string
E *big.Int
N *big.Int
}{
ssh.KeyAlgoRSA, e, k.N,
})
case *ecdsa.PrivateKey:
var curve, keytype string
switch name := k.Curve.Params().Name; name {
case "P-256":
curve = "nistp256"
keytype = ssh.KeyAlgoECDSA256
case "P-384":
curve = "nistp384"
keytype = ssh.KeyAlgoECDSA384
case "P-521":
curve = "nistp521"
keytype = ssh.KeyAlgoECDSA521
default:
return nil, fmt.Errorf("ssh: unknown curve %q", name)
}
pub := elliptic.Marshal(k.Curve, k.X, k.Y)
key := openSSHECDSAPrivateKey{
Curve: curve,
Pub: pub,
D: k.D,
Comment: comment,
}
pk1.Keytype = keytype
pk1.Rest = ssh.Marshal(key)
w.PubKey = ssh.Marshal(struct {
KeyType string
Curve string
Pub []byte
}{
keytype, curve, pub,
})
case ed25519.PrivateKey:
pub, priv := k[32:], k
key := openSSHEd25519PrivateKey{
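
The ed25519 case turns on the line `pub, priv := k[32:], k`: an `ed25519.PrivateKey` is 64 bytes, the 32-byte seed followed by the 32-byte public key, so the public half can be sliced off the tail. A quick check of that invariant with only the standard library:

```go
package main

import (
	"bytes"
	"crypto/ed25519"
	"crypto/rand"
	"fmt"
)

func main() {
	// An ed25519.PrivateKey is seed || public key, which is why the
	// marshaling code above can take pub, priv := k[32:], k.
	pub, priv, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		panic(err)
	}
	fmt.Println(bytes.Equal(priv[32:], pub)) // true
}
```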

go.mod

@@ -39,7 +39,6 @@ require (
github.com/ugorji/go/codec v1.2.11 // indirect
golang.org/x/arch v0.3.0 // indirect
golang.org/x/crypto v0.10.0
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
golang.org/x/net v0.10.0 // indirect
golang.org/x/sys v0.11.0 // indirect
golang.org/x/term v0.10.0

go.sum

@@ -121,8 +121,6 @@ golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5y
golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=


@@ -45,7 +45,6 @@ func osPath(llamaPath string) string {
if runtime.GOOS == "windows" {
return path.Join(llamaPath, "Release")
}
return llamaPath
}
@@ -69,9 +68,7 @@ func initGGML() {
case "windows":
files = []string{"server.exe"}
case "darwin":
if llamaPath == osPath(ggmlGPU) {
files = append(files, "ggml-metal.metal")
}
files = append(files, "ggml-metal.metal")
}
for _, f := range files {
@@ -289,8 +286,8 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
runner.Path,
append(params, "--port", strconv.Itoa(port))...,
)
cmd.Stdout = os.Stderr
cmd.Stderr = os.Stderr
var stderr bytes.Buffer
cmd.Stderr = &stderr
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
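
Routing the runner's stderr into a buffer instead of the parent's stderr means the output can be attached to an error if the subprocess dies. A minimal sketch of the same capture, assuming a POSIX `sh` is available:

```go
package main

import (
	"bytes"
	"fmt"
	"os/exec"
)

func main() {
	// Buffer the child's stderr rather than streaming it, so a failure can
	// surface the captured output alongside the exit error.
	var stderr bytes.Buffer
	cmd := exec.Command("sh", "-c", "echo boom >&2; exit 1")
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		fmt.Printf("command failed: %v: %s", err, stderr.String())
	}
}
```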
@@ -356,6 +353,11 @@ func (llm *llama) SetOptions(opts api.Options) {
llm.Options = opts
}
type Prediction struct {
Content string `json:"content"`
Stop bool `json:"stop"`
}
type GenerationSettings struct {
FrequencyPenalty float64 `json:"frequency_penalty"`
IgnoreEOS bool `json:"ignore_eos"`
@@ -383,19 +385,31 @@ type GenerationSettings struct {
}
type Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
PredictedMS float64 `json:"predicted_ms"`
PredictedN int `json:"predicted_n"`
PredictedPerSecond float64 `json:"predicted_per_second"`
PredictedPerTokenMS float64 `json:"predicted_per_token_ms"`
PromptMS float64 `json:"prompt_ms"`
PromptN int `json:"prompt_n"`
PromptPerSecond float64 `json:"prompt_per_second"`
PromptPerTokenMS float64 `json:"prompt_per_token_ms"`
}
type Prediction struct {
Content string `json:"content"`
Model string `json:"model"`
Prompt string `json:"prompt"`
Stop bool `json:"stop"`
Timings `json:"timings"`
type PredictComplete struct {
Content string `json:"content"`
GenerationSettings GenerationSettings `json:"generation_settings"`
Model string `json:"model"`
Prompt string `json:"prompt"`
Stop bool `json:"stop"`
StoppedEOS bool `json:"stopped_eos"`
StoppedLimit bool `json:"stopped_limit"`
StoppedWord bool `json:"stopped_word"`
StoppingWord string `json:"stopping_word"`
Timings Timings `json:"timings"`
TokensCached int `json:"tokens_cached"`
TokensEvaluated int `json:"tokens_evaluated"`
TokensPredicted int `json:"tokens_predicted"`
Truncated bool `json:"truncated"`
}
type PredictRequest struct {
@@ -423,19 +437,15 @@ type PredictRequest struct {
Stop []string `json:"stop,omitempty"`
}
func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
prevConvo, err := llm.Decode(ctx, prevContext)
func (llm *llama) Predict(ctx context.Context, predictCtx []int, prompt string, fn func(api.GenerateResponse)) error {
// we need to find the trimmed prompt context before predicting so that we can return it to the client
trimmedPrompt, err := llm.marshalPrompt(ctx, predictCtx, prompt)
if err != nil {
return err
return fmt.Errorf("marshaling prompt: %v", err)
}
var nextContext strings.Builder
nextContext.WriteString(prevConvo)
nextContext.WriteString(prompt)
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
predReq := PredictRequest{
Prompt: nextContext.String(),
Prompt: trimmedPrompt,
Stream: true,
NPredict: llm.NumPredict,
NKeep: llm.NumKeep,
@@ -481,6 +491,7 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
}
scanner := bufio.NewScanner(resp.Body)
genCtx := trimmedPrompt // start with the trimmed prompt
for scanner.Scan() {
select {
case <-ctx.Done():
@@ -495,33 +506,34 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
// Read data from the server-side event stream
if strings.HasPrefix(line, "data: ") {
evt := line[6:]
var p Prediction
if err := json.Unmarshal([]byte(evt), &p); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
var complete PredictComplete
if err := json.Unmarshal([]byte(evt), &complete); err != nil {
return fmt.Errorf("error unmarshaling llm complete response: %v", err)
}
if p.Content != "" {
fn(api.GenerateResponse{Response: p.Content})
nextContext.WriteString(p.Content)
}
if p.Stop {
embd, err := llm.Encode(ctx, nextContext.String())
if complete.Timings.PredictedMS > 0 {
genCtx += complete.Content
embd, err := llm.Encode(ctx, genCtx)
if err != nil {
return fmt.Errorf("encoding context: %v", err)
}
fn(api.GenerateResponse{
Done: true,
Context: embd,
PromptEvalCount: p.PromptN,
PromptEvalDuration: parseDurationMs(p.PromptMS),
EvalCount: p.PredictedN,
EvalDuration: parseDurationMs(p.PredictedMS),
PromptEvalCount: int(complete.Timings.PromptN),
PromptEvalDuration: parseDurationMs(float64(complete.Timings.PromptMS)),
EvalCount: int(complete.Timings.PredictedN),
EvalDuration: parseDurationMs(float64(complete.Timings.PredictedMS)),
})
return nil
}
var pred Prediction
if err := json.Unmarshal([]byte(evt), &pred); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
genCtx += pred.Content
fn(api.GenerateResponse{Response: pred.Content})
}
}
}
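
Each line of the runner's event stream is `data: ` followed by a JSON payload; the loop above decodes it first as `PredictComplete` to test for the final event (non-zero `predicted_ms`) and otherwise as `Prediction` for an incremental token. A self-contained sketch of that dispatch, with a hypothetical three-event stream and a trimmed-down event type:

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"strings"
)

// event holds just the fields this sketch needs; PredictComplete and
// Prediction above decode the same payloads with their full field sets.
type event struct {
	Content string `json:"content"`
	Stop    bool   `json:"stop"`
	Timings struct {
		PredictedMS float64 `json:"predicted_ms"`
	} `json:"timings"`
}

func main() {
	// Hypothetical stream contents standing in for the llama server's
	// server-sent events; only the final event carries timing data.
	stream := `data: {"content":"Hel"}
data: {"content":"lo"}
data: {"content":"","stop":true,"timings":{"predicted_ms":12.5}}`

	scanner := bufio.NewScanner(strings.NewReader(stream))
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "data: ") {
			continue
		}
		var e event
		if err := json.Unmarshal([]byte(line[6:]), &e); err != nil {
			panic(err)
		}
		// Non-zero timings mark completion, matching the check in Predict.
		if e.Timings.PredictedMS > 0 {
			fmt.Println("\ndone in", e.Timings.PredictedMS, "ms")
			return
		}
		fmt.Print(e.Content)
	}
}
```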
@@ -533,6 +545,34 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
return nil
}
func (llm *llama) marshalPrompt(ctx context.Context, pCtx []int, prompt string) (string, error) {
pEncode, err := llm.Encode(ctx, prompt)
if err != nil {
return "", fmt.Errorf("encoding prompt context: %w", err)
}
tokens := append(pCtx, pEncode...)
if llm.NumKeep < 0 {
llm.NumKeep = len(tokens)
}
// min(llm.NumCtx - 4, llm.NumKeep)
if llm.NumCtx-4 < llm.NumKeep {
llm.NumKeep = llm.NumCtx - 4
}
if len(tokens) >= llm.NumCtx {
// truncate input
numLeft := (llm.NumCtx - llm.NumKeep) / 2
truncated := tokens[:llm.NumKeep]
erasedBlocks := (len(tokens) - llm.NumKeep - numLeft - 1) / numLeft
truncated = append(truncated, tokens[llm.NumKeep+erasedBlocks*numLeft:]...)
tokens = truncated
log.Printf("input truncated: num_ctx=%d num_keep=%d num_left=%d num_tokens=%d", llm.NumCtx, llm.NumKeep, numLeft, len(truncated))
}
return llm.Decode(ctx, tokens)
}
type TokenizeRequest struct {
Content string `json:"content"`
}
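
`marshalPrompt` keeps the first `NumKeep` tokens and then drops whole `numLeft`-sized blocks from the middle so the combined context and prompt fit within `NumCtx`. A standalone sketch of the same arithmetic, with a worked example:

```go
package main

import "fmt"

// truncateTokens mirrors the arithmetic in marshalPrompt above: keep the
// first numKeep tokens, then drop whole numLeft-sized blocks from the middle
// so the result fits inside the numCtx context window.
func truncateTokens(tokens []int, numCtx, numKeep int) []int {
	if len(tokens) < numCtx {
		return tokens
	}
	numLeft := (numCtx - numKeep) / 2
	truncated := append([]int{}, tokens[:numKeep]...)
	erasedBlocks := (len(tokens) - numKeep - numLeft - 1) / numLeft
	return append(truncated, tokens[numKeep+erasedBlocks*numLeft:]...)
}

func main() {
	tokens := make([]int, 10)
	for i := range tokens {
		tokens[i] = i
	}
	// With numCtx=8 and numKeep=2: numLeft=3, erasedBlocks=(10-2-3-1)/3=1,
	// so tokens 2..4 are dropped and 5..9 follow the kept prefix.
	fmt.Println(truncateTokens(tokens, 8, 2)) // [0 1 5 6 7 8 9]
}
```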


@@ -1,13 +1,8 @@
//go:build !darwin
// +build !darwin
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release


@@ -0,0 +1,11 @@
//go:build darwin
// +build darwin
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/gpu --target server --config Release


@@ -1,10 +0,0 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release


@@ -1,10 +0,0 @@
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/gpu --target server --config Release


@@ -1,32 +0,0 @@
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Tue, 5 Sep 2023 16:05:08 -0400
Subject: [PATCH] metal: add missing barriers for mul-mat #2699
---
ggml-metal.metal | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
--
2.39.2 (Apple Git-143)


@@ -1,30 +0,0 @@
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Mon, 21 Aug 2023 06:59:29 -0400
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
kernel (#2686)
---
ggml-metal.metal | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..88d48f6 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
+ threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_device);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
--
2.41.0


@@ -1,41 +0,0 @@
From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Tue, 22 Aug 2023 02:18:40 -0400
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
---
ggml-metal.metal | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 88d48f6..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
- threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
- threadgroup_barrier(mem_flags::mem_device);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
--
2.41.0


@@ -6,11 +6,8 @@ GO_LDFLAGS="-X github.com/jmorganca/ollama/version.Version=$VERSION"
GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
# build universal binary
GOARCH=arm64 go generate ./...
GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
rm -rf llm/llama.cpp/ggml/build/*/bin
GOARCH=amd64 go generate ./...
GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
CGO_ENABLED=1 GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
CGO_ENABLED=1 GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
rm dist/ollama-darwin-amd64 dist/ollama-darwin-arm64
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama


@@ -22,8 +22,6 @@ import (
"strings"
"text/template"
"golang.org/x/exp/slices"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/llm"
"github.com/jmorganca/ollama/parser"
@@ -41,18 +39,15 @@ type RegistryOptions struct {
}
type Model struct {
Name string `json:"name"`
ShortName string
ModelPath string
OriginalModel string
AdapterPaths []string
Template string
System string
License []string
Digest string
ConfigDigest string
Options map[string]interface{}
Embeddings []vector.Embedding
Name string `json:"name"`
ModelPath string
AdapterPaths []string
Template string
System string
Digest string
ConfigDigest string
Options map[string]interface{}
Embeddings []vector.Embedding
}
func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) {
@@ -71,6 +66,7 @@ func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, e
System string
Prompt string
Embed string
Args map[string]any
// deprecated: versions <= 0.0.7 used this to omit the system prompt
Context []int
@@ -80,6 +76,7 @@ func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, e
vars.System = m.System
vars.Prompt = request.Prompt
vars.Context = request.Context
vars.Args = request.Args
vars.Embed = embedding
if request.System != "" {
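
On the template side, the request's `Args` land in the render variables as `.Args`, so a Modelfile template can reference individual keys. A sketch with `text/template`, where the `persona` key is hypothetical rather than anything ollama defines:

```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// A reduced version of the vars struct above; .Args.persona resolves
	// as a map lookup at execution time.
	tmpl := template.Must(template.New("prompt").Parse(
		"{{ .System }} You are {{ .Args.persona }}.\nUser: {{ .Prompt }}\n"))
	vars := struct {
		System string
		Prompt string
		Args   map[string]any
	}{
		System: "Answer briefly.",
		Prompt: "What is a manifest layer?",
		Args:   map[string]any{"persona": "a release engineer"},
	}
	if err := tmpl.Execute(os.Stdout, vars); err != nil {
		panic(err)
	}
}
```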
@@ -116,7 +113,6 @@ type LayerReader struct {
type ConfigV2 struct {
ModelFamily llm.ModelFamily `json:"model_family"`
ModelType string `json:"model_type"`
ModelFormat string `json:"model_format"`
FileType string `json:"file_type"`
RootFS RootFS `json:"rootfs"`
@@ -175,11 +171,9 @@ func GetModel(name string) (*Model, error) {
model := &Model{
Name: mp.GetFullTagname(),
ShortName: mp.GetShortTagname(),
Digest: digest,
ConfigDigest: manifest.Config.Digest,
Template: "{{ .Prompt }}",
License: []string{},
}
for _, layer := range manifest.Layers {
@@ -191,7 +185,6 @@ func GetModel(name string) (*Model, error) {
switch layer.MediaType {
case "application/vnd.ollama.image.model":
model.ModelPath = filename
model.OriginalModel = layer.From
case "application/vnd.ollama.image.embed":
file, err := os.Open(filename)
if err != nil {
@@ -236,12 +229,6 @@ func GetModel(name string) (*Model, error) {
if err = json.NewDecoder(params).Decode(&model.Options); err != nil {
return nil, err
}
case "application/vnd.ollama.image.license":
bts, err := os.ReadFile(filename)
if err != nil {
return nil, err
}
model.License = append(model.License, string(bts))
}
}
@@ -289,7 +276,6 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
var layers []*LayerReader
params := make(map[string][]string)
var sourceParams map[string]any
embed := EmbeddingParams{fn: fn}
for _, c := range commands {
log.Printf("[%s] - %s\n", c.Name, c.Args)
@@ -336,7 +322,6 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
config.ModelFamily = ggml.ModelFamily()
config.ModelType = ggml.ModelType().String()
config.ModelFormat = ggml.Name()
config.FileType = ggml.FileType().String()
// reset the file
@@ -368,30 +353,12 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
return err
}
// copy the model metadata
config.ModelFamily = source.ModelFamily
config.ModelType = source.ModelType
config.ModelFormat = source.ModelFormat
config.FileType = source.FileType
for _, l := range mf.Layers {
if l.MediaType == "application/vnd.ollama.image.params" {
sourceParamsBlobPath, err := GetBlobsPath(l.Digest)
if err != nil {
return err
}
sourceParamsBlob, err := os.Open(sourceParamsBlobPath)
if err != nil {
return err
}
defer sourceParamsBlob.Close()
if err := json.NewDecoder(sourceParamsBlob).Decode(&sourceParams); err != nil {
return err
}
}
newLayer, err := GetLayerWithBufferFromLayer(l)
if err != nil {
return err
@@ -462,19 +429,12 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
// Create a single layer for the parameters
if len(params) > 0 {
fn(api.ProgressResponse{Status: "creating parameter layer"})
layers = removeLayerFromLayers(layers, "application/vnd.ollama.image.params")
formattedParams, err := formatParams(params)
if err != nil {
return fmt.Errorf("couldn't create params json: %v", err)
}
for k, v := range sourceParams {
if _, ok := formattedParams[k]; !ok {
formattedParams[k] = v
}
}
bts, err := json.Marshal(formattedParams)
if err != nil {
return err
@@ -672,9 +632,14 @@ func existingFileEmbeddings(digest string) (map[string][]float64, error) {
}
func removeLayerFromLayers(layers []*LayerReader, mediaType string) []*LayerReader {
return slices.DeleteFunc(layers, func(layer *LayerReader) bool {
return layer.MediaType == mediaType
})
j := 0
for _, l := range layers {
if l.MediaType != mediaType {
layers[j] = l
j++
}
}
return layers[:j]
}
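
Rewriting `removeLayerFromLayers` as a manual compaction loop is what lets the `golang.org/x/exp` dependency drop out of go.mod above: keepers are copied forward over the removed elements, reusing the backing array, and the slice is re-cut to the new length. The same idiom in isolation:

```go
package main

import "fmt"

// filterInPlace demonstrates the compaction idiom that replaced
// slices.DeleteFunc: elements to keep are copied forward over the ones
// removed, then the slice is truncated to the kept count.
func filterInPlace(s []string, drop func(string) bool) []string {
	j := 0
	for _, v := range s {
		if !drop(v) {
			s[j] = v
			j++
		}
	}
	return s[:j]
}

func main() {
	layers := []string{"model", "params", "template", "params"}
	layers = filterInPlace(layers, func(v string) bool { return v == "params" })
	fmt.Println(layers) // [model template]
}
```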
func SaveLayers(layers []*LayerReader, fn func(resp api.ProgressResponse), force bool) error {
@@ -948,83 +913,6 @@ func DeleteModel(name string) error {
return nil
}
func ShowModelfile(model *Model) (string, error) {
type modelTemplate struct {
*Model
From string
Params string
}
var params []string
for k, v := range model.Options {
switch val := v.(type) {
case string:
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, val))
case int:
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.Itoa(val)))
case float64:
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatFloat(val, 'f', 0, 64)))
case bool:
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatBool(val)))
case []interface{}:
for _, nv := range val {
switch nval := nv.(type) {
case string:
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, nval))
case int:
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.Itoa(nval)))
case float64:
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatFloat(nval, 'f', 0, 64)))
case bool:
params = append(params, fmt.Sprintf("PARAMETER %s %s", k, strconv.FormatBool(nval)))
default:
log.Printf("unknown type: %s", reflect.TypeOf(nv).String())
}
}
default:
log.Printf("unknown type: %s", reflect.TypeOf(v).String())
}
}
mt := modelTemplate{
Model: model,
From: model.OriginalModel,
Params: strings.Join(params, "\n"),
}
if mt.From == "" {
mt.From = model.ModelPath
}
modelFile := `# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
# FROM {{ .ShortName }}
FROM {{ .From }}
TEMPLATE """{{ .Template }}"""
SYSTEM """{{ .System }}"""
{{ .Params }}
`
for _, l := range mt.Model.AdapterPaths {
modelFile += fmt.Sprintf("ADAPTER %s\n", l)
}
tmpl, err := template.New("").Parse(modelFile)
if err != nil {
log.Printf("error parsing template: %q", err)
return "", err
}
var buf bytes.Buffer
if err = tmpl.Execute(&buf, mt); err != nil {
log.Printf("error executing template: %q", err)
return "", err
}
return buf.String(), nil
}
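
The removed `ShowModelfile` reconstructed a Modelfile by joining `PARAMETER` lines and executing a `text/template` over the model's fields. A trimmed sketch of that rendering, with placeholder values standing in for real model data:

```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// The same FROM/TEMPLATE/SYSTEM/PARAMETER layout the removed function
	// emitted, reduced to its core fields.
	const modelfile = `# Modelfile generated by "ollama show"
FROM {{ .From }}
TEMPLATE """{{ .Template }}"""
SYSTEM """{{ .System }}"""
{{ .Params }}
`
	data := struct{ From, Template, System, Params string }{
		From:     "/path/to/model.bin", // hypothetical blob path
		Template: "{{ .Prompt }}",      // printed literally; it is data here
		System:   "You are a helpful assistant.",
		Params:   "PARAMETER temperature 0.8",
	}
	tmpl := template.Must(template.New("modelfile").Parse(modelfile))
	if err := tmpl.Execute(os.Stdout, data); err != nil {
		panic(err)
	}
}
```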
func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name)
fn(api.ProgressResponse{Status: "retrieving manifest"})


@@ -114,12 +114,7 @@ func GetManifestPath() (string, error) {
return "", err
}
path := filepath.Join(home, ".ollama", "models", "manifests")
if err := os.MkdirAll(path, 0o755); err != nil {
return "", err
}
return path, nil
return filepath.Join(home, ".ollama", "models", "manifests"), nil
}
func GetBlobsPath(digest string) (string, error) {


@@ -12,7 +12,6 @@ import (
"os/signal"
"path/filepath"
"reflect"
"strconv"
"strings"
"sync"
"syscall"
@@ -118,13 +117,12 @@ func load(ctx context.Context, model *Model, reqOpts map[string]interface{}, ses
if err != nil {
return err
}
tokensNoSystem, err := llmModel.Encode(ctx, promptNoSystem)
if err != nil {
return err
}
opts.NumKeep = len(tokensWithSystem) - len(tokensNoSystem)
opts.NumKeep = len(tokensWithSystem) - len(tokensNoSystem) + 1
llmModel.SetOptions(opts)
}
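
`NumKeep` is derived by encoding the prompt with and without the system prompt and taking the difference, now plus one; those leading tokens stay pinned at the front of the window when `marshalPrompt` truncates. A worked instance with hypothetical token ids:

```go
package main

import "fmt"

func main() {
	// Hypothetical token ids: the first four belong to the system prompt,
	// so the difference (plus one, per this change) is what gets pinned.
	tokensWithSystem := []int{101, 102, 103, 104, 105, 106}
	tokensNoSystem := []int{105, 106}
	numKeep := len(tokensWithSystem) - len(tokensNoSystem) + 1
	fmt.Println(numKeep) // 5
}
```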
@@ -365,77 +363,6 @@ func DeleteModelHandler(c *gin.Context) {
}
}
func ShowModelHandler(c *gin.Context) {
var req api.ShowRequest
if err := c.ShouldBindJSON(&req); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
resp, err := GetModelInfo(req.Name)
if err != nil {
if os.IsNotExist(err) {
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
} else {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
c.JSON(http.StatusOK, resp)
}
func GetModelInfo(name string) (*api.ShowResponse, error) {
model, err := GetModel(name)
if err != nil {
return nil, err
}
resp := &api.ShowResponse{
License: strings.Join(model.License, "\n"),
System: model.System,
Template: model.Template,
}
mf, err := ShowModelfile(model)
if err != nil {
return nil, err
}
resp.Modelfile = mf
var params []string
cs := 30
for k, v := range model.Options {
switch val := v.(type) {
case string:
params = append(params, fmt.Sprintf("%-*s %s", cs, k, val))
case int:
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(val)))
case float64:
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(val, 'f', 0, 64)))
case bool:
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(val)))
case []interface{}:
for _, nv := range val {
switch nval := nv.(type) {
case string:
params = append(params, fmt.Sprintf("%-*s %s", cs, k, nval))
case int:
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(nval)))
case float64:
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(nval, 'f', 0, 64)))
case bool:
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(nval)))
}
}
}
}
resp.Parameters = strings.Join(params, "\n")
return resp, nil
}
func ListModelsHandler(c *gin.Context) {
var models []api.ModelResponse
fp, err := GetManifestPath()
@@ -529,7 +456,6 @@ func Serve(ln net.Listener, origins []string) error {
r.POST("/api/copy", CopyModelHandler)
r.GET("/api/tags", ListModelsHandler)
r.DELETE("/api/delete", DeleteModelHandler)
r.POST("/api/show", ShowModelHandler)
log.Printf("Listening on %s", ln.Addr())
s := &http.Server{