Compare commits


48 Commits

Author SHA1 Message Date
Josh Yan 8476ef2bd8 atomic for race 2024-07-15 10:44:35 -07:00
Josh Yan 4c9a160a08 race 2024-07-12 11:52:10 -07:00
Josh Yan 657a1102fc lint complained 2024-07-11 09:26:17 -07:00
Josh Yan d352c68ffc move llama.h 2024-07-10 14:20:04 -07:00
Josh Yan d82d25d70c patched 2024-07-10 13:57:28 -07:00
Josh Yan 60be9e2840 patch 2024-07-10 13:46:38 -07:00
Josh Yan a083852eb5 quantize progress 2024-07-10 13:21:22 -07:00
Josh Yan 6bab0e2368 lint 2024-07-10 12:36:32 -07:00
Josh Yan c4cccaf936 remove rebase err 2024-07-10 11:37:55 -07:00
Josh Yan 9fe5c393e4 hi 2024-07-10 11:35:01 -07:00
Josh Yan 007c988dba rmv double msg 2024-07-10 11:34:31 -07:00
Josh Yan 91d21e7c7b rmv double msg 2024-07-10 11:34:31 -07:00
Josh Yan 3e64284f69 percent 2024-07-10 11:34:31 -07:00
Josh Yan 39910f2ab2 percent 2024-07-10 11:34:29 -07:00
Josh Yan 96d0cd92f2 rebase 2024-07-10 11:31:53 -07:00
Josh Yan 3a724a7c80 isLocal firstdraft 2024-07-10 11:31:12 -07:00
Josh Yan f520f0056e rm config 2024-07-10 11:29:51 -07:00
Josh Yan d25f85ede4 on disk copy 2024-07-10 11:29:49 -07:00
Josh Yan b48420b74b percent 2024-07-10 11:27:33 -07:00
Josh Yan 784958a1cb transfer data 2024-07-10 11:24:50 -07:00
Josh Yan ae65cc8dea progress 2024-07-10 11:24:48 -07:00
Josh Yan a037528bba lint 2024-07-10 11:20:02 -07:00
Josh Yan 04bf41deb5 clean 2024-07-10 11:20:02 -07:00
Josh Yan c23cec9547 removed cmt and prints 2024-07-10 11:20:02 -07:00
Josh Yan 8377dc48d0 removed client isLocal() 2024-07-10 11:20:02 -07:00
Josh Yan 3aee405dfa lint 2024-07-10 11:20:02 -07:00
Josh Yan 9b3f47b674 lint 2024-07-10 11:20:02 -07:00
Josh Yan f5441f01a2 lint 2024-07-10 11:20:02 -07:00
Josh Yan ab165df43a syscopy windows 2024-07-10 11:20:02 -07:00
Josh Yan 79cc4c9585 os copy 2024-07-10 11:20:02 -07:00
Josh Yan bc3f59a6ad rmv prints 2024-07-10 11:20:02 -07:00
Josh Yan 1a85cb904c local copy 2024-07-10 11:20:02 -07:00
Josh Yan 10ea0987e9 isLocal firstdraft 2024-07-10 11:19:50 -07:00
Josh Yan 413d368a6a clean 2024-07-10 11:19:32 -07:00
Josh Yan cabf375059 rm bench 2024-07-10 11:19:32 -07:00
Josh Yan ca0ee1d4fe rm config 2024-07-10 11:19:32 -07:00
Josh Yan 1142999aab rm config 2024-07-10 11:19:32 -07:00
Josh Yan 0d5a72aba9 clean 2024-07-10 11:19:32 -07:00
Josh Yan ea837412c2 local path 2024-07-10 11:19:32 -07:00
Josh Yan 736ad6f438 still works 2024-07-10 11:19:32 -07:00
Josh Yan 64607d16a5 working 2024-07-10 11:19:32 -07:00
Josh Yan a6cfe7f00b benchmark 2024-07-10 11:19:32 -07:00
Josh Yan c3b411a515 on disk copy 2024-07-10 11:19:32 -07:00
Josh Yan 928f37e3ae start tests 2024-07-10 11:19:32 -07:00
Daniel Hiltgen 2d1e3c3229 Merge pull request #5503 from dhiltgen/dual_rocm 2024-07-09 15:44:16 -07:00
    Workaround broken ROCm p2p copy
royjhan 4918fae535 OpenAI v1/completions: allow stop token list (#5551) 2024-07-09 14:01:26 -07:00
    * stop token parsing fix
    * add stop test
royjhan 0aff67877e separate request tests (#5578) 2024-07-09 13:48:31 -07:00
Daniel Hiltgen 0bacb30007 Workaround broken ROCm p2p copy 2024-07-08 09:40:52 -07:00
    Enable the build flag for llama.cpp to use CPU copy for multi-GPU scenarios.
23 changed files with 1894 additions and 690 deletions

.gitattributes (vendored, 1 line changed)

@@ -1 +1,2 @@
llm/ext_server/* linguist-vendored
llm/*.h linguist-vendored

api/client.go

@@ -17,14 +17,20 @@ import (
"bufio"
"bytes"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"net/url"
"os"
"path/filepath"
"runtime"
"strings"
"time"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/version"
@@ -347,16 +353,7 @@ func (c *Client) Heartbeat(ctx context.Context) error {
return nil
}
// Embed generates embeddings from a model.
func (c *Client) Embed(ctx context.Context, req *EmbedRequest) (*EmbedResponse, error) {
var resp EmbedResponse
if err := c.do(ctx, http.MethodPost, "/api/embed", req, &resp); err != nil {
return nil, err
}
return &resp, nil
}
// Embeddings generates embeddings from a model. (Legacy)
// Embeddings generates embeddings from a model.
func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
var resp EmbeddingResponse
if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
@@ -383,3 +380,27 @@ func (c *Client) Version(ctx context.Context) (string, error) {
return version.Version, nil
}
func Authorization(ctx context.Context, request *http.Request) (string, error) {
data := []byte(fmt.Sprintf("%s,%s,%d", request.Method, request.URL.RequestURI(), time.Now().Unix()))
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
knownHostsFile, err := os.OpenFile(filepath.Join(home, ".ollama", "known_hosts"), os.O_CREATE|os.O_RDWR|os.O_APPEND, 0600)
if err != nil {
return "", err
}
defer knownHostsFile.Close()
token, err := auth.Sign(ctx, data)
if err != nil {
return "", err
}
// interleave request data into the token
key, sig, _ := strings.Cut(token, ":")
return fmt.Sprintf("%s:%s:%s", key, base64.StdEncoding.EncodeToString(data), sig), nil
}
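
For reference, a minimal sketch of how a caller attaches this token, mirroring getLocalPath later in this diff (requestURL and reqBody as in that function):

    request, err := http.NewRequestWithContext(ctx, http.MethodPost, requestURL.String(), reqBody)
    if err != nil {
        return "", err
    }
    // Authorization yields "<base64 key>:<base64 of method,uri,timestamp>:<base64 signature>"
    authz, err := api.Authorization(ctx, request)
    if err != nil {
        return "", err
    }
    request.Header.Set("Authorization", authz)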

api/types.go

@@ -173,30 +173,6 @@ type Runner struct {
NumThread int `json:"num_thread,omitempty"`
}
// EmbedRequest is the request passed to [Client.Embed].
type EmbedRequest struct {
// Model is the model name.
Model string `json:"model"`
// Input is the input to embed.
Input any `json:"input"`
// KeepAlive controls how long the model will stay loaded in memory following
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
Truncate *bool `json:"truncate,omitempty"`
// Options lists model-specific options.
Options map[string]interface{} `json:"options"`
}
// EmbedResponse is the response from [Client.Embed].
type EmbedResponse struct {
Model string `json:"model"`
Embeddings [][]float32 `json:"embeddings,omitempty"`
}
// EmbeddingRequest is the request passed to [Client.Embeddings].
type EmbeddingRequest struct {
// Model is the model name.
@@ -291,6 +267,7 @@ type PullRequest struct {
type ProgressResponse struct {
Status string `json:"status"`
Digest string `json:"digest,omitempty"`
Quantize string `json:"quantize,omitempty"`
Total int64 `json:"total,omitempty"`
Completed int64 `json:"completed,omitempty"`
}
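
A sketch of how a progress consumer can branch on the new Quantize field; this mirrors the CreateHandler changes in cmd/cmd.go later in this diff:

    fn := func(resp api.ProgressResponse) error {
        if resp.Quantize != "" {
            // quantization in flight: show resp.Status on a spinner
        } else if resp.Digest != "" {
            // blob transfer: drive a bar from resp.Completed and resp.Total
        }
        return nil
    }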

auth/auth.go

@@ -10,42 +10,37 @@ import (
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/crypto/ssh"
)
const defaultPrivateKey = "id_ed25519"
func keyPath() (string, error) {
func keyPath() (ssh.Signer, error) {
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
return filepath.Join(home, ".ollama", defaultPrivateKey), nil
}
func GetPublicKey() (string, error) {
keyPath, err := keyPath()
if err != nil {
return "", err
return nil, err
}
keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
privateKeyFile, err := os.ReadFile(keyPath)
if err != nil {
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
return "", err
return nil, err
}
privateKey, err := ssh.ParsePrivateKey(privateKeyFile)
return ssh.ParsePrivateKey(privateKeyFile)
}
func GetPublicKey() (ssh.PublicKey, error) {
privateKey, err := keyPath()
// if privateKey, try public key directly
if err != nil {
return "", err
return nil, err
}
publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())
return strings.TrimSpace(string(publicKey)), nil
return privateKey.PublicKey(), nil
}
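
Since GetPublicKey now returns an ssh.PublicKey rather than a marshalled string, callers render it themselves; a sketch matching the cmd/cmd.go and server/images.go changes in this diff:

    pub, err := auth.GetPublicKey()
    if err != nil {
        return err
    }
    localPubKey := strings.TrimSpace(string(ssh.MarshalAuthorizedKey(pub)))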
func NewNonce(r io.Reader, length int) (string, error) {
@@ -58,25 +53,20 @@ func NewNonce(r io.Reader, length int) (string, error) {
}
func Sign(ctx context.Context, bts []byte) (string, error) {
keyPath, err := keyPath()
if err != nil {
return "", err
}
privateKeyFile, err := os.ReadFile(keyPath)
if err != nil {
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
return "", err
}
privateKey, err := ssh.ParsePrivateKey(privateKeyFile)
privateKey, err := keyPath()
if err != nil {
return "", err
}
// get the pubkey, but remove the type
publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())
parts := bytes.Split(publicKey, []byte(" "))
publicKey, err := GetPublicKey()
if err != nil {
return "", err
}
publicKeyBytes := ssh.MarshalAuthorizedKey(publicKey)
parts := bytes.Split(publicKeyBytes, []byte(" "))
if len(parts) < 2 {
return "", fmt.Errorf("malformed public key")
}

cmd/cmd.go

@@ -7,6 +7,7 @@ import (
"crypto/ed25519"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"encoding/pem"
"errors"
"fmt"
@@ -15,6 +16,7 @@ import (
"math"
"net"
"net/http"
"net/url"
"os"
"os/signal"
"path/filepath"
@@ -78,6 +80,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
status := "transferring model data"
spinner := progress.NewSpinner(status)
p.Add(status, spinner)
defer p.Stop()
for i := range modelfile.Commands {
switch modelfile.Commands[i].Name {
@@ -112,16 +115,17 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
path = tempfile
}
digest, err := createBlob(cmd, client, path)
// spinner.Stop()
digest, err := createBlob(cmd, client, path, spinner)
if err != nil {
return err
}
modelfile.Commands[i].Args = "@" + digest
}
}
bars := make(map[string]*progress.Bar)
var quantizeSpin *progress.Spinner
fn := func(resp api.ProgressResponse) error {
if resp.Digest != "" {
spinner.Stop()
@@ -134,11 +138,20 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
}
bar.Set(resp.Completed)
} else if resp.Quantize != "" {
spinner.Stop()
if quantizeSpin != nil {
quantizeSpin.SetMessage(resp.Status)
} else {
quantizeSpin = progress.NewSpinner(resp.Status)
p.Add("quantize", quantizeSpin)
}
} else if status != resp.Status {
spinner.Stop()
status = resp.Status
spinner = progress.NewSpinner(status)
spinner := progress.NewSpinner(status)
p.Add(status, spinner)
}
@@ -263,13 +276,22 @@ func tempZipFiles(path string) (string, error) {
return tempfile.Name(), nil
}
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
var ErrBlobExists = errors.New("blob exists")
func createBlob(cmd *cobra.Command, client *api.Client, path string, spinner *progress.Spinner) (string, error) {
bin, err := os.Open(path)
if err != nil {
return "", err
}
defer bin.Close()
// Get file info to retrieve the size
fileInfo, err := bin.Stat()
if err != nil {
return "", err
}
fileSize := fileInfo.Size()
hash := sha256.New()
if _, err := io.Copy(hash, bin); err != nil {
return "", err
@@ -279,13 +301,157 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
return "", err
}
var pw progressWriter
// Create a progress bar and start a goroutine to update it
// JK Let's use a percentage
//bar := progress.NewBar("transferring model data...", fileSize, 0)
//p.Add("transferring model data", bar)
status := "transferring model data 0%"
spinner.SetMessage(status)
ticker := time.NewTicker(60 * time.Millisecond)
done := make(chan struct{})
defer close(done)
go func() {
defer ticker.Stop()
for {
select {
case <-ticker.C:
spinner.SetMessage(fmt.Sprintf("transferring model data %d%%", int(100*pw.n/fileSize)))
case <-done:
spinner.SetMessage("transferring model data 100%")
return
}
}
}()
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
// We check if we can find the models directory locally
// If we can, we return the path to the directory
// If we can't, we return an error
// If the blob exists already, we return the digest
dest, err := getLocalPath(cmd.Context(), digest)
if errors.Is(err, ErrBlobExists) {
return digest, nil
}
// Successfully found the model directory
if err == nil {
// Copy blob in via OS specific copy
// Linux errors out to use io.copy
err = localCopy(path, dest)
if err == nil {
return digest, nil
}
// Default copy using io.copy
err = defaultCopy(path, dest)
if err == nil {
return digest, nil
}
}
// If at any point copying the blob over locally fails, we default to the copy through the server
if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
return "", err
}
return digest, nil
}
type progressWriter struct {
n int64
}
func (w *progressWriter) Write(p []byte) (n int, err error) {
w.n += int64(len(p))
return len(p), nil
}
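
Note that pw.n is written by Write and read concurrently by the ticker goroutine above; the "race" and "atomic for race" commits at the top of this compare address exactly that. A race-free variant as a sketch (using Go 1.19+ atomic types):

    type progressWriter struct {
        n atomic.Int64 // written by Write, read by the progress ticker
    }

    func (w *progressWriter) Write(p []byte) (n int, err error) {
        w.n.Add(int64(len(p)))
        return len(p), nil
    }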
func getLocalPath(ctx context.Context, digest string) (string, error) {
ollamaHost := envconfig.Host
client := http.DefaultClient
base := &url.URL{
Scheme: ollamaHost.Scheme,
Host: net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
}
data, err := json.Marshal(digest)
if err != nil {
return "", err
}
reqBody := bytes.NewReader(data)
path := fmt.Sprintf("/api/blobs/%s", digest)
requestURL := base.JoinPath(path)
request, err := http.NewRequestWithContext(ctx, http.MethodPost, requestURL.String(), reqBody)
if err != nil {
return "", err
}
authz, err := api.Authorization(ctx, request)
if err != nil {
return "", err
}
request.Header.Set("Authorization", authz)
request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
request.Header.Set("X-Redirect-Create", "1")
resp, err := client.Do(request)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusTemporaryRedirect {
dest := resp.Header.Get("LocalLocation")
return dest, nil
}
return "", ErrBlobExists
}
func defaultCopy(path string, dest string) error {
// This function should be called if the server is local
// It should find the model directory, copy the blob over, and return the digest
dirPath := filepath.Dir(dest)
if err := os.MkdirAll(dirPath, 0o755); err != nil {
return err
}
// Copy blob over
sourceFile, err := os.Open(path)
if err != nil {
return fmt.Errorf("could not open source file: %v", err)
}
defer sourceFile.Close()
destFile, err := os.Create(dest)
if err != nil {
return fmt.Errorf("could not create destination file: %v", err)
}
defer destFile.Close()
_, err = io.CopyBuffer(destFile, sourceFile, make([]byte, 4*1024*1024))
if err != nil {
return fmt.Errorf("error copying file: %v", err)
}
err = destFile.Sync()
if err != nil {
return fmt.Errorf("error flushing file: %v", err)
}
return nil
}
func RunHandler(cmd *cobra.Command, args []string) error {
interactive := true
@@ -379,11 +545,13 @@ func errFromUnknownKey(unknownKeyErr error) error {
if len(matches) > 0 {
serverPubKey := matches[0]
localPubKey, err := auth.GetPublicKey()
publicKey, err := auth.GetPublicKey()
if err != nil {
return unknownKeyErr
}
localPubKey := strings.TrimSpace(string(ssh.MarshalAuthorizedKey(publicKey)))
if runtime.GOOS == "linux" && serverPubKey != localPubKey {
// try the ollama service public key
svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")

cmd/copy_darwin.go (new file, 23 lines)

@@ -0,0 +1,23 @@
package cmd
import (
"os"
"path/filepath"
"golang.org/x/sys/unix"
)
func localCopy(src, target string) error {
dirPath := filepath.Dir(target)
if err := os.MkdirAll(dirPath, 0o755); err != nil {
return err
}
err := unix.Clonefile(src, target, 0)
if err != nil {
return err
}
return nil
}

cmd/copy_linux.go (new file, 7 lines)

@@ -0,0 +1,7 @@
package cmd
import "errors"
func localCopy(src, target string) error {
return errors.New("no local copy implementation for linux")
}
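
A hypothetical Linux implementation for comparison (not part of this PR, which deliberately errors here so createBlob falls back to defaultCopy); it uses copy_file_range(2) via golang.org/x/sys/unix, which some filesystems can service as a cheap reflink or in-kernel copy:

    package cmd

    import (
        "os"
        "path/filepath"

        "golang.org/x/sys/unix"
    )

    // hypothetical sketch; not part of this PR
    func localCopy(src, target string) error {
        if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
            return err
        }
        in, err := os.Open(src)
        if err != nil {
            return err
        }
        defer in.Close()
        fi, err := in.Stat()
        if err != nil {
            return err
        }
        out, err := os.Create(target)
        if err != nil {
            return err
        }
        defer out.Close()
        remaining := fi.Size()
        for remaining > 0 {
            chunk := int64(1 << 30)
            if remaining < chunk {
                chunk = remaining
            }
            // passing nil offsets lets the kernel advance both file positions
            n, err := unix.CopyFileRange(int(in.Fd()), nil, int(out.Fd()), nil, int(chunk), 0)
            if err != nil {
                return err
            }
            if n == 0 {
                break
            }
            remaining -= int64(n)
        }
        return out.Sync()
    }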

cmd/copy_windows.go (new file, 67 lines)

@@ -0,0 +1,67 @@
//go:build windows
// +build windows
package cmd
import (
"os"
"path/filepath"
"syscall"
"unsafe"
)
func localCopy(src, target string) error {
// Create target directory if it doesn't exist
dirPath := filepath.Dir(target)
if err := os.MkdirAll(dirPath, 0o755); err != nil {
return err
}
// Open source file
sourceFile, err := os.Open(src)
if err != nil {
return err
}
defer sourceFile.Close()
// Create target file
targetFile, err := os.Create(target)
if err != nil {
return err
}
defer targetFile.Close()
// Use CopyFileExW to copy the file
err = copyFileEx(src, target)
if err != nil {
return err
}
return nil
}
func copyFileEx(src, dst string) error {
kernel32 := syscall.NewLazyDLL("kernel32.dll")
copyFileEx := kernel32.NewProc("CopyFileExW")
srcPtr, err := syscall.UTF16PtrFromString(src)
if err != nil {
return err
}
dstPtr, err := syscall.UTF16PtrFromString(dst)
if err != nil {
return err
}
r1, _, err := copyFileEx.Call(
uintptr(unsafe.Pointer(srcPtr)),
uintptr(unsafe.Pointer(dstPtr)),
0, 0, 0, 0) // progress routine, callback data, cancel flag, and copy flags are all unused
if r1 == 0 {
return err
}
return nil
}

integration/embed_test.go (deleted, 152 lines)

@@ -1,152 +0,0 @@
//go:build integration
package integration
import (
"context"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
}
if len(res.Embeddings) != 1 {
t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
}
if len(res.Embeddings[0]) != 384 {
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
}
if res.Embeddings[0][0] != 0.010071031 {
t.Fatalf("expected 0.010071031, got %f", res.Embeddings[0][0])
}
}
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
}
if len(res.Embeddings) != 2 {
t.Fatalf("expected 2 embeddings, got %d", len(res.Embeddings))
}
if len(res.Embeddings[0]) != 384 {
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
}
if res.Embeddings[0][0] != 0.010071031 || res.Embeddings[1][0] != -0.009802706 {
t.Fatalf("expected 0.010071031 and -0.009802706, got %f and %f", res.Embeddings[0][0], res.Embeddings[1][0])
}
}
func TestAllMiniLmEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
truncTrue, truncFalse := true, false
type testReq struct {
Name string
Request api.EmbedRequest
}
reqs := []testReq{
{
Name: "Target Truncation",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why",
},
},
{
Name: "Default Truncate",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Options: map[string]any{"num_ctx": 1},
},
},
{
Name: "Explicit Truncate",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncTrue,
Options: map[string]any{"num_ctx": 1},
},
},
}
res := make(map[string]*api.EmbedResponse)
for _, req := range reqs {
response, err := embedTestHelper(ctx, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
res[req.Name] = response
}
if res["Target Truncation"].Embeddings[0][0] != res["Default Truncate"].Embeddings[0][0] {
t.Fatal("expected default request to truncate correctly")
}
if res["Default Truncate"].Embeddings[0][0] != res["Explicit Truncate"].Embeddings[0][0] {
t.Fatal("expected default request and truncate true request to be the same")
}
// check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
Options: map[string]any{"num_ctx": 1},
})
if err == nil {
t.Fatal("expected error, got nil")
}
}
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
response, err := client.Embed(ctx, &req)
if err != nil {
return nil, err
}
return response, nil
}

llm/ext_server/server.cpp

@@ -3188,33 +3188,26 @@ int main(int argc, char **argv) {
prompt = "";
}
if (prompt.size() == 1) {
prompt = prompt[0];
json image_data;
if (body.count("image_data") != 0) {
image_data = body["image_data"];
}
else
{
image_data = "";
}
// create and queue the task
json responses;
{
const int id_task = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(id_task);
llama.request_completion(id_task, {{"prompt", prompt}}, true, -1);
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, true, -1);
// get the result
task_result result = llama.queue_results.recv(id_task);
llama.queue_results.remove_waiting_task_id(id_task);
if (result.error) {
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
}
// get the result
task_result result = llama.queue_results.recv(task_id);
llama.queue_results.remove_waiting_task_id(task_id);
responses = result.result_json.value("results", std::vector<json>{result.result_json});
json embeddings = json::array();
for (auto & elem : responses) {
embeddings.push_back(elem.at("embedding"));
}
// send the result
json embedding_res = json{{"embedding", embeddings}};
return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
}
// send the result
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
});
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?

llm/generate/gen_linux.sh

@@ -254,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""

llm/generate/gen_windows.ps1

@@ -366,6 +366,7 @@ function build_rocm() {
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DGGML_HIPBLAS=on",
"-DLLAMA_CUDA_NO_PEER_COPY=on",
"-DHIP_PLATFORM=amd",
"-DGGML_AVX=on",
"-DGGML_AVX2=off",

llm/llama.h (vendored, new file, 1227 lines; diff suppressed because it is too large)

llm/llm.go

@@ -1,6 +1,6 @@
package llm
// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
// #cgo CPPFLAGS: -Illama.cpp/ggml/include
// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
// #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
// #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
@@ -9,11 +9,22 @@ package llm
// #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
// #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
// #include <stdlib.h>
// #include <stdatomic.h>
// #include "llama.h"
// bool update_quantize_progress(float progress, void* data) {
// atomic_int* atomicData = (atomic_int*)data;
// int intProgress = *((int*)&progress);
// atomic_store(atomicData, intProgress);
// return true;
// }
import "C"
import (
"fmt"
"sync/atomic"
"time"
"unsafe"
"github.com/ollama/ollama/api"
)
// SystemInfo is an unused example of calling llama.cpp functions using CGo
@@ -21,7 +32,7 @@ func SystemInfo() string {
return C.GoString(C.llama_print_system_info())
}
func Quantize(infile, outfile string, ftype fileType) error {
func Quantize(infile, outfile string, ftype fileType, fn func(resp api.ProgressResponse), tensorCount int) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))
@@ -32,6 +43,42 @@ func Quantize(infile, outfile string, ftype fileType) error {
params.nthread = -1
params.ftype = ftype.Value()
// Initialize "global" to store progress
store := (*int32)(C.malloc(C.sizeof_int))
defer C.free(unsafe.Pointer(store))
// Initialize store value, e.g., setting initial progress to 0
atomic.StoreInt32(store, 0)
params.quantize_callback_data = unsafe.Pointer(store)
params.quantize_callback = (C.llama_progress_callback)(C.update_quantize_progress)
ticker := time.NewTicker(30 * time.Millisecond)
done := make(chan struct{})
defer close(done)
go func() {
defer ticker.Stop()
for {
select {
case <-ticker.C:
progressInt := atomic.LoadInt32(store)
progress := *(*float32)(unsafe.Pointer(&progressInt))
fn(api.ProgressResponse{
Status: fmt.Sprintf("quantizing model tensors %d/%d", int(progress), tensorCount),
Quantize: "quant",
})
fmt.Println("Progress: ", progress)
case <-done:
fn(api.ProgressResponse{
Status: fmt.Sprintf("quantizing model tensors %d/%d", tensorCount, tensorCount),
Quantize: "quant",
})
return
}
}
}()
if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
return fmt.Errorf("llama_model_quantize: %d", rc)
}
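
The callback above smuggles the (float) tensor index through an atomic int by reinterpreting its bits. An equivalent, more explicit round-trip as a sketch, using math.Float32bits:

    // store side (what update_quantize_progress does in C):
    atomic.StoreInt32(store, int32(math.Float32bits(progress)))

    // load side (what the ticker goroutine does):
    progress := math.Float32frombits(uint32(atomic.LoadInt32(store)))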

New patch file (quantize progress)

@@ -0,0 +1,52 @@
From ed941590d59fc07b1ad21d6aa458588e47d1e446 Mon Sep 17 00:00:00 2001
From: Josh Yan <jyan00017@gmail.com>
Date: Wed, 10 Jul 2024 13:39:39 -0700
Subject: [PATCH] quantize progress
---
include/llama.h | 3 +++
src/llama.cpp | 8 ++++++++
2 files changed, 11 insertions(+)
diff --git a/include/llama.h b/include/llama.h
index bb4b05ba..613db68e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -349,6 +349,9 @@ extern "C" {
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
+
+ llama_progress_callback quantize_callback; // callback to report quantization progress
+ void * quantize_callback_data; // user data for the callback
} llama_model_quantize_params;
// grammar types
diff --git a/src/llama.cpp b/src/llama.cpp
index 2b9ace28..ac640c02 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18252,6 +18252,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
const auto tn = LLM_TN(model.arch);
new_ofstream(0);
for (int i = 0; i < ml.n_tensors; ++i) {
+ if (params->quantize_callback){
+ if (!params->quantize_callback(i, params->quantize_callback_data)) {
+ return;
+ }
+ }
+
auto weight = ml.get_weight(i);
struct ggml_tensor * tensor = weight->tensor;
if (weight->idx != cur_split && params->keep_split) {
@@ -18789,6 +18795,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.keep_split =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
+ /*.quantize_callback =*/ nullptr,
+ /*.quantize_callback_data =*/ nullptr,
};
return result;
--
2.39.3 (Apple Git-146)

llm/server.go

@@ -33,7 +33,7 @@ type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
Embed(ctx context.Context, input []string) ([][]float32, error)
Embedding(ctx context.Context, prompt string) ([]float64, error)
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
@@ -859,15 +859,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
return nil
}
type EmbedRequest struct {
Content []string `json:"content"`
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbedResponse struct {
Embedding [][]float32 `json:"embedding"`
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
if err := s.sem.Acquire(ctx, 1); err != nil {
slog.Error("Failed to acquire semaphore", "error", err)
return nil, err
@@ -882,7 +882,7 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
}
data, err := json.Marshal(EmbedRequest{Content: input})
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
@@ -909,7 +909,7 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
return nil, fmt.Errorf("%s", body)
}
var embedding EmbedResponse
var embedding EmbeddingResponse
if err := json.Unmarshal(body, &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}

openai/openai.go

@@ -338,12 +338,16 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
switch stop := r.Stop.(type) {
case string:
options["stop"] = []string{stop}
case []string:
options["stop"] = stop
default:
if r.Stop != nil {
return api.GenerateRequest{}, fmt.Errorf("invalid type for 'stop' field: %T", r.Stop)
case []any:
var stops []string
for _, s := range stop {
if str, ok := s.(string); ok {
stops = append(stops, str)
} else {
return api.GenerateRequest{}, fmt.Errorf("invalid type for 'stop' field: %T", s)
}
}
options["stop"] = stops
}
if r.MaxTokens != nil {
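
A sketch of the two JSON shapes the stop handling above accepts (payload shapes follow the OpenAI completions API):

    // {"stop": "\n"}           -> options["stop"] = []string{"\n"}
    // {"stop": ["\n", "stop"]} -> options["stop"] = []string{"\n", "stop"}
    // a non-string element inside the array is rejected with an error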

openai/openai_test.go

@@ -3,7 +3,6 @@ package openai
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"net/http/httptest"
@@ -16,7 +15,133 @@ import (
"github.com/stretchr/testify/assert"
)
func TestMiddleware(t *testing.T) {
func TestMiddlewareRequests(t *testing.T) {
type testCase struct {
Name string
Method string
Path string
Handler func() gin.HandlerFunc
Setup func(t *testing.T, req *http.Request)
Expected func(t *testing.T, req *http.Request)
}
var capturedRequest *http.Request
captureRequestMiddleware := func() gin.HandlerFunc {
return func(c *gin.Context) {
bodyBytes, _ := io.ReadAll(c.Request.Body)
c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes))
capturedRequest = c.Request
c.Next()
}
}
testCases := []testCase{
{
Name: "chat handler",
Method: http.MethodPost,
Path: "/api/chat",
Handler: ChatMiddleware,
Setup: func(t *testing.T, req *http.Request) {
body := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{{Role: "user", Content: "Hello"}},
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, req *http.Request) {
var chatReq api.ChatRequest
if err := json.NewDecoder(req.Body).Decode(&chatReq); err != nil {
t.Fatal(err)
}
if chatReq.Messages[0].Role != "user" {
t.Fatalf("expected 'user', got %s", chatReq.Messages[0].Role)
}
if chatReq.Messages[0].Content != "Hello" {
t.Fatalf("expected 'Hello', got %s", chatReq.Messages[0].Content)
}
},
},
{
Name: "completions handler",
Method: http.MethodPost,
Path: "/api/generate",
Handler: CompletionsMiddleware,
Setup: func(t *testing.T, req *http.Request) {
temp := float32(0.8)
body := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
Temperature: &temp,
Stop: []string{"\n", "stop"},
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, req *http.Request) {
var genReq api.GenerateRequest
if err := json.NewDecoder(req.Body).Decode(&genReq); err != nil {
t.Fatal(err)
}
if genReq.Prompt != "Hello" {
t.Fatalf("expected 'Hello', got %s", genReq.Prompt)
}
if genReq.Options["temperature"] != 1.6 {
t.Fatalf("expected 1.6, got %f", genReq.Options["temperature"])
}
stopTokens, ok := genReq.Options["stop"].([]any)
if !ok {
t.Fatalf("expected stop tokens to be a list")
}
if stopTokens[0] != "\n" || stopTokens[1] != "stop" {
t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens)
}
},
},
}
gin.SetMode(gin.TestMode)
router := gin.New()
endpoint := func(c *gin.Context) {
c.Status(http.StatusOK)
}
for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
router = gin.New()
router.Use(captureRequestMiddleware())
router.Use(tc.Handler())
router.Handle(tc.Method, tc.Path, endpoint)
req, _ := http.NewRequest(tc.Method, tc.Path, nil)
if tc.Setup != nil {
tc.Setup(t, req)
}
resp := httptest.NewRecorder()
router.ServeHTTP(resp, req)
tc.Expected(t, capturedRequest)
})
}
}
func TestMiddlewareResponses(t *testing.T) {
type testCase struct {
Name string
Method string
@@ -30,159 +155,7 @@ func TestMiddleware(t *testing.T) {
testCases := []testCase{
{
Name: "chat handler",
Method: http.MethodPost,
Path: "/api/chat",
TestPath: "/api/chat",
Handler: ChatMiddleware,
Endpoint: func(c *gin.Context) {
var chatReq api.ChatRequest
if err := c.ShouldBindJSON(&chatReq); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request"})
return
}
userMessage := chatReq.Messages[0].Content
var assistantMessage string
switch userMessage {
case "Hello":
assistantMessage = "Hello!"
default:
assistantMessage = "I'm not sure how to respond to that."
}
c.JSON(http.StatusOK, api.ChatResponse{
Message: api.Message{
Role: "assistant",
Content: assistantMessage,
},
})
},
Setup: func(t *testing.T, req *http.Request) {
body := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{{Role: "user", Content: "Hello"}},
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
assert.Equal(t, http.StatusOK, resp.Code)
var chatResp ChatCompletion
if err := json.NewDecoder(resp.Body).Decode(&chatResp); err != nil {
t.Fatal(err)
}
if chatResp.Object != "chat.completion" {
t.Fatalf("expected chat.completion, got %s", chatResp.Object)
}
if chatResp.Choices[0].Message.Content != "Hello!" {
t.Fatalf("expected Hello!, got %s", chatResp.Choices[0].Message.Content)
}
},
},
{
Name: "completions handler",
Method: http.MethodPost,
Path: "/api/generate",
TestPath: "/api/generate",
Handler: CompletionsMiddleware,
Endpoint: func(c *gin.Context) {
c.JSON(http.StatusOK, api.GenerateResponse{
Response: "Hello!",
})
},
Setup: func(t *testing.T, req *http.Request) {
body := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
assert.Equal(t, http.StatusOK, resp.Code)
var completionResp Completion
if err := json.NewDecoder(resp.Body).Decode(&completionResp); err != nil {
t.Fatal(err)
}
if completionResp.Object != "text_completion" {
t.Fatalf("expected text_completion, got %s", completionResp.Object)
}
if completionResp.Choices[0].Text != "Hello!" {
t.Fatalf("expected Hello!, got %s", completionResp.Choices[0].Text)
}
},
},
{
Name: "completions handler with params",
Method: http.MethodPost,
Path: "/api/generate",
TestPath: "/api/generate",
Handler: CompletionsMiddleware,
Endpoint: func(c *gin.Context) {
var generateReq api.GenerateRequest
if err := c.ShouldBindJSON(&generateReq); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request"})
return
}
temperature := generateReq.Options["temperature"].(float64)
var assistantMessage string
switch temperature {
case 1.6:
assistantMessage = "Received temperature of 1.6"
default:
assistantMessage = fmt.Sprintf("Received temperature of %f", temperature)
}
c.JSON(http.StatusOK, api.GenerateResponse{
Response: assistantMessage,
})
},
Setup: func(t *testing.T, req *http.Request) {
temp := float32(0.8)
body := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
Temperature: &temp,
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
assert.Equal(t, http.StatusOK, resp.Code)
var completionResp Completion
if err := json.NewDecoder(resp.Body).Decode(&completionResp); err != nil {
t.Fatal(err)
}
if completionResp.Object != "text_completion" {
t.Fatalf("expected text_completion, got %s", completionResp.Object)
}
if completionResp.Choices[0].Text != "Received temperature of 1.6" {
t.Fatalf("expected Received temperature of 1.6, got %s", completionResp.Choices[0].Text)
}
},
},
{
Name: "completions handler with error",
Name: "completions handler error forwarding",
Method: http.MethodPost,
Path: "/api/generate",
TestPath: "/api/generate",

progress/spinner.go

@@ -31,6 +31,10 @@ func NewSpinner(message string) *Spinner {
return s
}
func (s *Spinner) SetMessage(message string) {
s.message = message
}
func (s *Spinner) String() string {
var sb strings.Builder
if len(s.message) > 0 {

server/images.go

@@ -32,6 +32,7 @@ import (
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
"golang.org/x/crypto/ssh"
)
var errCapabilityCompletion = errors.New("completion")
@@ -422,11 +423,15 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantization string, modelfile *parser.File, fn func(resp api.ProgressResponse)) (err error) {
return err
}
tensorCount := len(baseLayer.GGML.Tensors())
ft := baseLayer.GGML.KV().FileType()
if !slices.Contains([]string{"F16", "F32"}, ft.String()) {
return errors.New("quantization is only supported for F16 and F32 models")
} else if want != ft {
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantization)})
fn(api.ProgressResponse{
Status: "quantizing model tensors",
Quantize: "quant",
})
blob, err := GetBlobsPath(baseLayer.Digest)
if err != nil {
@@ -440,7 +445,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
defer temp.Close()
defer os.Remove(temp.Name())
if err := llm.Quantize(blob, temp.Name(), want); err != nil {
if err := llm.Quantize(blob, temp.Name(), want, fn, tensorCount); err != nil {
return err
}
@@ -1064,11 +1069,12 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) {
if anonymous {
// no user is associated with the public key, and the request requires non-anonymous access
pubKey, nestedErr := auth.GetPublicKey()
localPubKey := strings.TrimSpace(string(ssh.MarshalAuthorizedKey(pubKey)))
if nestedErr != nil {
slog.Error(fmt.Sprintf("couldn't get public key: %v", nestedErr))
return nil, errUnauthorized
}
return nil, &errtypes.UnknownOllamaKey{Key: pubKey}
return nil, &errtypes.UnknownOllamaKey{Key: localPubKey}
}
// user is associated with the public key, but is not authorized to make the request
return nil, errUnauthorized

server/routes.go

@@ -4,12 +4,13 @@ import (
"bytes"
"cmp"
"context"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"log/slog"
"math"
"net"
"net/http"
"net/netip"
@@ -18,14 +19,15 @@ import (
"path/filepath"
"slices"
"strings"
"sync"
"syscall"
"time"
"github.com/gin-contrib/cors"
"github.com/gin-gonic/gin"
"golang.org/x/crypto/ssh"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm"
@@ -248,152 +250,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
streamResponse(c, ch)
}
func (s *Server) EmbedHandler(c *gin.Context) {
var req api.EmbedRequest
err := c.ShouldBindJSON(&req)
switch {
case errors.Is(err, io.EOF):
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
case err != nil:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
if req.Truncate == nil {
truncate := true
req.Truncate = &truncate
}
reqEmbed := []string{}
switch embeddings := req.Input.(type) {
case string:
if embeddings == "" {
c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}})
return
}
reqEmbed = []string{embeddings}
case []any:
if len(embeddings) == 0 {
c.JSON(http.StatusOK, api.EmbedResponse{Model: req.Model, Embeddings: [][]float32{}})
return
}
for _, v := range embeddings {
if _, ok := v.(string); !ok {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"})
return
}
reqEmbed = append(reqEmbed, v.(string))
}
default:
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid input type"})
return
}
r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive)
if err != nil {
handleScheduleError(c, req.Model, err)
return
}
kvData, err := getKVData(m.ModelPath, false)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
reqEmbedArray := make([]string, len(reqEmbed))
errCh := make(chan error, 1)
successCh := make(chan bool, 1)
sem := make(chan struct{}, 2)
var wg sync.WaitGroup
var mu sync.Mutex
for i, s := range reqEmbed {
wg.Add(1)
sem <- struct{}{}
go func(i int, s string) {
defer wg.Done()
defer func() { <-sem }()
tokens, err := r.Tokenize(c.Request.Context(), s)
if err != nil {
errCh <- err
return
}
ctxLen := min(opts.NumCtx, int(kvData.ContextLength()))
if len(tokens) > ctxLen {
if *req.Truncate {
tokens = tokens[:ctxLen]
s, err = r.Detokenize(c.Request.Context(), tokens)
if err != nil {
errCh <- err
return
}
} else {
errCh <- err
return
}
}
mu.Lock()
reqEmbedArray[i] = s
mu.Unlock()
}(i, s)
}
go func() {
wg.Wait()
successCh <- true
close(errCh)
}()
select {
case err := <-errCh:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
case success := <-successCh:
if !success {
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to process all embeddings"})
return
}
}
embeddings, err := r.Embed(c.Request.Context(), reqEmbedArray)
if err != nil {
slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
return
}
for i, e := range embeddings {
embeddings[i] = normalize(e)
}
resp := api.EmbedResponse{
Model: req.Model,
Embeddings: embeddings,
}
c.JSON(http.StatusOK, resp)
}
func normalize(vec []float32) []float32 {
var sum float32
for _, v := range vec {
sum += v * v
}
norm := float32(0.0)
if sum > 0 {
norm = float32(1.0 / math.Sqrt(float64(sum)))
}
for i := range vec {
vec[i] *= norm
}
return vec
}
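
A worked example of normalize as a sketch: [3, 4] has squared sum 25, so each component is scaled by 1/√25:

    fmt.Println(normalize([]float32{3, 4})) // [0.6 0.8]
    fmt.Println(normalize([]float32{0, 0})) // [0 0]; a zero vector is returned unchanged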
func (s *Server) EmbeddingsHandler(c *gin.Context) {
var req api.EmbeddingRequest
if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
@@ -416,24 +272,14 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
return
}
embedding, err := r.Embed(c.Request.Context(), []string{req.Prompt})
embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
if err != nil {
slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
return
}
embedding64 := make([]float64, len(embedding[0]))
for i, v := range embedding[0] {
embedding64[i] = float64(v)
}
resp := api.EmbeddingResponse{
Embedding: embedding64,
}
c.JSON(http.StatusOK, resp)
c.JSON(http.StatusOK, api.EmbeddingResponse{Embedding: embedding})
}
func (s *Server) PullModelHandler(c *gin.Context) {
@@ -928,7 +774,6 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
_, err = os.Stat(path)
switch {
case errors.Is(err, os.ErrNotExist):
@@ -941,6 +786,12 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
return
}
if c.GetHeader("X-Redirect-Create") == "1" && s.IsLocal(c) {
c.Header("LocalLocation", path)
c.Status(http.StatusTemporaryRedirect)
return
}
layer, err := NewLayer(c.Request.Body, "")
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -955,6 +806,54 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
c.Status(http.StatusCreated)
}
func (s *Server) IsLocal(c *gin.Context) bool {
if authz := c.GetHeader("Authorization"); authz != "" {
parts := strings.Split(authz, ":")
if len(parts) != 3 {
return false
}
clientPublicKey, _, _, _, err := ssh.ParseAuthorizedKey([]byte(fmt.Sprintf("ssh-ed25519 %s", parts[0])))
if err != nil {
return false
}
// requestData is formatted as http.Method,http.RequestURI,timestamp (built by api.Authorization)
requestData, err := base64.StdEncoding.DecodeString(parts[1])
if err != nil {
return false
}
partialRequestDataParts := strings.Split(string(requestData), ",")
if len(partialRequestDataParts) != 3 {
return false
}
signature, err := base64.StdEncoding.DecodeString(parts[2])
if err != nil {
return false
}
if err := clientPublicKey.Verify(requestData, &ssh.Signature{Format: clientPublicKey.Type(), Blob: signature}); err != nil {
return false
}
serverPublicKey, err := auth.GetPublicKey()
if err != nil {
log.Fatal(err)
}
if bytes.Equal(serverPublicKey.Marshal(), clientPublicKey.Marshal()) {
return true
}
c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "unauthorized"})
return false
}
return false
}
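
For reference, the three colon-separated parts checked above, as produced by api.Authorization earlier in this diff:

    // parts[0]: base64 ed25519 public key blob (re-assembled as "ssh-ed25519 <blob>")
    // parts[1]: base64 of "method,requestURI,unix-timestamp"
    // parts[2]: base64 signature over the decoded parts[1] bytes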
func isLocalIP(ip netip.Addr) bool {
if interfaces, err := net.Interfaces(); err == nil {
for _, iface := range interfaces {
@@ -1059,8 +958,7 @@ func (s *Server) GenerateRoutes() http.Handler {
r.POST("/api/pull", s.PullModelHandler)
r.POST("/api/generate", s.GenerateHandler)
r.POST("/api/chat", s.ChatHandler)
r.POST("/api/embed", s.EmbedHandler)
r.POST("/api/embeddings", s.EmbeddingsHandler) // legacy
r.POST("/api/embeddings", s.EmbeddingsHandler)
r.POST("/api/create", s.CreateModelHandler)
r.POST("/api/push", s.PushModelHandler)
r.POST("/api/copy", s.CopyModelHandler)

server/routes_test.go

@@ -7,7 +7,6 @@ import (
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"net/http/httptest"
"os"
@@ -273,73 +272,6 @@ func Test_Routes(t *testing.T) {
assert.Equal(t, "library", retrieveResp.OwnedBy)
},
},
{
Name: "Embed Handler Empty Input",
Method: http.MethodPost,
Path: "/api/embed",
Setup: func(t *testing.T, req *http.Request) {
embedReq := api.EmbedRequest{
Model: "t-bone",
Input: "",
}
jsonData, err := json.Marshal(embedReq)
require.NoError(t, err)
req.Body = io.NopCloser(bytes.NewReader(jsonData))
},
Expected: func(t *testing.T, resp *http.Response) {
contentType := resp.Header.Get("Content-Type")
if contentType != "application/json; charset=utf-8" {
t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType)
}
body, err := io.ReadAll(resp.Body)
if err != nil {
t.Fatal(err)
}
var embedResp api.EmbedResponse
err = json.Unmarshal(body, &embedResp)
if err != nil {
t.Fatal(err)
}
if embedResp.Model != "t-bone" {
t.Fatalf("expected model t-bone, got %s", embedResp.Model)
}
if embedResp.Embeddings != nil {
t.Fatalf("expected embeddings to be nil, got %v", embedResp.Embeddings)
}
},
},
{
Name: "Embed Handler Invalid Input",
Method: http.MethodPost,
Path: "/api/embed",
Setup: func(t *testing.T, req *http.Request) {
embedReq := api.EmbedRequest{
Model: "t-bone",
Input: 2,
}
jsonData, err := json.Marshal(embedReq)
require.NoError(t, err)
req.Body = io.NopCloser(bytes.NewReader(jsonData))
},
Expected: func(t *testing.T, resp *http.Response) {
contentType := resp.Header.Get("Content-Type")
if contentType != "application/json; charset=utf-8" {
t.Fatalf("expected content type application/json; charset=utf-8, got %s", contentType)
}
_, err := io.ReadAll(resp.Body)
if err != nil {
t.Fatal(err)
}
if resp.StatusCode != http.StatusBadRequest {
t.Fatalf("expected status code 400, got %d", resp.StatusCode)
}
},
},
}
t.Setenv("OLLAMA_MODELS", t.TempDir())
@@ -488,38 +420,3 @@ func TestShow(t *testing.T) {
t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"])
}
}
func TestNormalize(t *testing.T) {
type testCase struct {
input []float32
}
testCases := []testCase{
{input: []float32{1}},
{input: []float32{0, 1, 2, 3}},
{input: []float32{0.1, 0.2, 0.3}},
{input: []float32{-0.1, 0.2, 0.3, -0.4}},
{input: []float32{0, 0, 0}},
}
isNormalized := func(vec []float32) (res bool) {
sum := 0.0
for _, v := range vec {
sum += float64(v * v)
}
if math.Abs(sum-1) > 1e-6 {
return sum == 0
} else {
return true
}
}
for _, tc := range testCases {
t.Run("", func(t *testing.T) {
normalized := normalize(tc.input)
if !isNormalized(normalized) {
t.Errorf("Vector %v is not normalized", tc.input)
}
})
}
}

server/sched_test.go

@@ -642,8 +642,8 @@ type mockLlm struct {
pingResp error
waitResp error
completionResp error
embedResp [][]float32
embedRespErr error
embeddingResp []float64
embeddingRespErr error
tokenizeResp []int
tokenizeRespErr error
detokenizeResp string
@@ -660,8 +660,8 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
return s.completionResp
}
func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
return s.embedResp, s.embedRespErr
func (s *mockLlm) Embedding(ctx context.Context, prompt string) ([]float64, error) {
return s.embeddingResp, s.embeddingRespErr
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
return s.tokenizeResp, s.tokenizeRespErr