Compare commits

...

52 Commits

Author SHA1 Message Date
Roy Han
d77a174eb4 default timeout 2024-06-27 14:58:31 -07:00
Michael
2cc7d05012 update readme for gemma 2 (#5333)
* update readme for gemma 2
2024-06-27 12:45:16 -04:00
Michael Yang
123a722a6f zip: prevent extracting files into parent dirs (#5314) 2024-06-26 21:38:21 -07:00
Jeffrey Morgan
4d311eb731 llm: architecture patch (#5316) 2024-06-26 21:38:12 -07:00
Blake Mizerany
cb42e607c5 llm: speed up gguf decoding by a lot (#5246)
Previously, some costly things were causing the loading of GGUF files
and their metadata and tensor information to be VERY slow:

  * Too many allocations when decoding strings
  * Hitting disk for each read of each key and value, resulting in a
    not-okay amount of syscalls/disk I/O.

The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro
m3.

This commit also prevents collecting large arrays of values when
decoding GGUFs (if desired). When such keys are encountered, their
values are null, and are encoded as such in JSON.

Also, this fixes a broken test that was not encoding valid GGUF.
2024-06-24 21:47:52 -07:00
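
A minimal sketch of the buffering idea behind this speedup, in Go. The helper name readMagic and the model.gguf path are hypothetical; the point is only that wrapping the file in a buffered reader keeps small metadata reads from each becoming a separate syscall, which is the same idea as the 32 KiB buffered seeker added in this change.

```go
package main

import (
	"bufio"
	"encoding/binary"
	"fmt"
	"os"
)

// readMagic is a hypothetical helper: wrap the file in a buffered reader so
// each tiny metadata read is served from memory instead of hitting the disk
// with its own syscall.
func readMagic(path string) (uint32, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	br := bufio.NewReaderSize(f, 32<<10) // 32 KiB buffer, mirroring the change's buffered seeker
	var magic uint32
	if err := binary.Read(br, binary.LittleEndian, &magic); err != nil {
		return 0, err
	}
	return magic, nil
}

func main() {
	magic, err := readMagic("model.gguf") // hypothetical path
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	fmt.Printf("magic: %#x\n", magic)
}
```
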
Blake Mizerany
2aa91a937b cmd: defer stating model info until necessary (#5248)
This commit changes the 'ollama run' command to defer fetching model
information until it really needs it. That is, when in interactive mode.

It also removes one case where the model information was fetched twice:
just before calling generateInteractive, and then again, first thing,
inside generateInteractive.

This positively impacts the performance of the command:

    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.168 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.220 total
    ; time ./before run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./before run llama3 'hi'  0.02s user 0.01s system 2% cpu 1.217 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 4% cpu 0.652 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 5% cpu 0.498 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with or would you like to chat?

    ./after run llama3 'hi'  0.01s user 0.01s system 3% cpu 0.479 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
    ; time ./after run llama3 'hi'
    Hi! It's nice to meet you. Is there something I can help you with, or would you like to chat?

    ./after run llama3 'hi'  0.02s user 0.01s system 5% cpu 0.507 total
2024-06-24 20:14:03 -07:00
Daniel Hiltgen
ccef9431c8 Merge pull request #5205 from dhiltgen/modelfile_use_mmap
Fix use_mmap parsing for modelfiles
2024-06-21 16:30:36 -07:00
royjhan
9a9e7d83c4 Docs (#5149) 2024-06-21 15:52:09 -07:00
Michael Yang
189a43caa2 Merge pull request #5206 from ollama/mxyng/quantize
fix: quantization with template
2024-06-21 13:44:34 -07:00
Michael Yang
e835ef1836 fix: quantization with template 2024-06-21 13:39:25 -07:00
Daniel Hiltgen
7e7749224c Fix use_mmap parsing for modelfiles
Add the new tristate parsing logic to the modelfile code path,
along with a unit test.
2024-06-21 12:27:19 -07:00
Daniel Hiltgen
c7c2f3bc22 Merge pull request #5194 from dhiltgen/linux_mmap_auto
Refine mmap default logic on linux
2024-06-20 11:44:08 -07:00
Daniel Hiltgen
54a79d6a8a Merge pull request #5125 from dhiltgen/fedora39
Bump latest fedora cuda repo to 39
2024-06-20 11:27:24 -07:00
Daniel Hiltgen
5bf5aeec01 Refine mmap default logic on linux
If we try to use mmap when the model is larger than the free system memory, loading is slower than the no-mmap approach.
2024-06-20 11:07:04 -07:00
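
A rough sketch of the default described here, with a hypothetical shouldUseMMap helper and made-up sizes; the real decision lives in the server's memory estimation, this only shows the comparison.

```go
package main

import "fmt"

// shouldUseMMap sketches the heuristic: default to mmap only when the model
// fits in currently free system memory; otherwise mmap-backed loading tends
// to be slower than plain reads.
func shouldUseMMap(modelSize, freeMemory uint64) bool {
	return modelSize <= freeMemory
}

func main() {
	fmt.Println(shouldUseMMap(8<<30, 16<<30))  // true: 8 GiB model, 16 GiB free
	fmt.Println(shouldUseMMap(40<<30, 16<<30)) // false: fall back to no-mmap
}
```
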
Michael Yang
e01e535cbb Merge pull request #5192 from ollama/mxyng/kv
handle asymmetric embedding KVs
2024-06-20 10:46:24 -07:00
Josh
0195d6a2f8 Merge pull request #5188 from ollama/jyan/tmpdir2
fix: skip os.removeAll() if PID does not exist
2024-06-20 10:40:59 -07:00
Michael Yang
8e0641a9bf handle asymmetric embedding KVs 2024-06-20 09:57:27 -07:00
Josh Yan
662568d453 err!=nil check 2024-06-20 09:30:59 -07:00
Josh Yan
4ebb66c662 reformat error check 2024-06-20 09:23:43 -07:00
Josh Yan
23e899f32d skip os.removeAll() if PID does not exist 2024-06-20 08:51:35 -07:00
royjhan
fedf71635e Extend api/show and ollama show to return more model info (#4881)
* API Show Extended

* Initial Draft of Information

Co-Authored-By: Patrick Devine <pdevine@sonic.net>

* Clean Up

* Descriptive arg error messages and other fixes

* Second Draft of Show with Projectors Included

* Remove Chat Template

* Touches

* Prevent wrapping from files

* Verbose functionality

* Docs

* Address Feedback

* Lint

* Resolve Conflicts

* Function Name

* Tests for api/show model info

* Show Test File

* Add Projector Test

* Clean routes

* Projector Check

* Move Show Test

* Touches

* Doc update

---------

Co-authored-by: Patrick Devine <pdevine@sonic.net>
2024-06-19 14:19:02 -07:00
Daniel Hiltgen
97c59be653 Merge pull request #5074 from dhiltgen/app_log_rotation
Implement log rotation for tray app
2024-06-19 13:02:24 -07:00
Daniel Hiltgen
9d8a4988e8 Implement log rotation for tray app 2024-06-19 12:53:34 -07:00
Michael Yang
1ae0750a21 Merge pull request #5147 from ollama/mxyng/cleanup
remove confusing log message
2024-06-19 12:50:31 -07:00
Michael Yang
9d91e5e587 remove confusing log message 2024-06-19 11:14:11 -07:00
Daniel Hiltgen
96624aa412 Merge pull request #5072 from dhiltgen/windows_path
Move libraries out of users path
2024-06-19 09:13:39 -07:00
Daniel Hiltgen
10f33b8537 Merge pull request #5146 from dhiltgen/backout
Put back temporary intel GPU env var
2024-06-19 09:12:45 -07:00
Daniel Hiltgen
4a633cc295 Merge pull request #5145 from dhiltgen/bad_loads
Fix bad symbol load detection
2024-06-19 09:12:33 -07:00
Daniel Hiltgen
d34d88e417 Revert "Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)""
This reverts commit 755b4e4fc2.
2024-06-19 08:57:41 -07:00
Daniel Hiltgen
52ce350b7a Fix bad symbol load detection
Pointer derefs weren't correct for a few libraries, which explains
some crashes on older systems or with miswired symlinks for discovery libraries.
2024-06-19 08:39:07 -07:00
Daniel Hiltgen
2abebb2cbe Merge pull request #5128 from zhewang1-intc/fix_levelzero_empty_symbol_detect
Fix levelzero empty symbol detect
2024-06-19 08:33:16 -07:00
Blake Mizerany
380e06e5be types/model: remove Digest
The Digest type in its current form is awkward to work with and presents
challenges with regard to how it serializes via String using the '-'
prefix.

We currently only use this in ollama.com, so we'll move our specific
needs around digest parsing and validation there.
2024-06-18 20:28:11 -07:00
Wang,Zhe
badf975e45 get real func ptr. 2024-06-19 09:00:51 +08:00
Wang,Zhe
755b4e4fc2 Revert "gpu: add env var for detecting Intel oneapi gpus (#5076)"
This reverts commit 163cd3e77c.
2024-06-19 08:59:58 +08:00
Daniel Hiltgen
1a1c99e334 Bump latest fedora cuda repo to 39 2024-06-18 17:13:54 -07:00
Michael Yang
21adf8b6d2 Merge pull request #5121 from ollama/mxyng/deepseekv2
deepseek v2 graph
2024-06-18 16:30:58 -07:00
Michael Yang
e873841cbb deepseek v2 graph 2024-06-18 15:35:12 -07:00
Daniel Hiltgen
26d0bf9236 Merge pull request #5117 from dhiltgen/fix_prediction
Handle models with divergent layer sizes
2024-06-18 11:36:51 -07:00
Daniel Hiltgen
359b15a597 Handle models with divergent layer sizes
The recent refactoring of the memory prediction assumed all layers
are the same size, but for some models (like deepseek-coder-v2) this
is not the case, so our predictions were significantly off.
2024-06-18 11:05:34 -07:00
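
A simplified sketch of the fix, with hypothetical layer sizes: sum each layer's actual size rather than multiplying the first layer's size by the layer count.

```go
package main

import "fmt"

// estimateUniform reproduces the old assumption: every layer is the same size
// as the first one, which misestimates models with divergent layers.
func estimateUniform(layerSizes []uint64) uint64 {
	if len(layerSizes) == 0 {
		return 0
	}
	return uint64(len(layerSizes)) * layerSizes[0]
}

// estimatePerLayer sums the real size of each layer.
func estimatePerLayer(layerSizes []uint64) uint64 {
	var total uint64
	for _, s := range layerSizes {
		total += s
	}
	return total
}

func main() {
	layers := []uint64{300 << 20, 300 << 20, 900 << 20} // hypothetical mixed layer sizes
	fmt.Println(estimateUniform(layers), estimatePerLayer(layers))
}
```
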
Daniel Hiltgen
b55958a587 Merge pull request #5106 from dhiltgen/clean_logs
Tighten up memory prediction logging
2024-06-18 09:24:38 -07:00
Daniel Hiltgen
7784ca33ce Tighten up memory prediction logging
Prior to this change, we logged the memory prediction multiple times
as the scheduler iterated to find a suitable configuration, which could be
confusing since only the last log before the server starts is actually valid.
This now logs once, just before starting the server, on the final configuration.
It also reports which library is used instead of always saying "offloading to gpu",
even when running on CPU.
2024-06-18 09:15:35 -07:00
Daniel Hiltgen
c9c8c98bf6 Merge pull request #5105 from dhiltgen/cuda_mmap
Adjust mmap logic for cuda windows for faster model load
2024-06-17 17:07:30 -07:00
Daniel Hiltgen
171796791f Adjust mmap logic for cuda windows for faster model load
On Windows, recent llama.cpp changes make mmap slower in most
cases, so default to off. This also implements a tri-state for
use_mmap so we can distinguish a user-provided value of true/false
from an unspecified one.
2024-06-17 16:54:30 -07:00
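
A self-contained sketch of the tri-state idea, reusing the TriState names that appear in the api changes later in this diff; the wiring here is simplified, and the unmarshal includes an explicit else branch for clarity.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// TriState distinguishes "unset" from an explicit true/false, so a
// user-provided use_mmap only overrides the platform default when it was
// actually given.
type TriState int

const (
	TriStateUndefined TriState = -1
	TriStateFalse     TriState = 0
	TriStateTrue      TriState = 1
)

func (b *TriState) UnmarshalJSON(data []byte) error {
	var v bool
	if err := json.Unmarshal(data, &v); err != nil {
		return err
	}
	if v {
		*b = TriStateTrue
	} else {
		*b = TriStateFalse
	}
	return nil
}

type options struct {
	UseMMap TriState `json:"use_mmap"`
}

func main() {
	o := options{UseMMap: TriStateUndefined} // stays undefined when the key is absent
	_ = json.Unmarshal([]byte(`{"use_mmap": false}`), &o)
	fmt.Println(o.UseMMap) // 0: explicit false, distinguishable from -1 (unset)
}
```
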
Jeffrey Morgan
176d0f7075 Update import.md 2024-06-17 19:44:14 -04:00
Daniel Hiltgen
8ed51cac37 Merge pull request #5103 from dhiltgen/faster_win_build
Revert powershell jobs, but keep nvcc and cmake parallelism
2024-06-17 14:23:18 -07:00
Daniel Hiltgen
c9e6f0542d Merge pull request #5069 from dhiltgen/ci_release
Implement custom github release action
2024-06-17 13:59:37 -07:00
Daniel Hiltgen
b0930626c5 Add back lower level parallel flags
nvcc supports parallelism (threads) and cmake + make can use -j,
while msbuild requires /p:CL_MPcount=8
2024-06-17 13:44:46 -07:00
Daniel Hiltgen
e890be4814 Revert "More parallelism on windows generate"
This reverts commit 0577af98f4.
2024-06-17 13:32:46 -07:00
Daniel Hiltgen
b2799f111b Move libraries out of users path
We update the PATH on Windows to get the CLI mapped, but this has
an unintended side effect: other apps that may use our bundled
DLLs can get terminated when we upgrade.
2024-06-17 13:12:18 -07:00
Jeffrey Morgan
152fc202f5 llm: update llama.cpp commit to 7c26775 (#4896)
* llm: update llama.cpp submodule to `7c26775`

* disable `LLAMA_BLAS` for now

* `-DLLAMA_OPENMP=off`
2024-06-17 15:56:16 -04:00
Lei Jitang
4ad0d4d6d3 Fix a build warning (#5096)
Signed-off-by: Lei Jitang <leijitang@outlook.com>
2024-06-17 14:47:48 -04:00
Daniel Hiltgen
a12283e2ff Implement custom github release action
This implements the release logic we want via the gh CLI,
so we can update releases with rc tags in place while retaining
release notes and other community reactions.
2024-06-15 11:36:56 -07:00
50 changed files with 1533 additions and 522 deletions

View File

@@ -437,6 +437,7 @@ jobs:
env:
OLLAMA_SKIP_IMAGE_BUILD: '1'
PUSH: '1'
GH_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v4
- name: Set Version
@@ -460,15 +461,20 @@ jobs:
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
cat dist/sha256sum.txt
- uses: ncipollo/release-action@v1
with:
name: ${{ env.RELEASE_VERSION }}
allowUpdates: true
artifacts: 'dist/*'
draft: true
prerelease: true
omitBodyDuringUpdate: true
generateReleaseNotes: true
omitDraftDuringUpdate: true
omitPrereleaseDuringUpdate: true
replacesArtifacts: true
- name: Create or update Release
run: |
echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
if [ -n "$OLD_TAG" ]; then
echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
else
echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
gh release create ${GITHUB_REF_NAME} \
--title ${{ env.RELEASE_VERSION }} \
--draft \
--generate-notes \
--prerelease
fi
echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
gh release upload ${GITHUB_REF_NAME} dist/* --clobber

View File

@@ -53,8 +53,8 @@ Here are some example models that can be downloaded:
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
@@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
### Show model information
```
ollama show llama3
```
### List models on your computer
```

View File

@@ -159,18 +159,49 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap TriState `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
}
type TriState int
const (
TriStateUndefined TriState = -1
TriStateFalse TriState = 0
TriStateTrue TriState = 1
)
func (b *TriState) UnmarshalJSON(data []byte) error {
var v bool
if err := json.Unmarshal(data, &v); err != nil {
return err
}
if v {
*b = TriStateTrue
}
*b = TriStateFalse
return nil
}
func (b *TriState) MarshalJSON() ([]byte, error) {
if *b == TriStateUndefined {
return nil, nil
}
var v bool
if *b == TriStateTrue {
v = true
}
return json.Marshal(v)
}
// EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -222,6 +253,7 @@ type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
Template string `json:"template"`
Verbose bool `json:"verbose"`
Options map[string]interface{} `json:"options"`
@@ -231,14 +263,16 @@ type ShowRequest struct {
// ShowResponse is the response returned from [Client.Show].
type ShowResponse struct {
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
Parameters string `json:"parameters,omitempty"`
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
ModelInfo map[string]any `json:"model_info,omitempty"`
ProjectorInfo map[string]any `json:"projector_info,omitempty"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].
@@ -403,6 +437,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
continue
}
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
if val {
field.SetInt(int64(TriStateTrue))
} else {
field.SetInt(int64(TriStateFalse))
}
continue
}
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
@@ -491,7 +538,7 @@ func DefaultOptions() Options {
LowVRAM: false,
F16KV: true,
UseMLock: false,
UseMMap: true,
UseMMap: TriStateUndefined,
UseNUMA: false,
},
}
@@ -561,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
} else {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
boolVal, err := strconv.ParseBool(vals[0])
if err != nil {
return nil, fmt.Errorf("invalid bool value %s", vals)
}
if boolVal {
out[key] = TriStateTrue
} else {
out[key] = TriStateFalse
}
continue
}
switch field.Kind() {
case reflect.Float32:
floatVal, err := strconv.ParseFloat(vals[0], 32)

View File

@@ -2,6 +2,7 @@ package api
import (
"encoding/json"
"fmt"
"math"
"testing"
"time"
@@ -105,3 +106,101 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
})
}
}
func TestUseMmapParsingFromJSON(t *testing.T) {
tests := []struct {
name string
req string
exp TriState
}{
{
name: "Undefined",
req: `{ }`,
exp: TriStateUndefined,
},
{
name: "True",
req: `{ "use_mmap": true }`,
exp: TriStateTrue,
},
{
name: "False",
req: `{ "use_mmap": false }`,
exp: TriStateFalse,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var oMap map[string]interface{}
err := json.Unmarshal([]byte(test.req), &oMap)
require.NoError(t, err)
opts := DefaultOptions()
err = opts.FromMap(oMap)
require.NoError(t, err)
assert.Equal(t, test.exp, opts.UseMMap)
})
}
}
func TestUseMmapFormatParams(t *testing.T) {
tests := []struct {
name string
req map[string][]string
exp TriState
err error
}{
{
name: "True",
req: map[string][]string{
"use_mmap": []string{"true"},
},
exp: TriStateTrue,
err: nil,
},
{
name: "False",
req: map[string][]string{
"use_mmap": []string{"false"},
},
exp: TriStateFalse,
err: nil,
},
{
name: "Numeric True",
req: map[string][]string{
"use_mmap": []string{"1"},
},
exp: TriStateTrue,
err: nil,
},
{
name: "Numeric False",
req: map[string][]string{
"use_mmap": []string{"0"},
},
exp: TriStateFalse,
err: nil,
},
{
name: "invalid string",
req: map[string][]string{
"use_mmap": []string{"foo"},
},
exp: TriStateUndefined,
err: fmt.Errorf("invalid bool value [foo]"),
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
resp, err := FormatParams(test.req)
require.Equal(t, err, test.err)
respVal, ok := resp["use_mmap"]
if test.exp != TriStateUndefined {
assert.True(t, ok, "resp: %v", resp)
assert.Equal(t, test.exp, respVal)
}
})
}
}

View File

@@ -5,6 +5,8 @@ import (
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/ollama/ollama/envconfig"
)
@@ -24,6 +26,7 @@ func InitLogging() {
logFile = os.Stderr
// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
} else {
rotateLogs(AppLogFile)
logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -46,3 +49,32 @@ func InitLogging() {
slog.Info("ollama app started")
}
func rotateLogs(logFile string) {
if _, err := os.Stat(logFile); os.IsNotExist(err) {
return
}
index := strings.LastIndex(logFile, ".")
pre := logFile[:index]
post := "." + logFile[index+1:]
for i := LogRotationCount; i > 0; i-- {
older := pre + "-" + strconv.Itoa(i) + post
newer := pre + "-" + strconv.Itoa(i-1) + post
if i == 1 {
newer = pre + post
}
if _, err := os.Stat(newer); err == nil {
if _, err := os.Stat(older); err == nil {
err := os.Remove(older)
if err != nil {
slog.Warn("Failed to remove older log", "older", older, "error", err)
continue
}
}
err := os.Rename(newer, older)
if err != nil {
slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
}
}
}
}

View File

@@ -0,0 +1,44 @@
package lifecycle
import (
"os"
"path/filepath"
"strconv"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestRotateLogs(t *testing.T) {
logDir := t.TempDir()
logFile := filepath.Join(logDir, "testlog.log")
// No log exists
rotateLogs(logFile)
require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
assert.FileExists(t, logFile)
// First rotation
rotateLogs(logFile)
assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
assert.NoFileExists(t, logFile)
// Should be a no-op without a new log
rotateLogs(logFile)
assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
assert.NoFileExists(t, logFile)
for i := 2; i <= LogRotationCount+1; i++ {
require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
assert.FileExists(t, logFile)
rotateLogs(logFile)
assert.NoFileExists(t, logFile)
for j := 1; j < i; j++ {
assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log"))
}
assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log"))
}
}

View File

@@ -16,11 +16,12 @@ var (
AppDir = "/opt/Ollama"
AppDataDir = "/opt/Ollama"
// TODO - should there be a distinct log dir?
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
UpdateStageDir = "/tmp"
AppLogFile = "/tmp/ollama_app.log"
ServerLogFile = "/tmp/ollama.log"
UpgradeLogFile = "/tmp/ollama_update.log"
Installer = "OllamaSetup.exe"
LogRotationCount = 5
)
func init() {

View File

@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
}
// TODO - rotation
rotateLogs(ServerLogFile)
logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)

View File

@@ -88,10 +88,15 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\cuda")
Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\oneapi")
Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
#endif
#if DirExists("..\dist\windows-amd64\rocm")
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
#endif

View File

@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
}
defer tempfile.Close()
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
detectContentType := func(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
files = append(files, tks...)
}
zipfile := zip.NewWriter(tempfile)
defer zipfile.Close()
for _, file := range files {
f, err := os.Open(file)
if err != nil {
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
}
func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
name := args[0]
// check if the model exists on the server
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
var statusError api.StatusError
switch {
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
if err := PullHandler(cmd, []string{name}); err != nil {
return err
}
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
if err != nil {
return err
}
case err != nil:
return err
}
interactive := true
opts := runOptions{
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
MultiModal: slices.Contains(show.Details.Families, "clip"),
ParentModel: show.Details.ParentModel,
Model: args[0],
WordWrap: os.Getenv("TERM") == "xterm-256color",
Options: map[string]interface{}{},
}
format, err := cmd.Flags().GetString("format")
@@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
opts.WordWrap = !nowrap
if !interactive {
return generate(cmd, opts)
// Fill out the rest of the options based on information about the
// model.
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
return generateInteractive(cmd, opts)
name := args[0]
info, err := func() (*api.ShowResponse, error) {
showReq := &api.ShowRequest{Name: name}
info, err := client.Show(cmd.Context(), showReq)
var se api.StatusError
if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
if err := PullHandler(cmd, []string{name}); err != nil {
return nil, err
}
return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
}
return info, err
}()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
opts.ParentModel = info.Details.ParentModel
opts.Messages = append(opts.Messages, info.Messages...)
if interactive {
return generateInteractive(cmd, opts)
}
return generate(cmd, opts)
}
func errFromUnknownKey(unknownKeyErr error) error {
@@ -579,10 +580,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
if len(args) != 1 {
return errors.New("missing model name")
}
license, errLicense := cmd.Flags().GetBool("license")
modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -625,8 +622,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
if flagsSet > 1 {
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
} else if flagsSet == 0 {
return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
}
if flagsSet == 1 {
req := api.ShowRequest{Name: args[0]}
resp, err := client.Show(cmd.Context(), &req)
if err != nil {
return err
}
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
}
return nil
}
req := api.ShowRequest{Name: args[0]}
@@ -635,22 +653,114 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return err
}
switch showType {
case "license":
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
fmt.Println(resp.Parameters)
case "system":
fmt.Println(resp.System)
case "template":
fmt.Println(resp.Template)
arch := resp.ModelInfo["general.architecture"].(string)
modelData := [][]string{
{"arch", arch},
{"parameters", resp.Details.ParameterSize},
{"quantization", resp.Details.QuantizationLevel},
{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
}
mainTableData := [][]string{
{"Model"},
{renderSubTable(modelData, false)},
}
if resp.ProjectorInfo != nil {
projectorData := [][]string{
{"arch", "clip"},
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
}
mainTableData = append(mainTableData,
[]string{"Projector"},
[]string{renderSubTable(projectorData, false)},
)
}
if resp.Parameters != "" {
mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
}
if resp.System != "" {
mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
}
if resp.License != "" {
mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
}
table := tablewriter.NewWriter(os.Stdout)
table.SetAutoWrapText(false)
table.SetBorder(false)
table.SetAlignment(tablewriter.ALIGN_LEFT)
for _, v := range mainTableData {
table.Append(v)
}
table.Render()
return nil
}
func renderSubTable(data [][]string, file bool) string {
var buf bytes.Buffer
table := tablewriter.NewWriter(&buf)
table.SetAutoWrapText(!file)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
table.SetTablePadding("\t")
table.SetAlignment(tablewriter.ALIGN_LEFT)
for _, v := range data {
table.Append(v)
}
table.Render()
renderedTable := buf.String()
lines := strings.Split(renderedTable, "\n")
for i, line := range lines {
lines[i] = "\t" + line
}
return strings.Join(lines, "\n")
}
func twoLines(s string) [][]string {
lines := strings.Split(s, "\n")
res := [][]string{}
count := 0
for _, line := range lines {
line = strings.TrimSpace(line)
if line != "" {
count++
res = append(res, []string{line})
if count == 2 {
return res
}
}
}
return res
}
func formatParams(s string) string {
lines := strings.Split(s, "\n")
table := [][]string{}
for _, line := range lines {
table = append(table, strings.Fields(line))
}
return renderSubTable(table, false)
}
func CopyHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {

View File

@@ -31,65 +31,40 @@ const (
)
func loadModel(cmd *cobra.Command, opts *runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
showReq := api.ShowRequest{Name: opts.Model}
showResp, err := client.Show(cmd.Context(), &showReq)
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
}
chatReq := &api.ChatRequest{
Model: opts.Model,
Messages: []api.Message{},
Model: opts.Model,
KeepAlive: opts.KeepAlive,
}
if opts.KeepAlive != nil {
chatReq.KeepAlive = opts.KeepAlive
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
if len(opts.Messages) > 0 {
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
return nil
})
if err != nil {
return err
}
return nil
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = make([]api.Message, 0)
err := loadModel(cmd, &opts)
if err != nil {
return err

View File

@@ -777,11 +777,12 @@ A single JSON object will be returned.
POST /api/show
```
Show information about a model including details, modelfile, template, parameters, license, and system prompt.
Show information about a model including details, modelfile, template, parameters, license, system prompt.
### Parameters
- `name`: name of the model to show
- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
### Examples
@@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{
```json
{
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
"parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
"template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
"parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
"template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
"details": {
"parent_model": "",
"format": "gguf",
"family": "llama",
"families": ["llama", "clip"],
"parameter_size": "7B",
"families": [
"llama"
],
"parameter_size": "8.0B",
"quantization_level": "Q4_0"
},
"model_info": {
"general.architecture": "llama",
"general.file_type": 2,
"general.parameter_count": 8030261248,
"general.quantization_version": 2,
"llama.attention.head_count": 32,
"llama.attention.head_count_kv": 8,
"llama.attention.layer_norm_rms_epsilon": 0.00001,
"llama.block_count": 32,
"llama.context_length": 8192,
"llama.embedding_length": 4096,
"llama.feed_forward_length": 14336,
"llama.rope.dimension_count": 128,
"llama.rope.freq_base": 500000,
"llama.vocab_size": 128256,
"tokenizer.ggml.bos_token_id": 128000,
"tokenizer.ggml.eos_token_id": 128009,
"tokenizer.ggml.merges": [], // populates if `verbose=true`
"tokenizer.ggml.model": "gpt2",
"tokenizer.ggml.pre": "llama-bpe",
"tokenizer.ggml.token_type": [], // populates if `verbose=true`
"tokenizer.ggml.tokens": [] // populates if `verbose=true`
}
}
```
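
For reference, a short Go sketch of calling this endpoint with the new verbose field, using the api package's ClientFromEnvironment and Show helpers that appear elsewhere in this diff; the model name is an example only.

```go
package main

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment() // respects OLLAMA_HOST
	if err != nil {
		panic(err)
	}
	// Verbose asks the server to populate the large tokenizer arrays
	// instead of returning them as empty placeholders.
	resp, err := client.Show(context.Background(), &api.ShowRequest{
		Model:   "llama3",
		Verbose: true,
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(resp.ModelInfo["general.architecture"])
}
```
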

View File

@@ -47,19 +47,13 @@ success
### Supported Quantizations
<details>
<summary>Legacy Quantization</summary>
- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
</details>
<details>
<summary>K-means Quantization</summary>`
#### K-means Quantizations
- `Q3_K_S`
- `Q3_K_M`
@@ -70,11 +64,6 @@ success
- `Q5_K_M`
- `Q6_K`
</details>
> [!NOTE]
> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf).
## Template Detection
> [!NOTE]

View File

@@ -22,7 +22,7 @@ docker logs <container-name>
If manually running `ollama serve` in a terminal, the logs will be on that terminal.
When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` to view logs
- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories

View File

@@ -39,8 +39,8 @@ server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *app.log* contains the most recent logs from the GUI application
- *server.log* contains the most recent server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration

View File

@@ -77,20 +77,27 @@ func cleanupTmpDirs() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err == nil {
pid, err := strconv.Atoi(string(raw))
if err == nil {
if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
// Another running ollama, ignore this tmpdir
continue
}
}
} else {
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
}
err = os.RemoveAll(d)
if err != nil {
slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
slog.Warn("failed to read ollama.pid", "path", d, "error", err)
// No pid, ignore this tmpdir
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("failed to parse pid", "path", d, "error", err)
continue
}
proc, err := os.FindProcess(pid)
if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("found running ollama", "pid", pid, "path", d)
// Another running ollama, ignore this tmpdir
continue
}
if err := os.Remove(d); err != nil {
slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
}
}
}

View File

@@ -231,7 +231,7 @@ func GetGPUInfo() GpuInfoList {
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Dir(envconfig.RunnersDir)
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
}
// Load ALL libraries
@@ -282,6 +282,12 @@ func GetGPUInfo() GpuInfoList {
// Intel
if envconfig.IntelGpu {
oHandles = initOneAPIHandles()
// On windows we bundle the oneapi library one level above the runner dir
depPath = ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
}
for d := range oHandles.oneapi.num_drivers {
if oHandles.oneapi == nil {
// shouldn't happen
@@ -306,7 +312,7 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
// TODO dependency path?
gpuInfo.DependencyPath = depPath
oneapiGPUs = append(oneapiGPUs, gpuInfo)
}
}

View File

@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);

View File

@@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*l[i].p) {
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);

View File

@@ -42,7 +42,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
// LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
if (!*(l[i].p)) {
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);

View File

@@ -50,7 +50,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!l[i].p) {
if (!*(l[i].p)) {
resp->oh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -98,7 +98,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
}
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet %d\n", resp->oh.drivers[d]);
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {

View File

@@ -56,7 +56,6 @@ struct server_params {
std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
std::string chat_template = "";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
@@ -427,16 +426,6 @@ struct llama_server_context
return true;
}
void validate_model_chat_template(server_params & sparams) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) {
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
sparams.chat_template = "chatml";
}
}
void initialize() {
// create slots
all_slots_are_idle = true;
@@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
sparams.chat_template = argv[i];
}
else if (arg == "--override-kv")
{
@@ -3008,11 +2996,6 @@ int main(int argc, char **argv) {
}
const auto model_meta = llama.model_meta();
if (sparams.chat_template.empty()) { // custom chat template is not supplied
// check if the template comes with the model is supported by us
llama.validate_model_chat_template(sparams);
}
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation

View File

@@ -18,7 +18,7 @@ sign() {
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
case "${GOARCH}" in
"amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,7 +75,7 @@ case "${GOARCH}" in
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build

View File

@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
echo "Building custom CUDA GPU"
else
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
fi
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"

View File

@@ -1,5 +1,7 @@
#!powershell
$ErrorActionPreference = "Stop"
function amdGPUs {
if ($env:AMDGPU_TARGETS) {
return $env:AMDGPU_TARGETS
@@ -37,7 +39,8 @@ function init_vars {
}
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off"
"-DLLAMA_NATIVE=off",
"-DLLAMA_OPENMP=off"
)
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -83,9 +86,9 @@ function init_vars {
function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& git submodule update --force "${script:llamacppDir}"
if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function apply_patches {
@@ -119,7 +122,7 @@ function build {
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ($cmakeDefs -contains "-G") {
$extra=@("-j8")
} else {
@@ -127,7 +130,7 @@ function build {
}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
if ($LASTEXITCODE -ne 0) { write-host "cmake build exit status $LASTEXITCODE"; throw($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
@@ -141,7 +144,7 @@ function sign {
foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
if ($LASTEXITCODE -ne 0) { throw($LASTEXITCODE)}
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
@@ -206,7 +209,8 @@ function build_static() {
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
"-DLLAMA_FMA=off",
"-DLLAMA_OPENMP=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
@@ -216,13 +220,7 @@ function build_static() {
}
}
function build_cpu() {
if ($script:ARCH -eq "arm64") {
$gen_arch = "ARM64"
} else { # amd64
$gen_arch = "x64"
}
function build_cpu($gen_arch) {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
# remaining llama.cpp builds use MSVC
init_vars
@@ -285,7 +283,7 @@ function build_cuda() {
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
"-DCMAKE_CUDA_FLAGS=-t8"
"-DCMAKE_CUDA_FLAGS=-t8",
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
)
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
@@ -297,10 +295,12 @@ function build_cuda() {
sign
install
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
} else {
write-host "Skipping CUDA generation step"
}
@@ -334,16 +334,18 @@ function build_oneapi() {
sign
install
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
} else {
Write-Host "Skipping oneAPI generation step"
}
@@ -408,29 +410,16 @@ init_vars
if ($($args.count) -eq 0) {
git_module_setup
apply_patches
$tasks = @("build_static", "build_cpu")
$jobs = @()
if ($script:ARCH -ne "arm64") {
$tasks += $("build_cpu_avx", "build_cpu_avx2", "build_cuda", "build_oneapi", "build_rocm")
}
foreach ($t in $tasks) {
$jobs += @(Start-ThreadJob -ThrottleLimit 12 -FilePath .\gen_windows.ps1 -ArgumentList $t -Name $t)
}
get-job
foreach ($job in $jobs) {
write-host "----" $job.Name output follows
receive-job -wait -job $job
write-host "----" $job.Name $job.State
write-host ""
if ($job.State -contains 'Failed') {
cleanup
write-host "Terminating remaining jobs (this takes a while, you can ^C)"
# TODO find some way to kill the spawned cmake processes faster
remove-job -force -job $jobs
exit(-1)
}
get-job
build_static
if ($script:ARCH -eq "arm64") {
build_cpu("ARM64")
} else { # amd64
build_cpu("x64")
build_cpu_avx
build_cpu_avx2
build_cuda
build_oneapi
build_rocm
}
cleanup

View File

@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
return llm.tensors
}
func (llm *ggla) decode(rs io.ReadSeeker) error {
func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
for {
var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return err
}
defer func() {
if errors.Is(retErr, io.EOF) {
retErr = io.ErrUnexpectedEOF
}
}()
var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
return err
}
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
return err
}

View File

@@ -6,6 +6,8 @@ import (
"fmt"
"io"
"strings"
"github.com/ollama/ollama/util/bufioutil"
)
type GGML struct {
@@ -69,6 +71,30 @@ func (kv KV) HeadCountKV() uint64 {
return 1
}
func (kv KV) EmbeddingHeadCount() uint64 {
if heads := kv.HeadCount(); heads > 0 {
return kv.EmbeddingLength() / kv.HeadCount()
}
return 0
}
func (kv KV) EmbeddingHeadCountK() uint64 {
if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
return k
}
return kv.EmbeddingHeadCount()
}
func (kv KV) EmbeddingHeadCountV() uint64 {
if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
return v
}
return kv.EmbeddingHeadCount()
}
func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV()
}
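For a concrete sense of these helpers (values are illustrative, not taken from this diff): a llama-style model with embedding_length 4096, 32 attention heads and 8 KV heads, and no explicit attention.key_length or attention.value_length keys, yields EmbeddingHeadCount = 4096 / 32 = 128, EmbeddingHeadCountK and EmbeddingHeadCountV both fall back to 128, and GQA = 32 / 8 = 4.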
@@ -254,7 +280,18 @@ func DetectGGMLType(b []byte) string {
}
}
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
// DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err
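A minimal usage sketch of the new signature, assuming the exported github.com/ollama/ollama/llm package path; the file name and the bare panics are illustrative only:

    package main

    import (
        "fmt"
        "os"

        "github.com/ollama/ollama/llm"
    )

    func main() {
        f, err := os.Open("model.gguf") // hypothetical path
        if err != nil {
            panic(err)
        }
        defer f.Close()

        // 0 keeps the default cap of 1024 entries per array; -1 would collect
        // every array (as the verbose show path does); any positive value caps
        // the size of arrays that are materialized in memory.
        ggml, consumed, err := llm.DecodeGGML(f, 0)
        if err != nil {
            panic(err)
        }
        fmt.Println(ggml.KV().Architecture(), "bytes decoded:", consumed)
    }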
@@ -267,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
case FILE_MAGIC_GGLA:
c = &containerGGLA{}
case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian}
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian}
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default:
return nil, 0, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if errors.Is(err, io.EOF) {
// noop
} else if err != nil {
if err != nil {
return nil, 0, err
}
@@ -297,7 +332,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
embeddingHeads := llm.KV().EmbeddingHeadCount()
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
layers := llm.Tensors().Layers()
@@ -308,7 +346,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
partialOffload = 4 * batch * embedding
partialOffload += max(
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)
@@ -316,15 +354,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
partialOffload = max(
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
)
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
// mixtral 8x7b
ffnGateWeight1 := ffnGateWeight.Shape[1]
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
partialOffload = max(
4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
)
}
@@ -367,6 +405,16 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(vocab+2*embedding),
fullOffload,
)
case "deepseek2":
fullOffload = max(
4*batch*(3*embedding+vocab),
4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
)
partialOffload = max(
4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
)
}
return

llm/ggml_test.go (new file, 1 line)

@@ -0,0 +1 @@
package llm


@@ -3,11 +3,10 @@ package llm
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"strings"
"log/slog"
)
type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
NumTensor uint64
NumKV uint64
}
maxArraySize int
}
func (c *containerGGUF) canCollectArray(size int) bool {
return c.maxArraySize < 0 || size <= c.maxArraySize
}
func (c *containerGGUF) Name() string {
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
}
model := newGGUF(c)
slog.Debug(fmt.Sprintf("model = %#v", model))
if err := model.Decode(rs); err != nil {
return nil, err
}
@@ -85,6 +89,8 @@ type gguf struct {
tensors []*Tensor
parameters uint64
scratch [16 << 10]byte
}
func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
}
// decode tensors
for i := 0; uint64(i) < llm.numTensor(); i++ {
for range llm.numTensor() {
name, err := readGGUFString(llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor name: %w", err)
}
// dims is the number of dimensions in the tensor
dims, err := readGGUF[uint32](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor dimensions: %w", err)
}
shape := [4]uint64{1, 1, 1, 1}
for i := 0; uint32(i) < dims; i++ {
shape[i], err = readGGUF[uint64](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor shape: %w", err)
}
}
kind, err := readGGUF[uint32](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor kind: %w", err)
}
offset, err := readGGUF[uint64](llm, rs)
if err != nil {
return err
return fmt.Errorf("failed to read tensor offset: %w", err)
}
tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
alignment = 32
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
for _, tensor := range llm.tensors {
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return err
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return fmt.Errorf("failed to get current offset: %w", err)
}
padding := llm.padding(int64(tensor.Size()), int64(alignment))
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
return fmt.Errorf("failed to seek to init padding: %w", err)
}
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return fmt.Errorf("failed to seek to tensor: %w", err)
}
}
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
return b.String(), nil
}
func discardGGUFString(llm *gguf, r io.Reader) error {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return err
}
size := int(llm.ByteOrder.Uint64(buf))
for size > 0 {
n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
if err != nil {
return err
}
size -= n
}
return nil
}
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
if llm.Version == 1 {
return readGGUFV1String(llm, r)
}
var length uint64
if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return "", err
}
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(length)); err != nil {
length := int(llm.ByteOrder.Uint64(buf))
if length > len(llm.scratch) {
buf = make([]byte, length)
} else {
buf = llm.scratch[:length]
}
clear(buf)
_, err = io.ReadFull(r, buf)
if err != nil {
return "", err
}
return b.String(), nil
return string(buf), nil
}
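The rewrite above replaces the bytes.Buffer and io.CopyN pair with a reusable scratch buffer so that, for typical short strings, the final string conversion is the only allocation left. A self-contained sketch of that pattern (smaller scratch and simplified framing, purely illustrative):

    package main

    import (
        "encoding/binary"
        "fmt"
        "io"
        "strings"
    )

    type decoder struct {
        scratch [64]byte // the real decoder uses a 16 KiB scratch
    }

    // readString reads a length-prefixed string, reusing the scratch buffer
    // whenever the value fits and allocating only for oversized values.
    func (d *decoder) readString(r io.Reader) (string, error) {
        buf := d.scratch[:8]
        if _, err := io.ReadFull(r, buf); err != nil {
            return "", err
        }
        length := int(binary.LittleEndian.Uint64(buf))
        if length > len(d.scratch) {
            buf = make([]byte, length) // rare: value larger than the scratch
        } else {
            buf = d.scratch[:length] // common: no allocation
        }
        if _, err := io.ReadFull(r, buf); err != nil {
            return "", err
        }
        return string(buf), nil // the string conversion is the remaining copy
    }

    func main() {
        var sb strings.Builder
        binary.Write(&sb, binary.LittleEndian, uint64(5))
        sb.WriteString("hello")

        var d decoder
        s, err := d.readString(strings.NewReader(sb.String()))
        fmt.Println(s, err) // hello <nil>
    }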
func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
return err
}
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
type array struct {
size int
values []any
}
func (a *array) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
for i := 0; uint32(i) < n; i++ {
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, 0, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a = append(a, e)
if a.values != nil {
a.values[i] = e
}
}
return
return a, nil
}
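A small standalone illustration of what conditional collection means for JSON output; the type is re-declared here only for the example. When values are never collected the slice stays nil, so the key encodes as null instead of a huge token list:

    package main

    import (
        "encoding/json"
        "fmt"
    )

    type array struct {
        size   int
        values []any
    }

    func (a *array) MarshalJSON() ([]byte, error) {
        return json.Marshal(a.values)
    }

    func main() {
        collected := &array{size: 3, values: []any{1, 2, 3}}
        skipped := &array{size: 100000} // values never collected, stays nil

        b1, _ := json.Marshal(collected)
        b2, _ := json.Marshal(skipped)
        fmt.Println(string(b1)) // [1,2,3]
        fmt.Println(string(b2)) // null
    }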
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 {
return readGGUFV1Array(llm, r)
}
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
for i := 0; uint64(i) < n; i++ {
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
e, err = readGGUFString(llm, r)
if a.values != nil {
e, err = readGGUFString(llm, r)
} else {
err = discardGGUFString(llm, r)
}
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
return nil, err
}
a = append(a, e)
if a.values != nil {
a.values[i] = e
}
}
return
return a, nil
}
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {


@@ -1,6 +1,7 @@
package llm
import (
"fmt"
"log/slog"
"strconv"
"strings"
@@ -49,6 +50,18 @@ type MemoryEstimate struct {
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
// internal fields for logging purposes
inferenceLibrary string
layersRequested int
layersModel int
availableList []string
kv uint64
allocationsList []string
memoryWeights uint64
memoryLayerOutput uint64
graphFullOffload uint64
graphPartialOffload uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
@@ -102,8 +115,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
slog.Warn("model missing blk.0 layer size")
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
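For a rough sense of scale under the new formula (all numbers illustrative): with a 2048-token context, 32 layers, 128-dimensional key and value heads, and 8 KV heads, kv = 2 * 2048 * 32 * (128 + 128) * 8 = 268,435,456 bytes, i.e. 256 MiB of fp16 cache, or 8 MiB added to each layer's size.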
@@ -167,6 +180,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
// For all the layers, find where they can fit on the GPU(s)
for i := range int(ggml.KV().BlockCount()) {
// Some models have inconsistent layer sizes
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
layerSize = blk.size()
layerSize += kv / ggml.KV().BlockCount()
}
memoryWeights += layerSize
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
@@ -252,78 +270,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
allocationsList = append(allocationsList, format.HumanBytes2(a))
}
estimate := MemoryEstimate{
TotalSize: memoryRequiredTotal,
Layers: 0,
Graph: 0,
VRAMSize: 0,
GPUSizes: []uint64{},
inferenceLibrary: gpus[0].Library,
layersRequested: opts.NumGPU,
layersModel: int(ggml.KV().BlockCount()) + 1,
availableList: availableList,
kv: kv,
allocationsList: allocationsList,
memoryWeights: memoryWeights,
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
}
if gpus[0].Library == "cpu" {
return estimate
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return estimate
}
estimate.Layers = layerCount
estimate.Graph = graphOffload
estimate.VRAMSize = memoryRequiredPartial
estimate.TotalSize = memoryRequiredTotal
estimate.TensorSplit = tensorSplit
estimate.GPUSizes = gpuAllocations
return estimate
}
func (m MemoryEstimate) log() {
slog.Info(
"offload to gpu",
"offload to "+m.inferenceLibrary,
slog.Group(
"layers",
// requested number of layers to offload
"requested", opts.NumGPU,
"requested", m.layersRequested,
// The number of layers the model has (including output)
"model", int(ggml.KV().BlockCount())+1,
"model", m.layersModel,
// estimated number of layers that can be offloaded
"offload", layerCount,
// multi-gpu split for tesnors
"split", tensorSplit,
"offload", m.Layers,
// multi-gpu split for tensors
"split", m.TensorSplit,
),
slog.Group(
"memory",
// memory available by GPU for offloading
"available", availableList,
"available", m.availableList,
slog.Group(
"required",
// memory required for full offloading
"full", format.HumanBytes2(memoryRequiredTotal),
"full", format.HumanBytes2(m.TotalSize),
// memory required to offload layers.estimate layers
"partial", format.HumanBytes2(memoryRequiredPartial),
"partial", format.HumanBytes2(m.VRAMSize),
// memory of KV cache
"kv", format.HumanBytes2(kv),
"kv", format.HumanBytes2(m.kv),
// Allocations across the GPUs
"allocations", allocationsList,
"allocations", m.allocationsList,
),
slog.Group(
"weights",
// memory of the weights
"total", format.HumanBytes2(memoryWeights),
"total", format.HumanBytes2(m.memoryWeights),
// memory of repeating layers
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
// memory of non-repeating layers
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
"full", format.HumanBytes2(graphFullOffload),
"full", format.HumanBytes2(m.graphFullOffload),
// memory of graph when not fully offloaded
"partial", format.HumanBytes2(graphPartialOffload),
"partial", format.HumanBytes2(m.graphPartialOffload),
),
),
)
if gpus[0].Library == "cpu" {
return MemoryEstimate{
Layers: 0,
Graph: 0,
VRAMSize: 0,
TotalSize: memoryRequiredTotal,
GPUSizes: []uint64{},
}
}
if layerCount == 0 {
slog.Debug("insufficient VRAM to load any model layers")
return MemoryEstimate{
Layers: 0,
Graph: 0,
VRAMSize: 0,
TotalSize: memoryRequiredTotal,
GPUSizes: []uint64{},
}
}
return MemoryEstimate{
Layers: layerCount,
Graph: graphOffload,
VRAMSize: memoryRequiredPartial,
TotalSize: memoryRequiredTotal,
TensorSplit: tensorSplit,
GPUSizes: gpuAllocations,
}
}


@@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close()
gguf := NewGGUFV3(binary.LittleEndian)
inputLayerCount := 5
tensors := []Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
}
assert.Len(t, tensors, inputLayerCount+1)
err = gguf.Encode(f, KV{
@@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) {
}, tensors)
require.NoError(t, err)
ggml, err := LoadModel(f.Name())
require.NoError(t, err)
ggml, err := LoadModel(f.Name(), 0)
if err != nil {
t.Fatal(err)
}
// Simple CPU scenario
gpus := []gpu.GpuInfo{


@@ -1,8 +1,8 @@
diff --git a/common/common.cpp b/common/common.cpp
index ba1ecf0e..cead57cc 100644
index 73ff0e85..6adb1a92 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
index d80344f2..71e84834 100644
index 58ed72f4..0bb2605e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -174,6 +174,13 @@ struct gpt_params {
// multimodal models (see examples/llava)
@@ -180,6 +180,13 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
+ llama_progress_callback progress_callback = NULL;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;
};
void gpt_params_handle_model_default(gpt_params & params);
+
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds


@@ -1,8 +1,8 @@
diff --git a/llama.cpp b/llama.cpp
index 40d2ec2c..74f3ee9c 100644
index 61948751..4b72a293 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (
+ if (
tokenizer_pre == "default") {
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
tokenizer_pre == "smaug-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
tokenizer_pre == "llama3" ||
@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);

llm/patches/07-gemma.diff (new file, 305 lines)

@@ -0,0 +1,305 @@
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support
---
llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 1 deletion(-)
diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
+ LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
+ LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_GEMMA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
@@ -1941,6 +1963,8 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
+ MODEL_9B,
+ MODEL_27B,
};
static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
struct ggml_tensor * attn_out_norm_b;
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
+ struct ggml_tensor * attn_post_norm;
// attention
struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
}
} break;
case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: model.type = e_model::MODEL_9B; break;
+ case 28: model.type = e_model::MODEL_27B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ const int64_t n_ff = hparams.n_ff;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+ }
+ } break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_gemma2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
+ case LLM_ARCH_GEMMA2:
+ {
+ result = llm.build_gemma2();
+ } break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
+ case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
--
2.45.2


@@ -58,7 +58,7 @@ func availableServers() map[string]string {
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*")
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(file)] = file
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
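As a hypothetical illustration of the new layout (names made up for the example): a runner unpacked to payloads/cuda_v11/ollama_llama_server now produces the map entry "cuda_v11" pointing at the payloads/cuda_v11 directory, whereas the old glob keyed the map by the binary file's own name.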


@@ -60,7 +60,12 @@ type llmServer struct {
sem *semaphore.Weighted
}
func LoadModel(model string) (*GGML, error) {
// LoadModel will load a model from disk. The model must be in the GGML format.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func LoadModel(model string, maxArraySize int) (*GGML, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
ggml, _, err := DecodeGGML(f, maxArraySize)
return ggml, err
}
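Call sites elsewhere in this change show the intended split: the scheduler and the memory tests pass 0 to keep the 1024-entry default, while getKVData in routes.go passes -1 when a verbose show is requested so the full tokenizer arrays are retained.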
@@ -81,7 +86,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
var err error
var cpuRunner string
var estimate MemoryEstimate
var systemMemory uint64
var systemTotalMemory uint64
var systemFreeMemory uint64
systemMemInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemTotalMemory = systemMemInfo.TotalMemory
systemFreeMemory = systemMemInfo.FreeMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
}
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 {
@@ -91,19 +106,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
cpuRunner = serverForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else {
if gpus[0].Library == "metal" {
memInfo, err := gpu.GetCPUMem()
if err != nil {
slog.Error("failed to lookup system memory", "error", err)
} else {
systemMemory = memInfo.TotalMemory
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
}
}
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
switch {
case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
opts.NumGPU = 0
@@ -116,6 +122,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}
estimate.log()
// Loop through potential servers
finalErr := errors.New("no suitable llama servers found")
@@ -158,6 +166,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--log-disable")
params = append(params, "--timeout", fmt.Sprintf("%d", 600))
if opts.NumGPU >= 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
}
@@ -200,7 +210,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if g.Library == "metal" &&
uint64(opts.NumGPU) > 0 &&
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
opts.UseMMap = false
opts.UseMMap = api.TriStateFalse
}
}
@@ -208,7 +218,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--flash-attn")
}
if !opts.UseMMap {
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}
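Summarizing the tristate logic just above: --no-mmap is now passed when the user explicitly disabled mmap, or when mmap was left unset and either the host is Windows with a CUDA GPU or the host is Linux with less free system memory than the estimated total model size.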
@@ -271,8 +285,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// prepend the server directory to LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
libraryPaths := []string{dir, filepath.Dir(dir)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
@@ -405,7 +419,7 @@ func projectorMemoryRequirements(filename string) uint64 {
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
ggml, _, err := DecodeGGML(file, 0)
if err != nil {
return 0
}


@@ -103,19 +103,19 @@ function buildApp() {
function gatherDependencies() {
write-host "Gathering runtime dependencies"
cd "${script:SRC_DIR}"
md "${script:DEPS_DIR}" -ea 0 > $null
md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") {
write-host "about to sign"
foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file


@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
case $OS_NAME in
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
amzn) install_cuda_driver_yum 'fedora' '37' ;;
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;


@@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}
layers, err := parseFromFile(ctx, temp, "", fn)
layer, err := NewLayer(temp, baseLayer.MediaType)
if err != nil {
return err
}
if len(layers) != 1 {
return errors.New("quantization failed")
if _, err := temp.Seek(0, io.SeekStart); err != nil {
return err
}
baseLayer.Layer = layers[0].Layer
baseLayer.GGML = layers[0].GGML
ggml, _, err := llm.DecodeGGML(temp, 0)
if err != nil {
return err
}
baseLayer.Layer = layer
baseLayer.GGML = ggml
}
}


@@ -11,6 +11,7 @@ import (
"net/http"
"os"
"path/filepath"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert"
@@ -63,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
}
defer blob.Close()
ggml, _, err := llm.DecodeGGML(blob)
ggml, _, err := llm.DecodeGGML(blob, 0)
if err != nil {
return nil, err
}
@@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
return layers, nil
}
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
stat, err := file.Stat()
if err != nil {
return nil, err
return err
}
r, err := zip.NewReader(file, stat.Size())
if err != nil {
return nil, err
return err
}
tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempdir)
fn(api.ProgressResponse{Status: "unpacking model metadata"})
for _, f := range r.File {
n := filepath.Join(p, f.Name)
if !strings.HasPrefix(n, p) {
slog.Warn("skipped extracting file outside of context", "name", f.Name)
continue
}
if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
return err
}
// TODO(mxyng): this should not write out all files to disk
outfile, err := os.Create(filepath.Join(tempdir, f.Name))
outfile, err := os.Create(n)
if err != nil {
return nil, err
return err
}
defer outfile.Close()
infile, err := f.Open()
if err != nil {
return nil, err
return err
}
defer infile.Close()
if _, err = io.Copy(outfile, infile); err != nil {
return nil, err
return err
}
if err := outfile.Close(); err != nil {
return nil, err
return err
}
if err := infile.Close(); err != nil {
return nil, err
return err
}
}
mf, err := convert.GetModelFormat(tempdir)
return nil
}
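A quick standalone illustration of the prefix check above, using hypothetical entry names and Unix-style paths: filepath.Join cleans the .. components, so an entry that would escape the destination no longer starts with the destination path and is skipped.

    package main

    import (
        "fmt"
        "path/filepath"
        "strings"
    )

    func main() {
        p := "/tmp/extract" // hypothetical destination directory
        for _, name := range []string{"model-00001.safetensors", "../../etc/passwd"} {
            n := filepath.Join(p, name)
            fmt.Println(name, "->", n, "kept:", strings.HasPrefix(n, p))
        }
        // model-00001.safetensors -> /tmp/extract/model-00001.safetensors kept: true
        // ../../etc/passwd -> /etc/passwd kept: false
    }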
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
if err != nil {
return nil, err
}
defer os.RemoveAll(tempDir)
if err := extractFromZipFile(tempDir, file, fn); err != nil {
return nil, err
}
mf, err := convert.GetModelFormat(tempDir)
if err != nil {
return nil, err
}
params, err := mf.GetParams(tempdir)
params, err := mf.GetParams(tempDir)
if err != nil {
return nil, err
}
mArch, err := mf.GetModelArch("", tempdir, params)
mArch, err := mf.GetModelArch("", tempDir, params)
if err != nil {
return nil, err
}
@@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
// TODO(mxyng): this should write directly into a layer
// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
temp, err := os.CreateTemp(tempdir, "fp16")
temp, err := os.CreateTemp(tempDir, "fp16")
if err != nil {
return nil, err
}
@@ -176,7 +195,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
}
defer bin.Close()
ggml, _, err := llm.DecodeGGML(bin)
ggml, _, err := llm.DecodeGGML(bin, 0)
if err != nil {
return nil, err
}
@@ -210,7 +229,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
var offset int64
for offset < stat.Size() {
ggml, n, err := llm.DecodeGGML(file)
ggml, n, err := llm.DecodeGGML(file, 0)
if errors.Is(err, io.EOF) {
break
} else if err != nil {

server/model_test.go (new file, 92 lines)

@@ -0,0 +1,92 @@
package server
import (
"archive/zip"
"bytes"
"io"
"os"
"path/filepath"
"slices"
"testing"
"github.com/ollama/ollama/api"
)
func createZipFile(t *testing.T, name string) *os.File {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "")
if err != nil {
t.Fatal(err)
}
zf := zip.NewWriter(f)
defer zf.Close()
zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
if err != nil {
t.Fatal(err)
}
if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
t.Fatal(err)
}
return f
}
func TestExtractFromZipFile(t *testing.T) {
cases := []struct {
name string
expect []string
}{
{
name: "good",
expect: []string{"good"},
},
{
name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"),
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
f := createZipFile(t, tt.name)
defer f.Close()
tempDir := t.TempDir()
if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil {
t.Fatal(err)
}
var matches []string
if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
if err != nil {
return err
}
if !fi.IsDir() {
matches = append(matches, p)
}
return nil
}); err != nil {
t.Fatal(err)
}
var actual []string
for _, match := range matches {
rel, err := filepath.Rel(tempDir, match)
if err != nil {
t.Error(err)
}
actual = append(actual, rel)
}
if !slices.Equal(actual, tt.expect) {
t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches))
}
})
}
}


@@ -734,9 +734,48 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
fmt.Fprint(&sb, m.String())
resp.Modelfile = sb.String()
kvData, err := getKVData(m.ModelPath, req.Verbose)
if err != nil {
return nil, err
}
delete(kvData, "general.name")
delete(kvData, "tokenizer.chat_template")
resp.ModelInfo = kvData
if len(m.ProjectorPaths) > 0 {
projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose)
if err != nil {
return nil, err
}
resp.ProjectorInfo = projectorData
}
return resp, nil
}
func getKVData(digest string, verbose bool) (llm.KV, error) {
maxArraySize := 0
if verbose {
maxArraySize = -1
}
kvData, err := llm.LoadModel(digest, maxArraySize)
if err != nil {
return nil, err
}
kv := kvData.KV()
if !verbose {
for k := range kv {
if t, ok := kv[k].([]any); len(t) > 5 && ok {
kv[k] = []any{}
}
}
}
return kv, nil
}
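The observable effect: a plain show request uses the default array cap and additionally blanks any collected array longer than five entries, while a verbose show passes -1 so large arrays such as the tokenizer vocabulary come back in full.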
func (s *Server) ListModelsHandler(c *gin.Context) {
ms, err := Manifests()
if err != nil {
@@ -1066,11 +1105,20 @@ func Serve(ln net.Listener) error {
schedCtx, schedDone := context.WithCancel(ctx)
sched := InitScheduler(schedCtx)
s := &Server{addr: ln.Addr(), sched: sched}
r := s.GenerateRoutes()
http.Handle("/", s.GenerateRoutes())
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
srvr := &http.Server{
Handler: r,
// Use http.DefaultServeMux so we get net/http/pprof for
// free.
//
// TODO(bmizerany): Decide if we want to make this
// configurable so it is not exposed by default, or allow
// users to bind it to a different port. This was a quick
// and easy way to get pprof, but it may not be the best
// way.
Handler: nil,
}
// listen for a ctrl+c and stop any loaded llm
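Because the handler is now nil, net/http falls back to http.DefaultServeMux, which is where net/http/pprof registers its endpoints when it is blank-imported; assuming such an import exists in the server (the comment implies it but the hunk does not show it), profiles can then be pulled with the standard tooling, for example go tool pprof http://127.0.0.1:11434/debug/pprof/heap, where the address shown is assumed to be the default listen address.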


@@ -19,6 +19,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
@@ -212,6 +213,7 @@ func Test_Routes(t *testing.T) {
"top_p 0.9",
}
assert.Equal(t, expectedParams, params)
assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0")
},
},
}
@@ -325,3 +327,40 @@ func TestCase(t *testing.T) {
})
}
}
func TestShow(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()
var s Server
createRequest(t, s.CreateModelHandler, api.CreateRequest{
Name: "show-model",
Modelfile: fmt.Sprintf(
"FROM %s\nFROM %s",
createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
createBinFile(t, llm.KV{"general.architecture": "clip"}, nil),
),
})
w := createRequest(t, s.ShowModelHandler, api.ShowRequest{
Name: "show-model",
})
if w.Code != http.StatusOK {
t.Fatalf("expected status code 200, actual %d", w.Code)
}
var resp api.ShowResponse
if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
t.Fatal(err)
}
if resp.ModelInfo["general.architecture"] != "test" {
t.Fatal("Expected model architecture to be 'test', but got", resp.ModelInfo["general.architecture"])
}
if resp.ProjectorInfo["general.architecture"] != "clip" {
t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"])
}
}


@@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
// Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath)
ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
if err != nil {
pending.errCh <- err
break


@@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []llm.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
})
require.NoError(t, err)
fname := f.Name()
model := &Model{Name: modelName, ModelPath: fname}
scenario.ggml, err = llm.LoadModel(model.ModelPath)
scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
require.NoError(t, err)
scenario.req = &LlmRequest{


@@ -4,7 +4,6 @@ package model
import (
"cmp"
"encoding/hex"
"errors"
"fmt"
"log/slog"
@@ -371,57 +370,3 @@ func cutPromised(s, sep string) (before, after string, ok bool) {
}
return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true
}
type DigestType byte
const (
DigestTypeInvalid DigestType = iota
DigestTypeSHA256
)
func (t DigestType) String() string {
switch t {
case DigestTypeSHA256:
return "sha256"
default:
return "invalid"
}
}
type Digest struct {
Type DigestType
Sum [32]byte
}
func ParseDigest(s string) (Digest, error) {
i := strings.IndexAny(s, "-:")
if i < 0 {
return Digest{}, fmt.Errorf("invalid digest %q", s)
}
typ, encSum := s[:i], s[i+1:]
if typ != "sha256" {
return Digest{}, fmt.Errorf("unsupported digest type %q", typ)
}
d := Digest{
Type: DigestTypeSHA256,
}
n, err := hex.Decode(d.Sum[:], []byte(encSum))
if err != nil {
return Digest{}, err
}
if n != 32 {
return Digest{}, fmt.Errorf("digest %q decoded to %d bytes; want 32", encSum, n)
}
return d, nil
}
func (d Digest) String() string {
if d.Type == DigestTypeInvalid {
return ""
}
return fmt.Sprintf("sha256-%x", d.Sum)
}
func (d Digest) IsValid() bool {
return d.Type != DigestTypeInvalid
}


@@ -284,40 +284,6 @@ func TestFilepathAllocs(t *testing.T) {
}
}
const (
validSha256 = "sha256-1000000000000000000000000000000000000000000000000000000000000000"
validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000"
)
func TestParseDigest(t *testing.T) {
cases := []struct {
in string
want string
}{
{"", ""}, // empty
{"sha123-12", ""}, // invalid type
{"sha256-", ""}, // invalid sum
{"sha256-123", ""}, // invalid odd length sum
{validSha256, validSha256},
{validSha256Old, validSha256},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
got, err := ParseDigest(tt.in)
if err != nil {
if tt.want != "" {
t.Errorf("parseDigest(%q) = %v; want %v", tt.in, err, tt.want)
}
return
}
if got.String() != tt.want {
t.Errorf("parseDigest(%q).String() = %q; want %q", tt.in, got, tt.want)
}
})
}
}
func TestParseNameFromFilepath(t *testing.T) {
cases := map[string]Name{
filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"},


@@ -0,0 +1,34 @@
package bufioutil
import (
"bufio"
"io"
)
type BufferedSeeker struct {
rs io.ReadSeeker
br *bufio.Reader
}
func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
return &BufferedSeeker{
rs: rs,
br: bufio.NewReaderSize(rs, size),
}
}
func (b *BufferedSeeker) Read(p []byte) (int, error) {
return b.br.Read(p)
}
func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
if whence == io.SeekCurrent {
offset -= int64(b.br.Buffered())
}
n, err := b.rs.Seek(offset, whence)
if err != nil {
return 0, err
}
b.br.Reset(b.rs)
return n, nil
}
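A worked example of the SeekCurrent adjustment (sizes are illustrative): suppose the caller has consumed 5 bytes but the bufio.Reader has pulled 16 from the underlying reader, so Buffered() is 11 and the underlying offset sits at 16. A request of Seek(1, io.SeekCurrent) is rewritten to 1 - 11 = -10, moving the underlying reader from offset 16 back to 6, exactly one byte past what the caller has actually read, after which the buffer is reset so subsequent reads start from there.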


@@ -0,0 +1,64 @@
package bufioutil
import (
"bytes"
"io"
"strings"
"testing"
)
func TestBufferedSeeker(t *testing.T) {
const alphabet = "abcdefghijklmnopqrstuvwxyz"
bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16
checkRead := func(buf []byte, expected string) {
t.Helper()
_, err := bs.Read(buf)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(buf, []byte(expected)) {
t.Fatalf("expected %s, got %s", expected, buf)
}
}
// Read the first 5 bytes
buf := make([]byte, 5)
checkRead(buf, "abcde")
// Seek back to the beginning
_, err := bs.Seek(0, io.SeekStart)
if err != nil {
t.Fatal(err)
}
// read 'a'
checkRead(buf[:1], "a")
if bs.br.Buffered() == 0 {
t.Fatalf("totally unexpected sanity check failed")
}
// Seek past 'b'
_, err = bs.Seek(1, io.SeekCurrent)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "cdefg")
// Seek back to the beginning
_, err = bs.Seek(0, io.SeekStart)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "abcde")
// Seek to the end
_, err = bs.Seek(-5, io.SeekEnd)
if err != nil {
t.Fatal(err)
}
checkRead(buf, "vwxyz")
}