mirror of
https://github.com/ollama/ollama.git
synced 2026-01-10 00:18:05 -05:00
Compare commits
1 Commits
jmorganca/
...
jmorganca/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bd933c24bc |
43
.github/workflows/test.yaml
vendored
43
.github/workflows/test.yaml
vendored
@@ -2,22 +2,17 @@ name: test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- '**/*'
|
||||
- '!docs/**'
|
||||
- '!examples/**'
|
||||
- '!README.md'
|
||||
|
||||
jobs:
|
||||
generate:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-2019]
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
arch: [amd64, arm64]
|
||||
exclude:
|
||||
- os: ubuntu-latest
|
||||
arch: arm64
|
||||
- os: windows-2019
|
||||
- os: windows-latest
|
||||
arch: arm64
|
||||
runs-on: ${{ matrix.os }}
|
||||
env:
|
||||
@@ -26,21 +21,10 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
go-version: '1.21'
|
||||
cache: true
|
||||
- run: go get ./...
|
||||
- run: |
|
||||
$gopath=(get-command go).source | split-path -parent
|
||||
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
|
||||
cd $env:GITHUB_WORKSPACE
|
||||
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
|
||||
$env:PATH="$gopath;$env:PATH"
|
||||
go generate -x ./...
|
||||
if: ${{ startsWith(matrix.os, 'windows-') }}
|
||||
name: "Windows Go Generate"
|
||||
- run: go generate -x ./...
|
||||
if: ${{ ! startsWith(matrix.os, 'windows-') }}
|
||||
name: "Unix Go Generate"
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
|
||||
@@ -62,7 +46,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: '1.22'
|
||||
go-version: '1.21'
|
||||
cache: true
|
||||
- run: go get ./...
|
||||
- run: |
|
||||
@@ -78,6 +62,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
rocm-version:
|
||||
- '5.7.1'
|
||||
- '6.0'
|
||||
runs-on: linux
|
||||
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
|
||||
@@ -91,7 +76,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: '1.22'
|
||||
go-version: '1.21'
|
||||
cache: true
|
||||
- run: go get ./...
|
||||
- run: |
|
||||
@@ -106,26 +91,26 @@ jobs:
|
||||
lint:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-2019]
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
arch: [amd64, arm64]
|
||||
exclude:
|
||||
- os: ubuntu-latest
|
||||
arch: arm64
|
||||
- os: windows-2019
|
||||
- os: windows-latest
|
||||
arch: arm64
|
||||
- os: macos-latest
|
||||
arch: amd64
|
||||
runs-on: ${{ matrix.os }}
|
||||
env:
|
||||
GOARCH: ${{ matrix.arch }}
|
||||
CGO_ENABLED: '1'
|
||||
CGO_ENABLED: "1"
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
go-version: '1.21'
|
||||
cache: false
|
||||
- run: |
|
||||
mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
|
||||
@@ -145,24 +130,24 @@ jobs:
|
||||
needs: generate
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-2019]
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
arch: [amd64]
|
||||
exclude:
|
||||
- os: ubuntu-latest
|
||||
arch: arm64
|
||||
- os: windows-2019
|
||||
- os: windows-latest
|
||||
arch: arm64
|
||||
runs-on: ${{ matrix.os }}
|
||||
env:
|
||||
GOARCH: ${{ matrix.arch }}
|
||||
CGO_ENABLED: '1'
|
||||
CGO_ENABLED: "1"
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
go-version: '1.21'
|
||||
cache: true
|
||||
- run: go get
|
||||
- uses: actions/download-artifact@v4
|
||||
|
||||
29
Dockerfile
29
Dockerfile
@@ -1,7 +1,6 @@
|
||||
ARG GOLANG_VERSION=1.22.1
|
||||
ARG GOLANG_VERSION=1.21.3
|
||||
ARG CMAKE_VERSION=3.22.1
|
||||
ARG CUDA_VERSION=11.3.1
|
||||
ARG ROCM_VERSION=6.0
|
||||
|
||||
# Copy the minimal context we need to run the generate scripts
|
||||
FROM scratch AS llm-code
|
||||
@@ -29,7 +28,7 @@ WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
ARG CGO_CFLAGS
|
||||
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
|
||||
ARG CMAKE_VERSION
|
||||
COPY ./scripts/rh_linux_deps.sh /
|
||||
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||
@@ -40,14 +39,18 @@ WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
ARG CGO_CFLAGS
|
||||
ARG AMDGPU_TARGETS
|
||||
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
|
||||
RUN mkdir /tmp/scratch && \
|
||||
for dep in $(cat /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
|
||||
cp ${dep} /tmp/scratch/ || exit 1 ; \
|
||||
done && \
|
||||
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
|
||||
mkdir -p /go/src/github.com/jmorganca/ollama/dist/deps/ && \
|
||||
(cd /tmp/scratch/ && tar czvf /go/src/github.com/jmorganca/ollama/dist/deps/rocm-amd64-deps.tgz . )
|
||||
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
|
||||
ARG CMAKE_VERSION
|
||||
COPY ./scripts/rh_linux_deps.sh /
|
||||
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
ENV LIBRARY_PATH /opt/amdgpu/lib64
|
||||
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
ARG CGO_CFLAGS
|
||||
ARG AMDGPU_TARGETS
|
||||
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
|
||||
ARG CMAKE_VERSION
|
||||
@@ -88,8 +91,8 @@ COPY . .
|
||||
COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/
|
||||
COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
ARG GOFLAGS
|
||||
ARG CGO_CFLAGS
|
||||
RUN go build .
|
||||
@@ -114,7 +117,7 @@ RUN apt-get update && apt-get install -y ca-certificates
|
||||
COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
|
||||
|
||||
# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm
|
||||
RUN update-pciids
|
||||
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
|
||||
EXPOSE 11434
|
||||
|
||||
@@ -276,10 +276,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [NextJS Web Interface for Ollama](https://github.com/jakobhoeg/nextjs-ollama-llm-ui)
|
||||
- [Msty](https://msty.app)
|
||||
- [Chatbox](https://github.com/Bin-Huang/Chatbox)
|
||||
- [WinForm Ollama Copilot](https://github.com/tgraupmann/WinForm_Ollama_Copilot)
|
||||
- [NextChat](https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web) with [Get Started Doc](https://docs.nextchat.dev/models/ollama)
|
||||
- [Odin Runes](https://github.com/leonid20000/OdinRunes)
|
||||
- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
|
||||
|
||||
### Terminal
|
||||
|
||||
@@ -342,7 +339,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Continue](https://github.com/continuedev/continue)
|
||||
- [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
|
||||
- [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
|
||||
- [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin)
|
||||
- [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
|
||||
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
|
||||
- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
|
||||
|
||||
@@ -91,14 +91,6 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
|
||||
Source: "..\dist\windeps\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
|
||||
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
|
||||
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
|
||||
; Assumes v5.7, may need adjustments for v6
|
||||
#if GetEnv("HIP_PATH") != ""
|
||||
Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
|
||||
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
|
||||
; amdhip64.dll dependency comes from the driver and must be installed already
|
||||
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
|
||||
#endif
|
||||
|
||||
|
||||
[Icons]
|
||||
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
|
||||
|
||||
127
cmd/cmd.go
127
cmd/cmd.go
@@ -1,7 +1,6 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
@@ -88,82 +87,22 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
path = filepath.Join(filepath.Dir(filename), path)
|
||||
}
|
||||
|
||||
fi, err := os.Stat(path)
|
||||
bin, err := os.Open(path)
|
||||
if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
|
||||
continue
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
defer bin.Close()
|
||||
|
||||
// TODO make this work w/ adapters
|
||||
if fi.IsDir() {
|
||||
tf, err := os.CreateTemp("", "ollama-tf")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer os.RemoveAll(tf.Name())
|
||||
|
||||
zf := zip.NewWriter(tf)
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(files) == 0 {
|
||||
return fmt.Errorf("no safetensors files were found in '%s'", path)
|
||||
}
|
||||
|
||||
// add the safetensor config file + tokenizer
|
||||
files = append(files, filepath.Join(path, "config.json"))
|
||||
files = append(files, filepath.Join(path, "added_tokens.json"))
|
||||
files = append(files, filepath.Join(path, "tokenizer.model"))
|
||||
|
||||
for _, fn := range files {
|
||||
f, err := os.Open(fn)
|
||||
if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") {
|
||||
continue
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
h, err := zip.FileInfoHeader(fi)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
h.Name = filepath.Base(fn)
|
||||
h.Method = zip.Store
|
||||
|
||||
w, err := zf.CreateHeader(h)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = io.Copy(w, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if err := zf.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := tf.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
path = tf.Name()
|
||||
hash := sha256.New()
|
||||
if _, err := io.Copy(hash, bin); err != nil {
|
||||
return err
|
||||
}
|
||||
bin.Seek(0, io.SeekStart)
|
||||
|
||||
digest, err := createBlob(cmd, client, path)
|
||||
if err != nil {
|
||||
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
|
||||
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -202,26 +141,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
|
||||
bin, err := os.Open(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer bin.Close()
|
||||
|
||||
hash := sha256.New()
|
||||
if _, err := io.Copy(hash, bin); err != nil {
|
||||
return "", err
|
||||
}
|
||||
bin.Seek(0, io.SeekStart)
|
||||
|
||||
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
|
||||
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return digest, nil
|
||||
}
|
||||
|
||||
func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
@@ -887,14 +806,6 @@ func versionHandler(cmd *cobra.Command, _ []string) {
|
||||
}
|
||||
}
|
||||
|
||||
func appendHostEnvDocs(cmd *cobra.Command) {
|
||||
const hostEnvDocs = `
|
||||
Environment Variables:
|
||||
OLLAMA_HOST The host:port or base URL of the Ollama server (e.g. http://localhost:11434)
|
||||
`
|
||||
cmd.SetUsageTemplate(cmd.UsageTemplate() + hostEnvDocs)
|
||||
}
|
||||
|
||||
func NewCLI() *cobra.Command {
|
||||
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
||||
cobra.EnableCommandSorting = false
|
||||
@@ -960,6 +871,7 @@ func NewCLI() *cobra.Command {
|
||||
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
|
||||
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
|
||||
runCmd.Flags().String("format", "", "Response format (e.g. json)")
|
||||
|
||||
serveCmd := &cobra.Command{
|
||||
Use: "serve",
|
||||
Aliases: []string{"start"},
|
||||
@@ -967,13 +879,6 @@ func NewCLI() *cobra.Command {
|
||||
Args: cobra.ExactArgs(0),
|
||||
RunE: RunServer,
|
||||
}
|
||||
serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
|
||||
Environment Variables:
|
||||
|
||||
OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434")
|
||||
OLLAMA_ORIGINS A comma separated list of allowed origins.
|
||||
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models")
|
||||
`)
|
||||
|
||||
pullCmd := &cobra.Command{
|
||||
Use: "pull MODEL",
|
||||
@@ -1002,6 +907,7 @@ Environment Variables:
|
||||
PreRunE: checkServerHeartbeat,
|
||||
RunE: ListHandler,
|
||||
}
|
||||
|
||||
copyCmd := &cobra.Command{
|
||||
Use: "cp SOURCE TARGET",
|
||||
Short: "Copy a model",
|
||||
@@ -1018,19 +924,6 @@ Environment Variables:
|
||||
RunE: DeleteHandler,
|
||||
}
|
||||
|
||||
for _, cmd := range []*cobra.Command{
|
||||
createCmd,
|
||||
showCmd,
|
||||
runCmd,
|
||||
pullCmd,
|
||||
pushCmd,
|
||||
listCmd,
|
||||
copyCmd,
|
||||
deleteCmd,
|
||||
} {
|
||||
appendHostEnvDocs(cmd)
|
||||
}
|
||||
|
||||
rootCmd.AddCommand(
|
||||
serveCmd,
|
||||
createCmd,
|
||||
|
||||
@@ -1,331 +0,0 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"cmp"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"slices"
|
||||
|
||||
"github.com/mitchellh/mapstructure"
|
||||
"google.golang.org/protobuf/proto"
|
||||
|
||||
"github.com/jmorganca/ollama/convert/sentencepiece"
|
||||
"github.com/jmorganca/ollama/llm"
|
||||
)
|
||||
|
||||
type Params struct {
|
||||
Architectures []string `json:"architectures"`
|
||||
VocabSize int `json:"vocab_size"`
|
||||
HiddenSize int `json:"hidden_size"` // n_embd
|
||||
HiddenLayers int `json:"num_hidden_layers"` // n_layer
|
||||
ContextSize int `json:"max_position_embeddings"`
|
||||
IntermediateSize int `json:"intermediate_size"`
|
||||
AttentionHeads int `json:"num_attention_heads"` // n_head
|
||||
KeyValHeads int `json:"num_key_value_heads"`
|
||||
NormEPS float64 `json:"rms_norm_eps"`
|
||||
RopeFreqBase float64 `json:"rope_theta"`
|
||||
BoSTokenID int `json:"bos_token_id"`
|
||||
EoSTokenID int `json:"eos_token_id"`
|
||||
}
|
||||
|
||||
type MetaData struct {
|
||||
Type string `mapstructure:"dtype"`
|
||||
Shape []int `mapstructure:"shape"`
|
||||
Offsets []int `mapstructure:"data_offsets"`
|
||||
}
|
||||
|
||||
func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
|
||||
f, err := os.Open(fn)
|
||||
if err != nil {
|
||||
return []llm.Tensor{}, 0, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var jsonSize uint64
|
||||
binary.Read(f, binary.LittleEndian, &jsonSize)
|
||||
|
||||
buf := make([]byte, jsonSize)
|
||||
_, err = io.ReadFull(f, buf)
|
||||
if err != nil {
|
||||
return []llm.Tensor{}, 0, err
|
||||
}
|
||||
|
||||
d := json.NewDecoder(bytes.NewBuffer(buf))
|
||||
d.UseNumber()
|
||||
var parsed map[string]interface{}
|
||||
if err = d.Decode(&parsed); err != nil {
|
||||
return []llm.Tensor{}, 0, err
|
||||
}
|
||||
|
||||
var keys []string
|
||||
for k := range parsed {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
|
||||
slices.Sort(keys)
|
||||
|
||||
slog.Info("converting layers")
|
||||
|
||||
var tensors []llm.Tensor
|
||||
for _, k := range keys {
|
||||
vals := parsed[k].(map[string]interface{})
|
||||
var data MetaData
|
||||
if err = mapstructure.Decode(vals, &data); err != nil {
|
||||
return []llm.Tensor{}, 0, err
|
||||
}
|
||||
|
||||
var size uint64
|
||||
var kind uint32
|
||||
switch len(data.Shape) {
|
||||
case 0:
|
||||
// metadata
|
||||
continue
|
||||
case 1:
|
||||
// convert to float32
|
||||
kind = 0
|
||||
size = uint64(data.Shape[0] * 4)
|
||||
case 2:
|
||||
// convert to float16
|
||||
kind = 1
|
||||
size = uint64(data.Shape[0] * data.Shape[1] * 2)
|
||||
}
|
||||
|
||||
ggufName, err := GetTensorName(k)
|
||||
if err != nil {
|
||||
slog.Error("%v", err)
|
||||
return []llm.Tensor{}, 0, err
|
||||
}
|
||||
|
||||
shape := [4]uint64{0, 0, 0, 0}
|
||||
for cnt, s := range data.Shape {
|
||||
shape[cnt] = uint64(s)
|
||||
}
|
||||
|
||||
t := llm.Tensor{
|
||||
Name: ggufName,
|
||||
Kind: kind,
|
||||
Offset: offset,
|
||||
Shape: shape,
|
||||
FileName: fn,
|
||||
OffsetPadding: 8 + jsonSize,
|
||||
FileOffsets: []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},
|
||||
}
|
||||
slog.Debug(fmt.Sprintf("%v", t))
|
||||
tensors = append(tensors, t)
|
||||
offset += size
|
||||
}
|
||||
return tensors, offset, nil
|
||||
}
|
||||
|
||||
func GetSafeTensors(dirpath string) ([]llm.Tensor, error) {
|
||||
var tensors []llm.Tensor
|
||||
files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
|
||||
if err != nil {
|
||||
return []llm.Tensor{}, err
|
||||
}
|
||||
|
||||
var offset uint64
|
||||
for _, f := range files {
|
||||
var t []llm.Tensor
|
||||
var err error
|
||||
t, offset, err = ReadSafeTensors(f, offset)
|
||||
if err != nil {
|
||||
slog.Error("%v", err)
|
||||
return []llm.Tensor{}, err
|
||||
}
|
||||
tensors = append(tensors, t...)
|
||||
}
|
||||
return tensors, nil
|
||||
}
|
||||
|
||||
func GetParams(dirpath string) (*Params, error) {
|
||||
f, err := os.Open(filepath.Join(dirpath, "config.json"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var params Params
|
||||
|
||||
d := json.NewDecoder(f)
|
||||
err = d.Decode(¶ms)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return ¶ms, nil
|
||||
}
|
||||
|
||||
// Details on gguf's tokenizer can be found at:
|
||||
// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#tokenizer
|
||||
type Vocab struct {
|
||||
Tokens []string
|
||||
Scores []float32
|
||||
Types []int32
|
||||
}
|
||||
|
||||
func LoadTokens(dirpath string) (*Vocab, error) {
|
||||
slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model")))
|
||||
in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// To regenerate sentencepiece from the protobufs use:
|
||||
// protoc -I=./ --go_out=./ sentencepiece_model.proto
|
||||
modelProto := &sentencepiece.ModelProto{}
|
||||
if err := proto.Unmarshal(in, modelProto); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
v := &Vocab{
|
||||
Tokens: make([]string, 0),
|
||||
Scores: make([]float32, 0),
|
||||
Types: make([]int32, 0),
|
||||
}
|
||||
|
||||
pieces := modelProto.GetPieces()
|
||||
for _, p := range pieces {
|
||||
v.Tokens = append(v.Tokens, p.GetPiece())
|
||||
v.Scores = append(v.Scores, p.GetScore())
|
||||
t := p.GetType()
|
||||
v.Types = append(v.Types, int32(t))
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("vocab size: %d", len(v.Tokens)))
|
||||
|
||||
// add any additional tokens
|
||||
addIn, err := os.ReadFile(filepath.Join(dirpath, "added_tokens.json"))
|
||||
if os.IsNotExist(err) {
|
||||
return v, nil
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Info("reading user defined tokens")
|
||||
|
||||
var extraTokenData map[string]int
|
||||
if err := json.Unmarshal(addIn, &extraTokenData); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
type token struct {
|
||||
key string
|
||||
pos int
|
||||
}
|
||||
|
||||
extraTokens := make([]token, 0)
|
||||
for k, id := range extraTokenData {
|
||||
extraTokens = append(extraTokens, token{k, id})
|
||||
}
|
||||
|
||||
slices.SortFunc(extraTokens, func(a, b token) int {
|
||||
return cmp.Compare(a.pos, b.pos)
|
||||
})
|
||||
|
||||
numToks := len(v.Tokens)
|
||||
|
||||
for cnt, t := range extraTokens {
|
||||
// the token id should match the specific index for the total number of tokens
|
||||
if t.pos != cnt+numToks {
|
||||
return nil, fmt.Errorf("token ID '%d' for '%s' doesn't match total token size", t.pos, t.key)
|
||||
}
|
||||
v.Tokens = append(v.Tokens, t.key)
|
||||
v.Scores = append(v.Scores, -1000.0)
|
||||
v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
|
||||
}
|
||||
slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
|
||||
|
||||
return v, nil
|
||||
}
|
||||
|
||||
func GetTensorName(n string) (string, error) {
|
||||
tMap := map[string]string{
|
||||
"model.embed_tokens.weight": "token_embd.weight",
|
||||
"model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight",
|
||||
"model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight",
|
||||
"model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight",
|
||||
"model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight",
|
||||
"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
|
||||
"model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight",
|
||||
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
|
||||
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
|
||||
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
|
||||
"lm_head.weight": "output.weight",
|
||||
"model.norm.weight": "output_norm.weight",
|
||||
}
|
||||
|
||||
v, ok := tMap[n]
|
||||
if ok {
|
||||
return v, nil
|
||||
}
|
||||
|
||||
// quick hack to rename the layers to gguf format
|
||||
for k, v := range tMap {
|
||||
re := regexp.MustCompile(k)
|
||||
newName := re.ReplaceAllString(n, v)
|
||||
if newName != n {
|
||||
return newName, nil
|
||||
}
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
|
||||
}
|
||||
|
||||
func WriteGGUF(name string, tensors []llm.Tensor, params *Params, vocab *Vocab) (string, error) {
|
||||
c := llm.ContainerGGUF{
|
||||
ByteOrder: binary.LittleEndian,
|
||||
}
|
||||
|
||||
m := llm.NewGGUFModel(&c)
|
||||
m.Tensors = tensors
|
||||
m.KV["general.architecture"] = "llama"
|
||||
m.KV["general.name"] = name
|
||||
m.KV["llama.context_length"] = uint32(params.ContextSize)
|
||||
m.KV["llama.embedding_length"] = uint32(params.HiddenSize)
|
||||
m.KV["llama.block_count"] = uint32(params.HiddenLayers)
|
||||
m.KV["llama.feed_forward_length"] = uint32(params.IntermediateSize)
|
||||
m.KV["llama.rope.dimension_count"] = uint32(128)
|
||||
m.KV["llama.attention.head_count"] = uint32(params.AttentionHeads)
|
||||
m.KV["llama.attention.head_count_kv"] = uint32(params.KeyValHeads)
|
||||
m.KV["llama.attention.layer_norm_rms_epsilon"] = float32(params.NormEPS)
|
||||
m.KV["llama.rope.freq_base"] = float32(params.RopeFreqBase)
|
||||
m.KV["general.file_type"] = uint32(1)
|
||||
m.KV["tokenizer.ggml.model"] = "llama"
|
||||
|
||||
m.KV["tokenizer.ggml.tokens"] = vocab.Tokens
|
||||
m.KV["tokenizer.ggml.scores"] = vocab.Scores
|
||||
m.KV["tokenizer.ggml.token_type"] = vocab.Types
|
||||
|
||||
m.KV["tokenizer.ggml.bos_token_id"] = uint32(params.BoSTokenID)
|
||||
m.KV["tokenizer.ggml.eos_token_id"] = uint32(params.EoSTokenID)
|
||||
m.KV["tokenizer.ggml.unknown_token_id"] = uint32(0)
|
||||
m.KV["tokenizer.ggml.add_bos_token"] = true
|
||||
m.KV["tokenizer.ggml.add_eos_token"] = false
|
||||
|
||||
// llamacpp sets the chat template, however we don't need to set it since we pass it in through a layer
|
||||
// m.KV["tokenizer.chat_template"] = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" // XXX removeme
|
||||
|
||||
c.V3.NumTensor = uint64(len(tensors))
|
||||
c.V3.NumKV = uint64(len(m.KV))
|
||||
|
||||
f, err := os.CreateTemp("", "ollama-gguf")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
err = m.Encode(f)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return f.Name(), nil
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,333 +0,0 @@
|
||||
// Copyright 2016 Google Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.!
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
// TODO(taku): Needs to use LITE RUNTIME in OSS release.
|
||||
option optimize_for = LITE_RUNTIME;
|
||||
option go_package = "./sentencepiece";
|
||||
|
||||
package sentencepiece;
|
||||
|
||||
// TrainerSpec encodes a various parameters for SentencePiece training.
|
||||
// Next id: 55
|
||||
message TrainerSpec {
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// General parameters
|
||||
//
|
||||
// Input corpus files.
|
||||
// Trainer accepts the following two formats:
|
||||
// A) Monolingual: plain text, one sentence per line.
|
||||
// B) Bilingual: TSV, source sentence <tab> target sentence
|
||||
// When bilingual data is passed, shared vocabulary model is built.
|
||||
// Note that the input file must be raw corpus, not a preprocessed corpus.
|
||||
// Trainer only loads the first `input_sentence_size` sentences specified
|
||||
// with this parameter.
|
||||
repeated string input = 1;
|
||||
|
||||
// Input corpus format:
|
||||
// "text": one-sentence-per-line text format (default)
|
||||
// "tsv": sentence <tab> freq
|
||||
optional string input_format = 7;
|
||||
|
||||
// Output model file prefix.
|
||||
// <model_prefix>.model and <model_prefix>.vocab are generated.
|
||||
optional string model_prefix = 2;
|
||||
|
||||
// Model type. only have UNIGRAM now.
|
||||
enum ModelType {
|
||||
UNIGRAM = 1; // Unigram language model with dynamic algorithm
|
||||
BPE = 2; // Byte Pair Encoding
|
||||
WORD = 3; // Delimitered by whitespace.
|
||||
CHAR = 4; // tokenizes into character sequence
|
||||
}
|
||||
optional ModelType model_type = 3 [default = UNIGRAM];
|
||||
|
||||
// Vocabulary size. 8k is the default size.
|
||||
optional int32 vocab_size = 4 [default = 8000];
|
||||
|
||||
// List of the languages this model can accept.
|
||||
// Since the model is language-agnostic, this field is used as a reference.
|
||||
repeated string accept_language = 5;
|
||||
|
||||
// Size of self-test samples, which are encoded in the model file.
|
||||
optional int32 self_test_sample_size = 6 [default = 0];
|
||||
|
||||
// Whether to use DP version of sentencepiece. Use it with TSV input format
|
||||
// (requires precomputed word tab counts to work).
|
||||
optional bool enable_differential_privacy = 50 [default = false];
|
||||
// Set these parameters if you need DP version of sentencepiece.
|
||||
// std of noise to add.
|
||||
optional float differential_privacy_noise_level = 51 [default = 0.0];
|
||||
// Clipping threshold to apply after adding noise. All the words with
|
||||
// frequency less than this value are dropped.
|
||||
optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Training parameters.
|
||||
//
|
||||
// Uses characters which cover the corpus with the ratio of `chars_coverage`.
|
||||
// This parameter determines the set of basic Alphabet of sentence piece.
|
||||
// 1.0 - `chars_coverage` characters are treated as UNK.
|
||||
// See also required_chars field.
|
||||
optional float character_coverage = 10 [default = 0.9995];
|
||||
|
||||
// Maximum size of sentences the trainer loads from `input` parameter.
|
||||
// Trainer simply loads the `input` files in sequence.
|
||||
// It is better to shuffle the input corpus randomly.
|
||||
optional uint64 input_sentence_size = 11 [default = 0];
|
||||
optional bool shuffle_input_sentence = 19 [default = true];
|
||||
|
||||
// Maximum size of sentences to make seed sentence pieces.
|
||||
// Extended suffix array is constructed to extract frequent
|
||||
// sub-strings from the corpus. This uses 20N working space,
|
||||
// where N is the size of corpus.
|
||||
optional int32 mining_sentence_size = 12 [deprecated = true];
|
||||
|
||||
// Maximum size of sentences to train sentence pieces.
|
||||
optional int32 training_sentence_size = 13 [deprecated = true];
|
||||
|
||||
// The size of seed sentencepieces.
|
||||
// `seed_sentencepiece_size` must be larger than `vocab_size`.
|
||||
optional int32 seed_sentencepiece_size = 14 [default = 1000000];
|
||||
|
||||
// In every EM sub-iterations, keeps top
|
||||
// `shrinking_factor` * `current sentencepieces size` with respect to
|
||||
// the loss of the sentence piece. This value should be smaller than 1.0.
|
||||
optional float shrinking_factor = 15 [default = 0.75];
|
||||
|
||||
// The maximum sentence length in byte. The sentences with the length
|
||||
// larger than `max_sentence_length` is simply ignored.
|
||||
// Longer input tends to bring the following risks:
|
||||
// * Overflow during EM training (unigram language model only)
|
||||
// * Performance drop because of O(n log n) cost in BPE.
|
||||
optional int32 max_sentence_length = 18 [default = 4192];
|
||||
|
||||
// Number of threads in the training.
|
||||
optional int32 num_threads = 16 [default = 16];
|
||||
|
||||
// Number of EM sub iterations.
|
||||
optional int32 num_sub_iterations = 17 [default = 2];
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// SentencePiece parameters which control the shapes of sentence piece.
|
||||
//
|
||||
// Maximum length of sentencepiece.
|
||||
optional int32 max_sentencepiece_length = 20 [default = 16];
|
||||
|
||||
// Uses Unicode script to split sentence pieces.
|
||||
// When `split_by_unicode_script` is true, we do not allow sentence piece to
|
||||
// include multiple Unicode scripts, e.g. "F1" is not a valid piece.
|
||||
// Exception: CJ characters (Hiragana/Katakana/Han) are all handled
|
||||
// as one script type, since Japanese word can consist of multiple scripts.
|
||||
// This exception is always applied regardless of the accept-language
|
||||
// parameter.
|
||||
optional bool split_by_unicode_script = 21 [default = true];
|
||||
|
||||
// When `split_by_number` is true, put a boundary between number and
|
||||
// non-number transition. If we want to treat "F1" is one token, set this flag
|
||||
// to be false.
|
||||
optional bool split_by_number = 23 [default = true];
|
||||
|
||||
// Use a white space to split sentence pieces.
|
||||
// When `split_by_whitespace` is false, we may have the piece containing
|
||||
// a white space in the middle. e.g., "in_the".
|
||||
optional bool split_by_whitespace = 22 [default = true];
|
||||
|
||||
// Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
|
||||
// hello_. When `treat_whitespace_as_suffix` is true,
|
||||
// NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
|
||||
// of sentence.
|
||||
optional bool treat_whitespace_as_suffix = 24 [default = false];
|
||||
|
||||
// Allows pieces that only contain whitespaces instead of appearing only as
|
||||
// prefix or suffix of other pieces.
|
||||
optional bool allow_whitespace_only_pieces = 26 [default = false];
|
||||
|
||||
// Split all digits (0-9) into separate pieces.
|
||||
optional bool split_digits = 25 [default = false];
|
||||
|
||||
// Defines the pre-tokenization delimiter.
|
||||
// When specified, no pieces crossing this delimiter is not included
|
||||
// in the vocab. Then the delimiter string is virtually ignored
|
||||
// during the training. This field can allows constraints on the vocabulary
|
||||
// selection. Note that this field is available on unigram mode.
|
||||
optional string pretokenization_delimiter = 53 [ default = ""];
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Vocabulary management
|
||||
//
|
||||
// Defines control symbols used as an indicator to
|
||||
// change the behavior of the decoder. <s> and </s> are pre-defined.
|
||||
// We can use this field to encode various meta information,
|
||||
// including language indicator in multilingual model.
|
||||
// These symbols are not visible to users, but visible to
|
||||
// the decoder. Note that when the input sentence contains control symbols,
|
||||
// they are not treated as one token, but segmented into normal pieces.
|
||||
// Control symbols must be inserted independently from the segmentation.
|
||||
repeated string control_symbols = 30;
|
||||
|
||||
// Defines user defined symbols.
|
||||
// These symbols are added with extremely high score
|
||||
// so they are always treated as one unique symbol in any context.
|
||||
// Typical usage of user_defined_symbols is placeholder for named entities.
|
||||
repeated string user_defined_symbols = 31;
|
||||
|
||||
// Defines required characters. Each UTF8 character in this string is included
|
||||
// in the character set regardless of character_coverage value. Unlike
|
||||
// user_defined_symbols, these characters have scores based on the frequency
|
||||
// on input sentences, and the model can form subwords using characters
|
||||
// in this field.
|
||||
optional string required_chars = 36;
|
||||
|
||||
// Decomposes unknown pieces into UTF-8 bytes.
|
||||
optional bool byte_fallback = 35 [default = false];
|
||||
|
||||
// When creating the vocabulary file, defines whether or not to additionally
|
||||
// output the score for each piece.
|
||||
optional bool vocabulary_output_piece_score = 32 [default = true];
|
||||
|
||||
// `vocab_size` is treated as hard limit. Crash if
|
||||
// the model can not produce the vocab of size `vocab_size`,
|
||||
// When `hard_vocab_limit` is false, vocab_size is treated
|
||||
// as soft limit. Note that when model_type=char,
|
||||
// always assumes hard_vocab_limit = false.
|
||||
optional bool hard_vocab_limit = 33 [default = true];
|
||||
|
||||
// use all symbols for vocab extraction. This flag is valid
|
||||
// if model type is either CHAR or WORD
|
||||
optional bool use_all_vocab = 34 [default = false];
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Reserved special meta tokens.
|
||||
// * -1 is not used.
|
||||
// * unk_id must not be -1.
|
||||
// Id must starts with 0 and be contigous.
|
||||
optional int32 unk_id = 40 [default = 0]; // <unk>
|
||||
optional int32 bos_id = 41 [default = 1]; // <s>
|
||||
optional int32 eos_id = 42 [default = 2]; // </s>
|
||||
optional int32 pad_id = 43 [default = -1]; // <pad> (padding)
|
||||
optional string unk_piece = 45 [default = "<unk>"];
|
||||
optional string bos_piece = 46 [default = "<s>"];
|
||||
optional string eos_piece = 47 [default = "</s>"];
|
||||
optional string pad_piece = 48 [default = "<pad>"];
|
||||
|
||||
// Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
|
||||
// since this character can be useful both for user and
|
||||
// developer. We can easily figure out that <unk> is emitted.
|
||||
optional string unk_surface = 44 [default = " \xE2\x81\x87 "];
|
||||
|
||||
// Increase bit depth to allow unigram model training on large
|
||||
// (>10M sentences) corpora. A Side-effect of enabling this flag
|
||||
// is increased memory usage.
|
||||
optional bool train_extremely_large_corpus = 49 [default = false];
|
||||
|
||||
// Path to a seed sentencepieces file, with one tab-separated
|
||||
// seed sentencepiece <tab> frequency per line.
|
||||
optional string seed_sentencepieces_file = 54 [default = ""];
|
||||
|
||||
// Customized extensions: the range of field numbers
|
||||
// are open to third-party extensions.
|
||||
extensions 200 to max;
|
||||
}
|
||||
|
||||
// NormalizerSpec encodes a various parameters for string normalizaiton
|
||||
message NormalizerSpec {
|
||||
// name of normalization rule.
|
||||
optional string name = 1;
|
||||
|
||||
// Pre-compiled normalization rule created by
|
||||
// Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
|
||||
// Usually this field is set by Builder::GetNormalizerSpec() method.
|
||||
optional bytes precompiled_charsmap = 2;
|
||||
|
||||
// Adds dummy whitespace at the beginning of text in order to
|
||||
// treat "world" in "world" and "hello world" in the same way.
|
||||
optional bool add_dummy_prefix = 3 [default = true];
|
||||
|
||||
// Removes leading, trailing, and duplicate internal whitespace.
|
||||
optional bool remove_extra_whitespaces = 4 [default = true];
|
||||
|
||||
// Replaces whitespace with meta symbol.
|
||||
// This field must be true to train sentence piece model.
|
||||
optional bool escape_whitespaces = 5 [default = true];
|
||||
|
||||
// Custom normalization rule file in TSV format.
|
||||
// https://github.com/google/sentencepiece/blob/master/doc/normalization.md
|
||||
// This field is only used in SentencePieceTrainer::Train() method, which
|
||||
// compiles the rule into the binary rule stored in `precompiled_charsmap`.
|
||||
optional string normalization_rule_tsv = 6;
|
||||
|
||||
// Customized extensions: the range of field numbers
|
||||
// are open to third-party extensions.
|
||||
extensions 200 to max;
|
||||
}
|
||||
|
||||
// Proto to store samples for self-testing.
|
||||
message SelfTestData {
|
||||
message Sample {
|
||||
optional string input = 1;
|
||||
optional string expected = 2;
|
||||
}
|
||||
repeated Sample samples = 1;
|
||||
|
||||
// Customized extensions: the range of field numbers
|
||||
// are open to third-party extensions.
|
||||
extensions 200 to max;
|
||||
}
|
||||
|
||||
// ModelProto stores model parameters.
|
||||
// SentencePieceProcessor is supposed to be self-contained.
|
||||
// All settings/parameters which may change the behavior must be encoded
|
||||
// in ModelProto.
|
||||
message ModelProto {
|
||||
message SentencePiece {
|
||||
enum Type {
|
||||
NORMAL = 1; // normal symbol
|
||||
UNKNOWN = 2; // unknown symbol. only <unk> for now.
|
||||
CONTROL = 3; // control symbols. </s>, <s>, <2ja> etc.
|
||||
USER_DEFINED = 4; // user defined symbols.
|
||||
// Typical usage of USER_DEFINED symbol
|
||||
// is placeholder.
|
||||
BYTE = 6; // byte symbols. Used when `byte_fallback` is true.
|
||||
UNUSED = 5; // this piece is not used.
|
||||
}
|
||||
optional string piece = 1; // piece must not be empty.
|
||||
optional float score = 2;
|
||||
optional Type type = 3 [default = NORMAL];
|
||||
|
||||
// Customized extensions: the range of field numbers
|
||||
// are open to third-party extensions.
|
||||
extensions 200 to max;
|
||||
}
|
||||
|
||||
// Sentence pieces with scores.
|
||||
repeated SentencePiece pieces = 1;
|
||||
|
||||
// Spec used to generate this model file.
|
||||
optional TrainerSpec trainer_spec = 2;
|
||||
|
||||
// Spec for text normalization.
|
||||
optional NormalizerSpec normalizer_spec = 3;
|
||||
|
||||
// Stores sample input and its expected segmentation to verify the model.
|
||||
optional SelfTestData self_test_data = 4;
|
||||
|
||||
// Spec for text de-normalization.
|
||||
optional NormalizerSpec denormalizer_spec = 5;
|
||||
|
||||
// Customized extensions: the range of field numbers
|
||||
// are open to third-party extensions.
|
||||
extensions 200 to max;
|
||||
}
|
||||
@@ -54,7 +54,7 @@ Advanced parameters (optional):
|
||||
|
||||
#### JSON mode
|
||||
|
||||
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
|
||||
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#generate-request-json-mode) below.
|
||||
|
||||
> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.
|
||||
|
||||
@@ -256,9 +256,9 @@ For reproducible outputs, set `temperature` to 0 and `seed` to a number:
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "mistral",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"prompt": "[INST] why is the sky blue? [/INST]",
|
||||
"options": {
|
||||
"seed": 123,
|
||||
"seed": 101,
|
||||
"temperature": 0
|
||||
}
|
||||
}'
|
||||
@@ -1024,7 +1024,7 @@ Advanced parameters:
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/embeddings -d '{
|
||||
"model": "all-minilm",
|
||||
"model": "llama2",
|
||||
"prompt": "Here is an article about llamas..."
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Install required tools:
|
||||
|
||||
- cmake version 3.24 or higher
|
||||
- go version 1.22 or higher
|
||||
- go version 1.21 or higher
|
||||
- gcc version 11.4.0 or higher
|
||||
|
||||
```bash
|
||||
@@ -42,15 +42,15 @@ Now you can run `ollama`:
|
||||
|
||||
#### Linux CUDA (NVIDIA)
|
||||
|
||||
_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
|
||||
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
|
||||
|
||||
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
|
||||
development and runtime packages.
|
||||
development and runtime packages.
|
||||
|
||||
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
|
||||
or installation approach uses unusual paths, you can specify the location by
|
||||
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
|
||||
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
|
||||
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
|
||||
set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
|
||||
|
||||
Then generate dependencies:
|
||||
@@ -67,15 +67,15 @@ go build .
|
||||
|
||||
#### Linux ROCm (AMD)
|
||||
|
||||
_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
|
||||
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
|
||||
|
||||
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
|
||||
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) developement packages first, as well as `cmake` and `golang`.
|
||||
|
||||
Typically the build scripts will auto-detect ROCm, however, if your Linux distro
|
||||
or installation approach uses unusual paths, you can specify the location by
|
||||
specifying an environment variable `ROCM_PATH` to the location of the ROCm
|
||||
install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
|
||||
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
|
||||
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
|
||||
the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
|
||||
|
||||
```
|
||||
@@ -88,17 +88,17 @@ Then build the binary:
|
||||
go build .
|
||||
```
|
||||
|
||||
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
|
||||
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
|
||||
|
||||
#### Advanced CPU Settings
|
||||
|
||||
By default, running `go generate ./...` will compile a few different variations
|
||||
of the LLM library based on common CPU families and vector math capabilities,
|
||||
including a lowest-common-denominator which should run on almost any 64 bit CPU
|
||||
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
|
||||
load. If you would like to build a CPU-based build customized for your
|
||||
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
|
||||
load. If you would like to build a CPU-based build customized for your
|
||||
processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
|
||||
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
|
||||
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
|
||||
you might use:
|
||||
|
||||
```
|
||||
@@ -108,7 +108,8 @@ go build .
|
||||
|
||||
#### Containerized Linux Build
|
||||
|
||||
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
|
||||
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
|
||||
|
||||
|
||||
### Windows
|
||||
|
||||
@@ -116,8 +117,8 @@ Note: The windows build for Ollama is still under development.
|
||||
|
||||
Install required tools:
|
||||
|
||||
- MSVC toolchain - C/C++ and cmake as minimal requirements - You must build from a "Developer Shell" with the environment variables set
|
||||
- go version 1.22 or higher
|
||||
- MSVC toolchain - C/C++ and cmake as minimal requirements
|
||||
- go version 1.21 or higher
|
||||
- MinGW (pick one variant) with GCC.
|
||||
- <https://www.mingw-w64.org/>
|
||||
- <https://www.msys2.org/>
|
||||
@@ -132,6 +133,6 @@ go build .
|
||||
|
||||
#### Windows CUDA (NVIDIA)
|
||||
|
||||
In addition to the common Windows development tools described above, install CUDA **AFTER** you install MSVC.
|
||||
In addition to the common Windows development tools described above, install:
|
||||
|
||||
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
|
||||
|
||||
@@ -10,14 +10,6 @@ Install Ollama running this one-liner:
|
||||
curl -fsSL https://ollama.com/install.sh | sh
|
||||
```
|
||||
|
||||
## AMD Radeon GPU support
|
||||
|
||||
While AMD has contributed the `amdgpu` driver upstream to the official linux
|
||||
kernel source, the version is older and may not support all ROCm features. We
|
||||
recommend you install the latest driver from
|
||||
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
|
||||
GPU.
|
||||
|
||||
## Manual install
|
||||
|
||||
### Download the `ollama` binary
|
||||
|
||||
@@ -67,43 +67,6 @@ You can see what features your CPU has with the following.
|
||||
cat /proc/cpuinfo| grep flags | head -1
|
||||
```
|
||||
|
||||
## AMD Radeon GPU Support
|
||||
|
||||
Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
|
||||
some cases you can force the system to try to use a close GPU type. For example
|
||||
The Radeon RX 5400 is `gfx1034` (also known as 10.3.4) however, ROCm does not
|
||||
support this patch-level, the closest support is `gfx1030`. You can use the
|
||||
environment variable `HSA_OVERRIDE_GFX_VERSION` with `x.y.z` syntax. So for
|
||||
example, to force the system to run on the RX 5400, you would set
|
||||
`HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the server.
|
||||
|
||||
At this time, the known supported GPU types are the following: (This may change from
|
||||
release to release)
|
||||
- gfx900
|
||||
- gfx906
|
||||
- gfx908
|
||||
- gfx90a
|
||||
- gfx940
|
||||
- gfx941
|
||||
- gfx942
|
||||
- gfx1030
|
||||
- gfx1100
|
||||
- gfx1101
|
||||
- gfx1102
|
||||
|
||||
This will not work for all unsupported GPUs. Reach out on [Discord](https://discord.gg/ollama)
|
||||
or file an [issue](https://github.com/ollama/ollama/issues) for additional help.
|
||||
|
||||
|
||||
## Installing older versions on Linux
|
||||
|
||||
If you run into problems on Linux and want to install an older version you can tell the install script
|
||||
which version to install.
|
||||
|
||||
```sh
|
||||
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
|
||||
```
|
||||
|
||||
## Known issues
|
||||
|
||||
* N/A
|
||||
@@ -4,7 +4,7 @@ Welcome to the Ollama Windows preview.
|
||||
|
||||
No more WSL required!
|
||||
|
||||
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
|
||||
Ollama now runs as a native Windows application, including NVIDIA GPU support.
|
||||
After installing Ollama Windows Preview, Ollama will run in the background and
|
||||
the `ollama` command line is available in `cmd`, `powershell` or your favorite
|
||||
terminal application. As usual the Ollama [api](./api.md) will be served on
|
||||
@@ -21,7 +21,6 @@ Logs will often be helpful in dianosing the problem (see
|
||||
|
||||
* Windows 10 or newer, Home or Pro
|
||||
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
|
||||
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
|
||||
|
||||
## API Access
|
||||
|
||||
|
||||
23
examples/modelfile-tweetwriter/readme.md
Normal file
23
examples/modelfile-tweetwriter/readme.md
Normal file
@@ -0,0 +1,23 @@
|
||||
# Example Modelfile - Tweetwriter
|
||||
|
||||
This simple examples shows what you can do without any code, simply relying on a Modelfile. The file has two instructions:
|
||||
|
||||
1. FROM - The From instructions defines the parent model to use for this one. If you choose a model from the library, you can enter just the model name. For all other models, you need to specify the namespace as well. You could also use a local file. Just include the relative path to the converted, quantized model weights file. To learn more about creating that file, see the `import.md` file in the docs folder of this repository.
|
||||
2. SYSTEM - This defines the system prompt for the model and overrides the system prompt from the parent model.
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Create the model:
|
||||
|
||||
```bash
|
||||
ollama create tweetwriter
|
||||
```
|
||||
|
||||
2. Enter a topic to generate a tweet about.
|
||||
3. Show the Modelfile in the REPL.
|
||||
|
||||
```bash
|
||||
/show modelfile
|
||||
```
|
||||
|
||||
Notice that the FROM and SYSTEM match what was in the file. But there is also a TEMPLATE and PARAMETER. These are inherited from the parent model.
|
||||
21
examples/python-chat-app/README.md
Normal file
21
examples/python-chat-app/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# Ollama Chat App
|
||||
|
||||
Build a Llama2 chat app using Streamlit and Ollama.
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama2
|
||||
```
|
||||
2. Install the Python Requirements.
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
3. Run the example:
|
||||
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
25
go.mod
25
go.mod
@@ -1,43 +1,23 @@
|
||||
module github.com/jmorganca/ollama

go 1.22

toolchain go1.22.0
go 1.21

require (
    github.com/containerd/console v1.0.3
    github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
    github.com/emirpasic/gods v1.18.1
    github.com/gin-gonic/gin v1.9.1
    github.com/golang/protobuf v1.5.0
    github.com/google/uuid v1.0.0
    github.com/mitchellh/mapstructure v1.5.0
    github.com/olekukonko/tablewriter v0.0.5
    github.com/spf13/cobra v1.7.0
    github.com/stretchr/testify v1.8.4
    github.com/x448/float16 v0.8.4
    golang.org/x/sync v0.3.0
)

require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9

require (
    github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
    github.com/chewxy/hm v1.0.0 // indirect
    github.com/chewxy/math32 v1.0.8 // indirect
    github.com/davecgh/go-spew v1.1.1 // indirect
    github.com/gogo/protobuf v1.3.2 // indirect
    github.com/google/flatbuffers v1.12.0 // indirect
    github.com/mattn/go-runewidth v0.0.14 // indirect
    github.com/pkg/errors v0.9.1 // indirect
    github.com/pmezard/go-difflib v1.0.0 // indirect
    github.com/rivo/uniseg v0.2.0 // indirect
    github.com/xtgo/set v1.0.0 // indirect
    go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
    golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
    gonum.org/v1/gonum v0.8.2 // indirect
    gorgonia.org/vecf32 v0.9.0 // indirect
    gorgonia.org/vecf64 v0.9.0 // indirect
)

require (
@@ -58,6 +38,7 @@ require (
    github.com/mattn/go-isatty v0.0.19 // indirect
    github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
    github.com/modern-go/reflect2 v1.0.2 // indirect
    github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
    github.com/pelletier/go-toml/v2 v2.0.8 // indirect
    github.com/spf13/pflag v1.0.5 // indirect
    github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
@@ -69,6 +50,6 @@ require (
    golang.org/x/sys v0.13.0
    golang.org/x/term v0.13.0
    golang.org/x/text v0.13.0 // indirect
    google.golang.org/protobuf v1.30.0
    google.golang.org/protobuf v1.30.0 // indirect
    gopkg.in/yaml.v3 v3.0.1 // indirect
)
go.sum (150 lines changed)
@@ -1,38 +1,18 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc h1:zvQ6w7KwtQWgMQiewOF9tFtundRMVZFSAksNV6ogzuY=
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ=
github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
github.com/chewxy/math32 v1.0.8 h1:fU5E4Ec4Z+5RtRAi3TovSxUjQPkgRh+HbP7tKB2OFbM=
github.com/chewxy/math32 v1.0.8/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g=
@@ -57,31 +37,7 @@ github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QX
github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.0 h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v1.12.0 h1:/PtAHvnBY4Kqnx/xCQ3OIV9uYcSFGScBsWI3Oogeh6w=
github.com/google/flatbuffers v1.12.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
@@ -92,9 +48,6 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk=
github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
@@ -115,8 +68,6 @@ github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -124,17 +75,14 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9/go.mod h1:nR7l3gM6ubiOm+mCkmmUyIBUcBAyiUmW6dQrDZhugFE=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
@@ -148,8 +96,6 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
@@ -166,61 +112,19 @@ github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 h1:lGdhQUN/cnWdSH3291CUuxSEqc+AsGTiDxPP3r2J0l4=
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -233,56 +137,12 @@ golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek=
golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM=
gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f h1:Yv4xsIx7HZOoyUGSJ2ksDyWE2qIBXROsZKt2ny3hCGM=
google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.32.0 h1:zWTV+LMdc3kaiJMSTOFz2UgSBgx8RNQoTGiZu3fR9S0=
google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak=
google.golang.org/grpc/cmd/protoc-gen-go-grpc v0.0.0-20200910201057-6591123024b3/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
@@ -297,10 +157,4 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg=
gorgonia.org/vecf32 v0.9.0/go.mod h1:NCc+5D2oxddRL11hd+pCB1PEyXWOyiQxfZ/1wwhOXCA=
gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A=
gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
gpu/amd.go (new file, 101 lines)
@@ -0,0 +1,101 @@
package gpu

import (
    "bufio"
    "errors"
    "fmt"
    "io"
    "log/slog"
    "os"
    "path/filepath"
    "strconv"
    "strings"
)

// TODO - windows vs. non-windows vs darwin

// Discovery logic for AMD/ROCm GPUs

const (
    DriverVersionFile     = "/sys/module/amdgpu/version"
    GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
    // TODO probably break these down per GPU to make the logic simpler
    GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
    GPUUsedMemoryFileGlob  = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
)

func AMDDetected() bool {
    // Some driver versions (older?) don't have a version file, so just lookup the parent dir
    sysfsDir := filepath.Dir(DriverVersionFile)
    _, err := os.Stat(sysfsDir)
    if errors.Is(err, os.ErrNotExist) {
        slog.Debug("amd driver not detected " + sysfsDir)
        return false
    } else if err != nil {
        slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
        return false
    }
    return true
}

func AMDDriverVersion() (string, error) {
    _, err := os.Stat(DriverVersionFile)
    if err != nil {
        return "", fmt.Errorf("amdgpu file stat error: %s %w", DriverVersionFile, err)
    }
    fp, err := os.Open(DriverVersionFile)
    if err != nil {
        return "", err
    }
    defer fp.Close()
    verString, err := io.ReadAll(fp)
    if err != nil {
        return "", err
    }
    return strings.TrimSpace(string(verString)), nil
}

func AMDGFXVersions() []Version {
    res := []Version{}
    matches, _ := filepath.Glob(GPUPropertiesFileGlob)
    for _, match := range matches {
        fp, err := os.Open(match)
        if err != nil {
            slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
            continue
        }
        defer fp.Close()

        scanner := bufio.NewScanner(fp)
        // optionally, resize scanner's capacity for lines over 64K
        for scanner.Scan() {
            line := strings.TrimSpace(scanner.Text())
            if strings.HasPrefix(line, "gfx_target_version") {
                ver := strings.Fields(line)
                if len(ver) != 2 || len(ver[1]) < 5 {
                    slog.Debug("malformed " + line)
                    continue
                }
                l := len(ver[1])
                patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
                minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
                major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
                if err1 != nil || err2 != nil || err3 != nil {
                    slog.Debug("malformed int " + line)
                    continue
                }

                res = append(res, Version{
                    Major: uint(major),
                    Minor: uint(minor),
                    Patch: uint(patch),
                })
            }
        }
    }
    return res
}

func (v Version) ToGFXString() string {
    return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
}
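To make the decoding above concrete: `gfx_target_version` packs the GFX version into decimal digits, with the last two digits as the patch level, the two before that as the minor version, and whatever remains as the major version. The sketch below is a self-contained illustration of the same slicing; the sample value is an assumption chosen for illustration, not taken from the diff.

```go
package main

import (
    "fmt"
    "strconv"
)

// decode applies the same trailing-digit slicing as AMDGFXVersions above:
// last two digits -> patch, preceding two digits -> minor, the rest -> major.
func decode(s string) (major, minor, patch uint64) {
    l := len(s)
    patch, _ = strconv.ParseUint(s[l-2:l], 10, 32)
    minor, _ = strconv.ParseUint(s[l-4:l-2], 10, 32)
    major, _ = strconv.ParseUint(s[:l-4], 10, 32)
    return
}

func main() {
    major, minor, patch := decode("90008")         // sample sysfs value (assumed)
    fmt.Printf("gfx%d%d%d\n", major, minor, patch) // prints "gfx908"
}
```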
@@ -1,58 +0,0 @@
//go:build linux || windows

package gpu

import (
    "fmt"
    "log/slog"
    "os"
    "path/filepath"
    "strconv"
    "strings"
)

// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
func rocmLibUsable(libDir string) bool {
    slog.Debug("evaluating potential rocm lib dir " + libDir)
    for _, g := range ROCmLibGlobs {
        res, _ := filepath.Glob(filepath.Join(libDir, g))
        if len(res) == 0 {
            return false
        }
    }
    return true
}

func GetSupportedGFX(libDir string) ([]string, error) {
    var ret []string
    files, err := filepath.Glob(filepath.Join(libDir, "rocblas", "library", "TensileLibrary_lazy_gfx*.dat"))
    if err != nil {
        return nil, err
    }
    for _, file := range files {
        ret = append(ret, strings.TrimSuffix(strings.TrimPrefix(filepath.Base(file), "TensileLibrary_lazy_"), ".dat"))
    }
    return ret, nil
}

func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
    // Set the visible devices if not already set
    // TODO - does sort order matter?
    devices := []string{}
    for i := range ids {
        slog.Debug(fmt.Sprintf("i=%d", i))
        if _, skipped := skip[i]; skipped {
            slog.Debug("skipped")
            continue
        }
        devices = append(devices, strconv.Itoa(i))
    }
    slog.Debug(fmt.Sprintf("devices=%v", devices))

    val := strings.Join(devices, ",")
    err := os.Setenv("HIP_VISIBLE_DEVICES", val)
    if err != nil {
        slog.Warn(fmt.Sprintf("failed to set env: %s", err))
    }
    slog.Debug("HIP_VISIBLE_DEVICES=" + val)
}
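To make the skip handling above concrete, here is a self-contained sketch (sample values are assumptions) of what the helper computes. Note that the loop is `for i := range ids`, so the skip map is consulted by slice position rather than by the device id value:

```go
package main

import (
    "fmt"
    "strconv"
    "strings"
)

func main() {
    ids := []int{0, 1, 2}               // sample device ids (assumed)
    skip := map[int]interface{}{1: nil} // skip the entry at index 1

    devices := []string{}
    for i := range ids { // slice index, mirroring the helper above
        if _, skipped := skip[i]; skipped {
            continue
        }
        devices = append(devices, strconv.Itoa(i))
    }
    // This is the value the helper writes into HIP_VISIBLE_DEVICES.
    fmt.Println(strings.Join(devices, ",")) // prints "0,2"
}
```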
@@ -1,141 +0,0 @@
package gpu

import (
    "fmt"
    "log/slog"
    "strconv"
    "syscall"
    "unsafe"

    "golang.org/x/sys/windows"
)

const (
    hipSuccess       = 0
    hipErrorNoDevice = 100
)

type hipDevicePropMinimal struct {
    Name        [256]byte
    unused1     [140]byte
    GcnArchName [256]byte // gfx####
    iGPU        int       // Doesn't seem to actually report correctly
    unused2     [128]byte
}

// Wrap the amdhip64.dll library for GPU discovery
type HipLib struct {
    dll                    windows.Handle
    hipGetDeviceCount      uintptr
    hipGetDeviceProperties uintptr
    hipMemGetInfo          uintptr
    hipSetDevice           uintptr
    hipDriverGetVersion    uintptr
}

func NewHipLib() (*HipLib, error) {
    h, err := windows.LoadLibrary("amdhip64.dll")
    if err != nil {
        return nil, fmt.Errorf("unable to load amdhip64.dll: %w", err)
    }
    hl := &HipLib{}
    hl.dll = h
    hl.hipGetDeviceCount, err = windows.GetProcAddress(hl.dll, "hipGetDeviceCount")
    if err != nil {
        return nil, err
    }
    hl.hipGetDeviceProperties, err = windows.GetProcAddress(hl.dll, "hipGetDeviceProperties")
    if err != nil {
        return nil, err
    }
    hl.hipMemGetInfo, err = windows.GetProcAddress(hl.dll, "hipMemGetInfo")
    if err != nil {
        return nil, err
    }
    hl.hipSetDevice, err = windows.GetProcAddress(hl.dll, "hipSetDevice")
    if err != nil {
        return nil, err
    }
    hl.hipDriverGetVersion, err = windows.GetProcAddress(hl.dll, "hipDriverGetVersion")
    if err != nil {
        return nil, err
    }
    return hl, nil
}

// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup
// so we have to unload/reset the library after we do our initial discovery
// to make sure our updates to that variable are processed by llama.cpp
func (hl *HipLib) Release() {
    err := windows.FreeLibrary(hl.dll)
    if err != nil {
        slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err))
    }
    hl.dll = 0
}

func (hl *HipLib) AMDDriverVersion() (string, error) {
    if hl.dll == 0 {
        return "", fmt.Errorf("dll has been unloaded")
    }
    var version int
    status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
    if status != hipSuccess {
        return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
    }
    return strconv.Itoa(version), nil
}

func (hl *HipLib) HipGetDeviceCount() int {
    if hl.dll == 0 {
        slog.Error("dll has been unloaded")
        return 0
    }
    var count int
    status, _, err := syscall.SyscallN(hl.hipGetDeviceCount, uintptr(unsafe.Pointer(&count)))
    if status == hipErrorNoDevice {
        slog.Info("AMD ROCm reports no devices found")
        return 0
    }
    if status != hipSuccess {
        slog.Warn(fmt.Sprintf("failed call to hipGetDeviceCount: %d %s", status, err))
    }
    return count
}

func (hl *HipLib) HipSetDevice(device int) error {
    if hl.dll == 0 {
        return fmt.Errorf("dll has been unloaded")
    }
    status, _, err := syscall.SyscallN(hl.hipSetDevice, uintptr(device))
    if status != hipSuccess {
        return fmt.Errorf("failed call to hipSetDevice: %d %s", status, err)
    }
    return nil
}

func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, error) {
    if hl.dll == 0 {
        return nil, fmt.Errorf("dll has been unloaded")
    }
    var props hipDevicePropMinimal
    status, _, err := syscall.SyscallN(hl.hipGetDeviceProperties, uintptr(unsafe.Pointer(&props)), uintptr(device))
    if status != hipSuccess {
        return nil, fmt.Errorf("failed call to hipGetDeviceProperties: %d %s", status, err)
    }
    return &props, nil
}

// Returns free memory, total memory, and any error.
func (hl *HipLib) HipMemGetInfo() (uint64, uint64, error) {
    if hl.dll == 0 {
        return 0, 0, fmt.Errorf("dll has been unloaded")
    }
    var totalMemory uint64
    var freeMemory uint64
    status, _, err := syscall.SyscallN(hl.hipMemGetInfo, uintptr(unsafe.Pointer(&freeMemory)), uintptr(unsafe.Pointer(&totalMemory)))
    if status != hipSuccess {
        return 0, 0, fmt.Errorf("failed call to hipMemGetInfo: %d %s", status, err)
    }
    return freeMemory, totalMemory, nil
}
gpu/amd_linux.go (deleted, 411 lines)
@@ -1,411 +0,0 @@
package gpu

import (
    "bufio"
    "errors"
    "fmt"
    "io"
    "log/slog"
    "os"
    "path/filepath"
    "slices"
    "strconv"
    "strings"

    "github.com/jmorganca/ollama/version"
)

// Discovery logic for AMD/ROCm GPUs

const (
    curlMsg               = "curl -fsSL https://github.com/ollama/ollama/releases/download/v%s/rocm-amd64-deps.tgz | tar -zxf - -C %s"
    DriverVersionFile     = "/sys/module/amdgpu/version"
    AMDNodesSysfsDir      = "/sys/class/kfd/kfd/topology/nodes/"
    GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"

    // Prefix with the node dir
    GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
    GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
    RocmStandardLocation   = "/opt/rocm/lib"
)

var (
    // Used to validate if the given ROCm lib is usable
    ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
)

// Gather GPU information from the amdgpu driver if any supported GPUs are detected
// HIP_VISIBLE_DEVICES will be set if we detect a mix of unsupported and supported devices
// and the user hasn't already set this variable
func AMDGetGPUInfo(resp *GpuInfo) {
    // TODO - DRY this out with windows
    if !AMDDetected() {
        return
    }
    skip := map[int]interface{}{}

    // Opportunistic logging of driver version to aid in troubleshooting
    ver, err := AMDDriverVersion()
    if err == nil {
        slog.Info("AMD Driver: " + ver)
    } else {
        // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
        slog.Warn(fmt.Sprintf("ollama recommends running the https://www.amd.com/en/support/linux-drivers: %s", err))
    }

    // If the user has specified exactly which GPUs to use, look up their memory
    visibleDevices := os.Getenv("HIP_VISIBLE_DEVICES")
    if visibleDevices != "" {
        ids := []int{}
        for _, idStr := range strings.Split(visibleDevices, ",") {
            id, err := strconv.Atoi(idStr)
            if err != nil {
                slog.Warn(fmt.Sprintf("malformed HIP_VISIBLE_DEVICES=%s %s", visibleDevices, err))
            } else {
                ids = append(ids, id)
            }
        }
        amdProcMemLookup(resp, nil, ids)
        return
    }

    // Gather GFX version information from all detected cards
    gfx := AMDGFXVersions()
    verStrings := []string{}
    for i, v := range gfx {
        verStrings = append(verStrings, v.ToGFXString())
        if v.Major == 0 {
            // Silently skip CPUs
            skip[i] = struct{}{}
            continue
        }
        if v.Major < 9 {
            // TODO consider this a build-time setting if we can support 8xx family GPUs
            slog.Warn(fmt.Sprintf("amdgpu [%d] too old %s", i, v.ToGFXString()))
            skip[i] = struct{}{}
        }
    }
    slog.Info(fmt.Sprintf("detected amdgpu versions %v", verStrings))

    // Abort if all GPUs are skipped
    if len(skip) >= len(gfx) {
        slog.Info("all detected amdgpus are skipped, falling back to CPU")
        return
    }

    // If we got this far, then we have at least 1 GPU that's a ROCm candidate, so make sure we have a lib
    libDir, err := AMDValidateLibDir()
    if err != nil {
        slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
        return
    }

    gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
    if gfxOverride == "" {
        supported, err := GetSupportedGFX(libDir)
        if err != nil {
            slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
            return
        }
        slog.Debug(fmt.Sprintf("rocm supported GPU types %v", supported))

        for i, v := range gfx {
            if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
                slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
                // TODO - consider discrete markdown just for ROCM troubleshooting?
                slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
                skip[i] = struct{}{}
            } else {
                slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
            }
        }
    } else {
        slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
    }

    if len(skip) >= len(gfx) {
        slog.Info("all detected amdgpus are skipped, falling back to CPU")
        return
    }

    ids := make([]int, len(gfx))
    i := 0
    for k := range gfx {
        ids[i] = k
        i++
    }
    amdProcMemLookup(resp, skip, ids)
    if resp.memInfo.DeviceCount == 0 {
        return
    }
    if len(skip) > 0 {
        amdSetVisibleDevices(ids, skip)
    }
}

// Walk the sysfs nodes for the available GPUs and gather information from them
// skipping over any devices in the skip map
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
    resp.memInfo.DeviceCount = 0
    resp.memInfo.TotalMemory = 0
    resp.memInfo.FreeMemory = 0
    if len(ids) == 0 {
        slog.Debug("discovering all amdgpu devices")
        entries, err := os.ReadDir(AMDNodesSysfsDir)
        if err != nil {
            slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
            return
        }
        for _, node := range entries {
            if !node.IsDir() {
                continue
            }
            id, err := strconv.Atoi(node.Name())
            if err != nil {
                slog.Warn("malformed amdgpu sysfs node id " + node.Name())
                continue
            }
            ids = append(ids, id)
        }
    }
    slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))

    for _, id := range ids {
        if _, skipped := skip[id]; skipped {
            continue
        }
        totalMemory := uint64(0)
        usedMemory := uint64(0)
        propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
        propFiles, err := filepath.Glob(propGlob)
        if err != nil {
            slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
        }
        // 1 or more memory banks - sum the values of all of them
        for _, propFile := range propFiles {
            fp, err := os.Open(propFile)
            if err != nil {
                slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", propFile, err))
                continue
            }
            defer fp.Close()
            scanner := bufio.NewScanner(fp)
            for scanner.Scan() {
                line := strings.TrimSpace(scanner.Text())
                if strings.HasPrefix(line, "size_in_bytes") {
                    ver := strings.Fields(line)
                    if len(ver) != 2 {
                        slog.Warn("malformed " + line)
                        continue
                    }
                    bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
                    if err != nil {
                        slog.Warn("malformed int " + line)
                        continue
                    }
                    totalMemory += bankSizeInBytes
                }
            }
        }
        if totalMemory == 0 {
            continue
        }
        usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
        usedFiles, err := filepath.Glob(usedGlob)
        if err != nil {
            slog.Warn(fmt.Sprintf("error looking up used GPU memory: %s %s", usedGlob, err))
            continue
        }
        for _, usedFile := range usedFiles {
            fp, err := os.Open(usedFile)
            if err != nil {
                slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", usedFile, err))
                continue
            }
            defer fp.Close()
            data, err := io.ReadAll(fp)
            if err != nil {
                slog.Warn(fmt.Sprintf("failed to read sysfs node file %s: %s", usedFile, err))
                continue
            }
            used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
            if err != nil {
                slog.Warn(fmt.Sprintf("malformed used memory %s: %s", string(data), err))
                continue
            }
            usedMemory += used
        }
        slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
        slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %d", id, (totalMemory - usedMemory)))
        resp.memInfo.DeviceCount++
        resp.memInfo.TotalMemory += totalMemory
        resp.memInfo.FreeMemory += (totalMemory - usedMemory)
    }
    if resp.memInfo.DeviceCount > 0 {
        resp.Library = "rocm"
    }
}

// Quick check for AMD driver so we can skip amdgpu discovery if not present
func AMDDetected() bool {
    // Some driver versions (older?) don't have a version file, so just lookup the parent dir
    sysfsDir := filepath.Dir(DriverVersionFile)
    _, err := os.Stat(sysfsDir)
    if errors.Is(err, os.ErrNotExist) {
        slog.Debug("amdgpu driver not detected " + sysfsDir)
        return false
    } else if err != nil {
        slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
        return false
    }
    return true
}

func setupLink(source, target string) error {
    if err := os.RemoveAll(target); err != nil {
        return fmt.Errorf("failed to remove old rocm directory %s %w", target, err)
    }
    if err := os.Symlink(source, target); err != nil {
        return fmt.Errorf("failed to create link %s => %s %w", source, target, err)
    }
    slog.Debug(fmt.Sprintf("host rocm linked %s => %s", source, target))
    return nil
}

// Ensure the AMD rocm lib dir is wired up
// Prefer to use host installed ROCm, as long as it meets our minimum requirements
// failing that, tell the user how to download it on their own
func AMDValidateLibDir() (string, error) {
    // We rely on the rpath compiled into our library to find rocm
    // so we establish a symlink to wherever we find it on the system
    // to $AssetsDir/rocm

    // If we already have a rocm dependency wired, nothing more to do
    assetsDir, err := AssetsDir()
    if err != nil {
        return "", fmt.Errorf("unable to lookup lib dir: %w", err)
    }
    // Versioned directory
    rocmTargetDir := filepath.Join(assetsDir, "rocm")
    if rocmLibUsable(rocmTargetDir) {
        return rocmTargetDir, nil
    }
    // Parent dir (unversioned)
    commonRocmDir := filepath.Join(filepath.Dir(assetsDir), "rocm")
    if rocmLibUsable(commonRocmDir) {
        return rocmTargetDir, setupLink(commonRocmDir, rocmTargetDir)
    }

    // Prefer explicit HIP env var
    hipPath := os.Getenv("HIP_PATH")
    if hipPath != "" {
        hipLibDir := filepath.Join(hipPath, "lib")
        if rocmLibUsable(hipLibDir) {
            slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
            return rocmTargetDir, setupLink(hipLibDir, rocmTargetDir)
        }
    }

    // Scan the library path for potential matches
    ldPaths := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
    for _, ldPath := range ldPaths {
        d, err := filepath.Abs(ldPath)
        if err != nil {
            continue
        }
        if rocmLibUsable(d) {
            return rocmTargetDir, setupLink(d, rocmTargetDir)
        }
    }

    // Well known location(s)
    if rocmLibUsable("/opt/rocm/lib") {
        return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
    }
    err = os.MkdirAll(rocmTargetDir, 0755)
    if err != nil {
        return "", fmt.Errorf("failed to create empty rocm dir %s %w", rocmTargetDir, err)
    }

    // If we still haven't found a usable rocm, the user will have to download it on their own
    slog.Warn("amdgpu detected, but no compatible rocm library found. Either install rocm v6, or run the following")
    slog.Warn(fmt.Sprintf(curlMsg, version.Version, rocmTargetDir))
    return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}

func AMDDriverVersion() (string, error) {
    _, err := os.Stat(DriverVersionFile)
    if err != nil {
        return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
    }
    fp, err := os.Open(DriverVersionFile)
    if err != nil {
        return "", err
    }
    defer fp.Close()
    verString, err := io.ReadAll(fp)
    if err != nil {
        return "", err
    }
    return strings.TrimSpace(string(verString)), nil
}

func AMDGFXVersions() map[int]Version {
    res := map[int]Version{}
    matches, _ := filepath.Glob(GPUPropertiesFileGlob)
    for _, match := range matches {
        fp, err := os.Open(match)
        if err != nil {
            slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
            continue
        }
        defer fp.Close()
        i, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
        if err != nil {
            slog.Debug(fmt.Sprintf("failed to parse node ID %s", err))
            continue
        }

        scanner := bufio.NewScanner(fp)
        for scanner.Scan() {
            line := strings.TrimSpace(scanner.Text())
            if strings.HasPrefix(line, "gfx_target_version") {
                ver := strings.Fields(line)
                if len(ver) != 2 || len(ver[1]) < 5 {

                    if ver[1] == "0" {
                        // Silently skip the CPU
                        continue
                    } else {
                        slog.Debug("malformed " + line)
                    }
                    res[i] = Version{
                        Major: 0,
                        Minor: 0,
                        Patch: 0,
                    }
                    continue
                }
                l := len(ver[1])
                patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
                minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
                major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
                if err1 != nil || err2 != nil || err3 != nil {
                    slog.Debug("malformed int " + line)
                    continue
                }

                res[i] = Version{
                    Major: uint(major),
                    Minor: uint(minor),
                    Patch: uint(patch),
                }
            }
        }
    }
    return res
}

func (v Version) ToGFXString() string {
    return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
}
@@ -1,190 +0,0 @@
|
||||
package gpu
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
|
||||
|
||||
// TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
|
||||
iGPUName = "AMD Radeon(TM) Graphics"
|
||||
)
|
||||
|
||||
var (
|
||||
// Used to validate if the given ROCm lib is usable
|
||||
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
|
||||
)
|
||||
|
||||
func AMDGetGPUInfo(resp *GpuInfo) {
|
||||
hl, err := NewHipLib()
|
||||
if err != nil {
|
||||
slog.Debug(err.Error())
|
||||
return
|
||||
}
|
||||
defer hl.Release()
|
||||
skip := map[int]interface{}{}
|
||||
ids := []int{}
|
||||
resp.memInfo.DeviceCount = 0
|
||||
resp.memInfo.TotalMemory = 0
|
||||
resp.memInfo.FreeMemory = 0
|
||||
|
||||
ver, err := hl.AMDDriverVersion()
|
||||
if err == nil {
|
||||
slog.Info("AMD Driver: " + ver)
|
||||
} else {
|
||||
// For now this is benign, but we may eventually need to fail compatibility checks
|
||||
slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err))
|
||||
}
|
||||
|
||||
// Note: the HIP library automatically handles HIP_VISIBLE_DEVICES
|
||||
count := hl.HipGetDeviceCount()
|
||||
if count == 0 {
|
||||
return
|
||||
}
|
||||
libDir, err := AMDValidateLibDir()
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
var supported []string
|
||||
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
|
||||
if gfxOverride == "" {
|
||||
supported, err = GetSupportedGFX(libDir)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
|
||||
return
|
||||
}
|
||||
} else {
|
||||
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("detected %d hip devices", count))
|
||||
for i := 0; i < count; i++ {
|
||||
ids = append(ids, i)
|
||||
err = hl.HipSetDevice(i)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
|
||||
skip[i] = struct{}{}
|
||||
continue
|
||||
}
|
||||
|
||||
props, err := hl.HipGetDeviceProperties(i)
|
||||
if err != nil {
|
||||
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
|
||||
skip[i] = struct{}{}
|
||||
continue
|
||||
}
|
||||
n := bytes.IndexByte(props.Name[:], 0)
|
||||
name := string(props.Name[:n])
|
||||
slog.Info(fmt.Sprintf("[%d] Name: %s", i, name))
|
||||
n = bytes.IndexByte(props.GcnArchName[:], 0)
|
||||
gfx := string(props.GcnArchName[:n])
|
||||
slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx))
|
||||
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
|
||||
// TODO Why isn't props.iGPU accurate!?
|
||||
if strings.EqualFold(name, iGPUName) {
			slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i))
			skip[i] = struct{}{}
			continue
		}
		if gfxOverride == "" {
			if !slices.Contains[[]string, string](supported, gfx) {
				slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported))
				// TODO - consider discrete markdown just for ROCM troubleshooting?
				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
				skip[i] = struct{}{}
				continue
			} else {
				slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx))
			}
		}

		totalMemory, freeMemory, err := hl.HipMemGetInfo()
		if err != nil {
			slog.Warn(fmt.Sprintf("[%d] %s", i, err))
			continue
		}

		// TODO according to docs, freeMem may lie on windows!
		slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory))
		slog.Info(fmt.Sprintf("[%d] Free Mem: %d", i, freeMemory))
		resp.memInfo.DeviceCount++
		resp.memInfo.TotalMemory += totalMemory
		resp.memInfo.FreeMemory += freeMemory
	}
	if resp.memInfo.DeviceCount > 0 {
		resp.Library = "rocm"
	}
	// Abort if all GPUs are skipped
	if len(skip) >= count {
		slog.Info("all detected amdgpus are skipped, falling back to CPU")
		return
	}
	if len(skip) > 0 {
		amdSetVisibleDevices(ids, skip)
	}
	UpdatePath(libDir)
}

func AMDValidateLibDir() (string, error) {
	// On windows non-admins typically can't create links
	// so instead of trying to rely on rpath and a link in
	// $LibDir/rocm, we instead rely on setting PATH to point
	// to the location of the ROCm library

	// Installer payload location
	exe, err := os.Executable()
	if err == nil {
		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
		if rocmLibUsable(rocmTargetDir) {
			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
			return rocmTargetDir, nil
		}
	}

	// If we already have a rocm dependency wired, nothing more to do
	libDir, err := AssetsDir()
	if err != nil {
		return "", fmt.Errorf("unable to lookup lib dir: %w", err)
	}
	rocmTargetDir := filepath.Join(libDir, "rocm")
	if rocmLibUsable(rocmTargetDir) {
		return rocmTargetDir, nil
	}

	// Prefer explicit HIP env var
	hipPath := os.Getenv("HIP_PATH")
	if hipPath != "" {
		hipLibDir := filepath.Join(hipPath, "bin")
		if rocmLibUsable(hipLibDir) {
			slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
			return hipLibDir, nil
		}
	}

	// Well known location(s)
	if rocmLibUsable(RocmStandardLocation) {
		return RocmStandardLocation, nil
	}

	// Installer payload (if we're running from some other location)
	localAppData := os.Getenv("LOCALAPPDATA")
	appDir := filepath.Join(localAppData, "Programs", "Ollama")
	rocmTargetDir = filepath.Join(appDir, "rocm")
	if rocmLibUsable(rocmTargetDir) {
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
		return rocmTargetDir, nil
	}

	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
	slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm v6")
	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
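For the unsupported-gfx skip path above: per the troubleshooting link the code logs, a near-miss card can sometimes be forced to report a supported gfx version via HSA_OVERRIDE_GFX_VERSION. A hypothetical invocation (the version value depends on the card and is an assumption here):

```
# Illustrative only: override the reported gfx version, then start the server.
HSA_OVERRIDE_GFX_VERSION=10.3.0 ollama serve
```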
@@ -1,41 +0,0 @@
package gpu

import (
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"runtime"
	"strings"
)

func AssetsDir() (string, error) {
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}

	return filepath.Join(home, ".ollama", "assets"), nil
}

func UpdatePath(dir string) {
	if runtime.GOOS == "windows" {
		tmpDir := filepath.Dir(dir)
		pathComponents := strings.Split(os.Getenv("PATH"), ";")
		i := 0
		for _, comp := range pathComponents {
			if strings.EqualFold(comp, dir) {
				return
			}
			// Remove any other prior paths to our temp dir
			if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
				pathComponents[i] = comp
				i++
			}
		}
		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
		slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
		os.Setenv("PATH", newPath)
	}
	// linux and darwin rely on rpath
}
gpu/gpu.go
@@ -24,6 +24,7 @@ import (

type handles struct {
	cuda *C.cuda_handle_t
	rocm *C.rocm_handle_t
}

var gpuMutex sync.Mutex
@@ -53,23 +54,39 @@ var CudaWindowsGlobs = []string{
	"c:\\Windows\\System32\\nvml.dll",
}

var RocmLinuxGlobs = []string{
	"/opt/rocm*/lib*/librocm_smi64.so*",
}

var RocmWindowsGlobs = []string{
	"c:\\Windows\\System32\\rocm_smi64.dll",
}

// Note: gpuMutex must already be held
func initGPUHandles() {

	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

	gpuHandles = &handles{nil}
	gpuHandles = &handles{nil, nil}
	var cudaMgmtName string
	var cudaMgmtPatterns []string
	var rocmMgmtName string
	var rocmMgmtPatterns []string
	switch runtime.GOOS {
	case "windows":
		cudaMgmtName = "nvml.dll"
		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
		copy(cudaMgmtPatterns, CudaWindowsGlobs)
		rocmMgmtName = "rocm_smi64.dll"
		rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
		copy(rocmMgmtPatterns, RocmWindowsGlobs)
	case "linux":
		cudaMgmtName = "libnvidia-ml.so"
		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
		copy(cudaMgmtPatterns, CudaLinuxGlobs)
		rocmMgmtName = "librocm_smi64.so"
		rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
		copy(rocmMgmtPatterns, RocmLinuxGlobs)
	default:
		return
	}
@@ -84,6 +101,16 @@ func initGPUHandles() {
			return
		}
	}

	rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
	if len(rocmLibPaths) > 0 {
		rocm := LoadROCMMgmt(rocmLibPaths)
		if rocm != nil {
			slog.Info("Radeon GPU detected")
			gpuHandles.rocm = rocm
			return
		}
	}
}

func GetGPUInfo() GpuInfo {
@@ -122,10 +149,66 @@ func GetGPUInfo() GpuInfo {
				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
			}
		}
	} else {
		AMDGetGPUInfo(&resp)
		if resp.Library != "" {
			return resp
		} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
			ver, err := AMDDriverVersion()
			if err == nil {
				slog.Info("AMD Driver: " + ver)
			} else {
				// For now this is benign, but we may eventually need to fail compatibility checks
				slog.Debug("error looking up amd driver version: %s", err)
			}
			gfx := AMDGFXVersions()
			tooOld := false
			for _, v := range gfx {
				if v.Major < 9 {
					slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
					tooOld = true
					break
				}

				// TODO - remap gfx strings for unsupported minor/patch versions to supported for the same major
				// e.g. gfx1034 works if we map it to gfx1030 at runtime

			}
			if !tooOld {
				// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
				C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
				if memInfo.err != nil {
					slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
					C.free(unsafe.Pointer(memInfo.err))
				} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
					// Only one GPU detected and it appears to be an integrated GPU - skip it
					slog.Info("ROCm unsupported integrated GPU detected")
				} else if memInfo.count > 0 {
					if memInfo.igpu_index >= 0 {
						// We have multiple GPUs reported, and one of them is an integrated GPU
						// so we have to set the env var to bypass it
						// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
						val := os.Getenv("ROCR_VISIBLE_DEVICES")
						if val == "" {
							devices := []string{}
							for i := 0; i < int(memInfo.count); i++ {
								if i == int(memInfo.igpu_index) {
									continue
								}
								devices = append(devices, strconv.Itoa(i))
							}
							val = strings.Join(devices, ",")
							os.Setenv("ROCR_VISIBLE_DEVICES", val)
						}
						slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
					}
					resp.Library = "rocm"
					var version C.rocm_version_resp_t
					C.rocm_get_version(*gpuHandles.rocm, &version)
					verString := C.GoString(version.str)
					if version.status == 0 {
						resp.Variant = "v" + verString
					} else {
						slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
					}
					C.free(unsafe.Pointer(version.str))
				}
			}
		}
	}
	if resp.Library == "" {
@@ -159,15 +242,6 @@ func getCPUMem() (memInfo, error) {
}

func CheckVRAM() (int64, error) {
	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
	if userLimit != "" {
		avail, err := strconv.ParseInt(userLimit, 10, 64)
		if err != nil {
			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
		}
		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
		return avail, nil
	}
	gpuInfo := GetGPUInfo()
	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
@@ -255,6 +329,23 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
	return nil
}

func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
	var resp C.rocm_init_resp_t
	resp.rh.verbose = getVerboseState()
	for _, libPath := range rocmLibPaths {
		lib := C.CString(libPath)
		defer C.free(unsafe.Pointer(lib))
		C.rocm_init(lib, &resp)
		if resp.err != nil {
			slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
			C.free(unsafe.Pointer(resp.err))
		} else {
			return &resp.rh
		}
	}
	return nil
}

func getVerboseState() C.uint16_t {
	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
		return C.uint16_t(1)

@@ -1,7 +1,6 @@
//go:build darwin

package gpu

/*
#cgo CFLAGS: -x objective-c
#cgo LDFLAGS: -framework Foundation -framework CoreGraphics -framework Metal
@@ -9,25 +8,11 @@ package gpu
*/
import "C"
import (
	"fmt"
	"log/slog"
	"os"
	"runtime"
	"strconv"
)

// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
	if userLimit != "" {
		avail, err := strconv.ParseInt(userLimit, 10, 64)
		if err != nil {
			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
		}
		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
		return avail, nil
	}

	if runtime.GOARCH == "amd64" {
		// gpu not supported, this may not be metal
		return 0, nil

@@ -53,6 +53,7 @@ void cpu_check_ram(mem_info_t *resp);
#endif

#include "gpu_info_cuda.h"
#include "gpu_info_rocm.h"

#endif  // __GPU_INFO_H__
#endif  // __APPLE__
@@ -124,31 +124,31 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
    // When in verbose mode, report more information about
    // the card we discover, but don't fail on error
    ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
    if (ret != NVML_SUCCESS) {
    if (ret != RSMI_STATUS_SUCCESS) {
      LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
    } else {
      LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
    }
    ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
    if (ret != NVML_SUCCESS) {
    if (ret != RSMI_STATUS_SUCCESS) {
      LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
    } else {
      LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
    }
    ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
    if (ret != NVML_SUCCESS) {
    if (ret != RSMI_STATUS_SUCCESS) {
      LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
    } else {
      LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
    }
    ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
    if (ret != NVML_SUCCESS) {
    if (ret != RSMI_STATUS_SUCCESS) {
      LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
    } else {
      LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
    }
    ret = (*h.nvmlDeviceGetBrand)(device, &brand);
    if (ret != NVML_SUCCESS) {
    if (ret != RSMI_STATUS_SUCCESS) {
      LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
    } else {
      LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);

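The GetGPUInfo hunk above computes ROCR_VISIBLE_DEVICES to mask an integrated GPU, but only when the user has not already set it. Setting it by hand works the same way; a sketch with illustrative device indices (here hiding an iGPU at index 1):

```
# Expose only the discrete GPUs at indices 0 and 2; indices are illustrative.
ROCR_VISIBLE_DEVICES=0,2 ollama serve
```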
gpu/gpu_info_rocm.c (new file)
@@ -0,0 +1,198 @@
#ifndef __APPLE__

#include "gpu_info_rocm.h"

#include <string.h>

void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
  rsmi_status_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"rsmi_init", (void *)&resp->rh.rsmi_init},
      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
      {"rsmi_num_monitor_devices", (void *)&resp->rh.rsmi_num_monitor_devices},
      {"rsmi_dev_id_get", (void *)&resp->rh.rsmi_dev_id_get},
      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
      {NULL, NULL},
  };

  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
  if (!resp->rh.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Radeon GPUs: %s\n",
             rocm_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  // TODO once we've squashed the remaining corner cases remove this log
  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);

  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);

    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
    if (!*l[i].p) {
      char *msg = LOAD_ERR();
      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
      UNLOAD_LIBRARY(resp->rh.handle);
      resp->rh.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->rh.rsmi_init)(0);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
    UNLOAD_LIBRARY(resp->rh.handle);
    resp->rh.handle = NULL;
    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
    resp->err = strdup(buf);
  }

  return;
}

void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  resp->igpu_index = -1;
  uint64_t totalMem = 0;
  uint64_t usedMem = 0;
  rsmi_status_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("rocm handle not initialized");
    return;
  }

  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);

  resp->total = 0;
  resp->free = 0;
  for (i = 0; i < resp->count; i++) {
    if (h.verbose) {
      // When in verbose mode, report more information about
      // the card we discover, but don't fail on error
      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
      }
    }

    // Get total memory - used memory for available memory
    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
    if (ret != RSMI_STATUS_SUCCESS) {
      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
      resp->err = strdup(buf);
      return;
    }
    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
    if (ret != RSMI_STATUS_SUCCESS) {
      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
      resp->err = strdup(buf);
      return;
    }
    LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
    LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
    if (totalMem < 1024 * 1024 * 1024) {
      // Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
      LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
      resp->igpu_index = i;
    } else {
      resp->total += totalMem;
      resp->free += totalMem - usedMem;
    }
  }
}

void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  const int buflen = 256;
  char buf[buflen + 1];
  if (h.handle == NULL) {
    resp->str = strdup("rocm handle not initialized");
    resp->status = 1;
    return;
  }
  rsmi_version_t ver;
  rsmi_status_t ret;
  ret = h.rsmi_version_get(&ver);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
    resp->status = 1;
  } else {
    snprintf(buf, buflen, "%d", ver.major);
    resp->status = 0;
  }
  resp->str = strdup(buf);
}

#endif  // __APPLE__
gpu/gpu_info_rocm.h (new file)
@@ -0,0 +1,59 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_ROCM_H__
#define __GPU_INFO_ROCM_H__
#include "gpu_info.h"

// Just enough typedef's to dlopen/dlsym for memory information
typedef enum rsmi_status_return {
  RSMI_STATUS_SUCCESS = 0,
  // Other values omitted for now...
} rsmi_status_t;

typedef enum rsmi_memory_type {
  RSMI_MEM_TYPE_VRAM = 0,
  RSMI_MEM_TYPE_VIS_VRAM,
  RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;

typedef struct {
  uint32_t major;
  uint32_t minor;
  uint32_t patch;
  const char *build;
} rsmi_version_t;

typedef struct rocm_handle {
  void *handle;
  uint16_t verbose;
  rsmi_status_t (*rsmi_init)(uint64_t);
  rsmi_status_t (*rsmi_shut_down)(void);
  rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
  rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
  rsmi_status_t (*rsmi_version_get)(rsmi_version_t *version);
  rsmi_status_t (*rsmi_num_monitor_devices)(uint32_t *);
  rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
  rsmi_status_t (*rsmi_dev_name_get)(uint32_t, char *, size_t);
  rsmi_status_t (*rsmi_dev_brand_get)(uint32_t, char *, uint32_t);
  rsmi_status_t (*rsmi_dev_vendor_name_get)(uint32_t, char *, uint32_t);
  rsmi_status_t (*rsmi_dev_vram_vendor_get)(uint32_t, char *, uint32_t);
  rsmi_status_t (*rsmi_dev_serial_number_get)(uint32_t, char *, uint32_t);
  rsmi_status_t (*rsmi_dev_subsystem_name_get)(uint32_t, char *, uint32_t);
  rsmi_status_t (*rsmi_dev_vbios_version_get)(uint32_t, char *, uint32_t);
} rocm_handle_t;

typedef struct rocm_init_resp {
  char *err;  // If err is non-null handle is invalid
  rocm_handle_t rh;
} rocm_init_resp_t;

typedef struct rocm_version_resp {
  rsmi_status_t status;
  char *str;  // Contains version or error string if status != 0
} rocm_version_resp_t;

void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);

#endif  // __GPU_INFO_ROCM_H__
#endif  // __APPLE__
@@ -1,142 +0,0 @@
#include "dyn_ext_server.h"

#include <stdio.h>
#include <string.h>

#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#elif _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
#define LOAD_ERR() ({\
  LPSTR messageBuffer = NULL; \
  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
                               NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
  char *resp = strdup(messageBuffer); \
  LocalFree(messageBuffer); \
  resp; \
})
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif

void dyn_init(const char *libPath, struct dynamic_llama_server *s,
              ext_server_resp_t *err) {
  int i = 0;
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"llama_server_init", (void *)&s->llama_server_init},
      {"llama_server_start", (void *)&s->llama_server_start},
      {"llama_server_stop", (void *)&s->llama_server_stop},
      {"llama_server_completion", (void *)&s->llama_server_completion},
      {"llama_server_completion_next_result",
       (void *)&s->llama_server_completion_next_result},
      {"llama_server_completion_cancel",
       (void *)&s->llama_server_completion_cancel},
      {"llama_server_release_task_result",
       (void *)&s->llama_server_release_task_result},
      {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
      {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
      {"llama_server_embedding", (void *)&s->llama_server_embedding},
      {"llama_server_release_json_resp",
       (void *)&s->llama_server_release_json_resp},
      {"", NULL},
  };

  printf("loading library %s\n", libPath);
  s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
  if (!s->handle) {
    err->id = -1;
    char *msg = LOAD_ERR();
    snprintf(err->msg, err->msg_len,
             "Unable to load dynamic server library: %s", msg);
    free(msg);
    return;
  }

  for (i = 0; l[i].p != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
    if (!*l[i].p) {
      UNLOAD_LIBRARY(s->handle);
      err->id = -1;
      char *msg = LOAD_ERR();
      snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
               l[i].s, msg);
      free(msg);
      return;
    }
  }
}

inline void dyn_llama_server_init(struct dynamic_llama_server s,
                                  ext_server_params_t *sparams,
                                  ext_server_resp_t *err) {
  s.llama_server_init(sparams, err);
}

inline void dyn_llama_server_start(struct dynamic_llama_server s) {
  s.llama_server_start();
}

inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
  s.llama_server_stop();
}

inline void dyn_llama_server_completion(struct dynamic_llama_server s,
                                        const char *json_req,
                                        ext_server_resp_t *resp) {
  s.llama_server_completion(json_req, resp);
}

inline void dyn_llama_server_completion_next_result(
    struct dynamic_llama_server s, const int task_id,
    ext_server_task_result_t *result) {
  s.llama_server_completion_next_result(task_id, result);
}

inline void dyn_llama_server_completion_cancel(
    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
  s.llama_server_completion_cancel(task_id, err);
}
inline void dyn_llama_server_release_task_result(
    struct dynamic_llama_server s, ext_server_task_result_t *result) {
  s.llama_server_release_task_result(result);
}

inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
                                      const char *json_req,
                                      char **json_resp,
                                      ext_server_resp_t *err) {
  s.llama_server_tokenize(json_req, json_resp, err);
}

inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
                                        const char *json_req,
                                        char **json_resp,
                                        ext_server_resp_t *err) {
  s.llama_server_detokenize(json_req, json_resp, err);
}

inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
                                       const char *json_req,
                                       char **json_resp,
                                       ext_server_resp_t *err) {
  s.llama_server_embedding(json_req, json_resp, err);
}

inline void dyn_llama_server_release_json_resp(
    struct dynamic_llama_server s, char **json_resp) {
  s.llama_server_release_json_resp(json_resp);
}
@@ -28,13 +28,13 @@ import (
	"log/slog"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"
	"unsafe"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)

type dynExtServer struct {
@@ -72,7 +72,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
		mutex.Lock()
	}
	gpu.UpdatePath(filepath.Dir(library))
	updatePath(filepath.Dir(library))
	libPath := C.CString(library)
	defer C.free(unsafe.Pointer(libPath))
	resp := newExtServerResp(512)
@@ -148,7 +148,6 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
	}

	slog.Info("Initializing llama server")
	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
	initResp := newExtServerResp(128)
	defer freeExtServerResp(initResp)
	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
@@ -366,3 +365,25 @@ func (llm *dynExtServer) Close() {
	C.dyn_llama_server_stop(llm.s)
	mutex.Unlock()
}

func updatePath(dir string) {
	if runtime.GOOS == "windows" {
		tmpDir := filepath.Dir(dir)
		pathComponents := strings.Split(os.Getenv("PATH"), ";")
		i := 0
		for _, comp := range pathComponents {
			if strings.EqualFold(comp, dir) {
				return
			}
			// Remove any other prior paths to our temp dir
			if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
				pathComponents[i] = comp
				i++
			}
		}
		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
		slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
		os.Setenv("PATH", newPath)
	}
	// linux and darwin rely on rpath
}
@@ -1,74 +0,0 @@
#include <stdlib.h>

#include "ext_server.h"

#ifdef __cplusplus
extern "C" {
#endif
struct dynamic_llama_server {
  void *handle;
  void (*llama_server_init)(ext_server_params_t *sparams,
                            ext_server_resp_t *err);
  void (*llama_server_start)();
  void (*llama_server_stop)();
  void (*llama_server_completion)(const char *json_req,
                                  ext_server_resp_t *resp);
  void (*llama_server_completion_next_result)(const int task_id,
                                              ext_server_task_result_t *result);
  void (*llama_server_completion_cancel)(const int task_id,
                                         ext_server_resp_t *err);
  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
                                ext_server_resp_t *err);
  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
                                  ext_server_resp_t *err);
  void (*llama_server_embedding)(const char *json_req, char **json_resp,
                                 ext_server_resp_t *err);
  void (*llama_server_release_json_resp)(char **json_resp);
};

void dyn_init(const char *libPath, struct dynamic_llama_server *s,
              ext_server_resp_t *err);

// No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s,
                           ext_server_params_t *sparams,
                           ext_server_resp_t *err);

void dyn_llama_server_start(struct dynamic_llama_server s);

void dyn_llama_server_stop(struct dynamic_llama_server s);

void dyn_llama_server_completion(struct dynamic_llama_server s,
                                 const char *json_req,
                                 ext_server_resp_t *resp);

void dyn_llama_server_completion_next_result(
    struct dynamic_llama_server s, const int task_id,
    ext_server_task_result_t *result);

void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
                                        const int task_id,
                                        ext_server_resp_t *err);

void dyn_llama_server_release_task_result(
    struct dynamic_llama_server s, ext_server_task_result_t *result);

void dyn_llama_server_tokenize(struct dynamic_llama_server s,
                               const char *json_req, char **json_resp,
                               ext_server_resp_t *err);

void dyn_llama_server_detokenize(struct dynamic_llama_server s,
                                 const char *json_req,
                                 char **json_resp,
                                 ext_server_resp_t *err);

void dyn_llama_server_embedding(struct dynamic_llama_server s,
                                const char *json_req, char **json_resp,
                                ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
                                        char **json_resp);

#ifdef __cplusplus
}
#endif
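The "no good way to call C function pointers from Go" note above is the whole reason the dyn_* wrappers exist: cgo only generates call stubs for named C functions, not for values of function-pointer type. A minimal self-contained sketch of the same pattern (the adder_t/call_adder names are illustrative, not part of this codebase):

```
package main

/*
// A function pointer of the kind dlsym would hand back at runtime.
typedef int (*adder_t)(int, int);

static int add_impl(int a, int b) { return a + b; }
static adder_t adder = add_impl;

// cgo cannot call `adder` directly from Go, so we wrap the pointer
// in a plain named C function and call that instead.
static int call_adder(int a, int b) { return adder(a, b); }
*/
import "C"

import "fmt"

func main() {
	// C.adder(1, 2) would not compile; the named wrapper is the way through.
	fmt.Println(int(C.call_adder(1, 2))) // prints 3
}
```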
@@ -1,25 +0,0 @@
# Ollama specific CMakefile to include in llama.cpp/examples/server

set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
if (WIN32)
  add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
else()
  add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
endif()
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)

if (CUDAToolkit_FOUND)
  target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  if (WIN32)
    target_link_libraries(${TARGET} PRIVATE nvml)
  endif()
endif()
@@ -1,18 +0,0 @@
# Extern C Server

This directory contains a thin facade we layer on top of the llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile-time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected, to support multiple GPU types as well as
CPU-only operation. The Ollama Go build then embeds these different servers to
support different GPUs and settings at runtime.

If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:

```
go generate ./... && go build -a .
```
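One practical variant of this cycle, a sketch rather than a documented workflow: the gen_common.sh script later in this diff greps CGO_CFLAGS for `-g` and, when present, configures cmake with RelWithDebInfo and verbose server logging, so a debug iteration can be driven the same way:

```
CGO_CFLAGS="-g" go generate ./... && go build -a .
```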
@@ -1,381 +0,0 @@
#include "ext_server.h"
#include <atomic>

// Necessary evil since the server types are not defined in a header
#include "server.cpp"

// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif  // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif  // defined(GGML_USE_HIPBLAS)
#endif  // GGML_USE_CUBLAS

// Expose the llama server as a callable extern "C" API
server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;

// RAII wrapper for tracking in-flight recv calls
class atomicRecv {
 public:
  atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
    ++this->atomic;
  }
  ~atomicRecv() {
    --this->atomic;
  }

 private:
  std::atomic<int> &atomic;
};

void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
  recv_counter = 0;
  assert(err != NULL && sparams != NULL);
  log_set_target(stderr);
  if (!sparams->verbose_logging) {
    server_verbose = true;
    log_disable();
  }

  LOG_TEE("system info: %s\n", llama_print_system_info());
  err->id = 0;
  err->msg[0] = '\0';
  try {
    llama = new server_context;
    gpt_params params;
    params.n_ctx = sparams->n_ctx;
    params.n_batch = sparams->n_batch;
    if (sparams->n_threads > 0) {
      params.n_threads = sparams->n_threads;
    }
    params.n_parallel = sparams->n_parallel;
    params.rope_freq_base = sparams->rope_freq_base;
    params.rope_freq_scale = sparams->rope_freq_scale;

    if (sparams->memory_f16) {
      params.cache_type_k = "f16";
      params.cache_type_v = "f16";
    } else {
      params.cache_type_k = "f32";
      params.cache_type_v = "f32";
    }

    params.n_gpu_layers = sparams->n_gpu_layers;
    params.main_gpu = sparams->main_gpu;
    params.use_mlock = sparams->use_mlock;
    params.use_mmap = sparams->use_mmap;
    params.numa = (ggml_numa_strategy)sparams->numa;
    params.embedding = sparams->embedding;
    if (sparams->model != NULL) {
      params.model = sparams->model;
    }

    if (sparams->lora_adapters != NULL) {
      for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
           la = la->next) {
        params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
      }

      params.use_mmap = false;
    }

    if (sparams->mmproj != NULL) {
      params.mmproj = std::string(sparams->mmproj);
    }

#if defined(GGML_USE_CUBLAS)
    // Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
    LOG_TEE("Performing pre-initialization of GPU\n");
    int id;
    cudaError_t cudaErr = cudaGetDevice(&id);
    if (cudaErr != cudaSuccess) {
      err->id = -1;
      snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
      return;
    }
#endif

    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model
    if (!llama->load_model(params)) {
      // TODO - consider modifying the logging logic or patching load_model so
      // we can capture more detailed error messages and pass them back to the
      // caller for better UX
      err->id = -1;
      snprintf(err->msg, err->msg_len, "error loading model %s",
               params.model.c_str());
      return;
    }

    llama->initialize();
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len,
             "Unknown exception initializing llama server");
  }
}

void llama_server_start() {
  assert(llama != NULL);
  // TODO mutex to protect thread creation
  ext_server_thread = std::thread([&]() {
    try {
      LOG_TEE("llama server main loop starting\n");
      ggml_time_init();
      llama->queue_tasks.on_new_task(std::bind(
          &server_context::process_single_task, llama, std::placeholders::_1));
      llama->queue_tasks.on_finish_multitask(std::bind(
          &server_context::on_finish_multitask, llama, std::placeholders::_1));
      llama->queue_tasks.on_run_slots(std::bind(
          &server_context::update_slots, llama));
      llama->queue_results.on_multitask_update(std::bind(
          &server_queue::update_multitask,
          &llama->queue_tasks,
          std::placeholders::_1,
          std::placeholders::_2,
          std::placeholders::_3
      ));
      llama->queue_tasks.start_loop();
    } catch (std::exception &e) {
      LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
    } catch (...) {
      LOG_TEE("caught unknown exception in llama server main loop\n");
    }
    LOG_TEE("\nllama server shutting down\n");
    llama_backend_free();
  });
}

void llama_server_stop() {
  assert(llama != NULL);
  // Shutdown any in-flight requests and block incoming requests.
  LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
  shutting_down = true;

  while (recv_counter.load() > 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
  }

  // This may take a while for any pending tasks to drain
  // TODO - consider a timeout to cancel tasks if it's taking too long
  llama->queue_tasks.terminate();
  ext_server_thread.join();
  delete llama;
  llama = NULL;
  LOG_TEE("llama server shutdown complete\n");
  shutting_down = false;
}

void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
  assert(llama != NULL && json_req != NULL && resp != NULL);
  resp->id = -1;
  resp->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    json data = json::parse(json_req);
    resp->id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(resp->id);
    llama->request_completion(resp->id, -1, data, false, false);
  } catch (std::exception &e) {
    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
  } catch (...) {
    snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
  }
}

void llama_server_completion_next_result(const int task_id,
                                         ext_server_task_result_t *resp) {
  assert(llama != NULL && resp != NULL);
  resp->id = -1;
  resp->stop = false;
  resp->error = false;
  resp->json_resp = NULL;
  std::string result_json;
  try {
    atomicRecv ar(recv_counter);
    server_task_result result = llama->queue_results.recv(task_id);
    result_json =
        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
    resp->id = result.id;
    resp->stop = result.stop;
    resp->error = result.error;
    if (result.error) {
      LOG_TEE("next result cancel on error\n");
      llama->request_cancel(task_id);
      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
    } else if (result.stop) {
      LOG_TEE("next result cancel on stop\n");
      llama->request_cancel(task_id);
      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
    } else if (shutting_down) {
      LOG_TEE("aborting completion due to shutdown %d\n", task_id);
      llama->request_cancel(task_id);
      llama->queue_results.remove_waiting_task_id(task_id);
      resp->stop = true;
    }
  } catch (std::exception &e) {
    resp->error = true;
    resp->id = -1;
    result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
    LOG_TEE("llama server completion exception %s\n", e.what());
  } catch (...) {
    resp->error = true;
    resp->id = -1;
    result_json = "{\"error\":\"Unknown exception during completion\"}";
    LOG_TEE("llama server completion unknown exception\n");
  }
  const std::string::size_type size = result_json.size() + 1;
  resp->json_resp = new char[size];
  snprintf(resp->json_resp, size, "%s", result_json.c_str());
}

void llama_server_release_task_result(ext_server_task_result_t *result) {
  if (result == NULL || result->json_resp == NULL) {
    return;
  }
  delete[] result->json_resp;
}

void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
  assert(llama != NULL && err != NULL);
  err->id = 0;
  err->msg[0] = '\0';
  try {
    llama->request_cancel(task_id);
    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len,
             "Unknown exception completion cancel in llama server");
  }
}

void llama_server_tokenize(const char *json_req, char **json_resp,
                           ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    std::vector<llama_token> tokens;
    if (body.count("content") != 0) {
      tokens = llama->tokenize(body["content"], false);
    }
    const json data = format_tokenizer_response(tokens);
    std::string result_json = data.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
  }
}

void llama_server_release_json_resp(char **json_resp) {
  if (json_resp == NULL || *json_resp == NULL) {
    return;
  }
  delete[] *json_resp;
}

void llama_server_detokenize(const char *json_req, char **json_resp,
                             ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    std::string content;
    if (body.count("tokens") != 0) {
      const std::vector<llama_token> tokens = body["tokens"];
      content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
    }
    const json data = format_detokenized_response(content);
    std::string result_json = data.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
  }
}

void llama_server_embedding(const char *json_req, char **json_resp,
                            ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    json prompt;
    if (body.count("content") != 0) {
      prompt = body["content"];
    } else {
      prompt = "";
    }
    const int task_id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(task_id);
    llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
    atomicRecv ar(recv_counter);
    server_task_result result = llama->queue_results.recv(task_id);
    std::string result_json = result.data.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
  }
}
@@ -1,95 +0,0 @@
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int __main(int argc, char **argv);

// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY

#ifdef __cplusplus
extern "C" {
#endif
typedef struct ext_server_resp {
  int id;          // < 0 on error
  size_t msg_len;  // caller must allocate msg and set msg_len
  char *msg;
} ext_server_resp_t;

// Allocated and freed by caller
typedef struct ext_server_lora_adapter {
  char *adapter;
  float scale;
  struct ext_server_lora_adapter *next;
} ext_server_lora_adapter_t;

// Allocated and freed by caller
typedef struct ext_server_params {
  char *model;
  uint32_t n_ctx;         // token context window, 0 = from model
  uint32_t n_batch;       // prompt processing maximum batch size
  uint32_t n_threads;     // number of threads to use for generation
  int32_t n_parallel;     // number of parallel sequences to decode
  float rope_freq_base;   // RoPE base frequency, 0 = from model
  float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
  bool memory_f16;        // use f16 instead of f32 for memory kv
  int32_t n_gpu_layers;   // number of layers to store in VRAM (-1 - use default)
  int32_t main_gpu;       // the GPU that is used for scratch and small tensors
  bool use_mlock;         // force system to keep model in RAM
  bool use_mmap;          // use mmap if possible
  int numa;               // attempt optimizations that help on some NUMA systems
  bool embedding;         // get only sentence embedding
  ext_server_lora_adapter_t *lora_adapters;
  char *mmproj;
  bool verbose_logging;   // Enable verbose logging of the server
} ext_server_params_t;

typedef struct ext_server_task_result {
  int id;
  bool stop;
  bool error;
  char *json_resp;  // null terminated, memory managed by ext_server
} ext_server_task_result_t;

// Initialize the server once per process
// err->id = 0 for success and err->msg[0] = NULL
// err->id != 0 for failure, and err->msg contains error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);

// Run the main loop, called once per init
void llama_server_start();
// Stop the main loop and free up resources allocated in init and start. Init
// must be called again to reuse
void llama_server_stop();

// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);

// Caller must call llama_server_release_task_result to free resp->json_resp
void llama_server_completion_next_result(const int task_id,
                                         ext_server_task_result_t *result);
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
void llama_server_release_task_result(ext_server_task_result_t *result);

// Caller must call llama_server_release_json_resp to free json_resp if err.id < 0
void llama_server_tokenize(const char *json_req, char **json_resp,
                           ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
                             ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
                            ext_server_resp_t *err);
void llama_server_release_json_resp(char **json_resp);

#ifdef __cplusplus
}
#endif

#endif
#endif  // LLAMA_SERVER_LIBRARY
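Read together, the header comments above imply a call sequence: init once per process, start the loop, poll results until stop or error, release each result, then stop. A minimal hypothetical caller (the model path and prompt JSON are assumptions, error handling is trimmed, and per the comment in the header it must be compiled with -DLLAMA_SERVER_LIBRARY):

```
#include <stdio.h>
#include <string.h>
#include "ext_server.h"

int main(void) {
  char msgbuf[512];
  ext_server_resp_t err = {0, sizeof(msgbuf), msgbuf};  /* caller allocates msg */

  ext_server_params_t sparams;
  memset(&sparams, 0, sizeof(sparams));
  sparams.model = "/path/to/model.gguf";  /* hypothetical path */
  sparams.n_ctx = 2048;
  sparams.n_batch = 512;

  llama_server_init(&sparams, &err);
  if (err.id != 0) {
    fprintf(stderr, "init failed: %s\n", err.msg);
    return 1;
  }
  llama_server_start();

  ext_server_resp_t resp = {0, sizeof(msgbuf), msgbuf};
  llama_server_completion("{\"prompt\": \"hello\"}", &resp);
  if (resp.id >= 0) {
    ext_server_task_result_t result;
    do {
      llama_server_completion_next_result(resp.id, &result);
      printf("%s\n", result.json_resp);
      llama_server_release_task_result(&result);  /* frees result.json_resp */
    } while (!result.stop && !result.error);
  }

  llama_server_stop();
  return 0;
}
```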
@@ -1,125 +0,0 @@
# common logic across linux and darwin

init_vars() {
    case "${GOARCH}" in
    "amd64")
        ARCH="x86_64"
        ;;
    "arm64")
        ARCH="arm64"
        ;;
    *)
        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
    esac

    LLAMACPP_DIR=../llama.cpp
    CMAKE_DEFS=""
    CMAKE_TARGETS="--target ext_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
    else
        # TODO - add additional optimization flags...
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
    fi
    case $(uname -s) in
    "Darwin")
        LIB_EXT="dylib"
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
        ;;
    "Linux")
        LIB_EXT="so"
        WHOLE_ARCHIVE="-Wl,--whole-archive"
        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"

        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
        ;;
    *)
        ;;
    esac
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
}

git_module_setup() {
    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
        echo "Skipping submodule initialization"
        return
    fi
    # Make sure the tree is clean after the directory moves
    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
        echo "Cleaning up old submodule"
        rm -rf ${LLAMACPP_DIR}
    fi
    git submodule init
    git submodule update --force ${LLAMACPP_DIR}

}

apply_patches() {
    # Wire up our CMakefile
    if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
        echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
    fi

    if [ -n "$(ls -A ../patches/*.diff)" ]; then
        # apply temporary patches until fix is upstream
        for patch in ../patches/*.diff; do
            for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
                (cd ${LLAMACPP_DIR}; git checkout ${file})
            done
        done
        for patch in ../patches/*.diff; do
            (cd ${LLAMACPP_DIR} && git apply ${patch})
        done
    fi

    # Avoid duplicate main symbols when we link into the cgo binary
    sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
        mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
}

build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
    mkdir -p ${BUILD_DIR}/lib/
    g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
        ${GCC_ARCH} \
        ${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
        ${BUILD_DIR}/common/libcommon.a \
        ${BUILD_DIR}/libllama.a \
        -Wl,-rpath,\$ORIGIN \
        -lpthread -ldl -lm \
        ${EXTRA_LIBS}
}

compress_libs() {
    echo "Compressing payloads to reduce overall binary size..."
    pids=""
    rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
        gzip -n --best -f ${lib} &
        pids+=" $!"
    done
    echo
    for pid in ${pids}; do
        wait $pid
    done
    echo "Finished compression"
}

# Keep the local tree clean after we're done with the build
cleanup() {
    (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)

    if [ -n "$(ls -A ../patches/*.diff)" ]; then
        for patch in ../patches/*.diff; do
            for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
                (cd ${LLAMACPP_DIR}; git checkout ${file})
            done
        done
    fi
}
@@ -1,77 +0,0 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/

# TODO - add hardening to detect missing tools (cmake, etc.)

set -ex
set -o pipefail
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches

sign() {
    if [ -n "$APPLE_IDENTITY" ]; then
        codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
    fi
}

COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"

case "${GOARCH}" in
"amd64")
    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"

    #
    # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
    #
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
    echo "Building LCD CPU"
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
    compress_libs

    #
    # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
    # Approximately 400% faster than LCD on same CPU
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
    echo "Building AVX CPU"
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
    compress_libs

    #
    # ~2013 CPU Dynamic library
    # Approximately 10% faster than AVX on same CPU
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
    echo "Building AVX2 CPU"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
    compress_libs
    ;;
"arm64")
    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
    compress_libs
    ;;
*)
    echo "GOARCH must be set"
    echo "this script is meant to be run from within go generate"
    exit 1
    ;;
esac

cleanup
@@ -1,200 +0,0 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be llm/generate/

# First we build one or more CPU based LLM libraries
#
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCm
# libraries are quite large, and also dynamically load data files at runtime
# which in turn are large, so we don't attempt to carry them as payload
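# Illustrative payload layout this script produces (added commentary; the
# exact variant suffixes depend on the toolkits detected at build time):
#   ${LLAMACPP_DIR}/build/linux/${ARCH}/cpu/lib/libext_server.so.gz
#   ${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx/lib/libext_server.so.gz
#   ${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2/lib/libext_server.so.gz
#   ${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}/lib/libext_server.so.gz
#   ${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}/lib/libext_server.so.gz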

set -ex
set -o pipefail

# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
    if [ -n "${AMDGPU_TARGETS}" ]; then
        echo "${AMDGPU_TARGETS}"
        return
    fi
    GPU_LIST=(
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx1010"
        "gfx1012"
        "gfx1030"
        "gfx1100"
        "gfx1101"
        "gfx1102"
    )
    (
        IFS=$';'
        echo "'${GPU_LIST[*]}'"
    )
}
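# Example output (added commentary): with AMDGPU_TARGETS unset, amdGPUs
# prints the default list joined with semicolons, ready for cmake, e.g.
#   'gfx900;gfx906:xnack-;gfx908:xnack-;...;gfx1102'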

echo "Starting linux generate script"
if [ -z "${CUDACXX}" ]; then
    if [ -x /usr/local/cuda/bin/nvcc ]; then
        export CUDACXX=/usr/local/cuda/bin/nvcc
    else
        # Try the default location in case it exists
        export CUDACXX=$(command -v nvcc)
    fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches

if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    # Users building from source can tune the exact flags we pass to cmake for configuring
    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
        compress_libs
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
        # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
        # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
        # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
        # Note: the following seem to yield slower results than AVX2 - ymmv
        # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
        # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
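        # A quick, illustrative way to check which variant a given Linux host
        # supports (added commentary, not part of the build):
        #   grep -m1 -wo avx2 /proc/cpuinfo   # any output -> cpu_avx2 is usable
        #   grep -m1 -wo avx /proc/cpuinfo    # any output -> cpu_avx is usable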

        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
            compress_libs
        fi

        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
            #
            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
            # Approximately 400% faster than LCD on same CPU
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
            echo "Building AVX CPU"
            build
            compress_libs
        fi

        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
            #
            # ~2013 CPU Dynamic library
            # Approximately 10% faster than AVX on same CPU
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
            echo "Building AVX2 CPU"
            build
            compress_libs
        fi
    fi
else
    echo "Skipping CPU generation step as requested"
fi

# If needed, look for the default CUDA toolkit location
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
    CUDA_LIB_DIR=/usr/local/cuda/lib64
fi

# If needed, look for CUDA on Arch Linux
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
    CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi

# Allow override in case libcudart is in the wrong place
if [ -z "${CUDART_LIB_DIR}" ]; then
    CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi

if [ -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
    if [ -n "${CUDA_MAJOR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build

    # Carry the CUDA libs as payloads to help reduce dependency burden on users
    #
    # TODO - in the future we may shift to packaging these separately and conditionally
    # downloading them in the install script.
    DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
    for lib in libcudart.so libcublas.so libcublasLt.so ; do
        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
        else
            # note: the glob must stay outside the quotes so it expands
            cp -d "${CUDA_LIB_DIR}/${lib}"* "${BUILD_DIR}/lib/"
        fi
    done
    compress_libs

fi

if [ -z "${ROCM_PATH}" ]; then
    # Try the default location in case it exists
    ROCM_PATH=/opt/rocm
fi

if [ -z "${CLBlast_DIR}" ]; then
    # Try the default location in case it exists
    if [ -d /usr/lib/cmake/CLBlast ]; then
        export CLBlast_DIR=/usr/lib/cmake/CLBlast
    fi
fi

if [ -d "${ROCM_PATH}" ]; then
    echo "ROCm libraries detected - building dynamic ROCm library"
    if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

    # Record the ROCM dependencies
    rm -f "${BUILD_DIR}/lib/deps.txt"
    touch "${BUILD_DIR}/lib/deps.txt"
    for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
        echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
    done
    compress_libs
fi

cleanup
@@ -1,275 +0,0 @@
#!powershell

$ErrorActionPreference = "Stop"

function amdGPUs {
    if ($env:AMDGPU_TARGETS) {
        return $env:AMDGPU_TARGETS
    }
    # TODO - load from some common data file for linux + windows build consistency
    $GPU_LIST = @(
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx1010"
        "gfx1012"
        "gfx1030"
        "gfx1100"
        "gfx1101"
        "gfx1102"
    )
    $GPU_LIST -join ';'
}
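# Example (added commentary): with $env:AMDGPU_TARGETS unset, amdGPUs returns
# the default list joined with semicolons, e.g. "gfx900;gfx906:xnack-;...",
# matching what -DAMDGPU_TARGETS expects below.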

function init_vars {
    # Verify the environment is a Developer Shell for MSVC 2019
    write-host $env:VSINSTALLDIR
    if (($env:VSINSTALLDIR -eq $null)) {
        Write-Error "`r`nBUILD ERROR - YOUR DEVELOPMENT ENVIRONMENT IS NOT SET UP CORRECTLY`r`nTo build Ollama you must run from an MSVC Developer Shell`r`nSee .\docs\development.md for instructions to set up your dev environment"
        exit 1
    }
    $script:SRC_DIR = $(resolve-path "..\..\")
    $script:llamacppDir = "../llama.cpp"
    $script:cmakeDefs = @(
        "-DBUILD_SHARED_LIBS=on",
        "-DLLAMA_NATIVE=off"
    )
    $script:cmakeTargets = @("ext_server")
    $script:ARCH = "amd64" # arm not yet supported.
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
        $script:config = "RelWithDebInfo"
    } else {
        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release")
        $script:config = "Release"
    }
    if ($null -ne $env:CMAKE_SYSTEM_VERSION) {
        $script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}")
    }
    # Try to find the CUDA dir
    if ($env:CUDA_LIB_DIR -eq $null) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
        if ($d -ne $null) {
            $script:CUDA_LIB_DIR=($d| split-path -parent)
            $script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
        }
    } else {
        $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
    }
    $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
    if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
        $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    } else {
        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
    }
    # Note: the Windows Kit 10 signtool crashes with GCP's plugin
    ${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
    if ("${env:KEY_CONTAINER}") {
        ${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
    }
}

function git_module_setup {
    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
    & git submodule init
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    & git submodule update --force "${script:llamacppDir}"
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}

function apply_patches {
    # Wire up our CMakefile
    if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
        Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
    }

    # Apply temporary patches until fix is upstream
    $patches = Get-ChildItem "../patches/*.diff"
    foreach ($patch in $patches) {
        # Extract file paths from the patch file
        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
            $parts = $_ -split ' '
            ($parts[1] -split '/', 2)[1]
        }

        # Checkout each file
        Set-Location -Path ${script:llamacppDir}
        foreach ($file in $filePaths) {
            git checkout $file
        }
    }

    # Apply each patch
    foreach ($patch in $patches) {
        Set-Location -Path ${script:llamacppDir}
        git apply $patch.FullName
    }

    # Avoid duplicate main symbols when we link into the cgo binary
    $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
    $content = $content -replace 'int main\(', 'int __main('
    Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
}

function build {
    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
    & cmake --version
    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}

function install {
    rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
    md "${script:buildDir}/lib" -ea 0 > $null
    cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
    cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
    # Display the dll dependencies in the build log
    if ($script:DUMPBIN -ne $null) {
        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
    }
}

function sign {
    if ("${env:KEY_CONTAINER}") {
        write-host "Signing ${script:buildDir}/lib/*.dll"
        foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        }
    }
}

function compress_libs {
    if ($script:GZIP -eq $null) {
        write-host "gzip not installed, not compressing files"
        return
    }
    write-host "Compressing dlls..."
    $libs = dir "${script:buildDir}/lib/*.dll"
    foreach ($file in $libs) {
        & "$script:GZIP" --best -f $file
    }
}

function cleanup {
    $patches = Get-ChildItem "../patches/*.diff"
    foreach ($patch in $patches) {
        # Extract file paths from the patch file
        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
            $parts = $_ -split ' '
            ($parts[1] -split '/', 2)[1]
        }

        # Checkout each file
        Set-Location -Path ${script:llamacppDir}
        foreach ($file in $filePaths) {
            git checkout $file
        }
    }
    Set-Location "${script:llamacppDir}/examples/server"
    git checkout CMakeLists.txt server.cpp
}

init_vars
git_module_setup
apply_patches

# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver

$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")

init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
sign
compress_libs

init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
sign
compress_libs

init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
sign
compress_libs

if ($null -ne $script:CUDA_LIB_DIR) {
    # Then build cuda as a dynamically loaded library
    $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
    $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
    if ($null -ne $script:CUDA_VERSION) {
        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
    }
    init_vars
    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
    write-host "Building CUDA"
    build
    install
    sign
    compress_libs
}

if ($null -ne $env:HIP_PATH) {
    $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
    if ($null -ne $script:ROCM_VERSION) {
        $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
    }

    init_vars
    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
    $script:cmakeDefs += @(
        "-G", "Ninja",
        "-DCMAKE_C_COMPILER=clang.exe",
        "-DCMAKE_CXX_COMPILER=clang++.exe",
        "-DLLAMA_HIPBLAS=on",
        "-DLLAMA_AVX=on",
        "-DLLAMA_AVX2=off",
        "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
        "-DAMDGPU_TARGETS=$(amdGPUs)",
        "-DGPU_TARGETS=$(amdGPUs)"
    )

    # Make sure the ROCm binary dir is first in the path
    $env:PATH="$env:HIP_PATH\bin;$env:VSINSTALLDIR\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja;$env:PATH"

    # We have to clobber the LIB var from the developer shell for clang to work properly
    $env:LIB=""

    write-host "Building ROCm"
    build
    # Ninja doesn't prefix with config name
    ${script:config}=""
    install
    if ($null -ne $script:DUMPBIN) {
        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
    }
    sign
    compress_libs
}

cleanup
write-host "`ngo generate completed"
@@ -1,3 +0,0 @@
package generate

//go:generate sh ./gen_darwin.sh
@@ -1,3 +0,0 @@
package generate

//go:generate bash ./gen_linux.sh
@@ -1,3 +0,0 @@
package generate

//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
@@ -163,9 +163,9 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
    case FILE_MAGIC_GGLA:
        c = &containerLORA{}
    case FILE_MAGIC_GGUF_LE:
        c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
        c = &containerGGUF{bo: binary.LittleEndian}
    case FILE_MAGIC_GGUF_BE:
        c = &ContainerGGUF{ByteOrder: binary.BigEndian}
        c = &containerGGUF{bo: binary.BigEndian}
    default:
        return nil, errors.New("invalid file magic")
    }
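// Illustrative use of DecodeGGML (added commentary; a sketch, not part of
// this change):
//   f, err := os.Open("model.gguf")
//   if err != nil { ... }
//   defer f.Close()
//   ggml, err := DecodeGGML(f) // container chosen by file magic, as above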

711 llm/gguf.go
@@ -5,20 +5,12 @@ import (
    "encoding/binary"
    "fmt"
    "io"
    "log/slog"
    "os"
    "regexp"

    "github.com/d4l3k/go-bfloat16"
    "github.com/pdevine/tensor"
    "github.com/pdevine/tensor/native"
    "github.com/x448/float16"

    "github.com/jmorganca/ollama/format"
)

type ContainerGGUF struct {
    ByteOrder binary.ByteOrder
type containerGGUF struct {
    bo binary.ByteOrder

    Version uint32

@@ -31,28 +23,23 @@ type ContainerGGUF struct {
        NumTensor uint64
        NumKV     uint64
    }

    V3 struct {
        NumTensor uint64
        NumKV     uint64
    }
}

func (c *ContainerGGUF) Name() string {
func (c *containerGGUF) Name() string {
    return "gguf"
}

func (c *ContainerGGUF) Decode(rso *readSeekOffset) (model, error) {
    binary.Read(rso, c.ByteOrder, &c.Version)
func (c *containerGGUF) Decode(rso *readSeekOffset) (model, error) {
    binary.Read(rso, c.bo, &c.Version)

    switch c.Version {
    case 1:
        binary.Read(rso, c.ByteOrder, &c.V1)
        binary.Read(rso, c.bo, &c.V1)
    default:
        binary.Read(rso, c.ByteOrder, &c.V2)
        binary.Read(rso, c.bo, &c.V2)
    }

    model := NewGGUFModel(c)
    model := newGGUFModel(c)
    if err := model.Decode(rso); err != nil {
        return nil, err
    }
@@ -61,61 +48,47 @@ func (c *ContainerGGUF) Decode(rso *readSeekOffset) (model, error) {
}

const (
    _ uint32 = iota
    GGUFTokenNormal
    GGUFTokenUnknown
    GGUFTokenControl
    GGUFTokenUserDefined
    GGUFTokenUnused
    GGUFTokenByte
    ggufTypeUint8 uint32 = iota
    ggufTypeInt8
    ggufTypeUint16
    ggufTypeInt16
    ggufTypeUint32
    ggufTypeInt32
    ggufTypeFloat32
    ggufTypeBool
    ggufTypeString
    ggufTypeArray
    ggufTypeUint64
    ggufTypeInt64
    ggufTypeFloat64
)

const (
    GGUFTypeUint8 uint32 = iota
    GGUFTypeInt8
    GGUFTypeUint16
    GGUFTypeInt16
    GGUFTypeUint32
    GGUFTypeInt32
    GGUFTypeFloat32
    GGUFTypeBool
    GGUFTypeString
    GGUFTypeArray
    GGUFTypeUint64
    GGUFTypeInt64
    GGUFTypeFloat64
)
type kv map[string]any

type KV map[string]any

type Tensor struct {
    Name   string
    Kind   uint32
    Offset uint64
type tensor struct {
    name   string
    kind   uint32
    offset uint64

    // shape is the number of elements in each dimension
    Shape [4]uint64

    FileName      string
    OffsetPadding uint64
    FileOffsets   []uint64
    shape [4]uint64
}

func (t Tensor) BlockSize() uint64 {
func (t tensor) blockSize() uint64 {
    switch {
    case t.Kind < 2:
    case t.kind < 2:
        return 1
    case t.Kind < 10:
    case t.kind < 10:
        return 32
    default:
        return 256
    }
}

func (t Tensor) TypeSize() uint64 {
    blockSize := t.BlockSize()
func (t tensor) typeSize() uint64 {
    blockSize := t.blockSize()

    switch t.Kind {
    switch t.kind {
    case 0: // FP32
        return 4
    case 1: // FP16
@@ -155,63 +128,31 @@ func (t Tensor) TypeSize() uint64 {
    }
}

func (t Tensor) Parameters() uint64 {
    return t.Shape[0] * t.Shape[1] * t.Shape[2] * t.Shape[3]
func (t tensor) parameters() uint64 {
    return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3]
}

func (t Tensor) Size() uint64 {
    return t.Parameters() * t.TypeSize() / t.BlockSize()
func (t tensor) size() uint64 {
    return t.parameters() * t.typeSize() / t.blockSize()
}
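// Worked example (added commentary, assuming the FP16 branch of typeSize
// returns 2): a kind-1 (FP16) tensor with shape [4096 4096 1 1] has
// 4096*4096 = 16,777,216 parameters and blockSize() = 1, so
// size() = 16777216 * 2 / 1 = 33,554,432 bytes.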

func (t Tensor) Repack(data []uint16, heads int) ([]uint16, error) {
    n := tensor.New(tensor.WithShape(int(t.Shape[0]), int(t.Shape[1])), tensor.WithBacking(data))
    origShape := n.Shape().Clone()
type ggufModel struct {
    *containerGGUF

    // reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
    if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
        return []uint16{}, err
    }

    if err := n.T(0, 2, 1, 3); err != nil {
        return []uint16{}, err
    }

    if err := n.Reshape(origShape...); err != nil {
        return []uint16{}, err
    }

    if err := n.Transpose(); err != nil {
        return []uint16{}, err
    }
    newN, err := native.SelectU16(n, 1)
    if err != nil {
        return []uint16{}, err
    }

    var fullTensor []uint16
    for _, v := range newN {
        fullTensor = append(fullTensor, v...)
    }
    return fullTensor, nil
}

type GGUFModel struct {
    *ContainerGGUF

    KV
    Tensors []Tensor
    kv
    tensors []tensor

    parameters uint64
}

func NewGGUFModel(container *ContainerGGUF) *GGUFModel {
    return &GGUFModel{
        ContainerGGUF: container,
        KV:            make(KV),
func newGGUFModel(container *containerGGUF) *ggufModel {
    return &ggufModel{
        containerGGUF: container,
        kv:            make(kv),
    }
}

func (llm *GGUFModel) NumTensor() uint64 {
func (llm *ggufModel) NumTensor() uint64 {
    if llm.Version == 1 {
        return uint64(llm.V1.NumTensor)
    }
@@ -219,7 +160,7 @@ func (llm *GGUFModel) NumTensor() uint64 {
    return llm.V2.NumTensor
}

func (llm *GGUFModel) NumKV() uint64 {
func (llm *ggufModel) NumKV() uint64 {
    if llm.Version == 1 {
        return uint64(llm.V1.NumKV)
    }
@@ -227,15 +168,15 @@ func (llm *GGUFModel) NumKV() uint64 {
    return llm.V2.NumKV
}

func (llm *GGUFModel) ModelFamily() string {
    if t, ok := llm.KV["general.architecture"].(string); ok {
func (llm *ggufModel) ModelFamily() string {
    if t, ok := llm.kv["general.architecture"].(string); ok {
        return t
    }

    return "unknown"
}

func (llm *GGUFModel) ModelType() string {
func (llm *ggufModel) ModelType() string {
    if llm.parameters > 0 {
        return format.HumanNumber(llm.parameters)
    }
@@ -243,393 +184,15 @@ func (llm *GGUFModel) ModelType() string {
    return "unknown"
}

func (llm *GGUFModel) FileType() string {
    if t, ok := llm.KV["general.file_type"].(uint32); ok {
func (llm *ggufModel) FileType() string {
    if t, ok := llm.kv["general.file_type"].(uint32); ok {
        return fileType(t)
    }

    return "unknown"
}

func (llm *GGUFModel) Encode(f *os.File) error {
    // this mimics the order of the llama.cpp convert script
    kOrder := []string{
        "general.architecture",
        "general.name",
        "llama.context_length",
        "llama.embedding_length",
        "llama.block_count",
        "llama.feed_forward_length",
        "llama.rope.dimension_count",
        "llama.attention.head_count",
        "llama.attention.head_count_kv",
        "llama.attention.layer_norm_rms_epsilon",
        "llama.rope.freq_base",
        "general.file_type",
        "tokenizer.ggml.model",
        "tokenizer.ggml.tokens",
        "tokenizer.ggml.scores",
        "tokenizer.ggml.token_type",
        "tokenizer.ggml.bos_token_id",
        "tokenizer.ggml.eos_token_id",
        "tokenizer.ggml.unknown_token_id",
        "tokenizer.ggml.add_bos_token",
        "tokenizer.ggml.add_eos_token",
        "tokenizer.chat_template",
    }

    if err := binary.Write(f, llm.ByteOrder, []byte("GGUF")); err != nil {
        return err
    }

    if err := binary.Write(f, llm.ByteOrder, uint32(3)); err != nil {
        return err
    }

    if err := binary.Write(f, llm.ByteOrder, uint64(llm.V3.NumTensor)); err != nil {
        return err
    }

    if err := binary.Write(f, llm.ByteOrder, uint64(llm.V3.NumKV)); err != nil {
        return err
    }

    for _, k := range kOrder {
        val, ok := llm.KV[k]
        if !ok {
            continue
        }

        if err := binary.Write(f, llm.ByteOrder, uint64(len(k))); err != nil {
            return err
        }
        if err := binary.Write(f, llm.ByteOrder, []byte(k)); err != nil {
            return err
        }

        switch v := val.(type) {
        case uint32:
            if err := binary.Write(f, llm.ByteOrder, GGUFTypeUint32); err != nil {
                return err
            }

            if err := llm.writeUint32(f, v); err != nil {
                return err
            }
        case float32:
            if err := binary.Write(f, llm.ByteOrder, GGUFTypeFloat32); err != nil {
                return err
            }

            if err := llm.writeF32(f, v); err != nil {
                return err
            }
        case bool:
            if err := binary.Write(f, llm.ByteOrder, GGUFTypeBool); err != nil {
                return err
            }

            if err := llm.writeBool(f, v); err != nil {
                return err
            }
        case string:
            if err := binary.Write(f, llm.ByteOrder, GGUFTypeString); err != nil {
                return err
            }

            if err := llm.writeString(f, v); err != nil {
                return err
            }
        case []int32:
            if err := binary.Write(f, llm.ByteOrder, GGUFTypeArray); err != nil {
                return err
            }

            if err := binary.Write(f, llm.ByteOrder, GGUFTypeInt32); err != nil {
                return err
            }

            if err := binary.Write(f, llm.ByteOrder, uint64(len(v))); err != nil {
                return err
            }
            for _, i := range v {
                if err := llm.writeInt32(f, i); err != nil {
                    return err
                }
            }
        case []uint32:
            if err := binary.Write(f, llm.ByteOrder, GGUFTypeArray); err != nil {
                return err
            }

            if err := binary.Write(f, llm.ByteOrder, GGUFTypeUint32); err != nil {
                return err
            }

            if err := binary.Write(f, llm.ByteOrder, uint64(len(v))); err != nil {
                return err
            }
            for _, i := range v {
                if err := llm.writeUint32(f, i); err != nil {
                    return err
                }
            }
        case []float32:
            if err := binary.Write(f, llm.ByteOrder, GGUFTypeArray); err != nil {
                return err
            }

            if err := binary.Write(f, llm.ByteOrder, GGUFTypeFloat32); err != nil {
                return err
            }

            if err := binary.Write(f, llm.ByteOrder, uint64(len(v))); err != nil {
                return err
            }
            for _, fl := range v {
                if err := llm.writeF32(f, fl); err != nil {
                    return err
                }
            }
        case []string:
            if err := binary.Write(f, llm.ByteOrder, GGUFTypeArray); err != nil {
                return err
            }

            if err := binary.Write(f, llm.ByteOrder, GGUFTypeString); err != nil {
                return err
            }

            if err := binary.Write(f, llm.ByteOrder, uint64(len(v))); err != nil {
                return err
            }

            for _, s := range v {
                if err := llm.writeString(f, s); err != nil {
                    return err
                }
            }
        }
    }

    // write layer metadata
    for _, t := range llm.Tensors {
        if err := llm.writeString(f, t.Name); err != nil {
            return err
        }

        // the dimensions of the tensor
        dims := 1
        if t.Shape[1] > 0 {
            dims = 2
        }

        if err := binary.Write(f, llm.ByteOrder, uint32(dims)); err != nil {
            return err
        }

        for i := 0; i < dims; i++ {
            if err := binary.Write(f, llm.ByteOrder, uint64(t.Shape[dims-1-i])); err != nil {
                return err
            }
        }

        if err := binary.Write(f, llm.ByteOrder, uint32(t.Kind)); err != nil {
            return err
        }

        if err := binary.Write(f, llm.ByteOrder, uint64(t.Offset)); err != nil {
            return err
        }
    }

    offset, terr := f.Seek(0, io.SeekCurrent)
    if terr != nil {
        return terr
    }
    slog.Debug(fmt.Sprintf("tensors offset = %x", offset))

    if err := llm.writePadding(f, 32); err != nil {
        return err
    }

    var dataFile *os.File
    var currentFile string
    var err error
    for _, t := range llm.Tensors {
        if currentFile != t.FileName {
            if f != nil {
                dataFile.Close()
            }
            currentFile = t.FileName
            dataFile, err = os.Open(t.FileName)
            if err != nil {
                fmt.Println(err)
                return err
            }
        }

        dataFile.Seek(int64(t.OffsetPadding+t.FileOffsets[0]), 0)

        pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
        re, err := regexp.Compile(pattern)
        if err != nil {
            return err
        }

        matches := re.FindAllStringSubmatch(t.Name, -1)
        if len(matches) > 0 {
            layerSize := t.FileOffsets[1] - t.FileOffsets[0]

            var err error
            tData := make([]uint16, layerSize/2)
            if err = binary.Read(dataFile, llm.ByteOrder, tData); err != nil {
                return err
            }

            layerType := matches[0][re.SubexpIndex("layer")]
            var heads uint32
            switch layerType {
            case "q":
                heads = llm.KV["llama.attention.head_count"].(uint32)
            case "k":
                heads = llm.KV["llama.attention.head_count_kv"].(uint32)
                if heads == 0 {
                    heads = llm.KV["llama.attention.head_count"].(uint32)
                }
            }

            tData, err = t.Repack(tData, int(heads))
            if err != nil {
                return err
            }

            var buf []byte
            for _, n := range tData {
                buf = binary.LittleEndian.AppendUint16(buf, n)
            }

            tempBuf := make([]uint16, len(tData))
            tDataF32 := bfloat16.DecodeFloat32(buf)
            for cnt, v := range tDataF32 {
                tDataF16 := float16.Fromfloat32(v)
                tempBuf[cnt] = uint16(tDataF16)
            }

            if err = binary.Write(f, llm.ByteOrder, tempBuf); err != nil {
                return err
            }

            if err := llm.writePadding(f, 32); err != nil {
                return err
            }
            continue
        }

        remaining := t.FileOffsets[1] - t.FileOffsets[0]

        bufSize := uint64(10240)
        var finished bool
        for {
            data := make([]byte, min(bufSize, remaining))

            b, err := io.ReadFull(dataFile, data)
            remaining -= uint64(b)

            if err == io.EOF || remaining <= 0 {
                finished = true
            } else if err != nil {
                return err
            }

            // convert bfloat16 -> ieee float32
            tDataF32 := bfloat16.DecodeFloat32(data)

            switch t.Kind {
            case 0:
                if err := binary.Write(f, llm.ByteOrder, tDataF32); err != nil {
                    return err
                }
            case 1:
                // convert float32 -> float16
                tempBuf := make([]uint16, len(data)/2)
                for cnt, v := range tDataF32 {
                    tDataF16 := float16.Fromfloat32(v)
                    tempBuf[cnt] = uint16(tDataF16)
                }
                if err := binary.Write(f, llm.ByteOrder, tempBuf); err != nil {
                    return err
                }
            }
            if finished {
                break
            }
        }

        if err := llm.writePadding(f, 32); err != nil {
            return err
        }
    }
    f.Close()

    return nil
}
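// Note on the conversions in Encode above (added commentary): bfloat16 shares
// float32's 8 exponent bits but keeps only 7 mantissa bits, so bf16 -> f32 is
// exact, while the second step, f32 -> f16 (5 exponent / 10 mantissa bits),
// can round and can overflow to +/-Inf for magnitudes above ~65504.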

func (llm *GGUFModel) writePadding(f *os.File, align int64) error {
    // gguf file padding is defined in https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure
    offset, err := f.Seek(0, io.SeekCurrent)
    if err != nil {
        return err
    }
    padding := ((offset + align - 1) / align) * align
    buf := make([]byte, padding-offset)
    if err := binary.Write(f, llm.ByteOrder, buf); err != nil {
        return err
    }

    return nil
}
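// Worked example (added commentary): with align = 32, an offset of 100 gives
// padding = ((100+31)/32)*32 = 128, so 28 zero bytes are written; an offset
// already on a 32-byte boundary writes nothing.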

func (llm *GGUFModel) writeInt32(f *os.File, v int32) error {
    if err := binary.Write(f, llm.ByteOrder, v); err != nil {
        return err
    }
    return nil
}

func (llm *GGUFModel) writeUint32(f *os.File, v uint32) error {
    if err := binary.Write(f, llm.ByteOrder, v); err != nil {
        return err
    }
    return nil
}

func (llm *GGUFModel) writeF32(f *os.File, v float32) error {
    if err := binary.Write(f, llm.ByteOrder, v); err != nil {
        return err
    }
    return nil
}

func (llm *GGUFModel) writeBool(f *os.File, b bool) error {
    if err := binary.Write(f, llm.ByteOrder, b); err != nil {
        return err
    }
    return nil
}

func (llm *GGUFModel) writeString(f *os.File, s string) error {
    if err := binary.Write(f, llm.ByteOrder, uint64(len(s))); err != nil {
        return err
    }

    if err := binary.Write(f, llm.ByteOrder, []byte(s)); err != nil {
        return err
    }
    return nil
}

func (llm *GGUFModel) Decode(rso *readSeekOffset) error {
func (llm *ggufModel) Decode(rso *readSeekOffset) error {
    // decode key-values
    for i := 0; uint64(i) < llm.NumKV(); i++ {
        k, err := llm.readString(rso)
@@ -641,36 +204,36 @@ func (llm *GGUFModel) Decode(rso *readSeekOffset) error {

        var v any
        switch vtype {
        case GGUFTypeUint8:
        case ggufTypeUint8:
            v = llm.readU8(rso)
        case GGUFTypeInt8:
        case ggufTypeInt8:
            v = llm.readI8(rso)
        case GGUFTypeUint16:
        case ggufTypeUint16:
            v = llm.readU16(rso)
        case GGUFTypeInt16:
        case ggufTypeInt16:
            v = llm.readI16(rso)
        case GGUFTypeUint32:
        case ggufTypeUint32:
            v = llm.readU32(rso)
        case GGUFTypeInt32:
        case ggufTypeInt32:
            v = llm.readI32(rso)
        case GGUFTypeUint64:
        case ggufTypeUint64:
            v = llm.readU64(rso)
        case GGUFTypeInt64:
        case ggufTypeInt64:
            v = llm.readI64(rso)
        case GGUFTypeFloat32:
        case ggufTypeFloat32:
            v = llm.readF32(rso)
        case GGUFTypeFloat64:
        case ggufTypeFloat64:
            v = llm.readF64(rso)
        case GGUFTypeBool:
        case ggufTypeBool:
            v = llm.readBool(rso)
        case GGUFTypeString:
        case ggufTypeString:
            s, err := llm.readString(rso)
            if err != nil {
                return err
            }

            v = s
        case GGUFTypeArray:
        case ggufTypeArray:
            a, err := llm.readArray(rso)
            if err != nil {
                return err
@@ -681,7 +244,7 @@ func (llm *GGUFModel) Decode(rso *readSeekOffset) error {
            return fmt.Errorf("invalid type: %d", vtype)
        }

        llm.KV[k] = v
        llm.kv[k] = v
    }

    // decode tensors
@@ -699,33 +262,33 @@ func (llm *GGUFModel) Decode(rso *readSeekOffset) error {
            shape[i] = llm.readU64(rso)
        }

        tensor := Tensor{
            Name:   name,
            Kind:   llm.readU32(rso),
            Offset: llm.readU64(rso),
            Shape:  shape,
        tensor := tensor{
            name:   name,
            kind:   llm.readU32(rso),
            offset: llm.readU64(rso),
            shape:  shape,
        }

        llm.Tensors = append(llm.Tensors, tensor)
        llm.parameters += tensor.Parameters()
        llm.tensors = append(llm.tensors, tensor)
        llm.parameters += tensor.parameters()
    }

    alignment, ok := llm.KV["general.alignment"].(uint32)
    alignment, ok := llm.kv["general.alignment"].(uint32)
    if !ok {
        alignment = 32
    }

    rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent)
    for _, tensor := range llm.Tensors {
        padded := (int64(tensor.Size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
    for _, tensor := range llm.tensors {
        padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
        rso.Seek(padded, io.SeekCurrent)
    }

    return nil
}

func (llm *GGUFModel) NumLayers() uint32 {
    value, exists := llm.KV[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
func (llm *ggufModel) NumLayers() uint32 {
    value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
    if !exists {
        return 0
    }
@@ -733,8 +296,8 @@ func (llm *GGUFModel) NumLayers() uint32 {
    return value.(uint32)
}

func (llm *GGUFModel) NumHead() uint32 {
    value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
func (llm *ggufModel) NumHead() uint32 {
    value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
    if !exists {
        return 0
    }
@@ -742,8 +305,8 @@ func (llm *GGUFModel) NumHead() uint32 {
    return value.(uint32)
}

func (llm *GGUFModel) NumEmbed() uint32 {
    value, exists := llm.KV[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
func (llm *ggufModel) NumEmbed() uint32 {
    value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
    if !exists {
        return 0
    }
@@ -751,8 +314,8 @@ func (llm *GGUFModel) NumEmbed() uint32 {
    return value.(uint32)
}

func (llm *GGUFModel) NumHeadKv() uint32 {
    value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
func (llm *ggufModel) NumHeadKv() uint32 {
    value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
    if !exists {
        return 0
    }
@@ -760,8 +323,8 @@ func (llm *GGUFModel) NumHeadKv() uint32 {
    return value.(uint32)
}

func (llm *GGUFModel) NumCtx() uint32 {
    value, exists := llm.KV[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
func (llm *ggufModel) NumCtx() uint32 {
    value, exists := llm.kv[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
    if !exists {
        return 0
    }
@@ -769,7 +332,7 @@ func (llm *GGUFModel) NumCtx() uint32 {
    return value.(uint32)
}

func (llm *GGUFModel) NumGQA() uint32 {
func (llm *ggufModel) NumGQA() uint32 {
    numHeadKv := llm.NumHeadKv()
    if numHeadKv == 0 {
        return 0
@@ -778,75 +341,75 @@ func (llm *GGUFModel) NumGQA() uint32 {
    return llm.NumHead() / numHeadKv
}

func (llm GGUFModel) readU8(r io.Reader) uint8 {
func (llm ggufModel) readU8(r io.Reader) uint8 {
    var u8 uint8
    binary.Read(r, llm.ByteOrder, &u8)
    binary.Read(r, llm.bo, &u8)
    return u8
}

func (llm GGUFModel) readI8(r io.Reader) int8 {
func (llm ggufModel) readI8(r io.Reader) int8 {
    var i8 int8
    binary.Read(r, llm.ByteOrder, &i8)
    binary.Read(r, llm.bo, &i8)
    return i8
}

func (llm GGUFModel) readU16(r io.Reader) uint16 {
func (llm ggufModel) readU16(r io.Reader) uint16 {
    var u16 uint16
    binary.Read(r, llm.ByteOrder, &u16)
    binary.Read(r, llm.bo, &u16)
    return u16
}

func (llm GGUFModel) readI16(r io.Reader) int16 {
func (llm ggufModel) readI16(r io.Reader) int16 {
    var i16 int16
    binary.Read(r, llm.ByteOrder, &i16)
    binary.Read(r, llm.bo, &i16)
    return i16
}

func (llm GGUFModel) readU32(r io.Reader) uint32 {
func (llm ggufModel) readU32(r io.Reader) uint32 {
    var u32 uint32
    binary.Read(r, llm.ByteOrder, &u32)
    binary.Read(r, llm.bo, &u32)
    return u32
}

func (llm GGUFModel) readI32(r io.Reader) int32 {
func (llm ggufModel) readI32(r io.Reader) int32 {
    var i32 int32
    binary.Read(r, llm.ByteOrder, &i32)
    binary.Read(r, llm.bo, &i32)
    return i32
}

func (llm GGUFModel) readU64(r io.Reader) uint64 {
func (llm ggufModel) readU64(r io.Reader) uint64 {
    var u64 uint64
    binary.Read(r, llm.ByteOrder, &u64)
    binary.Read(r, llm.bo, &u64)
    return u64
}

func (llm GGUFModel) readI64(r io.Reader) int64 {
func (llm ggufModel) readI64(r io.Reader) int64 {
    var i64 int64
    binary.Read(r, llm.ByteOrder, &i64)
    binary.Read(r, llm.bo, &i64)
    return i64
}

func (llm GGUFModel) readF32(r io.Reader) float32 {
func (llm ggufModel) readF32(r io.Reader) float32 {
    var f32 float32
    binary.Read(r, llm.ByteOrder, &f32)
    binary.Read(r, llm.bo, &f32)
    return f32
}

func (llm GGUFModel) readF64(r io.Reader) float64 {
func (llm ggufModel) readF64(r io.Reader) float64 {
    var f64 float64
    binary.Read(r, llm.ByteOrder, &f64)
    binary.Read(r, llm.bo, &f64)
    return f64
}

func (llm GGUFModel) readBool(r io.Reader) bool {
func (llm ggufModel) readBool(r io.Reader) bool {
    var b bool
    binary.Read(r, llm.ByteOrder, &b)
    binary.Read(r, llm.bo, &b)
    return b
}

func (llm GGUFModel) readStringV1(r io.Reader) (string, error) {
func (llm ggufModel) readStringV1(r io.Reader) (string, error) {
    var nameLength uint32
    binary.Read(r, llm.ByteOrder, &nameLength)
    binary.Read(r, llm.bo, &nameLength)

    var b bytes.Buffer
    if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
@@ -859,13 +422,13 @@ func (llm GGUFModel) readStringV1(r io.Reader) (string, error) {
    return b.String(), nil
}

func (llm GGUFModel) readString(r io.Reader) (string, error) {
func (llm ggufModel) readString(r io.Reader) (string, error) {
    if llm.Version == 1 {
        return llm.readStringV1(r)
    }

    var nameLength uint64
    binary.Read(r, llm.ByteOrder, &nameLength)
    binary.Read(r, llm.bo, &nameLength)

    var b bytes.Buffer
    if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
@@ -875,29 +438,29 @@ func (llm GGUFModel) readString(r io.Reader) (string, error) {
    return b.String(), nil
}
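// Illustrative encoding for the v2+ reader above (added commentary): the
// string "gguf" is stored as an 8-byte length (4) in the file's byte order,
// followed by the raw bytes 'g' 'g' 'u' 'f'.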

func (llm *GGUFModel) readArrayV1(r io.Reader) (arr []any, err error) {
func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
    atype := llm.readU32(r)
    n := llm.readU32(r)

    for i := 0; uint32(i) < n; i++ {
        switch atype {
        case GGUFTypeUint8:
        case ggufTypeUint8:
            arr = append(arr, llm.readU8(r))
        case GGUFTypeInt8:
        case ggufTypeInt8:
            arr = append(arr, llm.readI8(r))
        case GGUFTypeUint16:
        case ggufTypeUint16:
            arr = append(arr, llm.readU16(r))
        case GGUFTypeInt16:
        case ggufTypeInt16:
            arr = append(arr, llm.readI16(r))
        case GGUFTypeUint32:
        case ggufTypeUint32:
            arr = append(arr, llm.readU32(r))
        case GGUFTypeInt32:
        case ggufTypeInt32:
            arr = append(arr, llm.readI32(r))
        case GGUFTypeFloat32:
        case ggufTypeFloat32:
            arr = append(arr, llm.readF32(r))
        case GGUFTypeBool:
        case ggufTypeBool:
            arr = append(arr, llm.readBool(r))
        case GGUFTypeString:
        case ggufTypeString:
            s, err := llm.readStringV1(r)
            if err != nil {
                return nil, err
@@ -912,7 +475,7 @@ func (llm *GGUFModel) readArrayV1(r io.Reader) (arr []any, err error) {
    return
}

func (llm *GGUFModel) readArray(r io.Reader) (arr []any, err error) {
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
    if llm.Version == 1 {
        return llm.readArrayV1(r)
    }
@@ -922,29 +485,29 @@ func (llm *GGUFModel) readArray(r io.Reader) (arr []any, err error) {

    for i := 0; uint64(i) < n; i++ {
        switch atype {
        case GGUFTypeUint8:
        case ggufTypeUint8:
            arr = append(arr, llm.readU8(r))
        case GGUFTypeInt8:
        case ggufTypeInt8:
            arr = append(arr, llm.readI8(r))
        case GGUFTypeUint16:
        case ggufTypeUint16:
            arr = append(arr, llm.readU16(r))
        case GGUFTypeInt16:
        case ggufTypeInt16:
            arr = append(arr, llm.readI16(r))
        case GGUFTypeUint32:
        case ggufTypeUint32:
            arr = append(arr, llm.readU32(r))
        case GGUFTypeInt32:
        case ggufTypeInt32:
            arr = append(arr, llm.readI32(r))
        case GGUFTypeUint64:
        case ggufTypeUint64:
            arr = append(arr, llm.readU64(r))
        case GGUFTypeInt64:
        case ggufTypeInt64:
            arr = append(arr, llm.readI64(r))
        case GGUFTypeFloat32:
        case ggufTypeFloat32:
            arr = append(arr, llm.readF32(r))
        case GGUFTypeFloat64:
        case ggufTypeFloat64:
            arr = append(arr, llm.readF64(r))
        case GGUFTypeBool:
        case ggufTypeBool:
            arr = append(arr, llm.readBool(r))
        case GGUFTypeString:
        case ggufTypeString:
            s, err := llm.readString(r)
            if err != nil {
                return nil, err
Submodule llm/llama.cpp deleted from 6cdabe6526
36 llm/llm.go
@@ -6,6 +6,7 @@ import (
    "log/slog"
    "os"
    "runtime"
    "time"

    "github.com/jmorganca/ollama/api"
    "github.com/jmorganca/ollama/gpu"
@@ -19,7 +20,7 @@ type LLM interface {
    Close()
}

func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
    if _, err := os.Stat(model); err != nil {
        return nil, err
    }
@@ -120,15 +121,15 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er

    opts.RopeFrequencyBase = 0.0
    opts.RopeFrequencyScale = 0.0
    return newLlmServer(info, model, adapters, projectors, opts)
    return newLlmServer(info, workDir, model, adapters, projectors, opts)
}

// Give any native cgo implementations an opportunity to initialize
func Init() error {
    return nativeInit()
func Init(workdir string) error {
    return nativeInit(workdir)
}

func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
    dynLibs := getDynLibs(gpuInfo)

    // Check to see if the user has requested a specific library instead of auto-detecting
@@ -143,15 +144,34 @@ func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []stri
        }
    }

    err := fmt.Errorf("unable to locate suitable llm library")
    // We stage into a temp directory, and if we've been idle for a while, it may have been reaped
    _, err := os.Stat(dynLibs[0])
    if err != nil {
        slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
        err = nativeInit(workDir)
        if err != nil {
            return nil, err
        }
    }

    err2 := fmt.Errorf("unable to locate suitable llm library")
    for _, dynLib := range dynLibs {
        srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
        if err == nil {
            return srv, nil
        }
        slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
        err = err
        err2 = err
    }

    return nil, err
    return nil, err2
}

func parseDurationMs(ms float64) time.Duration {
    dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
    if err != nil {
        panic(err)
    }

    return dur
}

14 llm/llm_darwin_amd64.go Normal file
@@ -0,0 +1,14 @@
//go:generate cmake -S server -B server/build/cpu -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off
//go:generate cmake -S server -B server/build/cpu_avx -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on
//go:generate cmake -S server -B server/build/cpu_avx2 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=on
//go:generate cmake --build server/build/cpu --target server -- -j4
//go:generate cmake --build server/build/cpu_avx --target server -- -j4
//go:generate cmake --build server/build/cpu_avx2 --target server -- -j4
package llm

import "embed"

//go:embed server/build/cpu/server
//go:embed server/build/cpu_avx/server
//go:embed server/build/cpu_avx2/server
var libEmbed embed.FS
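// Illustrative (added commentary; a sketch, not part of this change): an
// embedded payload can be read back with the standard embed API, e.g.
//   data, err := libEmbed.ReadFile("server/build/cpu/server")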

8 llm/llm_darwin_arm64.go Normal file
@@ -0,0 +1,8 @@
//go:generate cmake -S server -B server/build/metal -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64
//go:generate cmake --build server/build/metal --target server -- -j4
package llm

import "embed"

//go:embed server/build/metal/ggml-metal.metal server/build/metal/server
var libEmbed embed.FS
|
||||
@@ -1,115 +0,0 @@
|
||||
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
|
||||
index f255ad76..5b83acb1 100644
|
||||
--- a/examples/server/server.cpp
|
||||
+++ b/examples/server/server.cpp
|
||||
@@ -28,6 +28,10 @@
|
||||
#include <thread>
|
||||
#include <signal.h>
|
||||
|
||||
+#ifdef GGML_USE_CUBLAS
|
||||
+extern "C" GGML_CALL void ggml_free_cublas(void);
|
||||
+#endif
|
||||
+
|
||||
using json = nlohmann::json;
|
||||
|
||||
bool server_verbose = false;
|
||||
@@ -648,6 +652,10 @@ struct server_context {
|
||||
llama_free_model(model);
|
||||
model = nullptr;
|
||||
}
|
||||
+
|
||||
+#ifdef GGML_USE_CUBLAS
|
||||
+ ggml_free_cublas();
|
||||
+#endif
|
||||
}
|
||||
|
||||
bool load_model(const gpt_params & params_) {
|
||||
@@ -3339,6 +3347,7 @@ int main(int argc, char ** argv) {
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
+ sigaction(SIGUSR1, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 72bcec8c..50a45e3d 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -43,6 +43,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
 
-GGML_CALL void ggml_init_cublas() {
-    static bool initialized = false;
+static bool g_cublas_initialized = false;
 
-    if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+    if (!g_cublas_initialized) {
 
 #ifdef __HIP_PLATFORM_AMD__
         // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
 #endif
 
         if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
-            initialized = true;
+            g_cublas_initialized = true;
             g_cublas_loaded = false;
             fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
             return;
@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
-        initialized = true;
+        g_cublas_initialized = true;
         g_cublas_loaded = true;
     }
 }
@@ -12490,3 +12491,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
     }
     return device_count;
 }
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+    for (int id = 0; id < g_device_count; ++id) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+        if (g_device_caps[id].vmm) {
+            CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+            g_cuda_pool_size[id] = 0;
+            g_cuda_pool_addr[id] = 0;
+        }
+#endif
+        // TODO: free legacy non-vmm memory
+        // destroy cublas handle
+        CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+        g_cublas_handles[id] = nullptr;
+    }
+
+    g_cublas_initialized = false;
+}
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..6dd58ddf 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -23,6 +23,9 @@ GGML_API GGML_CALL void ggml_init_cublas(void);
 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
 GGML_API GGML_CALL bool ggml_cublas_loaded(void);
 
+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
 GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
 GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);

@@ -103,20 +103,10 @@ func rocmDynLibPresent() bool {
 	return false
 }
 
-func nativeInit() error {
+func nativeInit(workdir string) error {
 	slog.Info("Extracting dynamic libraries...")
-	assetsDir, err := gpu.AssetsDir()
-	if err != nil {
-		return err
-	}
-
-	// delete the assetsDir
-	if err := os.RemoveAll(assetsDir); err != nil {
-		return err
-	}
-
 	if runtime.GOOS == "darwin" {
-		err := extractPayloadFiles(assetsDir, "llama.cpp/ggml-metal.metal")
+		err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
 		if err != nil {
 			if err == payloadMissing {
 				// TODO perhaps consider this a hard failure on arm macs?
@@ -125,10 +115,10 @@ func nativeInit() error {
 			}
 			return err
 		}
-		os.Setenv("GGML_METAL_PATH_RESOURCES", assetsDir)
+		os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
 	}
 
-	libs, err := extractDynamicLibs(assetsDir, "llama.cpp/build/*/*/*/lib/*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*")
 	if err != nil {
 		if err == payloadMissing {
 			slog.Info(fmt.Sprintf("%s", payloadMissing))
@@ -159,13 +149,17 @@ func nativeInit() error {
 	return nil
 }
 
-func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
+func extractDynamicLibs(workDir, glob string) ([]string, error) {
 	files, err := fs.Glob(libEmbed, glob)
 	if err != nil || len(files) == 0 {
 		return nil, payloadMissing
 	}
 	libs := []string{}
 
+	// TODO consider making this idempotent with some sort of persistent directory (where we store models probably)
+	// and tracking by version so we don't reexpand the files every time
+	// Also maybe consider lazy loading only what is needed
+
 	g := new(errgroup.Group)
 	for _, file := range files {
 		pathComps := strings.Split(file, "/")
@@ -178,14 +172,14 @@ func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
 		g.Go(func() error {
 			// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
 			// Include the variant in the path to avoid conflicts between multiple server libs
-			targetDir := filepath.Join(assetsDir, pathComps[pathComponentCount-3])
+			targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
 			srcFile, err := libEmbed.Open(file)
 			if err != nil {
 				return fmt.Errorf("read payload %s: %v", file, err)
 			}
 			defer srcFile.Close()
 			if err := os.MkdirAll(targetDir, 0o755); err != nil {
-				return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err)
+				return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
 			}
 			src := io.Reader(srcFile)
 			filename := file
@@ -202,13 +196,19 @@ func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
 				libs = append(libs, destFile)
 			}
 
-			destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-			if err != nil {
-				return fmt.Errorf("write payload %s: %v", file, err)
-			}
-			defer destFp.Close()
-			if _, err := io.Copy(destFp, src); err != nil {
-				return fmt.Errorf("copy payload %s: %v", file, err)
+			_, err = os.Stat(destFile)
+			switch {
+			case errors.Is(err, os.ErrNotExist):
+				destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+				if err != nil {
+					return fmt.Errorf("write payload %s: %v", file, err)
+				}
+				defer destFile.Close()
+				if _, err := io.Copy(destFile, src); err != nil {
+					return fmt.Errorf("copy payload %s: %v", file, err)
+				}
+			case err != nil:
+				return fmt.Errorf("stat payload %s: %v", file, err)
 			}
 			return nil
 		})
@@ -216,7 +216,7 @@ func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
 	return libs, g.Wait()
 }
 
-func extractPayloadFiles(assetsDir, glob string) error {
+func extractPayloadFiles(workDir, glob string) error {
 	files, err := fs.Glob(libEmbed, glob)
 	if err != nil || len(files) == 0 {
 		return payloadMissing
@@ -228,8 +228,8 @@ func extractPayloadFiles(assetsDir, glob string) error {
 			return fmt.Errorf("read payload %s: %v", file, err)
 		}
 		defer srcFile.Close()
-		if err := os.MkdirAll(assetsDir, 0o755); err != nil {
-			return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err)
+		if err := os.MkdirAll(workDir, 0o755); err != nil {
+			return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
 		}
 		src := io.Reader(srcFile)
 		filename := file
@@ -241,22 +241,20 @@ func extractPayloadFiles(assetsDir, glob string) error {
 			filename = strings.TrimSuffix(filename, ".gz")
 		}
 
-		destFile := filepath.Join(assetsDir, filepath.Base(filename))
+		destFile := filepath.Join(workDir, filepath.Base(filename))
 		_, err = os.Stat(destFile)
 		switch {
 		case errors.Is(err, os.ErrNotExist):
-			destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+			destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
 			if err != nil {
 				return fmt.Errorf("write payload %s: %v", file, err)
 			}
-			defer destFp.Close()
-			if _, err := io.Copy(destFp, src); err != nil {
+			defer destFile.Close()
+			if _, err := io.Copy(destFile, src); err != nil {
 				return fmt.Errorf("copy payload %s: %v", file, err)
 			}
 		case err != nil:
 			return fmt.Errorf("stat payload %s: %v", file, err)
-		case err == nil:
-			slog.Debug("payload already exists: " + destFile)
 		}
 	}
 	return nil
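The Stat-then-switch shape above is what makes extraction idempotent: a payload is only written when the destination does not already exist. A standalone sketch of that pattern, separate from the diff (the file name is illustrative):

    package main

    import (
        "errors"
        "fmt"
        "io"
        "os"
        "strings"
    )

    // writeIfMissing copies src to destFile only when destFile does not
    // already exist, mirroring the Stat-then-switch pattern above.
    func writeIfMissing(destFile string, src io.Reader) error {
        _, err := os.Stat(destFile)
        switch {
        case errors.Is(err, os.ErrNotExist):
            out, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
            if err != nil {
                return fmt.Errorf("write payload %s: %v", destFile, err)
            }
            defer out.Close()
            if _, err := io.Copy(out, src); err != nil {
                return fmt.Errorf("copy payload %s: %v", destFile, err)
            }
        case err != nil:
            return fmt.Errorf("stat payload %s: %v", destFile, err)
        default:
            fmt.Println("payload already exists:", destFile)
        }
        return nil
    }

    func main() {
        if err := writeIfMissing("/tmp/payload-demo", strings.NewReader("hello")); err != nil {
            panic(err)
        }
    }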
@@ -1,8 +0,0 @@
package llm

import (
	"embed"
)

//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS
@@ -1,8 +0,0 @@
package llm

import (
	"embed"
)

//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS
@@ -1,8 +0,0 @@
package llm

import (
	"embed"
)

//go:embed llama.cpp/build/linux/*/*/lib/*
var libEmbed embed.FS
@@ -1,58 +0,0 @@
package llm

import (
	"testing"

	"github.com/jmorganca/ollama/gpu"
	"github.com/stretchr/testify/assert"
)

func TestGetDynLibs(t *testing.T) {
	availableDynLibs = map[string]string{
		"cpu": "X_cpu",
	}
	assert.Equal(t, false, rocmDynLibPresent())
	res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
	assert.Len(t, res, 1)
	assert.Equal(t, availableDynLibs["cpu"], res[0])

	variant := gpu.GetCPUVariant()
	if variant != "" {
		variant = "_" + variant
	}
	availableDynLibs = map[string]string{
		"rocm_v5":       "X_rocm_v5",
		"rocm_v6":       "X_rocm_v6",
		"cpu" + variant: "X_cpu",
	}
	assert.Equal(t, true, rocmDynLibPresent())
	res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
	assert.Len(t, res, 3)
	assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
	assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])

	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
	assert.Len(t, res, 3)
	assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
	assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])

	res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
	assert.Len(t, res, 1)
	assert.Equal(t, availableDynLibs["cpu"+variant], res[0])

	res = getDynLibs(gpu.GpuInfo{Library: "default"})
	assert.Len(t, res, 1)
	assert.Equal(t, "default", res[0])

	availableDynLibs = map[string]string{
		"rocm":          "X_rocm_v5",
		"cpu" + variant: "X_cpu",
	}
	assert.Equal(t, true, rocmDynLibPresent())
	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
	assert.Len(t, res, 2)
	assert.Equal(t, availableDynLibs["rocm"], res[0])
	assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}
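The deleted test pinned down an ordering contract for getDynLibs: the requested variant first, sibling variants of the same library next, the CPU build as the final fallback, and "default" passed through literally. The sketch below, separate from the diff, is a hypothetical re-implementation inferred only from those assertions; it is not the repo's actual code, and the CPU variant suffix is dropped for brevity:

    package main

    import (
        "fmt"
        "sort"
        "strings"
    )

    // GpuInfo is a hypothetical stand-in for gpu.GpuInfo in the deleted test.
    type GpuInfo struct {
        Library string
        Variant string
    }

    var availableDynLibs = map[string]string{
        "rocm_v5": "X_rocm_v5",
        "rocm_v6": "X_rocm_v6",
        "cpu":     "X_cpu",
    }

    // getDynLibs sketches the ordering the test asserted: exact variant
    // first, remaining variants of the same library in a stable order,
    // then the CPU build as a last resort.
    func getDynLibs(info GpuInfo) []string {
        if info.Library == "default" {
            return []string{"default"}
        }

        exact := info.Library
        if info.Variant != "" {
            exact += "_" + info.Variant
        }

        var libs []string
        if lib, ok := availableDynLibs[exact]; ok {
            libs = append(libs, lib)
        }

        // other variants of the requested library, sorted for determinism
        var rest []string
        for key := range availableDynLibs {
            if key != exact && key != "cpu" && strings.HasPrefix(key, info.Library) {
                rest = append(rest, key)
            }
        }
        sort.Strings(rest)
        for _, key := range rest {
            libs = append(libs, availableDynLibs[key])
        }

        // CPU build is the final fallback
        if exact != "cpu" {
            libs = append(libs, availableDynLibs["cpu"])
        }
        return libs
    }

    func main() {
        fmt.Println(getDynLibs(GpuInfo{Library: "rocm", Variant: "v6"}))
        // [X_rocm_v6 X_rocm_v5 X_cpu]
    }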
@@ -1,8 +0,0 @@
package llm

import (
	"embed"
)

//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
var libEmbed embed.FS
1	llm/server/.gitignore	vendored	Normal file
@@ -0,0 +1 @@
build
93	llm/server/CMakeLists.txt	Normal file
@@ -0,0 +1,93 @@
cmake_minimum_required(VERSION 3.14)

project(llm)

include(FetchContent)

set(add_token_patch
    git apply ${CMAKE_CURRENT_SOURCE_DIR}/patches/add_token.patch
)

set(FETCHCONTENT_BASE_DIR "${CMAKE_SOURCE_DIR}/build/llama.cpp")

FetchContent_Declare(
    llama_cpp
    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
    GIT_TAG c29af7e2252d288f2ea58a7d437c1cb7c0abf160

    # this could be risky if the patch doesn't apply
    PATCH_COMMAND ${add_token_patch} || true
)

FetchContent_MakeAvailable(llama_cpp)
add_subdirectory(${llama_cpp_SOURCE_DIR}/examples/llava)

# code signing
function(sign target)
    if(APPLE)
        if(DEFINED ENV{APPLE_IDENTITY})
            add_custom_command(TARGET ${target} POST_BUILD
                COMMAND codesign
                    -f
                    --timestamp
                    --deep
                    --options=runtime
                    --sign "$ENV{APPLE_IDENTITY}"
                    --identifier ai.ollama.ollama
                    $<TARGET_FILE:${target}>
                COMMENT "Signing macOS binary: ${target}"
            )
        endif()
    elseif(WIN32)
        find_program(SIGNTOOL_EXE NAMES signtool PATHS "C:\\Program Files (x86)\\Windows Kits\\8.1\\bin\\x64" NO_DEFAULT_PATH)
        set(KEY_CONTAINER "$ENV{KEY_CONTAINER}")
        set(OLLAMA_CERT "$ENV{OLLAMA_CERT}")

        if(SIGNTOOL_EXE AND KEY_CONTAINER AND OLLAMA_CERT)
            add_custom_command(TARGET ${target} POST_BUILD
                COMMAND "${SIGNTOOL_EXE}"
                    "sign"
                    "/v"
                    "/fd" "sha256"
                    "/t" "http://timestamp.digicert.com"
                    "/f" "${OLLAMA_CERT}"
                    "/csp" "Google Cloud KMS Provider"
                    "/kc" "${KEY_CONTAINER}"
                    "$<TARGET_FILE:${target}>"
                COMMENT "Signing Windows binary: ${target}"
            )
        endif()
    endif()
endfunction()

set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75;80")

function(gzip target)
    set(gzip_target "gzip_${target}")
    add_custom_target(${gzip_target} ALL
        COMMAND gzip -k -f ${target}
        COMMENT "Gzipping ${target}"
        VERBATIM
    )
    add_dependencies(${gzip_target} ${target})
endfunction()

function(link_windows_libraries target)
    if (WIN32)
        target_link_libraries(${target} PRIVATE ws2_32)
    endif()
endfunction()

add_executable(server ${llama_cpp_SOURCE_DIR}/examples/server/server.cpp ${llama_cpp_SOURCE_DIR})
target_compile_definitions(server PRIVATE)
target_link_libraries(server PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(server PRIVATE cxx_std_17)
link_windows_libraries(server)
sign(server)
gzip(server)

if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
    configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_BINARY_DIR}/ggml-metal.metal COPYONLY)
endif()

# TODO: ROCm
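Note how the gzip(server) step pairs with the extraction code seen earlier, which trims a ".gz" suffix before writing payloads out. A standalone Go sketch of the matching decompression side, separate from the diff (paths are illustrative; gzip.Reader.Close does not close the underlying file, so both are closed explicitly):

    package main

    import (
        "compress/gzip"
        "io"
        "os"
        "strings"
    )

    // openPayload opens path, transparently gunzipping when the name ends
    // in ".gz"; this mirrors the TrimSuffix(".gz") handling in the extract code.
    func openPayload(path string) (io.Reader, func() error, error) {
        f, err := os.Open(path)
        if err != nil {
            return nil, nil, err
        }
        if !strings.HasSuffix(path, ".gz") {
            return f, f.Close, nil
        }
        zr, err := gzip.NewReader(f)
        if err != nil {
            f.Close()
            return nil, nil, err
        }
        // close both the gzip stream and the underlying file
        closer := func() error {
            zr.Close()
            return f.Close()
        }
        return zr, closer, nil
    }

    func main() {
        r, done, err := openPayload("server.gz") // illustrative path
        if err != nil {
            panic(err)
        }
        defer done()
        _, _ = io.Copy(os.Stdout, r)
    }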
@@ -1,19 +1,21 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f255ad76..914ecfdd 100644
index 2b2f4a0f..afac49af 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1101,12 +1101,13 @@ struct server_context {
@@ -997,13 +997,15 @@ struct llama_server_context
        slot.n_sent_text += result.text_to_send.size();
        // add the token to slot queue and cache
    }

-    slot.add_token_string(result);
    if (slot.params.stream) {
+
    if (slot.params.stream)
    {
        send_partial_response(slot, result);
    }
}

+    slot.add_token_string(result);
+
    if (incomplete) {
    if (incomplete)
    {
        slot.has_next_token = true;
    }
15	llm/utils.go
@@ -1,15 +0,0 @@
package llm

import (
	"fmt"
	"time"
)

func parseDurationMs(ms float64) time.Duration {
	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
	if err != nil {
		panic(err)
	}

	return dur
}
@@ -22,6 +22,5 @@ for TARGETARCH in ${BUILD_ARCH}; do
 		.
 	docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
 	docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
-	docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/
 	docker rm builder-$TARGETARCH
 done

@@ -1,7 +1,6 @@
 package server
 
 import (
-	"archive/zip"
 	"bytes"
 	"context"
 	"crypto/sha256"
@@ -24,7 +23,6 @@ import (
 	"golang.org/x/exp/slices"
 
 	"github.com/jmorganca/ollama/api"
-	"github.com/jmorganca/ollama/convert"
 	"github.com/jmorganca/ollama/llm"
 	"github.com/jmorganca/ollama/parser"
 	"github.com/jmorganca/ollama/version"
@@ -318,24 +316,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			c.Args = blobPath
 		}
 
-		pathName := realpath(modelFileDir, c.Args)
-
-		ggufName, err := convertSafetensors(name, pathName)
-		if err != nil {
-			switch {
-			case errors.Is(err, zip.ErrFormat):
-				// it's not a safetensor archive
-			default:
-				return err
-			}
-		}
-
-		if ggufName != "" {
-			pathName = ggufName
-			defer os.RemoveAll(ggufName)
-		}
-
-		bin, err := os.Open(pathName)
+		bin, err := os.Open(realpath(modelFileDir, c.Args))
 		if err != nil {
 			// not a file on disk so must be a model reference
 			modelpath := ParseModelPath(c.Args)
@@ -611,73 +592,6 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 	return nil
 }
 
-func convertSafetensors(name, fn string) (string, error) {
-	r, err := zip.OpenReader(fn)
-	if err != nil {
-		return "", err
-	}
-	defer r.Close()
-
-	tempDir, err := os.MkdirTemp("", "ollama-convert")
-	if err != nil {
-		return "", err
-	}
-	defer os.RemoveAll(tempDir)
-
-	for _, f := range r.File {
-		fpath := filepath.Join(tempDir, f.Name)
-		outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
-		if err != nil {
-			return "", err
-		}
-
-		rc, err := f.Open()
-		if err != nil {
-			return "", err
-		}
-
-		_, err = io.Copy(outFile, rc)
-		if err != nil {
-			return "", err
-		}
-
-		outFile.Close()
-		rc.Close()
-	}
-
-	params, err := convert.GetParams(tempDir)
-	if err != nil {
-		return "", err
-	}
-
-	SupportedArchs := []string{
-		"MistralForCausalLM",
-	}
-
-	for _, arch := range params.Architectures {
-		if !slices.Contains(SupportedArchs, arch) {
-			return "", fmt.Errorf("this safetensors model is not yet supported")
-		}
-	}
-
-	t, err := convert.GetSafeTensors(tempDir)
-	if err != nil {
-		return "", err
-	}
-
-	vocab, err := convert.LoadTokens(tempDir)
-	if err != nil {
-		return "", err
-	}
-
-	fn, err = convert.WriteGGUF(name, t, params, vocab)
-	if err != nil {
-		return "", err
-	}
-
-	return fn, nil
-}
-
 func CopyModel(src, dest string) error {
 	srcModelPath := ParseModelPath(src)
 	srcPath, err := srcModelPath.GetManifestPath()
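The removed helper's first half is a stock unzip-to-temp-dir loop. For reference, a generic standalone sketch of that pattern, separate from the diff; a production version should also guard against zip-slip entries whose names escape the destination via "..":

    package main

    import (
        "archive/zip"
        "fmt"
        "io"
        "os"
        "path/filepath"
    )

    // unzipAll extracts every file in the archive into destDir, mirroring
    // the extraction loop of the removed convertSafetensors helper.
    func unzipAll(archive, destDir string) error {
        r, err := zip.OpenReader(archive)
        if err != nil {
            return err
        }
        defer r.Close()

        for _, f := range r.File {
            fpath := filepath.Join(destDir, f.Name)
            out, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
            if err != nil {
                return err
            }
            rc, err := f.Open()
            if err != nil {
                out.Close()
                return err
            }
            _, err = io.Copy(out, rc)
            out.Close()
            rc.Close()
            if err != nil {
                return err
            }
        }
        return nil
    }

    func main() {
        fmt.Println(unzipAll("model.zip", os.TempDir())) // illustrative paths
    }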
@@ -66,6 +66,8 @@ var defaultSessionDuration = 5 * time.Minute
 
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
 func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
+	workDir := c.GetString("workDir")
+
 	needLoad := loaded.runner == nil || // is there a model loaded?
 		loaded.ModelPath != model.ModelPath || // has the base model changed?
 		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@@ -80,7 +82,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
 		loaded.Options = nil
 	}
 
-	llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+	llmRunner, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
 	if err != nil {
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
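load() now reads the work directory from the request context via c.GetString("workDir"), which implies some middleware put it there. A hypothetical sketch of such wiring with gin, separate from the diff; the real setup in the repo may differ:

    package main

    import "github.com/gin-gonic/gin"

    // workDirMiddleware stores the server work directory on every request
    // context so handlers can fetch it with c.GetString("workDir").
    func workDirMiddleware(workDir string) gin.HandlerFunc {
        return func(c *gin.Context) {
            c.Set("workDir", workDir)
            c.Next()
        }
    }

    func main() {
        r := gin.Default()
        r.Use(workDirMiddleware("/tmp/ollama")) // illustrative path
        r.GET("/ping", func(c *gin.Context) {
            c.String(200, c.GetString("workDir"))
        })
        _ = r.Run(":8080")
    }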
@@ -1033,7 +1035,7 @@ func Serve(ln net.Listener) error {
 		os.Exit(0)
 	}()
 
-	if err := llm.Init(); err != nil {
+	if err := llm.Init(s.WorkDir); err != nil {
 		return fmt.Errorf("unable to initialize llm library %w", err)
 	}
 	if runtime.GOOS == "linux" { // TODO - windows too