Compare commits


1 Commit

Author: Jeffrey Morgan
SHA1: 6023044af3
Message: replace assets on load
Date: 2024-03-08 00:44:22 -08:00
28 changed files with 243 additions and 465 deletions

View File

@@ -1,9 +1,8 @@
.vscode
ollama
app
macapp
dist
llm/llama.cpp
.env
.cache
test_data
test_data

View File

@@ -42,11 +42,11 @@ ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN mkdir /tmp/scratch && \
for dep in $(cat /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
cp ${dep} /tmp/scratch/ || exit 1 ; \
cp ${dep} /tmp/scratch/ || exit 1 ; \
done && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
mkdir -p /go/src/github.com/jmorganca/ollama/dist/deps/ && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/jmorganca/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
(cd /tmp/scratch/ && tar czvf /go/src/github.com/jmorganca/ollama/dist/deps/rocm-amd64-deps.tgz . )
FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64

View File

@@ -103,7 +103,7 @@ func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
return []llm.Tensor{}, 0, err
}
shape := [4]uint64{1, 1, 1, 1}
shape := [4]uint64{0, 0, 0, 0}
for cnt, s := range data.Shape {
shape[cnt] = uint64(s)
}
@@ -112,7 +112,7 @@ func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
Name: ggufName,
Kind: kind,
Offset: offset,
Shape: shape[:],
Shape: shape,
FileName: fn,
OffsetPadding: 8 + jsonSize,
FileOffsets: []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},

View File

@@ -1,21 +1,25 @@
# Documentation
### Getting Started
* [Quickstart](../README.md#quickstart)
* [Examples](../examples)
* [Importing models](./import.md) from GGUF, PyTorch, and Safetensors
* [Linux Documentation](./linux.md)
* [Windows Documentation](./windows.md)
* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
To get started, see the project's **[quickstart](../README.md#quickstart)**.
### Reference
Ollama is a tool for running AI models on your hardware. Many users will choose to use the Command Line Interface (CLI) to work with Ollama. Learn more about all of the CLI commands in the **[Main Readme](../README.md)**.
* [API Reference](./api.md)
* [Modelfile Reference](./modelfile.md)
* [OpenAI Compatibility](./openai.md)
Use the RESTful API from any language, including Python, JavaScript, TypeScript, Go, Rust, and many more. Learn more about using the API in the **[API Documentation](./api.md)**.
### Resources
Create new models or modify models already in the library using the Modelfile. Learn more about the Modelfile syntax in the **[Modelfile Documentation](./modelfile.md)**.
* [Troubleshooting Guide](./troubleshooting.md)
* [FAQ](./faq.md)
* [Development guide](./development.md)
Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.
In most cases, installing on Linux is easy using the script at [ollama.com/download](https://ollama.com/download). For more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
Many of our users like the flexibility of using our official Docker image. Learn more about using Docker with Ollama in the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
It is easy to install Ollama on Linux and macOS, but many users will choose to build it themselves. To do this, refer to the **[Development Documentation](./development.md)**.
If you encounter a problem with Ollama, the best place to start is the logs. Find more information about them in the **[Troubleshooting Guide](./troubleshooting.md)**.
Finally, for all the questions that don't fit anywhere else, there is the **[FAQ](./faq.md)**.
[Tutorials](./tutorials.md) apply the documentation to tasks.
For working code examples of using Ollama, see [Examples](../examples).

View File

@@ -135,10 +135,3 @@ go build .
In addition to the common Windows development tools described above, install CUDA **AFTER** you install MSVC.
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
#### Windows ROCm (AMD Radeon)
In addition to the common Windows development tools described above, install AMD's HIP package **AFTER** you install MSVC.
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)

View File

@@ -72,11 +72,6 @@ Verify that the drivers are installed by running the following command, which sh
nvidia-smi
```
### Install ROCm (optional - for Radeon GPUs)
[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
Make sure to install ROCm v6
### Start Ollama
Start Ollama using `systemd`:

View File

@@ -70,36 +70,30 @@ cat /proc/cpuinfo| grep flags | head -1
## AMD Radeon GPU Support
Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
some cases you can force the system to try to use a similar LLVM target that is
close. For example, the Radeon RX 5400 is `gfx1034` (also known as 10.3.4);
however, ROCm does not currently support this target. The closest supported target is
`gfx1030`. You can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with
`x.y.z` syntax. So for example, to force the system to run on the RX 5400, you
would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
server. If you have an unsupported AMD GPU you can experiment using the list of
supported types below.
some cases you can force the system to try to use a close GPU type. For example,
the Radeon RX 5400 is `gfx1034` (also known as 10.3.4); however, ROCm does not
support this patch level. The closest supported target is `gfx1030`. You can use the
environment variable `HSA_OVERRIDE_GFX_VERSION` with `x.y.z` syntax. So for
example, to force the system to run on the RX 5400, you would set
`HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the server.
At this time, the known supported GPU types are the following LLVM Targets.
This table shows some example GPUs that map to these LLVM targets:
| **LLVM Target** | **An Example GPU** |
|-----------------|---------------------|
| gfx900 | Radeon RX Vega 56 |
| gfx906 | Radeon Instinct MI50 |
| gfx908 | Radeon Instinct MI100 |
| gfx90a | Radeon Instinct MI210 |
| gfx940 | Radeon Instinct MI300 |
| gfx941 | |
| gfx942 | |
| gfx1030 | Radeon PRO V620 |
| gfx1100 | Radeon PRO W7900 |
| gfx1101 | Radeon PRO W7700 |
| gfx1102 | Radeon RX 7600 |
At this time, the known supported GPU types are the following (this may change
from release to release):
- gfx900
- gfx906
- gfx908
- gfx90a
- gfx940
- gfx941
- gfx942
- gfx1030
- gfx1100
- gfx1101
- gfx1102
AMD is working on enhancing ROCm v6 to broaden support for more GPU families in a
future release.
This will not work for all unsupported GPUs. Reach out on [Discord](https://discord.gg/ollama)
or file an [issue](https://github.com/ollama/ollama/issues) for additional help.
Reach out on [Discord](https://discord.gg/ollama) or file an
[issue](https://github.com/ollama/ollama/issues) for additional help.
## Installing older versions on Linux

View File

@@ -11,11 +11,14 @@ import (
"slices"
"strconv"
"strings"
"github.com/jmorganca/ollama/version"
)
// Discovery logic for AMD/ROCm GPUs
const (
curlMsg = "curl -fsSL https://github.com/ollama/ollama/releases/download/v%s/rocm-amd64-deps.tgz | tar -zxf - -C %s"
DriverVersionFile = "/sys/module/amdgpu/version"
AMDNodesSysfsDir = "/sys/class/kfd/kfd/topology/nodes/"
GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"
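Editor's note: the `curlMsg` format string shown here takes two arguments, the release version and the directory the archive should be unpacked into (the later `slog.Warn` call passes `version.Version` and `rocmTargetDir`). A minimal sketch of how the rendered warning looks, using hypothetical values for both:

```go
package main

import "fmt"

const curlMsg = "curl -fsSL https://github.com/ollama/ollama/releases/download/v%s/rocm-amd64-deps.tgz | tar -zxf - -C %s"

func main() {
	// Hypothetical version and target directory, purely for illustration.
	fmt.Printf(curlMsg+"\n", "0.1.28", "/home/user/.ollama/assets/rocm")
	// curl -fsSL https://github.com/ollama/ollama/releases/download/v0.1.28/rocm-amd64-deps.tgz | tar -zxf - -C /home/user/.ollama/assets/rocm
}
```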
@@ -275,37 +278,22 @@ func setupLink(source, target string) error {
func AMDValidateLibDir() (string, error) {
// We rely on the rpath compiled into our library to find rocm
// so we establish a symlink to wherever we find it on the system
// to <payloads>/rocm
payloadsDir, err := PayloadsDir()
if err != nil {
return "", err
}
// to $AssetsDir/rocm
// If we already have a rocm dependency wired, nothing more to do
rocmTargetDir := filepath.Join(payloadsDir, "rocm")
assetsDir, err := AssetsDir()
if err != nil {
return "", fmt.Errorf("unable to lookup lib dir: %w", err)
}
// Versioned directory
rocmTargetDir := filepath.Join(assetsDir, "rocm")
if rocmLibUsable(rocmTargetDir) {
return rocmTargetDir, nil
}
// next to the running binary
exe, err := os.Executable()
if err == nil {
peerDir := filepath.Dir(exe)
if rocmLibUsable(peerDir) {
slog.Debug("detected ROCM next to ollama executable " + peerDir)
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
}
peerDir = filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(peerDir) {
slog.Debug("detected ROCM next to ollama executable " + peerDir)
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
}
}
// Well known ollama installer path
installedRocmDir := "/usr/share/ollama/lib/rocm"
if rocmLibUsable(installedRocmDir) {
return rocmTargetDir, setupLink(installedRocmDir, rocmTargetDir)
// Parent dir (unversioned)
commonRocmDir := filepath.Join(filepath.Dir(assetsDir), "rocm")
if rocmLibUsable(commonRocmDir) {
return rocmTargetDir, setupLink(commonRocmDir, rocmTargetDir)
}
// Prefer explicit HIP env var
@@ -334,9 +322,14 @@ func AMDValidateLibDir() (string, error) {
if rocmLibUsable("/opt/rocm/lib") {
return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
}
err = os.MkdirAll(rocmTargetDir, 0755)
if err != nil {
return "", fmt.Errorf("failed to create empty rocm dir %s %w", rocmTargetDir, err)
}
// If we still haven't found a usable rocm, the user will have to install it on their own
slog.Warn("amdgpu detected, but no compatible rocm library found. Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
// If we still haven't found a usable rocm, the user will have to download it on their own
slog.Warn("amdgpu detected, but no compatible rocm library found. Either install rocm v6, or run the following")
slog.Warn(fmt.Sprintf(curlMsg, version.Version, rocmTargetDir))
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
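Editor's note: the hunks above wire a discovered ROCm install into the `rocm` target directory via `setupLink(source, target)`, whose body is not part of this diff. A minimal sketch, assuming the helper simply replaces the target with a symlink to the source (the real implementation may differ):

```go
package gpu_sketch

import (
	"fmt"
	"os"
)

// setupLinkSketch is a hypothetical stand-in for setupLink(source, target):
// remove whatever is at target, then point it at source.
func setupLinkSketch(source, target string) error {
	if err := os.RemoveAll(target); err != nil {
		return fmt.Errorf("failed to remove existing %s: %w", target, err)
	}
	if err := os.Symlink(source, target); err != nil {
		return fmt.Errorf("failed to link %s to %s: %w", source, target, err)
	}
	return nil
}
```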

View File

@@ -140,7 +140,7 @@ func AMDValidateLibDir() (string, error) {
// $LibDir/rocm, we instead rely on setting PATH to point
// to the location of the ROCm library
// Installer payload location if we're running the installed binary
// Installer payload location
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
@@ -150,12 +150,13 @@ func AMDValidateLibDir() (string, error) {
}
}
// Installer payload (if we're running from some other location)
localAppData := os.Getenv("LOCALAPPDATA")
appDir := filepath.Join(localAppData, "Programs", "Ollama")
rocmTargetDir := filepath.Join(appDir, "rocm")
// If we already have a rocm dependency wired, nothing more to do
libDir, err := AssetsDir()
if err != nil {
return "", fmt.Errorf("unable to lookup lib dir: %w", err)
}
rocmTargetDir := filepath.Join(libDir, "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
return rocmTargetDir, nil
}
@@ -174,7 +175,16 @@ func AMDValidateLibDir() (string, error) {
return RocmStandardLocation, nil
}
// Installer payload (if we're running from some other location)
localAppData := os.Getenv("LOCALAPPDATA")
appDir := filepath.Join(localAppData, "Programs", "Ollama")
rocmTargetDir = filepath.Join(appDir, "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
return rocmTargetDir, nil
}
// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm v6")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}

View File

@@ -7,37 +7,15 @@ import (
"path/filepath"
"runtime"
"strings"
"sync"
)
var (
lock sync.Mutex
payloadsDir = ""
)
func PayloadsDir() (string, error) {
lock.Lock()
defer lock.Unlock()
if payloadsDir == "" {
tmpDir, err := os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
payloadsDir = tmpDir
func AssetsDir() (string, error) {
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
return payloadsDir, nil
}
func Cleanup() {
lock.Lock()
defer lock.Unlock()
if payloadsDir != "" {
slog.Debug("cleaning up", "dir", payloadsDir)
err := os.RemoveAll(payloadsDir)
if err != nil {
slog.Warn("failed to clean up", "dir", payloadsDir, "err", err)
}
}
return filepath.Join(home, ".ollama", "assets"), nil
}
func UpdatePath(dir string) {
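Editor's note: the net effect of this file is that extracted payloads live in a stable per-user directory rather than a fresh temp dir each run. A short usage sketch (error handling and the printed path are illustrative):

```go
package main

import (
	"fmt"
	"log"
	"path/filepath"

	"github.com/jmorganca/ollama/gpu"
)

func main() {
	// Resolves to ~/.ollama/assets with the AssetsDir variant shown above.
	dir, err := gpu.AssetsDir()
	if err != nil {
		log.Fatal(err)
	}
	// The rocm symlink target used by AMDValidateLibDir lives underneath it.
	fmt.Println(filepath.Join(dir, "rocm"))
}
```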

View File

@@ -125,7 +125,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
return;
}
llama->init();
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());

View File

@@ -18,19 +18,6 @@ sign() {
fi
}
# bundle_metal bundles ggml-common.h and ggml-metal.metal into a single file
bundle_metal() {
grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
echo '#define GGML_COMMON_IMPL_METAL' > "${LLAMACPP_DIR}/ggml-metal.metal"
cat "${LLAMACPP_DIR}/ggml-common.h" | grep -v '#pragma once' >> "${LLAMACPP_DIR}/ggml-metal.metal"
cat "${LLAMACPP_DIR}/ggml-metal.metal.temp" >> "${LLAMACPP_DIR}/ggml-metal.metal"
rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
}
cleanup_metal() {
(cd ${LLAMACPP_DIR} && git checkout ggml-metal.metal)
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
case "${GOARCH}" in
@@ -73,14 +60,12 @@ case "${GOARCH}" in
compress_libs
;;
"arm64")
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
bundle_metal
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
compress_libs
cleanup_metal
;;
*)
echo "GOARCH must be set"

View File

@@ -191,10 +191,6 @@ if [ -d "${ROCM_PATH}" ]; then
# Record the ROCM dependencies
rm -f "${BUILD_DIR}/lib/deps.txt"
touch "${BUILD_DIR}/lib/deps.txt"
# having the execstack bit set on the HIP runtime sometimes causes `ldd` to error
execstack -c "${ROCM_PATH}/lib/libamdhip64.so*"
for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
done

View File

@@ -1,152 +0,0 @@
package llm
import (
"encoding/binary"
"errors"
"io"
"slices"
)
type ContainerGGLA struct {
version uint32
}
func (c *ContainerGGLA) Name() string {
return "ggla"
}
func (c *ContainerGGLA) Decode(rso *readSeekOffset) (model, error) {
binary.Read(rso, binary.LittleEndian, &c.version)
switch c.version {
case 1:
default:
return nil, errors.New("invalid version")
}
model := newModelGGLA(c)
err := model.decode(rso)
return model, err
}
type ModelGGLA struct {
*ContainerGGLA
kv KV
tensors []Tensor
}
func newModelGGLA(container *ContainerGGLA) *ModelGGLA {
return &ModelGGLA{
ContainerGGLA: container,
kv: make(KV),
}
}
func (m *ModelGGLA) decode(rso *readSeekOffset) error {
var r uint32
if err := binary.Read(rso, binary.LittleEndian, &r); err != nil {
return err
}
m.kv["r"] = r
var alpha uint32
if err := binary.Read(rso, binary.LittleEndian, &alpha); err != nil {
return err
}
m.kv["alpha"] = alpha
for {
var dims uint32
if err := binary.Read(rso, binary.LittleEndian, &dims); err != nil {
return err
}
var namesize uint32
if err := binary.Read(rso, binary.LittleEndian, &namesize); err != nil {
return err
}
var t Tensor
if err := binary.Read(rso, binary.LittleEndian, &t.Kind); err != nil {
return err
}
t.Shape = make([]uint64, dims)
for i := 0; uint32(i) < dims; i++ {
var shape32 uint32
if err := binary.Read(rso, binary.LittleEndian, &shape32); err != nil {
return err
}
t.Shape[i] = uint64(shape32)
}
// ggla tensor shape is reversed
// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
slices.Reverse(t.Shape)
name := make([]byte, namesize)
if err := binary.Read(rso, binary.LittleEndian, &name); err != nil {
return err
}
t.Name = string(name)
if _, err := rso.Seek((rso.offset+31)&-32, io.SeekStart); err != nil {
return err
}
t.Offset = uint64(rso.offset)
if _, err := rso.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
return err
}
m.tensors = append(m.tensors, t)
}
}
func (m *ModelGGLA) KV() KV {
return m.kv
}
func (m *ModelGGLA) Tensor() []Tensor {
return m.tensors
}
func (*ModelGGLA) ModelFamily() string {
return "ggla"
}
func (*ModelGGLA) ModelType() string {
panic("not implemented")
}
func (*ModelGGLA) FileType() string {
panic("not implemented")
}
func (*ModelGGLA) NumLayers() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumGQA() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumEmbed() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumHead() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumHeadKv() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumCtx() uint32 {
panic("not implemented")
}
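Editor's note: one detail worth calling out in this decoder is the seek to `(rso.offset+31)&-32`, which rounds the current offset up to the next 32-byte boundary before reading tensor data. A small sketch of the same arithmetic (the helper name is ours, not the repo's):

```go
package ggla_sketch

// alignUp32 mirrors the (offset+31)&-32 seek above: -32 is a mask with the
// low five bits clear, so adding 31 and masking rounds n up to the next
// multiple of 32 (e.g. alignUp32(100) == 128, alignUp32(128) == 128).
func alignUp32(n int64) int64 {
	return (n + 31) & -32
}
```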

View File

@@ -106,6 +106,32 @@ type container interface {
Decode(*readSeekOffset) (model, error)
}
type containerLORA struct {
version uint32
}
func (c *containerLORA) Name() string {
return "ggla"
}
func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
var version uint32
binary.Read(rso, binary.LittleEndian, &version)
switch version {
case 1:
default:
return nil, errors.New("invalid version")
}
c.version = version
// remaining file contents aren't decoded
rso.Seek(0, io.SeekEnd)
return nil, nil
}
const (
// Magic constant for `ggml` files (unversioned).
FILE_MAGIC_GGML = 0x67676d6c
@@ -135,7 +161,7 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
return nil, ErrUnsupportedFormat
case FILE_MAGIC_GGLA:
c = &ContainerGGLA{}
c = &containerLORA{}
case FILE_MAGIC_GGUF_LE:
c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
case FILE_MAGIC_GGUF_BE:
@@ -145,9 +171,7 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
}
model, err := c.Decode(&ro)
if errors.Is(err, io.EOF) {
// noop
} else if err != nil {
if err != nil {
return nil, err
}

View File

@@ -94,7 +94,7 @@ type Tensor struct {
Offset uint64
// shape is the number of elements in each dimension
Shape []uint64
Shape [4]uint64
FileName string
OffsetPadding uint64
@@ -156,11 +156,7 @@ func (t Tensor) TypeSize() uint64 {
}
func (t Tensor) Parameters() uint64 {
var count uint64 = 1
for _, n := range t.Shape {
count *= n
}
return count
return t.Shape[0] * t.Shape[1] * t.Shape[2] * t.Shape[3]
}
func (t Tensor) Size() uint64 {
@@ -707,7 +703,7 @@ func (llm *GGUFModel) Decode(rso *readSeekOffset) error {
Name: name,
Kind: llm.readU32(rso),
Offset: llm.readU64(rso),
Shape: shape[:],
Shape: shape,
}
llm.Tensors = append(llm.Tensors, tensor)
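Editor's note: the two hunks above switch `Tensor.Shape` between a slice and a fixed `[4]uint64`, with `Parameters()` computed as the product of the shape entries. This ties back to the safetensors reader earlier in this compare: with a fixed-length shape, unused trailing dimensions must default to 1 (not 0) for the product to equal the element count. A hedged sketch of that calculation, trimmed to the relevant field:

```go
package gguf_sketch

// tensor keeps only the shape field relevant to the element-count math.
type tensor struct {
	Shape [4]uint64
}

// parameters is the product of all four dimensions, as in the fixed-array
// version of Parameters() above.
func (t tensor) parameters() uint64 {
	return t.Shape[0] * t.Shape[1] * t.Shape[2] * t.Shape[3]
}

func example() uint64 {
	// A 2D 4096x4096 weight: trailing dims default to 1, so the product is
	// 16,777,216. Had they defaulted to 0, parameters() would return 0.
	t := tensor{Shape: [4]uint64{4096, 4096, 1, 1}}
	return t.parameters()
}
```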

View File

@@ -6,7 +6,6 @@ import (
"log/slog"
"os"
"runtime"
"slices"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
@@ -20,10 +19,6 @@ type LLM interface {
Close()
}
var cpuOnlyFamilies = []string{
"mamba",
}
func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
@@ -53,18 +48,13 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
size := ggml.Size
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.NumGQA()) * kv / 6
// certain model architectures don't support gpu inference yet
if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
opts.NumGPU = 0
}
info := gpu.GetGPUInfo()
switch runtime.GOOS {
case "darwin":
@@ -73,7 +63,9 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
}
if size+kv+graph > vram {
slog.Info("not enough vram available, setting num_gpu=0")
slog.Info("not enough vram available, falling back to CPU only")
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
opts.NumGPU = 0
break
}
@@ -151,25 +143,15 @@ func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []stri
}
}
// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
_, err := os.Stat(dynLibs[0])
if err != nil {
slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
err = nativeInit()
if err != nil {
return nil, err
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
err := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil {
return srv, nil
}
slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
err2 = err
err = err
}
return nil, err2
return nil, err
}

View File

@@ -1,10 +1,10 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index b14cca61..02bfd4b1 100644
index f255ad76..5b83acb1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -29,6 +29,10 @@
@@ -28,6 +28,10 @@
#include <thread>
#include <signal.h>
#include <memory>
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
@@ -13,7 +13,7 @@ index b14cca61..02bfd4b1 100644
using json = nlohmann::json;
bool server_verbose = false;
@@ -664,6 +668,10 @@ struct server_context {
@@ -648,6 +652,10 @@ struct server_context {
llama_free_model(model);
model = nullptr;
}
@@ -24,7 +24,7 @@ index b14cca61..02bfd4b1 100644
}
bool load_model(const gpt_params & params_) {
@@ -3499,6 +3507,7 @@ int main(int argc, char ** argv) {
@@ -3339,6 +3347,7 @@ int main(int argc, char ** argv) {
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
@@ -33,10 +33,10 @@ index b14cca61..02bfd4b1 100644
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c207ff87..945708a4 100644
index 72bcec8c..50a45e3d 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -46,6 +46,7 @@
@@ -43,6 +43,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
@@ -44,7 +44,7 @@ index c207ff87..945708a4 100644
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -8014,10 +8015,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
@@ -58,7 +58,7 @@ index c207ff87..945708a4 100644
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -8027,7 +8028,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -67,7 +67,7 @@ index c207ff87..945708a4 100644
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
@@ -8098,7 +8099,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
@@ -76,12 +76,11 @@ index c207ff87..945708a4 100644
g_cublas_loaded = true;
}
}
@@ -11753,3 +11754,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
@@ -12490,3 +12491,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {

View File

@@ -1,13 +0,0 @@
diff --git a/llama.cpp b/llama.cpp
index b19616e8..519b9602 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9938,7 +9938,7 @@ struct llm_tokenizer_wpm {
}
uint32_t to_lower(uint32_t code) {
- static const std::locale locale("en_US.UTF-8");
+ static const std::locale locale("");
#if defined(_WIN32)
if (code > 0xFFFF) {
return code;

View File

@@ -104,14 +104,31 @@ func rocmDynLibPresent() bool {
}
func nativeInit() error {
payloadsDir, err := gpu.PayloadsDir()
slog.Info("Extracting dynamic libraries...")
assetsDir, err := gpu.AssetsDir()
if err != nil {
return err
}
slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
// delete the assetsDir
if err := os.RemoveAll(assetsDir); err != nil {
return err
}
libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
if runtime.GOOS == "darwin" {
err := extractPayloadFiles(assetsDir, "llama.cpp/ggml-metal.metal")
if err != nil {
if err == payloadMissing {
// TODO perhaps consider this a hard failure on arm macs?
slog.Info("ggml-metal.metal payload missing")
return nil
}
return err
}
os.Setenv("GGML_METAL_PATH_RESOURCES", assetsDir)
}
libs, err := extractDynamicLibs(assetsDir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if err == payloadMissing {
slog.Info(fmt.Sprintf("%s", payloadMissing))
@@ -142,7 +159,7 @@ func nativeInit() error {
return nil
}
func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
@@ -161,14 +178,14 @@ func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
targetDir := filepath.Join(assetsDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err)
}
src := io.Reader(srcFile)
filename := file
@@ -199,6 +216,52 @@ func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
return libs, g.Wait()
}
func extractPayloadFiles(assetsDir, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return payloadMissing
}
for _, file := range files {
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(assetsDir, 0o755); err != nil {
return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(assetsDir, filepath.Base(filename))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFp.Close()
if _, err := io.Copy(destFp, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
case err == nil:
slog.Debug("payload already exists: " + destFile)
}
}
return nil
}
func verifyDriverAccess() error {
if runtime.GOOS != "linux" {
return nil

View File

@@ -4,5 +4,5 @@ import (
"embed"
)
//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS

View File

@@ -19,9 +19,10 @@ type Buffer struct {
func NewBuffer(prompt *Prompt) (*Buffer, error) {
fd := int(os.Stdout.Fd())
width, height := 80, 24
if termWidth, termHeight, err := term.GetSize(fd); err == nil {
width, height = termWidth, termHeight
width, height, err := term.GetSize(fd)
if err != nil {
fmt.Println("Error getting size:", err)
return nil, err
}
lwidth := width - len(prompt.prompt())

View File

@@ -22,10 +22,6 @@ for TARGETARCH in ${BUILD_ARCH}; do
.
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
if [ "$TARGETARCH" = "amd64" ]; then
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/
fi
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/
docker rm builder-$TARGETARCH
done

View File

@@ -9,7 +9,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
# Centos 7 derivatives have too old of a git version to run our generate script
# uninstall and ignore failures
yum remove -y git
yum -y install epel-release centos-release-scl prelink
yum -y install epel-release centos-release-scl
yum -y install dnf
if [ "${MACHINE}" = "x86_64" ]; then
yum -y install https://repo.ius.io/ius-release-el7.rpm

View File

@@ -10,7 +10,6 @@ import (
"errors"
"fmt"
"io"
"io/fs"
"log"
"log/slog"
"net/http"
@@ -323,12 +322,9 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
ggufName, err := convertSafetensors(name, pathName)
if err != nil {
var pathErr *fs.PathError
switch {
case errors.Is(err, zip.ErrFormat):
// it's not a safetensor archive
case errors.As(err, &pathErr):
// it's not a file on disk, could be a model reference
default:
return err
}
@@ -473,13 +469,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
}
defer bin.Close()
ggml, err := llm.DecodeGGML(bin)
if err != nil {
return err
}
sr := io.NewSectionReader(bin, 0, ggml.Size)
layer, err := NewLayer(sr, mediatype)
layer, err := NewLayer(bin, mediatype)
if err != nil {
return err
}

View File

@@ -10,7 +10,6 @@ import (
"log/slog"
"net"
"net/http"
"net/netip"
"os"
"os/signal"
"path/filepath"
@@ -36,7 +35,7 @@ import (
var mode string = gin.DebugMode
type Server struct {
addr net.Addr
WorkDir string
}
func init() {
@@ -905,83 +904,15 @@ var defaultAllowOrigins = []string{
"0.0.0.0",
}
func isLocalIP(ip netip.Addr) bool {
if interfaces, err := net.Interfaces(); err == nil {
for _, iface := range interfaces {
addrs, err := iface.Addrs()
if err != nil {
continue
}
for _, a := range addrs {
if parsed, _, err := net.ParseCIDR(a.String()); err == nil {
if parsed.String() == ip.String() {
return true
}
}
}
}
func NewServer() (*Server, error) {
workDir, err := os.MkdirTemp("", "ollama")
if err != nil {
return nil, err
}
return false
}
func allowedHost(host string) bool {
if host == "" || host == "localhost" {
return true
}
if hostname, err := os.Hostname(); err == nil && host == hostname {
return true
}
var tlds = []string{
"localhost",
"local",
"internal",
}
// check if the host is a local TLD
for _, tld := range tlds {
if strings.HasSuffix(host, "."+tld) {
return true
}
}
return false
}
func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
return func(c *gin.Context) {
if addr == nil {
c.Next()
return
}
if addr, err := netip.ParseAddrPort(addr.String()); err == nil && !addr.Addr().IsLoopback() {
c.Next()
return
}
host, _, err := net.SplitHostPort(c.Request.Host)
if err != nil {
host = c.Request.Host
}
if addr, err := netip.ParseAddr(host); err == nil {
if addr.IsLoopback() || addr.IsPrivate() || addr.IsUnspecified() || isLocalIP(addr) {
c.Next()
return
}
}
if allowedHost(host) {
c.Next()
return
}
c.AbortWithStatus(http.StatusForbidden)
}
return &Server{
WorkDir: workDir,
}, nil
}
func (s *Server) GenerateRoutes() http.Handler {
@@ -1007,7 +938,10 @@ func (s *Server) GenerateRoutes() http.Handler {
r := gin.Default()
r.Use(
cors.New(config),
allowedHostsMiddleware(s.addr),
func(c *gin.Context) {
c.Set("workDir", s.WorkDir)
c.Next()
},
)
r.POST("/api/pull", PullModelHandler)
@@ -1076,7 +1010,10 @@ func Serve(ln net.Listener) error {
}
}
s := &Server{addr: ln.Addr()}
s, err := NewServer()
if err != nil {
return err
}
r := s.GenerateRoutes()
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
@@ -1092,7 +1029,7 @@ func Serve(ln net.Listener) error {
if loaded.runner != nil {
loaded.runner.Close()
}
gpu.Cleanup()
os.RemoveAll(s.WorkDir)
os.Exit(0)
}()
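Editor's note: one side of this hunk stashes the per-server work directory on the request context via `c.Set("workDir", s.WorkDir)`. A hypothetical handler reading it back (not code from the repo, just a sketch of the gin pattern):

```go
package routes_sketch

import "github.com/gin-gonic/gin"

// hypotheticalHandler shows how a route could read the work directory that
// the middleware stored with c.Set("workDir", s.WorkDir).
func hypotheticalHandler(c *gin.Context) {
	workDir := c.GetString("workDir") // "" if the key was never set
	c.JSON(200, gin.H{"workDir": workDir})
}
```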

View File

@@ -21,6 +21,12 @@ import (
"github.com/jmorganca/ollama/version"
)
func setupServer(t *testing.T) (*Server, error) {
t.Helper()
return NewServer()
}
func Test_Routes(t *testing.T) {
type testCase struct {
Name string
@@ -201,7 +207,9 @@ func Test_Routes(t *testing.T) {
},
}
s := Server{}
s, err := setupServer(t)
assert.Nil(t, err)
router := s.GenerateRoutes()
httpSrv := httptest.NewServer(router)