Compare commits

..

2 Commits

Author SHA1 Message Date
Matt Williams
fed3843be2 update to resolve jmorganca comments
Signed-off-by: Matt Williams <m@technovangelist.com>
2024-01-04 12:58:07 -08:00
Matt Williams
01d4047ed3 add faq about quant and context
Signed-off-by: Matt Williams <m@technovangelist.com>
2024-01-04 09:45:13 -08:00
30 changed files with 155 additions and 280 deletions

View File

@@ -2,7 +2,7 @@
ollama
app
dist
llm/llama.cpp
llm/llama.cpp/gguf
.env
.cache
test_data

9
.gitmodules vendored
View File

@@ -1,4 +1,5 @@
[submodule "llama.cpp"]
path = llm/llama.cpp
url = https://github.com/ggerganov/llama.cpp.git
shallow = true
[submodule "llm/llama.cpp/gguf"]
path = llm/llama.cpp/gguf
url = https://github.com/ggerganov/llama.cpp.git
ignore = dirty
shallow = true

View File

@@ -1,101 +1,74 @@
ARG GOLANG_VERSION=1.21.3
# Ubuntu 20.04 amd64 dependencies
FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
ARG CUDA_VERSION=11.3.1-1
ARG CMAKE_VERSION=3.22.1
ARG CUDA_VERSION=11.3.1
ARG ROCM_VERSION=5.7.1
# ROCm only supports amd64
ARG ROCM_VERSION=6.0
ARG CLBLAST_VER=1.6.1
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
RUN apt-get update && \
apt-get install -y wget gnupg && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
mkdir --parents --mode=0755 /etc/apt/keyrings && \
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
ARG CMAKE_VERSION
# CLBlast
RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl \
&& yum update -y \
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV ROCM_PATH=/opt/rocm
ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-x86_64.tar.gz /tmp/cmake-$CMAKE_VERSION.tar.gz
RUN tar -zx -C /usr --strip-components 1 </tmp/cmake-$CMAKE_VERSION.tar.gz
# Ubuntu 22.04 arm64 dependencies
FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64
ARG CUDA_VERSION=11.3.1-1
ARG CMAKE_VERSION=3.27.6
RUN apt-get update && \
apt-get install -y wget gnupg && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa//3bf863cc.pub && \
echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
apt-get update && \
apt-cache madison cuda && \
DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION}
FROM base-${TARGETARCH}
ARG TARGETARCH
ARG GOFLAGS="'-ldflags -w -s'"
ARG CGO_CFLAGS
ARG GOLANG_VERSION=1.21.3
# Common toolchain
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10
# install go
ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go${GOLANG_VERSION}.tar.gz
# build the final binary
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
WORKDIR llm/generate
RUN sh gen_linux.sh
ENV GOOS=linux
ENV GOARCH=$TARGETARCH
ENV GOFLAGS=$GOFLAGS
ENV CGO_CFLAGS=${CGO_CFLAGS}
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
ARG CMAKE_VERSION
RUN dnf install -y git cmake
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
WORKDIR llm/generate
RUN sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:$ROCM_VERSION-complete AS rocm-build-amd64
ARG CMAKE_VERSION
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl \
&& yum update -y \
&& yum remove -y git \
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-x86_64.tar.gz /tmp/cmake-$CMAKE_VERSION.tar.gz
RUN tar -zx -C /usr --strip-components 1 </tmp/cmake-$CMAKE_VERSION.tar.gz
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
WORKDIR llm/generate
RUN sh gen_linux.sh
FROM --platform=linux/amd64 centos:7 AS build-amd64
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
ARG GOFLAGS
ARG CGO_FLAGS
RUN yum install -y centos-release-scl \
&& yum update -y \
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ADD https://dl.google.com/go/go$GOLANG_VERSION.linux-amd64.tar.gz /tmp/go-$GOLANG_VERSION.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go-$GOLANG_VERSION.tar.gz
ENV PATH /usr/local/go/bin:$PATH
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cpu/lib llm/llama.cpp/build/linux/cpu/lib
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cuda/lib llm/llama.cpp/build/linux/cuda/lib
COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/rocm/lib llm/llama.cpp/build/linux/rocm/lib
RUN go build .
FROM --platform=linux/arm64 centos:7 AS build-arm64
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
ARG GOFLAGS
ARG CGO_FLAGS
RUN yum install -y centos-release-scl \
&& yum update -y \
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ADD https://dl.google.com/go/go$GOLANG_VERSION.linux-arm64.tar.gz /tmp/go-$GOLANG_VERSION.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go-$GOLANG_VERSION.tar.gz
ENV PATH /usr/local/go/bin:$PATH
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cpu/lib llm/llama.cpp/build/linux/cpu/lib
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cuda/lib llm/llama.cpp/build/linux/cuda/lib
RUN go build .
FROM build-$TARGETARCH
RUN /usr/local/go/bin/go generate ./... && \
/usr/local/go/bin/go build .

View File

@@ -309,13 +309,6 @@ func (c *Client) Heartbeat(ctx context.Context) error {
}
return nil
}
func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
var resp EmbeddingResponse
if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
return nil, err
}
return &resp, nil
}
func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {

View File

@@ -12,7 +12,7 @@ Import models using source model weights found on Hugging Face and similar sites
Installing on Linux in most cases is easy using the script on Ollama.ai. To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](./docker.md)**.
It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.

View File

@@ -112,3 +112,26 @@ This can impact both installing Ollama, as well as downloading models.
Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernet (WSL)` adapter, right click and select `Properties`.
Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these
properties.
## What does the q in the model tag mean? What is quantization?
Whenever you pull a model without a tag, Ollama will actually pull the q4_0 quantization of the model. You can verify this on the tags page. On https://ollama.ai/library/llama2/tags you can see that the hash for the latest tag matches the hash for the 7b model. ![quant hashes](https://github.com/jmorganca/ollama/assets/633681/814b1b78-8205-4845-89f9-e671b3b96085)
Looking at that page for any model, you can see several quantization options available. Quantization is a method of compression that allows the model to fit in less space and thus use less RAM and VRAM on your machine.
At a high level, a model is made of an enormous collection of nodes that determine how to generate text. These nodes are connected at different levels by weights, and the training process adjusts those weights so the model learns to output the right text.
Most of the source models we use start with weights that are 32-bit floating-point numbers. Those weights, along with another concept called biases, make up the parameters. So a source model with 7 billion parameters has 7 billion 32-bit floating-point numbers, plus a description of all the nodes and more. Loading one of those source models therefore needs at least 28 gigabytes of memory.
Quantization turns those 32-bit floating-point weights into much smaller integers. The number next to the q indicates the bit size of the weights, so a q4 model converted those 32-bit floats into 4-bit integers. A 4-bit quantization takes up the space for 7 billion 4-bit integers, plus a little overhead, which comes out to almost 4 gigabytes. Obviously there is some loss of information in going from 28GB to 4GB, but it turns out that in most cases it isn't really noticeable. In fact, even the 2-bit quantization, which fits in less than 3GB, can be very useful.
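To make that arithmetic concrete, here is a rough sketch in TypeScript that estimates the memory needed just to hold the weights at different bit widths. It ignores the extra overhead for node descriptions, context, and file format mentioned above, and the 7-billion figure is simply the example used in this answer:
```typescript
// Rough estimate of the memory needed just to hold a model's weights,
// ignoring context, activations, and file-format overhead.
function weightMemoryGB(parameters: number, bitsPerWeight: number): number {
  return (parameters * bitsPerWeight) / 8 / 1e9;
}

const params = 7_000_000_000; // a 7 billion parameter model, as in the example above

console.log(`fp32 source: ~${weightMemoryGB(params, 32)} GB`); // ~28 GB
console.log(`fp16:        ~${weightMemoryGB(params, 16)} GB`); // ~14 GB
console.log(`q4:          ~${weightMemoryGB(params, 4)} GB`);  // ~3.5 GB
console.log(`q2:          ~${weightMemoryGB(params, 2)} GB`);  // ~1.75 GB
```
Running this prints roughly 28 GB for the 32-bit source, 14 GB for fp16, 3.5 GB for a 4-bit quantization, and 1.75 GB for a 2-bit quantization, which lines up with the figures above.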
There are three major sets of quantizations you will see in the Ollama Library of models: **fp16**, models with just a q and a number, like **q4_0**, and models with a **K** in the tag. The **fp16** model has been converted and quantized from the 32-bit source down to 16 bits. It is about half the size of the 32-bit source model and is the largest quantization we deliver in the library. The **q4_0**, **q4_1**, **q5_0**, and similar models use the two original quantization methods.
The models with a **K** are often referred to as K-quants. This method produces models of similar quality at a smaller size than the original quantization methods. Essentially, it finds clusters of weights and quantizes them together, allowing for higher precision while using the same bit sizes as the regular quantization options. But this requires a set of maps that the model uses to recover the original values, and those lookups have a computational cost. You may see some impact on the speed of models with K-quants compared to the regular quantizations.
## What is context, can I increase it, and why doesn't every model support a huge context?
Context refers to the size of the input you can send to a model and still get sensible output back. Many models have a context size of 2048 tokens. It's sometimes possible to give a model more using the **num_ctx** parameter, but the answers start to degrade; this is because half of the context gets "freed" up to make room for more input. Newer models have been able to increase that context size using different methods. This increase in context size results in a corresponding increase in the memory required, sometimes by orders of magnitude.
> [!WARNING]
> Currently, over-allocating context size may result in model quality or stability issues.
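As a sketch of how **num_ctx** is passed in practice (assuming a local Ollama server on the default port; the model name and the 4096 value here are only placeholders), the option goes in the `options` object of an `/api/generate` request:
```typescript
// A sketch of requesting a larger context window via the num_ctx option.
// As noted in the warning above, quality or stability may suffer once you
// go past the context size the model was trained with.
async function generateWithLargerContext(prompt: string): Promise<string> {
  const response = await fetch("http://localhost:11434/api/generate", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "llama2",             // placeholder model name
      prompt,
      options: { num_ctx: 4096 },  // request a 4096-token context window
      stream: false,
    }),
  });
  const json = await response.json();
  return json.response;
}

generateWithLargerContext("Summarize this long document: ...").then(console.log);
```
Mindful of the warning above, it's worth checking that output quality holds up before relying on a larger window.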

View File

Binary file not shown (before: 27 KiB).

View File

@@ -1,18 +0,0 @@
{
"name": "typescript-airenamer",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"start": "tsx renamer.ts",
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"tsx": "^4.7.0"
},
"devDependencies": {
"@types/node": "^20.10.6"
}
}

View File

@@ -1,29 +0,0 @@
# Renaming Files with AI
![airenamer 2024-01-05 09_09_08](https://github.com/jmorganca/ollama/assets/633681/b98df1c8-61a7-4dff-aeb7-b04e034dced0)
This example applies the benefits of the llava models to managing images. It finds any images in your current directory, generates keywords for each image, and then copies each file to a new name based on those keywords.
## Running the example
1. Clone this repo and navigate to the `examples/typescript-airenamer` directory.
2. Install the dependencies with `npm install`.
3. Run `npm run start`.
## Review the Code
The main part of the code is in the `getkeywords` function. It calls the `/api/generate` endpoint passing in the body:
```json
{
"model": "llava:13b-v1.5-q5_K_M",
"prompt": `Describe the image as a collection of keywords. Output in JSON format. Use the following schema: { filename: string, keywords: string[] }`,
"format": "json",
"images": [image],
"stream": false
}
```
This demonstrates how to use images as well as `format: json` so the output can be passed on to another function. The `images` key takes an array of base64-encoded images, and `format: json` tells the model to output JSON instead of regular text. When using `format: json`, it's important to also state in the prompt that you expect the output to be JSON; adding the expected schema to the prompt helps the model understand what you're looking for.
The `main` function calls `getkeywords`, passing it the base64-encoded image. It then parses the JSON output, formats the keywords into a string, and copies the file to the new name.

View File

@@ -1,42 +0,0 @@
import fs from 'fs';
export async function getkeywords(image: string): Promise<string[]> {
const body = {
"model": "llava:13b-v1.5-q5_K_M",
"prompt": `Describe the image as a collection of keywords. Output in JSON format. Use the following schema: { filename: string, keywords: string[] }`,
"format": "json",
"images": [image],
"stream": false
};
const response = await fetch("http://localhost:11434/api/generate", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify(body),
});
const json = await response.json();
const keywords = JSON.parse(json.response);
return keywords?.keywords || [];
}
async function main() {
for (const file of fs.readdirSync(".")) {
if (file.endsWith(".jpg") || file.endsWith(".png")) {
const currentpath = __dirname;
const b64 = fs.readFileSync(`${currentpath}/${file}`, { encoding: 'base64' });
const keywords = await getkeywords(b64.toString());
const fileparts = keywords.map(k => k.replace(/ /g, "_"));
const fileext = file.split(".").pop();
const newfilename = fileparts.join("-") + "." + fileext;
fs.copyFileSync(`${currentpath}/${file}`, `${currentpath}/${newfilename}`);
console.log(`Copied ${file} to ${newfilename}`);
}
}
}
main();

View File

@@ -20,8 +20,6 @@ const char *cuda_lib_paths[] = {
};
#endif
#define CUDA_LOOKUP_SIZE 5
void cuda_init(cuda_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
@@ -32,12 +30,11 @@ void cuda_init(cuda_init_resp_t *resp) {
struct lookup {
char *s;
void **p;
} l[CUDA_LOOKUP_SIZE] = {
} l[4] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
};
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
@@ -55,7 +52,7 @@ void cuda_init(cuda_init_resp_t *resp) {
return;
}
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
@@ -92,34 +89,22 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return;
}
unsigned int devices;
ret = (*h.getCount)(&devices);
// TODO - handle multiple GPUs
ret = (*h.getHandle)(0, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
snprintf(buf, buflen, "unable to get device handle: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = 0;
resp->free = 0;
for (i = 0; i < devices; i++) {
ret = (*h.getHandle)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
resp->total += memInfo.total;
resp->free += memInfo.free;
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
return;
}
#endif // __APPLE__

View File

@@ -21,7 +21,6 @@ typedef struct cuda_handle {
nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
nvmlReturn_t (*getCount)(unsigned int *);
} cuda_handle_t;
typedef struct cuda_init_resp {

View File

@@ -1,4 +0,0 @@
# Extern C Server
This directory contains a thin facade layered on top of the Llama.cpp server to expose `extern C` interfaces, so the functionality can be accessed through direct in-process API calls.

View File

@@ -1,7 +1,7 @@
package llm
/*
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
@@ -10,17 +10,17 @@ package llm
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libcommon.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libext_server.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libllama.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libggml_static.a
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread

View File

@@ -2,7 +2,7 @@
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp)
add_library(${TARGET} STATIC ../../../ext_server.cpp)
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)

View File

@@ -1,7 +1,8 @@
# common logic across linux and darwin
init_vars() {
LLAMACPP_DIR=../llama.cpp
LLAMACPP_DIR=gguf
PATCHES="0001-Expose-callable-API-for-server.patch"
CMAKE_DEFS=""
CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
@@ -17,24 +18,19 @@ git_module_setup() {
echo "Skipping submodule initialization"
return
fi
# Make sure the tree is clean after the directory moves
if [ -d "${LLAMACPP_DIR}/gguf" ]; then
echo "Cleaning up old submodule"
rm -rf ${LLAMACPP_DIR}
fi
git submodule init
git submodule update --force ${LLAMACPP_DIR}
git submodule update --force gguf
}
apply_patches() {
# Wire up our CMakefile
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
if ! grep ollama gguf/examples/server/CMakeLists.txt; then
echo 'include (../../../CMakeLists.txt) # ollama' >>gguf/examples/server/CMakeLists.txt
fi
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
sed -e 's/int main(/int __main(/g' <./gguf/examples/server/server.cpp >./gguf/examples/server/server.cpp.tmp &&
mv ./gguf/examples/server/server.cpp.tmp ./gguf/examples/server/server.cpp
}
build() {
@@ -53,5 +49,5 @@ install() {
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
(cd gguf/examples/server/ && git checkout CMakeLists.txt server.cpp)
}

View File

@@ -1,6 +1,6 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/
# working directory must be ../llm/llama.cpp
# TODO - add hardening to detect missing tools (cmake, etc.)
@@ -10,7 +10,7 @@ echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
BUILD_DIR="gguf/build/darwin/metal"
case "${GOARCH}" in
"amd64")
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"

View File

@@ -1,6 +1,6 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be llm/generate/
# working directory must be llm/llama.cpp
# First we build our default built-in library which will be linked into the CGO
# binary as a normal dependency. This default build is CPU based.
@@ -52,7 +52,7 @@ apply_patches
# CPU first for the default library
#
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
BUILD_DIR="gguf/build/linux/cpu"
build
install
@@ -64,7 +64,7 @@ if [ -d /usr/local/cuda/lib64/ ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda"
BUILD_DIR="gguf/build/linux/cuda"
CUDA_LIB_DIR=/usr/local/cuda/lib64
build
install
@@ -98,7 +98,7 @@ if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm"
BUILD_DIR="gguf/build/linux/rocm"
build
install
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \

View File

@@ -3,7 +3,7 @@
$ErrorActionPreference = "Stop"
function init_vars {
$script:llamacppDir = "../llama.cpp"
$script:patches = @("0001-Expose-callable-API-for-server.patch")
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
$script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
if ($env:CGO_CFLAGS -contains "-g") {
@@ -19,25 +19,25 @@ function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& git submodule update --force "${script:llamacppDir}"
& git submodule update --force gguf
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
if (!(Select-String -Path "gguf/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "gguf/examples/server/CMakeLists.txt" -Value 'include (../../../CMakeLists.txt) # ollama'
}
# Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
$content = Get-Content -Path "./gguf/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main('
Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
Set-Content -Path "./gguf/examples/server/server.cpp" -Value $content
}
function build {
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
& cmake -S gguf -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
@@ -55,7 +55,7 @@ function install {
}
function cleanup {
Set-Location "${script:llamacppDir}/examples/server"
Set-Location "gguf/examples/server"
git checkout CMakeLists.txt server.cpp
}
@@ -64,20 +64,20 @@ git_module_setup
apply_patches
# first build CPU based
$script:buildDir="${script:llamacppDir}/build/windows/cpu"
$script:buildDir="gguf/build/windows/cpu"
build
install
# Then build cuda as a dynamically loaded library
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/cuda"
$script:buildDir="gguf/build/windows/cuda"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
build
install
# TODO - actually implement ROCm support on windows
$script:buildDir="${script:llamacppDir}/build/windows/rocm"
$script:buildDir="gguf/build/windows/rocm"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null

View File

@@ -1,3 +1,3 @@
package generate
package llm
//go:generate sh ./gen_darwin.sh

View File

@@ -1,3 +1,3 @@
package generate
package llm
//go:generate bash ./gen_linux.sh

View File

@@ -1,3 +1,3 @@
package generate
package llm
//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1

View File

@@ -13,7 +13,7 @@ import (
"github.com/jmorganca/ollama/api"
)
//go:embed llama.cpp/ggml-metal.metal
//go:embed llama.cpp/gguf/ggml-metal.metal
var libEmbed embed.FS
func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
@@ -22,7 +22,7 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin
}
func nativeInit(workdir string) error {
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
err := extractPayloadFiles(workdir, "llama.cpp/gguf/ggml-metal.metal")
if err != nil {
if err == payloadMissing {
// TODO perhaps consider this a hard failure on arm macs?

View File

@@ -34,8 +34,6 @@ type shimExtServer struct {
var shimMutex sync.Mutex
var llm *shimExtServer
const pathComponentCount = 6
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_init(llm.s, sparams, err)
}
@@ -114,7 +112,7 @@ func (llm *shimExtServer) Close() {
}
func nativeInit(workdir string) error {
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/*/*/lib/*")
if err != nil {
if err == payloadMissing {
log.Printf("%s", payloadMissing)
@@ -153,13 +151,13 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
if len(pathComps) != 7 {
log.Printf("unexpected payload components: %v", pathComps)
continue
}
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
// llama.cpp/gguf/build/$OS/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
targetDir := filepath.Join(workDir, pathComps[4])
srcFile, err := libEmbed.Open(file)
if err != nil {
return nil, fmt.Errorf("read payload %s: %v", file, err)

View File

@@ -10,7 +10,7 @@ import (
"strings"
)
//go:embed llama.cpp/build/*/*/lib/*.so
//go:embed llama.cpp/gguf/build/*/*/lib/*.so
var libEmbed embed.FS
func updatePath(dir string) {

View File

@@ -8,7 +8,7 @@ import (
"strings"
)
//go:embed llama.cpp/build/windows/*/lib/*.dll
//go:embed llama.cpp/gguf/build/windows/*/lib/*.dll
var libEmbed embed.FS
func updatePath(dir string) {

View File

@@ -8,7 +8,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
mkdir -p dist
for TARGETARCH in amd64 arm64; do
docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
docker build --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
docker rm builder-$TARGETARCH