⬆️ Update ggerganov/llama.cpp (#2696 )

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
fix(initializer): do select backends that exist (#2694 )
2026-02-03 11:13:31 -05:00 · 2024-07-01 22:52:43 +02:00 · 2024-07-01 22:50:36 +02:00 · 2024-07-01 18:11:04 +02:00
3 changed files with 60 additions and 101 deletions
--- a/103
+++ b/103
@@ -5,12 +5,6 @@ ARG INTEL_BASE_IMAGE=${BASE_IMAGE}

 # The requirements-core target is common to all images.  It should not be placed in requirements-core unless every single build will use it.
 FROM ${BASE_IMAGE} AS requirements-core
-# TODO(mudler): install all accellerators here
-# and use make dist instead of build.
-# TODO(mudler): modify make dist to build also go-piper and stablediffusion
-# This way the same binary can work for everything(!)
-# TODO(mudler): also make sure that we bundle all the required libs in the backend-assets/lib
-# For the GPU-accell we are going to generate a tar file instead that will be extracted by the bash installer, and the libs will also be installed in the final docker image, so no need to pull ALL the dependencies

 USER root

@@ -55,12 +49,10 @@ ENV PATH /usr/local/cuda/bin:${PATH}
 # HipBLAS requirements
 ENV PATH /opt/rocm/bin:${PATH}

-# OpenBLAS requirements and stable diffusion, tts (espeak)
+# OpenBLAS requirements and stable diffusion
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libopenblas-dev \
-        espeak-ng \
-        espeak \
        libopencv-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
@@ -85,6 +77,8 @@ ENV PATH="/root/.cargo/bin:${PATH}"
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
+        espeak-ng \
+        espeak \
        python3-pip \
        python-is-python3 \
        python3-dev \
@@ -99,8 +93,9 @@ RUN pip install --user grpcio-tools
 ###################################
 ###################################

-# Base image for the build-type. 
-FROM requirements-${IMAGE_TYPE} AS run-requirements-drivers
+# The requirements-drivers target is for BUILD_TYPE specific items.  If you need to install something specific to CUDA, or specific to ROCM, it goes here.
+# This target will be built on top of requirements-core or requirements-extras as retermined by the IMAGE_TYPE build-arg
+FROM requirements-${IMAGE_TYPE} AS requirements-drivers

 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=12
@@ -191,82 +186,6 @@ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
        ldconfig \
    ; fi

-# The build-requirements-drivers target is for BUILD_TYPE specific items.  If you need to install something specific to CUDA, or specific to ROCM, it goes here.
-# This target will be built on top of requirements-core or requirements-extras as retermined by the IMAGE_TYPE build-arg
-FROM requirements-${IMAGE_TYPE} AS build-requirements-drivers
-
-ARG BUILD_TYPE
-ARG CUDA_MAJOR_VERSION=12
-ARG CUDA_MINOR_VERSION=5
-
-ENV BUILD_TYPE=${BUILD_TYPE}
-
-# Vulkan requirements
-RUN <<EOT bash
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-                        software-properties-common pciutils wget gpg-agent && \
-        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-        apt-get update && \
-            apt-get install -y \
-            vulkan-sdk && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-EOT
-
-# CuBLAS requirements
-RUN <<EOT bash
-    apt-get update && \
-    apt-get install -y  --no-install-recommends \
-                    software-properties-common pciutils
-    if [ "amd64" = "$TARGETARCH" ]; then
-        curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-        fi
-    if [ "arm64" = "$TARGETARCH" ]; then
-        curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
-    fi
-    dpkg -i cuda-keyring_1.1-1_all.deb && \
-        rm -f cuda-keyring_1.1-1_all.deb && \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
-        apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-EOT
-
-# clblas
-RUN apt-get update && \
-        apt-get install -y --no-install-recommends \
-            libclblast-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-
-# intel
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && apt update && apt install -y intel-basekit && apt-get clean && \
-rm -rf /var/lib/apt/lists/*
-
-# hipblas
-RUN wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
-        gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && apt-get update && \
-        echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.1.2/ubuntu jammy main" \
-        | tee /etc/apt/sources.list.d/amdgpu.list && \
-        echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.1.2 jammy main" |  tee --append /etc/apt/sources.list.d/rocm.list && printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | tee /etc/apt/preferences.d/rocm-pin-600 && \
-        apt update && \
-        apt-get install -y --no-install-recommends \
-            hipblas-dev rocm-dev \
-            rocblas-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* && \
-        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
-        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
-        ldconfig
-
 ###################################
 ###################################

@@ -318,7 +237,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall

 # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
 # Adjustments to the build process should likely be made here.
-FROM build-requirements-drivers AS builder
+FROM requirements-drivers AS builder

 ARG GO_TAGS="stablediffusion tts p2p"
 ARG GRPC_BACKENDS
@@ -363,8 +282,9 @@ COPY --from=grpc /opt/grpc /usr/local

 # Rebuild with defaults backends
 WORKDIR /build
-# Need to build tts and stablediffusion separately first (?)
-RUN make dist && rm release/*.sha256 && mv release/* local-ai
+
+## Build the binary
+RUN make build

 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
@@ -376,7 +296,7 @@ RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \

 # This is the final target. The result of this target will be the image uploaded to the registry.
 # If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
-FROM run-requirements-drivers
+FROM requirements-drivers

 ARG FFMPEG
 ARG BUILD_TYPE
@@ -421,7 +341,6 @@ RUN make prepare-sources
 COPY --from=builder /build/local-ai ./

 # Copy shared libraries for piper
-# TODO(mudler): bundle these libs in backend-assets/lib/ (like we do for llama.cpp deps)
 COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/

 # do not let stablediffusion rebuild (requires an older version of absl)
--- a/4
+++ b/4
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=9ef07800622e4c371605f9419864d15667c3558f
+CPPLLAMA_VERSION?=cb5fad4c6c2cbef92e9b8b63449e1cb7664e4846

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -337,12 +337,10 @@ dist:
 ifeq ($(OS),Darwin)
 	$(info ${GREEN}I Skip CUDA/hipblas build on MacOS${RESET})
 else
-ifneq ($(ARCH),arm64)
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
 	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
 	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
-endif
 endif
 	STATIC=true $(MAKE) build
 	mkdir -p release
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -247,14 +247,23 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 	}

 	if xsysinfo.HasCPUCaps(cpuid.AVX2) {
-		log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
-		grpcProcess = backendPath(assetDir, LLamaCPPAVX2)
+		p := backendPath(assetDir, LLamaCPPAVX2)
+		if _, err := os.Stat(p); err == nil {
+			log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
+			grpcProcess = p
+		}
 	} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
-		log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
-		grpcProcess = backendPath(assetDir, LLamaCPPAVX)
+		p := backendPath(assetDir, LLamaCPPAVX)
+		if _, err := os.Stat(p); err == nil {
+			log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
+			grpcProcess = p
+		}
 	} else {
-		log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
-		grpcProcess = backendPath(assetDir, LLamaCPPFallback)
+		p := backendPath(assetDir, LLamaCPPFallback)
+		if _, err := os.Stat(p); err == nil {
+			log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
+			grpcProcess = p
+		}
 	}

 	return grpcProcess
@@ -509,6 +518,39 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
 			err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key))
 			log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
 		}
+
+		if autoDetect && key == LLamaCPP && err != nil {
+			// try as hard as possible to run the llama.cpp variants
+			backendToUse := ""
+			if xsysinfo.HasCPUCaps(cpuid.AVX2) {
+				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
+					backendToUse = LLamaCPPAVX2
+				}
+			} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
+				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
+					backendToUse = LLamaCPPAVX
+				}
+			} else {
+				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil {
+					backendToUse = LLamaCPPFallback
+				} else {
+					// If we don't have a fallback, just skip fallback
+					continue
+				}
+			}
+
+			// Autodetection failed, try the fallback
+			log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key)
+			options = append(options, WithBackendString(backendToUse))
+			model, modelerr = ml.BackendLoader(options...)
+			if modelerr == nil && model != nil {
+				log.Info().Msgf("[%s] Loads OK", key)
+				return model, nil
+			} else {
+				err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
+				log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
+			}
+		}
 	}

 	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())