Mirror of https://github.com/ollama/ollama.git (synced 2026-01-11 17:09:45 -05:00)

Compare commits: improve-cl...llama-upda (17 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 961fdb69d5 |  |
|  | 9aff392e6c |  |
|  | d3c1524fa4 |  |
|  | 7cc2a653f2 |  |
|  | 2584940016 |  |
|  | c6d4c0c7f2 |  |
|  | 1ef4241727 |  |
|  | 68fafd3002 |  |
|  | 2b2cda7a2b |  |
|  | 3cfe9fe146 |  |
|  | a23b559b4c |  |
|  | 33ee7168ba |  |
|  | 34d0c55ea5 |  |
|  | 53a5a9e9ae |  |
|  | e30e08a7d6 |  |
|  | 12e2b3514a |  |
|  | 626af2d809 |  |
.github/workflows/release.yaml (vendored, 7 lines changed)
@@ -68,6 +68,7 @@ jobs:
name: bundles-darwin
path: |
dist/*.tgz
dist/*.tar.zst
dist/*.zip
dist/*.dmg

@@ -392,13 +393,13 @@ jobs:
done
- run: |
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | zstd --ultra -22 -T0 >$(basename ${ARCHIVE//.*/}.tar.zst);
done
- uses: actions/upload-artifact@v4
with:
name: bundles-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
path: |
*.tgz
*.tar.zst

# Build each Docker variant (OS, arch, and flavor) separately. Using QEMU is unreliable and slower.
docker-build-push:

@@ -531,7 +532,7 @@ jobs:
- name: Upload release artifacts
run: |
pids=()
for payload in dist/*.txt dist/*.zip dist/*.tgz dist/*.exe dist/*.dmg ; do
for payload in dist/*.txt dist/*.zip dist/*.tgz dist/*.tar.zst dist/*.exe dist/*.dmg ; do
echo "Uploading $payload"
gh release upload ${GITHUB_REF_NAME} $payload --clobber &
pids[$!]=$!
CMakeLists.txt

@@ -2,6 +2,22 @@ cmake_minimum_required(VERSION 3.21)

project(Ollama C CXX)

# Handle cross-compilation on macOS: when CMAKE_OSX_ARCHITECTURES is set to a
# single architecture different from the host, override CMAKE_SYSTEM_PROCESSOR
# to match. This is necessary because CMAKE_SYSTEM_PROCESSOR defaults to the
# host architecture, but downstream projects (like MLX) use it to detect the
# target architecture.
if(CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES ";")
  # Single architecture specified
  if(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
    message(STATUS "Cross-compiling for x86_64: overriding CMAKE_SYSTEM_PROCESSOR from ${CMAKE_SYSTEM_PROCESSOR} to x86_64")
    set(CMAKE_SYSTEM_PROCESSOR "x86_64")
  elseif(CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
    message(STATUS "Cross-compiling for arm64: overriding CMAKE_SYSTEM_PROCESSOR from ${CMAKE_SYSTEM_PROCESSOR} to arm64")
    set(CMAKE_SYSTEM_PROCESSOR "arm64")
  endif()
endif()

include(CheckLanguage)
include(GNUInstallDirs)

@@ -12,7 +28,7 @@ set(BUILD_SHARED_LIBS ON)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_EXTENSIONS ON) # Recent versions of MLX Requires gnu++17 extensions to compile properly

set(GGML_BUILD ON)
set(GGML_SHARED ON)

@@ -147,14 +163,48 @@ if(CMAKE_HIP_COMPILER)
  endif()
endif()

find_package(Vulkan)
if(Vulkan_FOUND)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
  install(TARGETS ggml-vulkan
    RUNTIME_DEPENDENCIES
      PRE_INCLUDE_REGEXES vulkan
      PRE_EXCLUDE_REGEXES ".*"
    RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
    LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
  )
if(NOT APPLE)
  find_package(Vulkan)
  if(Vulkan_FOUND)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
    install(TARGETS ggml-vulkan
      RUNTIME_DEPENDENCIES
        PRE_INCLUDE_REGEXES vulkan
        PRE_EXCLUDE_REGEXES ".*"
      RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
      LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
    )
  endif()
endif()

option(MLX_ENGINE "Enable MLX backend" OFF)

if(MLX_ENGINE)
  message(STATUS "Setting up MLX (this takes a while...)")
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/x/ml/backend/mlx)

  # Find CUDA toolkit if MLX is built with CUDA support
  find_package(CUDAToolkit)

  install(TARGETS mlx mlxc
    RUNTIME_DEPENDENCIES
      DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
      PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
      PRE_EXCLUDE_REGEXES ".*"
    RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
    LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
    FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
  )

  # Manually install cudart and cublas since they might not be picked up as direct dependencies
  if(CUDAToolkit_FOUND)
    file(GLOB CUDART_LIBS
      "${CUDAToolkit_LIBRARY_DIR}/libcudart.so*"
      "${CUDAToolkit_LIBRARY_DIR}/libcublas.so*")
    if(CUDART_LIBS)
      install(FILES ${CUDART_LIBS}
        DESTINATION ${OLLAMA_INSTALL_DIR}
        COMPONENT MLX)
    endif()
  endif()
endif()
CMakePresets.json

@@ -41,7 +41,7 @@
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
        "CMAKE_CUDA_FLAGS": "-t 2",
        "CMAKE_CUDA_FLAGS": "-t 4",
        "OLLAMA_RUNNER_DIR": "cuda_v13"
      }
    },

@@ -83,6 +83,28 @@
      "cacheVariables": {
        "OLLAMA_RUNNER_DIR": "vulkan"
      }
    },
    {
      "name": "MLX",
      "inherits": [ "Default" ],
      "cacheVariables": {
        "MLX_ENGINE": "ON",
        "OLLAMA_RUNNER_DIR": "mlx"
      }
    },
    {
      "name": "MLX CUDA 12",
      "inherits": [ "MLX", "CUDA 12" ],
      "cacheVariables": {
        "OLLAMA_RUNNER_DIR": "mlx_cuda_v12"
      }
    },
    {
      "name": "MLX CUDA 13",
      "inherits": [ "MLX", "CUDA 13" ],
      "cacheVariables": {
        "OLLAMA_RUNNER_DIR": "mlx_cuda_v13"
      }
    }
  ],
  "buildPresets": [

@@ -140,6 +162,21 @@
      "name": "Vulkan",
      "targets": [ "ggml-vulkan" ],
      "configurePreset": "Vulkan"
    },
    {
      "name": "MLX",
      "targets": [ "mlx", "mlxc" ],
      "configurePreset": "MLX"
    },
    {
      "name": "MLX CUDA 12",
      "targets": [ "mlx", "mlxc" ],
      "configurePreset": "MLX CUDA 12"
    },
    {
      "name": "MLX CUDA 13",
      "targets": [ "mlx", "mlxc" ],
      "configurePreset": "MLX CUDA 13"
    }
  ]
}
Dockerfile (33 lines changed)
@@ -131,8 +131,36 @@ COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'Vulkan' \
    && cmake --build --parallel --preset 'Vulkan' \
    && cmake --install build --component Vulkan --strip --parallel 8
    && cmake --install build --component Vulkan --strip --parallel 8

FROM base AS mlx
ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-} \
    && dnf install -y openblas-devel lapack-devel \
    && dnf install -y libcudnn9-cuda-13 libcudnn9-devel-cuda-13 \
    && dnf install -y libnccl libnccl-devel
ENV PATH=/usr/local/cuda-13/bin:$PATH
ENV BLAS_INCLUDE_DIRS=/usr/include/openblas
ENV LAPACK_INCLUDE_DIRS=/usr/include/openblas
ENV CGO_LDFLAGS="-L/usr/local/cuda-13/lib64 -L/usr/local/cuda-13/targets/x86_64-linux/lib/stubs"
ARG PARALLEL
WORKDIR /go/src/github.com/ollama/ollama
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
COPY x/ml/backend/mlx x/ml/backend/mlx
COPY go.mod go.sum .
RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
ENV PATH=/usr/local/go/bin:$PATH
RUN go mod download
RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
    && cmake --build --parallel ${PARALLEL} --preset 'MLX CUDA 13' \
    && cmake --install build --component MLX --strip --parallel ${PARALLEL}
COPY . .
ARG GOFLAGS="'-ldflags=-w -s'"
ENV CGO_ENABLED=1
ARG CGO_CFLAGS
ARG CGO_CXXFLAGS

FROM base AS build
WORKDIR /go/src/github.com/ollama/ollama

@@ -153,6 +181,7 @@ FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
COPY --from=vulkan dist/lib/ollama /lib/ollama/
COPY --from=mlx /go/src/github.com/ollama/ollama/dist/lib/ollama /lib/ollama/

FROM --platform=linux/arm64 scratch AS arm64
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/

@@ -171,7 +200,7 @@ COPY --from=build /bin/ollama /bin/ollama

FROM ubuntu:24.04
RUN apt-get update \
    && apt-get install -y ca-certificates libvulkan1 \
    && apt-get install -y ca-certificates libvulkan1 libopenblas0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
COPY --from=archive /bin /usr/bin
Makefile.sync

@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=ec98e2002
FETCH_HEAD=b1377188784f9aea26b8abde56d4aee8c733eec7

.PHONY: help
help:
anthropic/anthropic.go (new file, 778 lines)
@@ -0,0 +1,778 @@
|
||||
package anthropic
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
// Error types matching Anthropic API
type Error struct {
    Type    string `json:"type"`
    Message string `json:"message"`
}

type ErrorResponse struct {
    Type      string `json:"type"` // always "error"
    Error     Error  `json:"error"`
    RequestID string `json:"request_id,omitempty"`
}

// NewError creates a new ErrorResponse with the appropriate error type based on HTTP status code
func NewError(code int, message string) ErrorResponse {
    var etype string
    switch code {
    case http.StatusBadRequest:
        etype = "invalid_request_error"
    case http.StatusUnauthorized:
        etype = "authentication_error"
    case http.StatusForbidden:
        etype = "permission_error"
    case http.StatusNotFound:
        etype = "not_found_error"
    case http.StatusTooManyRequests:
        etype = "rate_limit_error"
    case http.StatusServiceUnavailable, 529:
        etype = "overloaded_error"
    default:
        etype = "api_error"
    }

    return ErrorResponse{
        Type:      "error",
        Error:     Error{Type: etype, Message: message},
        RequestID: generateID("req"),
    }
}
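For reference, the status-code mapping can be exercised on its own. The following is a small sketch, not part of the diff, showing the JSON body NewError produces for an HTTP 429; it assumes it sits in a test or example file inside this anthropic package with encoding/json, fmt, and net/http imported.

```go
func ExampleNewError() {
    resp := NewError(http.StatusTooManyRequests, "model is busy")
    resp.RequestID = "req_example" // generateID is random; pinned here so the output is stable
    out, _ := json.Marshal(resp)
    fmt.Println(string(out))
    // Output: {"type":"error","error":{"type":"rate_limit_error","message":"model is busy"},"request_id":"req_example"}
}
```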
// Request types
|
||||
|
||||
// MessagesRequest represents an Anthropic Messages API request
|
||||
type MessagesRequest struct {
|
||||
Model string `json:"model"`
|
||||
MaxTokens int `json:"max_tokens"`
|
||||
Messages []MessageParam `json:"messages"`
|
||||
System any `json:"system,omitempty"` // string or []ContentBlock
|
||||
Stream bool `json:"stream,omitempty"`
|
||||
Temperature *float64 `json:"temperature,omitempty"`
|
||||
TopP *float64 `json:"top_p,omitempty"`
|
||||
TopK *int `json:"top_k,omitempty"`
|
||||
StopSequences []string `json:"stop_sequences,omitempty"`
|
||||
Tools []Tool `json:"tools,omitempty"`
|
||||
ToolChoice *ToolChoice `json:"tool_choice,omitempty"`
|
||||
Thinking *ThinkingConfig `json:"thinking,omitempty"`
|
||||
Metadata *Metadata `json:"metadata,omitempty"`
|
||||
}
|
||||
|
||||
// MessageParam represents a message in the request
|
||||
type MessageParam struct {
|
||||
Role string `json:"role"` // "user" or "assistant"
|
||||
Content any `json:"content"` // string or []ContentBlock
|
||||
}
|
||||
|
||||
// ContentBlock represents a content block in a message.
// Text and Thinking use pointers so they serialize as the field being present (even if empty)
// only when set, which is required for SDK streaming accumulation.
type ContentBlock struct {
    Type string `json:"type"` // text, image, tool_use, tool_result, thinking

    // For text blocks - pointer so field only appears when set (SDK requires it for accumulation)
    Text *string `json:"text,omitempty"`

    // For image blocks
    Source *ImageSource `json:"source,omitempty"`

    // For tool_use blocks
    ID    string `json:"id,omitempty"`
    Name  string `json:"name,omitempty"`
    Input any    `json:"input,omitempty"`

    // For tool_result blocks
    ToolUseID string `json:"tool_use_id,omitempty"`
    Content   any    `json:"content,omitempty"` // string or []ContentBlock
    IsError   bool   `json:"is_error,omitempty"`

    // For thinking blocks - pointer so field only appears when set (SDK requires it for accumulation)
    Thinking  *string `json:"thinking,omitempty"`
    Signature string  `json:"signature,omitempty"`
}
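The pointer fields are the subtle part: a nil pointer drops the key entirely, while ptr("") (a helper defined later in this file) keeps the key present with an empty value, which streaming SDK clients need in content_block_start events. A minimal sketch of the difference, not part of the diff, assuming the same package with encoding/json and fmt imported:

```go
func ExampleContentBlock_pointerFields() {
    withNil, _ := json.Marshal(ContentBlock{Type: "text"})
    withEmpty, _ := json.Marshal(ContentBlock{Type: "text", Text: ptr("")})
    fmt.Println(string(withNil))
    fmt.Println(string(withEmpty))
    // Output:
    // {"type":"text"}
    // {"type":"text","text":""}
}
```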
// ImageSource represents the source of an image
|
||||
type ImageSource struct {
|
||||
Type string `json:"type"` // "base64" or "url"
|
||||
MediaType string `json:"media_type,omitempty"`
|
||||
Data string `json:"data,omitempty"`
|
||||
URL string `json:"url,omitempty"`
|
||||
}
|
||||
|
||||
// Tool represents a tool definition
|
||||
type Tool struct {
|
||||
Type string `json:"type,omitempty"` // "custom" for user-defined tools
|
||||
Name string `json:"name"`
|
||||
Description string `json:"description,omitempty"`
|
||||
InputSchema json.RawMessage `json:"input_schema,omitempty"`
|
||||
}
|
||||
|
||||
// ToolChoice controls how the model uses tools
|
||||
type ToolChoice struct {
|
||||
Type string `json:"type"` // "auto", "any", "tool", "none"
|
||||
Name string `json:"name,omitempty"`
|
||||
DisableParallelToolUse bool `json:"disable_parallel_tool_use,omitempty"`
|
||||
}
|
||||
|
||||
// ThinkingConfig controls extended thinking
|
||||
type ThinkingConfig struct {
|
||||
Type string `json:"type"` // "enabled" or "disabled"
|
||||
BudgetTokens int `json:"budget_tokens,omitempty"`
|
||||
}
|
||||
|
||||
// Metadata for the request
|
||||
type Metadata struct {
|
||||
UserID string `json:"user_id,omitempty"`
|
||||
}
|
||||
|
||||
// Response types
|
||||
|
||||
// MessagesResponse represents an Anthropic Messages API response
|
||||
type MessagesResponse struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"` // "message"
|
||||
Role string `json:"role"` // "assistant"
|
||||
Model string `json:"model"`
|
||||
Content []ContentBlock `json:"content"`
|
||||
StopReason string `json:"stop_reason,omitempty"`
|
||||
StopSequence string `json:"stop_sequence,omitempty"`
|
||||
Usage Usage `json:"usage"`
|
||||
}
|
||||
|
||||
// Usage contains token usage information
|
||||
type Usage struct {
|
||||
InputTokens int `json:"input_tokens"`
|
||||
OutputTokens int `json:"output_tokens"`
|
||||
}
|
||||
|
||||
// Streaming event types
|
||||
|
||||
// MessageStartEvent is sent at the start of streaming
|
||||
type MessageStartEvent struct {
|
||||
Type string `json:"type"` // "message_start"
|
||||
Message MessagesResponse `json:"message"`
|
||||
}
|
||||
|
||||
// ContentBlockStartEvent signals the start of a content block
|
||||
type ContentBlockStartEvent struct {
|
||||
Type string `json:"type"` // "content_block_start"
|
||||
Index int `json:"index"`
|
||||
ContentBlock ContentBlock `json:"content_block"`
|
||||
}
|
||||
|
||||
// ContentBlockDeltaEvent contains incremental content updates
|
||||
type ContentBlockDeltaEvent struct {
|
||||
Type string `json:"type"` // "content_block_delta"
|
||||
Index int `json:"index"`
|
||||
Delta Delta `json:"delta"`
|
||||
}
|
||||
|
||||
// Delta represents an incremental update
|
||||
type Delta struct {
|
||||
Type string `json:"type"` // "text_delta", "input_json_delta", "thinking_delta", "signature_delta"
|
||||
Text string `json:"text,omitempty"`
|
||||
PartialJSON string `json:"partial_json,omitempty"`
|
||||
Thinking string `json:"thinking,omitempty"`
|
||||
Signature string `json:"signature,omitempty"`
|
||||
}
|
||||
|
||||
// ContentBlockStopEvent signals the end of a content block
|
||||
type ContentBlockStopEvent struct {
|
||||
Type string `json:"type"` // "content_block_stop"
|
||||
Index int `json:"index"`
|
||||
}
|
||||
|
||||
// MessageDeltaEvent contains updates to the message
|
||||
type MessageDeltaEvent struct {
|
||||
Type string `json:"type"` // "message_delta"
|
||||
Delta MessageDelta `json:"delta"`
|
||||
Usage DeltaUsage `json:"usage"`
|
||||
}
|
||||
|
||||
// MessageDelta contains stop information
|
||||
type MessageDelta struct {
|
||||
StopReason string `json:"stop_reason,omitempty"`
|
||||
StopSequence string `json:"stop_sequence,omitempty"`
|
||||
}
|
||||
|
||||
// DeltaUsage contains cumulative token usage
|
||||
type DeltaUsage struct {
|
||||
OutputTokens int `json:"output_tokens"`
|
||||
}
|
||||
|
||||
// MessageStopEvent signals the end of the message
|
||||
type MessageStopEvent struct {
|
||||
Type string `json:"type"` // "message_stop"
|
||||
}
|
||||
|
||||
// PingEvent is a keepalive event
|
||||
type PingEvent struct {
|
||||
Type string `json:"type"` // "ping"
|
||||
}
|
||||
|
||||
// StreamErrorEvent is an error during streaming
|
||||
type StreamErrorEvent struct {
|
||||
Type string `json:"type"` // "error"
|
||||
Error Error `json:"error"`
|
||||
}
|
||||
|
||||
// FromMessagesRequest converts an Anthropic MessagesRequest to an Ollama api.ChatRequest
|
||||
func FromMessagesRequest(r MessagesRequest) (*api.ChatRequest, error) {
|
||||
var messages []api.Message
|
||||
|
||||
if r.System != nil {
|
||||
switch sys := r.System.(type) {
|
||||
case string:
|
||||
if sys != "" {
|
||||
messages = append(messages, api.Message{Role: "system", Content: sys})
|
||||
}
|
||||
case []any:
|
||||
// System can be an array of content blocks
|
||||
var content strings.Builder
|
||||
for _, block := range sys {
|
||||
if blockMap, ok := block.(map[string]any); ok {
|
||||
if blockMap["type"] == "text" {
|
||||
if text, ok := blockMap["text"].(string); ok {
|
||||
content.WriteString(text)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if content.Len() > 0 {
|
||||
messages = append(messages, api.Message{Role: "system", Content: content.String()})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, msg := range r.Messages {
|
||||
converted, err := convertMessage(msg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
messages = append(messages, converted...)
|
||||
}
|
||||
|
||||
options := make(map[string]any)
|
||||
|
||||
options["num_predict"] = r.MaxTokens
|
||||
|
||||
if r.Temperature != nil {
|
||||
options["temperature"] = *r.Temperature
|
||||
}
|
||||
|
||||
if r.TopP != nil {
|
||||
options["top_p"] = *r.TopP
|
||||
}
|
||||
|
||||
if r.TopK != nil {
|
||||
options["top_k"] = *r.TopK
|
||||
}
|
||||
|
||||
if len(r.StopSequences) > 0 {
|
||||
options["stop"] = r.StopSequences
|
||||
}
|
||||
|
||||
var tools api.Tools
|
||||
for _, t := range r.Tools {
|
||||
tool, err := convertTool(t)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tools = append(tools, tool)
|
||||
}
|
||||
|
||||
var think *api.ThinkValue
|
||||
if r.Thinking != nil && r.Thinking.Type == "enabled" {
|
||||
think = &api.ThinkValue{Value: true}
|
||||
}
|
||||
|
||||
stream := r.Stream
|
||||
|
||||
return &api.ChatRequest{
|
||||
Model: r.Model,
|
||||
Messages: messages,
|
||||
Options: options,
|
||||
Stream: &stream,
|
||||
Tools: tools,
|
||||
Think: think,
|
||||
}, nil
|
||||
}
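A quick sketch, not part of the diff, of how the Anthropic sampling fields end up in ChatRequest.Options after this conversion; the model name is a placeholder and the snippet assumes the same package with fmt imported.

```go
func ExampleFromMessagesRequest() {
    temp := 0.2
    req := MessagesRequest{
        Model:         "some-model", // placeholder model name
        MaxTokens:     512,
        Messages:      []MessageParam{{Role: "user", Content: "Hi"}},
        Temperature:   &temp,
        StopSequences: []string{"END"},
    }
    cr, err := FromMessagesRequest(req)
    if err != nil {
        panic(err)
    }
    fmt.Println(cr.Options["num_predict"], cr.Options["temperature"], cr.Options["stop"])
    // Output: 512 0.2 [END]
}
```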
// convertMessage converts an Anthropic MessageParam to Ollama api.Message(s)
|
||||
func convertMessage(msg MessageParam) ([]api.Message, error) {
|
||||
var messages []api.Message
|
||||
role := strings.ToLower(msg.Role)
|
||||
|
||||
switch content := msg.Content.(type) {
|
||||
case string:
|
||||
messages = append(messages, api.Message{Role: role, Content: content})
|
||||
|
||||
case []any:
|
||||
var textContent strings.Builder
|
||||
var images []api.ImageData
|
||||
var toolCalls []api.ToolCall
|
||||
var thinking string
|
||||
var toolResults []api.Message
|
||||
|
||||
for _, block := range content {
|
||||
blockMap, ok := block.(map[string]any)
|
||||
if !ok {
|
||||
return nil, errors.New("invalid content block format")
|
||||
}
|
||||
|
||||
blockType, _ := blockMap["type"].(string)
|
||||
|
||||
switch blockType {
|
||||
case "text":
|
||||
if text, ok := blockMap["text"].(string); ok {
|
||||
textContent.WriteString(text)
|
||||
}
|
||||
|
||||
case "image":
|
||||
source, ok := blockMap["source"].(map[string]any)
|
||||
if !ok {
|
||||
return nil, errors.New("invalid image source")
|
||||
}
|
||||
|
||||
sourceType, _ := source["type"].(string)
|
||||
if sourceType == "base64" {
|
||||
data, _ := source["data"].(string)
|
||||
decoded, err := base64.StdEncoding.DecodeString(data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid base64 image data: %w", err)
|
||||
}
|
||||
images = append(images, decoded)
|
||||
} else {
|
||||
return nil, fmt.Errorf("invalid image source type: %s. Only base64 images are supported.", sourceType)
|
||||
}
|
||||
// URL images would need to be fetched - skip for now
|
||||
|
||||
case "tool_use":
|
||||
id, ok := blockMap["id"].(string)
|
||||
if !ok {
|
||||
return nil, errors.New("tool_use block missing required 'id' field")
|
||||
}
|
||||
name, ok := blockMap["name"].(string)
|
||||
if !ok {
|
||||
return nil, errors.New("tool_use block missing required 'name' field")
|
||||
}
|
||||
tc := api.ToolCall{
|
||||
ID: id,
|
||||
Function: api.ToolCallFunction{
|
||||
Name: name,
|
||||
},
|
||||
}
|
||||
if input, ok := blockMap["input"].(map[string]any); ok {
|
||||
tc.Function.Arguments = mapToArgs(input)
|
||||
}
|
||||
toolCalls = append(toolCalls, tc)
|
||||
|
||||
case "tool_result":
|
||||
toolUseID, _ := blockMap["tool_use_id"].(string)
|
||||
var resultContent string
|
||||
|
||||
switch c := blockMap["content"].(type) {
|
||||
case string:
|
||||
resultContent = c
|
||||
case []any:
|
||||
for _, cb := range c {
|
||||
if cbMap, ok := cb.(map[string]any); ok {
|
||||
if cbMap["type"] == "text" {
|
||||
if text, ok := cbMap["text"].(string); ok {
|
||||
resultContent += text
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
toolResults = append(toolResults, api.Message{
|
||||
Role: "tool",
|
||||
Content: resultContent,
|
||||
ToolCallID: toolUseID,
|
||||
})
|
||||
|
||||
case "thinking":
|
||||
if t, ok := blockMap["thinking"].(string); ok {
|
||||
thinking = t
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if textContent.Len() > 0 || len(images) > 0 || len(toolCalls) > 0 || thinking != "" {
|
||||
m := api.Message{
|
||||
Role: role,
|
||||
Content: textContent.String(),
|
||||
Images: images,
|
||||
ToolCalls: toolCalls,
|
||||
Thinking: thinking,
|
||||
}
|
||||
messages = append(messages, m)
|
||||
}
|
||||
|
||||
// Add tool results as separate messages
|
||||
messages = append(messages, toolResults...)
|
||||
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid message content type: %T", content)
|
||||
}
|
||||
|
||||
return messages, nil
|
||||
}
|
||||
|
||||
// convertTool converts an Anthropic Tool to an Ollama api.Tool
|
||||
func convertTool(t Tool) (api.Tool, error) {
|
||||
var params api.ToolFunctionParameters
|
||||
if len(t.InputSchema) > 0 {
|
||||
if err := json.Unmarshal(t.InputSchema, &params); err != nil {
|
||||
return api.Tool{}, fmt.Errorf("invalid input_schema for tool %q: %w", t.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
return api.Tool{
|
||||
Type: "function",
|
||||
Function: api.ToolFunction{
|
||||
Name: t.Name,
|
||||
Description: t.Description,
|
||||
Parameters: params,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ToMessagesResponse converts an Ollama api.ChatResponse to an Anthropic MessagesResponse
|
||||
func ToMessagesResponse(id string, r api.ChatResponse) MessagesResponse {
|
||||
var content []ContentBlock
|
||||
|
||||
if r.Message.Thinking != "" {
|
||||
content = append(content, ContentBlock{
|
||||
Type: "thinking",
|
||||
Thinking: ptr(r.Message.Thinking),
|
||||
})
|
||||
}
|
||||
|
||||
if r.Message.Content != "" {
|
||||
content = append(content, ContentBlock{
|
||||
Type: "text",
|
||||
Text: ptr(r.Message.Content),
|
||||
})
|
||||
}
|
||||
|
||||
for _, tc := range r.Message.ToolCalls {
|
||||
content = append(content, ContentBlock{
|
||||
Type: "tool_use",
|
||||
ID: tc.ID,
|
||||
Name: tc.Function.Name,
|
||||
Input: tc.Function.Arguments,
|
||||
})
|
||||
}
|
||||
|
||||
stopReason := mapStopReason(r.DoneReason, len(r.Message.ToolCalls) > 0)
|
||||
|
||||
return MessagesResponse{
|
||||
ID: id,
|
||||
Type: "message",
|
||||
Role: "assistant",
|
||||
Model: r.Model,
|
||||
Content: content,
|
||||
StopReason: stopReason,
|
||||
Usage: Usage{
|
||||
InputTokens: r.Metrics.PromptEvalCount,
|
||||
OutputTokens: r.Metrics.EvalCount,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// mapStopReason converts Ollama done_reason to Anthropic stop_reason
|
||||
func mapStopReason(reason string, hasToolCalls bool) string {
|
||||
if hasToolCalls {
|
||||
return "tool_use"
|
||||
}
|
||||
|
||||
switch reason {
|
||||
case "stop":
|
||||
return "end_turn"
|
||||
case "length":
|
||||
return "max_tokens"
|
||||
default:
|
||||
if reason != "" {
|
||||
return "stop_sequence"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// StreamConverter manages state for converting Ollama streaming responses to Anthropic format
|
||||
type StreamConverter struct {
|
||||
ID string
|
||||
Model string
|
||||
firstWrite bool
|
||||
contentIndex int
|
||||
inputTokens int
|
||||
outputTokens int
|
||||
thinkingStarted bool
|
||||
thinkingDone bool
|
||||
textStarted bool
|
||||
toolCallsSent map[string]bool
|
||||
}
|
||||
|
||||
func NewStreamConverter(id, model string) *StreamConverter {
|
||||
return &StreamConverter{
|
||||
ID: id,
|
||||
Model: model,
|
||||
firstWrite: true,
|
||||
toolCallsSent: make(map[string]bool),
|
||||
}
|
||||
}
|
||||
|
||||
// StreamEvent represents a streaming event to be sent to the client
|
||||
type StreamEvent struct {
|
||||
Event string
|
||||
Data any
|
||||
}
|
||||
|
||||
// Process converts an Ollama ChatResponse to Anthropic streaming events
|
||||
func (c *StreamConverter) Process(r api.ChatResponse) []StreamEvent {
|
||||
var events []StreamEvent
|
||||
|
||||
if c.firstWrite {
|
||||
c.firstWrite = false
|
||||
c.inputTokens = r.Metrics.PromptEvalCount
|
||||
|
||||
events = append(events, StreamEvent{
|
||||
Event: "message_start",
|
||||
Data: MessageStartEvent{
|
||||
Type: "message_start",
|
||||
Message: MessagesResponse{
|
||||
ID: c.ID,
|
||||
Type: "message",
|
||||
Role: "assistant",
|
||||
Model: c.Model,
|
||||
Content: []ContentBlock{},
|
||||
Usage: Usage{
|
||||
InputTokens: c.inputTokens,
|
||||
OutputTokens: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
if r.Message.Thinking != "" && !c.thinkingDone {
|
||||
if !c.thinkingStarted {
|
||||
c.thinkingStarted = true
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_start",
|
||||
Data: ContentBlockStartEvent{
|
||||
Type: "content_block_start",
|
||||
Index: c.contentIndex,
|
||||
ContentBlock: ContentBlock{
|
||||
Type: "thinking",
|
||||
Thinking: ptr(""),
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_delta",
|
||||
Data: ContentBlockDeltaEvent{
|
||||
Type: "content_block_delta",
|
||||
Index: c.contentIndex,
|
||||
Delta: Delta{
|
||||
Type: "thinking_delta",
|
||||
Thinking: r.Message.Thinking,
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
if r.Message.Content != "" {
|
||||
if c.thinkingStarted && !c.thinkingDone {
|
||||
c.thinkingDone = true
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_stop",
|
||||
Data: ContentBlockStopEvent{
|
||||
Type: "content_block_stop",
|
||||
Index: c.contentIndex,
|
||||
},
|
||||
})
|
||||
c.contentIndex++
|
||||
}
|
||||
|
||||
if !c.textStarted {
|
||||
c.textStarted = true
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_start",
|
||||
Data: ContentBlockStartEvent{
|
||||
Type: "content_block_start",
|
||||
Index: c.contentIndex,
|
||||
ContentBlock: ContentBlock{
|
||||
Type: "text",
|
||||
Text: ptr(""),
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_delta",
|
||||
Data: ContentBlockDeltaEvent{
|
||||
Type: "content_block_delta",
|
||||
Index: c.contentIndex,
|
||||
Delta: Delta{
|
||||
Type: "text_delta",
|
||||
Text: r.Message.Content,
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
for _, tc := range r.Message.ToolCalls {
|
||||
if c.toolCallsSent[tc.ID] {
|
||||
continue
|
||||
}
|
||||
|
||||
if c.textStarted {
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_stop",
|
||||
Data: ContentBlockStopEvent{
|
||||
Type: "content_block_stop",
|
||||
Index: c.contentIndex,
|
||||
},
|
||||
})
|
||||
c.contentIndex++
|
||||
c.textStarted = false
|
||||
}
|
||||
|
||||
argsJSON, err := json.Marshal(tc.Function.Arguments)
|
||||
if err != nil {
|
||||
slog.Error("failed to marshal tool arguments", "error", err, "tool_id", tc.ID)
|
||||
continue
|
||||
}
|
||||
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_start",
|
||||
Data: ContentBlockStartEvent{
|
||||
Type: "content_block_start",
|
||||
Index: c.contentIndex,
|
||||
ContentBlock: ContentBlock{
|
||||
Type: "tool_use",
|
||||
ID: tc.ID,
|
||||
Name: tc.Function.Name,
|
||||
Input: map[string]any{},
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_delta",
|
||||
Data: ContentBlockDeltaEvent{
|
||||
Type: "content_block_delta",
|
||||
Index: c.contentIndex,
|
||||
Delta: Delta{
|
||||
Type: "input_json_delta",
|
||||
PartialJSON: string(argsJSON),
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_stop",
|
||||
Data: ContentBlockStopEvent{
|
||||
Type: "content_block_stop",
|
||||
Index: c.contentIndex,
|
||||
},
|
||||
})
|
||||
|
||||
c.toolCallsSent[tc.ID] = true
|
||||
c.contentIndex++
|
||||
}
|
||||
|
||||
if r.Done {
|
||||
if c.textStarted {
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_stop",
|
||||
Data: ContentBlockStopEvent{
|
||||
Type: "content_block_stop",
|
||||
Index: c.contentIndex,
|
||||
},
|
||||
})
|
||||
} else if c.thinkingStarted && !c.thinkingDone {
|
||||
events = append(events, StreamEvent{
|
||||
Event: "content_block_stop",
|
||||
Data: ContentBlockStopEvent{
|
||||
Type: "content_block_stop",
|
||||
Index: c.contentIndex,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
c.outputTokens = r.Metrics.EvalCount
|
||||
stopReason := mapStopReason(r.DoneReason, len(c.toolCallsSent) > 0)
|
||||
|
||||
events = append(events, StreamEvent{
|
||||
Event: "message_delta",
|
||||
Data: MessageDeltaEvent{
|
||||
Type: "message_delta",
|
||||
Delta: MessageDelta{
|
||||
StopReason: stopReason,
|
||||
},
|
||||
Usage: DeltaUsage{
|
||||
OutputTokens: c.outputTokens,
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
events = append(events, StreamEvent{
|
||||
Event: "message_stop",
|
||||
Data: MessageStopEvent{
|
||||
Type: "message_stop",
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return events
|
||||
}
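How a server loop drives this converter is not shown in the diff. The sketch below is a hypothetical SSE handler (the handler name, channel wiring, and model name are assumptions) that marshals each StreamEvent and flushes it to the client; it assumes the same package, which already imports encoding/json, fmt, net/http, and the api package.

```go
func handleAnthropicStream(w http.ResponseWriter, chunks <-chan api.ChatResponse) {
    w.Header().Set("Content-Type", "text/event-stream")
    flusher, _ := w.(http.Flusher)

    conv := NewStreamConverter(GenerateMessageID(), "some-model") // placeholder model name
    for chunk := range chunks {
        for _, ev := range conv.Process(chunk) {
            payload, err := json.Marshal(ev.Data)
            if err != nil {
                continue // skip events whose payload cannot be serialized
            }
            fmt.Fprintf(w, "event: %s\ndata: %s\n\n", ev.Event, payload)
            if flusher != nil {
                flusher.Flush()
            }
        }
    }
}
```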

// generateID generates a unique ID with the given prefix using crypto/rand
func generateID(prefix string) string {
    b := make([]byte, 12)
    if _, err := rand.Read(b); err != nil {
        // Fallback to time-based ID if crypto/rand fails
        return fmt.Sprintf("%s_%d", prefix, time.Now().UnixNano())
    }
    return fmt.Sprintf("%s_%x", prefix, b)
}

// GenerateMessageID generates a unique message ID
func GenerateMessageID() string {
    return generateID("msg")
}

// ptr returns a pointer to the given string value
func ptr(s string) *string {
    return &s
}

// mapToArgs converts a map to ToolCallFunctionArguments
func mapToArgs(m map[string]any) api.ToolCallFunctionArguments {
    args := api.NewToolCallFunctionArguments()
    for k, v := range m {
        args.Set(k, v)
    }
    return args
}
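Putting the two directions together, here is a round-trip sketch, not part of the diff: the request is converted for Ollama, and a hand-built ChatResponse (standing in for what a live server would return) is converted back into Anthropic's shape. It assumes the same package with fmt and the api package imported; the model name is a placeholder.

```go
func ExampleToMessagesResponse() {
    in := MessagesRequest{
        Model:     "some-model", // placeholder model name
        MaxTokens: 64,
        Messages:  []MessageParam{{Role: "user", Content: "Say hi"}},
    }
    if _, err := FromMessagesRequest(in); err != nil { // would normally be sent to Ollama's chat endpoint
        panic(err)
    }

    resp := api.ChatResponse{
        Model:      in.Model,
        Message:    api.Message{Role: "assistant", Content: "Hi!"},
        Done:       true,
        DoneReason: "stop",
        Metrics:    api.Metrics{PromptEvalCount: 3, EvalCount: 2},
    }
    out := ToMessagesResponse(GenerateMessageID(), resp)
    fmt.Println(out.StopReason, *out.Content[0].Text, out.Usage.OutputTokens)
    // Output: end_turn Hi! 2
}
```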
anthropic/anthropic_test.go (new file, 953 lines)
@@ -0,0 +1,953 @@
|
||||
package anthropic
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
const (
|
||||
testImage = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
|
||||
)
|
||||
|
||||
// testArgs creates ToolCallFunctionArguments from a map (convenience function for tests)
|
||||
func testArgs(m map[string]any) api.ToolCallFunctionArguments {
|
||||
args := api.NewToolCallFunctionArguments()
|
||||
for k, v := range m {
|
||||
args.Set(k, v)
|
||||
}
|
||||
return args
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_Basic(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{
|
||||
{Role: "user", Content: "Hello"},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if result.Model != "test-model" {
|
||||
t.Errorf("expected model 'test-model', got %q", result.Model)
|
||||
}
|
||||
|
||||
if len(result.Messages) != 1 {
|
||||
t.Fatalf("expected 1 message, got %d", len(result.Messages))
|
||||
}
|
||||
|
||||
if result.Messages[0].Role != "user" || result.Messages[0].Content != "Hello" {
|
||||
t.Errorf("unexpected message: %+v", result.Messages[0])
|
||||
}
|
||||
|
||||
if numPredict, ok := result.Options["num_predict"].(int); !ok || numPredict != 1024 {
|
||||
t.Errorf("expected num_predict 1024, got %v", result.Options["num_predict"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_WithSystemPrompt(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
System: "You are a helpful assistant.",
|
||||
Messages: []MessageParam{
|
||||
{Role: "user", Content: "Hello"},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if len(result.Messages) != 2 {
|
||||
t.Fatalf("expected 2 messages, got %d", len(result.Messages))
|
||||
}
|
||||
|
||||
if result.Messages[0].Role != "system" || result.Messages[0].Content != "You are a helpful assistant." {
|
||||
t.Errorf("unexpected system message: %+v", result.Messages[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_WithSystemPromptArray(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
System: []any{
|
||||
map[string]any{"type": "text", "text": "You are helpful."},
|
||||
map[string]any{"type": "text", "text": " Be concise."},
|
||||
},
|
||||
Messages: []MessageParam{
|
||||
{Role: "user", Content: "Hello"},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if len(result.Messages) != 2 {
|
||||
t.Fatalf("expected 2 messages, got %d", len(result.Messages))
|
||||
}
|
||||
|
||||
if result.Messages[0].Content != "You are helpful. Be concise." {
|
||||
t.Errorf("unexpected system message content: %q", result.Messages[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_WithOptions(t *testing.T) {
|
||||
temp := 0.7
|
||||
topP := 0.9
|
||||
topK := 40
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 2048,
|
||||
Messages: []MessageParam{{Role: "user", Content: "Hello"}},
|
||||
Temperature: &temp,
|
||||
TopP: &topP,
|
||||
TopK: &topK,
|
||||
StopSequences: []string{"\n", "END"},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if result.Options["temperature"] != 0.7 {
|
||||
t.Errorf("expected temperature 0.7, got %v", result.Options["temperature"])
|
||||
}
|
||||
if result.Options["top_p"] != 0.9 {
|
||||
t.Errorf("expected top_p 0.9, got %v", result.Options["top_p"])
|
||||
}
|
||||
if result.Options["top_k"] != 40 {
|
||||
t.Errorf("expected top_k 40, got %v", result.Options["top_k"])
|
||||
}
|
||||
if diff := cmp.Diff([]string{"\n", "END"}, result.Options["stop"]); diff != "" {
|
||||
t.Errorf("stop sequences mismatch: %s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_WithImage(t *testing.T) {
|
||||
imgData, _ := base64.StdEncoding.DecodeString(testImage)
|
||||
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{
|
||||
{
|
||||
Role: "user",
|
||||
Content: []any{
|
||||
map[string]any{"type": "text", "text": "What's in this image?"},
|
||||
map[string]any{
|
||||
"type": "image",
|
||||
"source": map[string]any{
|
||||
"type": "base64",
|
||||
"media_type": "image/png",
|
||||
"data": testImage,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if len(result.Messages) != 1 {
|
||||
t.Fatalf("expected 1 message, got %d", len(result.Messages))
|
||||
}
|
||||
|
||||
if result.Messages[0].Content != "What's in this image?" {
|
||||
t.Errorf("expected content 'What's in this image?', got %q", result.Messages[0].Content)
|
||||
}
|
||||
|
||||
if len(result.Messages[0].Images) != 1 {
|
||||
t.Fatalf("expected 1 image, got %d", len(result.Messages[0].Images))
|
||||
}
|
||||
|
||||
if string(result.Messages[0].Images[0]) != string(imgData) {
|
||||
t.Error("image data mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_WithToolUse(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{
|
||||
{Role: "user", Content: "What's the weather in Paris?"},
|
||||
{
|
||||
Role: "assistant",
|
||||
Content: []any{
|
||||
map[string]any{
|
||||
"type": "tool_use",
|
||||
"id": "call_123",
|
||||
"name": "get_weather",
|
||||
"input": map[string]any{"location": "Paris"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if len(result.Messages) != 2 {
|
||||
t.Fatalf("expected 2 messages, got %d", len(result.Messages))
|
||||
}
|
||||
|
||||
if len(result.Messages[1].ToolCalls) != 1 {
|
||||
t.Fatalf("expected 1 tool call, got %d", len(result.Messages[1].ToolCalls))
|
||||
}
|
||||
|
||||
tc := result.Messages[1].ToolCalls[0]
|
||||
if tc.ID != "call_123" {
|
||||
t.Errorf("expected tool call ID 'call_123', got %q", tc.ID)
|
||||
}
|
||||
if tc.Function.Name != "get_weather" {
|
||||
t.Errorf("expected tool name 'get_weather', got %q", tc.Function.Name)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_WithToolResult(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{
|
||||
{
|
||||
Role: "user",
|
||||
Content: []any{
|
||||
map[string]any{
|
||||
"type": "tool_result",
|
||||
"tool_use_id": "call_123",
|
||||
"content": "The weather in Paris is sunny, 22°C",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if len(result.Messages) != 1 {
|
||||
t.Fatalf("expected 1 message, got %d", len(result.Messages))
|
||||
}
|
||||
|
||||
msg := result.Messages[0]
|
||||
if msg.Role != "tool" {
|
||||
t.Errorf("expected role 'tool', got %q", msg.Role)
|
||||
}
|
||||
if msg.ToolCallID != "call_123" {
|
||||
t.Errorf("expected tool_call_id 'call_123', got %q", msg.ToolCallID)
|
||||
}
|
||||
if msg.Content != "The weather in Paris is sunny, 22°C" {
|
||||
t.Errorf("unexpected content: %q", msg.Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_WithTools(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{{Role: "user", Content: "Hello"}},
|
||||
Tools: []Tool{
|
||||
{
|
||||
Name: "get_weather",
|
||||
Description: "Get current weather",
|
||||
InputSchema: json.RawMessage(`{"type":"object","properties":{"location":{"type":"string"}},"required":["location"]}`),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if len(result.Tools) != 1 {
|
||||
t.Fatalf("expected 1 tool, got %d", len(result.Tools))
|
||||
}
|
||||
|
||||
tool := result.Tools[0]
|
||||
if tool.Type != "function" {
|
||||
t.Errorf("expected type 'function', got %q", tool.Type)
|
||||
}
|
||||
if tool.Function.Name != "get_weather" {
|
||||
t.Errorf("expected name 'get_weather', got %q", tool.Function.Name)
|
||||
}
|
||||
if tool.Function.Description != "Get current weather" {
|
||||
t.Errorf("expected description 'Get current weather', got %q", tool.Function.Description)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_WithThinking(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{{Role: "user", Content: "Hello"}},
|
||||
Thinking: &ThinkingConfig{Type: "enabled", BudgetTokens: 1000},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if result.Think == nil {
|
||||
t.Fatal("expected Think to be set")
|
||||
}
|
||||
if v, ok := result.Think.Value.(bool); !ok || !v {
|
||||
t.Errorf("expected Think.Value to be true, got %v", result.Think.Value)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFromMessagesRequest_ThinkingOnlyBlock verifies that messages containing only
|
||||
// a thinking block (no text, images, or tool calls) are preserved and not dropped.
|
||||
func TestFromMessagesRequest_ThinkingOnlyBlock(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{
|
||||
{Role: "user", Content: "Hello"},
|
||||
{
|
||||
Role: "assistant",
|
||||
Content: []any{
|
||||
map[string]any{
|
||||
"type": "thinking",
|
||||
"thinking": "Let me think about this...",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := FromMessagesRequest(req)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if len(result.Messages) != 2 {
|
||||
t.Fatalf("expected 2 messages, got %d", len(result.Messages))
|
||||
}
|
||||
|
||||
assistantMsg := result.Messages[1]
|
||||
if assistantMsg.Thinking != "Let me think about this..." {
|
||||
t.Errorf("expected thinking content, got %q", assistantMsg.Thinking)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_ToolUseMissingID(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{
|
||||
{
|
||||
Role: "assistant",
|
||||
Content: []any{
|
||||
map[string]any{
|
||||
"type": "tool_use",
|
||||
"name": "get_weather",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, err := FromMessagesRequest(req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for missing tool_use id")
|
||||
}
|
||||
if err.Error() != "tool_use block missing required 'id' field" {
|
||||
t.Errorf("unexpected error message: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_ToolUseMissingName(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{
|
||||
{
|
||||
Role: "assistant",
|
||||
Content: []any{
|
||||
map[string]any{
|
||||
"type": "tool_use",
|
||||
"id": "call_123",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, err := FromMessagesRequest(req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for missing tool_use name")
|
||||
}
|
||||
if err.Error() != "tool_use block missing required 'name' field" {
|
||||
t.Errorf("unexpected error message: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromMessagesRequest_InvalidToolSchema(t *testing.T) {
|
||||
req := MessagesRequest{
|
||||
Model: "test-model",
|
||||
MaxTokens: 1024,
|
||||
Messages: []MessageParam{{Role: "user", Content: "Hello"}},
|
||||
Tools: []Tool{
|
||||
{
|
||||
Name: "bad_tool",
|
||||
InputSchema: json.RawMessage(`{invalid json`),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, err := FromMessagesRequest(req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for invalid tool schema")
|
||||
}
|
||||
}
|
||||
|
||||
func TestToMessagesResponse_Basic(t *testing.T) {
|
||||
resp := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "Hello there!",
|
||||
},
|
||||
Done: true,
|
||||
DoneReason: "stop",
|
||||
Metrics: api.Metrics{
|
||||
PromptEvalCount: 10,
|
||||
EvalCount: 5,
|
||||
},
|
||||
}
|
||||
|
||||
result := ToMessagesResponse("msg_123", resp)
|
||||
|
||||
if result.ID != "msg_123" {
|
||||
t.Errorf("expected ID 'msg_123', got %q", result.ID)
|
||||
}
|
||||
if result.Type != "message" {
|
||||
t.Errorf("expected type 'message', got %q", result.Type)
|
||||
}
|
||||
if result.Role != "assistant" {
|
||||
t.Errorf("expected role 'assistant', got %q", result.Role)
|
||||
}
|
||||
if len(result.Content) != 1 {
|
||||
t.Fatalf("expected 1 content block, got %d", len(result.Content))
|
||||
}
|
||||
if result.Content[0].Type != "text" || result.Content[0].Text == nil || *result.Content[0].Text != "Hello there!" {
|
||||
t.Errorf("unexpected content: %+v", result.Content[0])
|
||||
}
|
||||
if result.StopReason != "end_turn" {
|
||||
t.Errorf("expected stop_reason 'end_turn', got %q", result.StopReason)
|
||||
}
|
||||
if result.Usage.InputTokens != 10 || result.Usage.OutputTokens != 5 {
|
||||
t.Errorf("unexpected usage: %+v", result.Usage)
|
||||
}
|
||||
}
|
||||
|
||||
func TestToMessagesResponse_WithToolCalls(t *testing.T) {
|
||||
resp := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
ID: "call_123",
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "get_weather",
|
||||
Arguments: testArgs(map[string]any{"location": "Paris"}),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Done: true,
|
||||
DoneReason: "stop",
|
||||
}
|
||||
|
||||
result := ToMessagesResponse("msg_123", resp)
|
||||
|
||||
if len(result.Content) != 1 {
|
||||
t.Fatalf("expected 1 content block, got %d", len(result.Content))
|
||||
}
|
||||
if result.Content[0].Type != "tool_use" {
|
||||
t.Errorf("expected type 'tool_use', got %q", result.Content[0].Type)
|
||||
}
|
||||
if result.Content[0].ID != "call_123" {
|
||||
t.Errorf("expected ID 'call_123', got %q", result.Content[0].ID)
|
||||
}
|
||||
if result.Content[0].Name != "get_weather" {
|
||||
t.Errorf("expected name 'get_weather', got %q", result.Content[0].Name)
|
||||
}
|
||||
if result.StopReason != "tool_use" {
|
||||
t.Errorf("expected stop_reason 'tool_use', got %q", result.StopReason)
|
||||
}
|
||||
}
|
||||
|
||||
func TestToMessagesResponse_WithThinking(t *testing.T) {
|
||||
resp := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "The answer is 42.",
|
||||
Thinking: "Let me think about this...",
|
||||
},
|
||||
Done: true,
|
||||
DoneReason: "stop",
|
||||
}
|
||||
|
||||
result := ToMessagesResponse("msg_123", resp)
|
||||
|
||||
if len(result.Content) != 2 {
|
||||
t.Fatalf("expected 2 content blocks, got %d", len(result.Content))
|
||||
}
|
||||
if result.Content[0].Type != "thinking" {
|
||||
t.Errorf("expected first block type 'thinking', got %q", result.Content[0].Type)
|
||||
}
|
||||
if result.Content[0].Thinking == nil || *result.Content[0].Thinking != "Let me think about this..." {
|
||||
t.Errorf("unexpected thinking content: %v", result.Content[0].Thinking)
|
||||
}
|
||||
if result.Content[1].Type != "text" {
|
||||
t.Errorf("expected second block type 'text', got %q", result.Content[1].Type)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMapStopReason(t *testing.T) {
|
||||
tests := []struct {
|
||||
reason string
|
||||
hasToolCalls bool
|
||||
want string
|
||||
}{
|
||||
{"stop", false, "end_turn"},
|
||||
{"length", false, "max_tokens"},
|
||||
{"stop", true, "tool_use"},
|
||||
{"other", false, "stop_sequence"},
|
||||
{"", false, ""},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := mapStopReason(tt.reason, tt.hasToolCalls)
|
||||
if got != tt.want {
|
||||
t.Errorf("mapStopReason(%q, %v) = %q, want %q", tt.reason, tt.hasToolCalls, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewError(t *testing.T) {
|
||||
tests := []struct {
|
||||
code int
|
||||
want string
|
||||
}{
|
||||
{400, "invalid_request_error"},
|
||||
{401, "authentication_error"},
|
||||
{403, "permission_error"},
|
||||
{404, "not_found_error"},
|
||||
{429, "rate_limit_error"},
|
||||
{500, "api_error"},
|
||||
{503, "overloaded_error"},
|
||||
{529, "overloaded_error"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
result := NewError(tt.code, "test message")
|
||||
if result.Type != "error" {
|
||||
t.Errorf("NewError(%d) type = %q, want 'error'", tt.code, result.Type)
|
||||
}
|
||||
if result.Error.Type != tt.want {
|
||||
t.Errorf("NewError(%d) error.type = %q, want %q", tt.code, result.Error.Type, tt.want)
|
||||
}
|
||||
if result.Error.Message != "test message" {
|
||||
t.Errorf("NewError(%d) message = %q, want 'test message'", tt.code, result.Error.Message)
|
||||
}
|
||||
if result.RequestID == "" {
|
||||
t.Errorf("NewError(%d) request_id should not be empty", tt.code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGenerateMessageID(t *testing.T) {
|
||||
id1 := GenerateMessageID()
|
||||
id2 := GenerateMessageID()
|
||||
|
||||
if id1 == "" {
|
||||
t.Error("GenerateMessageID returned empty string")
|
||||
}
|
||||
if id1 == id2 {
|
||||
t.Error("GenerateMessageID returned duplicate IDs")
|
||||
}
|
||||
if len(id1) < 10 {
|
||||
t.Errorf("GenerateMessageID returned short ID: %q", id1)
|
||||
}
|
||||
if id1[:4] != "msg_" {
|
||||
t.Errorf("GenerateMessageID should start with 'msg_', got %q", id1[:4])
|
||||
}
|
||||
}
|
||||
|
||||
func TestStreamConverter_Basic(t *testing.T) {
|
||||
conv := NewStreamConverter("msg_123", "test-model")
|
||||
|
||||
// First chunk
|
||||
resp1 := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: "Hello",
|
||||
},
|
||||
Metrics: api.Metrics{PromptEvalCount: 10},
|
||||
}
|
||||
|
||||
events1 := conv.Process(resp1)
|
||||
if len(events1) < 3 {
|
||||
t.Fatalf("expected at least 3 events for first chunk, got %d", len(events1))
|
||||
}
|
||||
|
||||
// Should have message_start, content_block_start, content_block_delta
|
||||
if events1[0].Event != "message_start" {
|
||||
t.Errorf("expected first event 'message_start', got %q", events1[0].Event)
|
||||
}
|
||||
if events1[1].Event != "content_block_start" {
|
||||
t.Errorf("expected second event 'content_block_start', got %q", events1[1].Event)
|
||||
}
|
||||
if events1[2].Event != "content_block_delta" {
|
||||
t.Errorf("expected third event 'content_block_delta', got %q", events1[2].Event)
|
||||
}
|
||||
|
||||
// Final chunk
|
||||
resp2 := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
Content: " world!",
|
||||
},
|
||||
Done: true,
|
||||
DoneReason: "stop",
|
||||
Metrics: api.Metrics{EvalCount: 5},
|
||||
}
|
||||
|
||||
events2 := conv.Process(resp2)
|
||||
|
||||
// Should have content_block_delta, content_block_stop, message_delta, message_stop
|
||||
hasStop := false
|
||||
for _, e := range events2 {
|
||||
if e.Event == "message_stop" {
|
||||
hasStop = true
|
||||
}
|
||||
}
|
||||
if !hasStop {
|
||||
t.Error("expected message_stop event in final chunk")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStreamConverter_WithToolCalls(t *testing.T) {
|
||||
conv := NewStreamConverter("msg_123", "test-model")
|
||||
|
||||
resp := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
ID: "call_123",
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "get_weather",
|
||||
Arguments: testArgs(map[string]any{"location": "Paris"}),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Done: true,
|
||||
DoneReason: "stop",
|
||||
Metrics: api.Metrics{PromptEvalCount: 10, EvalCount: 5},
|
||||
}
|
||||
|
||||
events := conv.Process(resp)
|
||||
|
||||
hasToolStart := false
|
||||
hasToolDelta := false
|
||||
for _, e := range events {
|
||||
if e.Event == "content_block_start" {
|
||||
if start, ok := e.Data.(ContentBlockStartEvent); ok {
|
||||
if start.ContentBlock.Type == "tool_use" {
|
||||
hasToolStart = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if e.Event == "content_block_delta" {
|
||||
if delta, ok := e.Data.(ContentBlockDeltaEvent); ok {
|
||||
if delta.Delta.Type == "input_json_delta" {
|
||||
hasToolDelta = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !hasToolStart {
|
||||
t.Error("expected tool_use content_block_start event")
|
||||
}
|
||||
if !hasToolDelta {
|
||||
t.Error("expected input_json_delta event")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStreamConverter_ToolCallWithUnmarshalableArgs(t *testing.T) {
|
||||
// Test that unmarshalable arguments (like channels) are handled gracefully
|
||||
// and don't cause a panic or corrupt stream
|
||||
conv := NewStreamConverter("msg_123", "test-model")
|
||||
|
||||
// Create a channel which cannot be JSON marshaled
|
||||
unmarshalable := make(chan int)
|
||||
badArgs := api.NewToolCallFunctionArguments()
|
||||
badArgs.Set("channel", unmarshalable)
|
||||
|
||||
resp := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
ID: "call_bad",
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "bad_function",
|
||||
Arguments: badArgs,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Done: true,
|
||||
DoneReason: "stop",
|
||||
}
|
||||
|
||||
// Should not panic and should skip the unmarshalable tool call
|
||||
events := conv.Process(resp)
|
||||
|
||||
// Verify no tool_use block was started (since marshal failed before block start)
|
||||
hasToolStart := false
|
||||
for _, e := range events {
|
||||
if e.Event == "content_block_start" {
|
||||
if start, ok := e.Data.(ContentBlockStartEvent); ok {
|
||||
if start.ContentBlock.Type == "tool_use" {
|
||||
hasToolStart = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if hasToolStart {
|
||||
t.Error("expected no tool_use block when arguments cannot be marshaled")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStreamConverter_MultipleToolCallsWithMixedValidity(t *testing.T) {
|
||||
// Test that valid tool calls still work when mixed with invalid ones
|
||||
conv := NewStreamConverter("msg_123", "test-model")
|
||||
|
||||
unmarshalable := make(chan int)
|
||||
badArgs := api.NewToolCallFunctionArguments()
|
||||
badArgs.Set("channel", unmarshalable)
|
||||
|
||||
resp := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{
|
||||
Role: "assistant",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
ID: "call_good",
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "good_function",
|
||||
Arguments: testArgs(map[string]any{"location": "Paris"}),
|
||||
},
|
||||
},
|
||||
{
|
||||
ID: "call_bad",
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "bad_function",
|
||||
Arguments: badArgs,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Done: true,
|
||||
DoneReason: "stop",
|
||||
}
|
||||
|
||||
events := conv.Process(resp)
|
||||
|
||||
// Count tool_use blocks - should only have 1 (the valid one)
|
||||
toolStartCount := 0
|
||||
toolDeltaCount := 0
|
||||
for _, e := range events {
|
||||
if e.Event == "content_block_start" {
|
||||
if start, ok := e.Data.(ContentBlockStartEvent); ok {
|
||||
if start.ContentBlock.Type == "tool_use" {
|
||||
toolStartCount++
|
||||
if start.ContentBlock.Name != "good_function" {
|
||||
t.Errorf("expected tool name 'good_function', got %q", start.ContentBlock.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if e.Event == "content_block_delta" {
|
||||
if delta, ok := e.Data.(ContentBlockDeltaEvent); ok {
|
||||
if delta.Delta.Type == "input_json_delta" {
|
||||
toolDeltaCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if toolStartCount != 1 {
|
||||
t.Errorf("expected 1 tool_use block, got %d", toolStartCount)
|
||||
}
|
||||
if toolDeltaCount != 1 {
|
||||
t.Errorf("expected 1 input_json_delta, got %d", toolDeltaCount)
|
||||
}
|
||||
}
|
||||
|
||||
// TestContentBlockJSON_EmptyFieldsPresent verifies that empty text and thinking fields
|
||||
// are serialized in JSON output. The Anthropic SDK requires these fields to be present
|
||||
// (even when empty) in content_block_start events to properly accumulate streaming deltas.
|
||||
// Without these fields, the SDK throws: "TypeError: unsupported operand type(s) for +=: 'NoneType' and 'str'"
|
||||
func TestContentBlockJSON_EmptyFieldsPresent(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
block ContentBlock
|
||||
wantKeys []string
|
||||
}{
|
||||
{
|
||||
name: "text block includes empty text field",
|
||||
block: ContentBlock{
|
||||
Type: "text",
|
||||
Text: ptr(""),
|
||||
},
|
||||
wantKeys: []string{"type", "text"},
|
||||
},
|
||||
{
|
||||
name: "thinking block includes empty thinking field",
|
||||
block: ContentBlock{
|
||||
Type: "thinking",
|
||||
Thinking: ptr(""),
|
||||
},
|
||||
wantKeys: []string{"type", "thinking"},
|
||||
},
|
||||
{
|
||||
name: "text block with content",
|
||||
block: ContentBlock{
|
||||
Type: "text",
|
||||
Text: ptr("hello"),
|
||||
},
|
||||
wantKeys: []string{"type", "text"},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
data, err := json.Marshal(tt.block)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to marshal: %v", err)
|
||||
}
|
||||
|
||||
var result map[string]any
|
||||
if err := json.Unmarshal(data, &result); err != nil {
|
||||
t.Fatalf("failed to unmarshal: %v", err)
|
||||
}
|
||||
|
||||
for _, key := range tt.wantKeys {
|
||||
if _, ok := result[key]; !ok {
|
||||
t.Errorf("expected key %q to be present in JSON output, got: %s", key, string(data))
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestStreamConverter_ContentBlockStartIncludesEmptyFields verifies that content_block_start
|
||||
// events include the required empty fields for SDK compatibility.
|
||||
func TestStreamConverter_ContentBlockStartIncludesEmptyFields(t *testing.T) {
|
||||
t.Run("text block start includes empty text", func(t *testing.T) {
|
||||
conv := NewStreamConverter("msg_123", "test-model")
|
||||
|
||||
resp := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{Role: "assistant", Content: "hello"},
|
||||
}
|
||||
|
||||
events := conv.Process(resp)
|
||||
|
||||
var foundTextStart bool
|
||||
for _, e := range events {
|
||||
if e.Event == "content_block_start" {
|
||||
if start, ok := e.Data.(ContentBlockStartEvent); ok {
|
||||
if start.ContentBlock.Type == "text" {
|
||||
foundTextStart = true
|
||||
// Marshal and verify the text field is present
|
||||
data, _ := json.Marshal(start)
|
||||
var result map[string]any
|
||||
json.Unmarshal(data, &result)
|
||||
cb := result["content_block"].(map[string]any)
|
||||
if _, ok := cb["text"]; !ok {
|
||||
t.Error("content_block_start for text should include 'text' field")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !foundTextStart {
|
||||
t.Error("expected text content_block_start event")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("thinking block start includes empty thinking", func(t *testing.T) {
|
||||
conv := NewStreamConverter("msg_123", "test-model")
|
||||
|
||||
resp := api.ChatResponse{
|
||||
Model: "test-model",
|
||||
Message: api.Message{Role: "assistant", Thinking: "let me think..."},
|
||||
}
|
||||
|
||||
events := conv.Process(resp)
|
||||
|
||||
var foundThinkingStart bool
|
||||
for _, e := range events {
|
||||
if e.Event == "content_block_start" {
|
||||
if start, ok := e.Data.(ContentBlockStartEvent); ok {
|
||||
if start.ContentBlock.Type == "thinking" {
|
||||
foundThinkingStart = true
|
||||
data, _ := json.Marshal(start)
|
||||
var result map[string]any
|
||||
json.Unmarshal(data, &result)
|
||||
cb := result["content_block"].(map[string]any)
|
||||
if _, ok := cb["thinking"]; !ok {
|
||||
t.Error("content_block_start for thinking should include 'thinking' field")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !foundThinkingStart {
|
||||
t.Error("expected thinking content_block_start event")
|
||||
}
|
||||
})
|
||||
}
|
||||
29
cmd/cmd.go
@@ -46,6 +46,8 @@ import (
	"github.com/ollama/ollama/types/syncmap"
	"github.com/ollama/ollama/version"
	xcmd "github.com/ollama/ollama/x/cmd"
	"github.com/ollama/ollama/x/imagegen"
	imagegenclient "github.com/ollama/ollama/x/imagegen/client"
)

const ConnectInstructions = "To sign in, navigate to:\n %s\n\n"
@@ -96,6 +98,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
	filename, err := getModelfileName(cmd)
	if os.IsNotExist(err) {
		if filename == "" {
			// No Modelfile found - check if current directory is an image gen model
			if imagegen.IsTensorModelDir(".") {
				return imagegenclient.CreateModel(args[0], ".", p)
			}
			reader = strings.NewReader("FROM .\n")
		} else {
			return errModelfileNotFound
@@ -457,6 +463,15 @@ func RunHandler(cmd *cobra.Command, args []string) error {
	}

	name := args[0]

	// Check if this is a known image generation model (skip Show/Pull)
	if imagegen.HasTensorLayers(name) {
		if opts.Prompt == "" && !interactive {
			return errors.New("image generation models require a prompt. Usage: ollama run " + name + " \"your prompt here\"")
		}
		return imagegen.RunCLI(cmd, name, opts.Prompt, interactive, opts.KeepAlive)
	}

	info, err := func() (*api.ShowResponse, error) {
		showReq := &api.ShowRequest{Name: name}
		info, err := client.Show(cmd.Context(), showReq)
@@ -520,6 +535,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {

	// Check for experimental flag
	isExperimental, _ := cmd.Flags().GetBool("experimental")
	yoloMode, _ := cmd.Flags().GetBool("experimental-yolo")

	if interactive {
		if err := loadOrUnloadModel(cmd, &opts); err != nil {
@@ -547,9 +563,9 @@ func RunHandler(cmd *cobra.Command, args []string) error {
		}
	}

	// Use experimental agent loop with
	// Use experimental agent loop with tools
	if isExperimental {
		return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive)
		return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode)
	}

	return generateInteractive(cmd, opts)
@@ -821,6 +837,11 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
}

func ShowHandler(cmd *cobra.Command, args []string) error {
	// Check if this is an image generation model
	if imagegen.HasTensorLayers(args[0]) {
		return imagegen.Show(args[0], os.Stdout)
	}

	client, err := api.ClientFromEnvironment()
	if err != nil {
		return err
@@ -1764,6 +1785,10 @@ func NewCLI() *cobra.Command {
	runCmd.Flags().Bool("truncate", false, "For embedding models: truncate inputs exceeding context length (default: true). Set --truncate=false to error instead")
	runCmd.Flags().Int("dimensions", 0, "Truncate output embeddings to specified dimension (embedding models only)")
	runCmd.Flags().Bool("experimental", false, "Enable experimental agent loop with tools")
	runCmd.Flags().Bool("experimental-yolo", false, "Skip all tool approval prompts (use with caution)")

	// Image generation flags (width, height, steps, seed, etc.)
	imagegen.RegisterFlags(runCmd)

	stopCmd := &cobra.Command{
		Use: "stop MODEL",

@@ -6,11 +6,14 @@ import (
	"errors"
	"fmt"
	"io/fs"
	"iter"
	"log/slog"
	"maps"
	"os"
	"slices"
	"strings"

	ofs "github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/fs/ggml"
)

@@ -18,8 +21,13 @@ type ModelParameters struct {
	Architectures []string `json:"architectures"`
	VocabSize     uint32   `json:"vocab_size"`

	// TODO is this needed?
	ModelType string `json:"model_type"`

	TextModel struct {
		VocabSize uint32 `json:"vocab_size"`
		VocabSize  uint32 `json:"vocab_size"`
		HiddenSize uint32 `json:"hidden_size"`
		ModelType  string `json:"model_type"`
	} `json:"text_config"`
}

@@ -33,8 +41,94 @@ type AdapterParameters struct {
	} `json:"lora_parameters"`
}

func (ModelParameters) KV(t *Tokenizer) ggml.KV {
	kv := ggml.KV{
type KV map[string]any

func (kv KV) Architecture() string {
	return kv.String("general.architecture", "unknown")
}

type valueTypes interface {
	uint8 | int8 | uint16 | int16 |
		uint32 | int32 | uint64 | int64 |
		string | float32 | float64 | bool
}

type arrayValueTypes interface {
	[]uint8 | []int8 | []uint16 | []int16 |
		[]uint32 | []int32 | []uint64 | []int64 |
		[]string | []float32 | []float64 | []bool
}

func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}

	if val, ok := kv[key].(T); ok {
		return val, true
	}
	return defaultValue[0], false
}

func (kv KV) String(key string, defaultValue ...string) string {
	val, _ := keyValue(kv, key, append(defaultValue, "")...)
	return val
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
	return val
}

func (kv KV) Bool(key string, defaultValue ...bool) bool {
	val, _ := keyValue(kv, key, append(defaultValue, false)...)
	return val
}

func (kv KV) Strings(key string, defaultValue ...[]string) []string {
	val, _ := keyValue(kv, key, append(defaultValue, []string{""})...)
	return val
}

func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
	val, _ := keyValue(kv, key, append(defaultValue, []int32{0})...)
	return val
}

func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
	val, _ := keyValue(kv, key, append(defaultValue, []uint32{0})...)
	return val
}

func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
	val, _ := keyValue(kv, key, append(defaultValue, []float32{0})...)
	return val
}

func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	val, _ := keyValue(kv, key, append(defaultValue, []bool{false})...)
	return val
}

func (kv KV) Len() int {
	return len(kv)
}

func (kv KV) Keys() iter.Seq[string] {
	return maps.Keys(kv)
}

func (kv KV) Value(key string) any {
	return kv[key]
}

func (ModelParameters) KV(t *Tokenizer) KV {
	kv := KV{
		"general.file_type":            uint32(1),
		"general.quantization_version": uint32(2),
		"tokenizer.ggml.pre":           t.Pre,
@@ -63,7 +157,7 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
	return kv
}

func (p AdapterParameters) KV() ggml.KV {
func (p AdapterParameters) KV() KV {
	var alpha float32
	if p.LoraParameters.Alpha == 0 {
		alpha = float32(p.Alpha)
@@ -71,7 +165,7 @@ func (p AdapterParameters) KV() ggml.KV {
		alpha = p.LoraParameters.Alpha
	}

	kv := ggml.KV{
	kv := KV{
		"adapter.lora.alpha": alpha,
		"adapter.type":       "lora",
		"general.file_type":  uint32(1),
@@ -88,9 +182,14 @@ func (ModelParameters) specialTokenTypes() []string {
	}
}

type ModelConverter interface {
type ModelKV interface {
	// KV maps parameters to LLM key-values
	KV(*Tokenizer) ggml.KV
	KV(*Tokenizer) KV
}

type ModelConverter interface {
	ModelKV

	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
	Tensors([]Tensor) []*ggml.Tensor
	// Replacements returns a list of string pairs to replace in tensor names.
@@ -107,7 +206,7 @@ type moreParser interface {

type AdapterConverter interface {
	// KV maps parameters to LLM key-values
	KV(ggml.KV) ggml.KV
	KV(ofs.Config) KV
	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
	Tensors([]Tensor) []*ggml.Tensor
	// Replacements returns a list of string pairs to replace in tensor names.
@@ -115,7 +214,7 @@ type AdapterConverter interface {
	Replacements() []string
}

func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ofs.Config) error {
	bts, err := fs.ReadFile(fsys, "adapter_config.json")
	if err != nil {
		return err
@@ -126,8 +225,8 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
		return err
	}

	arch, ok := baseKV["general.architecture"]
	if !ok {
	arch := baseKV.Architecture()
	if arch == "" {
		return errors.New("architecture not set for the base model")
	}

@@ -153,23 +252,19 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
	return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
}

// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func ConvertModel(fsys fs.FS, f *os.File) error {
func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
	bts, err := fs.ReadFile(fsys, "config.json")
	if err != nil {
		return err
		return nil, nil, err
	}

	var p ModelParameters
	if err := json.Unmarshal(bts, &p); err != nil {
		return err
		return nil, nil, err
	}

	if len(p.Architectures) < 1 {
		return errors.New("unknown architecture")
		return nil, nil, errors.New("unknown architecture")
	}

	var conv ModelConverter
@@ -217,22 +312,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
	case "DeepseekV3ForCausalLM":
		conv = &deepseek2Model{}
	default:
		return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
		return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
	}

	if err := json.Unmarshal(bts, conv); err != nil {
		return err
		return nil, nil, err
	}

	if t, ok := conv.(moreParser); ok {
		if err := t.parseMore(fsys); err != nil {
			return err
			return nil, nil, err
		}
	}

	t, err := parseTokenizer(fsys, conv.specialTokenTypes())
	if err != nil {
		return err
		return nil, nil, err
	}

	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
@@ -254,6 +349,19 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
	default:
		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
	}
	return conv, t, nil
}

// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func ConvertModel(fsys fs.FS, f *os.File) error {
	kv, t, err := LoadModelMetadata(fsys)
	if err != nil {
		return err
	}
	conv := kv.(ModelConverter)

	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
	if err != nil {
@@ -263,7 +371,7 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
	return writeFile(f, conv.KV(t), conv.Tensors(ts))
}

func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
func writeFile(f *os.File, kv KV, ts []*ggml.Tensor) error {
	for i := range ts {
		ts[i].Shape = slices.Clone(ts[i].Shape)
		slices.Reverse(ts[i].Shape)

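A note on the new `KV` accessors above: `keyValue` prefixes any key outside the `tokenizer.` and `general.` namespaces with the model architecture, and every typed getter takes an optional default. A minimal sketch of the resulting lookup behavior (hypothetical values, not taken from the repository):

```go
kv := KV{
	"general.architecture": "llama",
	"llama.block_count":    uint32(32),
}

kv.Uint("block_count")           // resolved as "llama.block_count" -> 32
kv.Uint("embedding_length", 256) // missing key -> falls back to the default, 256
kv.String("tokenizer.ggml.pre")  // namespaced key is used as-is -> ""
```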
@@ -88,7 +88,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
	return nil
}

func (p *bertModel) KV(t *Tokenizer) ggml.KV {
func (p *bertModel) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "bert"
	kv["bert.attention.causal"] = false

@@ -24,7 +24,7 @@ type commandrModel struct {

var _ ModelConverter = (*commandrModel)(nil)

func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
func (p *commandrModel) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "command-r"
	kv["general.name"] = "command-r"

@@ -47,7 +47,7 @@ type deepseek2Model struct {
	Architecture string
}

func (p *deepseek2Model) KV(t *Tokenizer) ggml.KV {
func (p *deepseek2Model) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "deepseek2"
	kv["general.type"] = "model"

@@ -41,7 +41,7 @@ type deepseekocr struct {
	} `json:"vision_config"`
}

func (m *deepseekocr) KV(t *Tokenizer) ggml.KV {
func (m *deepseekocr) KV(t *Tokenizer) KV {
	kv := m.ModelParameters.KV(t)
	kv["general.architecture"] = "deepseekocr"
	kv["block_count"] = m.LanguageConfig.HiddenLayers

@@ -23,7 +23,7 @@ type gemmaModel struct {

var _ ModelConverter = (*gemmaModel)(nil)

func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
func (p *gemmaModel) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "gemma"
	kv["gemma.context_length"] = p.MaxPositionEmbeddings

@@ -1,7 +1,5 @@
package convert

import "github.com/ollama/ollama/fs/ggml"

type gemma2Model struct {
	gemmaModel
	SlidingWindow uint32 `json:"sliding_window"`
@@ -9,7 +7,7 @@ type gemma2Model struct {
	FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
}

func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
func (p *gemma2Model) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "gemma2"
	kv["gemma2.context_length"] = p.MaxPositionEmbeddings

@@ -6,6 +6,7 @@ import (
	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/fs/ggml"
)

@@ -15,7 +16,7 @@ type gemma2Adapter struct {

var _ AdapterConverter = (*gemma2Adapter)(nil)

func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
func (p *gemma2Adapter) KV(baseKV fs.Config) KV {
	kv := p.AdapterParameters.KV()
	kv["general.architecture"] = "gemma2"
	return kv

@@ -3,8 +3,6 @@ package convert
import (
	"cmp"
	"slices"

	"github.com/ollama/ollama/fs/ggml"
)

type gemma3Model struct {
@@ -55,7 +53,7 @@ const (
	gemma27BLayerCount = 62
)

func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
func (p *gemma3Model) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "gemma3"

@@ -38,7 +38,7 @@ type gemma3nModel struct {
	VisionModel struct{} `json:"vision_config"`
}

func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
func (m *gemma3nModel) KV(t *Tokenizer) KV {
	kv := m.ModelParameters.KV(t)
	kv["general.architecture"] = "gemma3n"
	kv["gemma3n.activation_sparsity_scale"] = slices.Collect(func(yield func(float32) bool) {

@@ -37,7 +37,7 @@ type gptossModel struct {

var _ ModelConverter = (*gptossModel)(nil)

func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
func (m *gptossModel) KV(t *Tokenizer) KV {
	kv := m.ModelParameters.KV(t)
	kv["general.architecture"] = "gptoss"
	kv["general.file_type"] = uint32(4)

@@ -48,7 +48,7 @@ type llamaModel struct {

var _ ModelConverter = (*llamaModel)(nil)

func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
func (p *llamaModel) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "llama"
	kv["llama.vocab_size"] = p.VocabSize

@@ -35,7 +35,7 @@ type llama4Model struct {
}

// KV implements ModelConverter.
func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
func (p *llama4Model) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "llama4"

@@ -7,6 +7,7 @@ import (
	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/fs/ggml"
)

@@ -18,13 +19,13 @@ type llamaAdapter struct {

var _ AdapterConverter = (*llamaAdapter)(nil)

func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
func (p *llamaAdapter) KV(baseKV fs.Config) KV {
	kv := p.AdapterParameters.KV()
	kv["general.architecture"] = "llama"
	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
	kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
	kv["llama.attention.head_count"] = baseKV.Value("llama.attention.head_count")
	kv["llama.attention.head_count_kv"] = baseKV.Value("llama.attention.head_count_kv")

	p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
	p.NumAttentionHeads = baseKV.Value("llama.attention.head_count").(uint32)

	return kv
}

@@ -60,7 +60,7 @@ type mistral3Model struct {
	ProjectorHiddenAct string `json:"projector_hidden_act"`
}

func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
func (p *mistral3Model) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "mistral3"
	kv["mistral3.vocab_size"] = p.TextModel.VocabSize

@@ -39,7 +39,7 @@ type mistral3CausalModel struct {
	} `json:"rope_parameters"`
}

func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
func (p *mistral3CausalModel) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "mistral3"
	kv["mistral3.vocab_size"] = p.VocabSize

@@ -12,7 +12,7 @@ type mixtralModel struct {
	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
}

func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
func (p *mixtralModel) KV(t *Tokenizer) KV {
	kv := p.llamaModel.KV(t)

	if p.NumLocalExperts > 0 {

@@ -34,7 +34,7 @@ type mllamaModel struct {
	} `json:"vision_config"`
}

func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
func (m *mllamaModel) KV(t *Tokenizer) KV {
	kv := m.ModelParameters.KV(t)
	kv["general.architecture"] = "mllama"

@@ -87,7 +87,7 @@ func (p *nomicbertModel) parseMore(fsys fs.FS) error {
	return nil
}

func (p *nomicbertModel) KV(t *Tokenizer) ggml.KV {
func (p *nomicbertModel) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)

	// Determine architecture based on MoE parameters (following qwen3 pattern)

@@ -34,7 +34,7 @@ type olmoModel struct {

var _ ModelConverter = (*olmoModel)(nil)

func (p *olmoModel) KV(t *Tokenizer) ggml.KV {
func (p *olmoModel) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "olmo3"
	kv["olmo3.block_count"] = p.NumHiddenLayers

@@ -37,7 +37,7 @@ type phi3Model struct {

var _ ModelConverter = (*phi3Model)(nil)

func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
func (p *phi3Model) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "phi3"
	kv["phi3.context_length"] = p.MaxPositionEmbeddings

@@ -22,7 +22,7 @@ type qwen2Model struct {

var _ ModelConverter = (*qwen2Model)(nil)

func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
func (q *qwen2Model) KV(t *Tokenizer) KV {
	kv := q.ModelParameters.KV(t)
	kv["general.architecture"] = "qwen2"
	kv["qwen2.block_count"] = q.HiddenLayers

@@ -29,7 +29,7 @@ type qwen25VLModel struct {

var _ ModelConverter = (*qwen25VLModel)(nil)

func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
func (q *qwen25VLModel) KV(t *Tokenizer) KV {
	kv := q.ModelParameters.KV(t)
	kv["general.architecture"] = "qwen25vl"

@@ -32,7 +32,7 @@ type qwen3Model struct {
}

// KV implements ModelConverter.
func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
func (q *qwen3Model) KV(t *Tokenizer) KV {
	arch := "qwen3"
	if q.NumExperts > 0 {
		arch += "moe"

@@ -45,7 +45,7 @@ func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
	return json.Unmarshal(bts, &m.VisionModel)
}

func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
func (m *qwen3VLModel) KV(t *Tokenizer) KV {
	kv := m.qwen3Model.KV(t)

	arch := "qwen3vl"

@@ -19,6 +19,7 @@ import (
	"testing"

	"github.com/google/go-cmp/cmp"
	fsc "github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/fs/ggml"
)

@@ -28,7 +29,7 @@ type tensorData struct {
	Shape []int `json:"shape"`
}

func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
func convertFull(t *testing.T, fsys fs.FS) (*os.File, fsc.Config, ggml.Tensors) {
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -59,9 +60,10 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
	return r, m.KV(), m.Tensors()
}

func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
func generateResultsJSON(t *testing.T, f *os.File, kv fsc.Config, tensors ggml.Tensors) map[string]string {
	actual := make(map[string]string)
	for k, v := range kv {
	for k := range kv.Keys() {
		v := kv.Value(k)
		if s, ok := v.(json.Marshaler); !ok {
			actual[k] = fmt.Sprintf("%v", v)
		} else {
@@ -277,7 +279,7 @@ func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[str
func TestConvertAdapter(t *testing.T) {
	type AdapterCase struct {
		Name     string
		BaseKV   map[string]any
		BaseKV   KV
		Expected map[string]string
	}

@@ -14,6 +14,7 @@
* [API Reference](https://docs.ollama.com/api)
* [Modelfile Reference](https://docs.ollama.com/modelfile)
* [OpenAI Compatibility](https://docs.ollama.com/api/openai-compatibility)
* [Anthropic Compatibility](./api/anthropic-compatibility.mdx)

### Resources

406
docs/api/anthropic-compatibility.mdx
Normal file
@@ -0,0 +1,406 @@
---
title: Anthropic compatibility
---

Ollama provides compatibility with the [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) to help connect existing applications to Ollama, including tools like Claude Code.

## Recommended models

For coding use cases, models like `glm-4.7:cloud`, `minimax-m2.1:cloud`, and `qwen3-coder` are recommended.

Pull a model before use:

```shell
ollama pull qwen3-coder
ollama pull glm-4.7:cloud
```

## Usage

### Environment variables

To use Ollama with tools that expect the Anthropic API (like Claude Code), set these environment variables:

```shell
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama # required but ignored
```

### Simple `/v1/messages` example

<CodeGroup dropdown>

```python basic.py
import anthropic

client = anthropic.Anthropic(
    base_url='http://localhost:11434',
    api_key='ollama',  # required but ignored
)

message = client.messages.create(
    model='qwen3-coder',
    max_tokens=1024,
    messages=[
        {'role': 'user', 'content': 'Hello, how are you?'}
    ]
)
print(message.content[0].text)
```

```javascript basic.js
import Anthropic from "@anthropic-ai/sdk";

const anthropic = new Anthropic({
  baseURL: "http://localhost:11434",
  apiKey: "ollama", // required but ignored
});

const message = await anthropic.messages.create({
  model: "qwen3-coder",
  max_tokens: 1024,
  messages: [{ role: "user", content: "Hello, how are you?" }],
});

console.log(message.content[0].text);
```

```shell basic.sh
curl -X POST http://localhost:11434/v1/messages \
  -H "Content-Type: application/json" \
  -H "x-api-key: ollama" \
  -H "anthropic-version: 2023-06-01" \
  -d '{
    "model": "qwen3-coder",
    "max_tokens": 1024,
    "messages": [{ "role": "user", "content": "Hello, how are you?" }]
  }'
```

</CodeGroup>

### Streaming example

<CodeGroup dropdown>

```python streaming.py
import anthropic

client = anthropic.Anthropic(
    base_url='http://localhost:11434',
    api_key='ollama',
)

with client.messages.stream(
    model='qwen3-coder',
    max_tokens=1024,
    messages=[{'role': 'user', 'content': 'Count from 1 to 10'}]
) as stream:
    for text in stream.text_stream:
        print(text, end='', flush=True)
```

```javascript streaming.js
import Anthropic from "@anthropic-ai/sdk";

const anthropic = new Anthropic({
  baseURL: "http://localhost:11434",
  apiKey: "ollama",
});

const stream = await anthropic.messages.stream({
  model: "qwen3-coder",
  max_tokens: 1024,
  messages: [{ role: "user", content: "Count from 1 to 10" }],
});

for await (const event of stream) {
  if (
    event.type === "content_block_delta" &&
    event.delta.type === "text_delta"
  ) {
    process.stdout.write(event.delta.text);
  }
}
```

```shell streaming.sh
curl -X POST http://localhost:11434/v1/messages \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3-coder",
    "max_tokens": 1024,
    "stream": true,
    "messages": [{ "role": "user", "content": "Count from 1 to 10" }]
  }'
```

</CodeGroup>

### Tool calling example

<CodeGroup dropdown>

```python tools.py
import anthropic

client = anthropic.Anthropic(
    base_url='http://localhost:11434',
    api_key='ollama',
)

message = client.messages.create(
    model='qwen3-coder',
    max_tokens=1024,
    tools=[
        {
            'name': 'get_weather',
            'description': 'Get the current weather in a location',
            'input_schema': {
                'type': 'object',
                'properties': {
                    'location': {
                        'type': 'string',
                        'description': 'The city and state, e.g. San Francisco, CA'
                    }
                },
                'required': ['location']
            }
        }
    ],
    messages=[{'role': 'user', 'content': "What's the weather in San Francisco?"}]
)

for block in message.content:
    if block.type == 'tool_use':
        print(f'Tool: {block.name}')
        print(f'Input: {block.input}')
```

```javascript tools.js
import Anthropic from "@anthropic-ai/sdk";

const anthropic = new Anthropic({
  baseURL: "http://localhost:11434",
  apiKey: "ollama",
});

const message = await anthropic.messages.create({
  model: "qwen3-coder",
  max_tokens: 1024,
  tools: [
    {
      name: "get_weather",
      description: "Get the current weather in a location",
      input_schema: {
        type: "object",
        properties: {
          location: {
            type: "string",
            description: "The city and state, e.g. San Francisco, CA",
          },
        },
        required: ["location"],
      },
    },
  ],
  messages: [{ role: "user", content: "What's the weather in San Francisco?" }],
});

for (const block of message.content) {
  if (block.type === "tool_use") {
    console.log("Tool:", block.name);
    console.log("Input:", block.input);
  }
}
```

```shell tools.sh
curl -X POST http://localhost:11434/v1/messages \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3-coder",
    "max_tokens": 1024,
    "tools": [
      {
        "name": "get_weather",
        "description": "Get the current weather in a location",
        "input_schema": {
          "type": "object",
          "properties": {
            "location": {
              "type": "string",
              "description": "The city and state"
            }
          },
          "required": ["location"]
        }
      }
    ],
    "messages": [{ "role": "user", "content": "What is the weather in San Francisco?" }]
  }'
```

</CodeGroup>

## Using with Claude Code

[Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend:

```shell
ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
```

Or set the environment variables in your shell profile:

```shell
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama
```

Then run Claude Code with any Ollama model:

```shell
# Local models
claude --model qwen3-coder
claude --model gpt-oss:20b

# Cloud models
claude --model glm-4.7:cloud
claude --model minimax-m2.1:cloud
```

## Endpoints

### `/v1/messages`

#### Supported features

- [x] Messages
- [x] Streaming
- [x] System prompts
- [x] Multi-turn conversations
- [x] Vision (images)
- [x] Tools (function calling)
- [x] Tool results
- [x] Thinking/extended thinking

#### Supported request fields

- [x] `model`
- [x] `max_tokens`
- [x] `messages`
  - [x] Text `content`
  - [x] Image `content` (base64)
  - [x] Array of content blocks
  - [x] `tool_use` blocks
  - [x] `tool_result` blocks
  - [x] `thinking` blocks
- [x] `system` (string or array)
- [x] `stream`
- [x] `temperature`
- [x] `top_p`
- [x] `top_k`
- [x] `stop_sequences`
- [x] `tools`
- [x] `thinking`
- [ ] `tool_choice`
- [ ] `metadata`

#### Supported response fields

- [x] `id`
- [x] `type`
- [x] `role`
- [x] `model`
- [x] `content` (text, tool_use, thinking blocks)
- [x] `stop_reason` (end_turn, max_tokens, tool_use)
- [x] `usage` (input_tokens, output_tokens)

#### Streaming events

- [x] `message_start`
- [x] `content_block_start`
- [x] `content_block_delta` (text_delta, input_json_delta, thinking_delta)
- [x] `content_block_stop`
- [x] `message_delta`
- [x] `message_stop`
- [x] `ping`
- [x] `error`

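For reference, a short text-only reply streams roughly as the following event sequence (payloads abbreviated here for illustration; field names follow the Anthropic streaming format):

```text
event: message_start
data: {"type":"message_start","message":{"id":"msg_...","role":"assistant"}}

event: content_block_start
data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}

event: content_block_delta
data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hello"}}

event: content_block_stop
data: {"type":"content_block_stop","index":0}

event: message_delta
data: {"type":"message_delta","delta":{"stop_reason":"end_turn"},"usage":{"output_tokens":5}}

event: message_stop
data: {"type":"message_stop"}
```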
## Models

Ollama supports both local and cloud models.

### Local models

Pull a local model before use:

```shell
ollama pull qwen3-coder
```

Recommended local models:
- `qwen3-coder` - Excellent for coding tasks
- `gpt-oss:20b` - Strong general-purpose model

### Cloud models

Cloud models are available immediately without pulling:

- `glm-4.7:cloud` - High-performance cloud model
- `minimax-m2.1:cloud` - Fast cloud model

### Default model names

For tooling that relies on default Anthropic model names such as `claude-3-5-sonnet`, use `ollama cp` to copy an existing model to that name:

```shell
ollama cp qwen3-coder claude-3-5-sonnet
```

Afterwards, this new model name can be specified in the `model` field:

```shell
curl http://localhost:11434/v1/messages \
  -H "Content-Type: application/json" \
  -d '{
    "model": "claude-3-5-sonnet",
    "max_tokens": 1024,
    "messages": [
      {
        "role": "user",
        "content": "Hello!"
      }
    ]
  }'
```

## Differences from the Anthropic API

### Behavior differences

- API key is accepted but not validated
- `anthropic-version` header is accepted but not used
- Token counts are approximations based on the underlying model's tokenizer

### Not supported

The following Anthropic API features are not currently supported:

| Feature | Description |
|---------|-------------|
| `/v1/messages/count_tokens` | Token counting endpoint |
| `tool_choice` | Forcing specific tool use or disabling tools |
| `metadata` | Request metadata (user_id) |
| Prompt caching | `cache_control` blocks for caching prefixes |
| Batches API | `/v1/messages/batches` for async batch processing |
| Citations | `citations` content blocks |
| PDF support | `document` content blocks with PDF files |
| Server-sent errors | `error` events during streaming (errors return HTTP status) |

### Partial support

| Feature | Status |
|---------|--------|
| Image content | Base64 images supported; URL images not supported |
| Extended thinking | Basic support; `budget_tokens` accepted but not enforced |

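As an illustration of the last row above, extended thinking can be requested with the standard Anthropic request shape; with Ollama the `budget_tokens` value is accepted but currently has no effect:

```shell
curl http://localhost:11434/v1/messages \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3-coder",
    "max_tokens": 1024,
    "thinking": {"type": "enabled", "budget_tokens": 2048},
    "messages": [{ "role": "user", "content": "Why is the sky blue?" }]
  }'
```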
@@ -32,7 +32,9 @@
		"codeblocks": "system"
	},
	"contextual": {
		"options": ["copy"]
		"options": [
			"copy"
		]
	},
	"navbar": {
		"links": [
@@ -52,7 +54,9 @@
		"display": "simple"
	},
	"examples": {
		"languages": ["curl"]
		"languages": [
			"curl"
		]
	}
},
"redirects": [
@@ -97,6 +101,7 @@
{
	"group": "Integrations",
	"pages": [
		"/integrations/claude-code",
		"/integrations/vscode",
		"/integrations/jetbrains",
		"/integrations/codex",
@@ -139,7 +144,8 @@
	"/api/streaming",
	"/api/usage",
	"/api/errors",
	"/api/openai-compatibility"
	"/api/openai-compatibility",
	"/api/anthropic-compatibility"
	]
},
{

69
docs/integrations/claude-code.mdx
Normal file
@@ -0,0 +1,69 @@
---
title: Claude Code
---

## Install

Install [Claude Code](https://code.claude.com/docs/en/overview):

<CodeGroup>

```shell macOS / Linux
curl -fsSL https://claude.ai/install.sh | bash
```

```powershell Windows
irm https://claude.ai/install.ps1 | iex
```

</CodeGroup>

## Usage with Ollama

Claude Code connects to Ollama using the Anthropic-compatible API.

1. Set the environment variables:

```shell
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama
```

2. Run Claude Code with an Ollama model:

```shell
claude --model qwen3-coder
```

Or run with the environment variables inline:

```shell
ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
```

## Connecting to ollama.com

1. Create an [API key](https://ollama.com/settings/keys) on ollama.com
2. Set the environment variables:

```shell
export ANTHROPIC_BASE_URL=https://ollama.com
export ANTHROPIC_API_KEY=<your-api-key>
```

3. Run Claude Code with a cloud model:

```shell
claude --model glm-4.7:cloud
```

## Recommended Models

### Cloud models
- `glm-4.7:cloud` - High-performance cloud model
- `minimax-m2.1:cloud` - Fast cloud model
- `qwen3-coder:480b` - Large coding model

### Local models
- `qwen3-coder` - Excellent for coding tasks
- `gpt-oss:20b` - Strong general-purpose model
@@ -1,5 +1,5 @@
---
title: Linux
title: "Linux"
---

## Install
@@ -13,8 +13,7 @@ curl -fsSL https://ollama.com/install.sh | sh
## Manual install

<Note>
If you are upgrading from a prior version, you should remove the old libraries
with `sudo rm -rf /usr/lib/ollama` first.
If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
</Note>

Download and extract the package:
@@ -113,11 +112,7 @@ sudo systemctl status ollama
```

<Note>
While AMD has contributed the `amdgpu` driver upstream to the official linux
kernel source, the version is older and may not support all ROCm features. We
recommend you install the latest driver from
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
GPU.
While AMD has contributed the `amdgpu` driver upstream to the official linux kernel source, the version is older and may not support all ROCm features. We recommend you install the latest driver from https://www.amd.com/en/support/linux-drivers for best support of your Radeon GPU.
</Note>

## Customizing
@@ -196,4 +191,4 @@ Remove the downloaded models and Ollama service user and group:
sudo userdel ollama
sudo groupdel ollama
sudo rm -r /usr/share/ollama
```
```
@@ -1,5 +1,7 @@
package fs

import "iter"

type Config interface {
	Architecture() string
	String(string, ...string) string
@@ -11,4 +13,8 @@ type Config interface {
	Ints(string, ...[]int32) []int32
	Floats(string, ...[]float32) []float32
	Bools(string, ...[]bool) []bool

	Len() int
	Keys() iter.Seq[string]
	Value(key string) any
}

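The added `Len`, `Keys`, and `Value` methods let callers walk arbitrary metadata behind the interface without knowing the concrete type. A rough sketch of generic consumption (a hypothetical helper, assuming Go 1.23 range-over-func support):

```go
// dump prints every key/value pair held by any fs.Config implementation.
func dump(c fs.Config) {
	fmt.Printf("%s: %d keys\n", c.Architecture(), c.Len())
	for k := range c.Keys() {
		fmt.Printf("  %s = %v\n", k, c.Value(k))
	}
}
```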
@@ -6,7 +6,9 @@ import (
	"errors"
	"fmt"
	"io"
	"iter"
	"log/slog"
	"maps"
	"math"
	"slices"
	"strings"
@@ -239,6 +241,18 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
	return val.values
}

func (kv KV) Len() int {
	return len(kv)
}

func (kv KV) Keys() iter.Seq[string] {
	return maps.Keys(kv)
}

func (kv KV) Value(key string) any {
	return kv[key]
}

func (kv KV) OllamaEngineRequired() bool {
	return slices.Contains([]string{
		"bert",

@@ -8,12 +8,12 @@ import (
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"runtime"
	"slices"
	"strings"

	"github.com/ollama/ollama/fs"
	"golang.org/x/sync/errgroup"
)

@@ -508,7 +508,7 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
	return binary.Write(w, binary.LittleEndian, s)
}

func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
func WriteGGUF(f *os.File, kv fs.Config, ts []*Tensor) error {
	arch := kv.String("general.architecture")
	if arch == "" {
		return fmt.Errorf("architecture not set")
@@ -526,12 +526,12 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
		return err
	}

	if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
	if err := binary.Write(f, binary.LittleEndian, uint64(kv.Len())); err != nil {
		return err
	}

	for _, key := range slices.Sorted(maps.Keys(kv)) {
		if err := ggufWriteKV(f, arch, key, kv[key]); err != nil {
	for _, key := range slices.Sorted(kv.Keys()) {
		if err := ggufWriteKV(f, arch, key, kv.Value(key)); err != nil {
			return err
		}
	}

@@ -14,25 +14,28 @@ make -f Makefile.sync apply-patches

### Updating Base Commit

**Pin to new base commit**
To update to a new base commit:

To change the base commit, update `FETCH_HEAD` in Makefile.sync.
1. **Update FETCH_HEAD** in `Makefile.sync` to the new commit hash.

When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
2. **Check for upstreamed patches**: Before applying, review if any patches have been merged upstream. Remove those patches from `./patches/` to avoid conflicts.

Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
3. **Apply patches**:
   ```shell
   make -f Makefile.sync apply-patches
   ```

```shell
make -f Makefile.sync apply-patches
```
4. **Resolve conflicts** (if any): When `git am` fails on a patch:
   - Fix conflicts in `./vendor/`
   - Stage the resolved files: `git -C llama/vendor add <file>`
   - Continue: `git -C llama/vendor am --continue`
   - Re-run: `make -f Makefile.sync apply-patches`
   - Repeat until all patches are applied.

If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, and continue the patch series with `git am --continue` and rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.

Once all patches are applied, commit the changes to the tracking repository.

```shell
make -f Makefile.sync format-patches sync
```
5. **Regenerate patches and sync**:
   ```shell
   make -f Makefile.sync format-patches sync
   ```

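In practice, the whole update amounts to the loop below (a sketch assembled from the commands in the steps above; `<file>` stands in for whichever path conflicts):

```shell
make -f Makefile.sync apply-patches        # stops at the first patch that fails to apply
# for each conflict:
git -C llama/vendor add <file>             # stage the resolved files
git -C llama/vendor am --continue          # continue the patch series
make -f Makefile.sync apply-patches        # re-run until all patches apply
make -f Makefile.sync format-patches sync  # regenerate patches and sync
```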
### Generating Patches

2
llama/build-info.cpp
generated
vendored
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "ec98e2002";
char const *LLAMA_COMMIT = "b1377188784f9aea26b8abde56d4aee8c733eec7";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";

70
llama/llama.cpp/common/common.cpp
vendored
@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
|
||||
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
||||
}
|
||||
|
||||
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
||||
if (setpriority(PRIO_PROCESS, 0, p) != 0) {
|
||||
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
||||
return false;
|
||||
}
|
||||
@@ -1078,12 +1078,15 @@ struct common_init_result::impl {
|
||||
impl() = default;
|
||||
~impl() = default;
|
||||
|
||||
// note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
|
||||
|
||||
llama_model_ptr model;
|
||||
llama_context_ptr context;
|
||||
|
||||
std::vector<llama_adapter_lora_ptr> lora;
|
||||
|
||||
std::vector<common_sampler_ptr> samplers;
|
||||
std::vector<llama_sampler_seq_config> samplers_seq_config;
|
||||
};
|
||||
|
||||
common_init_result::common_init_result(common_params & params) :
|
||||
@@ -1092,9 +1095,9 @@ common_init_result::common_init_result(common_params & params) :
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
if (params.fit_params) {
|
||||
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
|
||||
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
|
||||
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
}
|
||||
|
||||
@@ -1107,6 +1110,25 @@ common_init_result::common_init_result(common_params & params) :
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
// load and optionally apply lora adapters (must be loaded before context creation)
|
||||
for (auto & la : params.lora_adapters) {
|
||||
llama_adapter_lora_ptr lora;
|
||||
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
|
||||
if (lora == nullptr) {
|
||||
LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
        pimpl->model.reset(model);
        return;
    }

    char buf[1024];
    la.ptr = lora.get();
    llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
    la.task_name = buf;
    llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
    la.prompt_prefix = buf;
    pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}

// updates params.sampling
// TODO: fix naming
common_init_sampler_from_model(model, params.sampling);
@@ -1141,10 +1163,19 @@ common_init_result::common_init_result(common_params & params) :
    // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

    // init the backend samplers as part of the context creation
    pimpl->samplers.resize(cparams.n_seq_max);
    pimpl->samplers_seq_config.resize(cparams.n_seq_max);

    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
    }

    // TODO: temporarily gated behind a flag
    if (params.sampling.backend_sampling) {
        cparams.samplers   = pimpl->samplers_seq_config.data();
        cparams.n_samplers = pimpl->samplers_seq_config.size();
    }

    llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1168,6 +1199,12 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
    return pimpl->samplers[seq_id].get();
}

void common_init_result::reset_samplers() {
    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
    }
}

std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
}
@@ -1243,24 +1280,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
        }
    }

    // load and optionally apply lora adapters
    for (auto & la : params.lora_adapters) {
        llama_adapter_lora_ptr lora;
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            return res;
        }

        char buf[1024];
        la.ptr = lora.get();
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
        la.task_name = buf;
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
        la.prompt_prefix = buf;
        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
    }

    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }
@@ -1301,6 +1320,9 @@ common_init_result_ptr common_init_from_params(common_params & params) {
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);

        // reset samplers after warmup to restore the seeded RNG state
        res->reset_samplers();
    }

    return res;
@@ -1339,14 +1361,12 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.devices = params.devices.data();
    }

    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }

    mparams.n_gpu_layers    = params.n_gpu_layers;
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_direct_io   = params.use_direct_io;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
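Note: the hunks above route LoRA metadata and per-sequence samplers through common_init_result. A minimal caller-side sketch, assuming only the accessors introduced in this diff (common_init_from_params, context(), sampler(), reset_samplers()) and a common_params whose model fields are already filled in:

    common_params params;                    // assume model path etc. are set elsewhere
    common_init_result_ptr res = common_init_from_params(params);

    llama_context  * lctx = res->context();
    common_sampler * smpl = res->sampler(0); // per-sequence sampler for seq_id 0

    // ... decode a batch on lctx, then sample from the last output row ...
    const llama_token tok = common_sampler_sample(smpl, lctx, /*idx =*/ -1);

    res->reset_samplers();                   // restore every sampler's seeded RNG state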
35
llama/llama.cpp/common/common.h
vendored
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
//

enum llama_example {
    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
@@ -216,6 +217,8 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    bool backend_sampling = false;

    bool has_logit_bias() const {
        return !logit_bias.empty();
    }
@@ -329,12 +332,14 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
    size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

@@ -370,6 +375,11 @@ struct common_params {
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
    std::string logits_file = ""; // file for saving *all* logits // NOLINT

    // llama-debug specific options
    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
    bool save_logits = false; // whether to save logits to files // NOLINT
    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT

    std::vector<std::string> in_files; // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
@@ -420,7 +430,8 @@ struct common_params {
    bool kv_unified = false; // enable unified KV cache

    bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap = true; // use mmap for faster loads
    bool use_mmap = true; // enable mmap to use filesystem cache
    bool use_direct_io = true; // read from disk without buffering for faster model loading
    bool use_mlock = false; // use mlock to keep model in memory
    bool verbose_prompt = false; // print prompt tokens before generation
    bool display_prompt = true; // print prompt before generation
@@ -475,7 +486,8 @@ struct common_params {
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

    std::vector<std::string> api_keys;

@@ -484,8 +496,11 @@ struct common_params {

    std::map<std::string, std::string> default_template_kwargs;

    // webui configs
    bool webui = true;
    std::string webui_config_json;

    // "advanced" endpoints are disabled by default for better security
    bool webui = true;
    bool endpoint_slots = true;
    bool endpoint_props = false; // only control POST requests, not GET
    bool endpoint_metrics = false;
@@ -685,7 +700,9 @@ struct common_init_result {

    llama_model *   model();
    llama_context * context();

    common_sampler * sampler(llama_seq_id seq_id);
    void reset_samplers();

    std::vector<llama_adapter_lora_ptr> & lora();
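Note: fit_params_target above changes from a single size_t into a per-device vector. A minimal configuration sketch, assuming only the fields shown in this hunk:

    common_params params;
    params.fit_params         = true;
    params.fit_params_min_ctx = 4096;
    // one margin entry per device slot; the new default is 1 GiB each
    params.fit_params_target.assign(llama_max_devices(), 512ull * 1024 * 1024);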
160
llama/llama.cpp/common/sampling.cpp
vendored
@@ -104,10 +104,9 @@ struct ring_buffer {
struct common_sampler {
    common_params_sampling params;

    struct llama_sampler * grmr;
    struct llama_sampler * chain;

    bool grammar;

    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;
@@ -121,17 +120,34 @@ struct common_sampler {
    }

    void set_logits(struct llama_context * ctx, int idx) {
        const auto * logits = llama_get_logits_ith(ctx, idx);
        const float       * sampled_probs  = llama_get_sampled_probs_ith (ctx, idx);
        const float       * sampled_logits = llama_get_sampled_logits_ith (ctx, idx);
        const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);

        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);

        const int n_vocab = llama_vocab_n_tokens(vocab);

        cur.resize(n_vocab);

        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
        if (sampled_probs) {
            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
            cur.resize(sampled_probs_count);
            for (uint32_t i = 0; i < sampled_probs_count; ++i) {
                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
            }
        } else if (sampled_logits) {
            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
            cur.resize(sampled_logits_count);
            for (uint32_t i = 0; i < sampled_logits_count; i++) {
                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
            }
        } else {
            const auto * logits = llama_get_logits_ith(ctx, idx);
            GGML_ASSERT(logits != nullptr);
            cur.resize(n_vocab);
            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
            }
        }

        cur_p = { cur.data(), cur.size(), -1, false };
@@ -160,45 +176,50 @@ std::string common_params_sampling::print() const {
    return std::string(result);
}

struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

    llama_sampler * grmr  = nullptr;
    llama_sampler * chain = llama_sampler_chain_init(lparams);

    bool grammar = false;
    std::vector<llama_sampler *> samplers;

    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
        grammar = true;
        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
#else
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
    } else {
        std::vector<std::string> trigger_patterns;
        std::vector<std::string> patterns_anywhere;
        std::vector<llama_token> trigger_tokens;
        for (const auto & trigger : params.grammar_triggers) {
            switch (trigger.type) {
                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
                {
                    const auto & word = trigger.value;
                    patterns_anywhere.push_back(regex_escape(word));
                    trigger_patterns.push_back(regex_escape(word));
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
                {
                    patterns_anywhere.push_back(trigger.value);
                    trigger_patterns.push_back(trigger.value);
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
                {
                    trigger_patterns.push_back(trigger.value);
                    const auto & pattern = trigger.value;
                    std::string anchored = "^$";
                    if (!pattern.empty()) {
                        anchored = (pattern.front() != '^' ? "^" : "")
                            + pattern
                            + (pattern.back() != '$' ? "$" : "");
                    }
                    trigger_patterns.push_back(anchored);
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -212,10 +233,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
            }
        }

        if (!patterns_anywhere.empty()) {
            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
        }

        std::vector<const char *> trigger_patterns_c;
        trigger_patterns_c.reserve(trigger_patterns.size());
        for (const auto & regex : trigger_patterns) {
@@ -224,15 +241,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

        if (!params.grammar.empty()) {
            if (params.grammar_lazy) {
                samplers.push_back(
                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
                        trigger_patterns_c.data(), trigger_patterns_c.size(),
                        trigger_tokens.data(), trigger_tokens.size()));
                grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
                    trigger_patterns_c.data(), trigger_patterns_c.size(),
                    trigger_tokens.data(), trigger_tokens.size());
            } else {
                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
                grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
            }

            grammar = true;
        }
    }

@@ -301,10 +315,16 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
        llama_sampler_chain_add(chain, smpl);
    }

    if (grmr && params.backend_sampling) {
        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);

        params.backend_sampling = false;
    }

    auto * result = new common_sampler {
        /* .params  = */ params,
        /* .grmr    = */ grmr,
        /* .chain   = */ chain,
        /* .grammar = */ grammar,
        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur     = */ {},
        /* .cur_p   = */ {},
@@ -315,6 +335,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);
        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
@@ -324,25 +345,12 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    const auto tm = gsmpl->tm();

    if (gsmpl->grammar) {
        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);

        for (int i = 0; i < n_smpl; i++) {
            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);

            // the grammar sampler is always the first one
            if (i == 0) {
                if (accept_grammar) {
                    llama_sampler_accept(smpl, token);
                }
            } else {
                llama_sampler_accept(smpl, token);
            }
        }
    } else {
        llama_sampler_accept(gsmpl->chain, token);
    if (gsmpl->grmr && accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }

    llama_sampler_accept(gsmpl->chain, token);

    gsmpl->prev.push_back(token);
}

@@ -353,8 +361,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
        /* .params  = */ gsmpl->params,
        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
        /* .grammar = */ gsmpl->grammar,
        /* .prev    = */ gsmpl->prev,
        /* .cur     = */ gsmpl->cur,
        /* .cur_p   = */ gsmpl->cur_p,
@@ -410,7 +418,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
    return gsmpl->chain;
}

llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -418,11 +426,61 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

    llama_token id = LLAMA_TOKEN_NULL;

    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    {
        id = llama_get_sampled_token_ith(ctx, idx);

        if (id != LLAMA_TOKEN_NULL) {
            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);

            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");

            // TODO: simplify
            gsmpl->cur.resize(1);
            gsmpl->cur[0] = { id, 0.0f, 1.0f };
            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };

            return id;
        }
    }

    gsmpl->set_logits(ctx, idx);

    if (grammar_first) {
        llama_sampler_apply(grmr, &cur_p);
    }

    llama_sampler_apply(chain, &cur_p);

    id = cur_p.data[cur_p.selected].id;

    if (grammar_first) {
        return id;
    }

    // check if the sampled token fits the grammar (grammar-based rejection sampling)
    {
        llama_token_data single_token_data = { id, 1.0f, 0.0f };
        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };

        llama_sampler_apply(grmr, &single_token_data_array);

        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
        if (is_valid) {
            return id;
        }
    }

    // resampling:
    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(grmr,  &cur_p);
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -432,7 +490,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    return id;
}

std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;
@@ -440,7 +498,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

    size_t i = 0;
    for (; i < draft.size(); i++) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

        common_sampler_accept(gsmpl, id, true);

@@ -452,7 +510,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    }

    if (i == draft.size()) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

        common_sampler_accept(gsmpl, id, true);

@@ -462,13 +520,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    return result;
}

std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
}

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
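Note: the default path in common_sampler_sample above is grammar-based rejection sampling — the chain samples first, the token is validated against the grammar, and only on a violation is the grammar applied and the chain rerun. Passing grammar_first = true instead constrains the candidates up front. A short usage sketch based on the signatures in this diff:

    // all candidates (not just the sampled token) must satisfy the grammar
    const llama_token id = common_sampler_sample(gsmpl, ctx, /*idx =*/ -1, /*grammar_first =*/ true);
    common_sampler_accept(gsmpl, id, /*accept_grammar =*/ true);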
13
llama/llama.cpp/common/sampling.h
vendored
@@ -36,7 +36,8 @@ struct common_sampler;

// llama_sampler API overloads

struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
// note: can mutate params in some cases
struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);

void common_sampler_free(struct common_sampler * gsmpl);

@@ -48,6 +49,7 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

// get the underlying llama_sampler_chain
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);

// extended sampling implementation:
@@ -57,7 +59,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

// generalized version of common_sampler_sample
//
@@ -75,10 +80,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
//
// returns at least 1 token, up to idxs.size()
//
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);

// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
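Note: the second overload above is the speculative-decoding entry point and assumes idxs == [0, 1, ..., draft.size()]. A sketch of verifying a draft sequence (the draft contents here are hypothetical):

    llama_tokens draft = { 42, 137, 7 }; // tokens proposed by a draft model
    std::vector<llama_token> accepted = common_sampler_sample_and_accept_n(gsmpl, ctx, draft);
    // accepted.size() is between 1 and draft.size() + 1; the final token is the
    // first one that diverged from the draft (or the bonus token if none did)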
119
llama/llama.cpp/include/llama.h
vendored
@@ -286,7 +286,7 @@ extern "C" {
        // NULL-terminated list of buffer types to use for tensors that match a pattern
        const struct llama_model_tensor_buft_override * tensor_buft_overrides;

        int32_t n_gpu_layers; // number of layers to store in VRAM
        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs

        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -309,6 +309,7 @@ extern "C" {
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only;      // only load the vocabulary, no weights
        bool use_mmap;        // use mmap if possible
        bool use_direct_io;   // use direct io, takes precedence over use_mmap
        bool use_mlock;       // force system to keep model in RAM
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -316,6 +317,11 @@ extern "C" {
        bool no_alloc; // only load metadata and simulate memory allocations
    };

    struct llama_sampler_seq_config {
        llama_seq_id seq_id;
        struct llama_sampler * sampler;
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
    // https://github.com/ggml-org/llama.cpp/pull/7544
    struct llama_context_params {
@@ -364,6 +370,12 @@ extern "C" {
        bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                         // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                         // ref: https://github.com/ggml-org/llama.cpp/pull/14363

        // [EXPERIMENTAL]
        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
        struct llama_sampler_seq_config * samplers;
        size_t n_samplers;
    };

    // model quantization parameters
@@ -467,16 +479,23 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

    enum llama_params_fit_status {
        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
    };

    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
    // returns true if the parameters could be successfully modified to fit device memory
    // this function is NOT thread safe because it modifies the global llama logger state
    LLAMA_API bool llama_params_fit(
    // - returns true if the parameters could be successfully modified to fit device memory
    // - this function is NOT thread safe because it modifies the global llama logger state
    // - only parameters that have the same value as in llama_default_model_params are modified
    LLAMA_API enum llama_params_fit_status llama_params_fit(
        const char * path_model,
        struct llama_model_params * mparams,
        struct llama_context_params * cparams,
        float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
        struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
        size_t margin, // margin of memory to leave per device in bytes
        size_t * margins, // margins of memory to leave per device in bytes
        uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
        enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log

@@ -517,6 +536,7 @@ extern "C" {
    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
@@ -600,6 +620,8 @@ extern "C" {
    //

    // Load a LoRA adapter from file
    // The adapter is valid as long as the associated model is not freed
    // All adapters must be loaded before context creation
    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
        struct llama_model * model,
        const char * path_lora);
@@ -983,6 +1005,32 @@ extern "C" {
    // otherwise: float[n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

    //
    // backend sampling API [EXPERIMENTAL]
    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
    //

    // Get the backend sampled token for the ith token.
    // Returns LLAMA_TOKEN_NULL if no token was sampled.
    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);

    // Get the backend sampled probabilities for the ith token
    // The index matches llama_get_sampled_token_ith().
    // Returns NULL if no probabilities were generated.
    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);

    // Get the backend sampled logits for the ith token
    // Returns NULL if no logits were sampled.
    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);

    // Get the backend sampled candidates (token ids) for the ith token
    // These are needed to map probability/logit indices to vocab token ids.
    // Returns NULL if no candidates were sampled.
    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);

    //
    // Vocab
    //
@@ -1154,11 +1202,16 @@ extern "C" {
    //
    //    llama_sampler_free(smpl);
    //
    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
    //

    typedef void * llama_sampler_context_t;

    struct llama_sampler_data {
        struct ggml_tensor * logits;
        struct ggml_tensor * probs;
        struct ggml_tensor * sampled;
        struct ggml_tensor * candidates;
    };

    // user code can implement the interface below in order to create custom llama_sampler
    struct llama_sampler_i {
        const char * (*name) (const struct llama_sampler * smpl); // can be NULL
@@ -1168,17 +1221,45 @@ extern "C" {
        struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
        void                   (*free)  (      struct llama_sampler * smpl); // can be NULL if ctx is NULL

        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
        // [EXPERIMENTAL]
        // backend sampling interface:

        // return true if the backend supports all ops needed by the sampler
        // note: call once per sampler
        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);

        // call after .backend_apply()
        void (*backend_accept)(
                struct llama_sampler * smpl,
                struct ggml_context * ctx,
                struct ggml_cgraph * gf,
                struct ggml_tensor * selected_token);

        // call after .backend_init()
        void (*backend_apply)(
                struct llama_sampler * smpl,
                struct ggml_context * ctx,
                struct ggml_cgraph * gf,
                struct llama_sampler_data * data);

        // called before graph execution to set inputs for the current ubatch
        void (*backend_set_input)(struct llama_sampler * smpl);
    };

    struct llama_sampler {
        const struct llama_sampler_i * iface;
        llama_sampler_context_t ctx;
        struct llama_sampler_i * iface;

        llama_sampler_context_t ctx;
    };

    // [EXPERIMENTAL]
    // attach a sampler to the context
    // note: prefer initializing the context with llama_context_params.samplers when possible
    // note: changing the samplers of a context can cause graph reallocations and degraded performance
    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);

    // mirror of llama_sampler_i:
    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
    LLAMA_API struct llama_sampler * llama_sampler_init  (      struct llama_sampler_i * iface, llama_sampler_context_t ctx);
    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1194,7 +1275,15 @@ extern "C" {

    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
    LLAMA_API void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl);
    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);

    // return NULL if:
    // - the sampler is NULL
    // - the sampler is not a llama_sampler_chain
    // - the index is out of bounds, unless i == -1
    // - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
    LLAMA_API struct llama_sampler * llama_sampler_chain_get(struct llama_sampler * chain, int32_t i);

    // the total number of samplers in the chain
    LLAMA_API int llama_sampler_chain_n(const struct llama_sampler * chain);

    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -1203,7 +1292,9 @@ extern "C" {
    // available samplers:

    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

    /// seed == LLAMA_DEFAULT_SEED to use a random seed.
    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    /// Setting k <= 0 makes this a noop
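Note: putting the experimental pieces above together, a context can be created with one backend sampler chain per sequence and the result read back with llama_get_sampled_token_ith. A hedged sketch using only APIs declared in this header (model is assumed to be an already-loaded llama_model *):

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    struct llama_sampler_seq_config cfg = { /*seq_id =*/ 0, /*sampler =*/ chain };

    llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = &cfg; // the caller must keep the chain alive
    cparams.n_samplers = 1;

    llama_context * ctx = llama_init_from_model(model, cparams);
    // ... llama_decode(ctx, batch) ...
    const llama_token tok = llama_get_sampled_token_ith(ctx, -1);
    if (tok == LLAMA_TOKEN_NULL) {
        // the backend did not sample; fall back to CPU sampling from the logits
    }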
15
llama/llama.cpp/src/llama-adapter.cpp
vendored
@@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    return nullptr;
}

static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    llama_model & model = adapter.model;

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
@@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
        }
    }

    // update number of nodes used
    model.n_lora_nodes += adapter.get_n_nodes();

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora();
    llama_adapter_lora * adapter = new llama_adapter_lora(*model);

    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        llama_adapter_lora_init_impl(path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    // update number of nodes used
    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
    adapter->model.n_lora_nodes -= adapter->get_n_nodes();

    delete adapter;
}
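Note: with the change above, an adapter now holds a reference to its model and books its graph-node budget in model.n_lora_nodes. A caller-side sketch using only functions seen in this diff (the adapter path is hypothetical):

    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "my-adapter.gguf");
    if (adapter != nullptr) {
        char buf[1024];
        llama_adapter_meta_val_str(adapter, "adapter.lora.task_name", buf, sizeof(buf));
        // ... use the adapter ...
        llama_adapter_lora_free(adapter); // also returns its nodes to model.n_lora_nodes
    }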
8
llama/llama.cpp/src/llama-adapter.h
vendored
@@ -59,6 +59,8 @@ struct llama_adapter_lora_weight {
};

struct llama_adapter_lora {
    llama_model & model;

    // map tensor name to lora_a_b
    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

@@ -73,10 +75,14 @@ struct llama_adapter_lora {
    // activated lora (aLoRA)
    std::vector<llama_token> alora_invocation_tokens;

    llama_adapter_lora() = default;
    llama_adapter_lora(llama_model & model) : model(model) {}
    ~llama_adapter_lora() = default;

    llama_adapter_lora_weight * get_weight(ggml_tensor * w);

    uint32_t get_n_nodes() const {
        return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
    }
};

using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
80
llama/llama.cpp/src/llama-arch.cpp
vendored
@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_STARCODER, "starcoder" },
    { LLM_ARCH_REFACT, "refact" },
    { LLM_ARCH_BERT, "bert" },
    { LLM_ARCH_MODERN_BERT, "modern-bert" },
    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
    { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
    { LLM_ARCH_NEO_BERT, "neo-bert" },
@@ -41,6 +42,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_PHIMOE, "phimoe" },
    { LLM_ARCH_PLAMO, "plamo" },
    { LLM_ARCH_PLAMO2, "plamo2" },
    { LLM_ARCH_PLAMO3, "plamo3" },
    { LLM_ARCH_CODESHELL, "codeshell" },
    { LLM_ARCH_ORION, "orion" },
    { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -115,6 +117,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_RND1, "rnd1" },
    { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
    { LLM_ARCH_MISTRAL3, "mistral3" },
    { LLM_ARCH_MIMO2, "mimo2" },
    { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
    { LLM_ARCH_MAINCODER, "maincoder" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -148,6 +153,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
    { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
    { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
    { LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
    { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
    { LLM_KV_BLOCK_COUNT, "%s.block_count" },
    { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
@@ -205,6 +211,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
    { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
    { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -216,6 +223,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
    { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
@@ -500,6 +508,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_MISTRAL3:
        case LLM_ARCH_LLAMA_EMBED:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
@@ -781,6 +790,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_CLS,
                LLM_TENSOR_CLS_OUT,
            };
        case LLM_ARCH_MODERN_BERT:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_TOKEN_EMBD_NORM,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_ATTN_QKV,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_CLS,
                LLM_TENSOR_CLS_OUT,
            };
        case LLM_ARCH_JINA_BERT_V2:
            return {
                LLM_TENSOR_TOKEN_EMBD,
@@ -1060,6 +1083,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_ATTN_POST_NORM,
                LLM_TENSOR_FFN_POST_NORM,
            };
        case LLM_ARCH_PLAMO3:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_QKV,
                LLM_TENSOR_ATTN_Q_NORM,
                LLM_TENSOR_ATTN_K_NORM,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_ATTN_POST_NORM,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_POST_NORM,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
            };
        case LLM_ARCH_CODESHELL:
            return {
                LLM_TENSOR_TOKEN_EMBD,
@@ -2040,6 +2079,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM_LFM2,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_DENSE_2_OUT,
            };
        case LLM_ARCH_LFM2MOE:
            return {
@@ -2058,7 +2098,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_SHORTCONV_INPROJ,
                LLM_TENSOR_SHORTCONV_OUTPROJ,
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT_NORM_LFM2,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
@@ -2174,11 +2214,49 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_VISEXP_FFN_DOWN,
                LLM_TENSOR_VISEXP_FFN_UP,
            };
        case LLM_ARCH_MIMO2:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_SINKS,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_EXP_PROBS_B,
            };
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_UNKNOWN:
            return {
                LLM_TENSOR_TOKEN_EMBD,
            };
        case LLM_ARCH_MAINCODER:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_Q_NORM,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_K_NORM,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
            };
        case LLM_ARCH_SOLAR:
            return {
                LLM_TENSOR_TOKEN_EMBD,
8
llama/llama.cpp/src/llama-arch.h
vendored
@@ -24,6 +24,7 @@ enum llm_arch {
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_MODERN_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_NOMIC_BERT_MOE,
    LLM_ARCH_NEO_BERT,
@@ -45,6 +46,7 @@ enum llm_arch {
    LLM_ARCH_PHIMOE,
    LLM_ARCH_PLAMO,
    LLM_ARCH_PLAMO2,
    LLM_ARCH_PLAMO3,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
@@ -119,6 +121,9 @@ enum llm_arch {
    LLM_ARCH_RND1,
    LLM_ARCH_PANGU_EMBED,
    LLM_ARCH_MISTRAL3,
    LLM_ARCH_MIMO2,
    LLM_ARCH_LLAMA_EMBED,
    LLM_ARCH_MAINCODER,
    LLM_ARCH_UNKNOWN,
};

@@ -152,6 +157,7 @@ enum llm_kv {
    LLM_KV_VOCAB_SIZE,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_EMBEDDING_LENGTH_OUT,
    LLM_KV_FEATURES_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -209,6 +215,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_GATE_LORA_RANK,
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -220,6 +227,7 @@ enum llm_kv {
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_FREQ_BASE_SWA,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
11
llama/llama.cpp/src/llama-chat.cpp
vendored
@@ -74,6 +74,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
    { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
    { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
    { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
};

llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_GROK_2;
    } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
        return LLM_CHAT_TEMPLATE_PANGU_EMBED;
    } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
        return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@@ -845,6 +848,14 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "[unused9]助手:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
        }
        if (add_ass) {
            ss << "<|begin|>assistant";
        }
    } else {
        // template not supported
        return -1;
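Note: derived directly from the loop above (not from upstream documentation), the SOLAR_OPEN branch renders a two-message chat with add_ass = true roughly as:

    <|begin|>system<|content|>You are helpful.<|end|><|begin|>user<|content|>Hi<|end|><|begin|>assistant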
1
llama/llama.cpp/src/llama-chat.h
vendored
@@ -54,6 +54,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_SEED_OSS,
    LLM_CHAT_TEMPLATE_GROK_2,
    LLM_CHAT_TEMPLATE_PANGU_EMBED,
    LLM_CHAT_TEMPLATE_SOLAR_OPEN,
    LLM_CHAT_TEMPLATE_UNKNOWN,
};
682
llama/llama.cpp/src/llama-context.cpp
vendored
@@ -60,6 +60,25 @@ llama_context::llama_context(
|
||||
cparams.cb_eval = params.cb_eval;
|
||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||
|
||||
// Initialize backend samplers here so they are part of the sampling graph
|
||||
// before the reserve passes run later in this function. This avoids a later
|
||||
// re-reserve when graph nodes change.
|
||||
if (params.samplers != nullptr && params.n_samplers > 0) {
|
||||
for (size_t i = 0; i < params.n_samplers; ++i) {
|
||||
const auto & config = params.samplers[i];
|
||||
|
||||
if (llama_sampler_chain_get(config.sampler, -1) == nullptr) {
|
||||
throw std::runtime_error("the backend samplers must be of type llama_sampler_chain");
|
||||
}
|
||||
|
||||
if (set_sampler(config.seq_id, config.sampler)) {
|
||||
const int n_samplers = llama_sampler_chain_n(config.sampler);
|
||||
|
||||
LLAMA_LOG_INFO("%s: setting backend sampler for seq_id %d (n = %d)\n", __func__, config.seq_id, n_samplers);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rope_scaling_type = params.rope_scaling_type;
|
||||
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
|
||||
rope_scaling_type = hparams.rope_scaling_type_train;
|
||||
@@ -231,7 +250,10 @@ llama_context::llama_context(
|
||||
// graph outputs buffer
|
||||
{
|
||||
// resized during inference when a batch uses more outputs
|
||||
if (output_reserve(params.n_seq_max) < params.n_seq_max) {
|
||||
// Create a dummy batch for initialization.
|
||||
llama_batch dummy_batch = {};
|
||||
dummy_batch.n_tokens = 0;
|
||||
if (output_reserve(params.n_seq_max, dummy_batch) < params.n_seq_max) {
|
||||
throw std::runtime_error("failed to reserve initial output buffer");
|
||||
}
|
||||
|
||||
@@ -294,8 +316,8 @@ llama_context::llama_context(
|
||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||
bool pipeline_parallel =
|
||||
model.n_devices() > 1 &&
|
||||
model.params.n_gpu_layers > (int) model.hparams.n_layer &&
|
||||
model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
||||
model.n_gpu_layers() > model.hparams.n_layer &&
|
||||
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
|
||||
cparams.offload_kqv &&
|
||||
!model.has_tensor_overrides();
|
||||
|
||||
@@ -456,26 +478,35 @@ llama_context::llama_context(
|
||||
LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize the full vocabulary token ids for backend samplers.
|
||||
{
|
||||
const int n_vocab = model.vocab.n_tokens();
|
||||
|
||||
sampling.token_ids_full_vocab.resize(n_vocab);
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
sampling.token_ids_full_vocab[i] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llama_context::~llama_context() {
|
||||
// FIXME this currently results in a use-after-free bug if the model is freed before the context
|
||||
// if (!model.hparams.no_alloc) {
|
||||
// for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
// ggml_backend_t backend = backend_ptrs[i];
|
||||
// ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
if (!model.hparams.no_alloc) {
|
||||
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
|
||||
// const size_t size_exp = backend_buf_exp_size[i];
|
||||
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
// if (size_exp == size_act) {
|
||||
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
|
||||
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||
// } else {
|
||||
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
|
||||
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
const size_t size_exp = backend_buf_exp_size[i];
|
||||
const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
if (size_exp == size_act) {
|
||||
LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
|
||||
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
|
||||
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_opt_free(opt_ctx);
|
||||
}
|
||||
|
||||
@@ -617,6 +648,35 @@ float * llama_context::get_logits() {
|
||||
return logits;
|
||||
}
|
||||
|
||||
int64_t llama_context::output_resolve_row(int32_t i) const {
|
||||
int64_t j = -1;
|
||||
|
||||
// support negative indices (last output row)
|
||||
if (i < 0) {
|
||||
j = n_outputs + i;
|
||||
if (j < 0) {
|
||||
throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
|
||||
}
|
||||
} else if ((size_t) i >= output_ids.size()) {
|
||||
throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
|
||||
} else {
|
||||
// use output_ids to translate the batch token index into a row number
|
||||
// that holds this token's data.
|
||||
j = output_ids[i];
|
||||
}
|
||||
|
||||
if (j < 0) {
|
||||
// the batch token was not configured to output anything
|
||||
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
||||
}
|
||||
|
||||
if (j >= n_outputs) {
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
float * llama_context::get_logits_ith(int32_t i) {
|
||||
int64_t j = -1;
|
||||
|
||||
@@ -627,6 +687,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
throw std::runtime_error("no logits");
|
||||
}
|
||||
|
||||
// TODO: use output_resolve_row()
|
||||
if (i < 0) {
|
||||
j = n_outputs + i;
|
||||
if (j < 0) {
|
||||
@@ -663,6 +724,10 @@ float * llama_context::get_embeddings() {
|
||||
return embd;
|
||||
}
|
||||
|
||||
llama_token * llama_context::get_sampled_tokens() const{
|
||||
return sampling.sampled;
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
int64_t j = -1;
|
||||
|
||||
@@ -673,6 +738,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
throw std::runtime_error("no embeddings");
|
||||
}
|
||||
|
||||
        // TODO: use output_resolve_row()
        if (i < 0) {
            j = n_outputs + i;
            if (j < 0) {
@@ -692,7 +758,8 @@ float * llama_context::get_embeddings_ith(int32_t i) {
            throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
        }

        return embd + j*model.hparams.n_embd;
        const uint32_t n_embd_out = model.hparams.get_n_embd_out();
        return embd + j*n_embd_out;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -712,6 +779,136 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
    return it->second.data();
}

llama_token llama_context::get_sampled_token_ith(int32_t idx) {
    output_reorder();

    if (sampling.sampled == nullptr) {
        return LLAMA_TOKEN_NULL;
    }

    try {
        const int64_t row = output_resolve_row(idx);
        GGML_ASSERT(row < (int64_t) sampling.sampled_size);
        return sampling.sampled[row];
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid backend sampled token id %d, reason: %s\n", __func__, idx, err.what());
        return LLAMA_TOKEN_NULL;
    }
}

float * llama_context::get_sampled_probs_ith(int32_t idx) {
    output_reorder();

    if (sampling.probs == nullptr) {
        return nullptr;
    }

    try {
        const int64_t row = output_resolve_row(idx);
        if ((size_t) row >= sampling.probs_count.size() || sampling.probs_count[row] == 0) {
            return nullptr;
        }
        return sampling.probs + row*model.vocab.n_tokens();
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid backend sampled probs id %d, reason: %s\n", __func__, idx, err.what());
        return nullptr;
    }
}

float * llama_context::get_sampled_logits_ith(int32_t idx) {
    output_reorder();

    if (sampling.logits == nullptr) {
        return nullptr;
    }

    try {
        const int64_t row = output_resolve_row(idx);
        if ((size_t) row >= sampling.logits_count.size() || sampling.logits_count[row] == 0) {
            return nullptr;
        }
        return sampling.logits + row*model.vocab.n_tokens();
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid backend sampled logits id %d, reason: %s\n", __func__, idx, err.what());
        return nullptr;
    }
}

const llama_token * llama_context::get_sampled_candidates_ith(int32_t idx) {
    output_reorder();

    try {
        const int64_t row = output_resolve_row(idx);
        if (sampling.candidates != nullptr &&
            (size_t) row < sampling.candidates_count.size() &&
            sampling.candidates_count[row] > 0) {
            return sampling.candidates + row*model.vocab.n_tokens();
        }
    } catch (const std::exception & err) {
        // fallback to full vocab list
    }

    return sampling.token_ids_full_vocab.data();
}

size_t llama_context::get_sampled_candidates_count(int32_t idx) {
    output_reorder();

    if (sampling.candidates == nullptr) {
        return 0;
    }

    try {
        const int64_t row = output_resolve_row(idx);
        if ((size_t) row >= sampling.candidates_count.size()) {
            return 0;
        }
        return sampling.candidates_count[row];
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid backend sampled candidates count id %d, reason: %s\n", __func__, idx, err.what());
        return 0;
    }
}

size_t llama_context::get_sampled_logits_count(int32_t idx) {
    output_reorder();

    if (sampling.logits == nullptr) {
        return model.vocab.n_tokens();
    }

    try {
        const int64_t row = output_resolve_row(idx);
        if ((size_t) row >= sampling.logits_count.size()) {
            return 0;
        }
        return sampling.logits_count[row];
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid backend sampled logits count id %d, reason: %s\n", __func__, idx, err.what());
        return 0;
    }
}

size_t llama_context::get_sampled_probs_count(int32_t idx) {
    output_reorder();

    if (sampling.probs == nullptr) {
        return 0;
    }

    try {
        const int64_t row = output_resolve_row(idx);
        if ((size_t) row >= sampling.probs_count.size()) {
            return 0;
        }
        return sampling.probs_count[row];
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid backend sampled probs count id %d, reason: %s\n", __func__, idx, err.what());
        return 0;
    }
}
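
Note the shared failure mode: these accessors return LLAMA_TOKEN_NULL, nullptr, or 0 rather than throwing across the API boundary. A minimal caller-side sketch (hypothetical code, not part of this change) of how that soft fallback composes with the existing CPU path:

    // Hypothetical caller: prefer the backend-sampled token, fall back to CPU.
    llama_token tok = ctx->get_sampled_token_ith(i);
    if (tok == LLAMA_TOKEN_NULL) {
        // no backend sampler produced a token for this output row:
        // sample on the CPU from the regular logits as before
        float * logits = ctx->get_logits_ith(i);
        // ... existing CPU-side sampling ...
    }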

void llama_context::attach_threadpool(
        ggml_threadpool_t threadpool,
        ggml_threadpool_t threadpool_batch) {
@@ -768,6 +965,42 @@ void llama_context::set_warmup(bool value) {
    cparams.warmup = value;
}

bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
    LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);

    const bool can_offload =
        sampler &&
        sampler->iface->backend_init &&
        sampler->iface->backend_apply &&
        llama_sampler_chain_n(sampler) > 0;

    if (sampler && can_offload) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
        auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
        if (host_buft) {
            buft = host_buft;
        }

        sampler->iface->backend_init(sampler, buft);

        sampling.samplers[seq_id] = sampler;

        return true;
    }

    if (sampler && !can_offload) {
        LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);

        sampling.samplers.erase(seq_id);

        return false;
    }

    sampling.samplers.erase(seq_id);

    return true;
}
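
To make the offload criteria concrete, a minimal sketch of attaching a sampler per sequence from the public API (the chain composition is illustrative; whether it is accepted depends on the chain providing backend_init/backend_apply, per the check above):

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    // returns false if the chain cannot be offloaded; the sequence then
    // keeps using CPU-side sampling
    if (!llama_set_sampler(ctx, /*seq_id=*/0, chain)) {
        // fall back to sampling from llama_get_logits_ith()
    }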

void llama_context::set_adapter_lora(
        llama_adapter_lora * adapter,
        float scale) {
@@ -908,7 +1141,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
    n_queued_tokens += n_tokens;

    // reserve output buffer
    if (output_reserve(n_tokens) < n_tokens) {
    if (output_reserve(n_tokens, batch_inp) < n_tokens) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
        return -2;
    };
@@ -962,9 +1195,10 @@ int llama_context::encode(const llama_batch & batch_inp) {
            {
                // extract token embeddings
                GGML_ASSERT(embd != nullptr);
                const uint32_t n_embd_out = hparams.get_n_embd_out();

                GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
                ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
                GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
                ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
            } break;
        case LLAMA_POOLING_TYPE_MEAN:
        case LLAMA_POOLING_TYPE_CLS:
@@ -1032,6 +1266,112 @@ int llama_context::encode(const llama_batch & batch_inp) {
    return 0;
}

static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
    std::map<llama_seq_id, uint32_t> seq_to_row;
    // how many output tokens we have seen so far for this ubatch.
    uint32_t local = 0;
    for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
        // skip tokens that are not output.
        if (!ubatch.output[i]) {
            continue;
        }

        const llama_seq_id seq_id = ubatch.seq_id[i][0];
        // row_offset is the number of output tokens before this ubatch.
        seq_to_row[seq_id] = row_offset + local;
        ++local;
    }
    return seq_to_row;
}
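
A worked example of the mapping (illustrative values only):

    // ubatch.n_tokens = 4
    // ubatch.output   = {0, 1, 0, 1}   // tokens 1 and 3 are outputs
    // ubatch.seq_id[1][0] = 2, ubatch.seq_id[3][0] = 0
    // row_offset = 5                   // outputs emitted by earlier ubatches
    //
    // build_seq_to_output_row() returns { 0 -> 6, 2 -> 5 }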

static void copy_tensor_async_ints(
        const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
        llama_token * sampled,
        size_t sampled_size,
        const std::map<llama_seq_id, uint32_t> & seq_to_row,
        ggml_backend_sched_t sched) {
    if (sampled == nullptr) {
        return;
    }

    for (const auto & [seq_id, tensor] : tensor_map) {
        auto it = seq_to_row.find(seq_id);
        if (it == seq_to_row.end()) {
            continue;
        }

        const uint32_t row = it->second;
        GGML_ASSERT(row < sampled_size);

        GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");

        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
        ggml_backend_tensor_get_async(backend, tensor, sampled + row, 0, sizeof(sampled[row]));
    }
}

static void copy_tensor_async_floats(
        const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
        float * dst,
        size_t stride,
        std::vector<uint32_t> & counts,
        const std::map<llama_seq_id, uint32_t> & seq_to_row,
        ggml_backend_sched_t sched) {
    if (dst == nullptr) {
        return;
    }

    for (const auto & [seq_id, tensor] : tensor_map) {
        auto it = seq_to_row.find(seq_id);
        if (it == seq_to_row.end()) {
            continue;
        }

        const uint32_t row = it->second;
        GGML_ASSERT(row < counts.size());

        GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");

        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
        float * row_ptr = dst + (size_t) row * stride;
        ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));

        // Update the actual number of logits/probabilities that were written for this row.
        counts[row] = ggml_nelements(tensor);
    }
}

static void copy_tensor_async_candidates(
        const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
        llama_token * dst,
        size_t stride,
        std::vector<uint32_t> & counts,
        const std::map<llama_seq_id, uint32_t> & seq_to_row,
        ggml_backend_sched_t sched) {
    if (dst == nullptr) {
        return;
    }

    for (const auto & [seq_id, tensor] : tensor_map) {
        auto it = seq_to_row.find(seq_id);
        if (it == seq_to_row.end()) {
            continue;
        }

        const uint32_t row = it->second;
        GGML_ASSERT(row < counts.size());

        GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");

        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
        llama_token * row_ptr = dst + (size_t) row * stride;
        ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));

        // Update the actual number of candidates that were written.
        counts[row] = ggml_nelements(tensor);
    }
}

int llama_context::decode(const llama_batch & batch_inp) {
    GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT

@@ -1052,8 +1392,35 @@ int llama_context::decode(const llama_batch & batch_inp) {
    const int64_t n_embd = hparams.n_embd_inp();

    const bool output_all = false;
    const bool has_samplers = !sampling.samplers.empty();

    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
    const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;

    // TODO: avoid this workaround in the future
    if (has_samplers && batch_inp.logits) {
        std::vector<int32_t> seq_output_count(n_seq_max, 0);

        for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
            if (batch_inp.logits[i] == 0) {
                continue;
            }

            const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;

            for (int32_t s = 0; s < ns; ++s) {
                const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;

                seq_output_count[seq_id]++;
                if (seq_output_count[seq_id] > 1) {
                    LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
                        __func__, seq_id, seq_output_count[seq_id]);
                    return -1;
                }
            }
        }
    }

    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
        return -1;
    }
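
A batch that satisfies this restriction marks at most one output token per sequence. A sketch (hypothetical caller code; the per-sequence arrays are assumed) of a compliant two-sequence batch:

    llama_batch batch = llama_batch_init(/*n_tokens=*/2, /*embd=*/0, /*n_seq_max=*/1);
    batch.n_tokens = 2;
    for (int s = 0; s < 2; ++s) {
        batch.token[s]     = next_token[s]; // assumed caller-side array
        batch.pos[s]       = pos[s];        // assumed caller-side array
        batch.n_seq_id[s]  = 1;
        batch.seq_id[s][0] = s;
        batch.logits[s]    = true;          // exactly one output per sequence
    }
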
@@ -1134,7 +1501,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
    }

    // reserve output buffer
    if (output_reserve(n_outputs_all) < n_outputs_all) {
    if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
        return -2;
    };
@@ -1207,7 +1574,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
        }

        // extract logits
        if (t_logits && n_outputs > 0) {
        // For multi-sequence batches that mix backend samplers and the CPU sampler
        // this is currently inefficient as we copy all logits even for the
        // backend sampled tokens.
        if (logits && t_logits && n_outputs > 0) {
            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
            GGML_ASSERT(backend_res != nullptr);
            GGML_ASSERT(logits != nullptr);
@@ -1222,7 +1592,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
        }

        // extract embeddings
        if (t_embd && n_outputs > 0) {
        if (embd && t_embd && n_outputs > 0) {
            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
            GGML_ASSERT(backend_embd != nullptr);

@@ -1231,12 +1601,13 @@ int llama_context::decode(const llama_batch & batch_inp) {
                {
                    // extract token embeddings
                    GGML_ASSERT(embd != nullptr);
                    float * embd_out = embd + n_outputs_prev*n_embd;
                    const uint32_t n_embd_out = hparams.get_n_embd_out();
                    float * embd_out = embd + n_outputs_prev*n_embd_out;

                    if (n_outputs) {
                        GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
                        GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size);
                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float));
                        GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd_size);
                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
                    }
                } break;
            case LLAMA_POOLING_TYPE_MEAN:
@@ -1276,6 +1647,22 @@ int llama_context::decode(const llama_batch & batch_inp) {
            }
        }

        // This flag indicates whether a backend sampler has actually sampled a specific
        // token, or if it has produced probabilities. If true, we can skip the normal copying of logits and embeddings.
        const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();

        if (has_samplers && has_sampled) {
            const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
            const auto stride = n_vocab;

            // async copy the sampling data from the backend to the host
            copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());

            copy_tensor_async_floats (res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
            copy_tensor_async_floats (res->t_sampled_probs,  sampling.probs,  stride, sampling.probs_count,  seq_to_output_row, sched.get());
            copy_tensor_async_candidates(res->t_candidates,  sampling.candidates, stride, sampling.candidates_count, seq_to_output_row, sched.get());
        }

        n_outputs_prev += n_outputs;
    } while (mctx->next());

@@ -1339,15 +1726,15 @@ int llama_context::decode(const llama_batch & batch_inp) {
// output
//

uint32_t llama_context::output_reserve(int32_t n_outputs) {
uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & batch) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

    const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());

    const auto n_batch = cparams.n_batch;
    const auto n_vocab = vocab.n_tokens();
    const auto n_embd  = hparams.n_embd;
    const auto n_batch    = cparams.n_batch;
    const auto n_vocab    = vocab.n_tokens();
    const auto n_embd_out = hparams.get_n_embd_out();

    bool has_logits = true;
    bool has_embd   = cparams.embeddings;
@@ -1358,8 +1745,53 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
        has_embd = true;
    }

    logits_size = has_logits ? n_vocab*n_outputs_max : 0;
    embd_size   = has_embd   ? n_embd*n_outputs_max  : 0;
    // Check which sampling modes are needed for the current batch.
    // TODO: avoid this branching by working with the worst-case
    bool has_sampling = false;
    bool cpu_logits   = false;

    if (batch.logits) {
        for (int32_t i = 0; i < batch.n_tokens; i++) {
            if (!batch.logits[i]) {
                continue;
            }
            for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
                llama_seq_id seq_id = batch.seq_id[i][j];
                if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
                    has_sampling = true;
                } else {
                    cpu_logits = true;
                }
            }
        }
    } else {
        // When batch.logits is nullptr (when loading state with a dummy batch),
        // allocate CPU logits.
        cpu_logits = true;
    }

    size_t backend_float_count = 0;
    size_t backend_token_count = 0;

    // Allocate CPU logits buffer only if needed by sequences in this batch
    logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
    embd_size   = has_embd ? n_embd_out*n_outputs_max : 0;

    // TODO: avoid this branching by working with the worst-case
    if (!has_sampling) {
        sampling.logits_size     = 0;
        sampling.probs_size      = 0;
        sampling.sampled_size    = 0;
        sampling.candidates_size = 0;
    } else {
        sampling.logits_size     = n_vocab*n_outputs_max;
        sampling.probs_size      = n_vocab*n_outputs_max;
        sampling.sampled_size    = n_outputs_max;
        sampling.candidates_size = n_vocab*n_outputs_max;

        backend_float_count = sampling.logits_size  + sampling.probs_size;
        backend_token_count = sampling.sampled_size + sampling.candidates_size;
    }

    if (output_ids.empty()) {
        // init, never resized afterwards
@@ -1367,7 +1799,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
    }

    const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
    const size_t new_size = (logits_size + embd_size) * sizeof(float);
    const size_t new_size =
        (logits_size + embd_size + backend_float_count) * sizeof(float) +
        (                          backend_token_count) * sizeof(llama_token);

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
@@ -1375,9 +1809,11 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
    if (buf_output) {
#ifndef NDEBUG
        // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
        LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
        LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
        synchronize();

        // TODO: not needed?
        buf_output = nullptr;
        logits = nullptr;
        embd = nullptr;
@@ -1399,8 +1835,49 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {

    float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());

    logits = has_logits ? output_base               : nullptr;
    embd   = has_embd   ? output_base + logits_size : nullptr;
    logits = nullptr;
    embd   = nullptr;

    size_t offset  = 0;
    uint8_t * base = (uint8_t *) output_base;

    logits = (has_logits && cpu_logits) ? output_base : nullptr;
    offset += logits_size * sizeof(float);

    embd = has_embd ? (float *) (base + offset) : nullptr;
    offset += embd_size * sizeof(float);

    sampling.logits     = nullptr;
    sampling.probs      = nullptr;
    sampling.sampled    = nullptr;
    sampling.candidates = nullptr;

    if (has_sampling) {
        sampling.logits = (float *) (base + offset);
        offset += sampling.logits_size * sizeof(float);

        sampling.probs = (float *) (base + offset);
        offset += sampling.probs_size * sizeof(float);

        sampling.sampled = (llama_token *) (base + offset);
        offset += sampling.sampled_size * sizeof(llama_token);

        sampling.candidates = (llama_token *) (base + offset);
        offset += sampling.candidates_size * sizeof(llama_token);

        // The count vectors keep track of the actual number of logits/probs/candidates
        // copied from the backend for each output row.

        sampling.logits_count.resize(n_outputs_max);
        sampling.probs_count.resize(n_outputs_max);
        sampling.candidates_count.resize(n_outputs_max);

        std::fill(sampling.logits_count.begin(),     sampling.logits_count.end(),     0);
        std::fill(sampling.probs_count.begin(),      sampling.probs_count.end(),      0);
        std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);

        std::fill_n(sampling.sampled, sampling.sampled_size, LLAMA_TOKEN_NULL);
    }

    // set all ids as invalid (negative)
    std::fill(output_ids.begin(), output_ids.end(), -1);
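
The resulting layout of the single host buffer, in allocation order (a summary of the code above; sizes in elements):

    // [ logits              ]  n_vocab    * n_outputs_max  floats  (only if a sequence needs CPU logits)
    // [ embd                ]  n_embd_out * n_outputs_max  floats  (only if embeddings are enabled)
    // [ sampling.logits     ]  n_vocab    * n_outputs_max  floats  (only if has_sampling)
    // [ sampling.probs      ]  n_vocab    * n_outputs_max  floats  (only if has_sampling)
    // [ sampling.sampled    ]  n_outputs_max               tokens  (only if has_sampling)
    // [ sampling.candidates ]  n_vocab    * n_outputs_max  tokens  (only if has_sampling)
    //
    // new_size = (logits_size + embd_size + backend_float_count) * sizeof(float)
    //          + backend_token_count * sizeof(llama_token)
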
@@ -1429,6 +1906,40 @@ void llama_context::output_reorder() {
            std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
        }
    }

        if (sampling.logits && sampling.logits_size > 0) {
            for (uint64_t k = 0; k < n_vocab; ++k) {
                std::swap(sampling.logits[i0*n_vocab + k], sampling.logits[i1*n_vocab + k]);
            }
        }

        if (sampling.probs && sampling.probs_size > 0) {
            for (uint64_t k = 0; k < n_vocab; ++k) {
                std::swap(sampling.probs[i0*n_vocab + k], sampling.probs[i1*n_vocab + k]);
            }
        }

        if (sampling.candidates && sampling.candidates_size > 0) {
            for (uint64_t k = 0; k < n_vocab; ++k) {
                std::swap(sampling.candidates[i0*n_vocab + k], sampling.candidates[i1*n_vocab + k]);
            }
        }

        if (sampling.sampled && sampling.sampled_size > 0) {
            std::swap(sampling.sampled[i0], sampling.sampled[i1]);
        }

        if (!sampling.logits_count.empty()) {
            std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
        }

        if (!sampling.probs_count.empty()) {
            std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
        }

        if (!sampling.candidates_count.empty()) {
            std::swap(sampling.candidates_count[i0], sampling.candidates_count[i1]);
        }
    }

    output_swaps.clear();
@@ -1442,7 +1953,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
    if (model.arch == LLM_ARCH_QWEN3NEXT) {
        return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
    }
    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
    uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
    res += model.n_lora_nodes;
    return res;
}

llm_graph_result * llama_context::get_gf_res_reserve() const {
@@ -1456,7 +1969,7 @@ ggml_cgraph * llama_context::graph_reserve(

    if (n_tokens % n_seqs != 0) {
        n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
        n_outputs = std::min(n_outputs, n_tokens);
        n_outputs = std::max(n_outputs, n_tokens);

        LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
    }
@@ -1475,6 +1988,15 @@ ggml_cgraph * llama_context::graph_reserve(
    llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
    llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

    // set one output token per sequence in order to activate all backend samplers
    std::vector<llama_seq_id> seq_ids(n_seqs);
    for (uint32_t i = 0; i < n_seqs; ++i) {
        seq_ids[i] = i;
        ubatch.n_seq_id[i] = 1;
        ubatch.seq_id[i] = &seq_ids[i];
        ubatch.output[i] = true;
    }

    auto * res = gf_res_reserve.get();

    const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
@@ -1505,7 +2027,7 @@ llm_graph_params llama_context::graph_params(
        llm_graph_result * res,
        const llama_ubatch & ubatch,
        const llama_memory_context_i * mctx,
        llm_graph_type gtype) const {
        llm_graph_type gtype) const {
    return {
        /*.arch =*/ model.arch,
        /*.hparams =*/ model.hparams,
@@ -1518,6 +2040,7 @@ llm_graph_params llama_context::graph_params(
        /*.loras =*/ &loras,
        /*.mctx =*/ mctx,
        /*.cross =*/ &cross,
        /*.samplers =*/ sampling.samplers,
        /*.n_outputs =*/ n_outputs,
        /*.cb =*/ graph_get_cb(),
        /*.res =*/ res,
@@ -1570,7 +2093,7 @@ llm_graph_cb llama_context::graph_get_cb() const {

        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
        // FIXME: fix in ggml_backend_sched
        const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
        if (ubatch.n_tokens < 32 || full_offload) {
            if (il != -1 && strcmp(name, "norm") == 0) {
                const auto & dev_layer = model.dev_layer(il);
@@ -1973,6 +2496,9 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
        }
    }

    // TODO: handle sampling buffers and samplers state ?
    // https://github.com/ggml-org/llama.cpp/pull/17004

    if (memory != nullptr) {
        LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
        memory->state_write(io);
@@ -2005,7 +2531,10 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
    auto n_outputs = this->n_outputs;
    io.read_to(&n_outputs, sizeof(n_outputs));

    if (n_outputs > output_reserve(n_outputs)) {
    // Create a dummy batch for state loading.
    llama_batch dummy_batch = {};
    dummy_batch.n_tokens = 0;
    if (n_outputs > output_reserve(n_outputs, dummy_batch)) {
        throw std::runtime_error("could not reserve outputs");
    }

@@ -2059,6 +2588,9 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
        }
    }

    // TODO: handle sampling buffers and samplers state ?
    // https://github.com/ggml-org/llama.cpp/pull/17004

    if (memory) {
        LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);

@@ -2247,7 +2779,7 @@ void llama_context::opt_epoch_iter(
    }

    // reserve output buffer
    if (output_reserve(n_outputs_all) < n_outputs_all) {
    if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
        GGML_ABORT("TODO: handle this error");
    };
@@ -2392,6 +2924,8 @@ llama_context_params llama_context_default_params() {
        /*.op_offload =*/ true,
        /*.swa_full =*/ true,
        /*.kv_unified =*/ false,
        /*.sampler =*/ nullptr,
        /*.n_sampler =*/ 0,
    };

    return result;
@@ -2551,7 +3085,15 @@ float * llama_get_logits(llama_context * ctx) {
float * llama_get_logits_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return ctx->get_logits_ith(i);
    float * res = nullptr;

    res = ctx->get_sampled_logits_ith(i);

    if (!res) {
        res = ctx->get_logits_ith(i);
    }

    return res;
}

float * llama_get_embeddings(llama_context * ctx) {
@@ -2572,6 +3114,52 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
    return ctx->get_embeddings_seq(seq_id);
}

bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
    return ctx->set_sampler(seq_id, smpl);
}

llama_token llama_get_sampled_token_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return ctx->get_sampled_token_ith(i);
}

float * llama_get_sampled_probs_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return ctx->get_sampled_probs_ith(i);
}

float * llama_get_sampled_logits_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return ctx->get_sampled_logits_ith(i);
}

llama_token * llama_get_sampled_candidates_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return const_cast<llama_token *>(ctx->get_sampled_candidates_ith(i));
}

uint32_t llama_get_sampled_candidates_count_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return static_cast<uint32_t>(ctx->get_sampled_candidates_count(i));
}

uint32_t llama_get_sampled_logits_count_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return static_cast<uint32_t>(ctx->get_sampled_logits_count(i));
}

uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}

// llama adapter API

int32_t llama_set_adapter_lora(
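
Taken together, a minimal end-to-end sketch of the new public API (illustrative only: assumes a loaded context with sequence 0, a decoded prompt, and `pos`/`first_token` supplied by the caller; error handling elided):

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    llama_set_sampler(ctx, 0, chain);

    llama_batch batch = llama_batch_init(1, 0, 1);
    batch.n_tokens     = 1;
    batch.token[0]     = first_token;
    batch.pos[0]       = pos;
    batch.n_seq_id[0]  = 1;
    batch.seq_id[0][0] = 0;
    batch.logits[0]    = true; // one output row -> activates the backend sampler

    for (int i = 0; i < n_predict; ++i) {
        if (llama_decode(ctx, batch) != 0) break;
        const llama_token tok = llama_get_sampled_token_ith(ctx, -1);
        if (tok == LLAMA_TOKEN_NULL) break; // not offloaded: use llama_get_logits_ith()
        batch.token[0] = tok;
        batch.pos[0]   = ++pos;
    }
    llama_batch_free(batch);
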
44 llama/llama.cpp/src/llama-context.h vendored
@@ -70,6 +70,18 @@ struct llama_context {
    float * get_embeddings_ith(int32_t i);
    float * get_embeddings_seq(llama_seq_id seq_id);

    llama_token * get_sampled_tokens() const;
    llama_token get_sampled_token_ith(int32_t idx);

    float * get_sampled_logits_ith(int32_t idx);
    size_t get_sampled_logits_count(int32_t idx);

    float * get_sampled_probs_ith(int32_t idx);
    size_t get_sampled_probs_count(int32_t idx);

    const llama_token * get_sampled_candidates_ith(int32_t idx);
    size_t get_sampled_candidates_count(int32_t idx);

    void attach_threadpool(
            ggml_threadpool_t threadpool,
            ggml_threadpool_t threadpool_batch);
@@ -192,10 +204,13 @@ private:

    // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
    uint32_t output_reserve(int32_t n_outputs);
    uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);

    void output_reorder();

    // map the output row index `i` to batch index
    int64_t output_resolve_row(int32_t i) const;

    //
    // graph
    //
@@ -213,6 +228,8 @@ public:
    ggml_cgraph * graph_reserve(
            uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

    bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);

private:
    llm_graph_params graph_params(
            llm_graph_result * res,
@@ -252,6 +269,31 @@ private:
    size_t embd_size = 0; // capacity (of floats) for embeddings
    float * embd = nullptr;

    // TODO: simplify
    struct sampling_info {
        std::map<llama_seq_id, llama_sampler *> samplers;

        float * logits = nullptr;
        size_t logits_size = 0;

        llama_token * sampled = nullptr;
        size_t sampled_size = 0;

        float * probs = nullptr;
        size_t probs_size = 0;

        llama_token * candidates = nullptr;
        size_t candidates_size = 0;

        std::vector<uint32_t> logits_count;
        std::vector<uint32_t> probs_count;
        std::vector<uint32_t> candidates_count;

        std::vector<llama_token> token_ids_full_vocab;
    };

    sampling_info sampling;

    // sequence embeddings output (map of [n_embd] vectors)
    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
    std::map<llama_seq_id, std::vector<float>> embd_seq;

53 llama/llama.cpp/src/llama-grammar.cpp vendored

@@ -369,6 +369,44 @@ static void print_rule(
    fprintf(file, "\n");
}

//
// Regex utilities
//

size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
    auto find_start_pos = [](const std::smatch & match) {
        // get from the first matched capturing group to the end of the string
        size_t start = std::string::npos;
        for (auto i = 1u; i < match.size(); i++) {
            if (match.length(i) > 0) {
                start = match.position(i);
                break;
            }
        }
        if (start == std::string::npos) {
            start = match.position(0);
        }
        return start;
    };

    if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
        // match against the entire input
        std::smatch match;
        if (std::regex_match(input, match, regex)) {
            return find_start_pos(match);
        }
    }

    // search anywhere
    std::smatch match;
    if (std::regex_search(input, match, regex)) {
        return find_start_pos(match);
    }

    return std::string::npos;
}
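
An illustrative call (the pattern is assumed, chosen only to pin down the capturing-group semantics):

    // pattern = "^[\\s\\S]*?(<tool_call>)[\\s\\S]*$"
    // input   = "some text <tool_call>{...}"
    //
    // find() returns 10, the position of the first capturing group
    // ("<tool_call>"), not 0 (the start of the whole match).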

//
// implementation
//
@@ -1321,21 +1359,10 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
        grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
        grammar.trigger_buffer += piece;

        std::smatch match;
        for (const auto & trigger_pattern : grammar.trigger_patterns) {
            if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
            auto start = trigger_pattern.find(grammar.trigger_buffer);
            if (start != std::string::npos) {
                grammar.awaiting_trigger = false;
                // get from the first matched capturing group to the end of the string
                size_t start = std::string::npos;
                for (auto i = 1u; i < match.size(); i++) {
                    if (match.length(i) > 0) {
                        start = match.position(i);
                        break;
                    }
                }
                if (start == std::string::npos) {
                    start = match.position(0);
                }

                // replay tokens that overlap with [start, end)
                for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {

2 llama/llama.cpp/src/llama-grammar.h vendored

@@ -130,6 +130,8 @@ struct llama_grammar_parser {
struct llama_grammar_trigger_pattern {
    std::string pattern;
    std::regex regex;

    size_t find(const std::string & input) const;
};

struct llama_grammar {

178 llama/llama.cpp/src/llama-graph.cpp vendored

@@ -12,6 +12,7 @@
#include <cassert>
#include <cmath>
#include <cstring>
#include <unordered_set>

void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
    if (ubatch->token) {
@@ -32,7 +33,7 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
    bool res = true;

    res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
    res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
    res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);

    return res;
}
@@ -62,7 +63,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
    bool res = true;

    res &= pos->ne[0] == params.ubatch.n_tokens;
    res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;

    return res;
}
@@ -521,6 +522,43 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
    return res;
}

void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
    // set the inputs only for the active samplers in the current ubatch
    std::unordered_set<llama_seq_id> active_samplers;
    for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
        if (ubatch->output[i]) {
            llama_seq_id seq_id = ubatch->seq_id[i][0];
            active_samplers.insert(seq_id);
        }
    }

    for (auto seq_id : active_samplers) {
        if (samplers.find(seq_id) == samplers.end()) {
            continue;
        }

        auto & sampler = samplers[seq_id];

        if (sampler->iface->backend_set_input) {
            sampler->iface->backend_set_input(sampler);
        }
    }
}

bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
    if (samplers.size() != params.samplers.size()) {
        return false;
    }

    for (const auto & [seq_id, sampler] : params.samplers) {
        if (samplers[seq_id] != sampler) {
            return false;
        }
    }

    return true;
}

//
// llm_graph_result
//
@@ -541,6 +579,10 @@ void llm_graph_result::reset() {
    t_logits = nullptr;
    t_embd = nullptr;
    t_embd_pooled = nullptr;
    t_sampled.clear();
    t_sampled_probs.clear();
    t_sampled_logits.clear();
    t_candidates.clear();

    params = {};

@@ -565,6 +607,38 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
    }
}

void llm_graph_result::set_outputs() {
    if (t_logits != nullptr) {
        ggml_set_output(t_logits);
    }
    if (t_embd != nullptr) {
        ggml_set_output(t_embd);
    }
    if (t_embd_pooled != nullptr) {
        ggml_set_output(t_embd_pooled);
    }
    for (auto & [seq_id, t] : t_sampled) {
        if (t != nullptr) {
            ggml_set_output(t);
        }
    }
    for (auto & [seq_id, t] : t_sampled_probs) {
        if (t != nullptr) {
            ggml_set_output(t);
        }
    }
    for (auto & [seq_id, t] : t_sampled_logits) {
        if (t != nullptr) {
            ggml_set_output(t);
        }
    }
    for (auto & [seq_id, t] : t_candidates) {
        if (t != nullptr) {
            ggml_set_output(t);
        }
    }
}

bool llm_graph_result::can_reuse(const llm_graph_params & params) {
    if (!this->params.allow_reuse(params)) {
        if (debug > 1) {
@@ -646,6 +720,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    loras (params.loras),
    mctx (params.mctx),
    cross (params.cross),
    samplers (params.samplers),
    cb_func (params.cb),
    res (params.res),
    ctx0 (res->get_ctx()),
@@ -1251,6 +1326,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {

    res->add_input(std::move(inp));

    // make sure the produced embeddings are immediately materialized in the ggml graph
    // ref: https://github.com/ggml-org/llama.cpp/pull/18599
    ggml_build_forward_expand(gf, cur);

    return cur;
}

@@ -1834,8 +1913,10 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const

        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
        ggml_set_input(inp->self_kq_mask);
        ggml_set_name(inp->self_kq_mask, "self_kq_mask");

        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
        ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
    }

    {
@@ -1848,8 +1929,10 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const

        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
        ggml_set_input(inp->self_kq_mask_swa);
        ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");

        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
        ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
    }

    return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -1988,14 +2071,18 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
void llm_graph_context::build_dense_out(
        ggml_tensor * dense_2,
        ggml_tensor * dense_3) const {
    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
    if (!cparams.embeddings || !(dense_2 || dense_3)) {
        return;
    }
    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");

    cur = ggml_mul_mat(ctx0, dense_2, cur);
    cur = ggml_mul_mat(ctx0, dense_3, cur);
    if (dense_2) {
        cur = ggml_mul_mat(ctx0, dense_2, cur);
    }
    if (dense_3) {
        cur = ggml_mul_mat(ctx0, dense_3, cur);
    }
    cb(cur, "result_embd_pooled", -1);
    res->t_embd_pooled = cur;
    ggml_build_forward_expand(gf, cur);
@@ -2086,6 +2173,87 @@ void llm_graph_context::build_pooling(
    ggml_build_forward_expand(gf, cur);
}

void llm_graph_context::build_sampling() const {
    if (samplers.empty() || !res->t_logits) {
        return;
    }

    auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
    res->add_input(std::move(inp_sampling));

    std::map<llama_seq_id, int32_t> seq_to_logit_row;
    int32_t logit_row_idx = 0;

    for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
        if (ubatch.output[i]) {
            llama_seq_id seq_id = ubatch.seq_id[i][0];
            seq_to_logit_row[seq_id] = logit_row_idx;
            logit_row_idx++;
        }
    }

    // res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
    GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");

    // add a dummy row of logits
    // this trick makes the graph static, regardless of which samplers are activated
    // this is important in order to minimize graph reallocations
    // TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
    ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);

    for (const auto & [seq_id, sampler] : samplers) {
        const auto it = seq_to_logit_row.find(seq_id);

        // inactive samplers always work on the first row
        const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;

        ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
        ggml_format_name(logits_seq, "logits_seq_%d", seq_id);

        struct llama_sampler_data data = {
            /*.logits     =*/ logits_seq,
            /*.probs      =*/ nullptr,
            /*.sampled    =*/ nullptr,
            /*.candidates =*/ nullptr,
        };

        assert(sampler->iface->backend_apply);
        sampler->iface->backend_apply(sampler, ctx0, gf, &data);

        if (data.sampled != nullptr) {
            res->t_sampled[seq_id] = data.sampled;
            ggml_build_forward_expand(gf, data.sampled);
        }

        if (data.probs != nullptr) {
            res->t_sampled_probs[seq_id] = data.probs;
            ggml_build_forward_expand(gf, data.probs);
        }

        if (data.logits != nullptr) {
            res->t_sampled_logits[seq_id] = data.logits;
            ggml_build_forward_expand(gf, data.logits);
        }

        if (data.candidates != nullptr) {
            res->t_candidates[seq_id] = data.candidates;
            ggml_build_forward_expand(gf, data.candidates);
        }
    }

    // TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
    /*
    for (const auto & [seq_id, sampler] : samplers) {
        if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
            ggml_tensor * selected_token = it->second;
            if (selected_token != nullptr) {
                llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
            }
        }
    }
    */
}
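
For orientation, a minimal sketch of what a sampler's backend_apply might add to the graph (a hypothetical greedy sampler, not one of the offloaded samplers shipped with this change; ggml_argmax is an existing ggml op, and the signature matches the call above):

    // Hypothetical greedy backend_apply: argmax over the selected logits row.
    static void greedy_backend_apply(
            llama_sampler * smpl, ggml_context * ctx0, ggml_cgraph * gf,
            llama_sampler_data * data) {
        GGML_UNUSED(smpl);
        GGML_UNUSED(gf);
        // data->logits is the [n_vocab] row that build_sampling() selected;
        // the result tensor is picked up via data->sampled above
        data->sampled = ggml_argmax(ctx0, data->logits);
    }
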
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
    // TODO move to hparams if a T5 variant appears that uses a different value
    const int64_t max_distance = 128;

77 llama/llama.cpp/src/llama-graph.h vendored

@@ -10,6 +10,7 @@
#include <memory>
#include <set>
#include <functional>
#include <map>

struct ggml_cgraph;
struct ggml_context;
@@ -396,6 +397,18 @@ public:
    const llama_memory_hybrid_context * mctx;
};

class llm_graph_input_sampling : public llm_graph_input_i {
public:
    llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
        samplers(std::move(samplers)) { }
    virtual ~llm_graph_input_sampling() = default;

    void set_input(const llama_ubatch * ubatch) override;
    bool can_reuse(const llm_graph_params & params) override;

    std::map<llama_seq_id, llama_sampler *> samplers;
};

//
// llm_graph_result
//
@@ -429,6 +442,23 @@ struct llm_graph_params {
    const llama_memory_context_i * mctx;
    const llama_cross * cross;

    std::map<llama_seq_id, llama_sampler *> samplers;

    static bool samplers_equal(
            const std::map<llama_seq_id, llama_sampler *> & lhs,
            const std::map<llama_seq_id, llama_sampler *> & rhs) {
        if (lhs.size() != rhs.size()) {
            return false;
        }
        for (const auto & [seq_id, sampler] : lhs) {
            auto it = rhs.find(seq_id);
            if (it == rhs.end() || it->second != sampler) {
                return false;
            }
        }
        return true;
    }

    uint32_t n_outputs;

    llm_graph_cb cb;
@@ -468,15 +498,36 @@ struct llm_graph_params {
            return false;
        }

        if (n_outputs != other.n_outputs) {
            return false;
        }

        if (!samplers_equal(samplers, other.samplers)) {
            return false;
        }

        if (samplers.size() > 0) {
            if (!ubatch.data || !other.ubatch.data) {
                return false;
            }

            // check that the outputs are the same for all samplers
            for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
                if (ubatch.output[i] != other.ubatch.output[i] ||
                    ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
                    return false;
                }
            }
        }

        return
            cparams.embeddings  == other.cparams.embeddings  &&
            cparams.causal_attn == other.cparams.causal_attn &&
            arch  == other.arch  &&
            gtype == other.gtype &&
            cvec  == other.cvec  &&
            loras == other.loras &&
            cross == other.cross &&
            n_outputs == other.n_outputs;
            arch  == other.arch  &&
            gtype == other.gtype &&
            cvec  == other.cvec  &&
            loras == other.loras &&
            cross == other.cross;
    }
};

@@ -499,6 +550,7 @@ public:
    void reset();

    void set_inputs(const llama_ubatch * ubatch);
    void set_outputs();

    // try to update the existing graph result using the new graph parameters in order to reuse it
    // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -517,6 +569,11 @@ public:
    ggml_tensor * t_embd = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;

    std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
    std::map<llama_seq_id, ggml_tensor*> t_candidates;
    std::map<llama_seq_id, ggml_tensor*> t_sampled;
    std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;

    std::vector<llm_graph_input_ptr> inputs;

    ggml_context_ptr ctx_compute;
@@ -592,6 +649,8 @@ struct llm_graph_context {
    const llama_memory_context_i * mctx;
    const llama_cross * cross;

    std::map<llama_seq_id, llama_sampler *> samplers;

    const llm_graph_cb & cb_func;

    llm_graph_result * res;
@@ -832,6 +891,12 @@ struct llm_graph_context {
        ggml_tensor * cls_out,
        ggml_tensor * cls_out_b) const;

    //
    // sampling (backend sampling)
    //

    void build_sampling() const;

    //
    // dense (out)
    //

4 llama/llama.cpp/src/llama-hparams.cpp vendored

@@ -72,6 +72,10 @@ uint32_t llama_hparams::n_embd_inp() const {
    return n_embd_inp;
}

uint32_t llama_hparams::get_n_embd_out() const {
    return n_embd_out > 0 ? n_embd_out : n_embd;
}

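A quick illustration of the fallback (numbers assumed, not from any particular model):

    // A model with a down-projection head might set
    //   n_embd = 4096, n_embd_out = 1024  ->  get_n_embd_out() == 1024
    // all other models leave n_embd_out == 0 ->  get_n_embd_out() == n_embd
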
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);

17 llama/llama.cpp/src/llama-hparams.h vendored

@@ -107,9 +107,9 @@ struct llama_hparams {

    float rope_attn_factor = 1.0f;
    float rope_freq_base_train;
    float rope_freq_base_train_swa;
    float rope_freq_base_train_swa = 10000.0f;
    float rope_freq_scale_train;
    float rope_freq_scale_train_swa;
    float rope_freq_scale_train_swa = 1.0f;

    uint32_t n_ctx_orig_yarn;
    float rope_yarn_log_mul = 0.0f;
@@ -125,10 +125,11 @@ struct llama_hparams {
    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
    // the size of the sliding window (0 - no SWA)
    uint32_t n_swa = 0;
    // if swa_layers[il] == true, then layer il is SWA
    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
    // if swa_layers[il] == 1, then layer il is SWA
    // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
    // by default, all layers are dense
    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
    // note: using uint32_t type for compatibility reason
    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;

    // for State Space Models
    uint32_t ssm_d_conv = 0;
@@ -163,6 +164,9 @@ struct llama_hparams {
    // for Classifiers
    uint32_t n_cls_out = 1;

    // output embedding dimension (0 = use n_embd)
    uint32_t n_embd_out = 0;

    // llama4 smallthinker
    uint32_t n_moe_layer_step = 0;
    uint32_t n_no_rope_layer_step = 4;
@@ -235,6 +239,9 @@ struct llama_hparams {
    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

    // dimension of output embeddings
    uint32_t get_n_embd_out() const;

    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

2 llama/llama.cpp/src/llama-kv-cache.h vendored

@@ -305,7 +305,7 @@ public:
        bool do_shift,
        stream_copy_info sc_info);

    // used to create a batch procesing context from a batch
    // used to create a batch processing context from a batch
    llama_kv_cache_context(
        llama_kv_cache * kv,
        slot_info_vec_t sinfos,

209 llama/llama.cpp/src/llama-mmap.cpp vendored

@@ -13,9 +13,10 @@
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
@@ -74,7 +75,7 @@ struct llama_file::impl {
        return ret;
    }

    impl(const char * fname, const char * mode) {
    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -109,7 +110,7 @@ struct llama_file::impl {
        }
    }

    void read_raw(void * ptr, size_t len) const {
    void read_raw(void * ptr, size_t len) {
        size_t bytes_read = 0;
        while (bytes_read < len) {
            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -126,7 +127,7 @@ struct llama_file::impl {
        }
    }

    uint32_t read_u32() const {
    uint32_t read_u32() {
        uint32_t val;
        read_raw(&val, sizeof(val));
        return val;
@@ -153,16 +154,55 @@ struct llama_file::impl {
        write_raw(&val, sizeof(val));
    }

    bool has_direct_io() const {
        return true;
    }

    ~impl() {
        if (fp) {
            std::fclose(fp);
        }
    }
#else
    impl(const char * fname, const char * mode) {
        fp = ggml_fopen(fname, mode);
    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
#ifdef __linux__
        // Try unbuffered I/O for read only
        if (use_direct_io && std::strcmp(mode, "rb") == 0) {
            if (init_fd()) {
                return;
            }
            LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
                fname, strerror(errno));
        }
#endif
        init_fp(mode);
    }

#ifdef __linux__
    bool init_fd() {
        fd = open(fname.c_str(), O_RDONLY | O_DIRECT);

        if (fd != -1) {
            struct stat file_stats{};
            fstat(fd, &file_stats);

            size = file_stats.st_size;
            alignment = file_stats.st_blksize;

            off_t ret = lseek(fd, 0, SEEK_SET);
            if (ret == -1) {
                throw std::runtime_error(format("seek error: %s", strerror(errno)));
            }
            return true;
        }
        return false;
    }
#endif

    void init_fp(const char * mode) {
        fp = ggml_fopen(fname.c_str(), mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
            throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
        }
        seek(0, SEEK_END);
        size = tell();
@@ -170,46 +210,118 @@ struct llama_file::impl {
    }

    size_t tell() const {
        // TODO: this ifdef is never true?
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        if (ret == -1) {
            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
        if (fd == -1) {
            long ret = std::ftell(fp);
            if (ret == -1) {
                throw std::runtime_error(format("ftell error: %s", strerror(errno)));
            }

            return (size_t) ret;
        }

        return (size_t) ret;
        off_t pos = lseek(fd, 0, SEEK_CUR);
        if (pos == -1) {
            throw std::runtime_error(format("lseek error: %s", strerror(errno)));
        }
        return (size_t) pos;
    }

    void seek(size_t offset, int whence) const {
        // TODO: this ifdef is never true?
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        if (ret != 0) {
        off_t ret = 0;
        if (fd == -1) {
            ret = std::fseek(fp, (long) offset, whence);
        } else {
            ret = lseek(fd, offset, whence);
        }
        if (ret == -1) {
            throw std::runtime_error(format("seek error: %s", strerror(errno)));
        }
    }

    void read_raw(void * ptr, size_t len) const {
    void read_raw_unsafe(void * ptr, size_t len) {
        if (len == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, len, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
            throw std::runtime_error("unexpectedly reached end of file");
        if (fd == -1) {
            std::size_t ret = std::fread(ptr, len, 1, fp);
            if (ferror(fp)) {
                throw std::runtime_error(format("read error: %s", strerror(errno)));
            }
            if (ret != 1) {
                throw std::runtime_error("unexpectedly reached end of file");
            }
        } else {
            size_t bytes_read = 0;
            while (bytes_read < len) {
                const size_t to_read = len - bytes_read;
                ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);

                if (ret == -1) {
                    if (errno == EINTR) {
                        continue; // Interrupted by signal, retry
                    }
                    // Fallback to std::fread in case the DMA controller cannot access the buffer
                    if (errno == EFAULT) {
                        auto curr_off = tell();
                        close(fd);
                        fd = -1;
                        alignment = 1;
                        init_fp("rb");
                        seek(curr_off, SEEK_SET);
                        read_raw_unsafe(ptr, len);
                        return;
                    }
                    throw std::runtime_error(format("read error: %s", strerror(errno)));
                }
                if (ret == 0) {
                    // EOF: allow if this read was only pulling alignment padding past file end
                    off_t pos = lseek(fd, 0, SEEK_CUR);
                    if (pos != -1 && (size_t) pos == size) {
                        std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
                        return;
                    }
                    throw std::runtime_error("unexpectedly reached end of file");
                }

                bytes_read += (size_t) ret;
            }
        }
    }

    uint32_t read_u32() const {
    void read_aligned_chunk(void * dest, size_t size) {
        size_t offset = tell();
        off_t aligned_offset = offset & ~(alignment - 1);
        off_t offset_from_alignment = offset - aligned_offset;
        size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);

        void * raw_buffer = nullptr;
        int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
        if (ret != 0) {
            throw std::runtime_error(format("posix_memalign failed with error %d", ret));
        }

        struct aligned_buffer_deleter {
            void operator()(void * p) const { free(p); }
        };
        std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);

        seek(aligned_offset, SEEK_SET);
        read_raw_unsafe(buffer.get(), bytes_to_read);

        uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
        memcpy(dest, reinterpret_cast<void *>(actual_data), size);
    }
void read_raw(void * ptr, size_t len) {
|
||||
if (has_direct_io()) {
|
||||
read_aligned_chunk(ptr, len);
|
||||
} else {
|
||||
read_raw_unsafe(ptr, len);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t read_u32() {
|
||||
uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
@@ -230,23 +342,41 @@ struct llama_file::impl {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
bool has_direct_io() const {
|
||||
return fd != -1 && alignment > 1;
|
||||
}
|
||||
|
||||
~impl() {
|
||||
if (fp) {
|
||||
if (fd != -1) {
|
||||
close(fd);
|
||||
} else {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
int fd = -1;
|
||||
std::string fname;
|
||||
#endif
|
||||
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
size_t read_alignment() const {
|
||||
return alignment;
|
||||
}
|
||||
|
||||
size_t alignment = 1;
|
||||
|
||||
FILE * fp{};
|
||||
size_t size{};
|
||||
};
|
||||
|
||||
llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
|
||||
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
|
||||
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
|
||||
llama_file::~llama_file() = default;
|
||||
|
||||
size_t llama_file::tell() const { return pimpl->tell(); }
|
||||
size_t llama_file::size() const { return pimpl->size; }
|
||||
|
||||
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
|
||||
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
|
||||
|
||||
int llama_file::file_id() const {
|
||||
#ifdef _WIN32
|
||||
return _fileno(pimpl->fp);
|
||||
@@ -260,9 +390,14 @@ int llama_file::file_id() const {
|
||||
}
|
||||
|
||||
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
||||
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
|
||||
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
||||
#ifdef _WIN32
|
||||
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
||||
#else
|
||||
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
|
||||
#endif
|
||||
|
||||
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
|
||||
uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
|
||||
|
||||
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
|
||||
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
|
||||
|
||||
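For orientation only (not part of the diff above): read_aligned_chunk rounds the requested offset down and the request length up to the direct-I/O block size before copying the caller's bytes out of the aligned buffer. A minimal standalone C++ sketch of that arithmetic, assuming a hypothetical 4096-byte alignment; the alignment must be a power of two, as in the diff:

#include <cstdio>
#include <cstddef>

// Round a file offset down, and a request length up, to the block size.
static size_t align_down(size_t offset, size_t alignment) { return offset & ~(alignment - 1); }
static size_t align_up(size_t length, size_t alignment) { return (length + alignment - 1) & ~(alignment - 1); }

int main() {
    const size_t alignment = 4096;   // hypothetical O_DIRECT block size (assumption)
    const size_t offset    = 10000;  // requested file offset
    const size_t size      = 5000;   // requested read size

    const size_t aligned_offset = align_down(offset, alignment);        // 8192
    const size_t padding        = offset - aligned_offset;              // 1808
    const size_t bytes_to_read  = align_up(padding + size, alignment);  // 8192

    std::printf("read %zu bytes at offset %zu, copy out from +%zu\n", bytes_to_read, aligned_offset, padding);
    return 0;
}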
11
llama/llama.cpp/src/llama-mmap.h
vendored
@@ -3,6 +3,7 @@
#include <cstdint>
#include <memory>
#include <vector>
#include <cstdio>

struct llama_file;
struct llama_mmap;
@@ -13,7 +14,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

struct llama_file {
llama_file(const char * fname, const char * mode);
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
~llama_file();

size_t tell() const;
@@ -23,12 +24,16 @@ struct llama_file {

void seek(size_t offset, int whence) const;

void read_raw(void * ptr, size_t len) const;
uint32_t read_u32() const;
void read_raw(void * ptr, size_t len);
void read_raw_unsafe(void * ptr, size_t len);
void read_aligned_chunk(void * dest, size_t size);
uint32_t read_u32();

void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;

size_t read_alignment() const;
bool has_direct_io() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
96
llama/llama.cpp/src/llama-model-loader.cpp
vendored
@@ -462,6 +462,29 @@ namespace GGUFMeta {
return get_key_or_arr(llm_kv(kid), result, n, required);
}

bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);

const int id = gguf_find_key(meta.get(), key.c_str());

if (id < 0) {
if (required) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
}
return false;
}

// throw and error if type is an array
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
return false;
}

return get_key(key, result, required);
}

// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
@@ -472,6 +495,7 @@ llama_model_loader::llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
@@ -504,9 +528,17 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

files.emplace_back(new llama_file(fname.c_str(), "rb"));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);

use_direct_io = use_direct_io && files.back()->has_direct_io();

// Disable mmap in case Direct I/O is enabled and available
if (use_direct_io && use_mmap) {
use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
}

// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
@@ -572,7 +604,7 @@ llama_model_loader::llama_model_loader(
}
}

files.emplace_back(new llama_file(fname_split, "rb"));
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
contexts.emplace_back(ctx);

// Save tensors data offset info of the shard.
@@ -716,6 +748,7 @@ llama_model_loader::llama_model_loader(
}

this->use_mmap = use_mmap;
this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
}
@@ -935,7 +968,15 @@ bool llama_model_loader::load_all_data(
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB

size_t alignment = 1;
for (const auto & file : files) {
alignment = std::max(file->read_alignment(), alignment);
}

// Buffer size: balance between memory usage and I/O efficiency
// 64MB works well for NVMe drives
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;

std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<ggml_backend_event_t> events;
@@ -985,6 +1026,7 @@ bool llama_model_loader::load_all_data(
// If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) {
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);

if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
ggml_backend_dev_name(dev));
@@ -1066,6 +1108,7 @@ bool llama_model_loader::load_all_data(
}
} else {
const auto & file = files.at(weight->idx);

if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size);
@@ -1077,19 +1120,54 @@ bool llama_model_loader::load_all_data(
} else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) {
file->seek(weight->offs, SEEK_SET);
size_t offset = weight->offs;
alignment = file->read_alignment();
size_t aligned_offset = offset & ~(alignment - 1);
size_t offset_from_alignment = offset - aligned_offset;
file->seek(aligned_offset, SEEK_SET);

// Calculate aligned read boundaries
size_t read_start = aligned_offset;
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);

size_t bytes_read = 0;
size_t data_read = 0; // Actual tensor data copied (excluding padding)

while (bytes_read < n_size) {
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
while (bytes_read < read_end - read_start) {
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);

// Align the destination pointer within the pinned buffer
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);

// Wait for previous upload to complete before reusing buffer
ggml_backend_event_synchronize(events[buffer_idx]);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);

// Read aligned chunk from file
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);

// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
size_t data_to_copy = read_size;

// Skip alignment padding at start of first chunk
if (bytes_read == 0) {
ptr_data += offset_from_alignment;
data_to_copy -= offset_from_alignment;
}

// Trim alignment padding at end of last chunk
if (aligned_offset + bytes_read + read_size > offset + n_size) {
data_to_copy -= (read_end - (offset + n_size));
}

// Async upload actual data to GPU
ggml_backend_tensor_set_async(upload_backend, cur,
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
ggml_backend_event_record(events[buffer_idx], upload_backend);

bytes_read += read_iteration;
data_read += data_to_copy;
bytes_read += read_size;

++buffer_idx;
buffer_idx %= n_buffers;
}
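As an illustration only (not part of the diff above): the chunked-upload loop reads the aligned byte range in staging-buffer-sized pieces and forwards only the bytes that belong to the tensor, trimming the alignment padding from the first and last chunk. A simplified, self-contained C++ sketch of that trimming logic, with hypothetical example numbers:

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Simplified model of the loop above: walk [read_start, read_end) in buffer_size
// chunks and report how many bytes of each chunk belong to [offset, offset + n_size).
void upload_trimmed(size_t offset, size_t n_size, size_t alignment, size_t buffer_size) {
    const size_t read_start = offset & ~(alignment - 1);
    const size_t read_end   = (offset + n_size + alignment - 1) & ~(alignment - 1);
    const size_t lead_pad   = offset - read_start;

    size_t bytes_read = 0;   // bytes consumed from the aligned range
    size_t data_read  = 0;   // tensor bytes forwarded so far
    while (bytes_read < read_end - read_start) {
        const size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read);

        size_t data = read_size - ((bytes_read == 0) ? lead_pad : 0);     // drop leading padding
        const size_t chunk_end = read_start + bytes_read + read_size;
        if (chunk_end > offset + n_size) {
            data -= chunk_end - (offset + n_size);                        // drop trailing padding
        }

        std::printf("chunk: read %zu bytes, forward %zu at destination offset %zu\n", read_size, data, data_read);
        bytes_read += read_size;
        data_read  += data;
    }
}

int main() {
    // hypothetical values: 4 KiB alignment, 64 KiB staging buffer
    upload_trimmed(/*offset=*/10000, /*n_size=*/70000, /*alignment=*/4096, /*buffer_size=*/65536);
    return 0;
}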
4
llama/llama.cpp/src/llama-model-loader.h
vendored
@@ -70,6 +70,7 @@ struct llama_model_loader {
size_t n_bytes = 0;

bool use_mmap = false;
bool use_direct_io = false;
bool check_tensors;
bool no_alloc;

@@ -97,6 +98,7 @@ struct llama_model_loader {
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
@@ -131,6 +133,8 @@ struct llama_model_loader {
template<typename T>
bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);

std::string get_arch_name() const;

enum llm_arch get_arch() const;
3
llama/llama.cpp/src/llama-model-saver.cpp
vendored
@@ -146,6 +146,9 @@ void llama_model_saver::add_kv_from_model() {
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
if (hparams.n_embd_out > 0) {
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
}
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
399
llama/llama.cpp/src/llama-model.cpp
vendored
@@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_17M: return "17M";
case LLM_TYPE_22M: return "22M";
case LLM_TYPE_33M: return "33M";
case LLM_TYPE_47M: return "47M";
case LLM_TYPE_60M: return "60M";
case LLM_TYPE_70M: return "70M";
case LLM_TYPE_80M: return "80M";
case LLM_TYPE_109M: return "109M";
case LLM_TYPE_137M: return "137M";
case LLM_TYPE_140M: return "140M";
case LLM_TYPE_149M: return "149M";
case LLM_TYPE_160M: return "160M";
case LLM_TYPE_190M: return "190M";
case LLM_TYPE_220M: return "220M";
@@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_335M: return "335M";
case LLM_TYPE_350M: return "350M";
case LLM_TYPE_360M: return "360M";
case LLM_TYPE_395M: return "395M";
case LLM_TYPE_410M: return "410M";
case LLM_TYPE_450M: return "450M";
case LLM_TYPE_475M: return "475M";
@@ -123,10 +126,12 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
case LLM_TYPE_310B_A15B: return "310B.A15B";
case LLM_TYPE_355B_A32B: return "355B.A32B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
@@ -502,6 +507,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -573,6 +579,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

// TODO: Handle SWA metadata similarly when models start implementing it
// rope_freq_scale (inverse of the kv) is optional
float ropescale = 0.0f;
if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -581,10 +588,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

// by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

// non-transformer models do not have attention heads
@@ -603,7 +606,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -627,6 +630,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// arch-specific KVs
switch (arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA_EMBED:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -671,6 +675,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full

hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}

switch (hparams.n_expert) {
@@ -716,6 +724,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);

hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -875,6 +887,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MODERN_BERT:
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 3;
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;

ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}

ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

switch (hparams.n_layer) {
case 12:
type = LLM_TYPE_47M; break; // granite-embedding-small
case 22:
type = LLM_TYPE_149M; break; // modern-bert-base
case 28:
type = LLM_TYPE_395M; break; // modern-bert-large
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1076,6 +1116,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MAINCODER:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_1B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN3VL:
{
ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1194,6 +1242,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
} break;
case LLM_ARCH_PLAMO3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 8;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}

switch (hparams.n_layer) {
case 24: type = LLM_TYPE_2B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GPT2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1247,7 +1314,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.n_swa = 4096; // default value of gemma 2
hparams.set_swa_pattern(2);
hparams.attn_soft_cap = true;
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1272,8 +1342,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(6);

hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -1303,10 +1372,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(5);

hparams.n_layer_kv_from_start = 20;
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
hparams.f_attention_scale = 1.0f;

ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -1322,9 +1390,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(6);

hparams.causal_attn = false; // embeddings do not use causal attention
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;

ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1463,7 +1530,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1502,6 +1572,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);

hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -1629,7 +1703,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1725,6 +1799,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

switch (hparams.n_layer) {
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
default: type = LLM_TYPE_UNKNOWN;
}
@@ -1843,6 +1918,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4);

hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}

ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2160,6 +2239,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(2);

hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);

switch (hparams.n_layer) {
case 24: type = LLM_TYPE_20B; break;
case 36: type = LLM_TYPE_120B; break;
@@ -2204,6 +2287,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4, true);

hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -2322,6 +2409,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MIMO2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;

ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);

switch (hparams.n_layer) {
case 48: type = LLM_TYPE_310B_A15B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
default: throw std::runtime_error("unsupported model architecture");
}

@@ -2344,15 +2447,16 @@ void llama_model::load_vocab(llama_model_loader & ml) {

bool llama_model::load_tensors(llama_model_loader & ml) {
const auto & split_mode = params.split_mode;
const auto & n_gpu_layers = params.n_gpu_layers;
const auto & use_mlock = params.use_mlock;
const auto & tensor_split = params.tensor_split;

const int n_layer = hparams.n_layer;
const int n_layer = hparams.n_layer;
const int n_gpu_layers = this->n_gpu_layers();

const bool use_mmap_buffer = true;

LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
__func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");

// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -2363,6 +2467,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
}

ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}

// calculate the split points
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
std::vector<float> splits(n_devices());
@@ -2373,6 +2482,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
size_t total;
size_t free;
ggml_backend_dev_memory(dev, &free, &total);

// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_memory(cpu_dev, &free, &total);
}
splits[i] = free;
}
} else {
@@ -2389,14 +2505,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
splits[i] /= split_sum;
}

ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
return {cpu_dev, &pimpl->cpu_buft_list};
@@ -2636,6 +2748,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -3170,6 +3283,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
}
} break;
case LLM_ARCH_MODERN_BERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);

output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

for(int i = 0; i < n_layer; ++i) {
auto& layer = layers[i];

if ( i != 0 ) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
} else{
// layer 0 uses identity
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
}


layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
}

cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);

} break;
case LLM_ARCH_NEO_BERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3234,7 +3378,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);

const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;

GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);

layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -3762,6 +3913,44 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
}
} break;
case LLM_ARCH_PLAMO3:
{
const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;

tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

const int64_t num_attention_heads = hparams.n_head(i);
const int64_t num_key_value_heads = hparams.n_head_kv(i);
const int64_t q_proj_dim = num_attention_heads * head_dim_q;
const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
const int64_t n_ff_cur = hparams.n_ff(i);

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
{n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);

layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
}
} break;
case LLM_ARCH_GPT2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4652,7 +4841,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@@ -4715,7 +4908,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@@ -5082,9 +5279,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);

layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);

@@ -5196,9 +5393,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;

// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5250,6 +5444,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;

layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);

@@ -6279,8 +6476,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6325,6 +6522,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
}
}

// for LFM2-ColBert-350M
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_SMALLTHINKER:
{
@@ -6627,6 +6827,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
}
} break;
case LLM_ARCH_MIMO2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
uint32_t n_head = hparams.n_head(i);

layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

// non-MoE branch
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);

// MoE branch
int64_t n_ff_exp = hparams.n_ff_exp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
}
} break;
case LLM_ARCH_MAINCODER:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -6736,10 +7005,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (llama_supports_gpu_offload()) {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
if (n_gpu_layers > (int) hparams.n_layer) {
int n_repeating = n_gpu;
if (n_repeating > 0) {
LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
n_repeating--;
}
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);

const int max_backend_supported_layers = hparams.n_layer + 1;
const int max_offloadable_layers = hparams.n_layer + 1;
@@ -6806,6 +7077,14 @@ size_t llama_model::n_devices() const {
return devices.size();
}

uint32_t llama_model::n_gpu_layers() const {
return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
}

llama_split_mode llama_model::split_mode() const {
return params.split_mode;
}

std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
@@ -6898,6 +7177,10 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
}
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7130,6 +7413,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_GEMMA_EMBEDDING:
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
@@ -7247,16 +7531,24 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
switch (arch) {
case LLM_ARCH_LLAMA:
{
llm = std::make_unique<llm_build_llama>(*this, params);
llm = std::make_unique<llm_build_llama<false>>(*this, params);
} break;
case LLM_ARCH_LLAMA4:
{
if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
llm = std::make_unique<llm_build_llama>(*this, params);
llm = std::make_unique<llm_build_llama<false>>(*this, params);
} else {
llm = std::make_unique<llm_build_llama_iswa>(*this, params);
}
} break;
case LLM_ARCH_LLAMA_EMBED:
{
llm = std::make_unique<llm_build_llama<true>>(*this, params);
} break;
case LLM_ARCH_MAINCODER:
{
llm = std::make_unique<llm_build_maincoder>(*this, params);
} break;
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7289,6 +7581,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_bert>(*this, params);
} break;
case LLM_ARCH_MODERN_BERT:
{
llm = std::make_unique<llm_build_modern_bert>(*this, params);
} break;
case LLM_ARCH_NEO_BERT:
{
llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7378,6 +7674,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_plamo2>(*this, params);
} break;
case LLM_ARCH_PLAMO3:
{
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
} else {
llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
}
} break;
case LLM_ARCH_GPT2:
{
llm = std::make_unique<llm_build_gpt2>(*this, params);
@@ -7682,6 +7986,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_mistral3>(*this, params);
} break;
case LLM_ARCH_MIMO2:
{
llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
} break;
default:
GGML_ABORT("fatal error");
}
@@ -7689,12 +7997,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// add on pooling layer
llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

// add backend sampling layers (if any)
llm->build_sampling();

// if the gguf model was converted with --sentence-transformers-dense-modules
// there will be two additional dense projection layers
// dense linear projections are applied after pooling
// TODO: move reranking logic here and generalize
llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);

llm->res->set_outputs();

return llm->res->get_gf();
}

@@ -7707,7 +8020,7 @@ llama_model_params llama_model_default_params() {
llama_model_params result = {
/*.devices =*/ nullptr,
/*.tensor_buft_overrides =*/ nullptr,
/*.n_gpu_layers =*/ 999,
/*.n_gpu_layers =*/ -1,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
@@ -7716,6 +8029,7 @@ llama_model_params llama_model_default_params() {
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
/*.use_direct_io =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
@@ -7750,6 +8064,10 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
return model->hparams.n_embd_inp();
}

int32_t llama_model_n_embd_out(const llama_model * model) {
return model->hparams.get_n_embd_out();
}

int32_t llama_model_n_layer(const llama_model * model) {
return model->hparams.n_layer;
}
@@ -7853,6 +8171,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
case LLM_ARCH_MAINCODER:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
@@ -7862,6 +8182,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V3:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_STABLELM:
@@ -7881,6 +8202,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_PHIMOE:
case LLM_ARCH_PLAMO:
case LLM_ARCH_PLAMO2:
case LLM_ARCH_PLAMO3:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_GEMMA3:
@@ -7911,6 +8233,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_PANGU_EMBED:
case LLM_ARCH_AFMOE:
case LLM_ARCH_QWEN3NEXT:
case LLM_ARCH_MIMO2:
return LLAMA_ROPE_TYPE_NEOX;

case LLM_ARCH_QWEN2VL:
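For reference only (not part of the diff above): the new n_gpu_layers() accessor treats a negative request as "offload all repeating layers plus the output layer". A tiny standalone C++ sketch of that resolution, mirroring the expression in the diff:

#include <cstdint>
#include <cstdio>

// requested < 0 means "everything": n_layer repeating layers + 1 output layer.
static uint32_t resolve_gpu_layers(int32_t requested, uint32_t n_layer) {
    return requested >= 0 ? (uint32_t) requested : n_layer + 1;
}

int main() {
    std::printf("%u\n", resolve_gpu_layers(-1, 32)); // 33: full offload (new default of -1)
    std::printf("%u\n", resolve_gpu_layers(10, 32)); // 10: partial offload
    return 0;
}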
15
llama/llama.cpp/src/llama-model.h
vendored
@@ -24,12 +24,14 @@ enum llm_type {
LLM_TYPE_17M,
LLM_TYPE_22M,
LLM_TYPE_33M,
LLM_TYPE_47M,
LLM_TYPE_60M,
LLM_TYPE_70M,
LLM_TYPE_80M,
LLM_TYPE_109M,
LLM_TYPE_137M,
LLM_TYPE_140M,
LLM_TYPE_149M,
LLM_TYPE_160M,
LLM_TYPE_190M,
LLM_TYPE_220M,
@@ -39,6 +41,7 @@ enum llm_type {
LLM_TYPE_335M,
LLM_TYPE_350M,
LLM_TYPE_360M,
LLM_TYPE_395M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_475M,
@@ -117,10 +120,12 @@ enum llm_type {
LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_E2B,
LLM_TYPE_E4B,
@@ -465,8 +470,6 @@ struct llama_model {
struct ggml_tensor * dense_2_out_layers = nullptr;
struct ggml_tensor * dense_3_out_layers = nullptr;

llama_model_params params;

// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;

@@ -476,6 +479,9 @@ struct llama_model {
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

// for keeping track of extra nodes used by lora adapters
uint32_t n_lora_nodes = 0;

int64_t t_load_us = 0;
int64_t t_start_us = 0;

@@ -497,6 +503,9 @@ struct llama_model {
size_t n_tensors() const;
size_t n_devices() const;

uint32_t n_gpu_layers() const;
llama_split_mode split_mode() const;

std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;

// total number of parameters in the model
@@ -525,6 +534,8 @@ struct llama_model {
ggml_cgraph * build_graph(const llm_graph_params & params) const;

private:
llama_model_params params;

struct impl;
std::unique_ptr<impl> pimpl;
};
2
llama/llama.cpp/src/llama-quant.cpp
vendored
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}

std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching

llama_model model(llama_model_default_params());
1559
llama/llama.cpp/src/llama-sampling.cpp
vendored
File diff suppressed because it is too large
26
llama/llama.cpp/src/llama-sampling.h
vendored
@@ -14,7 +14,19 @@ struct llama_grammar;
struct llama_sampler_chain {
llama_sampler_chain_params params;

std::vector<struct llama_sampler *> samplers;
// has .backend_init() been called?
bool is_init = false;

struct info {
bool is_backend;

llama_sampler * ptr;
};

std::vector<info> samplers;

// pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
std::vector<llama_token_data> cur;

// timing

@@ -24,9 +36,9 @@ struct llama_sampler_chain {
};

struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>>& seq_breakers);
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>> & seq_breakers);
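As context for the new per-sampler bookkeeping above, a minimal sketch of building a sampler chain through the public API (the chain struct itself is internal; the is_backend/is_init fields are filled in when the chain is initialized against a backend per this patch):

    #include "llama.h"

    // Build a simple chain: top-k -> temperature -> seeded sampling.
    void example_build_chain() {
        llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
        llama_sampler * chain = llama_sampler_chain_init(sparams);
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
        llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
        // ... call llama_sampler_sample(chain, ctx, -1) per token ...
        llama_sampler_free(chain);
    }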
134
llama/llama.cpp/src/llama-vocab.cpp
vendored
@@ -314,6 +314,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_YOUTU:
regex_exprs = {
"[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥-ゟ゠-ヿ]+",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = {
"[\r\n]",
@@ -355,6 +361,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
regex_exprs = {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1849,6 +1856,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false;
} else if (
tokenizer_pre == "youtu") {
pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
clean_spaces = false;
ignore_merges = true;
} else if (
tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1867,7 +1879,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0" ||
tokenizer_pre == "mellum") {
tokenizer_pre == "mellum" ||
tokenizer_pre == "modern-bert") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "jina-v1-en" ||
@@ -2003,6 +2016,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "minimax-m2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
} else if (
tokenizer_pre == "solar-open") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
clean_spaces = false;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2176,6 +2193,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// for now, we apply this workaround to find the tokens based on their text

for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;

// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (special_eot_id == LLAMA_TOKEN_NULL) {
if (false
@@ -2191,10 +2210,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eot_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
@@ -2205,10 +2224,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|eom_id|>"
) {
special_eom_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
@@ -2225,10 +2244,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_prefix|>" // GLM-4.5
) {
special_fim_pre_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
@@ -2245,10 +2264,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_suffix|>" // GLM-4.5
) {
special_fim_suf_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
@@ -2265,10 +2284,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_middle|>" // GLM-4.5
) {
special_fim_mid_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
@@ -2282,10 +2301,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<PAD>"
) {
special_fim_pad_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
@@ -2300,10 +2319,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<reponame>" // Granite
) {
special_fim_rep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
@@ -2314,15 +2333,41 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|file_sep|>" // Qwen
) {
special_fim_sep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
}
}
}

// auto-detect unused tokens: e.g. control tokens with the word "unused"
// ideally, these tokens should be marked as unused during conversion
{
uint32_t n_unused = 0;

for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;

if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
continue;
}

if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
if (strstr(t.first.c_str(), "unused") != NULL) {
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
}
}

if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
n_unused++;
}
}

LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
}

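A recurring refactor in the hunks above: token attributes are now OR-ed in rather than overwritten, so previously set flags survive. A minimal illustration of the difference, with made-up flag values that only mirror the real enum's shape:

    #include <cstdio>

    enum llama_token_attr_sketch : int {
        ATTR_CONTROL = 1 << 0, // illustrative values, not the real enum
        ATTR_UNUSED  = 1 << 1,
    };

    int main() {
        int attr = ATTR_UNUSED; // flag set earlier in the load path

        // old pattern: attr = ATTR_CONTROL; would silently drop ATTR_UNUSED

        // new pattern: OR the flag in, keeping ATTR_UNUSED intact
        attr = attr | ATTR_CONTROL;

        std::printf("unused still set: %d\n", (attr & ATTR_UNUSED) != 0);
        return 0;
    }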
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2341,12 +2386,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
}

for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;

if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<|return|>" // o200k_harmony
|| t.first == "<|call|>" // o200k_harmony
|| t.first == "<|flush|>" // solar-open
|| t.first == "<|calls|>" // solar-open
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>"
@@ -2356,24 +2405,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eog_ids.insert(t.second);
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
}
} else {
// token is control, but not marked as EOG -> print a debug log
if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
// token is control, but not marked as EOG -> print a debug log
if (special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
}
}
}

// @ngxson : quick hack for gpt-oss, always render these tokens
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;

if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
}
}

@@ -2393,34 +2446,42 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}

// TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
// TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
// we remove the "<|end|>" token from the EOG list
{
bool has_return = false;
bool has_call = false;
bool has_end = false;
bool has_flush = false;

llama_token end_id = LLAMA_TOKEN_NULL;

LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
for (auto tid : special_eog_ids) {
LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
auto & text = id_to_token[tid].text;

if (id_to_token[tid].text == "<|return|>") {
LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());

if (text == "<|return|>") {
has_return = true;
} else if (id_to_token[tid].text == "<|call|>") {
} else if (text == "<|call|>" || text == "<|calls|>") {
has_call = true;
} else if (id_to_token[tid].text == "<|end|>") {
} else if (text == "<|flush|>") {
has_flush = true;
} else if (text == "<|end|>") {
has_end = true;
end_id = tid;
}
}

if (has_return && has_call && has_end) {
if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
special_eog_ids.erase(end_id);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);

auto & attr = id_to_token[end_id].attr;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);

LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
}
}
}
@@ -2518,6 +2579,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
} else if (_contains_any(model_name, {"modern-bert"})) {
if (token_to_id.count("[MASK]") == 0) {
LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
}
else {
_set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
}
}
}
}
2
llama/llama.cpp/src/llama-vocab.h
vendored
@@ -51,6 +51,8 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
};

struct LLM_KV;
324
llama/llama.cpp/src/llama.cpp
vendored
@@ -71,8 +71,9 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}, &ud);

llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
mparams_copy.no_alloc  = true;
mparams_copy.use_mmap  = false;
mparams_copy.use_mlock = false;

llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) {
@@ -110,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}
}
for (size_t i = 0; i < ret.size(); i++) {
size_t free, total;
size_t free;
size_t total;
ggml_backend_dev_memory(model->devices[i], &free, &total);

// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
ggml_backend_dev_memory(cpu_dev, &free, &total);
}
ret[i].free = free;
ret[i].total = total;
}
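A standalone sketch of the same fallback from a caller's perspective, using the public ggml-backend device API; it mirrors the patch's 0/0 handling but is illustrative, not code from the patch:

    #include <cstdio>
    #include "ggml-backend.h"

    // Enumerate registered backend devices and report their memory, falling
    // back to the CPU device when a backend reports 0/0.
    int main() {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            size_t free  = 0;
            size_t total = 0;
            ggml_backend_dev_memory(dev, &free, &total);
            if (free == 0 && total == 0) {
                ggml_backend_dev_t cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (cpu != nullptr) {
                    ggml_backend_dev_memory(cpu, &free, &total);
                }
            }
            std::printf("%s: free=%zu MiB total=%zu MiB\n",
                        ggml_backend_dev_name(dev), free >> 20, total >> 20);
        }
        return 0;
    }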
@@ -139,12 +152,15 @@ enum layer_fraction_t {
};
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue

class llama_params_fit_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};

static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
constexpr int64_t MiB = 1024*1024;
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();

@@ -163,6 +179,12 @@ static void llama_params_fit_impl(
return;
}

std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}

std::vector<std::string> dev_names;
{
dev_names.reserve(nd);
@@ -180,11 +202,12 @@ static void llama_params_fit_impl(
}
}

int64_t sum_total = 0;
int64_t sum_projected_free = 0;
int64_t min_projected_free = INT64_MAX;
int64_t sum_projected_used = 0;
int64_t sum_projected_ctx = 0;
int64_t sum_free = 0;
int64_t sum_projected_free = 0;
int64_t sum_projected_used = 0;
int64_t sum_projected_model = 0;
std::vector<int64_t> projected_free_per_device;
projected_free_per_device.reserve(nd);

if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -194,59 +217,98 @@ static void llama_params_fit_impl(

const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);

sum_total += dmd.total;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
min_projected_free = std::min(min_projected_free, projected_free);
sum_projected_ctx += dmd.mb.context;
sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model;

if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
projected_free >= 0 ? "surplus" : "deficit");
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
}
}
assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
assert(sum_projected_used >= sum_projected_ctx);
assert(sum_free >= 0 && sum_projected_used >= 0);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_total/MiB);
if (min_projected_free >= margin) {
if (nd == 1) {
__func__, sum_projected_used/MiB, sum_free/MiB);
if (nd == 1) {
if (projected_free_per_device[0] >= margins[0]) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, min_projected_free/MiB, margin/MiB);
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
return;
}
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
__func__, min_projected_free/MiB, margin/MiB);
return;
}

// step 2: try reducing memory use by reducing the context size

{
int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
int64_t global_surplus = sum_projected_free;
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
if (global_surplus < 0) {
LLAMA_LOG_INFO(nd == 1 ?
"%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
"%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, margin/MiB, -global_surplus/MiB);
if (nd == 1) {
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
__func__, margins[0]/MiB, -global_surplus/MiB);
} else {
LLAMA_LOG_INFO(
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, -global_surplus/MiB);
}
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
const uint32_t ctx_reduction = std::min(
uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
cparams->n_ctx = hp_nct - ctx_reduction;
const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
global_surplus += memory_reduction;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (global_surplus >= 0) {
int64_t sum_used_target = sum_free;
for (size_t id = 0; id < nd; id++) {
sum_used_target -= margins[id];
}
if (nd > 1) {
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
// - for dense models only whole layers can be assigned to devices
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
// - on average we expect a waste of 0.5 layers/tensors per device
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
}

int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (const auto & dmd : dmds_min_ctx) {
sum_projected_used_min_ctx += dmd.mb.total();
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
/ (sum_projected_used - sum_projected_used_min_ctx);
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend

const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
} else {
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
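The hunk above sizes the context by interpolating linearly between the memory footprints at minimal and maximal context, then rounding down to a multiple of 256. A self-contained sketch of that arithmetic; all names and numbers are illustrative:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Pick a context size between n_min and n_max so that projected memory use
    // hits a target, assuming memory grows linearly with context.
    uint32_t fit_n_ctx(uint32_t n_min, uint32_t n_max,
                       int64_t used_at_min, int64_t used_at_max, int64_t target) {
        const int64_t extra = int64_t(n_max - n_min) * (target - used_at_min)
                                                     / (used_at_max - used_at_min);
        uint32_t n_ctx = n_min + uint32_t(extra);
        // round down to a multiple of 256, but never below n_min
        return std::max(n_ctx - n_ctx % 256, n_min);
    }

    int main() {
        // e.g. 4 GiB used at 4k context, 12 GiB at 128k, with an 8 GiB budget
        std::printf("%u\n", fit_n_ctx(4096, 131072,
                                      int64_t(4096) << 20, int64_t(12288) << 20,
                                      int64_t(8192) << 20)); // prints 67584
        return 0;
    }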
@@ -259,32 +321,28 @@ static void llama_params_fit_impl(
}

if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
}
if (nd > 1) {
if (!tensor_split) {
throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
}
if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) {
throw std::runtime_error("model_params::tensor_split already set by user, abort");
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
}
}
}
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
if (hp_ngl < 2*nd) {
throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+ std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
}
if (!tensor_buft_overrides) {
throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
}
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
}

// step 3: iteratively fill the devices back to front with "dense" layers
@@ -337,6 +395,11 @@ static void llama_params_fit_impl(

// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;

uint32_t n_full() const {
assert(n_layer >= n_part);
return n_layer - n_part;
}
};

const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -345,8 +408,7 @@ static void llama_params_fit_impl(
auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams,
const bool add_nonrepeating) {
llama_model_params & mparams) {
mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -354,29 +416,25 @@ static void llama_params_fit_impl(
tensor_split[id] = ngl_per_device[id].n_layer;
}
}
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides

if (add_nonrepeating) {
mparams.n_gpu_layers += 1;
tensor_split[nd - 1] += 1;
}
mparams.tensor_split = tensor_split;

size_t itbo = 0;
for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
il0 += ngl_per_device[id].n_full();
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model\n");
throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model");
}
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = overflow_bufts[id];
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
itbo++;
}
il0 += ngl_per_device[id].n_part;
@@ -391,10 +449,9 @@ static void llama_params_fit_impl(
auto get_memory_for_layers = [&](
const char * func_name,
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
const bool add_nonrepeating) -> std::vector<int64_t> {
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);

const dmds_t dmd_nl = llama_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -427,9 +484,9 @@ static void llama_params_fit_impl(
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
global_surplus_cpu_moe += dmd.free;
global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
for (size_t id = 0; id < nd; id++) {
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
}

if (global_surplus_cpu_moe > 0) {
@@ -448,27 +505,18 @@ static void llama_params_fit_impl(
std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd);
for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margin);
targets.push_back(dmds_full[id].free - margins[id]);
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}

// whether for the optimal memory use we expect to load at least some MoE tensors:
const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;

std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd - 1; ++id) {
overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
for (size_t id = 0; id < nd; id++) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
}
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
if (hp_nex > 0) {
for (size_t id = 0; id < nd; id++) {
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
}
}
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);

// optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound
@@ -476,22 +524,30 @@ static void llama_params_fit_impl(
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
// - the last device has the output layer, which cannot be a partial layer
if (hp_nex == 0) {
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
} else {
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
}
uint32_t n_unassigned = hp_ngl;
for (int id = nd - 1; id >= 0; id--) {
uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
}

std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) {
ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
}
if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
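The comment block above describes a false-position (regula falsi) search over integer layer counts. A self-contained sketch of the idea under a toy linear memory model; every name and number here is illustrative, not taken from the patch:

    #include <cstdint>
    #include <cstdio>
    #include <functional>

    // Find the largest n in [lo, hi] with mem(n) <= target, assuming mem is
    // monotonically increasing and mem(lo) <= target.
    uint32_t fit_layers(uint32_t lo, uint32_t hi, int64_t target,
                        const std::function<int64_t(uint32_t)> & mem) {
        int64_t mem_lo = mem(lo);
        int64_t mem_hi = mem(hi);
        if (mem_hi <= target) {
            return hi;
        }
        while (hi - lo > 1) {
            // interpolate linearly between the bounds to guess where mem == target
            uint32_t step = uint32_t(int64_t(hi - lo) * (target - mem_lo) / (mem_hi - mem_lo));
            if (step < 1)           step = 1;
            if (step > hi - lo - 1) step = hi - lo - 1;
            const uint32_t guess = lo + step;
            const int64_t  m     = mem(guess);
            if (m <= target) { lo = guess; mem_lo = m; }
            else             { hi = guess; mem_hi = m; }
        }
        return lo; // the lower bound that just barely still fits
    }

    int main() {
        // toy model: 300 MiB fixed cost + 85 MiB per layer, 4 GiB budget
        auto mem = [](uint32_t n) { return int64_t(300 + 85*n) << 20; };
        std::printf("%u layers fit\n", fit_layers(0, 48, int64_t(4096) << 20, mem)); // 44
        return 0;
    }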
@@ -500,25 +556,26 @@ static void llama_params_fit_impl(
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) {
ngl_per_device_test[id].n_part += step_size;
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
step_size - 1 : step_size; // the first layer is the output layer which must always be full
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
n_unassigned -= ngl_per_device[id].n_layer;
ngl_per_device = ngl_per_device_test;
mem = mem_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
}
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
}
} else {
ngl_per_device = ngl_per_device_high;
n_unassigned -= ngl_per_device[id].n_layer;
assert(ngl_per_device_high[id].n_layer == n_unassigned);
ngl_per_device = ngl_per_device_high;
mem = mem_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
}
@@ -529,7 +586,7 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
}
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
return;
}

@@ -549,24 +606,20 @@ static void llama_params_fit_impl(
assert(id_dense_start < nd);

LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start; id++) {
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0;
}
size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);

if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
>= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
@@ -582,11 +635,11 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd;

if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
break;
}
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
@@ -601,32 +654,38 @@ static void llama_params_fit_impl(
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
}
delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
}
} else {
ngl_per_device = ngl_per_device_high;
mem = mem_high;
id_dense_start = id_dense_start_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}

// try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > 0) {
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
id_dense_start_test++;
}
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
}
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -634,9 +693,10 @@ static void llama_params_fit_impl(

ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -645,9 +705,10 @@ static void llama_params_fit_impl(
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -662,30 +723,41 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}

set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
// print info for devices that were not changed during the conversion from dense only to full layers:
for (size_t id = id_dense_start + 1; id < nd; id++) {
const int64_t projected_margin = dmds_full[id].free - mem[id];
LLAMA_LOG_INFO(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}

set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
}

bool llama_params_fit(
enum llama_params_fit_status llama_params_fit(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
const int64_t t0_us = llama_time_us();
bool ok = true;
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
try {
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
} catch (const std::runtime_error & e) {
} catch (const llama_params_fit_exception & e) {
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
ok = false;
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
} catch (const std::runtime_error & e) {
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_ERROR;
}
const int64_t t1_us = llama_time_us();
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return ok;
return status;
}

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params result = {
/*.no_perf =*/ true,
/*.no_perf =*/ true,
};

return result;
@@ -758,7 +830,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;

try {
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

ml.print_info();

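Putting the llama_params_fit changes together, a sketch of how a caller might use the revised entry point. The signature, llama_max_tensor_buft_overrides, and the LLAMA_PARAMS_FIT_STATUS_* values are taken from this patch; the device count, margin sizes, and model path are illustrative:

    #include <vector>
    #include "llama.h"

    void example_fit_params() {
        const size_t n_devices = 2; // illustrative
        std::vector<size_t> margins(n_devices, size_t(512) << 20); // leave ~512 MiB free per device
        std::vector<float> tensor_split(n_devices, 0.0f);
        std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());

        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        switch (llama_params_fit("model.gguf", &mparams, &cparams, tensor_split.data(),
                                 tbo.data(), margins.data(), /*n_ctx_min =*/ 4096,
                                 GGML_LOG_LEVEL_INFO)) {
            case LLAMA_PARAMS_FIT_STATUS_SUCCESS: break; // mparams/cparams adjusted in place
            case LLAMA_PARAMS_FIT_STATUS_FAILURE: break; // could not meet the margins, fall back
            case LLAMA_PARAMS_FIT_STATUS_ERROR:   break; // unexpected error, abort
        }
    }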
14
llama/llama.cpp/src/models/afmoe.cpp
vendored
@@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));

for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

ggml_tensor * inpSA = inpL;

// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0;

// dual attention normalization (pre)
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
@@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il);

// RoPE only for sliding_attention layers
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
((il + 1) % hparams.n_no_rope_layer_step) != 0;
if (use_rope) {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_rope", il);

Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur_rope", il);
}
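The switch from the global freq_base/freq_scale to per-layer freq_base_l/freq_scale_l lets models that mix sliding-window and full-attention layers use different RoPE parameters per layer. A one-function sketch of the common pattern; the values follow convention (e.g. Gemma 3 uses roughly 10k for local and 1M for global layers) and are not taken from this patch:

    // Illustrative per-layer RoPE base selection for mixed SWA/global models.
    float rope_freq_base_for_layer(bool is_swa_layer) {
        return is_swa_layer ? 10000.0f : 1000000.0f; // example values only
    }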
6
llama/llama.cpp/src/models/bert.cpp
vendored
@@ -142,11 +142,13 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
type_op, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
cur = build_ffn(cur,
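The up_contains_gate check detects checkpoints where the gate projection is fused into ffn_up (so its output dimension is 2*n_ff and there is no separate ffn_gate tensor), in which case GEGLU splits the projection into two halves. A plain-C++ sketch of GEGLU over a flat array, illustrative of the operation rather than the ggml implementation (the exact half ordering follows the checkpoint layout):

    #include <cmath>
    #include <vector>

    // GEGLU over a fused up+gate projection of size 2*n_ff:
    // out[i] = gelu(up[i]) * up[n_ff + i]
    std::vector<float> geglu(const std::vector<float> & up, size_t n_ff) {
        std::vector<float> out(n_ff);
        for (size_t i = 0; i < n_ff; i++) {
            const float a = up[i];        // first half: activated
            const float b = up[n_ff + i]; // second half: gate
            const float gelu_a = 0.5f * a * (1.0f + std::tanh(0.7978845608f * (a + 0.044715f * a * a * a)));
            out[i] = gelu_a * b;
        }
        return out;
    }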
8
llama/llama.cpp/src/models/cogvlm.cpp
vendored
@@ -3,12 +3,14 @@
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
float kq_scale = 1.0f / sqrtf(float(n_embd_head));
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);

ggml_tensor *inpL, *cur;
ggml_tensor * inpL;
ggml_tensor * cur;

inpL = build_inp_embd(model.tok_embd);

ggml_tensor * inp_pos = build_inp_pos();
@@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
}

ggml_tensor * inpSA = inpL;
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);

// build self attention
{
3
llama/llama.cpp/src/models/cohere2-iswa.cpp
vendored
@@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const

for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.is_swa(il);
// UNUSED:
// const float freq_base_l = model.get_rope_freq_base (cparams, il);
// const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
2
llama/llama.cpp/src/models/deepseek2.cpp
vendored
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                        model.layers[il].ffn_exp_probs_b,
                        n_expert, n_expert_used,
                        LLM_FFN_SILU, hparams.expert_weights_norm,
                        true, hparams.expert_weights_scale,
                        hparams.expert_weights_scale, hparams.expert_weights_scale,
                        (llama_expert_gating_func_type) hparams.expert_gating_func,
                        il);
            cb(moe_out, "ffn_moe_out", il);

@@ -1,7 +1,5 @@
#include "models.h"


llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
    if (ubatch.token) {
        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
        cb(inpL, "inp_scaled", -1);
    }
    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
    cb(inpL, "inp_scaled", -1);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

7
llama/llama.cpp/src/models/gemma2-iswa.cpp
vendored
@@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll

        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);

        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);

        cb(Qcur, "Qcur", il);

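The same change recurs across several of the iSWA builders in this compare: the fixed, model-wide freq_base/freq_scale arguments to ggml_rope_ext are replaced with per-layer values looked up through model.get_rope_freq_base/get_rope_freq_scale, since sliding-window layers typically use a different RoPE base than full-attention layers. A minimal self-contained sketch of that idea follows; the names, values, and layer pattern here are illustrative assumptions, not llama.cpp's actual hparams.

#include <cstdint>

struct rope_hparams {
    float    freq_base_full = 1000000.0f; // assumed long-context base for full-attention layers
    float    freq_base_swa  = 10000.0f;   // assumed short-context base for sliding-window layers
    uint32_t full_attn_period = 4;        // assumed: every 4th layer is full-attention
};

// returns the RoPE base frequency to use for layer `il`
static float rope_freq_base_for_layer(const rope_hparams & hp, uint32_t il) {
    const bool is_swa = (il + 1) % hp.full_attn_period != 0; // illustrative interleave pattern
    return is_swa ? hp.freq_base_swa : hp.freq_base_full;
}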
7
llama/llama.cpp/src/models/gemma3.cpp
vendored
@@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
    if (ubatch.token) {
        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
        cb(inpL, "inp_scaled", -1);
    }
    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
    cb(inpL, "inp_scaled", -1);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

27
llama/llama.cpp/src/models/gemma3n-iswa.cpp
vendored
@@ -1,7 +1,5 @@
#include "models.h"


llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params),
    model(model),
@@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
    if (ubatch.token) {
        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
        cb(inpL, "inp_scaled", -1);
    }
    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
    cb(inpL, "inp_scaled", -1);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

@@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
// equivalent to get_per_layer_inputs() in python code
// output shape: [n_embd_altup, n_layer, n_tokens]
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
    auto inp = std::make_unique<llm_graph_input_embd>();
    auto inp = std::make_unique<llm_graph_input_embd>();
    ggml_tensor * inp_per_layer;
    if (ubatch.token) {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
@@ -258,10 +255,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
        cb(inp_per_layer, "inp_per_layer_selected", -1);
        res->add_input(std::move(inp));
    } else {
        GGML_ABORT("TODO: support embd input");
        // Vision embedding path: use padding token (ID=0) embedding
        const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer

        // Extract and dequantize padding token embedding (column 0)
        ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
        ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
        inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);

        // Reshape to [n_embd_altup, n_layer, 1]
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
        cb(inp_per_layer, "inp_per_layer_vision", -1);
    }
    res->add_input(std::move(inp));
    return inp_per_layer;
}

@@ -279,7 +286,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
            -1); // [n_embd_altup, n_layer, n_tokens]
    cb(per_layer_proj, "per_layer_proj", -1);

    inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
    inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
    inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
    cb(inp_per_layer, "inp_per_layer", -1);

8
llama/llama.cpp/src/models/llama-iswa.cpp
vendored
@@ -25,8 +25,12 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

        ggml_tensor * inpSA = inpL;

        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
                              (il + 1) % hparams.n_no_rope_layer_step != 0;

@@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
            if (use_rope) {
                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );
            } else if (inp_attn_scale) {

25
llama/llama.cpp/src/models/llama.cpp
vendored
@@ -1,6 +1,7 @@
#include "models.h"

llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
template <bool embed>
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14,7 +15,14 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();
    using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;

    inp_attn_type * inp_attn = nullptr;
    if constexpr (embed) {
        inp_attn = build_attn_inp_no_cache();
    } else {
        inp_attn = build_attn_inp_kv();
    }

    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -145,11 +153,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);
    if constexpr (!embed) {
        // lm_head
        cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;
        cb(cur, "result_output", -1);
        res->t_logits = cur;
    }

    ggml_build_forward_expand(gf, cur);
}

template struct llm_build_llama<false>;
template struct llm_build_llama<true>;

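The hunk above turns a single builder into a compile-time pair of builders. A self-contained sketch of that C++ pattern follows, using std::conditional_t to pick a type and if constexpr to drop a code path; the struct names here are stand-ins I invented, not the llama.cpp types.

#include <type_traits>

struct attn_input_kv       { /* stands in for a KV-cache-backed attention input */ };
struct attn_input_no_cache { /* stands in for a cacheless attention input        */ };

template <bool embed>
struct graph_builder {
    // the attention-input type is selected at compile time
    using inp_attn_t = std::conditional_t<embed, attn_input_no_cache, attn_input_kv>;

    void build() {
        inp_attn_t inp_attn{};
        (void) inp_attn;
        if constexpr (!embed) {
            // the logits head would only be built in the non-embedding variant
        }
    }
};

// explicit instantiations, mirroring the pattern in the diff above
template struct graph_builder<false>;
template struct graph_builder<true>;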
117
llama/llama.cpp/src/models/maincoder.cpp
vendored
Normal file
@@ -0,0 +1,117 @@
#include "models.h"

llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            // compute Q and K and RoPE them
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
            cb(Qcur, "Qcur", il);

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
            cb(Kcur, "Kcur", il);

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
            cb(Qcur, "Qcur_normed", il);

            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
            cb(Kcur, "Kcur_normed", il);

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
        }
        if (il == n_layer - 1 && inp_out_ids) {
            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        cur = build_ffn(cur,
                model.layers[il].ffn_up, NULL, NULL,
                model.layers[il].ffn_gate, NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
        cb(cur, "ffn_out", il);

        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }
    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
123
llama/llama.cpp/src/models/mimo2-iswa.cpp
vendored
Normal file
@@ -0,0 +1,123 @@
#include "models.h"

llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    ggml_tensor * inp_pos = build_inp_pos();
    auto * inp_attn = build_attn_inp_kv_iswa();
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        uint32_t n_head_l    = hparams.n_head(il);
        uint32_t n_head_kv_l = hparams.n_head_kv(il);
        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

        cur = inpL;

        // self_attention
        {
            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // compute Q and K and RoPE them
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
            cb(Qcur, "Qcur", il);

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
            cb(Kcur, "Kcur", il);

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            ggml_tensor * sinks = model.layers[il].attn_sinks;

            cur = build_attn(inp_attn,
                    model.layers[il].wo, NULL,
                    Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
        }

        if (il == n_layer - 1 && inp_out_ids) {
            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        // feed-forward network
        if (model.layers[il].ffn_gate_inp == nullptr) {
            // dense branch
            cur = build_ffn(cur,
                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // MoE branch
            cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
                    model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
                    0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
            cb(cur, "ffn_moe_out", il);
        }

        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
18
llama/llama.cpp/src/models/models.h
vendored
@@ -303,6 +303,7 @@ struct llm_build_llada_moe : public llm_graph_context {
    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
};

template <bool embed>
struct llm_build_llama : public llm_graph_context {
    llm_build_llama(const llama_model & model, const llm_graph_params & params);
};
@@ -311,10 +312,18 @@ struct llm_build_llama_iswa : public llm_graph_context {
    llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_maincoder : public llm_graph_context {
    llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_mamba : public llm_graph_context_mamba {
    llm_build_mamba(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_mimo2_iswa : public llm_graph_context {
    llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_minicpm3 : public llm_graph_context {
    llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
};
@@ -327,6 +336,10 @@ struct llm_build_mistral3 : public llm_graph_context {
    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_modern_bert : public llm_graph_context {
    llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_mpt : public llm_graph_context {
    llm_build_mpt(const llama_model & model, const llm_graph_params & params);
};
@@ -396,6 +409,11 @@ struct llm_build_plamo : public llm_graph_context {
    llm_build_plamo(const llama_model & model, const llm_graph_params & params);
};

template <bool iswa>
struct llm_build_plamo3 : public llm_graph_context {
    llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_plm : public llm_graph_context {
    llm_build_plm(const llama_model & model, const llm_graph_params & params);
};

116
llama/llama.cpp/src/models/modern-bert.cpp
vendored
Normal file
@@ -0,0 +1,116 @@
#include "models.h"

llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    ggml_tensor * cur;
    ggml_tensor * inpL;
    ggml_tensor * inp_pos = build_inp_pos();

    // construct input embeddings (token, type, position)
    inpL = build_inp_embd(model.tok_embd);
    cb(inpL, "inp_embd", -1);

    // embed layer norm
    inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
    cb(inpL, "inp_norm", -1);

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    auto * inp_attn = build_attn_inp_no_cache();

    for (int il = 0; il < n_layer; ++il) {
        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

        cur = inpL;

        // attention layer norm
        if (model.layers[il].attn_norm) {
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM, il);
            cb(cur, "attn_norm", il);
        }

        // self attention
        cur = build_lora_mm(model.layers[il].wqkv, cur);
        cb(cur, "wqkv", il);

        const size_t type_size = ggml_type_size(cur->type);

        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));

        // RoPE
        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );

        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        cur = build_attn(inp_attn,
                model.layers[il].wo, nullptr,
                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
        cb(cur, "kqv_out", il);

        if (il == n_layer - 1 && inp_out_ids) {
            cur  = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
        }

        // re-add the layer input
        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
        cb(ffn_inp, "ffn_inp", il);

        // attention layer norm
        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM, il);
        cb(cur, "ffn_norm", il);

        cur = build_ffn(cur,
                model.layers[il].ffn_up, NULL, NULL,
                NULL, NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_GEGLU, LLM_FFN_SEQ, il);

        // attentions bypass the intermediate layer
        cur = ggml_add(ctx0, cur, ffn_inp);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM, -1);
    cb(cur, "final_norm_out", -1);

    if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
        // extracting cls token
        cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
        cb(cur, "cls_pooled_embd", -1);
    }

    cb(cur, "res_embd", -1);
    res->t_embd = cur;
    ggml_build_forward_expand(gf, cur);
}
@@ -14,6 +14,9 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

        ggml_tensor * inpSA = inpL;

        // norm
@@ -49,13 +52,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,

        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );

        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );

128
llama/llama.cpp/src/models/plamo3.cpp
vendored
Normal file
@@ -0,0 +1,128 @@
#include "models.h"

template <bool iswa>
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t head_dim_q = hparams.n_embd_head_k;
    const int64_t head_dim_v = hparams.n_embd_head_v;

    ggml_tensor * cur;
    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
    ggml_tensor * inp_pos = build_inp_pos();

    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
    inp_attn_type * inp_attn = nullptr;

    if constexpr (iswa) {
        inp_attn = build_attn_inp_kv_iswa();
    } else {
        inp_attn = build_attn_inp_kv();
    }

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * residual = inpL;

        float freq_base_l  = 0.0f;
        float freq_scale_l = 0.0f;
        if constexpr (iswa) {
            freq_base_l  = model.get_rope_freq_base (cparams, il);
            freq_scale_l = model.get_rope_freq_scale(cparams, il);
        } else {
            freq_base_l  = freq_base;
            freq_scale_l = freq_scale;
        }

        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
        cb(cur, "wqkv", il);

        const int32_t n_head    = hparams.n_head(il);
        const int32_t n_head_kv = hparams.n_head_kv(il);

        const int64_t q_offset = 0;
        const int64_t k_offset = head_dim_q * n_head;
        const int64_t v_offset = k_offset + head_dim_q * n_head_kv;

        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
                head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
                head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
                head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
        cb(Qcur, "attn_q_norm", il);
        Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
        cb(Kcur, "attn_k_norm", il);

        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);

        const float attn_scale = 1.0f / sqrtf(float(head_dim_q));

        cur = build_attn(inp_attn,
                model.layers[il].wo, NULL,
                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
        cb(cur, "attn_out", il);

        if (il == n_layer - 1 && inp_out_ids) {
            cur      = ggml_get_rows(ctx0, cur, inp_out_ids);
            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
        }

        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_post_norm", il);

        cur = ggml_add(ctx0, cur, residual);
        cb(cur, "attn_residual", il);

        residual = cur;

        cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        cur = build_ffn(cur,
                model.layers[il].ffn_up, NULL, NULL,
                NULL, NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
        cb(cur, "ffn_out", il);

        cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_post_norm", il);

        cur = ggml_add(ctx0, cur, residual);
        cb(cur, "ffn_residual", il);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
    res->t_embd = cur;

    cur = build_lora_mm(model.output, cur);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}

// Explicit template instantiations
template struct llm_build_plamo3<false>;
template struct llm_build_plamo3<true>;
18
llama/llama.cpp/src/models/smallthinker.cpp
vendored
@@ -26,10 +26,16 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;
        ggml_tensor * probs = nullptr;
        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

        probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL);  // [n_expert, n_tokens]
        ggml_tensor * inpSA = inpL;

        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
        const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
                              il % hparams.n_no_rope_layer_step != 0;

        ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL);  // [n_expert, n_tokens]
        cb(probs, "ffn_moe_logits", il);

        // norm
@@ -52,11 +58,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
            if (use_rope) {
                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                                     ext_factor, attn_factor, beta_fast, beta_slow);

                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                                     ext_factor, attn_factor, beta_fast, beta_slow);
            }
            cb(Qcur, "Qcur", il);

37
llama/llama.cpp/src/unicode.cpp
vendored
@@ -985,6 +985,11 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
        { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
        { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
        { "\\p{S}", unicode_cpt_flags::SYMBOL },
        { "\\p{Lu}", unicode_cpt_flags::LETTER },  // Uppercase letter
        { "\\p{Ll}", unicode_cpt_flags::LETTER },  // Lowercase letter
        { "\\p{Lt}", unicode_cpt_flags::LETTER },  // Titlecase letter
        { "\\p{Lm}", unicode_cpt_flags::LETTER },  // Modifier letter
        { "\\p{Lo}", unicode_cpt_flags::LETTER },  // Other letter
    };

    static const std::map<int, int> k_ucat_cpt = {
@@ -1095,22 +1100,26 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                continue;
            }

            if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
            // Match \p{...} Unicode properties of varying lengths
            if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
                regex_expr[i + 1] == 'p' &&
                regex_expr[i + 2] == '{' &&
                regex_expr[i + 4] == '}') {
                const std::string pat = regex_expr.substr(i, 5);
                if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
                    if (!inside) {
                        regex_expr_collapsed += '[';
                regex_expr[i + 2] == '{') {
                // Find the closing brace
                size_t closing_brace = regex_expr.find('}', i + 3);
                if (closing_brace != std::string::npos && closing_brace <= i + 10) {  // reasonable limit
                    const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
                    if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
                        if (!inside) {
                            regex_expr_collapsed += '[';
                        }
                        regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
                        regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
                        if (!inside) {
                            regex_expr_collapsed += ']';
                        }
                        i = closing_brace;
                        continue;
                    }
                    regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
                    regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
                    if (!inside) {
                        regex_expr_collapsed += ']';
                    }
                    i += 4;
                    continue;
                }
            }

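The unicode.cpp hunk above replaces a fixed-width match ("\p{X}" is exactly five characters) with a scan for the closing brace, so two-letter categories such as "\p{Lu}" are recognized too. A minimal standalone sketch of that scanning step, written outside llama.cpp with an illustrative helper name:

#include <string>
#include <cstddef>

// returns the full "\p{...}" token starting at `i`, or an empty string if none
static std::string extract_unicode_property(const std::string & re, size_t i) {
    if (i + 3 < re.size() && re[i] == '\\' && re[i + 1] == 'p' && re[i + 2] == '{') {
        const size_t close = re.find('}', i + 3);
        if (close != std::string::npos && close <= i + 10) {  // same "reasonable limit" as the diff
            return re.substr(i, close - i + 1);               // e.g. "\p{Lu}" or "\p{N}"
        }
    }
    return {};
}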
81
llama/llama.cpp/tools/mtmd/clip-impl.h
vendored
@@ -45,13 +45,14 @@
#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"

#define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
#define KEY_MINICPMV_VERSION       "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM     "clip.minicpmv_query_num"
#define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
#define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
#define KEY_MINICPMV_VERSION       "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM     "clip.minicpmv_query_num"

// audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
@@ -138,6 +139,62 @@
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"

// (conformer) lfm2
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
#define TN_FFN_NORM       "%s.blk.%d.ffn_norm.%s"
#define TN_FFN_NORM_1     "%s.blk.%d.ffn_norm_1.%s"
#define TN_FFN_UP_1       "%s.blk.%d.ffn_up_1.%s"
#define TN_FFN_DOWN_1     "%s.blk.%d.ffn_down_1.%s"
#define TN_POS_BIAS_U     "%s.blk.%d.pos_bias_u"
#define TN_POS_BIAS_V     "%s.blk.%d.pos_bias_v"
#define TN_NORM_CONV      "%s.blk.%d.norm_conv.%s"
#define TN_LINEAR_POS     "%s.blk.%d.linear_pos.%s"
#define TN_CONV_DW        "%s.blk.%d.conv_dw.%s"
#define TN_CONV_NORM      "%s.blk.%d.conv_norm.%s"
#define TN_CONV_PW1       "%s.blk.%d.conv_pw1.%s"
#define TN_CONV_PW2       "%s.blk.%d.conv_pw2.%s"

// mobilenetv5 (gemma3n) definitions
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
#define TN_MNV5_STEM_BN   "v.conv_stem.bn.weight"

// Stage 0 Block (Edge Residual)
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"

// Stage 1+ Block (Universal Inverted Residual)
#define TN_MNV5_BLK_DW_START_W  "v.blk.%d.%d.dw_start.conv.weight"
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
#define TN_MNV5_BLK_DW_MID_W    "v.blk.%d.%d.dw_mid.conv.weight"
#define TN_MNV5_BLK_DW_MID_BN   "v.blk.%d.%d.dw_mid.bn.weight"
#define TN_MNV5_BLK_PW_EXP_W    "v.blk.%d.%d.pw_exp.conv.weight"
#define TN_MNV5_BLK_PW_EXP_BN   "v.blk.%d.%d.pw_exp.bn.weight"
#define TN_MNV5_BLK_PW_PROJ_W   "v.blk.%d.%d.pw_proj.conv.weight"
#define TN_MNV5_BLK_PW_PROJ_BN  "v.blk.%d.%d.pw_proj.bn.weight"
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"

// Attention Components
#define TN_MNV5_ATTN_Q_W    "v.blk.%d.%d.attn.query.proj.weight"
#define TN_MNV5_ATTN_K_W    "v.blk.%d.%d.attn.key.proj.weight"
#define TN_MNV5_ATTN_V_W    "v.blk.%d.%d.attn.value.proj.weight"
#define TN_MNV5_ATTN_O_W    "v.blk.%d.%d.attn.output.proj.weight"
#define TN_MNV5_ATTN_K_DW   "v.blk.%d.%d.attn.key.down_conv.weight"
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
#define TN_MNV5_ATTN_V_DW   "v.blk.%d.%d.attn.value.down_conv.weight"
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
#define TN_MNV5_ATTN_NORM   "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks

// MSFA
#define TN_MNV5_MSFA_FFN_EXP_W   "v.msfa.ffn.pw_exp.conv.weight"
#define TN_MNV5_MSFA_FFN_EXP_BN  "v.msfa.ffn.pw_exp.bn.weight"
#define TN_MNV5_MSFA_FFN_PROJ_W  "v.msfa.ffn.pw_proj.conv.weight"
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
#define TN_MNV5_MSFA_NORM        "v.msfa.norm.weight"

// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

@@ -155,6 +212,8 @@ enum projector_type {
    PROJECTOR_TYPE_QWEN2VL,
    PROJECTOR_TYPE_QWEN3VL,
    PROJECTOR_TYPE_GEMMA3,
    PROJECTOR_TYPE_GEMMA3NV,
    PROJECTOR_TYPE_GEMMA3NA,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
@@ -165,12 +224,15 @@ enum projector_type {
    PROJECTOR_TYPE_GLMA,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
    PROJECTOR_TYPE_MUSIC_FLAMINGO,
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_COGVLM,
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_LFM2A,
    PROJECTOR_TYPE_GLM4V,
    PROJECTOR_TYPE_YOUTUVL,
    PROJECTOR_TYPE_UNKNOWN,
};

@@ -184,6 +246,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
    { PROJECTOR_TYPE_QWEN3VL,   "qwen3vl_merger"},
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
    { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
    { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"},
@@ -193,12 +257,15 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_GLMA,      "glma"},
    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
    { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
    { PROJECTOR_TYPE_LFM2,      "lfm2"},
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
    { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
    { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {

91
llama/llama.cpp/tools/mtmd/clip-model.h
vendored
@@ -4,6 +4,7 @@
#include "clip.h"
#include "clip-impl.h"

#include <array>
#include <vector>
#include <unordered_set>
#include <cstdint>
@@ -60,6 +61,7 @@ struct clip_hparams {
    std::unordered_set<int32_t> vision_feature_layer;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)

    // audio
    int32_t n_mel_bins = 0; // whisper preprocessor
@@ -142,11 +144,74 @@ struct clip_layer {
    ggml_tensor * deepstack_fc2_w = nullptr;
    ggml_tensor * deepstack_fc2_b = nullptr;

    // lfm2
    ggml_tensor * ff_norm_w   = nullptr;
    ggml_tensor * ff_norm_b   = nullptr;
    ggml_tensor * ff_norm_1_w = nullptr;
    ggml_tensor * ff_norm_1_b = nullptr;
    ggml_tensor * ff_up_1_w   = nullptr;
    ggml_tensor * ff_up_1_b   = nullptr;
    ggml_tensor * ff_down_1_w = nullptr;
    ggml_tensor * ff_down_1_b = nullptr;
    ggml_tensor * pos_bias_u  = nullptr;
    ggml_tensor * pos_bias_v  = nullptr;
    ggml_tensor * norm_conv_w = nullptr;
    ggml_tensor * norm_conv_b = nullptr;
    ggml_tensor * linear_pos_w = nullptr;

    ggml_tensor * conv_norm_w = nullptr;
    ggml_tensor * conv_norm_b = nullptr;
    ggml_tensor * conv_dw_w   = nullptr;
    ggml_tensor * conv_dw_b   = nullptr;
    ggml_tensor * conv_pw1_w  = nullptr;
    ggml_tensor * conv_pw1_b  = nullptr;
    ggml_tensor * conv_pw2_w  = nullptr;
    ggml_tensor * conv_pw2_b  = nullptr;

    bool has_deepstack() const {
        return deepstack_fc1_w != nullptr;
    }
};

// Expanded MobileNetV5 block structure for Gemma3n vision encoder
struct mobilenetv5_block {
    // Stage 0 (Edge Residual)
    ggml_tensor * s0_conv_exp_w = nullptr;
    ggml_tensor * s0_bn1_w      = nullptr;
    ggml_tensor * s0_conv_pwl_w = nullptr;
    ggml_tensor * s0_bn2_w      = nullptr;

    // Stage 1+ (Universal Inverted Residual)
    ggml_tensor * dw_start_w    = nullptr;
    ggml_tensor * dw_start_bn_w = nullptr;

    ggml_tensor * pw_exp_w    = nullptr;
    ggml_tensor * pw_exp_bn_w = nullptr;

    ggml_tensor * dw_mid_w    = nullptr;
    ggml_tensor * dw_mid_bn_w = nullptr;

    ggml_tensor * pw_proj_w    = nullptr;
    ggml_tensor * pw_proj_bn_w = nullptr;

    ggml_tensor * layer_scale_w = nullptr;

    // Attention (MQA) components
    ggml_tensor * attn_q_w = nullptr;
    ggml_tensor * attn_k_w = nullptr;
    ggml_tensor * attn_v_w = nullptr;
    ggml_tensor * attn_o_w = nullptr;

    // Optional downsampling/norm in attention
    ggml_tensor * attn_k_dw_w   = nullptr;
    ggml_tensor * attn_k_norm_w = nullptr;
    ggml_tensor * attn_v_dw_w   = nullptr;
    ggml_tensor * attn_v_norm_w = nullptr;

    // Block norm (often present in attention blocks)
    ggml_tensor * attn_norm_w = nullptr;
};

struct clip_model {
    clip_modality modality = CLIP_MODALITY_VISION;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -263,6 +328,23 @@ struct clip_model {
    ggml_tensor * mm_input_proj_w = nullptr;
    ggml_tensor * mm_soft_emb_norm_w = nullptr;

    // mobilenetv5 for gemma3n
    std::vector<mobilenetv5_block> mobilenet_blocks;
    std::vector<int> mobilenet_stage_ends;
    ggml_tensor * mobilenet_stem_conv_w = nullptr;
    ggml_tensor * mobilenet_stem_conv_b = nullptr;
    ggml_tensor * mobilenet_stem_norm_w = nullptr;
    ggml_tensor * mm_post_proj_norm_w   = nullptr;

    // Multi-Scale Fusion Adapter (MSFA) components
    ggml_tensor * msfa_concat_conv_w  = nullptr;
    ggml_tensor * msfa_concat_norm_w  = nullptr;
    ggml_tensor * msfa_ffn_expand_w   = nullptr;
    ggml_tensor * msfa_ffn_project_w  = nullptr;
    ggml_tensor * msfa_ffn_expand_bn  = nullptr;
    ggml_tensor * msfa_ffn_project_bn = nullptr;

    // pixtral, glm4v
    ggml_tensor * token_embd_img_break = nullptr;
    ggml_tensor * mm_patch_merger_w = nullptr;
@@ -286,9 +368,16 @@ struct clip_model {
    ggml_tensor * mm_boi = nullptr;
    ggml_tensor * mm_eoi = nullptr;

    // lfm2 audio
    std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
    std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
    ggml_tensor * pre_encode_out_w = nullptr;
    ggml_tensor * pre_encode_out_b = nullptr;

    bool audio_has_avgpool() const {
        return proj_type == PROJECTOR_TYPE_QWEN2A
            || proj_type == PROJECTOR_TYPE_VOXTRAL;
            || proj_type == PROJECTOR_TYPE_VOXTRAL
            || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
    }

    bool audio_has_stack_frames() const {

373
llama/llama.cpp/tools/mtmd/clip.cpp
vendored
373
llama/llama.cpp/tools/mtmd/clip.cpp
vendored
@@ -801,6 +801,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_siglip>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
@@ -831,6 +835,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
|
||||
} break;
|
||||
@@ -850,10 +855,18 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_llava>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_conformer>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("missing cgraph builder");
|
||||
}
|
||||
@@ -1154,6 +1167,14 @@ struct clip_model_loader {
|
||||
// test model (tinygemma3) has a different value, we optionally read it
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
|
||||
// Similar configuration to Gemma3
|
||||
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
@@ -1171,6 +1192,20 @@ struct clip_model_loader {
|
||||
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||
std::vector<int> wa_layer_indexes_vec;
|
||||
get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
|
||||
for (auto & layer : wa_layer_indexes_vec) {
|
||||
hparams.wa_layer_indexes.insert(layer);
|
||||
}
|
||||
// support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
|
||||
hparams.set_limit_image_tokens(1, 62500);
|
||||
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
@@ -1189,6 +1224,7 @@ struct clip_model_loader {
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
|
||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
|
||||
@@ -1204,6 +1240,15 @@ struct clip_model_loader {
|
||||
hparams.audio_window_len = 400;
|
||||
hparams.audio_hop_len = 160;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
// audio preprocessing params
|
||||
hparams.audio_chunk_len = 1; // in seconds
|
||||
hparams.audio_sample_rate = 16000;
|
||||
hparams.audio_n_fft = 512;
|
||||
hparams.audio_window_len = 400;
|
||||
hparams.audio_hop_len = 160;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -1229,7 +1274,14 @@ struct clip_model_loader {
|
||||
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
|
||||
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
|
||||
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
|
||||
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
|
||||
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
|
||||
if (!hparams.wa_layer_indexes.empty()) {
|
||||
LOG_INF("%s: wa_layer_indexes: ", __func__);
|
||||
for (auto & layer : hparams.wa_layer_indexes) {
|
||||
LOG_INF("%d ", layer);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
}
|
||||
if (hparams.image_min_pixels > 0) {
|
||||
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
|
||||
}
|
||||
@@ -1311,6 +1363,10 @@ struct clip_model_loader {
|
||||
|
||||
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
||||
|
||||
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
hparams.n_layer = 0; // gemma3n does not use normal layer structure
|
||||
}
|
||||
|
||||
// layers
|
||||
model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
@@ -1385,6 +1441,7 @@ struct clip_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
switch (model.proj_type) {
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
@@ -1497,6 +1554,14 @@ struct clip_model_loader {
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
|
||||
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||
@@ -1516,11 +1581,112 @@ struct clip_model_loader {
|
||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
        case PROJECTOR_TYPE_GEMMA3NV:
            {
                model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
                model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
                model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);

                model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
                model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
                model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
                model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);

                model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);

                // Dynamically load blocks stage by stage
                for (int stage = 0; stage < 4; ++stage) {
                    int blocks_found_in_stage = 0;

                    for (int blk_idx = 0; ; ++blk_idx) {
                        bool found_block = false;
                        mobilenetv5_block block;

                        // 1. Check for Edge Residual (S0)
                        block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
                        if (block.s0_conv_exp_w) {
                            found_block = true;
                            block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
                            block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
                            block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
                        }
                        // 2. Check for UIR (Universal Inverted Residual)
                        else {
                            // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
                            block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
                            block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);

                            if (block.dw_start_w || block.pw_exp_w) {
                                found_block = true;
                                if (block.dw_start_w) {
                                    block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
                                }
                                if (block.pw_exp_w) {
                                    block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
                                }
                                block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
                                if (block.dw_mid_w) {
                                    block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
                                }
                                block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
                                if (block.pw_proj_w) {
                                    block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
                                }
                                block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
                            }
                        }

                        // 3. Check for Attention (MQA)
                        // Even if UIR/Edge check failed, this might be a pure attention block
                        ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
                        if (attn_q_check) {
                            found_block = true;
                            block.attn_q_w = attn_q_check;
                            block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
                            block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
                            block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
                            block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
                            block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
                            block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
                            block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
                            block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
                            // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
                            if (!block.layer_scale_w) {
                                block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
                            }
                        }

                        if (found_block) {
                            model.mobilenet_blocks.push_back(block);
                            blocks_found_in_stage++;
                        } else {
                            // End of blocks for this stage
                            break;
                        }
                    }

                    // Track where this stage ends in the flat vector
                    if (blocks_found_in_stage > 0) {
                        model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
                        LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
                    }
                }
                model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
                model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
            } break;
        case PROJECTOR_TYPE_IDEFICS3:
            {
                model.projection = get_tensor(TN_MM_PROJECTOR);
            } break;
        case PROJECTOR_TYPE_LFM2:
            {
                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
                model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
            } break;
        case PROJECTOR_TYPE_KIMIVL:
            {
                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1580,6 +1746,17 @@ struct clip_model_loader {
                model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
            } break;
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
                model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
                model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
                model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
                model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
                model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
            } break;
        case PROJECTOR_TYPE_INTERNVL:
            {
                model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1628,6 +1805,52 @@ struct clip_model_loader {
                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
            } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                for (int i : {0, 2, 3, 5, 6}) {
                    model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
                    model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
                }
                model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
                model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));

                model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
                model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
                model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
                model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
                model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));

                for (int il = 0; il < hparams.n_layer; ++il) {
                    auto & layer = model.layers[il];

                    layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
                    layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
                    layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
                    layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
                    layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
                    layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
                    layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
                    layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));

                    layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
                    layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));

                    layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
                    layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));

                    layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));

                    layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
                    layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
                    layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
                    layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"));
                    layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
                    layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
                    layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
                    layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
                }
            } break;
        default:
            GGML_ASSERT(false && "unknown projector type");
    }
@@ -1932,6 +2155,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params

    try {
        clip_model_loader loader(fname);
        bool skip_audio = false;

        if (loader.has_vision) {
            ctx_vision = new clip_ctx(ctx_params);
@@ -1941,10 +2165,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
                loader.warmup(*ctx_vision);
            }

            // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
            // we can remove this check when we implement audio support for Gemma 3N
            skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;

            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
        }

        if (loader.has_audio) {
        if (loader.has_audio && !skip_audio) {
            ctx_audio = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
            loader.load_tensors(*ctx_audio);
@@ -2668,6 +2896,57 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                // res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
        case PROJECTOR_TYPE_YOUTUVL:
            {
                const int patch_size = params.patch_size; // typically 16
                const int merge_size = params.n_merge; // typically 2
                const int align_size = patch_size * merge_size; // 32

                const int max_num_patches = params.image_max_pixels > 0 ?
                    params.image_max_pixels / (patch_size * patch_size) : 256;

                // Linear search for optimal scale to fit within max_num_patches
                float scale = 1.0f;
                int target_height = original_size.height;
                int target_width = original_size.width;

                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
                    float scaled_size = size * scale;
                    // Round up to nearest multiple of align_size
                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
                    // Ensure at least one patch
                    return std::max(align_size, aligned);
                };

                // Linear search with 0.02 step size
                while (scale > 0.0f) {
                    target_height = get_scaled_image_size(scale, original_size.height);
                    target_width = get_scaled_image_size(scale, original_size.width);

                    int num_patches_h = target_height / patch_size;
                    int num_patches_w = target_width / patch_size;
                    int num_patches = num_patches_h * num_patches_w;

                    if (num_patches > max_num_patches) {
                        scale -= 0.02f;
                    } else {
                        break;
                    }
                }

                clip_image_size new_size = {target_width, target_height};

                // Resize the image
                clip_image_u8 resized;
                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);

                // Normalize to float32
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);

                // Add to results
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
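        // Illustrative walk-through of the YOUTUVL preprocessing above (example input,
        // not from the original diff): with patch_size 16 and n_merge 2, align_size is 32
        // and max_num_patches defaults to 256. A 512x512 input starts at 32*32 = 1024
        // patches, and the 0.02-step scale search settles on a 256x256 resize, i.e.
        // 16*16 = 256 patches and 8*8 = 64 tokens after the 2x2 patch merger.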

        case PROJECTOR_TYPE_IDEFICS3:
            {
@@ -2731,6 +3010,16 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

        case PROJECTOR_TYPE_GEMMA3NV:
            {
                clip_image_u8 resized_image;
                int sz = params.image_size;
                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -2900,6 +3189,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
        case PROJECTOR_TYPE_YOUTUVL:
            return (img->nx / params.patch_size) / 2;
        default:
            break;
@@ -2915,6 +3205,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
        case PROJECTOR_TYPE_YOUTUVL:
            return (img->ny / params.patch_size) / 2;
        default:
            break;
@@ -2975,6 +3266,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
        case PROJECTOR_TYPE_YOUTUVL:
            {
                // dynamic size (2 conv, so double patch size)
                int x_patch = img->nx / (params.patch_size * 2);
@@ -2990,6 +3282,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                int scale_factor = ctx->model.hparams.n_merge;
                n_patches /= (scale_factor * scale_factor);
            } break;
        case PROJECTOR_TYPE_GEMMA3NV:
            {
                // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
                // regardless of input size (see architecture description)
                n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
            } break;
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
            {
@@ -3015,6 +3313,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                n_patches = img->nx;

@@ -3047,6 +3346,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
            {
                n_patches += 2; // for BOI and EOI token embeddings
            } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
            } break;
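        // The LFM2A expression above is three rounds of ceil-division by 2, i.e. roughly
        // nx / 8 output frames (presumably one halving per stride-2 stage of the audio
        // pre-encoder).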
        default:
            GGML_ABORT("unsupported projector type");
    }
@@ -3097,7 +3400,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int pos_w = image_size_width / patch_size;
    const int pos_h = image_size_height / patch_size;

    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

    auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3246,9 +3548,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_YOUTUVL:
            {
                // pw * ph = number of tokens output by ViT after apply patch merger
                // ipw * ipw = number of vision token been processed inside ViT
                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                const int merge_ratio = 2;
                const int pw = image_size_width / patch_size / merge_ratio;
                const int ph = image_size_height / patch_size / merge_ratio;
@@ -3259,7 +3563,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                std::vector<int> inv_idx(ph * pw);

                if (use_window_attn) {
                    const int attn_window_size = 112;
                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
                    const int grid_window = attn_window_size / patch_size / merge_ratio;
                    int dst = 0;
                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3376,6 +3680,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("patches", patches);
            } break;
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_GEMMA3NV:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
@@ -3383,6 +3688,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
@@ -3405,6 +3711,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                }
                set_input_i32("pos_w", pos_data);
            } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                GGML_ASSERT(imgs.entries.size() == 1);
                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());

                auto d_model = 512;
                auto seq_len = n_frames * 2 - 1;
                std::vector<float> pos_emb(d_model*seq_len);
                std::vector<double> inv_freq(d_model / 2);
                for (size_t i = 0; i < inv_freq.size(); ++i) {
                    inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
                }
                for (int64_t pos = 0; pos < seq_len; ++pos) {
                    for (size_t i = 0; i < inv_freq.size(); ++i) {
                        const float ang = (n_frames - pos - 1) * inv_freq[i];
                        pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
                        pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
                    }
                }
                set_input_f32("pos_emb", pos_emb);
            } break;
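        // Layout of the pos_emb buffer filled above: 2*n_frames - 1 relative positions
        // of width d_model = 512, sin in even channels and cos in odd channels, with
        // frequencies 10000^(-2i/d_model); the relative index (n_frames - pos - 1)
        // runs from n_frames - 1 down to -(n_frames - 1).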
        default:
            GGML_ABORT("Unknown projector type");
    }
@@ -3475,16 +3802,19 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_YOUTUVL:
            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_GEMMA3NV:
            return ctx->model.mm_input_proj_w->ne[0];
        case PROJECTOR_TYPE_IDEFICS3:
            return ctx->model.projection->ne[1];
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
            return ctx->model.mm_3_w->ne[1];
@@ -3499,6 +3829,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_COGVLM:
            return ctx->model.mm_4h_to_h_w->ne[1];
        case PROJECTOR_TYPE_LFM2A:
            return ctx->model.position_embeddings->ne[0];
        case PROJECTOR_TYPE_GLM4V:
            return ctx->model.mm_ffn_down_w->ne[1];
        default:
@@ -3507,6 +3839,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
}

int clip_is_minicpmv(const struct clip_ctx * ctx) {
    // TODO: remove this function
    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
        return ctx->model.hparams.minicpmv_version;
    }
@@ -3514,24 +3847,26 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
}

bool clip_is_glm(const struct clip_ctx * ctx) {
    // TODO: remove this function
    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
}

bool clip_is_mrope(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
    switch (ctx->proj_type()) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            return true;
        default:
            return false;
    }
}

bool clip_is_llava(const struct clip_ctx * ctx) {
    return ctx->model.hparams.has_llava_projector;
}

bool clip_is_gemma3(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
}

bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_VISION;
}
@@ -3541,10 +3876,16 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
}

bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
    switch (ctx->proj_type()) {
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return true;
        default:
            return false;
    }
}

bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
3
llama/llama.cpp/tools/mtmd/clip.h
vendored
@@ -106,7 +106,8 @@ int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_is_gemma3(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated
// do NOT add new functions like this

bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
Some files were not shown because too many files have changed in this diff.