mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-22 07:38:26 -04:00
Compare commits
3 Commits
issue-9414
...
docs/wan-g
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b27de08fff | ||
|
|
510f791ccc | ||
|
|
369c50a41c |
@@ -1,38 +0,0 @@
|
||||
From: LocalAI maintainers <noreply@localai.io>
|
||||
Subject: [PATCH] gemma3: default rms norm eps when GGUF metadata key is missing
|
||||
|
||||
Some Gemma 3 GGUF files (notably those distributed via the Ollama
|
||||
registry) do not embed the `gemma3.attention.layer_norm_rms_epsilon`
|
||||
metadata key. ik_llama.cpp currently requires the key to be present and
|
||||
fails the entire model load with:
|
||||
|
||||
error loading model hyperparameters:
|
||||
key not found in model: gemma3.attention.layer_norm_rms_epsilon
|
||||
|
||||
Ollama's own loader silently falls back to ~1e-6 in the same situation,
|
||||
which is the canonical Gemma 3 default (see google/gemma_pytorch
|
||||
config.py and the Hugging Face Gemma3Config), so the model still loads
|
||||
and works correctly.
|
||||
|
||||
Mirror that behavior here: pre-seed the field with the Gemma 3 default
|
||||
and mark the metadata key as optional. This unblocks Ollama-converted
|
||||
Gemma 3 models without affecting GGUFs that already carry the key.
|
||||
|
||||
Refs: ggml-org/llama.cpp#12367, ollama/ollama#10262, mudler/LocalAI#9414
|
||||
---
|
||||
src/llama-hparams.cpp | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
|
||||
--- a/src/llama-hparams.cpp
|
||||
+++ b/src/llama-hparams.cpp
|
||||
@@ -679,7 +679,8 @@
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
+ hparams.f_norm_rms_eps = 1e-6f; // Gemma 3 canonical default; some Ollama GGUFs omit the key
|
||||
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 26: model.type = e_model::MODEL_2B; break;
|
||||
@@ -1,38 +0,0 @@
|
||||
From: LocalAI maintainers <noreply@localai.io>
|
||||
Subject: [PATCH] gemma3: default rms norm eps when GGUF metadata key is missing
|
||||
|
||||
Some Gemma 3 GGUF files (notably those distributed via the Ollama
|
||||
registry) do not embed the `gemma3.attention.layer_norm_rms_epsilon`
|
||||
metadata key. llama.cpp currently requires the key to be present and
|
||||
fails the entire model load with:
|
||||
|
||||
error loading model hyperparameters:
|
||||
key not found in model: gemma3.attention.layer_norm_rms_epsilon
|
||||
|
||||
Ollama's own loader silently falls back to ~1e-6 in the same situation,
|
||||
which is the canonical Gemma 3 default (see google/gemma_pytorch
|
||||
config.py and the Hugging Face Gemma3Config), so the model still loads
|
||||
and works correctly.
|
||||
|
||||
Mirror that behavior here: pre-seed the field with the Gemma 3 default
|
||||
and mark the metadata key as optional. This unblocks Ollama-converted
|
||||
Gemma 3 models without affecting GGUFs that already carry the key.
|
||||
|
||||
Refs: ggml-org/llama.cpp#12367, ollama/ollama#10262, mudler/LocalAI#9414
|
||||
---
|
||||
src/llama-model.cpp | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1568,7 +1568,8 @@
|
||||
|
||||
hparams.f_final_logit_softcapping = 0.0f;
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
+ hparams.f_norm_rms_eps = 1e-6f; // Gemma 3 canonical default; some Ollama GGUFs omit the key
|
||||
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 18: type = LLM_TYPE_270M; break;
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
# Pinned to the HEAD of feature/turboquant-kv-cache on https://github.com/TheTom/llama-cpp-turboquant.
|
||||
# Auto-bumped nightly by .github/workflows/bump_deps.yaml.
|
||||
TURBOQUANT_VERSION?=45f8a066ed5f5bb38c695cec532f6cef9f4efa9d
|
||||
TURBOQUANT_VERSION?=627ebbc6e27727bd4f65422d8aa60b13404993c8
|
||||
LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -1,13 +1,22 @@
|
||||
#!/bin/bash
|
||||
# Augment the shared backend/cpp/llama-cpp/grpc-server.cpp allow-list of KV-cache
|
||||
# types so the gRPC `LoadModel` call accepts the TurboQuant-specific
|
||||
# `turbo2` / `turbo3` / `turbo4` cache types.
|
||||
# Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the
|
||||
# turboquant build to account for two gaps between upstream and the fork:
|
||||
#
|
||||
# We do this on the *copy* sitting in turboquant-<flavor>-build/, never on the
|
||||
# original under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps
|
||||
# compiling against vanilla upstream which does not know about GGML_TYPE_TURBO*.
|
||||
# 1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the
|
||||
# fork-specific `turbo2` / `turbo3` / `turbo4` cache types.
|
||||
# 2. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962,
|
||||
# server-side random per-instance marker) with the legacy "<__media__>"
|
||||
# literal. The fork branched before that PR, so server-common.cpp has no
|
||||
# get_media_marker symbol. The fork's mtmd_default_marker() still returns
|
||||
# "<__media__>", and Go-side tooling falls back to that sentinel when the
|
||||
# backend does not expose media_marker, so substituting the literal keeps
|
||||
# behavior identical on the turboquant path.
|
||||
#
|
||||
# Idempotent: skips the insertion if the marker is already present (so re-runs
|
||||
# We patch the *copy* sitting in turboquant-<flavor>-build/, never the original
|
||||
# under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling
|
||||
# against vanilla upstream.
|
||||
#
|
||||
# Idempotent: skips each insertion if its marker is already present (so re-runs
|
||||
# of the same build dir don't double-insert).
|
||||
|
||||
set -euo pipefail
|
||||
@@ -25,33 +34,47 @@ if [[ ! -f "$SRC" ]]; then
|
||||
fi
|
||||
|
||||
if grep -q 'GGML_TYPE_TURBO2_0' "$SRC"; then
|
||||
echo "==> $SRC already has TurboQuant cache types, skipping"
|
||||
exit 0
|
||||
echo "==> $SRC already has TurboQuant cache types, skipping KV allow-list patch"
|
||||
else
|
||||
echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types"
|
||||
|
||||
# Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,`
|
||||
# line (the kv_cache_types[] allow-list). Using awk because the builder image
|
||||
# does not ship python3, and GNU sed's multi-line `a\` quoting is awkward.
|
||||
awk '
|
||||
/^ GGML_TYPE_Q5_1,$/ && !done {
|
||||
print
|
||||
print " // turboquant fork extras — added by patch-grpc-server.sh"
|
||||
print " GGML_TYPE_TURBO2_0,"
|
||||
print " GGML_TYPE_TURBO3_0,"
|
||||
print " GGML_TYPE_TURBO4_0,"
|
||||
done = 1
|
||||
next
|
||||
}
|
||||
{ print }
|
||||
END {
|
||||
if (!done) {
|
||||
print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
' "$SRC" > "$SRC.tmp"
|
||||
mv "$SRC.tmp" "$SRC"
|
||||
|
||||
echo "==> KV allow-list patch OK"
|
||||
fi
|
||||
|
||||
echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types"
|
||||
if grep -q 'get_media_marker()' "$SRC"; then
|
||||
echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal"
|
||||
# Only one call site today (ModelMetadata), but replace all occurrences to
|
||||
# stay robust if upstream adds more. Use a temp file to avoid relying on
|
||||
# sed -i portability (the builder image uses GNU sed, but this keeps the
|
||||
# approach consistent with the awk block above).
|
||||
sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp"
|
||||
mv "$SRC.tmp" "$SRC"
|
||||
echo "==> get_media_marker() substitution OK"
|
||||
else
|
||||
echo "==> $SRC has no get_media_marker() call, skipping media-marker patch"
|
||||
fi
|
||||
|
||||
# Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,`
|
||||
# line (the kv_cache_types[] allow-list). Using awk because the builder image
|
||||
# does not ship python3, and GNU sed's multi-line `a\` quoting is awkward.
|
||||
awk '
|
||||
/^ GGML_TYPE_Q5_1,$/ && !done {
|
||||
print
|
||||
print " // turboquant fork extras — added by patch-grpc-server.sh"
|
||||
print " GGML_TYPE_TURBO2_0,"
|
||||
print " GGML_TYPE_TURBO3_0,"
|
||||
print " GGML_TYPE_TURBO4_0,"
|
||||
done = 1
|
||||
next
|
||||
}
|
||||
{ print }
|
||||
END {
|
||||
if (!done) {
|
||||
print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
' "$SRC" > "$SRC.tmp"
|
||||
mv "$SRC.tmp" "$SRC"
|
||||
|
||||
echo "==> patched OK"
|
||||
echo "==> all patches applied"
|
||||
|
||||
@@ -1,83 +0,0 @@
|
||||
From 660600081fb7b9b769ded5c805a2d39a419f0a0d Mon Sep 17 00:00:00 2001
|
||||
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
|
||||
Date: Wed, 8 Apr 2026 11:12:15 -0400
|
||||
Subject: [PATCH] server: respect the ignore eos flag (#21203)
|
||||
|
||||
---
|
||||
tools/server/server-context.cpp | 3 +++
|
||||
tools/server/server-context.h | 3 +++
|
||||
tools/server/server-task.cpp | 3 ++-
|
||||
tools/server/server-task.h | 1 +
|
||||
4 files changed, 9 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||
index 9d3ac538..b31981c5 100644
|
||||
--- a/tools/server/server-context.cpp
|
||||
+++ b/tools/server/server-context.cpp
|
||||
@@ -3033,6 +3033,8 @@ server_context_meta server_context::get_meta() const {
|
||||
/* fim_rep_token */ llama_vocab_fim_rep(impl->vocab),
|
||||
/* fim_sep_token */ llama_vocab_fim_sep(impl->vocab),
|
||||
|
||||
+ /* logit_bias_eog */ impl->params_base.sampling.logit_bias_eog,
|
||||
+
|
||||
/* model_vocab_type */ llama_vocab_type(impl->vocab),
|
||||
/* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab),
|
||||
/* model_n_ctx_train */ llama_model_n_ctx_train(impl->model),
|
||||
@@ -3117,6 +3119,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
ctx_server.vocab,
|
||||
params,
|
||||
meta->slot_n_ctx,
|
||||
+ meta->logit_bias_eog,
|
||||
data);
|
||||
task.id_slot = json_value(data, "id_slot", -1);
|
||||
|
||||
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
|
||||
index d7ce8735..6ea9afc0 100644
|
||||
--- a/tools/server/server-context.h
|
||||
+++ b/tools/server/server-context.h
|
||||
@@ -39,6 +39,9 @@ struct server_context_meta {
|
||||
llama_token fim_rep_token;
|
||||
llama_token fim_sep_token;
|
||||
|
||||
+ // sampling
|
||||
+ std::vector<llama_logit_bias> logit_bias_eog;
|
||||
+
|
||||
// model meta
|
||||
enum llama_vocab_type model_vocab_type;
|
||||
int32_t model_vocab_n_tokens;
|
||||
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
|
||||
index 4cc87bc5..856b3f0e 100644
|
||||
--- a/tools/server/server-task.cpp
|
||||
+++ b/tools/server/server-task.cpp
|
||||
@@ -239,6 +239,7 @@ task_params server_task::params_from_json_cmpl(
|
||||
const llama_vocab * vocab,
|
||||
const common_params & params_base,
|
||||
const int n_ctx_slot,
|
||||
+ const std::vector<llama_logit_bias> & logit_bias_eog,
|
||||
const json & data) {
|
||||
task_params params;
|
||||
|
||||
@@ -562,7 +563,7 @@ task_params server_task::params_from_json_cmpl(
|
||||
if (params.sampling.ignore_eos) {
|
||||
params.sampling.logit_bias.insert(
|
||||
params.sampling.logit_bias.end(),
|
||||
- defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end());
|
||||
+ logit_bias_eog.begin(), logit_bias_eog.end());
|
||||
}
|
||||
}
|
||||
|
||||
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
|
||||
index d855bf08..243e47a8 100644
|
||||
--- a/tools/server/server-task.h
|
||||
+++ b/tools/server/server-task.h
|
||||
@@ -209,6 +209,7 @@ struct server_task {
|
||||
const llama_vocab * vocab,
|
||||
const common_params & params_base,
|
||||
const int n_ctx_slot,
|
||||
+ const std::vector<llama_logit_bias> & logit_bias_eog,
|
||||
const json & data);
|
||||
|
||||
// utility function
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1008,6 +1008,20 @@
|
||||
nvidia-cuda-12: "cuda12-turboquant-development"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant-development"
|
||||
- !!merge <<: *stablediffusionggml
|
||||
name: "stablediffusion-ggml-development"
|
||||
capabilities:
|
||||
default: "cpu-stablediffusion-ggml-development"
|
||||
nvidia: "cuda12-stablediffusion-ggml-development"
|
||||
intel: "intel-sycl-f16-stablediffusion-ggml-development"
|
||||
# amd: "rocm-stablediffusion-ggml-development"
|
||||
vulkan: "vulkan-stablediffusion-ggml-development"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-stablediffusion-ggml-development"
|
||||
metal: "metal-stablediffusion-ggml-development"
|
||||
nvidia-cuda-13: "cuda13-stablediffusion-ggml-development"
|
||||
nvidia-cuda-12: "cuda12-stablediffusion-ggml-development"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-stablediffusion-ggml-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-stablediffusion-ggml-development"
|
||||
- !!merge <<: *neutts
|
||||
name: "cpu-neutts"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-neutts"
|
||||
|
||||
@@ -15186,10 +15186,10 @@
|
||||
- gpu
|
||||
overrides:
|
||||
parameters:
|
||||
model: wan2.1-t2v-1.3B-Q8_0.gguf
|
||||
model: wan2.1_t2v_1.3b-q8_0.gguf
|
||||
files:
|
||||
- filename: "wan2.1-t2v-1.3B-Q8_0.gguf"
|
||||
uri: "huggingface://calcuis/wan-gguf/wan2.1-t2v-1.3B-Q8_0.gguf"
|
||||
- filename: "wan2.1_t2v_1.3b-q8_0.gguf"
|
||||
uri: "huggingface://calcuis/wan-gguf/wan2.1_t2v_1.3b-q8_0.gguf"
|
||||
- filename: "wan_2.1_vae.safetensors"
|
||||
uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors"
|
||||
- filename: "umt5-xxl-encoder-Q8_0.gguf"
|
||||
|
||||
@@ -9,11 +9,6 @@ config_file: |
|
||||
- "diffusion_model"
|
||||
- "vae_decode_only:false"
|
||||
- "sampler:euler"
|
||||
- "scheduler:discrete"
|
||||
- "flow_shift:3.0"
|
||||
- "diffusion_flash_attn:true"
|
||||
- "offload_params_to_cpu:true"
|
||||
- "keep_vae_on_cpu:true"
|
||||
- "keep_clip_on_cpu:true"
|
||||
- "t5xxl_path:umt5-xxl-encoder-Q8_0.gguf"
|
||||
- "vae_path:wan_2.1_vae.safetensors"
|
||||
|
||||
Reference in New Issue
Block a user