From 9bc69c9e5f67942db5e7ec818f500f355ba56465 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 6 Jun 2026 13:52:46 +0200 Subject: [PATCH] chore(model gallery): :robot: add 1 new models via gallery agent (#10200) chore(model gallery): :robot: add new models via gallery agent Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- gallery/index.yaml | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 37ff7cd15..f6089b9b4 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1,4 +1,57 @@ --- +- name: "gemma-4-12b-it-qat-q4_0" + url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + urls: + - https://huggingface.co/google/gemma-4-12B-it-qat-q4_0-gguf + description: | + Hugging Face | + GitHub | + Launch Blog | + Documentation + + License: Apache 2.0 | Authors: Google DeepMind + + > [!Note] + > This model card is for the new versions of the Gemma 4 family optimized with Quantization-Aware Training (QAT), which allows preserving similar quality to bfloat16 while dramatically reducing the memory requirements to load the model. + > Four versions of the QAT checkpoints are available: + > * **Unquantized QAT checkpoints** (Q4_0): Half-precision weights extracted from the QAT pipeline, ideal for custom downstream compilation and research. Available for Gemma 4 E2B, E4B, 12B, 26B A4B, and 31B, and their drafter models. + > * **GGUF** (Q4_0): Ready-to-deploy formats for broad ecosystem compatibility. Available for Gemma 4 E2B, E4B, 12B, 26B A4B, and 31B. + > * **Mobile-optimized** (wNa8o8): A custom schema engineered explicitly for mobile hardware efficiency. It features targeted 2-bit decoding layers, optimized KV caches, and static activations to maximize VRAM savings. Available for Gemma 4 E2B and E4B. + > * **Compressed Tensors** (w4a16): QAT checkpoints serialized in the compressed-tensors format for native, optimized inference with vLLM. Available for Gemma 4 E2B, E4B, 12B + + ... + license: "apache-2.0" + tags: + - llm + - gguf + icon: https://ai.google.dev/gemma/images/gemma4_banner.png + overrides: + backend: llama-cpp + function: + automatic_tool_parsing_fallback: true + grammar: + disable: true + known_usecases: + - chat + mmproj: llama-cpp/mmproj/gemma-4-12B-it-qat-q4_0-gguf/mmproj-gemma-4-12b-it-qat-q4_0.gguf + options: + - use_jinja:true + parameters: + min_p: 0 + model: llama-cpp/models/gemma-4-12B-it-qat-q4_0-gguf/gemma-4-12b-it-qat-q4_0.gguf + repeat_penalty: 1 + temperature: 1 + top_k: 64 + top_p: 0.95 + template: + use_tokenizer_template: true + files: + - filename: llama-cpp/models/gemma-4-12B-it-qat-q4_0-gguf/gemma-4-12b-it-qat-q4_0.gguf + sha256: faff1a63667fac17ac5e777f47114688fcefea96e220e211aaa8d62c2c4561f1 + uri: https://huggingface.co/google/gemma-4-12B-it-qat-q4_0-gguf/resolve/main/gemma-4-12b-it-qat-q4_0.gguf + - filename: llama-cpp/mmproj/gemma-4-12B-it-qat-q4_0-gguf/mmproj-gemma-4-12b-it-qat-q4_0.gguf + sha256: e70b0e5cd80323d5d588b4ed06780356b7b1ba03995a4b8164c6ae9db0ff5989 + uri: https://huggingface.co/google/gemma-4-12B-it-qat-q4_0-gguf/resolve/main/mmproj-gemma-4-12b-it-qat-q4_0.gguf - name: "step-3.7-flash" url: "github:mudler/LocalAI/gallery/virtual.yaml@master" urls: