From 1a1bd57469f407f62fbb85d3e4b81cb592566a88 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 22 Jun 2026 00:46:56 +0200
Subject: [PATCH] chore(model gallery): :robot: add 1 new models via gallery agent (#10436)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index ed6b57abe..7699acb0b 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,49 @@
 ---
+- name: "qwopus3.6-27b-v2-mtp-nvfp4"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/michaelw9999/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF
+  description: |
+    🪐 Qwopus3.6-27B-v2-MTP
+    MTP Release
+
+    Multi-Token Prediction reasoning model fine-tuned from Qwen3.6-27B
+
+    🧬 Trace Inversion & Negentropy
+    🧠 27B Parameters
+    ⚡ Speculative Decoding
+    🛠️ Coding / DevOps / Math
+
+    💡 What is Qwopus3.6-27B-v2-MTP?
+    🪐 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.
+
+    ⚡ MTP Decoding: Auxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.
+    🧩 Structured Reasoning: Inherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.
+    🧪 GB10 Tested: Validated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.
+    🚀 Practical Speed: Designed for workflows where strong answers matter, but waiting several extra minutes per task does not.
+
+    ...
+  tags:
+    - llm
+    - gguf
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      model: llama-cpp/models/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF.gguf
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF.gguf
+      sha256: 2a0a36fd10374c2a85356121c7c315bda725c7eaca0b3ae14838567629c6924a
+      uri: https://huggingface.co/michaelw9999/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF/resolve/main/Qwopus3.6-27B-v2-MTP-NVFP4-GGUF.gguf
 - name: "qwopus3.6-27b-coder-mtp-nvfp4"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls: