mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-24 16:51:44 -04:00
refactor(tinygrad): reuse tinygrad.apps.llm instead of vendored Transformer (#9380)
Drop the 295-line vendor/llama.py fork in favor of `tinygrad.apps.llm`, which now provides the Transformer blocks, GGUF loader (incl. Q4/Q6/Q8 quantization), KV-cache and generate loop we were maintaining ourselves.

What changed:
- New vendor/appsllm_adapter.py (~90 LOC) — HF -> GGUF-native state-dict keymap, Transformer kwargs builder, `_embed_hidden` helper, and a hard rejection of qkv_bias models (Qwen2 / 2.5 are no longer supported; the apps.llm Transformer sets `bias=False` on the Q/K/V projections).
- backend.py routes both safetensors and GGUF paths through apps.llm.Transformer. Generation now delegates to its (greedy-only) `generate()`; Temperature / TopK / TopP / RepetitionPenalty are still accepted on the wire but ignored — documented in the module docstring.
- Jinja chat render now passes `enable_thinking=False` so Qwen3's reasoning preamble doesn't eat the tool-call token budget on small models.
- Embedding path uses `_embed_hidden` (block stack + output_norm) rather than the custom `embed()` method we were carrying on the vendored Transformer.
- test.py gains TestAppsLLMAdapter covering the keymap rename, tied embedding fallback, unknown-key skipping, and qkv_bias rejection.
- Makefile fixtures move from Qwen/Qwen2.5-0.5B-Instruct to Qwen/Qwen3-0.6B (apps.llm-compatible) and tool_parser from qwen3_xml to hermes (the HF chat template emits hermes-style JSON tool calls).

Verified with the docker-backed targets:
- test-extra-backend-tinygrad 5/5 PASS
- test-extra-backend-tinygrad-embeddings 3/3 PASS
- test-extra-backend-tinygrad-whisper 4/4 PASS
- test-extra-backend-tinygrad-sd 3/3 PASS
This commit is contained in:
committed by
GitHub
parent
b4e30692a2
commit
a0cbc46be9
@@ -22,6 +22,7 @@ import backend_pb2_grpc
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from tool_parsers.hermes import HermesToolParser # noqa: E402
|
||||
from vendor.appsllm_adapter import _hf_to_appsllm_state_dict # noqa: E402
|
||||
|
||||
|
||||
class TestHealth(unittest.TestCase):
|
||||
@@ -80,5 +81,73 @@ class TestHermesParser(unittest.TestCase):
|
||||
self.assertEqual(calls, [])
|
||||
|
||||
|
||||
class TestAppsLLMAdapter(unittest.TestCase):
    """Smoke tests for the HF → tinygrad.apps.llm state-dict keymap."""

    def _fake_hf_weights(self, n_layers: int = 2, include_lm_head: bool = True) -> dict:
        """Build a fake HF-style weight dict keyed by HF parameter names.

        Args:
            n_layers: Number of ``model.layers.{i}`` blocks to generate keys for.
            include_lm_head: When False, ``lm_head.weight`` is omitted so the
                tied-embedding fallback can be exercised.

        Returns:
            A dict mapping each generated key to a distinct ``object()``
            sentinel.
        """
        keys = [
            "model.embed_tokens.weight",
            "model.norm.weight",
        ]
        if include_lm_head:
            keys.append("lm_head.weight")
        for layer in range(n_layers):
            keys += [
                f"model.layers.{layer}.input_layernorm.weight",
                f"model.layers.{layer}.post_attention_layernorm.weight",
                f"model.layers.{layer}.self_attn.q_proj.weight",
                f"model.layers.{layer}.self_attn.k_proj.weight",
                f"model.layers.{layer}.self_attn.v_proj.weight",
                f"model.layers.{layer}.self_attn.o_proj.weight",
                f"model.layers.{layer}.self_attn.q_norm.weight",
                f"model.layers.{layer}.self_attn.k_norm.weight",
                f"model.layers.{layer}.mlp.gate_proj.weight",
                f"model.layers.{layer}.mlp.up_proj.weight",
                f"model.layers.{layer}.mlp.down_proj.weight",
            ]
        # sentinel objects so we can verify identity-based aliasing
        return {k: object() for k in keys}

    def test_keymap_renames_every_hf_key(self):
        """Every generated HF key must come back under its renamed key, and nothing else."""
        hf = self._fake_hf_weights(n_layers=2)
        sd = _hf_to_appsllm_state_dict(hf, 2)
        expected = {
            "token_embd.weight", "output_norm.weight", "output.weight",
            "blk.0.attn_norm.weight", "blk.0.ffn_norm.weight",
            "blk.0.attn_q.weight", "blk.0.attn_k.weight", "blk.0.attn_v.weight",
            "blk.0.attn_output.weight",
            "blk.0.attn_q_norm.weight", "blk.0.attn_k_norm.weight",
            "blk.0.ffn_gate.weight", "blk.0.ffn_up.weight", "blk.0.ffn_down.weight",
            "blk.1.attn_norm.weight", "blk.1.ffn_norm.weight",
            "blk.1.attn_q.weight", "blk.1.attn_k.weight", "blk.1.attn_v.weight",
            "blk.1.attn_output.weight",
            "blk.1.attn_q_norm.weight", "blk.1.attn_k_norm.weight",
            "blk.1.ffn_gate.weight", "blk.1.ffn_up.weight", "blk.1.ffn_down.weight",
        }
        self.assertEqual(set(sd.keys()), expected)

    def test_tied_embedding_fallback_when_lm_head_missing(self):
        """Without ``lm_head.weight``, ``output.weight`` must alias the embedding entry."""
        hf = self._fake_hf_weights(n_layers=1, include_lm_head=False)
        sd = _hf_to_appsllm_state_dict(hf, 1)
        self.assertIn("output.weight", sd)
        # Identity, not equality: the fallback must reuse the very same object.
        self.assertIs(sd["output.weight"], sd["token_embd.weight"])

    def test_unknown_keys_are_skipped(self):
        """Keys absent from the keymap must not leak into the converted dict."""
        unknown_keys = (
            "model.layers.0.self_attn.rotary_emb.inv_freq",
            "model.some_unknown.weight",
        )
        hf = self._fake_hf_weights(n_layers=1)
        for key in unknown_keys:
            hf[key] = object()
        sd = _hf_to_appsllm_state_dict(hf, 1)
        # Check every injected key, not just one of them.
        for key in unknown_keys:
            with self.subTest(key=key):
                self.assertNotIn(key, sd)
        # Renamed keys still present
        self.assertIn("blk.0.attn_q.weight", sd)

    def test_qkv_bias_models_rejected(self):
        """A Q-projection bias key must raise ValueError with "Qwen3" in the message."""
        hf = self._fake_hf_weights(n_layers=1)
        hf["model.layers.0.self_attn.q_proj.bias"] = object()
        with self.assertRaises(ValueError) as ctx:
            _hf_to_appsllm_state_dict(hf, 1)
        self.assertIn("Qwen3", str(ctx.exception))
|
||||
|
||||
|
||||
# Run the test suite when this file is executed directly as a script;
# importing the module does not trigger the runner.
if __name__ == "__main__":
    unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user