diff --git a/Makefile b/Makefile index 2a695ae28..f88c1e24f 100644 --- a/Makefile +++ b/Makefile @@ -560,18 +560,18 @@ test-extra-backend-vllm: docker-build-vllm ## the `test-extra-backend-tinygrad-all` aggregate. test-extra-backend-tinygrad: docker-build-tinygrad BACKEND_IMAGE=local-ai-backend:tinygrad \ - BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \ + BACKEND_TEST_MODEL_NAME=Qwen/Qwen3-0.6B \ BACKEND_TEST_CAPS=health,load,predict,stream,tools \ BACKEND_TEST_OPTIONS=tool_parser:hermes \ $(MAKE) test-extra-backend ## tinygrad — embeddings via LLM last-hidden-state pooling. Reuses the same -## Qwen2.5-0.5B-Instruct as the chat target so we don't need a separate BERT -## vendor; the Embedding RPC mean-pools and L2-normalizes the last-layer -## hidden state. +## Qwen3-0.6B as the chat target so we don't need a separate BERT vendor; +## the Embedding RPC mean-pools and L2-normalizes the last-layer hidden +## state. test-extra-backend-tinygrad-embeddings: docker-build-tinygrad BACKEND_IMAGE=local-ai-backend:tinygrad \ - BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \ + BACKEND_TEST_MODEL_NAME=Qwen/Qwen3-0.6B \ BACKEND_TEST_CAPS=health,load,embeddings \ $(MAKE) test-extra-backend diff --git a/backend/python/tinygrad/backend.py b/backend/python/tinygrad/backend.py index 7fa997f47..c5867de4f 100644 --- a/backend/python/tinygrad/backend.py +++ b/backend/python/tinygrad/backend.py @@ -2,15 +2,28 @@ """ LocalAI gRPC backend for tinygrad. -Multimodal scope (landing incrementally): - - LLM text generation (Llama 3 / Qwen 2.5 / Mistral via HF safetensors + GGUF) - - Native tool-call extraction via pluggable parsers (hermes, llama3_json, ...) - - Embeddings, Stable Diffusion, Whisper — planned; currently return UNIMPLEMENTED. +LLM execution is delegated to `tinygrad.apps.llm.Transformer` — we keep +only a thin HF → GGUF-name adapter (vendor/appsllm_adapter.py) for the +safetensors path; GGUF models load through `Transformer.from_gguf()` +with native Q4/Q6/Q8 support. -The heavy imports (tinygrad, tokenizers, vendor.llama) are deferred until -`LoadModel`, because tinygrad binds its compute device at import time from -env vars. `_select_tinygrad_device()` maps LocalAI's BUILD_TYPE onto the -corresponding tinygrad env flag before any import happens. +Scope: + - LLM text generation via apps.llm (Qwen3 / Qwen3.5 / Llama 3.x / + GLM-4 / OLMoE / Kimi-K2 / Moonlight — anything apps.llm supports). + - Native tool-call extraction via pluggable parsers (hermes, + llama3_json, qwen3_xml, mistral). + - Embeddings — mean-pooled last-hidden-state over the block stack. + - Stable Diffusion 1.x, Whisper — handled by the vendored paths. + +Sampling is greedy-only because `apps.llm.Transformer.generate` (in the +tinygrad 0.12.0 PyPI release) ends with `.argmax(-1)` and takes no +temperature / top-k / top-p / repetition-penalty arguments. These +request fields are accepted and ignored. + +The heavy imports (tinygrad, tokenizers, tinygrad.apps.llm) are deferred +until `LoadModel`, because tinygrad binds its compute device at import +time from env vars. `_select_tinygrad_device()` maps LocalAI's BUILD_TYPE +onto the corresponding tinygrad env flag before any import happens. """ from __future__ import annotations @@ -62,8 +75,10 @@ def _select_tinygrad_device() -> None: def _resolve_model_assets(model_ref: str) -> Path: """ Accept either a local path or a HuggingFace repo id (e.g. - "Qwen/Qwen2.5-0.5B-Instruct") and return the local directory / file. 
- HF ids are materialized via `huggingface_hub.snapshot_download`. + "unsloth/Qwen3.5-0.8B-GGUF") and return the local directory / file. + HF ids are materialized via `huggingface_hub.snapshot_download` — we + pull both safetensors (for fp16 HF repos) and GGUF (for quantized + repos) so the same code path handles either. """ p = Path(model_ref) if p.exists(): @@ -80,23 +95,27 @@ def _resolve_model_assets(model_ref: str) -> Path: "generation_config.json", "*.safetensors", "*.safetensors.index.json", + "*.gguf", ], ) return Path(local) raise FileNotFoundError(f"Model not found: {model_ref}") -def _load_llm_weights(model_dir: Path): - """Load HF safetensors weights from a directory or a single file.""" - from tinygrad import Device, Tensor, dtypes - from tinygrad.nn.state import safe_load, gguf_load +def _gguf_path(model_ref: Path) -> Optional[Path]: + """Return the GGUF file to load from a path that may be a file or dir.""" + if model_ref.is_file() and str(model_ref).endswith(".gguf"): + return model_ref + if model_ref.is_dir(): + ggufs = sorted(model_ref.glob("*.gguf")) + if ggufs: + return ggufs[0] + return None - if model_dir.is_file(): - if str(model_dir).endswith(".gguf"): - gguf_tensor = Tensor.empty(os.stat(model_dir).st_size, dtype=dtypes.uint8, - device=f"disk:{model_dir}").to(Device.DEFAULT) - return gguf_load(gguf_tensor)[1], "gguf" - return safe_load(str(model_dir)), "safetensors" + +def _load_hf_safetensors(model_dir: Path) -> dict[str, Any]: + """Load sharded or single-file HF safetensors from a directory.""" + from tinygrad.nn.state import safe_load index = model_dir / "model.safetensors.index.json" if index.exists(): @@ -105,28 +124,13 @@ def _load_llm_weights(model_dir: Path): shards: dict[str, Any] = {} for shard_name in set(weight_map.values()): shards[shard_name] = safe_load(str(model_dir / shard_name)) - merged = {k: shards[n][k] for k, n in weight_map.items()} - return merged, "safetensors" + return {k: shards[n][k] for k, n in weight_map.items()} single = model_dir / "model.safetensors" if single.exists(): - return safe_load(str(single)), "safetensors" + return safe_load(str(single)) - ggufs = sorted(model_dir.glob("*.gguf")) - if ggufs: - gguf_tensor = Tensor.empty(os.stat(ggufs[0]).st_size, dtype=dtypes.uint8, - device=f"disk:{ggufs[0]}").to(Device.DEFAULT) - return gguf_load(gguf_tensor)[1], "gguf" - - raise FileNotFoundError(f"No safetensors or gguf weights found under {model_dir}") - - -def _infer_qkv_bias(config: dict) -> bool: - """Qwen2-family uses Q/K/V projection bias; Llama/Mistral do not.""" - architectures = [a.lower() for a in config.get("architectures", [])] - if any("qwen" in a for a in architectures): - return True - return bool(config.get("attention_bias", False)) or bool(config.get("qkv_bias", False)) + raise FileNotFoundError(f"No safetensors weights found under {model_dir}") def _auto_tool_parser(model_ref: Optional[str], config: dict) -> Optional[str]: @@ -252,69 +256,102 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): messages=self._messages_to_dicts(request.Messages), tools=tools, add_generation_prompt=True, + # Qwen3's chat template enables ... reasoning + # by default. On small models (0.6B) that reasoning preamble + # eats the whole token budget before a tool call emerges, so + # we disable it. Templates that don't know this var ignore it. 
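+ # (Note: this is just an extra Jinja2 template variable; assuming the
+ # usual chat-template rendering, Llama 3.x / Mistral templates produce
+ # the same prompt with or without it.)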
+ enable_thinking=False, ) # --------------------- LLM path ------------------------------------- - def _load_llm(self, model_dir: Path) -> None: - config_path = model_dir / "config.json" if model_dir.is_dir() else None - if config_path is None or not config_path.exists(): - raise FileNotFoundError(f"config.json not found under {model_dir}") - with open(config_path) as fp: - config = json.load(fp) - self.llm_config = config + def _load_llm(self, model_path: Path) -> None: + """Load an LLM through `tinygrad.apps.llm.Transformer`. - from tinygrad import nn + Two paths: + - GGUF file (anywhere in the tree) → `Transformer.from_gguf()` + handles config, weight conversion (incl. Q4/Q6/Q8 quantization) + and RoPE permute natively. + - HF safetensors directory → build `TransformerConfig` from + config.json and load weights via a small HF→GGUF-name adapter. + """ + from tinygrad import Device, Tensor, dtypes + from tinygrad.apps.llm import Transformer + from tinygrad.nn.state import load_state_dict - from vendor.llama import Transformer, convert_from_huggingface, convert_from_gguf, fix_bf16 - - n_layers = config["num_hidden_layers"] - n_heads = config["num_attention_heads"] - n_kv_heads = config.get("num_key_value_heads", n_heads) - dim = config["hidden_size"] - hidden_dim = config["intermediate_size"] - vocab_size = config["vocab_size"] - norm_eps = config.get("rms_norm_eps", 1e-5) - rope_theta = config.get("rope_theta", 10000) - self.max_context = min(config.get("max_position_embeddings", 4096), 8192) - qkv_bias = _infer_qkv_bias(config) - - weights, weight_format = _load_llm_weights(model_dir) - - model = Transformer( - dim=dim, - hidden_dim=hidden_dim, - n_heads=n_heads, - n_layers=n_layers, - norm_eps=norm_eps, - vocab_size=vocab_size, - linear=nn.Linear, - embedding=nn.Embedding, - n_kv_heads=n_kv_heads, - rope_theta=rope_theta, - max_context=self.max_context, - jit=True, - qkv_bias=qkv_bias, + from vendor.appsllm_adapter import ( + _hf_to_appsllm_state_dict, + _hf_to_transformer_kwargs, ) - if weight_format == "safetensors": - weights = convert_from_huggingface(weights, n_layers, n_heads, n_kv_heads) + max_context_cap = 8192 + + gguf_file = _gguf_path(model_path) + if gguf_file is not None: + # GGUF path: apps.llm handles everything — config, quant, RoPE. + gguf_tensor = Tensor.empty( + os.stat(gguf_file).st_size, dtype=dtypes.uint8, + device=f"disk:{gguf_file}", + ).to(Device.DEFAULT) + model, kv = Transformer.from_gguf(gguf_tensor, max_context=max_context_cap) + self.llm_model = model + self.max_context = model.max_context + # Preserve a config-shaped dict for tool-parser heuristics and + # the "loaded" message. + arch = kv.get("general.architecture", "") + self.llm_config = { + "architectures": [kv.get("general.name", arch) or arch], + "gguf_kv": kv, + } + + # Tokenizer: prefer sidecar tokenizer.json (richer HF Jinja2 + # templates), fall back to apps.llm's SimpleTokenizer built + # from GGUF metadata. + self._load_tokenizer_for_dir(model_path if model_path.is_dir() else gguf_file.parent, gguf_kv=kv) else: - weights = convert_from_gguf(weights, n_layers) - weights = fix_bf16(weights) + # HF safetensors path. 
+ if not model_path.is_dir(): + raise FileNotFoundError(f"Expected HF model directory, got file: {model_path}") + config_path = model_path / "config.json" + if not config_path.exists(): + raise FileNotFoundError(f"config.json not found under {model_path}") + with open(config_path) as fp: + hf_config = json.load(fp) + self.llm_config = hf_config - from tinygrad.nn.state import load_state_dict - load_state_dict(model, weights, strict=False, consume=True) - self.llm_model = model + raw_weights = _load_hf_safetensors(model_path) + n_layers = hf_config["num_hidden_layers"] + state_dict = _hf_to_appsllm_state_dict(raw_weights, n_layers) - # Tokenizer + kwargs = _hf_to_transformer_kwargs(hf_config, state_dict, max_context_cap) + self.max_context = kwargs["max_context"] + + model = Transformer(**kwargs) + load_state_dict(model, state_dict, strict=False, consume=True) + self.llm_model = model + + self._load_tokenizer_for_dir(model_path, gguf_kv=None) + + # Auto-pick tool parser from options or model family. + parser_name = self.options.get("tool_parser") or _auto_tool_parser(self.model_ref, self.llm_config) + self.tool_parser = resolve_parser(parser_name) + + def _load_tokenizer_for_dir(self, model_dir: Path, gguf_kv: Optional[dict]) -> None: + """Load HF tokenizer + chat template + EOS ids from a model directory. + + Falls back to apps.llm's `SimpleTokenizer.from_gguf_kv` when there + is no `tokenizer.json` sidecar (single-file GGUF, no HF repo). + """ tokenizer_json = model_dir / "tokenizer.json" - if not tokenizer_json.exists(): + if tokenizer_json.exists(): + from tokenizers import Tokenizer as HFTokenizer + self.llm_tokenizer = HFTokenizer.from_file(str(tokenizer_json)) + elif gguf_kv is not None: + from tinygrad.apps.llm import SimpleTokenizer + self.llm_tokenizer = SimpleTokenizer.from_gguf_kv(gguf_kv) + else: raise FileNotFoundError(f"tokenizer.json not found under {model_dir}") - from tokenizers import Tokenizer as HFTokenizer - self.llm_tokenizer = HFTokenizer.from_file(str(tokenizer_json)) - # Chat template + EOS ids (from tokenizer_config.json + generation_config.json) tok_cfg_path = model_dir / "tokenizer_config.json" if tok_cfg_path.exists(): with open(tok_cfg_path) as fp: @@ -322,26 +359,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): self.chat_template = tok_cfg.get("chat_template") self.llm_eos_ids = [] - gen_cfg_path = model_dir / "generation_config.json" - if gen_cfg_path.exists(): - with open(gen_cfg_path) as fp: - gen_cfg = json.load(fp) - eos = gen_cfg.get("eos_token_id") + for cfg_name in ("generation_config.json", "config.json"): + cfg_path = model_dir / cfg_name + if not cfg_path.exists(): + continue + with open(cfg_path) as fp: + cfg = json.load(fp) + eos = cfg.get("eos_token_id") if isinstance(eos, list): self.llm_eos_ids.extend(int(x) for x in eos) elif isinstance(eos, int): self.llm_eos_ids.append(eos) - if not self.llm_eos_ids: - eos = config.get("eos_token_id") - if isinstance(eos, list): - self.llm_eos_ids.extend(int(x) for x in eos) - elif isinstance(eos, int): + if self.llm_eos_ids: + break + if not self.llm_eos_ids and gguf_kv is not None: + eos = gguf_kv.get("tokenizer.ggml.eos_token_id") + if isinstance(eos, int): self.llm_eos_ids.append(eos) - # Auto-pick tool parser from options or model family. 
- parser_name = self.options.get("tool_parser") or _auto_tool_parser(self.model_ref, config) - self.tool_parser = resolve_parser(parser_name) - # --------------------- Stable Diffusion path ------------------------ def _load_sd(self, model_ref: str) -> None: @@ -400,28 +435,37 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # --------------------- LLM generation ------------------------------- - def _generate_tokens(self, prompt: str, max_new_tokens: int, temperature: float, top_p: float, top_k: int): - """Synchronous generator yielding (token_id, token_text) pairs.""" - from tinygrad import Tensor - + def _encode_prompt(self, prompt: str) -> list[int]: + """Normalize tokenizer output: HF `tokenizers.Tokenizer.encode()` + returns an `Encoding` with `.ids`; apps.llm's `SimpleTokenizer.encode()` + returns `list[int]` directly.""" encoded = self.llm_tokenizer.encode(prompt) - ids = encoded.ids + return list(getattr(encoded, "ids", encoded)) + + def _decode_tokens(self, ids: list[int]) -> str: + return self.llm_tokenizer.decode(ids) + + def _generate_tokens(self, prompt: str, max_new_tokens: int, temperature: float): + """Yield (token_id, token_text) pairs using `apps.llm.Transformer.generate()`. + + tinygrad 0.12.0's `generate()` is greedy-only (its `forward` ends + with `.argmax(-1)` and it takes no temperature / top-k / top-p + knobs). We accept `temperature` in the signature for API + compatibility but it is ignored. + """ + del temperature # tinygrad.apps.llm.Transformer.generate is greedy-only + ids = self._encode_prompt(prompt) if not ids: return - # Prefill: run one forward pass over the full prompt, sampling from the last token. - prompt_tensor = Tensor([ids]) - next_tok = int(self.llm_model(prompt_tensor, 0, temperature=temperature, top_k=top_k, top_p=top_p).item()) - pos = len(ids) - - for _ in range(max_new_tokens): + count = 0 + for next_tok in self.llm_model.generate(list(ids)): if next_tok in self.llm_eos_ids: break - text = self.llm_tokenizer.decode([next_tok]) - yield next_tok, text - next_tok = int(self.llm_model(Tensor([[next_tok]]), pos, temperature=temperature, - top_k=top_k, top_p=top_p).item()) - pos += 1 + yield next_tok, self._decode_tokens([next_tok]) + count += 1 + if count >= max_new_tokens: + break # --------------------- gRPC methods --------------------------------- @@ -455,12 +499,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): ) model_path = _resolve_model_assets(self.model_ref) - if model_path.is_file(): - return backend_pb2.Result( - success=False, - message=("tinygrad currently requires an HF model directory with config.json + " - "tokenizer.json; single-file GGUF support lands with gallery metadata wiring."), - ) self._load_llm(model_path) return backend_pb2.Result( @@ -483,13 +521,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): prompt = self._render_prompt(request) max_new = request.Tokens if request.Tokens > 0 else 256 temperature = request.Temperature if request.Temperature > 0 else 0.7 - top_p = request.TopP if request.TopP > 0 else 0.95 - top_k = request.TopK if request.TopK > 0 else 0 t0 = time.monotonic() pieces: list[str] = [] ntok = 0 - for _, text in self._generate_tokens(prompt, max_new, temperature, top_p, top_k): + for _, text in self._generate_tokens(prompt, max_new, temperature): pieces.append(text) ntok += 1 elapsed = time.monotonic() - t0 @@ -534,11 +570,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): prompt = self._render_prompt(request) max_new = request.Tokens if request.Tokens > 0 
else 256 temperature = request.Temperature if request.Temperature > 0 else 0.7 - top_p = request.TopP if request.TopP > 0 else 0.95 - top_k = request.TopK if request.TopK > 0 else 0 buffer = "" - for _, text in self._generate_tokens(prompt, max_new, temperature, top_p, top_k): + for _, text in self._generate_tokens(prompt, max_new, temperature): buffer += text yield backend_pb2.Reply( message=text.encode("utf-8"), @@ -585,8 +619,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.EmbeddingResult() from tinygrad import Tensor, dtypes + from vendor.appsllm_adapter import _embed_hidden - ids = self.llm_tokenizer.encode(text).ids + ids = self._encode_prompt(text) if not ids: return backend_pb2.EmbeddingResult(embeddings=[]) @@ -594,7 +629,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): ids = ids[: self.max_context] tokens = Tensor([ids]) - hidden = self.llm_model.embed(tokens) # (1, seqlen, dim) + hidden = _embed_hidden(self.llm_model, tokens) # (1, seqlen, dim) # Mean pool over sequence dim pooled = hidden.mean(axis=1).squeeze(0) # (dim,) # L2 normalize diff --git a/backend/python/tinygrad/test.py b/backend/python/tinygrad/test.py index bf47d9bda..33f7ebfc7 100644 --- a/backend/python/tinygrad/test.py +++ b/backend/python/tinygrad/test.py @@ -22,6 +22,7 @@ import backend_pb2_grpc sys.path.insert(0, os.path.dirname(__file__)) from tool_parsers.hermes import HermesToolParser # noqa: E402 +from vendor.appsllm_adapter import _hf_to_appsllm_state_dict # noqa: E402 class TestHealth(unittest.TestCase): @@ -80,5 +81,73 @@ class TestHermesParser(unittest.TestCase): self.assertEqual(calls, []) +class TestAppsLLMAdapter(unittest.TestCase): + """Smoke tests for the HF → tinygrad.apps.llm state-dict keymap.""" + + def _fake_hf_weights(self, n_layers: int = 2, include_lm_head: bool = True): + keys = [ + "model.embed_tokens.weight", + "model.norm.weight", + ] + if include_lm_head: + keys.append("lm_head.weight") + for l in range(n_layers): + keys += [ + f"model.layers.{l}.input_layernorm.weight", + f"model.layers.{l}.post_attention_layernorm.weight", + f"model.layers.{l}.self_attn.q_proj.weight", + f"model.layers.{l}.self_attn.k_proj.weight", + f"model.layers.{l}.self_attn.v_proj.weight", + f"model.layers.{l}.self_attn.o_proj.weight", + f"model.layers.{l}.self_attn.q_norm.weight", + f"model.layers.{l}.self_attn.k_norm.weight", + f"model.layers.{l}.mlp.gate_proj.weight", + f"model.layers.{l}.mlp.up_proj.weight", + f"model.layers.{l}.mlp.down_proj.weight", + ] + # sentinel objects so we can verify identity-based aliasing + return {k: object() for k in keys} + + def test_keymap_renames_every_hf_key(self): + hf = self._fake_hf_weights(n_layers=2) + sd = _hf_to_appsllm_state_dict(hf, 2) + expected = { + "token_embd.weight", "output_norm.weight", "output.weight", + "blk.0.attn_norm.weight", "blk.0.ffn_norm.weight", + "blk.0.attn_q.weight", "blk.0.attn_k.weight", "blk.0.attn_v.weight", + "blk.0.attn_output.weight", + "blk.0.attn_q_norm.weight", "blk.0.attn_k_norm.weight", + "blk.0.ffn_gate.weight", "blk.0.ffn_up.weight", "blk.0.ffn_down.weight", + "blk.1.attn_norm.weight", "blk.1.ffn_norm.weight", + "blk.1.attn_q.weight", "blk.1.attn_k.weight", "blk.1.attn_v.weight", + "blk.1.attn_output.weight", + "blk.1.attn_q_norm.weight", "blk.1.attn_k_norm.weight", + "blk.1.ffn_gate.weight", "blk.1.ffn_up.weight", "blk.1.ffn_down.weight", + } + self.assertEqual(set(sd.keys()), expected) + + def test_tied_embedding_fallback_when_lm_head_missing(self): + hf = 
self._fake_hf_weights(n_layers=1, include_lm_head=False) + sd = _hf_to_appsllm_state_dict(hf, 1) + self.assertIn("output.weight", sd) + self.assertIs(sd["output.weight"], sd["token_embd.weight"]) + + def test_unknown_keys_are_skipped(self): + hf = self._fake_hf_weights(n_layers=1) + hf["model.layers.0.self_attn.rotary_emb.inv_freq"] = object() + hf["model.some_unknown.weight"] = object() + sd = _hf_to_appsllm_state_dict(hf, 1) + self.assertNotIn("model.some_unknown.weight", sd) + # Renamed keys still present + self.assertIn("blk.0.attn_q.weight", sd) + + def test_qkv_bias_models_rejected(self): + hf = self._fake_hf_weights(n_layers=1) + hf["model.layers.0.self_attn.q_proj.bias"] = object() + with self.assertRaises(ValueError) as ctx: + _hf_to_appsllm_state_dict(hf, 1) + self.assertIn("Qwen3", str(ctx.exception)) + + if __name__ == "__main__": unittest.main() diff --git a/backend/python/tinygrad/vendor/appsllm_adapter.py b/backend/python/tinygrad/vendor/appsllm_adapter.py new file mode 100644 index 000000000..da3464f2a --- /dev/null +++ b/backend/python/tinygrad/vendor/appsllm_adapter.py @@ -0,0 +1,102 @@ +"""Glue code between LocalAI's HF-shaped model assets and tinygrad.apps.llm. + +apps.llm's `Transformer` uses GGUF-native weight names and consumes a +`TransformerConfig` dataclass. LocalAI resolves models from HuggingFace +snapshots (HF safetensors + config.json) so we translate both sides here. + +This module does NOT subclass anything from apps.llm. With the Qwen3+ +scope the backend targets, we can use `apps.llm.Transformer` unchanged +(no qkv_bias, no RoPE permute). Everything below is a thin adapter. +""" +from __future__ import annotations + +from typing import Any + + +def _hf_to_appsllm_state_dict(hf_weights: dict[str, Any], n_layers: int) -> dict[str, Any]: + """Rename a HuggingFace-style state dict to the GGUF-native keys that + `tinygrad.apps.llm.Transformer` expects. + + HF and apps.llm both store RoPE weights in half-split layout, so no + permute is required — only a direct key rename and a tied-embedding + fallback for models like Llama 3.2 that drop `lm_head.weight`. + """ + keymap: dict[str, str] = { + "model.embed_tokens.weight": "token_embd.weight", + "model.norm.weight": "output_norm.weight", + "lm_head.weight": "output.weight", + } + for layer in range(n_layers): + keymap[f"model.layers.{layer}.input_layernorm.weight"] = f"blk.{layer}.attn_norm.weight" + keymap[f"model.layers.{layer}.post_attention_layernorm.weight"] = f"blk.{layer}.ffn_norm.weight" + for hf_proj, gguf_proj in (("q", "q"), ("k", "k"), ("v", "v"), ("o", "output")): + keymap[f"model.layers.{layer}.self_attn.{hf_proj}_proj.weight"] = f"blk.{layer}.attn_{gguf_proj}.weight" + keymap[f"model.layers.{layer}.self_attn.q_norm.weight"] = f"blk.{layer}.attn_q_norm.weight" + keymap[f"model.layers.{layer}.self_attn.k_norm.weight"] = f"blk.{layer}.attn_k_norm.weight" + for hf_name, gguf_name in (("gate", "gate"), ("up", "up"), ("down", "down")): + keymap[f"model.layers.{layer}.mlp.{hf_name}_proj.weight"] = f"blk.{layer}.ffn_{gguf_name}.weight" + + # Fail loudly if the model carries Q/K/V projection bias (Qwen2 / 2.5). + # apps.llm's `TransformerBlock` hardcodes `bias=False`, so these weights + # would be silently dropped by `load_state_dict(strict=False)` and the + # model would produce garbage. Supported families (Qwen3, Qwen3.5, + # Llama 3.x, GLM-4, Mistral) have no qkv bias. 
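+ # (e.g. a Qwen2.5 checkpoint ships "model.layers.0.self_attn.q_proj.bias",
+ # which has no destination in the keymap above.)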
+ bias_keys = [k for k in hf_weights + if k.startswith("model.layers.") and + any(k.endswith(f".self_attn.{p}_proj.bias") for p in ("q", "k", "v"))] + if bias_keys: + raise ValueError( + "tinygrad backend: model has Q/K/V projection bias (" + f"{bias_keys[0]} etc). Supported families are Qwen3, Qwen3.5, " + "Llama 3.x, GLM-4, Mistral. For Qwen2 / 2.5 please use a " + "newer model or the vLLM / llama.cpp backends." + ) + + sd = {dst: hf_weights[src] for src, dst in keymap.items() if src in hf_weights} + if "output.weight" not in sd and "token_embd.weight" in sd: + sd["output.weight"] = sd["token_embd.weight"] + return sd + + +def _hf_to_transformer_kwargs(hf_config: dict, state_dict: dict[str, Any], max_context: int) -> dict: + """Build the kwargs dict for `tinygrad.apps.llm.Transformer(**kwargs)`. + + Supports dense Qwen3 / Qwen3.5 / Llama 3.x / GLM-4 / Mistral-shaped + models. The tinygrad 0.12.0 `Transformer` takes keyword-only args (no + `TransformerConfig` dataclass) — so we return a plain dict. + """ + n_heads = hf_config["num_attention_heads"] + head_dim = hf_config.get("head_dim") or (hf_config["hidden_size"] // n_heads) + + # Detect qk_norm presence from the GGUF-shaped state dict (matches + # apps.llm's own heuristic in `from_gguf`). + qk_norm = 0 + qn = state_dict.get("blk.0.attn_q_norm.weight") + if qn is not None: + qk_norm = int(qn.shape[0]) + + max_pos = hf_config.get("max_position_embeddings", 4096) + + return dict( + num_blocks=hf_config["num_hidden_layers"], + dim=hf_config["hidden_size"], + hidden_dim=hf_config["intermediate_size"], + n_heads=n_heads, + n_kv_heads=hf_config.get("num_key_value_heads", n_heads), + norm_eps=hf_config.get("rms_norm_eps", 1e-5), + vocab_size=hf_config["vocab_size"], + head_dim=head_dim, + rope_theta=float(hf_config.get("rope_theta", 10000.0)), + max_context=min(max_pos, max_context), + qk_norm=qk_norm, + ) + + +def _embed_hidden(model, tokens): + """Return mean-poolable hidden states by running the block stack + without going through the LM head + Gumbel-max sampler baked into + `Transformer.forward`.""" + x = model.token_embd(tokens).float() + for blk in model.blk: + x = blk(x, 0) + return model.output_norm(x) diff --git a/backend/python/tinygrad/vendor/llama.py b/backend/python/tinygrad/vendor/llama.py deleted file mode 100644 index 445f68b9c..000000000 --- a/backend/python/tinygrad/vendor/llama.py +++ /dev/null @@ -1,294 +0,0 @@ -# Vendored from tinygrad extra/models/llama.py (MIT license). -# Upstream: https://github.com/tinygrad/tinygrad/blob/master/extra/models/llama.py -# -# Local modification: Attention / TransformerBlock / Transformer accept an -# optional `qkv_bias` flag so the same module can host Qwen2-style models that -# use bias on the Q/K/V projections (Llama 3 has no bias). Changes are marked -# with `# LOCALAI`. 
-# -# Copyright (c) 2023- the tinygrad authors -# SPDX-License-Identifier: MIT -from typing import Union, Optional, Any -import collections, math -from tinygrad import Tensor, Variable, TinyJit, dtypes, nn, Device -from tinygrad.helpers import getenv, DEBUG - - -def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> Tensor: - freqs = 1.0 / (theta ** (Tensor.arange(0, dim, 2)[:(dim // 2)] / dim)) - freqs = Tensor.arange(end).unsqueeze(dim=1) * freqs.unsqueeze(dim=0) - return Tensor.stack(freqs.cos(), freqs.sin(), dim=-1).reshape(1, end, 1, dim//2, 2) - - -def complex_mult(A, c, d): - a, b = A[..., 0:1], A[..., 1:2] - ro = a * c - b * d - co = a * d + b * c - return ro.cat(co, dim=-1) - - -def apply_rotary_emb(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]: - assert freqs_cis.shape[1] == xq.shape[1] == xk.shape[1], f"freqs_cis shape mismatch {freqs_cis.shape} xq:{xq.shape} xk:{xk.shape}" - xq = xq.reshape(*xq.shape[0:-1], -1, 2) - xk = xk.reshape(*xk.shape[0:-1], -1, 2) - assert len(xq.shape) == len(xk.shape) == len(freqs_cis.shape) == 5 - c, d = freqs_cis[..., 0:1], freqs_cis[..., 1:2] - xq_out = complex_mult(xq, c, d) - xk_out = complex_mult(xk, c, d) - return xq_out.flatten(3), xk_out.flatten(3) - - -def repeat_kv(x: Tensor, n_rep: int) -> Tensor: - bs, seqlen, n_kv_heads, head_dim = x.shape - if n_rep == 1: - return x - return x.repeat((1, 1, 1, n_rep)).reshape(bs, seqlen, n_kv_heads * n_rep, head_dim) - - -class Attention: - # LOCALAI: added qkv_bias - def __init__(self, dim, n_heads, n_kv_heads=None, max_context=0, linear=nn.Linear, qk_norm: float | None = None, qkv_bias: bool = False): - self.n_heads = n_heads - self.n_kv_heads = n_kv_heads if n_kv_heads is not None else n_heads - self.head_dim = dim // n_heads - self.n_rep = self.n_heads // self.n_kv_heads - self.max_context = max_context - - self.wq = linear(dim, self.n_heads * self.head_dim, bias=qkv_bias) - self.wk = linear(dim, self.n_kv_heads * self.head_dim, bias=qkv_bias) - self.wv = linear(dim, self.n_kv_heads * self.head_dim, bias=qkv_bias) - self.wo = linear(self.n_heads * self.head_dim, dim, bias=False) - - self.q_norm = nn.RMSNorm(dim, qk_norm) if qk_norm is not None else None - self.k_norm = nn.RMSNorm(dim, qk_norm) if qk_norm is not None else None - - def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor] = None) -> Tensor: - xq, xk, xv = self.wq(x), self.wk(x.contiguous_backward()), self.wv(x) - - if self.q_norm is not None and self.k_norm is not None: - xq = self.q_norm(xq) - xk = self.k_norm(xk) - - if x.dtype == dtypes.bfloat16: - xq, xk = xq.contiguous_backward(), xk.contiguous_backward() - - xq = xq.reshape(xq.shape[0], xq.shape[1], self.n_heads, self.head_dim) - xk = xk.reshape(xk.shape[0], xk.shape[1], self.n_kv_heads, self.head_dim) - xv = xv.reshape(xv.shape[0], xv.shape[1], self.n_kv_heads, self.head_dim) - - xq, xk = apply_rotary_emb(xq, xk, freqs_cis) - bsz, seqlen, _, _ = xq.shape - - if self.max_context: - if not hasattr(self, "cache_kv"): - self.cache_kv = Tensor.zeros(2, bsz, self.max_context, self.n_kv_heads, self.head_dim, dtype=x.dtype).contiguous().realize() - if isinstance(x.device, tuple): - self.cache_kv.shard_((x.device), axis=3 if getenv("SHARD_KVCACHE") else None).realize() - - assert xk.dtype == xv.dtype == self.cache_kv.dtype, f"{xk.dtype=}, {xv.dtype=}, {self.cache_kv.dtype=}" - self.cache_kv[:, :, start_pos:start_pos + seqlen, :, :].assign(Tensor.stack(xk, xv)).realize() - - keys = self.cache_kv[0, :, 
0:start_pos + seqlen, :, :] - values = self.cache_kv[1, :, 0:start_pos + seqlen, :, :] - else: - assert start_pos == 0 - keys, values = xk, xv - - if self.max_context: - keys, values = repeat_kv(keys, self.n_rep), repeat_kv(values, self.n_rep) - xq, keys, values = xq.transpose(1, 2), keys.transpose(1, 2), values.transpose(1, 2) - attn = xq.scaled_dot_product_attention(keys, values, mask).transpose(1, 2) - else: - xq, keys, values = xq.transpose(1, 2), keys.transpose(1, 2), values.transpose(1, 2) - attn = xq.scaled_dot_product_attention(keys, values, is_causal=True, enable_gqa=True).transpose(1, 2) - - attn = attn.reshape(bsz, seqlen, -1) - return self.wo(attn) - - -class FeedForward: - def __init__(self, dim: int, hidden_dim: int, linear=nn.Linear): - self.w1 = linear(dim, hidden_dim, bias=False) - self.w2 = linear(hidden_dim, dim, bias=False) - self.w3 = linear(dim, hidden_dim, bias=False) - - def __call__(self, x: Tensor) -> Tensor: - w1 = self.w1(x).silu() - w3 = self.w3(x.contiguous_backward()) - return self.w2(w1 * w3) - - -class TransformerBlock: - # LOCALAI: added qkv_bias - def __init__(self, dim: int, hidden_dim: int, n_heads: int, n_kv_heads: int, norm_eps: float, max_context: int, - linear=nn.Linear, feed_forward=FeedForward, qk_norm=None, qkv_bias: bool = False): - self.attention = Attention(dim, n_heads, n_kv_heads, max_context, linear, qk_norm, qkv_bias=qkv_bias) - self.feed_forward = feed_forward(dim, hidden_dim, linear) - self.attention_norm = nn.RMSNorm(dim, norm_eps) - self.ffn_norm = nn.RMSNorm(dim, norm_eps) - - def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor]): - h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask) - return (h + self.feed_forward(self.ffn_norm(h))).contiguous().contiguous_backward() - - -def sample(logits: Tensor, temp: float, k: int, p: float, af: float, ap: float): - assert logits.ndim == 1, "only works on 1d tensors" - assert 0 <= p <= 1, "p must be between 0 and 1" - assert 0 <= k <= logits.numel(), "k must be between 0 and numel" - - if temp < 1e-6: - return logits.argmax() - - logits = logits.to(Device.DEFAULT) - - if af or ap: - if not hasattr(sample, "alpha_counter"): - setattr(sample, "alpha_counter", Tensor.zeros_like(logits, dtype=dtypes.int32).contiguous()) - logits = logits - (sample.alpha_counter * af + (sample.alpha_counter > 0) * ap) - - logits = (logits != logits).where(-float("inf"), logits) - - t = (logits / temp).softmax() - - counter = Tensor.arange(t.numel(), device=logits.device).contiguous() - counter2 = Tensor.arange(t.numel() - 1, -1, -1, device=logits.device).contiguous() - - if k: - output = Tensor.zeros(k, device=logits.device).contiguous() - output_indices = Tensor.zeros(k, device=logits.device, dtype=dtypes.int32).contiguous() - for i in range(k): - t_argmax = (t.numel() - ((t == (t_max := t.max())) * counter2).max() - 1).cast(dtypes.default_int) - output = output + t_max.unsqueeze(0).pad(((i, k - i - 1),)) - output_indices = output_indices + t_argmax.unsqueeze(0).pad(((i, k - i - 1),)) - t = (counter == t_argmax).where(0, t) - - output_cumsum = output[::-1].cumsum()[::-1] + t.sum() - output = (output_cumsum >= (1 - p)) * output - output_indices = (output_cumsum >= (1 - p)) * output_indices - - output_idx = output.multinomial() - output_token = output_indices[output_idx] - else: - output_token = t.multinomial() - - if af or ap: - sample.alpha_counter = (counter == output_token).where(sample.alpha_counter + 1, sample.alpha_counter) - - return 
output_token - - -class Transformer: - # LOCALAI: added qkv_bias - def __init__(self, dim: int, hidden_dim: int, n_heads: int, n_layers: int, norm_eps: float, vocab_size, - linear=nn.Linear, embedding=nn.Embedding, n_kv_heads=None, rope_theta=10000, - max_context=1024, jit=True, feed_forward=FeedForward, qk_norm=None, disable_kv_cache=False, - qkv_bias: bool = False): - self.layers = [ - TransformerBlock(dim, hidden_dim, n_heads, n_kv_heads, norm_eps, - 0 if disable_kv_cache else max_context, linear, - feed_forward=feed_forward, qk_norm=qk_norm, qkv_bias=qkv_bias) - for _ in range(n_layers) - ] - self.norm = nn.RMSNorm(dim, norm_eps) - self.tok_embeddings = embedding(vocab_size, dim) - self.output = nn.Linear(dim, vocab_size, bias=False) if embedding == nn.Embedding else linear(dim, vocab_size, bias=False) - self.max_context = max_context - self.freqs_cis = precompute_freqs_cis(dim // n_heads, self.max_context * 2, rope_theta).contiguous().requires_grad_(False) - self.forward_jit = TinyJit(self.forward) if jit else None - - def forward(self, tokens: Tensor, start_pos: Union[Variable, int], temperature: float, top_k: int, top_p: float, alpha_f: float, alpha_p: float): - _bsz, seqlen = tokens.shape - h = self.tok_embeddings(tokens).contiguous() - freqs_cis = self.freqs_cis.cast(h.dtype)[:, start_pos:start_pos + seqlen, :, :, :] - - if self.max_context != 0 and seqlen > 1: - mask = Tensor.full((1, 1, seqlen, start_pos + seqlen), float("-inf"), dtype=h.dtype, device=h.device).triu(start_pos + 1) - else: - mask = None - for layer in self.layers: - h = layer(h, start_pos, freqs_cis, mask) - logits = self.output(self.norm(h).contiguous().contiguous_backward()).contiguous_backward() - if math.isnan(temperature): - return logits - - return sample(logits[:, -1, :].flatten(), temperature, top_k, top_p, alpha_f, alpha_p) - - def __call__(self, tokens: Tensor, start_pos: int, temperature: float = 0.0, top_k: int = 0, top_p: float = 0.8, alpha_f: float = 0.0, alpha_p: float = 0.0): - if tokens.shape[0:2] == (1, 1) and self.forward_jit is not None and start_pos != 0: - return self.forward_jit(tokens, Variable("start_pos", 1, self.max_context - 1).bind(start_pos), temperature, top_k, top_p, alpha_f, alpha_p) - return self.forward(tokens, start_pos, temperature, top_k, top_p, alpha_f, alpha_p) - - # LOCALAI: extract last hidden state for embeddings. Skips the LM head and - # the causal-mask branch is left intact so the pooling sees the full sequence. 
- def embed(self, tokens: Tensor) -> Tensor: - _bsz, seqlen = tokens.shape - h = self.tok_embeddings(tokens).contiguous() - freqs_cis = self.freqs_cis.cast(h.dtype)[:, 0:seqlen, :, :, :] - mask = Tensor.full((1, 1, seqlen, seqlen), float("-inf"), dtype=h.dtype, device=h.device).triu(1) if seqlen > 1 else None - for layer in self.layers: - h = layer(h, 0, freqs_cis, mask) - return self.norm(h) - - -def convert_from_huggingface(weights: dict[str, Tensor], n_layers: int, n_heads: int, n_kv_heads: int, permute_layers: bool = True): - def permute(v: Tensor, n_heads: int): - return v.reshape(n_heads, 2, v.shape[0] // n_heads // 2, v.shape[1] if len(v.shape) > 1 else 1).transpose(1, 2).reshape(*v.shape[:2]) - - keymap = { - "model.embed_tokens.weight": "tok_embeddings.weight", - **{f"model.layers.{l}.input_layernorm.weight": f"layers.{l}.attention_norm.weight" for l in range(n_layers)}, - **{f"model.layers.{l}.self_attn.{x}_norm.weight": f"layers.{l}.attention.{x}_norm.weight" for x in ["q", "k"] for l in range(n_layers)}, - **{f"model.layers.{l}.self_attn.{x}_proj.weight": f"layers.{l}.attention.w{x}.weight" for x in ["q", "k", "v", "o"] for l in range(n_layers)}, - **{f"model.layers.{l}.self_attn.{x}_proj.bias": f"layers.{l}.attention.w{x}.bias" for x in ["q", "k", "v", "o"] for l in range(n_layers)}, - **{f"model.layers.{l}.post_attention_layernorm.weight": f"layers.{l}.ffn_norm.weight" for l in range(n_layers)}, - **{f"model.layers.{l}.mlp.{x}_proj.weight": f"layers.{l}.feed_forward.w{y}.weight" for x, y in {"gate": "1", "down": "2", "up": "3"}.items() for l in range(n_layers)}, - **{f"model.layers.{l}.mlp.gate.weight": f"layers.{l}.feed_forward.gate.weight" for l in range(n_layers)}, - "model.norm.weight": "norm.weight", - "lm_head.weight": "output.weight", - } - sd = {} - experts = collections.defaultdict(dict) - for k, v in weights.items(): - if ".rotary_emb." in k: - continue - v = v.to(Device.DEFAULT) - if "model.layers" in k: - if ("q_proj" in k or "q_norm" in k) and permute_layers: - v = permute(v, n_heads) - elif ("k_proj" in k or "k_norm" in k) and permute_layers: - v = permute(v, n_kv_heads) - if '.mlp.experts.' 
in k: - _, _, layer, _, _, expert, name, _ = k.split('.') - experts[f'layers.{layer}.feed_forward.{name}'][int(expert)] = v - continue - sd[keymap[k]] = v - for k, v in experts.items(): - sd[k] = Tensor.stack(*[v[i] for i in range(len(v))]) - - if "output.weight" not in sd and "tok_embeddings.weight" in sd: - sd["output.weight"] = sd["tok_embeddings.weight"] - - return sd - - -def convert_from_gguf(weights: dict[str, Tensor], n_layers: int): - keymap = { - "token_embd.weight": "tok_embeddings.weight", - **{f"blk.{l}.attn_norm.weight": f"layers.{l}.attention_norm.weight" for l in range(n_layers)}, - **{f"blk.{l}.attn_{x}.weight": f"layers.{l}.attention.w{x}.weight" for x in ["q", "k", "v"] for l in range(n_layers)}, - **{f"blk.{l}.attn_{x}.bias": f"layers.{l}.attention.w{x}.bias" for x in ["q", "k", "v"] for l in range(n_layers)}, - **{f"blk.{l}.attn_output.weight": f"layers.{l}.attention.wo.weight" for l in range(n_layers)}, - **{f"blk.{l}.ffn_norm.weight": f"layers.{l}.ffn_norm.weight" for l in range(n_layers)}, - **{f"blk.{l}.ffn_{x}.weight": f"layers.{l}.feed_forward.w{y}.weight" for x, y in {"gate": "1", "down": "2", "up": "3"}.items() for l in range(n_layers)}, - "output_norm.weight": "norm.weight", - "rope_freqs.weight": "rope_freqs.weight", - } - sd = {keymap[k]: v for k, v in weights.items() if k in keymap} - if "output.weight" not in sd and "token_embd.weight" in weights: - sd["output.weight"] = weights["token_embd.weight"] - return sd - - -def fix_bf16(weights: dict[Any, Tensor]): - return {k: v.cast(dtypes.float32).cast(dtypes.float16) if v.dtype == dtypes.bfloat16 else v for k, v in weights.items()}