mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-19 06:20:42 -04:00
Drop the 295-line vendor/llama.py fork in favor of `tinygrad.apps.llm`, which now provides the Transformer blocks, GGUF loader (incl. Q4/Q6/Q8 quantization), KV-cache and generate loop we were maintaining ourselves. What changed: - New vendor/appsllm_adapter.py (~90 LOC) — HF -> GGUF-native state-dict keymap, Transformer kwargs builder, `_embed_hidden` helper, and a hard rejection of qkv_bias models (Qwen2 / 2.5 are no longer supported; the apps.llm Transformer ties `bias=False` on Q/K/V projections). - backend.py routes both safetensors and GGUF paths through apps.llm.Transformer. Generation now delegates to its (greedy-only) `generate()`; Temperature / TopK / TopP / RepetitionPenalty are still accepted on the wire but ignored — documented in the module docstring. - Jinja chat render now passes `enable_thinking=False` so Qwen3's reasoning preamble doesn't eat the tool-call token budget on small models. - Embedding path uses `_embed_hidden` (block stack + output_norm) rather than the custom `embed()` method we were carrying on the vendored Transformer. - test.py gains TestAppsLLMAdapter covering the keymap rename, tied embedding fallback, unknown-key skipping, and qkv_bias rejection. - Makefile fixtures move from Qwen/Qwen2.5-0.5B-Instruct to Qwen/Qwen3-0.6B (apps.llm-compatible) and tool_parser from qwen3_xml to hermes (the HF chat template emits hermes-style JSON tool calls). Verified with the docker-backed targets: test-extra-backend-tinygrad 5/5 PASS test-extra-backend-tinygrad-embeddings 3/3 PASS test-extra-backend-tinygrad-whisper 4/4 PASS test-extra-backend-tinygrad-sd 3/3 PASS
103 lines
4.7 KiB
Python
103 lines
4.7 KiB
Python
"""Glue code between LocalAI's HF-shaped model assets and tinygrad.apps.llm.
|
|
|
|
apps.llm's `Transformer` uses GGUF-native weight names and consumes a
|
|
`TransformerConfig` dataclass. LocalAI resolves models from HuggingFace
|
|
snapshots (HF safetensors + config.json) so we translate both sides here.
|
|
|
|
This module does NOT subclass anything from apps.llm. With the Qwen3+
|
|
scope the backend targets, we can use `apps.llm.Transformer` unchanged
|
|
(no qkv_bias, no RoPE permute). Everything below is a thin adapter.
|
|
"""

from __future__ import annotations

from typing import Any

|
def _hf_to_appsllm_state_dict(hf_weights: dict[str, Any], n_layers: int) -> dict[str, Any]:
|
|
"""Rename a HuggingFace-style state dict to the GGUF-native keys that
|
|
`tinygrad.apps.llm.Transformer` expects.
|
|
|
|
HF and apps.llm both store RoPE weights in half-split layout, so no
|
|
permute is required — only a direct key rename and a tied-embedding
|
|
fallback for models like Llama 3.2 that drop `lm_head.weight`.
|
|
"""
|
|
keymap: dict[str, str] = {
|
|
"model.embed_tokens.weight": "token_embd.weight",
|
|
"model.norm.weight": "output_norm.weight",
|
|
"lm_head.weight": "output.weight",
|
|
}
|
|
for layer in range(n_layers):
|
|
keymap[f"model.layers.{layer}.input_layernorm.weight"] = f"blk.{layer}.attn_norm.weight"
|
|
keymap[f"model.layers.{layer}.post_attention_layernorm.weight"] = f"blk.{layer}.ffn_norm.weight"
|
|
for hf_proj, gguf_proj in (("q", "q"), ("k", "k"), ("v", "v"), ("o", "output")):
|
|
keymap[f"model.layers.{layer}.self_attn.{hf_proj}_proj.weight"] = f"blk.{layer}.attn_{gguf_proj}.weight"
|
|
keymap[f"model.layers.{layer}.self_attn.q_norm.weight"] = f"blk.{layer}.attn_q_norm.weight"
|
|
keymap[f"model.layers.{layer}.self_attn.k_norm.weight"] = f"blk.{layer}.attn_k_norm.weight"
|
|
for hf_name, gguf_name in (("gate", "gate"), ("up", "up"), ("down", "down")):
|
|
keymap[f"model.layers.{layer}.mlp.{hf_name}_proj.weight"] = f"blk.{layer}.ffn_{gguf_name}.weight"
|
|
|
|
# Fail loudly if the model carries Q/K/V projection bias (Qwen2 / 2.5).
|
|
# apps.llm's `TransformerBlock` hardcodes `bias=False`, so these weights
|
|
# would be silently dropped by `load_state_dict(strict=False)` and the
|
|
# model would produce garbage. Supported families (Qwen3, Qwen3.5,
|
|
# Llama 3.x, GLM-4, Mistral) have no qkv bias.
|
|
bias_keys = [k for k in hf_weights
|
|
if k.startswith("model.layers.") and
|
|
any(k.endswith(f".self_attn.{p}_proj.bias") for p in ("q", "k", "v"))]
|
|
if bias_keys:
|
|
raise ValueError(
|
|
"tinygrad backend: model has Q/K/V projection bias ("
|
|
f"{bias_keys[0]} etc). Supported families are Qwen3, Qwen3.5, "
|
|
"Llama 3.x, GLM-4, Mistral. For Qwen2 / 2.5 please use a "
|
|
"newer model or the vLLM / llama.cpp backends."
|
|
)
|
|
|
|
sd = {dst: hf_weights[src] for src, dst in keymap.items() if src in hf_weights}
|
|
if "output.weight" not in sd and "token_embd.weight" in sd:
|
|
sd["output.weight"] = sd["token_embd.weight"]
|
|
return sd
|
|
|
|
|
|
def _hf_to_transformer_kwargs(hf_config: dict, state_dict: dict[str, Any], max_context: int) -> dict:
|
|
"""Build the kwargs dict for `tinygrad.apps.llm.Transformer(**kwargs)`.
|
|
|
|
Supports dense Qwen3 / Qwen3.5 / Llama 3.x / GLM-4 / Mistral-shaped
|
|
models. The tinygrad 0.12.0 `Transformer` takes keyword-only args (no
|
|
`TransformerConfig` dataclass) — so we return a plain dict.
|
|
"""
|
|
n_heads = hf_config["num_attention_heads"]
|
|
head_dim = hf_config.get("head_dim") or (hf_config["hidden_size"] // n_heads)
|
|
|
|
# Detect qk_norm presence from the GGUF-shaped state dict (matches
|
|
# apps.llm's own heuristic in `from_gguf`).
|
|
qk_norm = 0
|
|
qn = state_dict.get("blk.0.attn_q_norm.weight")
|
|
if qn is not None:
|
|
qk_norm = int(qn.shape[0])
|
|
|
|
max_pos = hf_config.get("max_position_embeddings", 4096)
|
|
|
|
return dict(
|
|
num_blocks=hf_config["num_hidden_layers"],
|
|
dim=hf_config["hidden_size"],
|
|
hidden_dim=hf_config["intermediate_size"],
|
|
n_heads=n_heads,
|
|
n_kv_heads=hf_config.get("num_key_value_heads", n_heads),
|
|
norm_eps=hf_config.get("rms_norm_eps", 1e-5),
|
|
vocab_size=hf_config["vocab_size"],
|
|
head_dim=head_dim,
|
|
rope_theta=float(hf_config.get("rope_theta", 10000.0)),
|
|
max_context=min(max_pos, max_context),
|
|
qk_norm=qk_norm,
|
|
)
|
|
|
|
|
|
def _embed_hidden(model, tokens):
|
|
"""Return mean-poolable hidden states by running the block stack
|
|
without going through the LM head + Gumbel-max sampler baked into
|
|
`Transformer.forward`."""
|
|
x = model.token_embd(tokens).float()
|
|
for blk in model.blk:
|
|
x = blk(x, 0)
|
|
return model.output_norm(x)
|