"""Speaker-recognition engines. Two engines are offered, mirroring the insightface backend's split: * SpeechBrainEngine: full PyTorch / SpeechBrain path. Uses the ECAPA-TDNN recipe trained on VoxCeleb; 192-d L2-normalized embeddings, cosine distance for verification. Auto-downloads the checkpoint into LocalAI's models directory on first LoadModel. * OnnxDirectEngine: CPU-friendly fallback that runs pre-exported ONNX speaker encoders (WeSpeaker ResNet34, 3D-Speaker ERes2Net, CAM++, etc.). Model paths come from the model config — the gallery `files:` flow drops them into the models directory. Engine selection follows the same gallery-driven convention face recognition uses (insightface commits 9c6da0f7 / 405fec0b): the Python backend reads `engine` / `model_path` / `checkpoint` from the options dict and picks an engine accordingly. """ from __future__ import annotations import os from typing import Any, Iterable, Protocol class SpeakerEngine(Protocol): """Interface both concrete engines satisfy.""" name: str def embed(self, audio_path: str) -> list[float]: # pragma: no cover - interface ... def compare(self, audio1: str, audio2: str) -> float: # pragma: no cover ... def analyze(self, audio_path: str, actions: Iterable[str]) -> list[dict[str, Any]]: # pragma: no cover ... def _cosine_distance(a, b) -> float: import numpy as np va = np.asarray(a, dtype=np.float32).reshape(-1) vb = np.asarray(b, dtype=np.float32).reshape(-1) na = float(np.linalg.norm(va)) nb = float(np.linalg.norm(vb)) if na == 0.0 or nb == 0.0: return 1.0 return float(1.0 - np.dot(va, vb) / (na * nb)) class AnalysisHead: """Age / gender / emotion head, lazy-loaded on first analyze call. Wraps two open-licence HuggingFace checkpoints: * audeering/wav2vec2-large-robust-24-ft-age-gender — age regression (0–100 years) + 3-way gender (female/male/child). Apache 2.0. * superb/wav2vec2-base-superb-er — 4-way emotion classification (neutral / happy / angry / sad). Apache 2.0. Either model is optional — the head degrades gracefully to only the attributes it could load. Override the checkpoint with the `age_gender_model` / `emotion_model` option if you want something else. Set either to an empty string to disable that head. """ # Age + gender is OFF by default: the high-accuracy Apache-2.0 # checkpoint (Audeering wav2vec2-large-robust-24-ft-age-gender) uses a # custom multi-task head that AutoModelForAudioClassification silently # mangles — it drops the age weights as UNEXPECTED and re-initialises # the classifier head with random values, so the output is noise. Users # who have a cleanly loadable age/gender classifier can opt in with # `age_gender_model:` in options. The emotion default below # (superb/wav2vec2-base-superb-er) loads via the standard audio- # classification pipeline with no such caveat. DEFAULT_AGE_GENDER_MODEL = "" DEFAULT_EMOTION_MODEL = "superb/wav2vec2-base-superb-er" AGE_GENDER_LABELS = ("female", "male", "child") def __init__(self, options: dict[str, str]): self._options = options self._age_gender = None self._age_gender_processor = None self._age_gender_loaded = False self._age_gender_error: str | None = None self._emotion = None self._emotion_loaded = False self._emotion_error: str | None = None # --- age / gender ------------------------------------------------- def _ensure_age_gender(self): if self._age_gender_loaded: return self._age_gender_loaded = True model_id = self._options.get( "age_gender_model", self.DEFAULT_AGE_GENDER_MODEL ) if not model_id: self._age_gender_error = "disabled" return try: # Late imports — torch / transformers are heavy and only # pulled in when the analyze head actually runs. import torch # type: ignore from transformers import AutoFeatureExtractor, AutoModelForAudioClassification # type: ignore self._torch = torch self._age_gender_processor = AutoFeatureExtractor.from_pretrained(model_id) self._age_gender = AutoModelForAudioClassification.from_pretrained(model_id) self._age_gender.eval() except Exception as exc: # noqa: BLE001 self._age_gender_error = f"{type(exc).__name__}: {exc}" def _infer_age_gender(self, waveform_16k) -> dict[str, Any]: self._ensure_age_gender() if self._age_gender is None: return {} import numpy as np try: inputs = self._age_gender_processor( waveform_16k, sampling_rate=16000, return_tensors="pt" ) with self._torch.no_grad(): outputs = self._age_gender(**inputs) # Audeering's checkpoint is published with a custom head: the # official recipe exposes `(hidden_states, logits_age, logits_gender)`. # AutoModelForAudioClassification flattens that into a single # `logits` tensor of shape [batch, 4] — [age_regression, female, male, child]. # Fall back gracefully when the shape is different (e.g. a # user-supplied age_gender_model checkpoint that returns a proper tuple). hidden = getattr(outputs, "logits", outputs) age_years = None gender_logits = None if isinstance(hidden, (tuple, list)) and len(hidden) >= 2: age_years = float(hidden[0].squeeze().item()) * 100.0 gender_logits = hidden[1] else: flat = hidden.squeeze() if flat.ndim == 1 and flat.numel() >= 4: age_years = float(flat[0].item()) * 100.0 gender_logits = flat[1:4] elif flat.ndim == 1 and flat.numel() == 1: age_years = float(flat.item()) * 100.0 if age_years is None and gender_logits is None: return {} result: dict[str, Any] = {} if age_years is not None: result["age"] = age_years if gender_logits is not None: probs = self._torch.softmax(gender_logits, dim=-1).cpu().numpy() probs = np.asarray(probs).reshape(-1) gender_map = { label: float(probs[i]) for i, label in enumerate(self.AGE_GENDER_LABELS[: len(probs)]) } result["gender"] = gender_map if gender_map: dom = max(gender_map.items(), key=lambda kv: kv[1])[0] result["dominant_gender"] = { "female": "Female", "male": "Male", "child": "Child", }.get(dom, dom.capitalize()) return result except Exception as exc: # noqa: BLE001 # Analyze is a best-effort feature — never take down the # whole analyze call because the age/gender head had a bad # day. Mark the failure so the emotion branch still runs. self._age_gender_error = f"runtime: {type(exc).__name__}: {exc}" return {} # --- emotion ------------------------------------------------------ def _ensure_emotion(self): if self._emotion_loaded: return self._emotion_loaded = True model_id = self._options.get("emotion_model", self.DEFAULT_EMOTION_MODEL) if not model_id: self._emotion_error = "disabled" return try: from transformers import pipeline # type: ignore self._emotion = pipeline("audio-classification", model=model_id) except Exception as exc: # noqa: BLE001 self._emotion_error = f"{type(exc).__name__}: {exc}" def _infer_emotion(self, audio_path: str) -> dict[str, Any]: self._ensure_emotion() if self._emotion is None: return {} try: raw = self._emotion(audio_path, top_k=8) except Exception as exc: # noqa: BLE001 # Second-line defense: don't fail the whole analyze call # over a runtime inference hiccup. self._emotion_error = f"runtime: {type(exc).__name__}: {exc}" return {} emotion_map = {row["label"].lower(): float(row["score"]) for row in raw} if not emotion_map: return {} dom = max(emotion_map.items(), key=lambda kv: kv[1])[0] return {"emotion": emotion_map, "dominant_emotion": dom} # --- orchestrator ------------------------------------------------- def analyze(self, audio_path: str, waveform_16k, actions: Iterable[str]) -> dict[str, Any]: wanted = {a.strip().lower() for a in actions} if actions else {"age", "gender", "emotion"} result: dict[str, Any] = {} if "age" in wanted or "gender" in wanted: ag = self._infer_age_gender(waveform_16k) if "age" in wanted and "age" in ag: result["age"] = ag["age"] if "gender" in wanted: if "gender" in ag: result["gender"] = ag["gender"] if "dominant_gender" in ag: result["dominant_gender"] = ag["dominant_gender"] if "emotion" in wanted: em = self._infer_emotion(audio_path) result.update(em) return result class SpeechBrainEngine: """ECAPA-TDNN via SpeechBrain. Auto-downloads on first use.""" name = "speechbrain-ecapa-tdnn" def __init__(self, model_name: str, options: dict[str, str]): # Late imports so the module can be introspected / tested # without torch / speechbrain being installed. from speechbrain.inference.speaker import EncoderClassifier # type: ignore source = options.get("source") or model_name or "speechbrain/spkrec-ecapa-voxceleb" savedir = options.get("_model_path") or os.environ.get("HF_HOME") or "./pretrained_models" self._model = EncoderClassifier.from_hparams(source=source, savedir=savedir) self._analysis = AnalysisHead(options) def _load_waveform(self, path: str): # Use soundfile + torch directly — torchaudio.load in torchaudio # 2.8+ requires the torchcodec package for decoding, which adds # another heavy ffmpeg-linked dep. soundfile covers WAV/FLAC # which is what we care about here. import numpy as np import soundfile as sf # type: ignore import torch # type: ignore audio, sr = sf.read(path, always_2d=False) if audio.ndim > 1: audio = audio.mean(axis=1) audio = np.asarray(audio, dtype=np.float32) if sr != 16000: # Simple linear resample — good enough for 16kHz downsampling # from 44.1/48kHz, and we expect 16kHz inputs in practice. ratio = 16000 / float(sr) n = int(round(len(audio) * ratio)) audio = np.interp( np.linspace(0, len(audio), n, endpoint=False), np.arange(len(audio)), audio, ).astype(np.float32) return torch.from_numpy(audio).unsqueeze(0) # [1, T] def embed(self, audio_path: str) -> list[float]: waveform = self._load_waveform(audio_path) vec = self._model.encode_batch(waveform).squeeze().detach().cpu().numpy() return [float(x) for x in vec] def compare(self, audio1: str, audio2: str) -> float: return _cosine_distance(self.embed(audio1), self.embed(audio2)) def analyze(self, audio_path: str, actions): # Age / gender / emotion aren't produced by ECAPA-TDNN itself; # delegate to AnalysisHead which wraps separate Apache-2.0 # checkpoints. Returns a single segment spanning the clip — # segmentation / diarisation is a future enhancement. waveform = self._load_waveform(audio_path) mono = waveform.squeeze().detach().cpu().numpy() attrs = self._analysis.analyze(audio_path, mono, actions) if not attrs: raise NotImplementedError( "analyze head failed to load — install transformers + torch or pass age_gender_model/emotion_model options" ) duration = float(mono.shape[-1]) / 16000.0 if mono.size else 0.0 return [dict(start=0.0, end=duration, **attrs)] class OnnxDirectEngine: """Run a pre-exported ONNX speaker encoder (WeSpeaker / 3D-Speaker).""" name = "onnx-direct" def __init__(self, model_name: str, options: dict[str, str]): import onnxruntime as ort # type: ignore # The gallery is expected to have dropped the ONNX file under # the models directory; accept either an absolute path or a # filename relative to _model_path. onnx_path = options.get("model_path") or options.get("onnx") if not onnx_path: raise ValueError("OnnxDirectEngine requires `model_path: ` in options") if not os.path.isabs(onnx_path): onnx_path = os.path.join(options.get("_model_path", ""), onnx_path) if not os.path.isfile(onnx_path): raise FileNotFoundError(f"ONNX model not found: {onnx_path}") providers = options.get("providers") if providers: provider_list = [p.strip() for p in providers.split(",") if p.strip()] else: provider_list = ["CPUExecutionProvider"] self._session = ort.InferenceSession(onnx_path, providers=provider_list) input_meta = self._session.get_inputs()[0] self._input_name = input_meta.name # Pre-exported speaker encoders come in two shapes: # rank-2 [batch, samples] — some 3D-Speaker exports feed raw waveform. # rank-3 [batch, frames, n_mels] — WeSpeaker and most Kaldi-lineage encoders # expect pre-computed Kaldi FBank features. # We detect this at load time and branch in embed(), because feeding raw audio # into a rank-3 graph is exactly what triggered # "Invalid rank for input: feats Got: 2 Expected: 3". self._input_rank = len(input_meta.shape) if input_meta.shape is not None else 2 self._expected_sr = int(options.get("sample_rate", "16000")) self._fbank_mels = int(options.get("fbank_num_mel_bins", "80")) self._fbank_frame_length_ms = float(options.get("fbank_frame_length_ms", "25")) self._fbank_frame_shift_ms = float(options.get("fbank_frame_shift_ms", "10")) # Per-utterance cepstral mean normalisation — on for WeSpeaker by default, # toggleable for encoders that expect raw FBank. self._fbank_cmn = options.get("fbank_cmn", "true").lower() in ("1", "true", "yes") self._analysis = AnalysisHead(options) def _load_waveform(self, path: str): import numpy as np import soundfile as sf # type: ignore audio, sr = sf.read(path, always_2d=False) if sr != self._expected_sr: # Cheap linear resample — good enough for sanity; callers # should pre-resample for production. ratio = self._expected_sr / float(sr) n = int(round(len(audio) * ratio)) audio = np.interp( np.linspace(0, len(audio), n, endpoint=False), np.arange(len(audio)), audio, ) if audio.ndim > 1: audio = audio.mean(axis=1) return audio.astype("float32") def embed(self, audio_path: str) -> list[float]: import numpy as np audio = self._load_waveform(audio_path) if self._input_rank >= 3: feats = self._extract_fbank(audio) # [frames, n_mels] feed = feats[np.newaxis, :, :] # [1, frames, n_mels] else: feed = audio.reshape(1, -1) # [1, samples] out = self._session.run(None, {self._input_name: feed}) vec = np.asarray(out[0]).reshape(-1) return [float(x) for x in vec] def _extract_fbank(self, audio): """Compute Kaldi-style 80-dim FBank features for speaker encoders that expect pre-featurised input (WeSpeaker, most 3D-Speaker exports). torchaudio is already a backend dependency for SpeechBrain — no new package required.""" import numpy as np import torch # type: ignore import torchaudio.compliance.kaldi as kaldi # type: ignore tensor = torch.from_numpy(audio).unsqueeze(0) # [1, samples] feats = kaldi.fbank( tensor, sample_frequency=self._expected_sr, num_mel_bins=self._fbank_mels, frame_length=self._fbank_frame_length_ms, frame_shift=self._fbank_frame_shift_ms, dither=0.0, ) # [frames, n_mels] if self._fbank_cmn: feats = feats - feats.mean(dim=0, keepdim=True) return feats.numpy().astype(np.float32) def compare(self, audio1: str, audio2: str) -> float: return _cosine_distance(self.embed(audio1), self.embed(audio2)) def analyze(self, audio_path: str, actions): # AnalysisHead expects 16kHz mono; _load_waveform already # resamples to self._expected_sr. If the user configured a # non-16k expected rate, resample one more time for analyze. audio = self._load_waveform(audio_path) if self._expected_sr != 16000: import numpy as np ratio = 16000 / float(self._expected_sr) n = int(round(len(audio) * ratio)) audio = np.interp( np.linspace(0, len(audio), n, endpoint=False), np.arange(len(audio)), audio, ).astype("float32") attrs = self._analysis.analyze(audio_path, audio, actions) if not attrs: raise NotImplementedError( "analyze head failed to load — install transformers + torch or pass age_gender_model/emotion_model options" ) duration = float(len(audio)) / 16000.0 if len(audio) else 0.0 return [dict(start=0.0, end=duration, **attrs)] def build_engine(model_name: str, options: dict[str, str]) -> tuple[SpeakerEngine, str]: """Pick an engine based on the options. ONNX path takes priority: if the gallery has dropped a `model_path:` or `onnx:` option, run the direct ONNX engine. Otherwise, fall back to SpeechBrain. """ engine_kind = (options.get("engine") or "").lower() if engine_kind == "onnx" or options.get("model_path") or options.get("onnx"): return OnnxDirectEngine(model_name, options), OnnxDirectEngine.name return SpeechBrainEngine(model_name, options), SpeechBrainEngine.name