From 454d8adc76cca7b3c17f277839ea7aca6a3d8819 Mon Sep 17 00:00:00 2001 From: Andres Date: Wed, 4 Mar 2026 09:47:21 +0100 Subject: [PATCH] feat(qwen-tts): Support using multiple voices (#8757) * Add support for multiple voice clones in Qwen TTS Signed-off-by: Andres Smith * Add voice prompt caching and generation logs to see generation time --------- Signed-off-by: Andres Smith Co-authored-by: Ettore Di Giacinto --- backend/python/qwen-tts/backend.py | 682 +++++++++++++++++++++---- docs/content/features/text-to-audio.md | 51 +- 2 files changed, 619 insertions(+), 114 deletions(-) diff --git a/backend/python/qwen-tts/backend.py b/backend/python/qwen-tts/backend.py index 57fbc222b..e2029cf83 100644 --- a/backend/python/qwen-tts/backend.py +++ b/backend/python/qwen-tts/backend.py @@ -2,6 +2,7 @@ """ This is an extra gRPC server of LocalAI for Qwen3-TTS """ + from concurrent import futures import time import argparse @@ -17,8 +18,13 @@ import torch import soundfile as sf from qwen_tts import Qwen3TTSModel +import json +import hashlib +import pickle + import grpc + def is_float(s): """Check if a string can be converted to float.""" try: @@ -27,6 +33,7 @@ def is_float(s): except ValueError: return False + def is_int(s): """Check if a string can be converted to int.""" try: @@ -35,10 +42,11 @@ def is_int(s): except ValueError: return False + _ONE_DAY_IN_SECONDS = 60 * 60 * 24 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) +MAX_WORKERS = int(os.environ.get("PYTHON_GRPC_MAX_WORKERS", "1")) # Implement the BackendServicer class with the service methods @@ -46,9 +54,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): """ BackendServicer is the class that implements the gRPC service """ + def Health(self, request, context): - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - + return backend_pb2.Reply(message=bytes("OK", "utf-8")) + def LoadModel(self, request, 
context): # Get device if torch.cuda.is_available(): @@ -57,7 +66,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): else: print("CUDA is not available", file=sys.stderr) device = "cpu" - mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + mps_available = ( + hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + ) if mps_available: device = "mps" if not torch.cuda.is_available() and request.CUDA: @@ -67,7 +78,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if device == "mpx": print("Note: device 'mpx' detected, treating it as 'mps'.", file=sys.stderr) device = "mps" - + # Validate mps availability if requested if device == "mps" and not torch.backends.mps.is_available(): print("Warning: MPS not available. Falling back to CPU.", file=sys.stderr) @@ -97,6 +108,64 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): value = value.lower() == "true" self.options[key] = value + # Parse voices configuration from options + self.voices = {} + if "voices" in self.options: + try: + voices_data = self.options["voices"] + if isinstance(voices_data, str): + voices_list = json.loads(voices_data) + else: + voices_list = voices_data + + # Validate and store voices + for voice_entry in voices_list: + if not isinstance(voice_entry, dict): + print( + f"[WARNING] Invalid voice entry (not a dict): {voice_entry}", + file=sys.stderr, + ) + continue + + name = voice_entry.get("name") + audio = voice_entry.get("audio") + ref_text = voice_entry.get("ref_text") + + if not name or not isinstance(name, str): + print( + f"[WARNING] Voice entry missing required 'name' field: {voice_entry}", + file=sys.stderr, + ) + continue + if not audio or not isinstance(audio, str): + print( + f"[WARNING] Voice entry missing required 'audio' field: {voice_entry}", + file=sys.stderr, + ) + continue + if ref_text is None or not isinstance(ref_text, str): + print( + f"[WARNING] Voice entry missing required 'ref_text' field: 
{voice_entry}", + file=sys.stderr, + ) + continue + + self.voices[name] = {"audio": audio, "ref_text": ref_text} + print( + f"[INFO] Registered voice '{name}' with audio: {audio}", + file=sys.stderr, + ) + + print(f"[INFO] Loaded {len(self.voices)} voice(s)", file=sys.stderr) + except json.JSONDecodeError as e: + print(f"[ERROR] Failed to parse voices JSON: {e}", file=sys.stderr) + except Exception as e: + print( + f"[ERROR] Error processing voices configuration: {e}", + file=sys.stderr, + ) + print(traceback.format_exc(), file=sys.stderr) + # Get model path from request model_path = request.Model if not model_path: @@ -118,11 +187,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Cache for voice clone prompts self._voice_clone_cache = {} + # Pre-load cached voices if disk_cache is enabled + self._preload_cached_voices() + # Store AudioPath, ModelFile, and ModelPath from LoadModel request # These are used later in TTS for VoiceClone mode - self.audio_path = request.AudioPath if hasattr(request, 'AudioPath') and request.AudioPath else None - self.model_file = request.ModelFile if hasattr(request, 'ModelFile') and request.ModelFile else None - self.model_path = request.ModelPath if hasattr(request, 'ModelPath') and request.ModelPath else None + self.audio_path = ( + request.AudioPath + if hasattr(request, "AudioPath") and request.AudioPath + else None + ) + self.model_file = ( + request.ModelFile + if hasattr(request, "ModelFile") and request.ModelFile + else None + ) + self.model_path = ( + request.ModelPath + if hasattr(request, "ModelPath") and request.ModelPath + else None + ) # Decide dtype & attention implementation if self.device == "mps": @@ -138,7 +222,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): device_map = "cpu" attn_impl_primary = "sdpa" - print(f"Using device: {self.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}, model_type: {self.model_type}", file=sys.stderr) + print( + f"Using device: 
{self.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}, model_type: {self.model_type}", + file=sys.stderr, + ) print(f"Loading model from: {model_path}", file=sys.stderr) # Load model with device-specific logic @@ -148,7 +235,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): "attn_implementation": attn_impl_primary, "trust_remote_code": True, # Required for qwen-tts models } - + try: if self.device == "mps": load_kwargs["device_map"] = None # load then move @@ -162,35 +249,66 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): self.model = Qwen3TTSModel.from_pretrained(model_path, **load_kwargs) except Exception as e: error_msg = str(e) - print(f"[ERROR] Loading model: {type(e).__name__}: {error_msg}", file=sys.stderr) + print( + f"[ERROR] Loading model: {type(e).__name__}: {error_msg}", + file=sys.stderr, + ) print(traceback.format_exc(), file=sys.stderr) - + # Check if it's a missing feature extractor/tokenizer error - if "speech_tokenizer" in error_msg or "preprocessor_config.json" in error_msg or "feature extractor" in error_msg.lower(): - print("\n[ERROR] Model files appear to be incomplete. This usually means:", file=sys.stderr) - print(" 1. The model download was interrupted or incomplete", file=sys.stderr) + if ( + "speech_tokenizer" in error_msg + or "preprocessor_config.json" in error_msg + or "feature extractor" in error_msg.lower() + ): + print( + "\n[ERROR] Model files appear to be incomplete. This usually means:", + file=sys.stderr, + ) + print( + " 1. The model download was interrupted or incomplete", + file=sys.stderr, + ) print(" 2. 
The model cache is corrupted", file=sys.stderr) print("\nTo fix this, try:", file=sys.stderr) - print(f" rm -rf ~/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-*", file=sys.stderr) + print( + f" rm -rf ~/.cache/huggingface/hub/models--Qwen--Qwen3-TTS-*", + file=sys.stderr, + ) print(" Then re-run to trigger a fresh download.", file=sys.stderr) - print("\nAlternatively, try using a different model variant:", file=sys.stderr) + print( + "\nAlternatively, try using a different model variant:", + file=sys.stderr, + ) print(" - Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", file=sys.stderr) print(" - Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign", file=sys.stderr) print(" - Qwen/Qwen3-TTS-12Hz-1.7B-Base", file=sys.stderr) - - if attn_impl_primary == 'flash_attention_2': - print("\nTrying to use SDPA instead of flash_attention_2...", file=sys.stderr) - load_kwargs["attn_implementation"] = 'sdpa' + + if attn_impl_primary == "flash_attention_2": + print( + "\nTrying to use SDPA instead of flash_attention_2...", + file=sys.stderr, + ) + load_kwargs["attn_implementation"] = "sdpa" try: if self.device == "mps": load_kwargs["device_map"] = None - self.model = Qwen3TTSModel.from_pretrained(model_path, **load_kwargs) + self.model = Qwen3TTSModel.from_pretrained( + model_path, **load_kwargs + ) self.model.to("mps") else: - load_kwargs["device_map"] = (self.device if self.device in ("cuda", "cpu") else None) - self.model = Qwen3TTSModel.from_pretrained(model_path, **load_kwargs) + load_kwargs["device_map"] = ( + self.device if self.device in ("cuda", "cpu") else None + ) + self.model = Qwen3TTSModel.from_pretrained( + model_path, **load_kwargs + ) except Exception as e2: - print(f"[ERROR] Failed to load with SDPA: {type(e2).__name__}: {e2}", file=sys.stderr) + print( + f"[ERROR] Failed to load with SDPA: {type(e2).__name__}: {e2}", + file=sys.stderr, + ) print(traceback.format_exc(), file=sys.stderr) raise e2 else: @@ -212,81 +330,338 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if 
self.model_type == "VoiceDesign": return "VoiceDesign" - # VoiceClone: AudioPath is provided (from LoadModel, stored in self.audio_path) - if self.audio_path: + # VoiceClone: AudioPath is provided OR voices dict is populated + if self.audio_path or self.voices: return "VoiceClone" - + # VoiceDesign: instruct option is provided if "instruct" in self.options and self.options["instruct"]: return "VoiceDesign" - + # Default to CustomVoice return "CustomVoice" - def _get_ref_audio_path(self, request): - """Get reference audio path from stored AudioPath (from LoadModel).""" + def _get_ref_audio_path(self, request, voice_name=None): + """Get reference audio path from stored AudioPath or from voices dict.""" + # If voice_name is provided and exists in voices dict, use that + if voice_name and voice_name in self.voices: + audio_path = self.voices[voice_name]["audio"] + + # If absolute path, use as-is + if os.path.isabs(audio_path): + return audio_path + + # Try relative to ModelFile + if self.model_file: + model_file_base = os.path.dirname(self.model_file) + ref_path = os.path.join(model_file_base, audio_path) + if os.path.exists(ref_path): + return ref_path + + # Try relative to ModelPath + if self.model_path: + ref_path = os.path.join(self.model_path, audio_path) + if os.path.exists(ref_path): + return ref_path + + # Return as-is (might be URL or base64) + return audio_path + + # Fall back to legacy single-voice mode using self.audio_path if not self.audio_path: return None - + # If absolute path, use as-is if os.path.isabs(self.audio_path): return self.audio_path - + # Try relative to ModelFile if self.model_file: model_file_base = os.path.dirname(self.model_file) ref_path = os.path.join(model_file_base, self.audio_path) if os.path.exists(ref_path): return ref_path - + # Try relative to ModelPath if self.model_path: ref_path = os.path.join(self.model_path, self.audio_path) if os.path.exists(ref_path): return ref_path - + # Return as-is (might be URL or base64) return 
self.audio_path def _get_voice_clone_prompt(self, request, ref_audio, ref_text): - """Get or create voice clone prompt, with caching.""" - cache_key = f"{ref_audio}:{ref_text}" - + """Get or create voice clone prompt, with in-memory and disk caching.""" + cache_key = self._get_voice_cache_key(ref_audio, ref_text) + if cache_key not in self._voice_clone_cache: - print(f"Creating voice clone prompt from {ref_audio}", file=sys.stderr) - try: - prompt_items = self.model.create_voice_clone_prompt( - ref_audio=ref_audio, - ref_text=ref_text, - x_vector_only_mode=self.options.get("x_vector_only_mode", False), - ) - self._voice_clone_cache[cache_key] = prompt_items - except Exception as e: - print(f"Error creating voice clone prompt: {e}", file=sys.stderr) - print(traceback.format_exc(), file=sys.stderr) - return None - + # Check disk cache first (if enabled) + disk_cached = self._get_cached_voice_clone_prompt_from_disk( + ref_audio, ref_text + ) + if disk_cached is not None: + self._voice_clone_cache[cache_key] = disk_cached + else: + # Create new prompt + print(f"Creating voice clone prompt from {ref_audio}", file=sys.stderr) + try: + prompt_items = self.model.create_voice_clone_prompt( + ref_audio=ref_audio, + ref_text=ref_text, + x_vector_only_mode=self.options.get( + "x_vector_only_mode", False + ), + ) + self._voice_clone_cache[cache_key] = prompt_items + # Save to disk cache if enabled + self._save_voice_clone_prompt_to_disk( + ref_audio, ref_text, prompt_items + ) + except Exception as e: + print(f"Error creating voice clone prompt: {e}", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + return None + return self._voice_clone_cache[cache_key] + def _is_text_file_path(self, text): + """Check if the text is a file path to a text file.""" + if not text or not isinstance(text, str): + return False + # Check if it looks like a file path (contains / or \ and ends with common text file extensions) + text_extensions = [".txt", ".md", ".rst", ".text"] + 
has_path_separator = "/" in text or "\\" in text + ends_with_text_ext = any(text.lower().endswith(ext) for ext in text_extensions) + return has_path_separator and ends_with_text_ext + + def _read_text_file(self, file_path): + """Read text content from a file path, resolving relative paths.""" + try: + # If absolute path, use as-is + if os.path.isabs(file_path): + resolved_path = file_path + else: + # Try relative to ModelFile + if self.model_file: + model_file_base = os.path.dirname(self.model_file) + candidate_path = os.path.join(model_file_base, file_path) + if os.path.exists(candidate_path): + resolved_path = candidate_path + else: + resolved_path = file_path + else: + resolved_path = file_path + + # Try relative to ModelPath + if not os.path.exists(resolved_path) and self.model_path: + candidate_path = os.path.join(self.model_path, file_path) + if os.path.exists(candidate_path): + resolved_path = candidate_path + + # Check if file exists and is readable + if not os.path.exists(resolved_path): + print( + f"[ERROR] ref_text file not found: {resolved_path}", file=sys.stderr + ) + return None + + if not os.path.isfile(resolved_path): + print( + f"[ERROR] ref_text path is not a file: {resolved_path}", + file=sys.stderr, + ) + return None + + # Read and return file contents + with open(resolved_path, "r", encoding="utf-8") as f: + content = f.read().strip() + + print( + f"[INFO] Successfully read ref_text from file: {resolved_path}", + file=sys.stderr, + ) + return content + + except Exception as e: + print( + f"[ERROR] Failed to read ref_text file {file_path}: {e}", + file=sys.stderr, + ) + print(traceback.format_exc(), file=sys.stderr) + return None + + def _compute_file_hash(self, file_path): + """Compute SHA256 hash of file content.""" + try: + sha256 = hashlib.sha256() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + sha256.update(chunk) + return sha256.hexdigest() + except Exception as e: + print( + f"[ERROR] Failed to 
compute hash for {file_path}: {e}", file=sys.stderr + ) + return None + + def _compute_string_hash(self, text): + """Compute SHA256 hash of string.""" + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + def _get_cached_voice_clone_prompt_from_disk(self, ref_audio, ref_text_content): + """Load cached prompt from disk if available and valid.""" + if not self.options.get("disk_cache", False): + return None + + cache_file = f"{ref_audio}.voice_cache.pkl" + + if not os.path.exists(cache_file): + return None + + try: + with open(cache_file, "rb") as f: + cached = pickle.load(f) + + # Validate checksums + current_audio_hash = self._compute_file_hash(ref_audio) + current_text_hash = self._compute_string_hash(ref_text_content) + + if current_audio_hash is None or cached["audio_hash"] != current_audio_hash: + print("[INFO] Cache invalidation: audio file changed", file=sys.stderr) + os.remove(cache_file) + return None + + if cached["ref_text_hash"] != current_text_hash: + print( + "[INFO] Cache invalidation: ref_text content changed", + file=sys.stderr, + ) + os.remove(cache_file) + return None + + print( + f"[INFO] Loaded voice clone prompt from disk cache: {cache_file}", + file=sys.stderr, + ) + return cached["prompt_items"] + + except Exception as e: + print( + f"[WARNING] Failed to load disk cache {cache_file}: {e}", + file=sys.stderr, + ) + return None + + def _save_voice_clone_prompt_to_disk( + self, ref_audio, ref_text_content, prompt_items + ): + """Save prompt to disk cache alongside audio file.""" + if not self.options.get("disk_cache", False): + return + + cache_file = f"{ref_audio}.voice_cache.pkl" + + try: + cache_data = { + "audio_hash": self._compute_file_hash(ref_audio), + "ref_text_hash": self._compute_string_hash(ref_text_content), + "prompt_items": prompt_items, + } + + with open(cache_file, "wb") as f: + pickle.dump(cache_data, f) + + print( + f"[INFO] Saved voice clone prompt to disk cache: {cache_file}", + file=sys.stderr, + ) + except 
Exception as e: + print( + f"[WARNING] Failed to save disk cache {cache_file}: {e}", + file=sys.stderr, + ) + + def _get_voice_cache_key(self, ref_audio, ref_text): + """Get the cache key for a voice.""" + return f"{ref_audio}:{ref_text}" + + def _preload_cached_voices(self): + """Pre-load cached voice prompts at model startup.""" + if not self.voices or not self.options.get("disk_cache", False): + return + + print( + f"[INFO] Pre-loading {len(self.voices)} cached voice(s)...", file=sys.stderr + ) + loaded_count = 0 + missing_count = 0 + invalid_count = 0 + + for voice_name, voice_config in self.voices.items(): + audio_path = voice_config["audio"] + ref_text_path = voice_config["ref_text"] + + # Check for cache file + cache_file = f"{audio_path}.voice_cache.pkl" + if os.path.exists(cache_file): + # Read ref_text content for validation + ref_text_content = self._read_text_file(ref_text_path) + if ref_text_content is None: + invalid_count += 1 + print( + f"[INFO] Cannot read ref_text for {voice_name} (will recreate on first use)", + file=sys.stderr, + ) + continue + + cached_prompt = self._get_cached_voice_clone_prompt_from_disk( + audio_path, ref_text_content + ) + if cached_prompt: + # Pre-populate memory cache with content-based key + cache_key = self._get_voice_cache_key(audio_path, ref_text_content) + self._voice_clone_cache[cache_key] = cached_prompt + loaded_count += 1 + print(f"[INFO] Pre-loaded voice: {voice_name}", file=sys.stderr) + else: + invalid_count += 1 + print( + f"[INFO] Cache invalid for {voice_name} (will recreate on first use)", + file=sys.stderr, + ) + else: + missing_count += 1 + print( + f"[INFO] No cache found for {voice_name} (will create on first use)", + file=sys.stderr, + ) + + # Summary line + print( + f"[INFO] Pre-loaded {loaded_count}/{len(self.voices)} voices ({missing_count} missing, {invalid_count} invalid)", + file=sys.stderr, + ) + def TTS(self, request, context): try: # Check if dst is provided if not request.dst: return 
backend_pb2.Result( - success=False, - message="dst (output path) is required" + success=False, message="dst (output path) is required" ) - + # Prepare text text = request.text.strip() if not text: - return backend_pb2.Result( - success=False, - message="Text is empty" - ) + return backend_pb2.Result(success=False, message="Text is empty") # Get language (auto-detect if not provided) - language = request.language if hasattr(request, 'language') and request.language else None + language = ( + request.language + if hasattr(request, "language") and request.language + else None + ) if not language or language == "": language = "Auto" # Auto-detect language @@ -318,55 +693,123 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Generate audio based on mode if mode == "VoiceClone": # VoiceClone mode - ref_audio = self._get_ref_audio_path(request) + + # Check if multi-voice mode is active (voices dict is populated) + voice_name = None + if self.voices: + # Get voice from request (priority) or options + voice_name = request.voice if request.voice else None + if not voice_name: + voice_name = self.options.get("voice", None) + + # Validate voice exists + if voice_name and voice_name not in self.voices: + available_voices = ", ".join(sorted(self.voices.keys())) + return backend_pb2.Result( + success=False, + message=f"Voice '{voice_name}' not found. 
Available voices: {available_voices}", + ) + + # Get reference audio path (with voice-specific lookup if in multi-voice mode) + ref_audio = self._get_ref_audio_path(request, voice_name) if not ref_audio: - return backend_pb2.Result( - success=False, - message="AudioPath is required for VoiceClone mode" - ) - - ref_text = self.options.get("ref_text", None) - if not ref_text: - # Try to get from request if available - if hasattr(request, 'ref_text') and request.ref_text: - ref_text = request.ref_text + if voice_name: + return backend_pb2.Result( + success=False, + message=f"Audio path for voice '{voice_name}' could not be resolved", + ) else: - # x_vector_only_mode doesn't require ref_text - if not self.options.get("x_vector_only_mode", False): - return backend_pb2.Result( - success=False, - message="ref_text is required for VoiceClone mode (or set x_vector_only_mode=true)" - ) + return backend_pb2.Result( + success=False, + message="AudioPath is required for VoiceClone mode", + ) + + # Get reference text (from voice config if multi-voice, else from options/request) + if voice_name and voice_name in self.voices: + ref_text_source = self.voices[voice_name]["ref_text"] + else: + ref_text_source = self.options.get("ref_text", None) + if not ref_text_source: + # Try to get from request if available + if hasattr(request, "ref_text") and request.ref_text: + ref_text_source = request.ref_text + + if not ref_text_source: + # x_vector_only_mode doesn't require ref_text + if not self.options.get("x_vector_only_mode", False): + return backend_pb2.Result( + success=False, + message="ref_text is required for VoiceClone mode (or set x_vector_only_mode=true)", + ) + + # Determine if ref_text_source is a file path + ref_text_is_file = ref_text_source and self._is_text_file_path( + ref_text_source + ) + + if ref_text_is_file: + ref_text_content = self._read_text_file(ref_text_source) + if ref_text_content is None: + return backend_pb2.Result( + success=False, + message=f"Failed to 
read ref_text from file: {ref_text_source}", + ) + ref_text_source = ref_text_content + print( + f"[INFO] Loaded ref_text from file: {ref_text_content[:100]}...", + file=sys.stderr, + ) + + # For caching: use the content as the key (since we've read the file if it was one) + ref_text_for_cache = ref_text_source # Check if we should use cached prompt use_cached_prompt = self.options.get("use_cached_prompt", True) voice_clone_prompt = None - + if use_cached_prompt: - voice_clone_prompt = self._get_voice_clone_prompt(request, ref_audio, ref_text) - if voice_clone_prompt is None: - return backend_pb2.Result( - success=False, - message="Failed to create voice clone prompt" - ) + voice_clone_prompt = self._get_voice_clone_prompt( + request, ref_audio, ref_text_for_cache + ) + + if voice_clone_prompt is None: + return backend_pb2.Result( + success=False, message="Failed to create voice clone prompt" + ) if voice_clone_prompt: # Use cached prompt + start_time = time.time() wavs, sr = self.model.generate_voice_clone( text=text, language=language, voice_clone_prompt=voice_clone_prompt, - **generation_kwargs + **generation_kwargs, + ) + generation_duration = time.time() - start_time + print( + f"[INFO] Voice clone generation completed: {generation_duration:.2f}s, output_samples={len(wavs) if wavs else 0}", + file=sys.stderr, + flush=True, ) else: - # Create prompt on-the-fly + # Create prompt on-the-fly (only for non-file ref_text that wasn't cached) + start_time = time.time() wavs, sr = self.model.generate_voice_clone( text=text, language=language, ref_audio=ref_audio, - ref_text=ref_text, - x_vector_only_mode=self.options.get("x_vector_only_mode", False), - **generation_kwargs + ref_text=ref_text_source, + x_vector_only_mode=self.options.get( + "x_vector_only_mode", False + ), + **generation_kwargs, + ) + generation_duration = time.time() - start_time + print( + f"[INFO] Voice clone generation (on-the-fly) completed: {generation_duration:.2f}s, output_samples={len(wavs) if 
wavs else 0}", + file=sys.stderr, + flush=True, ) elif mode == "VoiceDesign": @@ -374,14 +817,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if not instruct: return backend_pb2.Result( success=False, - message="instruct option is required for VoiceDesign mode" + message="instruct option is required for VoiceDesign mode", ) wavs, sr = self.model.generate_voice_design( - text=text, - language=language, - instruct=instruct, - **generation_kwargs + text=text, language=language, instruct=instruct, **generation_kwargs ) else: @@ -393,57 +833,74 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if not speaker: # Use default speaker speaker = "Vivian" - print(f"No speaker specified, using default: {speaker}", file=sys.stderr) + print( + f"No speaker specified, using default: {speaker}", + file=sys.stderr, + ) # Validate speaker if model supports it - if hasattr(self.model, 'get_supported_speakers'): + if hasattr(self.model, "get_supported_speakers"): try: supported_speakers = self.model.get_supported_speakers() if speaker not in supported_speakers: - print(f"Warning: Speaker '{speaker}' not in supported list. Available: {supported_speakers}", file=sys.stderr) + print( + f"Warning: Speaker '{speaker}' not in supported list. 
Available: {supported_speakers}", + file=sys.stderr, + ) # Try to find a close match (case-insensitive) speaker_lower = speaker.lower() for sup_speaker in supported_speakers: if sup_speaker.lower() == speaker_lower: speaker = sup_speaker - print(f"Using matched speaker: {speaker}", file=sys.stderr) + print( + f"Using matched speaker: {speaker}", + file=sys.stderr, + ) break except Exception as e: - print(f"Warning: Could not get supported speakers: {e}", file=sys.stderr) + print( + f"Warning: Could not get supported speakers: {e}", + file=sys.stderr, + ) wavs, sr = self.model.generate_custom_voice( - text=text, - language=language, - speaker=speaker, - **generation_kwargs + text=text, language=language, speaker=speaker, **generation_kwargs ) # Save output if wavs is not None and len(wavs) > 0: # wavs is a list, take first element audio_data = wavs[0] if isinstance(wavs, list) else wavs + audio_duration = len(audio_data) / sr if sr > 0 else 0 sf.write(request.dst, audio_data, sr) - print(f"Saved output to {request.dst}", file=sys.stderr) + print( + f"Saved {audio_duration:.2f}s audio to {request.dst}", + file=sys.stderr, + ) else: return backend_pb2.Result( - success=False, - message="No audio output generated" + success=False, message="No audio output generated" ) except Exception as err: print(f"Error in TTS: {err}", file=sys.stderr) print(traceback.format_exc(), file=sys.stderr) - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - + return backend_pb2.Result( + success=False, message=f"Unexpected {err=}, {type(err)=}" + ) + return backend_pb2.Result(success=True) + def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + server = grpc.server( + futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), options=[ - ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB - 
]) + ("grpc.max_message_length", 50 * 1024 * 1024), # 50MB + ("grpc.max_send_message_length", 50 * 1024 * 1024), # 50MB + ("grpc.max_receive_message_length", 50 * 1024 * 1024), # 50MB + ], + ) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) server.add_insecure_port(address) server.start() @@ -465,6 +922,7 @@ def serve(address): except KeyboardInterrupt: server.stop(0) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run the gRPC server.") parser.add_argument( diff --git a/docs/content/features/text-to-audio.md b/docs/content/features/text-to-audio.md index c1291cf59..59081f164 100644 --- a/docs/content/features/text-to-audio.md +++ b/docs/content/features/text-to-audio.md @@ -364,7 +364,54 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ }' | aplay ``` -## Using config files +#### Multi-Voice Clone Mode + +Qwen3-TTS also supports loading multiple voices for voice cloning, allowing you to select different voices at request time. 
Configure multiple voices using the `voices` option:

```yaml
name: qwen-tts-multi-voice
backend: qwen-tts
parameters:
  model: Qwen/Qwen3-TTS-12Hz-1.7B-Base
options:
  - voices:[{"name":"jane","audio":"voices/jane.wav","ref_text":"voices/jane-ref.txt"},{"name":"john","audio":"voices/john.wav","ref_text":"voices/john-ref.txt"}]
```

The `voices` option accepts a JSON array where each voice entry must have:
- `name`: The voice identifier (used in API requests)
- `audio`: Path to the reference audio file (relative to model directory or absolute)
- `ref_text`: Path to a text file containing the transcript of the paired reference audio

Then use the model with voice selection:

```bash
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
  "model": "qwen-tts-multi-voice",
  "input":"Hello world, this is Jane speaking.",
  "voice": "jane"
  }' | aplay

# Switch to a different voice
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
  "model": "qwen-tts-multi-voice",
  "input":"Hello world, this is John speaking.",
  "voice": "john"
  }' | aplay
```

**Voice Selection Priority:**
1. `voice` parameter in the API request (highest priority)
2. `voice` option in the model configuration
3. If neither is set, or the requested voice is not among the configured voices, an error is returned

**Error Handling:**
If you request a voice that doesn't exist in the voices list, the API will return an error with a list of available voices:
```json
{"error": "Voice 'unknown' not found. Available voices: jane, john"}
```

**Backward Compatibility:**
The multi-voice mode is backward compatible with existing single-voice configurations. Models using `audio_path` in the `tts` section will continue to work as before.

## Using config files

You can also use a `config-file` to specify TTS models and their parameters. 
@@ -408,4 +455,4 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ }' ``` -If a `response_format` is added in the query (other than `wav`) and ffmpeg is not available, the call will fail. \ No newline at end of file +If a `response_format` is added in the query (other than `wav`) and ffmpeg is not available, the call will fail.