package main import ( "fmt" "os" "path/filepath" "strings" "sync" "unsafe" "github.com/ebitengine/purego" "github.com/mudler/LocalAI/pkg/grpc/base" pb "github.com/mudler/LocalAI/pkg/grpc/proto" ) var ( // omni_load(model_path, codec_path, use_fa, clamp_fp16) int CppLoad func(modelPath, codecPath string, useFA, clampFP16 int) int // omni_tts(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, out_n) -> float* (uintptr) CppTTS func(text, lang, instruct string, refSamples unsafe.Pointer, refN int, refText string, seed int64, denoise int, outN unsafe.Pointer) uintptr // omni_tts_stream(text, lang, instruct, ref_samples, ref_n, ref_text, seed, denoise, cb, user) int CppTTSStream func(text, lang, instruct string, refSamples unsafe.Pointer, refN int, refText string, seed int64, denoise int, cb uintptr, user uintptr) int CppPCMFree func(ptr uintptr) CppUnload func() ) type OmnivoiceCpp struct { base.SingleThread opts loadOptions // audioPath is the model-config reference voice (tts.audio_path), used as // the default voice-cloning reference when a request does not set Voice. audioPath string } func (o *OmnivoiceCpp) Load(opts *pb.ModelOptions) error { model := opts.ModelFile if model == "" { model = opts.ModelPath } if !filepath.IsAbs(model) && opts.ModelPath != "" { model = filepath.Join(opts.ModelPath, model) } o.opts = parseOptions(opts.Options) // Resolve the codec/tokenizer GGUF: explicit option, else auto-discover a // *tokenizer*.gguf sibling of the base model. codec := o.opts.codecPath if codec != "" && !filepath.IsAbs(codec) { codec = filepath.Join(filepath.Dir(model), codec) } if codec == "" { codec = discoverTokenizer(filepath.Dir(model)) } if codec == "" { return fmt.Errorf("omnivoice: no codec/tokenizer GGUF found; set option 'tokenizer:'") } o.opts.codecPath = codec // tts.audio_path (ModelOptions.AudioPath) is the config-level voice-cloning // reference: a default reference WAV used when a request omits Voice. // Resolved relative to the model directory like the codec. o.audioPath = opts.AudioPath if o.audioPath != "" && !filepath.IsAbs(o.audioPath) { o.audioPath = filepath.Join(filepath.Dir(model), o.audioPath) } useFA := boolToInt(o.opts.useFA) clamp := boolToInt(o.opts.clampFP16) fmt.Fprintf(os.Stderr, "[omnivoice-cpp] Load model=%s codec=%s use_fa=%d clamp_fp16=%d\n", model, codec, useFA, clamp) if rc := CppLoad(model, codec, useFA, clamp); rc != 0 { return fmt.Errorf("omnivoice: failed to load model (rc=%d)", rc) } return nil } // discoverTokenizer returns the first *tokenizer*.gguf in dir, or "". func discoverTokenizer(dir string) string { entries, err := os.ReadDir(dir) if err != nil { return "" } for _, e := range entries { name := strings.ToLower(e.Name()) if strings.Contains(name, "tokenizer") && strings.HasSuffix(name, ".gguf") { return filepath.Join(dir, e.Name()) } } return "" } func boolToInt(b bool) int { if b { return 1 } return 0 } // refAudio loads the reference WAV (voice cloning) if voice points to a file. // Returns nil if no cloning (empty or non-path - voice design uses Instructions). func (o *OmnivoiceCpp) refAudio(voice string) ([]float32, error) { v := strings.TrimSpace(voice) if v == "" { return nil, nil } if _, err := os.Stat(v); err != nil { return nil, nil } return readWAVAsFloat(v) } // refAudioFor resolves the cloning reference for a request: the per-request // Voice takes precedence, falling back to the model-config audio_path. Empty // result means no cloning (voice design via Instructions still applies). func (o *OmnivoiceCpp) refAudioFor(req *pb.TTSRequest) ([]float32, error) { voice := strings.TrimSpace(req.Voice) if voice == "" { voice = o.audioPath } return o.refAudio(voice) } func reqParam(req *pb.TTSRequest, key string) string { if req.Params == nil { return "" } return req.Params[key] } func (o *OmnivoiceCpp) seedFor(req *pb.TTSRequest) int64 { if s := reqParam(req, "seed"); s != "" { var n int64 if _, err := fmt.Sscan(s, &n); err == nil { return n } } return o.opts.seed } func optStr(p *string) string { if p == nil { return "" } return *p } func (o *OmnivoiceCpp) TTS(req *pb.TTSRequest) error { if req.Dst == "" { return fmt.Errorf("omnivoice: TTS requires a destination path") } lang := normalizeLanguage(optStr(req.Language)) instruct := optStr(req.Instructions) refText := reqParam(req, "ref_text") seed := o.seedFor(req) ref, err := o.refAudioFor(req) if err != nil { return err } var refPtr unsafe.Pointer if len(ref) > 0 { refPtr = unsafe.Pointer(&ref[0]) } var n int32 ptr := CppTTS(req.Text, lang, instruct, refPtr, len(ref), refText, seed, boolToInt(o.opts.denoise), unsafe.Pointer(&n)) runtimeKeepAlive(ref) if ptr == 0 || n <= 0 { return fmt.Errorf("omnivoice: synthesis failed") } defer CppPCMFree(ptr) src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), int(n)) //nolint:govet // C-allocated PCM, copied out before free out := make([]float32, int(n)) copy(out, src) return writeWAV24k(req.Dst, out) } // streamState carries the active TTSStream channel to the single shared C // callback. base.SingleThread serializes TTS/TTSStream, so one global slot is // safe and avoids leaking a purego callback per request (purego callbacks // cannot be freed and are capped). var ( streamMu sync.Mutex streamChan chan []byte streamCbOnce sync.Once streamCbPtr uintptr ) // streamCallback is registered once and forwards each PCM chunk to streamChan. func streamCallback(samples *float32, nSamples int32, _ uintptr) uintptr { if nSamples <= 0 || samples == nil || streamChan == nil { return 1 // continue } src := unsafe.Slice(samples, int(nSamples)) cp := make([]float32, int(nSamples)) // copy out of C memory before returning copy(cp, src) streamChan <- floatToPCM16LE(cp) return 1 // continue } func (o *OmnivoiceCpp) TTSStream(req *pb.TTSRequest, results chan []byte) error { defer close(results) if req.Text == "" { return fmt.Errorf("omnivoice: TTSStream requires text") } streamCbOnce.Do(func() { streamCbPtr = purego.NewCallback(streamCallback) }) lang := normalizeLanguage(optStr(req.Language)) instruct := optStr(req.Instructions) refText := reqParam(req, "ref_text") seed := o.seedFor(req) ref, err := o.refAudioFor(req) if err != nil { return err } var refPtr unsafe.Pointer if len(ref) > 0 { refPtr = unsafe.Pointer(&ref[0]) } // Emit the WAV header first so the HTTP layer gets a self-describing stream. results <- wavHeader24k() streamMu.Lock() streamChan = results rc := CppTTSStream(req.Text, lang, instruct, refPtr, len(ref), refText, seed, boolToInt(o.opts.denoise), streamCbPtr, 0) streamChan = nil streamMu.Unlock() runtimeKeepAlive(ref) if rc != 0 { return fmt.Errorf("omnivoice: streaming synthesis failed (rc=%d)", rc) } return nil }