package backend import ( "bytes" "context" "encoding/binary" "encoding/json" "fmt" "os" "path/filepath" "github.com/mudler/LocalAI/core/config" laudio "github.com/mudler/LocalAI/pkg/audio" "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/model" "github.com/mudler/LocalAI/pkg/utils" ) func ModelTTS( text, voice, language string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig, ) (string, *proto.Result, error) { opts := ModelOptions(modelConfig, appConfig) ttsModel, err := loader.Load(opts...) if err != nil { return "", nil, err } if ttsModel == nil { return "", nil, fmt.Errorf("could not load tts model %q", modelConfig.Model) } audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio") if err := os.MkdirAll(audioDir, 0750); err != nil { return "", nil, fmt.Errorf("failed creating audio directory: %s", err) } fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav") filePath := filepath.Join(audioDir, fileName) // We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect. // This should be addressed in a follow up PR soon. // Copying it over nearly verbatim, as TTS backends are not functional without this. modelPath := "" // Checking first that it exists and is not outside ModelPath // TODO: we should actually first check if the modelFile is looking like // a FS path mp := filepath.Join(loader.ModelPath, modelConfig.Model) if _, err := os.Stat(mp); err == nil { if err := utils.VerifyPath(mp, appConfig.SystemState.Model.ModelsPath); err != nil { return "", nil, err } modelPath = mp } else { modelPath = modelConfig.Model // skip this step if it fails????? } res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{ Text: text, Model: modelPath, Voice: voice, Dst: filePath, Language: &language, }) if err != nil { return "", nil, err } // return RPC error if any if !res.Success { return "", nil, fmt.Errorf("error during TTS: %s", res.Message) } return filePath, res, err } func ModelTTSStream( text, voice, language string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig, audioCallback func([]byte) error, ) error { opts := ModelOptions(modelConfig, appConfig) ttsModel, err := loader.Load(opts...) if err != nil { return err } if ttsModel == nil { return fmt.Errorf("could not load tts model %q", modelConfig.Model) } // We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect. // This should be addressed in a follow up PR soon. // Copying it over nearly verbatim, as TTS backends are not functional without this. modelPath := "" // Checking first that it exists and is not outside ModelPath // TODO: we should actually first check if the modelFile is looking like // a FS path mp := filepath.Join(loader.ModelPath, modelConfig.Model) if _, err := os.Stat(mp); err == nil { if err := utils.VerifyPath(mp, appConfig.SystemState.Model.ModelsPath); err != nil { return err } modelPath = mp } else { modelPath = modelConfig.Model // skip this step if it fails????? } var sampleRate uint32 = 16000 // default headerSent := false var callbackErr error err = ttsModel.TTSStream(context.Background(), &proto.TTSRequest{ Text: text, Model: modelPath, Voice: voice, Language: &language, }, func(reply *proto.Reply) { // First message contains sample rate info if !headerSent && len(reply.Message) > 0 { var info map[string]interface{} if json.Unmarshal(reply.Message, &info) == nil { if sr, ok := info["sample_rate"].(float64); ok { sampleRate = uint32(sr) } } // Send WAV header with placeholder size (0xFFFFFFFF for streaming) header := laudio.WAVHeader{ ChunkID: [4]byte{'R', 'I', 'F', 'F'}, ChunkSize: 0xFFFFFFFF, // Unknown size for streaming Format: [4]byte{'W', 'A', 'V', 'E'}, Subchunk1ID: [4]byte{'f', 'm', 't', ' '}, Subchunk1Size: 16, AudioFormat: 1, // PCM NumChannels: 1, // Mono SampleRate: sampleRate, ByteRate: sampleRate * 2, // SampleRate * BlockAlign BlockAlign: 2, // 16-bit = 2 bytes BitsPerSample: 16, Subchunk2ID: [4]byte{'d', 'a', 't', 'a'}, Subchunk2Size: 0xFFFFFFFF, // Unknown size for streaming } var buf bytes.Buffer if writeErr := binary.Write(&buf, binary.LittleEndian, header); writeErr != nil { callbackErr = writeErr return } if writeErr := audioCallback(buf.Bytes()); writeErr != nil { callbackErr = writeErr return } headerSent = true } // Stream audio chunks if len(reply.Audio) > 0 { if writeErr := audioCallback(reply.Audio); writeErr != nil { callbackErr = writeErr } } }) if callbackErr != nil { return callbackErr } return err }