mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-19 22:29:10 -04:00
feat(crispasr): add word-level timestamp support (#10403)
* feat(crispasr): add word-level timestamp support Add word-level timestamp extraction to the crispasr backend by calling the CrispASR C library's word accessor functions that are already exported by libgocrispasr but were not previously bound by the Go wrapper. Two families of word functions are supported: 1. Session-based (get_word_count/text/t0/t1) — works per-segment for whisper-like backends. 2. Parakeet-specific (get_parakeet_word_count/text/t0/t1) — returns a global word list for TDT/CTC/RNNT parakeet models where the session API does not expose per-segment word data. The Go code tries session-based first and falls back to parakeet-specific when the session word count is zero. Depends on #10402 (grpc server Words forwarding) for the words to reach the HTTP response. Signed-off-by: fqscfqj <fqscfqj@outlook.com> * fix(crispasr): use portable sed -i.bak for macOS compatibility BSD sed requires -i '' for in-place editing while GNU sed uses -i. Replace with -i.bak which works on both platforms, then remove the backup file. Signed-off-by: fqscfqj <fqscfqj@outlook.com> --------- Signed-off-by: fqscfqj <fqscfqj@outlook.com>
This commit is contained in:
@@ -67,7 +67,7 @@ sources/CrispASR:
|
||||
# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
|
||||
# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
|
||||
# which is correct both standalone and as a subproject. Idempotent.
|
||||
sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
|
||||
sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak
|
||||
|
||||
# Detect OS
|
||||
UNAME_S := $(shell uname -s)
|
||||
|
||||
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
|
||||
g_abort.store(v, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
// --- word-level timestamp accessors ---
|
||||
// Forward declarations of the word accessor functions this wrapper calls.
// Only prototypes are given here; the definitions are expected to come from
// the CrispASR library at link time.
// NOTE(review): these prototypes are hand-written rather than pulled from a
// library header, so they must be kept in sync with the library's real
// signatures — confirm parameter and return types against the CrispASR C API.
extern "C" {
|
||||
// Session-based accessors: words are addressed by (segment index, word index).
int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
|
||||
const char *crispasr_session_result_word_text(crispasr_session_result *r,
|
||||
int seg_i, int word_i);
|
||||
int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
|
||||
int word_i);
|
||||
int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
|
||||
int word_i);
|
||||
|
||||
// Parakeet-specific word accessors: words are addressed by a single word
// index, and the result is declared here as an untyped void pointer.
|
||||
int crispasr_parakeet_result_n_words(void *r);
|
||||
const char *crispasr_parakeet_result_word_text(void *r, int word_i);
|
||||
int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
|
||||
int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
|
||||
}
|
||||
|
||||
// Returns the current transcription result (g_result) as an opaque pointer.
// The pointer is handed out unchanged; the word accessors in this file treat
// a null g_result as "no result", so callers should expect null here as well.
void *get_result(void) { return g_result; }
|
||||
|
||||
// Number of words in segment seg_i of the current result, as reported by
// crispasr_session_result_n_words. Returns 0 when no result is available.
// seg_i is forwarded unchanged; no range check is performed here.
int get_word_count(int seg_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_n_words(g_result, seg_i);
|
||||
}
|
||||
|
||||
// Text of word word_i in segment seg_i of the current result. Returns an
// empty string when no result is available; otherwise the pointer obtained
// from crispasr_session_result_word_text is returned unchanged.
// NOTE(review): the returned pointer is presumably owned by the result object
// and may be null for out-of-range indices — confirm against the library and
// copy the text before the result is released.
const char *get_word_text(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return "";
|
||||
return crispasr_session_result_word_text(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
// t0 (start) timestamp of word word_i in segment seg_i, taken directly from
// crispasr_session_result_word_t0 with no unit conversion. Returns 0 when no
// result is available, which callers cannot tell apart from a genuine 0
// timestamp. Indices are forwarded unchanged; no range check is done here.
int64_t get_word_t0(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_word_t0(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
// t1 (end) timestamp of word word_i in segment seg_i, taken directly from
// crispasr_session_result_word_t1 with no unit conversion. Returns 0 when no
// result is available, which callers cannot tell apart from a genuine 0
// timestamp. Indices are forwarded unchanged; no range check is done here.
int64_t get_word_t1(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_word_t1(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
// Parakeet-specific word accessors
|
||||
// Number of words in the parakeet word list of the current result, as
// reported by crispasr_parakeet_result_n_words. Unlike get_word_count there is
// no segment index. Returns 0 when no result is available.
// NOTE(review): g_result is the same pointer that is passed to the
// crispasr_session_result_* accessors, and it reaches the parakeet accessor
// through an untyped void* parameter — confirm the library accepts that
// object here.
int get_parakeet_word_count(void) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_n_words(g_result);
|
||||
}
|
||||
|
||||
// Text of word word_i in the parakeet word list of the current result.
// Returns an empty string when no result is available; otherwise the pointer
// obtained from crispasr_parakeet_result_word_text is returned unchanged.
// NOTE(review): the returned pointer is presumably owned by the result object
// and may be null for out-of-range indices — confirm against the library.
const char *get_parakeet_word_text(int word_i) {
|
||||
if (!g_result)
|
||||
return "";
|
||||
return crispasr_parakeet_result_word_text(g_result, word_i);
|
||||
}
|
||||
|
||||
// t0 (start) timestamp of word word_i in the parakeet word list, taken
// directly from crispasr_parakeet_result_word_t0 with no unit conversion.
// Returns 0 when no result is available, which callers cannot tell apart from
// a genuine 0 timestamp. word_i is forwarded unchanged; no range check here.
int64_t get_parakeet_word_t0(int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_word_t0(g_result, word_i);
|
||||
}
|
||||
|
||||
// t1 (end) timestamp of word word_i in the parakeet word list, taken
// directly from crispasr_parakeet_result_word_t1 with no unit conversion.
// Returns 0 when no result is available, which callers cannot tell apart from
// a genuine 0 timestamp. word_i is forwarded unchanged; no range check here.
int64_t get_parakeet_word_t1(int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_word_t1(g_result, word_i);
|
||||
}
|
||||
|
||||
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
||||
void *data) {
|
||||
const char *level_str;
|
||||
|
||||
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
|
||||
void tts_free(float *pcm);
|
||||
int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
|
||||
int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
|
||||
|
||||
// --- word-level timestamp accessors ---
|
||||
// Session-based (works for whisper-like backends)
|
||||
void *get_result(void);
|
||||
int get_word_count(int seg_i);
|
||||
const char *get_word_text(int seg_i, int word_i);
|
||||
int64_t get_word_t0(int seg_i, int word_i);
|
||||
int64_t get_word_t1(int seg_i, int word_i);
|
||||
|
||||
// Parakeet-specific (global word list, no segment index)
|
||||
int get_parakeet_word_count(void);
|
||||
const char *get_parakeet_word_text(int word_i);
|
||||
int64_t get_parakeet_word_t0(int word_i);
|
||||
int64_t get_parakeet_word_t1(int word_i);
|
||||
}
|
||||
|
||||
@@ -34,6 +34,18 @@ var (
|
||||
CppTTSFree func(ptr uintptr)
|
||||
CppTTSSetVoice func(name string) int
|
||||
CppTTSSetVoiceFile func(path string, refText string) int
|
||||
|
||||
// Word-level timestamp accessors (session-based, per-segment)
|
||||
CppGetWordCount func(segI int) int
|
||||
CppGetWordText func(segI int, wordI int) string
|
||||
CppGetWordT0 func(segI int, wordI int) int64
|
||||
CppGetWordT1 func(segI int, wordI int) int64
|
||||
|
||||
// Parakeet-specific word accessors (global, no segment index)
|
||||
CppGetParakeetWordCount func() int
|
||||
CppGetParakeetWordText func(wordI int) string
|
||||
CppGetParakeetWordT0 func(wordI int) int64
|
||||
CppGetParakeetWordT1 func(wordI int) int64
|
||||
)
|
||||
|
||||
type CrispASR struct {
|
||||
@@ -290,10 +302,36 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
|
||||
// IDs, so Tokens is left empty.
|
||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "\uFFFD")
|
||||
|
||||
// Populate word-level timestamps. Try session-based functions first
|
||||
// (per-segment); fall back to parakeet-specific functions (global word
|
||||
// list with no segment index — only populated on the first segment to
|
||||
// avoid duplication).
|
||||
words := []*pb.TranscriptWord{}
|
||||
wordCount := CppGetWordCount(i)
|
||||
if wordCount == 0 && i == 0 {
|
||||
wordCount = CppGetParakeetWordCount()
|
||||
for j := 0; j < wordCount; j++ {
|
||||
words = append(words, &pb.TranscriptWord{
|
||||
Start: CppGetParakeetWordT0(j) * (10000000),
|
||||
End: CppGetParakeetWordT1(j) * (10000000),
|
||||
Text: strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "\uFFFD"),
|
||||
})
|
||||
}
|
||||
} else {
|
||||
for j := 0; j < wordCount; j++ {
|
||||
words = append(words, &pb.TranscriptWord{
|
||||
Start: CppGetWordT0(i, j) * (10000000),
|
||||
End: CppGetWordT1(i, j) * (10000000),
|
||||
Text: strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "\uFFFD"),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
segment := &pb.TranscriptSegment{
|
||||
Id: int32(i),
|
||||
Text: txt,
|
||||
Start: s, End: t,
|
||||
Words: words,
|
||||
}
|
||||
|
||||
segments = append(segments, segment)
|
||||
|
||||
@@ -44,6 +44,14 @@ func main() {
|
||||
{&CppTTSFree, "tts_free"},
|
||||
{&CppTTSSetVoice, "tts_set_voice"},
|
||||
{&CppTTSSetVoiceFile, "tts_set_voice_file"},
|
||||
{&CppGetWordCount, "get_word_count"},
|
||||
{&CppGetWordText, "get_word_text"},
|
||||
{&CppGetWordT0, "get_word_t0"},
|
||||
{&CppGetWordT1, "get_word_t1"},
|
||||
{&CppGetParakeetWordCount, "get_parakeet_word_count"},
|
||||
{&CppGetParakeetWordText, "get_parakeet_word_text"},
|
||||
{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
|
||||
{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
|
||||
}
|
||||
|
||||
for _, lf := range libFuncs {
|
||||
|
||||
Reference in New Issue
Block a user