diff --git a/backend/go/crispasr/Makefile b/backend/go/crispasr/Makefile index 42a7a7555..bbc84f1de 100644 --- a/backend/go/crispasr/Makefile +++ b/backend/go/crispasr/Makefile @@ -67,7 +67,7 @@ sources/CrispASR: # it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources # aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root), # which is correct both standalone and as a subproject. Idempotent. - sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt + sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak # Detect OS UNAME_S := $(shell uname -s) diff --git a/backend/go/crispasr/cpp/crispasr_shim.cpp b/backend/go/crispasr/cpp/crispasr_shim.cpp index bf6151ae1..60dbfd86b 100644 --- a/backend/go/crispasr/cpp/crispasr_shim.cpp +++ b/backend/go/crispasr/cpp/crispasr_shim.cpp @@ -47,6 +47,74 @@ extern "C" void set_abort(int v) { g_abort.store(v, std::memory_order_relaxed); } +// --- word-level timestamp accessors (external functions forward-declared here with C linkage; the get_* wrappers below call them) --- +extern "C" { +int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i); +const char *crispasr_session_result_word_text(crispasr_session_result *r, + int seg_i, int word_i); +int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i, + int word_i); +int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i, + int word_i); + +// Parakeet-specific word accessors: declared as taking an opaque void * result and a flat word index (no segment index) +int crispasr_parakeet_result_n_words(void *r); +const char *crispasr_parakeet_result_word_text(void *r, int word_i); +int64_t crispasr_parakeet_result_word_t0(void *r, int word_i); +int64_t crispasr_parakeet_result_word_t1(void *r, int word_i); +} + +void *get_result(void) { return g_result; } + +int get_word_count(int seg_i) { + if (!g_result) + return 0; + return 
crispasr_session_result_n_words(g_result, seg_i); +} + +const char *get_word_text(int seg_i, int word_i) { + if (!g_result) + return ""; + return crispasr_session_result_word_text(g_result, seg_i, word_i); +} + +int64_t get_word_t0(int seg_i, int word_i) { + if (!g_result) + return 0; + return crispasr_session_result_word_t0(g_result, seg_i, word_i); +} + +int64_t get_word_t1(int seg_i, int word_i) { + if (!g_result) + return 0; + return crispasr_session_result_word_t1(g_result, seg_i, word_i); +} + +// Parakeet-specific word accessors; like the session wrappers they return 0 / "" while g_result is null. NOTE(review): they pass g_result, the same pointer handed to the crispasr_session_result_* calls, to functions declared as taking void * -- confirm it is a valid parakeet result +int get_parakeet_word_count(void) { + if (!g_result) + return 0; + return crispasr_parakeet_result_n_words(g_result); +} + +const char *get_parakeet_word_text(int word_i) { + if (!g_result) + return ""; + return crispasr_parakeet_result_word_text(g_result, word_i); +} + +int64_t get_parakeet_word_t0(int word_i) { + if (!g_result) + return 0; + return crispasr_parakeet_result_word_t0(g_result, word_i); +} + +int64_t get_parakeet_word_t1(int word_i) { + if (!g_result) + return 0; + return crispasr_parakeet_result_word_t1(g_result, word_i); +} + static void ggml_log_cb(enum ggml_log_level level, const char *log, void *data) { const char *level_str; diff --git a/backend/go/crispasr/cpp/crispasr_shim.h b/backend/go/crispasr/cpp/crispasr_shim.h index 7c593951a..c7baa41f4 100644 --- a/backend/go/crispasr/cpp/crispasr_shim.h +++ b/backend/go/crispasr/cpp/crispasr_shim.h @@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float void tts_free(float *pcm); int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text) + +// --- word-level timestamp accessors --- +// Session-based (works for whisper-like backends); the get_word_* wrappers return 0 or "" when no result is stored +void *get_result(void); +int get_word_count(int seg_i); +const char *get_word_text(int seg_i, int word_i); +int64_t get_word_t0(int seg_i, int word_i); 
+int64_t get_word_t1(int seg_i, int word_i); + +// Parakeet-specific (global word list, no segment index) +int get_parakeet_word_count(void); +const char *get_parakeet_word_text(int word_i); +int64_t get_parakeet_word_t0(int word_i); +int64_t get_parakeet_word_t1(int word_i); } diff --git a/backend/go/crispasr/gocrispasr.go b/backend/go/crispasr/gocrispasr.go index 5c3528d38..af1f1a95c 100644 --- a/backend/go/crispasr/gocrispasr.go +++ b/backend/go/crispasr/gocrispasr.go @@ -34,6 +34,18 @@ var ( CppTTSFree func(ptr uintptr) CppTTSSetVoice func(name string) int CppTTSSetVoiceFile func(path string, refText string) int + + // Word-level timestamp accessors (session-based, per-segment); paired with the get_word_* symbol names in main.go's libFuncs table + CppGetWordCount func(segI int) int + CppGetWordText func(segI int, wordI int) string + CppGetWordT0 func(segI int, wordI int) int64 + CppGetWordT1 func(segI int, wordI int) int64 + + // Parakeet-specific word accessors (global, no segment index); paired with the get_parakeet_word_* symbol names in main.go's libFuncs table + CppGetParakeetWordCount func() int + CppGetParakeetWordText func(wordI int) string + CppGetParakeetWordT0 func(wordI int) int64 + CppGetParakeetWordT1 func(wordI int) int64 ) type CrispASR struct { @@ -290,10 +302,36 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe // IDs, so Tokens is left empty. txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "�") + // Populate word-level timestamps. Try session-based functions first + // (per-segment); fall back to parakeet-specific functions (global word + // list with no segment index — only populated on the first segment to + // avoid duplication). The fallback branch is taken only when segment 0 reports zero session words. NOTE(review): word times are multiplied by 10000000 -- presumably a tick-to-nanosecond conversion; confirm against the units used for the segment Start/End. 
+ words := []*pb.TranscriptWord{} + wordCount := CppGetWordCount(i) + if wordCount == 0 && i == 0 { + wordCount = CppGetParakeetWordCount() + for j := 0; j < wordCount; j++ { + words = append(words, &pb.TranscriptWord{ + Start: CppGetParakeetWordT0(j) * (10000000), + End: CppGetParakeetWordT1(j) * (10000000), + Text: strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "�"), + }) + } + } else { + for j := 0; j < wordCount; j++ { + words = append(words, &pb.TranscriptWord{ + Start: CppGetWordT0(i, j) * (10000000), + End: CppGetWordT1(i, j) * (10000000), + Text: strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "�"), + }) + } + } + segment := &pb.TranscriptSegment{ Id: int32(i), Text: txt, Start: s, End: t, + Words: words, } segments = append(segments, segment) diff --git a/backend/go/crispasr/main.go b/backend/go/crispasr/main.go index c2069bd85..9f3ef14d0 100644 --- a/backend/go/crispasr/main.go +++ b/backend/go/crispasr/main.go @@ -44,6 +44,14 @@ func main() { {&CppTTSFree, "tts_free"}, {&CppTTSSetVoice, "tts_set_voice"}, {&CppTTSSetVoiceFile, "tts_set_voice_file"}, + {&CppGetWordCount, "get_word_count"}, + {&CppGetWordText, "get_word_text"}, + {&CppGetWordT0, "get_word_t0"}, + {&CppGetWordT1, "get_word_t1"}, + {&CppGetParakeetWordCount, "get_parakeet_word_count"}, + {&CppGetParakeetWordText, "get_parakeet_word_text"}, + {&CppGetParakeetWordT0, "get_parakeet_word_t0"}, + {&CppGetParakeetWordT1, "get_parakeet_word_t1"}, } for _, lf := range libFuncs {