mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-26 17:37:07 -04:00
* feat(crispasr): add word-level timestamp support Add word-level timestamp extraction to the crispasr backend by calling the CrispASR C library's word accessor functions that are already exported by libgocraspasr but were not previously bound by the Go wrapper. Two families of word functions are supported: 1. Session-based (get_word_count/text/t0/t1) — works per-segment for whisper-like backends. 2. Parakeet-specific (get_parakeet_word_count/text/t0/t1) — returns a global word list for TDT/CTC/RNNT parakeet models where the session API does not expose per-segment word data. The Go code tries session-based first and falls back to parakeet-specific when the session word count is zero. Depends on #10402 (grpc server Words forwarding) for the words to reach the HTTP response. Signed-off-by: fqscfqj <fqscfqj@outlook.com> * fix(crispasr): use portable sed -i.bak for macOS compatibility BSD sed requires -i '' for in-place editing while GNU sed uses -i. Replace with -i.bak which works on both platforms, then remove the backup file. Signed-off-by: fqscfqj <fqscfqj@outlook.com> --------- Signed-off-by: fqscfqj <fqscfqj@outlook.com>
38 lines
1.5 KiB
C++
38 lines
1.5 KiB
C++
#include <cstddef>
|
|
#include <cstdint>
|
|
|
|
// C-linkage entry points of the backend wrapper. Only declarations live
// here; the definitions are in another translation unit.
// NOTE(review): return-code conventions for the `int` functions are not
// visible in this header, except where a comment below says otherwise.
extern "C" {

// --- model loading / configuration ---

// Loads the model file at `model_path` for the backend called `backend_name`.
// `threads` is presumably the worker-thread count — confirm in the definition.
int load_model(const char *const model_path, int threads,
               const char *backend_name);

// Sets the codec path to `path`.
int set_codec_path(const char *path);

// Loads the voice-activity-detection model file at `model_path`.
int load_model_vad(const char *const model_path);

// --- voice activity detection / transcription ---

// Runs VAD over `pcmf32_size` float samples in `pcmf32`. Results are handed
// back through `segs_out` / `segs_out_len`.
// NOTE(review): layout and ownership of `*segs_out` are not visible here —
// confirm who frees it before relying on it.
int vad(float pcmf32[], size_t pcmf32_size, float **segs_out,
        size_t *segs_out_len);

// Transcribes `pcmf32_len` float samples in `pcmf32`. `lang`, `translate`,
// `diarize` and `prompt` select the transcription options their names imply;
// `*segs_out_len` presumably receives the resulting segment count, which
// bounds the index passed to the get_segment_* accessors — TODO confirm.
int transcribe(uint32_t threads, char *lang, bool translate, bool diarize,
               float pcmf32[], size_t pcmf32_len, size_t *segs_out_len,
               char *prompt);

// Per-segment accessors; `i` is a segment index.
// NOTE(review): the time unit of t0/t1 and the lifetime of the returned
// string are not stated in this header — confirm.
const char *get_segment_text(int i);
int64_t get_segment_t0(int i);
int64_t get_segment_t1(int i);

// Returns a string describing the backend (presumably the active backend's
// name — confirm).
const char *get_backend(void);

// Sets the abort flag to `v`.
void set_abort(int v);

// --- text to speech ---

// Synthesizes `text`: 24kHz mono float, malloc'd; NULL on failure.
// The sample count is written to `*out_n_samples`. The buffer is presumably
// released with tts_free() — confirm.
float *tts_synthesize(const char *text, int *out_n_samples);

// Frees a PCM buffer `pcm`.
void tts_free(float *pcm);

// Best-effort speaker selection by `name`; 0 ok.
int tts_set_voice(const char *name);

// Loads a voice pack (.gguf) or a zero-shot clone (.wav + `ref_text`) from
// `path`.
int tts_set_voice_file(const char *path, const char *ref_text);

// --- word-level timestamp accessors ---
// Callers are expected to try the session-based family first and fall back
// to the parakeet-specific family when the session word count is zero.

// Session-based (works per segment, for whisper-like backends).
// `seg_i` is a segment index and `word_i` a word index within that segment.

// Returns an opaque pointer to the current result.
// NOTE(review): the pointee type and its lifetime are not visible here.
void *get_result(void);

int get_word_count(int seg_i);
const char *get_word_text(int seg_i, int word_i);
int64_t get_word_t0(int seg_i, int word_i);
int64_t get_word_t1(int seg_i, int word_i);

// Parakeet-specific (global word list, no segment index). Used for parakeet
// models, where the session-based accessors expose no per-segment word data.
int get_parakeet_word_count(void);
const char *get_parakeet_word_text(int word_i);
int64_t get_parakeet_word_t0(int word_i);
int64_t get_parakeet_word_t1(int word_i);

}
|