feat: support word-level timestamps for faster-whisper (#9621)

Signed-off-by: Andreas Egli <github@kharan.ch>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
Andreas Egli
2026-05-06 00:32:52 +02:00
committed by GitHub
parent a315c321c1
commit af83518532
6 changed files with 146 additions and 11 deletions

View File

@@ -355,6 +355,12 @@ message TranscriptStreamResponse {
TranscriptResult final_result = 2;
}
// TranscriptWord is a single word of a transcript together with its timing.
// It is carried in TranscriptSegment.words.
// start and end are integer timestamps: the Python backend fills them with
// seconds * 1e9 and the Go consumer converts them with time.Duration, so the
// unit on the wire is nanoseconds.
message TranscriptWord {
int64 start = 1;
int64 end = 2;
string text = 3;
}
message TranscriptSegment {
int32 id = 1;
int64 start = 2;
@@ -362,6 +368,7 @@ message TranscriptSegment {
string text = 4;
repeated int32 tokens = 5;
string speaker = 6;
repeated TranscriptWord words = 7;
}
message GenerateImageRequest {

View File

@@ -55,11 +55,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
resultSegments = []
text = ""
try:
segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False)
word_timestamps = "word" in request.timestamp_granularities
segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False, word_timestamps=word_timestamps)
id = 0
for segment in segments:
print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
resultSegments.append(backend_pb2.TranscriptSegment(id=id, start=int(segment.start*1e9), end=int(segment.end*1e9), text=segment.text))
words = []
if word_timestamps and hasattr(segment, 'words'):
for word in segment.words:
words.append(backend_pb2.TranscriptWord(
start=int(word.start * 1e9),
end=int(word.end * 1e9),
text=word.word
))
resultSegments.append(backend_pb2.TranscriptSegment(
id=id,
start=int(segment.start * 1e9),
end=int(segment.end * 1e9),
text=segment.text,
words=words
))
text += segment.text
id += 1
except Exception as err:

View File

@@ -179,11 +179,22 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
Language: r.Language,
Duration: float64(r.Duration),
}
for _, s := range r.Segments {
var tks []int
for _, t := range s.Tokens {
tks = append(tks, int(t))
}
var words []schema.TranscriptionWord
for _, w := range s.Words {
var word = schema.TranscriptionWord {
Start: time.Duration(w.Start),
End: time.Duration(w.End),
Text: w.Text,
}
words = append(words, word)
tr.Words = append(tr.Words, word)
}
tr.Segments = append(tr.Segments,
schema.TranscriptionSegment{
Text: s.Text,
@@ -192,6 +203,7 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
End: time.Duration(s.End),
Tokens: tks,
Speaker: s.Speaker,
Words: words,
})
}
return tr

View File

@@ -81,14 +81,48 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
fmt.Println(schema.TranscriptionResponse(tr, t.ResponseFormat))
case schema.TranscriptionResponseFormatJson:
tr.Segments = nil
tr.Words = nil
fallthrough
case schema.TranscriptionResponseFormatJsonVerbose:
trs := schema.TranscriptionResultSeconds{
Text: tr.Text,
Language: tr.Language,
Duration: tr.Duration,
Words: []schema.TranscriptionWordSeconds{},
Segments: []schema.TranscriptionSegmentSeconds{},
}
for _, word := range(tr.Words) {
trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{
Start: word.Start.Seconds(),
End: word.End.Seconds(),
Text: word.Text,
})
}
for _, seg := range(tr.Segments) {
segWords := []schema.TranscriptionWordSeconds{}
for _, word := range(seg.Words) {
segWords = append(segWords, schema.TranscriptionWordSeconds{
Start: word.Start.Seconds(),
End: word.End.Seconds(),
Text: word.Text,
})
}
trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{
Id: seg.Id,
Start: seg.Start.Seconds(),
End: seg.End.Seconds(),
Text: seg.Text,
Tokens: seg.Tokens,
Speaker: seg.Speaker,
Words: segWords,
})
}
var mtr []byte
var err error
if t.PrettyPrint {
mtr, err = json.MarshalIndent(tr, "", " ")
mtr, err = json.MarshalIndent(trs, "", " ")
} else {
mtr, err = json.Marshal(tr)
mtr, err = json.Marshal(trs)
}
if err != nil {
return err

View File

@@ -138,9 +138,43 @@ func TranscriptEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
return c.String(http.StatusOK, schema.TranscriptionResponse(tr, responseFormat))
case schema.TranscriptionResponseFormatJson:
tr.Segments = nil
tr.Words = nil
fallthrough
case schema.TranscriptionResponseFormatJsonVerbose, "": // maintain backwards compatibility
return c.JSON(http.StatusOK, tr)
trs := schema.TranscriptionResultSeconds{
Text: tr.Text,
Language: tr.Language,
Duration: tr.Duration,
Words: []schema.TranscriptionWordSeconds{},
Segments: []schema.TranscriptionSegmentSeconds{},
}
for _, word := range(tr.Words) {
trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{
Start: word.Start.Seconds(),
End: word.End.Seconds(),
Text: word.Text,
})
}
for _, seg := range(tr.Segments) {
segWords := []schema.TranscriptionWordSeconds{}
for _, word := range(seg.Words) {
segWords = append(segWords, schema.TranscriptionWordSeconds{
Start: word.Start.Seconds(),
End: word.End.Seconds(),
Text: word.Text,
})
}
trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{
Id: seg.Id,
Start: seg.Start.Seconds(),
End: seg.End.Seconds(),
Text: seg.Text,
Tokens: seg.Tokens,
Speaker: seg.Speaker,
Words: segWords,
})
}
return c.JSON(http.StatusOK, trs)
default:
return errors.New("invalid response_format")
}

View File

@@ -3,17 +3,49 @@ package schema
import "time"
// TranscriptionSegment is one segment of a transcription in the internal
// representation. Start and End are time.Duration values, so encoding/json
// writes them as integer nanosecond counts rather than seconds.
// Words holds the per-word timings of the segment and is omitted from the
// JSON output when empty.
// NOTE(review): the field list appears twice below because this view shows
// both sides of the diff. Presumably the first six fields are the removed
// originals and the seven that follow are their replacement; confirm against
// the real file before relying on this text as compilable Go.
type TranscriptionSegment struct {
Id int `json:"id"`
Start time.Duration `json:"start"`
End time.Duration `json:"end"`
Text string `json:"text"`
Tokens []int `json:"tokens"`
Speaker string `json:"speaker,omitempty"`
Id int `json:"id"`
Start time.Duration `json:"start"`
End time.Duration `json:"end"`
Text string `json:"text"`
Tokens []int `json:"tokens"`
Speaker string `json:"speaker,omitempty"`
Words []TranscriptionWord `json:"words,omitempty"`
}
// TranscriptionWord is a single transcribed word with its start and end
// offsets in the internal representation. Start and End are time.Duration
// values, so encoding/json writes them as integer nanosecond counts.
type TranscriptionWord struct {
Start time.Duration `json:"start"`
End time.Duration `json:"end"`
Text string `json:"text"`
}
// TranscriptionResult is the internal result of a transcription request.
// Segments holds the per-segment output. Words is the flat list of words
// across all segments: transcriptResultFromProto appends every segment's
// words to it in segment order.
// Segments, Words, Language and Duration are omitted from JSON when empty or
// zero; Text is always written.
type TranscriptionResult struct {
Segments []TranscriptionSegment `json:"segments,omitempty"`
Words []TranscriptionWord `json:"words,omitempty"`
Text string `json:"text"`
Language string `json:"language,omitempty"`
Duration float64 `json:"duration,omitempty"`
}
// TranscriptionSegmentSeconds mirrors TranscriptionSegment for JSON output,
// with Start and End expressed as floating-point seconds. Callers fill them
// from time.Duration.Seconds() of the corresponding segment.
// Words is omitted from the JSON when it is nil or empty.
type TranscriptionSegmentSeconds struct {
Id int `json:"id"`
Start float64 `json:"start"`
End float64 `json:"end"`
Text string `json:"text"`
Tokens []int `json:"tokens"`
Speaker string `json:"speaker,omitempty"`
Words []TranscriptionWordSeconds `json:"words,omitempty"`
}
// TranscriptionWordSeconds mirrors TranscriptionWord for JSON output, with
// Start and End expressed as floating-point seconds. Callers fill them from
// time.Duration.Seconds() of the corresponding word.
type TranscriptionWordSeconds struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Text string `json:"text"`
}
// TranscriptionResultSeconds is the JSON-facing form of TranscriptionResult
// in which segment and word times are floating-point seconds. The
// transcription CLI command and the HTTP endpoint build it from a
// TranscriptionResult and serialize it for the
// TranscriptionResponseFormatJson and TranscriptionResponseFormatJsonVerbose
// cases.
// Segments and Words carry omitempty, so a nil or zero-length slice is left
// out of the encoded object entirely rather than written as [].
type TranscriptionResultSeconds struct {
Segments []TranscriptionSegmentSeconds `json:"segments,omitempty"`
Words []TranscriptionWordSeconds `json:"words,omitempty"`
Text string `json:"text"`
Language string `json:"language,omitempty"`
Duration float64 `json:"duration,omitempty"`
}