feat: support word-level timestamps for faster-whisper (#9621)

Signed-off-by: Andreas Egli <github@kharan.ch>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-07-05 05:47:50 -04:00 · 2026-05-06 00:32:52 +02:00
parent a315c321c1
commit af83518532
6 changed files with 146 additions and 11 deletions
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -355,6 +355,12 @@ message TranscriptStreamResponse {
  TranscriptResult final_result = 2;
 }

+message TranscriptWord { // one word of a transcript segment with its own time span
+  int64 start = 1; // word start; read on the Go side as a time.Duration, i.e. nanoseconds
+  int64 end = 2; // word end; same unit as start
+  string text = 3; // the word text as reported by the backend
+}
+
 message TranscriptSegment {
  int32 id = 1;
  int64 start = 2;
@@ -362,6 +368,7 @@ message TranscriptSegment {
  string text = 4;
  repeated int32 tokens = 5;
  string speaker = 6;
+  repeated TranscriptWord words = 7;
 }

 message GenerateImageRequest {
--- a/backend/python/faster-whisper/backend.py
+++ b/backend/python/faster-whisper/backend.py
@@ -55,11 +55,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        resultSegments = []
        text = ""
        try:
-            segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False)
+            word_timestamps = "word" in request.timestamp_granularities
+            segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False, word_timestamps=word_timestamps)
            id = 0
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
-                resultSegments.append(backend_pb2.TranscriptSegment(id=id, start=int(segment.start*1e9), end=int(segment.end*1e9), text=segment.text))
+                words = []
+                if word_timestamps and hasattr(segment, 'words'):
+                    for word in segment.words:
+                        words.append(backend_pb2.TranscriptWord(
+                            start=int(word.start * 1e9),
+                            end=int(word.end * 1e9),
+                            text=word.word
+                        ))
+
+                resultSegments.append(backend_pb2.TranscriptSegment(
+                    id=id,
+                    start=int(segment.start * 1e9),
+                    end=int(segment.end * 1e9),
+                    text=segment.text,
+                    words=words
+                ))
                text += segment.text
                id += 1
        except Exception as err:
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -179,11 +179,22 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
 		Language: r.Language,
 		Duration: float64(r.Duration),
 	}
+
 	for _, s := range r.Segments {
 		var tks []int
 		for _, t := range s.Tokens {
 			tks = append(tks, int(t))
 		}
+		var words []schema.TranscriptionWord
+		for _, w := range s.Words {
+			var word = schema.TranscriptionWord {
+				Start: time.Duration(w.Start),
+				End:   time.Duration(w.End),
+				Text:  w.Text,
+			}
+			words    = append(words, word)
+			tr.Words = append(tr.Words, word)
+		}
 		tr.Segments = append(tr.Segments,
 			schema.TranscriptionSegment{
 				Text:    s.Text,
@@ -192,6 +203,7 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
 				End:     time.Duration(s.End),
 				Tokens:  tks,
 				Speaker: s.Speaker,
+				Words:   words,
 			})
 	}
 	return tr
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -81,14 +81,48 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 		fmt.Println(schema.TranscriptionResponse(tr, t.ResponseFormat))
 	case schema.TranscriptionResponseFormatJson:
 		tr.Segments = nil
+		tr.Words = nil
 		fallthrough
 	case schema.TranscriptionResponseFormatJsonVerbose:
+		trs := schema.TranscriptionResultSeconds{
+			Text:     tr.Text,
+			Language: tr.Language,
+			Duration: tr.Duration,
+			Words:    []schema.TranscriptionWordSeconds{},
+			Segments: []schema.TranscriptionSegmentSeconds{},
+		}
+		for _, word := range(tr.Words) {
+			trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{
+				Start: word.Start.Seconds(),
+				End:   word.End.Seconds(),
+				Text:  word.Text,
+			})
+		}
+		for _, seg := range(tr.Segments) {
+			segWords := []schema.TranscriptionWordSeconds{}
+			for _, word := range(seg.Words) {
+				segWords = append(segWords, schema.TranscriptionWordSeconds{
+					Start: word.Start.Seconds(),
+					End:   word.End.Seconds(),
+					Text:  word.Text,
+				})
+			}
+			trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{
+			  Id:      seg.Id,
+				Start:   seg.Start.Seconds(),
+				End:     seg.End.Seconds(),
+				Text:    seg.Text,
+				Tokens:  seg.Tokens,
+				Speaker: seg.Speaker,
+				Words:   segWords,
+			})
+		}
 		var mtr []byte
 		var err error
 		if t.PrettyPrint {
-			mtr, err = json.MarshalIndent(tr, "", "    ")
+			mtr, err = json.MarshalIndent(trs, "", "    ")
 		} else {
-			mtr, err = json.Marshal(tr)
+			mtr, err = json.Marshal(trs)
 		}
 		if err != nil {
 			return err
--- a/core/http/endpoints/openai/transcription.go
+++ b/core/http/endpoints/openai/transcription.go
@@ -138,9 +138,43 @@ func TranscriptEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
 			return c.String(http.StatusOK, schema.TranscriptionResponse(tr, responseFormat))
 		case schema.TranscriptionResponseFormatJson:
 			tr.Segments = nil
+			tr.Words = nil
 			fallthrough
 		case schema.TranscriptionResponseFormatJsonVerbose, "": // maintain backwards compatibility
-			return c.JSON(http.StatusOK, tr)
+			trs := schema.TranscriptionResultSeconds{
+				Text:     tr.Text,
+				Language: tr.Language,
+				Duration: tr.Duration,
+				Words:    []schema.TranscriptionWordSeconds{},
+				Segments: []schema.TranscriptionSegmentSeconds{},
+			}
+			for _, word := range(tr.Words) {
+				trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{
+					Start: word.Start.Seconds(),
+					End:   word.End.Seconds(),
+					Text:  word.Text,
+				})
+			}
+			for _, seg := range(tr.Segments) {
+				segWords := []schema.TranscriptionWordSeconds{}
+				for _, word := range(seg.Words) {
+					segWords = append(segWords, schema.TranscriptionWordSeconds{
+						Start: word.Start.Seconds(),
+						End:   word.End.Seconds(),
+						Text:  word.Text,
+					})
+				}
+				trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{
+				  Id:      seg.Id,
+					Start:   seg.Start.Seconds(),
+					End:     seg.End.Seconds(),
+					Text:    seg.Text,
+					Tokens:  seg.Tokens,
+					Speaker: seg.Speaker,
+					Words:   segWords,
+				})
+			}
+			return c.JSON(http.StatusOK, trs)
 		default:
 			return errors.New("invalid response_format")
 		}
--- a/core/schema/transcription.go
+++ b/core/schema/transcription.go
@@ -3,17 +3,49 @@ package schema
 import "time"

 type TranscriptionSegment struct {
-	Id      int           `json:"id"`
-	Start   time.Duration `json:"start"`
-	End     time.Duration `json:"end"`
-	Text    string        `json:"text"`
-	Tokens  []int         `json:"tokens"`
-	Speaker string        `json:"speaker,omitempty"`
+	Id      int                 `json:"id"`
+	Start   time.Duration       `json:"start"` // a time.Duration, so encoding/json emits an integer nanosecond count
+	End     time.Duration       `json:"end"`   // a time.Duration, so encoding/json emits an integer nanosecond count
+	Text    string              `json:"text"`
+	Tokens  []int               `json:"tokens"`
+	Speaker string              `json:"speaker,omitempty"`
+	Words   []TranscriptionWord `json:"words,omitempty"` // per-word timings belonging to this segment; left out of the JSON when empty
+}
+
+type TranscriptionWord struct { // one transcribed word and its time span
+	Start time.Duration `json:"start"` // filled by casting the proto word's int64 start to time.Duration
+	End   time.Duration `json:"end"`   // filled by casting the proto word's int64 end to time.Duration
+	Text  string        `json:"text"`
 }

 type TranscriptionResult struct {
 	Segments []TranscriptionSegment `json:"segments,omitempty"`
+	Words    []TranscriptionWord    `json:"words,omitempty"` // every segment's words, appended in segment order by transcriptResultFromProto
 	Text     string                 `json:"text"`
 	Language string                 `json:"language,omitempty"`
 	Duration float64                `json:"duration,omitempty"`
 }
+
+type TranscriptionSegmentSeconds struct { // JSON-facing copy of TranscriptionSegment with times as float seconds
+	Id      int                        `json:"id"`
+	Start   float64                    `json:"start"` // seconds; callers fill it from the segment's Start.Seconds()
+	End     float64                    `json:"end"`   // seconds; callers fill it from the segment's End.Seconds()
+	Text    string                     `json:"text"`
+	Tokens  []int                      `json:"tokens"`
+	Speaker string                     `json:"speaker,omitempty"`
+	Words   []TranscriptionWordSeconds `json:"words,omitempty"` // left out of the JSON when the segment has no words
+}
+
+type TranscriptionWordSeconds struct { // JSON-facing copy of TranscriptionWord with times as float seconds
+	Start float64 `json:"start"` // seconds; callers fill it from the word's Start.Seconds()
+	End   float64 `json:"end"`   // seconds; callers fill it from the word's End.Seconds()
+	Text  string  `json:"text"`
+}
+
+type TranscriptionResultSeconds struct { // JSON-facing copy of TranscriptionResult; the CLI and HTTP handler marshal this instead of TranscriptionResult
+	Segments []TranscriptionSegmentSeconds `json:"segments,omitempty"` // omitempty drops an empty slice, so the plain json format (Segments niled first) emits no "segments" key
+	Words    []TranscriptionWordSeconds    `json:"words,omitempty"`    // omitempty drops an empty slice, so the plain json format (Words niled first) emits no "words" key
+	Text     string                        `json:"text"`
+	Language string                        `json:"language,omitempty"`
+	Duration float64                       `json:"duration,omitempty"` // copied unchanged from TranscriptionResult.Duration
+}