mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-16 20:52:08 -04:00
feat: support word-level timestamps for faster-whisper (#9621)
Signed-off-by: Andreas Egli <github@kharan.ch> Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
@@ -355,6 +355,12 @@ message TranscriptStreamResponse {
|
||||
TranscriptResult final_result = 2;
|
||||
}
|
||||
|
||||
// TranscriptWord is a single word of a transcription together with its
// timing. It is carried in the repeated `words` field of TranscriptSegment.
message TranscriptWord {
|
||||
// Word start offset. The faster-whisper backend in this change fills it with
// int(word.start * 1e9) and the Go side casts it straight to time.Duration,
// so the value is presumably nanoseconds — confirm for other backends.
int64 start = 1;
|
||||
// Word end offset, same encoding as `start`.
int64 end = 2;
|
||||
// The word text as reported by the backend.
string text = 3;
|
||||
}
|
||||
|
||||
message TranscriptSegment {
|
||||
int32 id = 1;
|
||||
int64 start = 2;
|
||||
@@ -362,6 +368,7 @@ message TranscriptSegment {
|
||||
string text = 4;
|
||||
repeated int32 tokens = 5;
|
||||
string speaker = 6;
|
||||
repeated TranscriptWord words = 7;
|
||||
}
|
||||
|
||||
message GenerateImageRequest {
|
||||
|
||||
@@ -55,11 +55,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
resultSegments = []
|
||||
text = ""
|
||||
try:
|
||||
segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False)
|
||||
word_timestamps = "word" in request.timestamp_granularities
|
||||
segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False, word_timestamps=word_timestamps)
|
||||
id = 0
|
||||
for segment in segments:
|
||||
print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
|
||||
resultSegments.append(backend_pb2.TranscriptSegment(id=id, start=int(segment.start*1e9), end=int(segment.end*1e9), text=segment.text))
|
||||
words = []
|
||||
if word_timestamps and hasattr(segment, 'words'):
|
||||
for word in segment.words:
|
||||
words.append(backend_pb2.TranscriptWord(
|
||||
start=int(word.start * 1e9),
|
||||
end=int(word.end * 1e9),
|
||||
text=word.word
|
||||
))
|
||||
|
||||
resultSegments.append(backend_pb2.TranscriptSegment(
|
||||
id=id,
|
||||
start=int(segment.start * 1e9),
|
||||
end=int(segment.end * 1e9),
|
||||
text=segment.text,
|
||||
words=words
|
||||
))
|
||||
text += segment.text
|
||||
id += 1
|
||||
except Exception as err:
|
||||
|
||||
@@ -179,11 +179,22 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
|
||||
Language: r.Language,
|
||||
Duration: float64(r.Duration),
|
||||
}
|
||||
|
||||
for _, s := range r.Segments {
|
||||
var tks []int
|
||||
for _, t := range s.Tokens {
|
||||
tks = append(tks, int(t))
|
||||
}
|
||||
var words []schema.TranscriptionWord
|
||||
for _, w := range s.Words {
|
||||
var word = schema.TranscriptionWord {
|
||||
Start: time.Duration(w.Start),
|
||||
End: time.Duration(w.End),
|
||||
Text: w.Text,
|
||||
}
|
||||
words = append(words, word)
|
||||
tr.Words = append(tr.Words, word)
|
||||
}
|
||||
tr.Segments = append(tr.Segments,
|
||||
schema.TranscriptionSegment{
|
||||
Text: s.Text,
|
||||
@@ -192,6 +203,7 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
|
||||
End: time.Duration(s.End),
|
||||
Tokens: tks,
|
||||
Speaker: s.Speaker,
|
||||
Words: words,
|
||||
})
|
||||
}
|
||||
return tr
|
||||
|
||||
@@ -81,14 +81,48 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
|
||||
fmt.Println(schema.TranscriptionResponse(tr, t.ResponseFormat))
|
||||
case schema.TranscriptionResponseFormatJson:
|
||||
tr.Segments = nil
|
||||
tr.Words = nil
|
||||
fallthrough
|
||||
case schema.TranscriptionResponseFormatJsonVerbose:
|
||||
trs := schema.TranscriptionResultSeconds{
|
||||
Text: tr.Text,
|
||||
Language: tr.Language,
|
||||
Duration: tr.Duration,
|
||||
Words: []schema.TranscriptionWordSeconds{},
|
||||
Segments: []schema.TranscriptionSegmentSeconds{},
|
||||
}
|
||||
for _, word := range(tr.Words) {
|
||||
trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{
|
||||
Start: word.Start.Seconds(),
|
||||
End: word.End.Seconds(),
|
||||
Text: word.Text,
|
||||
})
|
||||
}
|
||||
for _, seg := range(tr.Segments) {
|
||||
segWords := []schema.TranscriptionWordSeconds{}
|
||||
for _, word := range(seg.Words) {
|
||||
segWords = append(segWords, schema.TranscriptionWordSeconds{
|
||||
Start: word.Start.Seconds(),
|
||||
End: word.End.Seconds(),
|
||||
Text: word.Text,
|
||||
})
|
||||
}
|
||||
trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{
|
||||
Id: seg.Id,
|
||||
Start: seg.Start.Seconds(),
|
||||
End: seg.End.Seconds(),
|
||||
Text: seg.Text,
|
||||
Tokens: seg.Tokens,
|
||||
Speaker: seg.Speaker,
|
||||
Words: segWords,
|
||||
})
|
||||
}
|
||||
var mtr []byte
|
||||
var err error
|
||||
if t.PrettyPrint {
|
||||
mtr, err = json.MarshalIndent(tr, "", " ")
|
||||
mtr, err = json.MarshalIndent(trs, "", " ")
|
||||
} else {
|
||||
mtr, err = json.Marshal(tr)
|
||||
mtr, err = json.Marshal(trs)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -138,9 +138,43 @@ func TranscriptEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
|
||||
return c.String(http.StatusOK, schema.TranscriptionResponse(tr, responseFormat))
|
||||
case schema.TranscriptionResponseFormatJson:
|
||||
tr.Segments = nil
|
||||
tr.Words = nil
|
||||
fallthrough
|
||||
case schema.TranscriptionResponseFormatJsonVerbose, "": // maintain backwards compatibility
|
||||
return c.JSON(http.StatusOK, tr)
|
||||
trs := schema.TranscriptionResultSeconds{
|
||||
Text: tr.Text,
|
||||
Language: tr.Language,
|
||||
Duration: tr.Duration,
|
||||
Words: []schema.TranscriptionWordSeconds{},
|
||||
Segments: []schema.TranscriptionSegmentSeconds{},
|
||||
}
|
||||
for _, word := range(tr.Words) {
|
||||
trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{
|
||||
Start: word.Start.Seconds(),
|
||||
End: word.End.Seconds(),
|
||||
Text: word.Text,
|
||||
})
|
||||
}
|
||||
for _, seg := range(tr.Segments) {
|
||||
segWords := []schema.TranscriptionWordSeconds{}
|
||||
for _, word := range(seg.Words) {
|
||||
segWords = append(segWords, schema.TranscriptionWordSeconds{
|
||||
Start: word.Start.Seconds(),
|
||||
End: word.End.Seconds(),
|
||||
Text: word.Text,
|
||||
})
|
||||
}
|
||||
trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{
|
||||
Id: seg.Id,
|
||||
Start: seg.Start.Seconds(),
|
||||
End: seg.End.Seconds(),
|
||||
Text: seg.Text,
|
||||
Tokens: seg.Tokens,
|
||||
Speaker: seg.Speaker,
|
||||
Words: segWords,
|
||||
})
|
||||
}
|
||||
return c.JSON(http.StatusOK, trs)
|
||||
default:
|
||||
return errors.New("invalid response_format")
|
||||
}
|
||||
|
||||
@@ -3,17 +3,49 @@ package schema
|
||||
import "time"
|
||||
|
||||
// TranscriptionSegment is one segment of a transcription result, with its
// timing kept as time.Duration values. Start and End have no custom JSON
// encoding here, so encoding/json writes them as integer nanosecond counts.
//
// NOTE(review): Id, Start, End, Text, Tokens and Speaker appear twice below.
// This looks like the removed and added sides of a diff rendered together
// with their +/- markers stripped; confirm against the real file, where each
// field should be declared only once.
type TranscriptionSegment struct {
|
||||
Id int `json:"id"`
|
||||
Start time.Duration `json:"start"`
|
||||
End time.Duration `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Tokens []int `json:"tokens"`
|
||||
Speaker string `json:"speaker,omitempty"`
|
||||
Id int `json:"id"`
|
||||
Start time.Duration `json:"start"`
|
||||
End time.Duration `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Tokens []int `json:"tokens"`
|
||||
// Speaker is left out of the JSON output when empty.
Speaker string `json:"speaker,omitempty"`
|
||||
// Words holds the word-level timestamps belonging to this segment; it is
// left out of the JSON output when empty.
Words []TranscriptionWord `json:"words,omitempty"`
|
||||
}
|
||||
|
||||
// TranscriptionWord is a single transcribed word with its start and end
// offsets expressed as time.Duration values. Marshalled directly, Start and
// End are written as integer nanosecond counts; TranscriptionWordSeconds is
// the float-seconds counterpart.
type TranscriptionWord struct {
|
||||
Start time.Duration `json:"start"`
|
||||
End time.Duration `json:"end"`
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
// TranscriptionResult is the internal transcription result with durations
// kept as time.Duration. Segments and Words are both dropped from the JSON
// output when empty.
type TranscriptionResult struct {
|
||||
Segments []TranscriptionSegment `json:"segments,omitempty"`
|
||||
// Words is the flat list of all words; transcriptResultFromProto appends
// every word of every segment here in addition to the per-segment Words.
Words []TranscriptionWord `json:"words,omitempty"`
|
||||
Text string `json:"text"`
|
||||
Language string `json:"language,omitempty"`
|
||||
// Duration is copied from the backend result as a float64.
// NOTE(review): presumably seconds — unit is not visible here, confirm.
Duration float64 `json:"duration,omitempty"`
|
||||
}
|
||||
|
||||
// TranscriptionSegmentSeconds mirrors TranscriptionSegment for JSON output,
// with Start and End expressed as float64 seconds. The CLI and the HTTP
// endpoint populate it from TranscriptionSegment using Duration.Seconds().
type TranscriptionSegmentSeconds struct {
|
||||
Id int `json:"id"`
|
||||
// Start is the segment start offset in seconds.
Start float64 `json:"start"`
|
||||
// End is the segment end offset in seconds.
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Tokens []int `json:"tokens"`
|
||||
Speaker string `json:"speaker,omitempty"`
|
||||
// Words holds the per-segment word timestamps; omitted from JSON when empty.
Words []TranscriptionWordSeconds `json:"words,omitempty"`
|
||||
}
|
||||
|
||||
// TranscriptionWordSeconds mirrors TranscriptionWord for JSON output, with
// Start and End expressed as float64 seconds (filled via Duration.Seconds()
// by the CLI and the HTTP endpoint).
type TranscriptionWordSeconds struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
// TranscriptionResultSeconds is the JSON-facing form of TranscriptionResult
// in which segment and word offsets are float64 seconds. The CLI and the
// HTTP endpoint build it for the json and verbose_json response formats.
type TranscriptionResultSeconds struct {
|
||||
// Segments is omitted from the JSON output when empty; callers clear the
// source segments before conversion for the plain json format.
Segments []TranscriptionSegmentSeconds `json:"segments,omitempty"`
|
||||
// Words is the flat word list; omitted from the JSON output when empty.
Words []TranscriptionWordSeconds `json:"words,omitempty"`
|
||||
Text string `json:"text"`
|
||||
Language string `json:"language,omitempty"`
|
||||
// Duration is copied unchanged from TranscriptionResult.Duration.
Duration float64 `json:"duration,omitempty"`
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user