mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-13 19:27:48 -04:00
* ⬆️ Update mudler/parakeet.cpp

  Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix(parakeet-cpp): close streaming segments on <EOB> after ABI v5 eou/eob split

  parakeet.cpp ABI v5 (the pin this PR bumps to) splits the streaming JSON "eou" flag: in v4, "eou":1 fired for either <EOU> (end of utterance) or <EOB> (backchannel); in v5, "eou" means <EOU> only, with a new separate "eob" field for the backchannel token.

  The streamSegmenter closed a segment on "eou" alone, so after the bump a backchannel token would silently stop ending a segment and merge into the next utterance. Read the new "eob" field and flush on either signal to preserve the v4 segmentation boundaries.

  The flat stream_feed eou_out path is unaffected: its mask is still non-zero for either event.

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
141 lines
5.5 KiB
Go
141 lines
5.5 KiB
Go
package main
|
|
|
|
import (
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
func tw(text string, start, end float64) transcriptWord {
|
|
return transcriptWord{W: text, Start: start, End: end}
|
|
}
|
|
|
|
var _ = Describe("splitWordsIntoSegments (NeMo get_segment_offsets parity)", func() {
|
|
seps := []rune{'.', '?', '!'}
|
|
|
|
It("splits on sentence-ending punctuation, including the delimiter word", func() {
|
|
words := []transcriptWord{tw("hello", 0, 0.4), tw("world.", 0.4, 0.8), tw("bye", 1.0, 1.3)}
|
|
segs := splitWordsIntoSegments(words, seps, 0)
|
|
Expect(segs).To(HaveLen(2))
|
|
Expect(segs[0]).To(HaveLen(2))
|
|
Expect(segs[0][1].W).To(Equal("world."))
|
|
Expect(segs[1]).To(HaveLen(1))
|
|
Expect(segs[1][0].W).To(Equal("bye"))
|
|
})
|
|
|
|
It("keeps a single segment with no terminal punctuation and gap off", func() {
|
|
words := []transcriptWord{tw("a", 0, 0.2), tw("b", 0.2, 0.4), tw("c", 5.0, 5.2)}
|
|
segs := splitWordsIntoSegments(words, seps, 0)
|
|
Expect(segs).To(HaveLen(1))
|
|
})
|
|
|
|
It("splits on the gap rule when enabled, the gapped word starting the next segment", func() {
|
|
words := []transcriptWord{tw("a", 0, 0.2), tw("b", 0.2, 0.4), tw("c", 5.0, 5.2)}
|
|
segs := splitWordsIntoSegments(words, seps, 1.0) // c is 4.6s after b
|
|
Expect(segs).To(HaveLen(2))
|
|
Expect(segs[0]).To(HaveLen(2)) // a b
|
|
Expect(segs[1]).To(HaveLen(1)) // c
|
|
Expect(segs[1][0].W).To(Equal("c"))
|
|
})
|
|
|
|
It("checks the gap rule before punctuation (NeMo elif order)", func() {
|
|
// "b." would terminate, but c is far after it -> gap closes [a b.] at b.
|
|
words := []transcriptWord{tw("a", 0, 0.2), tw("b.", 0.2, 0.4), tw("c", 9.0, 9.2)}
|
|
segs := splitWordsIntoSegments(words, seps, 1.0)
|
|
Expect(segs).To(HaveLen(2))
|
|
Expect(segs[0]).To(HaveLen(2))
|
|
Expect(segs[1][0].W).To(Equal("c"))
|
|
})
|
|
|
|
It("still splits on punctuation when the gap rule is enabled but does not fire", func() {
|
|
words := []transcriptWord{tw("hi.", 0, 0.4), tw("bye", 0.4, 0.8)}
|
|
segs := splitWordsIntoSegments(words, seps, 5.0) // gap never reached
|
|
Expect(segs).To(HaveLen(2))
|
|
Expect(segs[0][0].W).To(Equal("hi."))
|
|
})
|
|
|
|
It("returns nothing for empty input", func() {
|
|
Expect(splitWordsIntoSegments(nil, seps, 0)).To(BeEmpty())
|
|
})
|
|
})
|
|
|
|
var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
|
|
doc := transcriptJSON{
|
|
Text: "hello world. bye now",
|
|
FrameSec: 0.08,
|
|
Words: []transcriptWord{
|
|
{W: "hello", Start: 0.0, End: 0.4},
|
|
{W: "world.", Start: 0.4, End: 0.8},
|
|
{W: "bye", Start: 1.0, End: 1.3},
|
|
{W: "now", Start: 1.3, End: 1.6},
|
|
},
|
|
Tokens: []transcriptToken{{ID: 1, T: 0.1}, {ID: 2, T: 0.5}, {ID: 3, T: 1.1}, {ID: 4, T: 1.4}},
|
|
}
|
|
|
|
It("emits one segment per punctuation-delimited group with start/end", func() {
|
|
res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
|
|
Expect(res.Segments).To(HaveLen(2))
|
|
Expect(res.Segments[0].Text).To(Equal("hello world."))
|
|
Expect(res.Segments[0].Start).To(Equal(int64(0)))
|
|
Expect(res.Segments[0].End).To(Equal(secondsToNanos(0.8)))
|
|
Expect(res.Segments[1].Text).To(Equal("bye now"))
|
|
Expect(res.Segments[1].Start).To(Equal(secondsToNanos(1.0)))
|
|
Expect(res.Segments[1].Id).To(Equal(int32(1)))
|
|
})
|
|
|
|
It("assigns tokens to the segment whose time window contains them", func() {
|
|
res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
|
|
Expect(res.Segments[0].Tokens).To(Equal([]int32{1, 2}))
|
|
Expect(res.Segments[1].Tokens).To(Equal([]int32{3, 4}))
|
|
})
|
|
|
|
It("attaches per-segment words only when word granularity requested", func() {
|
|
plain := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
|
|
Expect(plain.Segments[0].Words).To(BeEmpty())
|
|
withWords := transcriptResultFromDoc(doc, &pb.TranscriptRequest{TimestampGranularities: []string{"word"}}, 0)
|
|
Expect(withWords.Segments[0].Words).To(HaveLen(2))
|
|
})
|
|
|
|
It("falls back to a single text segment when there are no words", func() {
|
|
res := transcriptResultFromDoc(transcriptJSON{Text: "hi"}, &pb.TranscriptRequest{}, 0)
|
|
Expect(res.Segments).To(HaveLen(1))
|
|
Expect(res.Segments[0].Text).To(Equal("hi"))
|
|
})
|
|
})
|
|
|
|
var _ = Describe("streaming segment assembly", func() {
|
|
It("closes a segment with start/end from its words on EOU", func() {
|
|
acc := &streamSegmenter{}
|
|
acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
|
|
{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
|
|
}})
|
|
segs := acc.segments()
|
|
Expect(segs).To(HaveLen(1))
|
|
Expect(segs[0].Text).To(Equal("hello world"))
|
|
Expect(segs[0].Start).To(Equal(int64(0)))
|
|
Expect(segs[0].End).To(Equal(secondsToNanos(0.9)))
|
|
})
|
|
|
|
It("buffers words across feeds until EOU", func() {
|
|
acc := &streamSegmenter{}
|
|
acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
|
|
Expect(acc.segments()).To(BeEmpty())
|
|
acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
|
|
Expect(acc.segments()).To(HaveLen(1))
|
|
Expect(acc.segments()[0].Text).To(Equal("hi there"))
|
|
})
|
|
|
|
// ABI v5 split <EOB> (backchannel) out of the "eou" flag into its own "eob"
|
|
// field; a backchannel must still close the segment as it did in v4.
|
|
It("closes a segment on EOB (backchannel) too", func() {
|
|
acc := &streamSegmenter{}
|
|
acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
|
|
{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
|
|
}})
|
|
segs := acc.segments()
|
|
Expect(segs).To(HaveLen(1))
|
|
Expect(segs[0].Text).To(Equal("uh huh"))
|
|
Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
|
|
})
|
|
})
|