fix(whisper): scrub invalid UTF-8 from segment text before protobuf marshal

whisper.cpp can emit bytes that are not valid UTF-8 — typically a multibyte codepoint split across token boundaries. protobuf string fields reject those at marshal time, which surfaces as a transcription failure. Run strings.ToValidUTF8 on the segment text before it leaves the cgo boundary so that each invalid byte sequence is replaced with U+FFFD.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
2026-07-30 09:57:57 -04:00 · 2026-04-26 19:35:21 +00:00
parent c8d63a1003
commit 5b0196c7d0
1 changed files with 4 additions and 1 deletions
--- a/backend/go/whisper/gowhisper.go
+++ b/backend/go/whisper/gowhisper.go
@@ -139,7 +139,10 @@ func (w *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptR
 		// segment start/end conversion factor taken from https://github.com/ggml-org/whisper.cpp/blob/master/examples/cli/cli.cpp#L895
 		s := CppGetSegmentStart(i) * (10000000)
 		t := CppGetSegmentEnd(i) * (10000000)
-		txt := strings.Clone(CppGetSegmentText(i))
+		// whisper.cpp can emit bytes that aren't valid UTF-8 (e.g. a multibyte
+		// codepoint split across token boundaries); protobuf string fields
+		// reject those at marshal time. Scrub before the value escapes cgo.
+		txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "\uFFFD")
 		tokens := make([]int32, CppNTokens(i))

 		if opts.Diarize && CppGetSegmentSpeakerTurnNext(i) {