From 595b6fd22ddf8e7a6a0d8b4680e707e87091c075 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 7 May 2026 17:28:26 +0200
Subject: [PATCH] feat(api/transcription): include segments + duration +
 language on stream done event (#9709)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

streamTranscription previously emitted a done event with just `text`,
matching the OpenAI streaming spec exactly. Streaming clients that need
per-utterance timings or audio duration had to fall back to the
non-streaming JSON path — and that path is exactly the one that trips
on ResponseHeaderTimeout when whisper requests queue behind each other
on a SingleThread backend.

Extend the done event to additively carry `language`, `duration`, and
a `segments` array (id, start, end, text — start/end as float seconds,
matching TranscriptionSegmentSeconds). Each new field is omitted when its
value is empty or zero; spec-compliant clients ignore the new fields.

This unblocks notary's streaming Transcribe (companion change in the
notary repo) so it produces the same TranscriptionResult shape as the
JSON path while sidestepping the queue-induced header timeouts.


Assisted-by: Claude:claude-opus-4-7 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/http/endpoints/openai/transcription.go | 30 +++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/core/http/endpoints/openai/transcription.go b/core/http/endpoints/openai/transcription.go
index 6312e3cb9..81f89b927 100644
--- a/core/http/endpoints/openai/transcription.go
+++ b/core/http/endpoints/openai/transcription.go
@@ -257,10 +257,36 @@ func streamTranscription(c echo.Context, req backend.TranscriptionRequest, ml *m
 			"delta": finalResult.Text,
 		})
 	}
-	_ = writeEvent(map[string]any{
+	// done carries the assembled text plus, when the backend produced them,
+	// per-segment timings, audio duration, and detected language. The OpenAI
+	// streaming spec only specifies `text`; the extra fields are an additive
+	// extension so streaming clients (e.g. notetaker) can build the same
+	// TranscriptionResultSeconds shape they get from the JSON response path
+	// without us forcing them off SSE just to recover segments. Spec-compliant
+	// clients ignore unknown fields.
+	doneEvent := map[string]any{
 		"type": "transcript.text.done",
 		"text": finalResult.Text,
-	})
+	}
+	if finalResult.Language != "" {
+		doneEvent["language"] = finalResult.Language
+	}
+	if finalResult.Duration > 0 {
+		doneEvent["duration"] = finalResult.Duration
+	}
+	if len(finalResult.Segments) > 0 {
+		segs := make([]map[string]any, 0, len(finalResult.Segments))
+		for _, seg := range finalResult.Segments {
+			segs = append(segs, map[string]any{
+				"id":    seg.Id,
+				"start": seg.Start.Seconds(),
+				"end":   seg.End.Seconds(),
+				"text":  seg.Text,
+			})
+		}
+		doneEvent["segments"] = segs
+	}
+	_ = writeEvent(doneEvent)
 	_, _ = fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
 	c.Response().Flush()
 	return nil