mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-05 15:26:14 -04:00
* feat(parakeet-cpp): dynamic-batching scheduler (queue + dispatcher) Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(parakeet-cpp): dynamic batching for AudioTranscription via batched JSON C-API Drop SingleThread; route unary transcription through the in-process batcher which coalesces concurrent requests into one batched engine call. Streaming stays mutually exclusive via engineMu. Adds batch_max_size / batch_max_wait_ms options (size=1 disables; recommended on CPU). Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(parakeet-cpp): tear down dispatcher in Free; log batch config; preallocate; clarify stream lock Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(parakeet-cpp): Ginkgo batcher tests; optional batch C-API binding with per-request fallback The batched JSON C-API symbol exists only in newer libparakeet.so (ABI >= 2); probe it with Dlsym and register optionally so the backend still loads against an older library, falling back to per-request transcription. Rewrites the batcher unit tests as Ginkgo/Gomega specs (forbidigo bans t.Fatal in tests). Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(parakeet-cpp): debug-log coalesced batch size in runBatch Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(parakeet-cpp): default batch_max_size to 1 (batching opt-in) Dynamic batching now defaults off (batch_max_size:1, one request at a time). Raise batch_max_size to opt in: it is a large throughput win on GPU under concurrent load, but on CPU and low-concurrency setups it only adds latency, so off is the safer default. The startup log now states whether batching is on or off, and the audio-to-text docs are updated to match. 
Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * chore(parakeet-cpp): bump parakeet.cpp to 8a7c482 (batched decode + B=1 fast-path) parakeet.cpp PR #1 merged the batched encoder/decode and the B=1 encoder fast-path to master. Point PARAKEET_VERSION at that commit so the backend builds the batched C-API (parakeet_capi_transcribe_pcm_batch_json) that the dynamic batcher calls; the prior pin (30a3075) predated it, so only the per-request fallback path was exercised. Verified the shared lib builds with the backend's CMake flags and exports the batch symbol. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
80 lines
1.9 KiB
Go
80 lines
1.9 KiB
Go
package main
|
|
|
|
import "time"
|
|
|
|
// batchRequest is one in-flight unary transcription waiting to be batched.
|
|
// In production pcm/decoder are set; tag is an opaque marker used by tests.
|
|
type batchRequest struct {
|
|
pcm []float32
|
|
decoder int32
|
|
tag string
|
|
reply chan batchReply
|
|
}
|
|
|
|
// batchReply is what the waiting handler goroutine receives for its request:
// either json, a single per-item JSON object string (one element of the JSON
// array returned by the C-API), or err.
type batchReply struct {
	json string
	err  error
}
|
|
|
|
// batcher coalesces concurrent batchRequests into batched runBatch calls. A
|
|
// single run() goroutine is the sole caller of runBatch, so runBatch (which in
|
|
// production calls the thread-unsafe C engine) is never entered concurrently.
|
|
type batcher struct {
|
|
submit chan *batchRequest
|
|
maxSize int
|
|
maxWait time.Duration
|
|
runBatch func(reqs []*batchRequest) // must deliver a reply to every req
|
|
}
|
|
|
|
func newBatcher(maxSize int, maxWait time.Duration, runBatch func([]*batchRequest)) *batcher {
|
|
if maxSize < 1 {
|
|
maxSize = 1
|
|
}
|
|
return &batcher{
|
|
submit: make(chan *batchRequest),
|
|
maxSize: maxSize,
|
|
maxWait: maxWait,
|
|
runBatch: runBatch,
|
|
}
|
|
}
|
|
|
|
// run is the dispatcher loop: accumulate submitted requests until either maxSize
|
|
// is reached or maxWait elapses since the first queued request, then dispatch.
|
|
// Exits when stop is closed (draining any partially-filled batch first).
|
|
func (b *batcher) run(stop <-chan struct{}) {
|
|
for {
|
|
var first *batchRequest
|
|
select {
|
|
case first = <-b.submit:
|
|
case <-stop:
|
|
return
|
|
}
|
|
batch := []*batchRequest{first}
|
|
|
|
// maxSize==1 disables batching: dispatch immediately (passthrough).
|
|
if b.maxSize == 1 {
|
|
b.runBatch(batch)
|
|
continue
|
|
}
|
|
|
|
timer := time.NewTimer(b.maxWait)
|
|
fill:
|
|
for len(batch) < b.maxSize {
|
|
select {
|
|
case r := <-b.submit:
|
|
batch = append(batch, r)
|
|
case <-timer.C:
|
|
break fill
|
|
case <-stop:
|
|
timer.Stop()
|
|
b.runBatch(batch)
|
|
return
|
|
}
|
|
}
|
|
timer.Stop()
|
|
b.runBatch(batch)
|
|
}
|
|
}
|