mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-02 05:06:34 -04:00
Add a routing middleware stack and a cloud-proxy backend. * cloud-proxy: a Go gRPC backend that forwards OpenAI- and Anthropic-shaped chat requests to upstream providers, with an optional translate mode (OpenAI request -> Anthropic /v1/messages -> OpenAI response) and full tool-calling support. * routing: admission control, content-aware model routing (embedding cache + classifier + rerank + Arch-Router score), PII detection/redaction (regex + NER) with streaming filter and OpenAI/Anthropic adapters, and a per-user/per-key billing recorder backed by GORM or in-memory storage. * middleware: UsageMiddleware records usage via the billing recorder, plus admission, route-model, usage-stamp and trace middlewares. * observability: BackendTrace ring buffer stores full request bodies (capped), MITM proxy emits structured trace events, and router classifier decisions surface at /api/router/decide. * gallery: Arch-Router-1.5B (Q4_K_M and Q8_0). * UI: cloud-proxy model-editor fields, classifier system-prompt and score-normalization config, and a Traces page rendering request bodies. Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash] Signed-off-by: Richard Palethorpe <io@richiejp.com>
148 lines
4.4 KiB
Go
148 lines
4.4 KiB
Go
package grpc
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
|
|
"github.com/mudler/LocalAI/pkg/grpc/base"
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
// toolCallStreamer simulates what a cloud-proxy translate backend
|
|
// emits: a sequence of *pb.Reply chunks carrying content + tool_call
|
|
// deltas + final usage tokens. The replies are sent in the same order
|
|
// and shape the cloud-proxy OpenAI translator produces.
|
|
type toolCallStreamer struct {
|
|
base.SingleThread
|
|
}
|
|
|
|
func (*toolCallStreamer) Predict(*pb.PredictOptions) (string, error) {
|
|
return "", nil
|
|
}
|
|
|
|
func (*toolCallStreamer) PredictStream(*pb.PredictOptions, chan string) error {
|
|
return nil
|
|
}
|
|
|
|
func (*toolCallStreamer) PredictRich(*pb.PredictOptions) (*pb.Reply, error) {
|
|
return &pb.Reply{
|
|
Message: []byte("done"),
|
|
PromptTokens: 11,
|
|
Tokens: 4,
|
|
ChatDeltas: []*pb.ChatDelta{{
|
|
ToolCalls: []*pb.ToolCallDelta{{
|
|
Index: 0, Id: "call_finalize", Name: "submit", Arguments: `{"ok":true}`,
|
|
}},
|
|
}},
|
|
}, nil
|
|
}
|
|
|
|
func (*toolCallStreamer) PredictStreamRich(_ *pb.PredictOptions, out chan<- *pb.Reply) error {
|
|
// Chunk 1: opening text delta.
|
|
out <- &pb.Reply{
|
|
Message: []byte("Looking up "),
|
|
ChatDeltas: []*pb.ChatDelta{{Content: "Looking up "}},
|
|
}
|
|
// Chunk 2: tool call announcement (id + name).
|
|
out <- &pb.Reply{
|
|
ChatDeltas: []*pb.ChatDelta{{
|
|
ToolCalls: []*pb.ToolCallDelta{{
|
|
Index: 0, Id: "call_x", Name: "search",
|
|
}},
|
|
}},
|
|
}
|
|
// Chunks 3-4: argument fragments (consumer concatenates by index).
|
|
out <- &pb.Reply{
|
|
ChatDeltas: []*pb.ChatDelta{{
|
|
ToolCalls: []*pb.ToolCallDelta{{
|
|
Index: 0, Arguments: `{"q":"`,
|
|
}},
|
|
}},
|
|
}
|
|
out <- &pb.Reply{
|
|
ChatDeltas: []*pb.ChatDelta{{
|
|
ToolCalls: []*pb.ToolCallDelta{{
|
|
Index: 0, Arguments: `weather"}`,
|
|
}},
|
|
}},
|
|
}
|
|
// Chunk 5: usage tokens (final chunk pattern from OpenAI stream).
|
|
out <- &pb.Reply{Tokens: 17}
|
|
return nil
|
|
}
|
|
|
|
var _ AIModelRich = &toolCallStreamer{}
|
|
|
|
var _ = Describe("Cloud-proxy translate-mode integration (gRPC + tool calls)", func() {
|
|
// This test simulates what the OpenAI chat endpoint does after
|
|
// ModelInference returns: it walks the per-chunk TokenUsage.ChatDeltas
|
|
// and assembles tool calls indexed by ToolCallDelta.Index. Verifies
|
|
// that the rich gRPC path delivers everything the consumer needs.
|
|
It("delivers tool-call deltas through PredictStream end-to-end", func() {
|
|
addr := "test://translate-integration-stream"
|
|
Provide(addr, &toolCallStreamer{})
|
|
c := NewClient(addr, true, nil, false)
|
|
|
|
type accumulator struct {
|
|
text strings.Builder
|
|
toolID string
|
|
name string
|
|
args strings.Builder
|
|
tokens int32
|
|
}
|
|
var acc accumulator
|
|
|
|
err := c.PredictStream(context.Background(), &pb.PredictOptions{}, func(reply *pb.Reply) {
|
|
if msg := reply.GetMessage(); len(msg) > 0 {
|
|
acc.text.Write(msg)
|
|
}
|
|
if reply.GetTokens() > 0 && len(reply.GetChatDeltas()) == 0 {
|
|
acc.tokens = reply.GetTokens()
|
|
return
|
|
}
|
|
for _, cd := range reply.GetChatDeltas() {
|
|
for _, tc := range cd.GetToolCalls() {
|
|
if tc.GetId() != "" {
|
|
acc.toolID = tc.GetId()
|
|
}
|
|
if tc.GetName() != "" {
|
|
acc.name = tc.GetName()
|
|
}
|
|
acc.args.WriteString(tc.GetArguments())
|
|
}
|
|
}
|
|
})
|
|
Expect(err).NotTo(HaveOccurred())
|
|
|
|
// Text content survived the wire.
|
|
Expect(acc.text.String()).To(Equal("Looking up "))
|
|
// Tool call id + name landed on the first announcing chunk.
|
|
Expect(acc.toolID).To(Equal("call_x"))
|
|
Expect(acc.name).To(Equal("search"))
|
|
// Argument fragments assembled in order.
|
|
Expect(acc.args.String()).To(Equal(`{"q":"weather"}`))
|
|
// Final usage frame propagated.
|
|
Expect(acc.tokens).To(BeEquivalentTo(17))
|
|
})
|
|
|
|
It("delivers complete tool-call results through non-streaming Predict", func() {
|
|
addr := "test://translate-integration-predict"
|
|
Provide(addr, &toolCallStreamer{})
|
|
c := NewClient(addr, true, nil, false)
|
|
|
|
reply, err := c.Predict(context.Background(), &pb.PredictOptions{})
|
|
Expect(err).NotTo(HaveOccurred())
|
|
Expect(string(reply.GetMessage())).To(Equal("done"))
|
|
Expect(reply.GetPromptTokens()).To(BeEquivalentTo(11))
|
|
Expect(reply.GetTokens()).To(BeEquivalentTo(4))
|
|
Expect(reply.GetChatDeltas()).To(HaveLen(1))
|
|
tcs := reply.GetChatDeltas()[0].GetToolCalls()
|
|
Expect(tcs).To(HaveLen(1))
|
|
Expect(tcs[0].GetId()).To(Equal("call_finalize"))
|
|
Expect(tcs[0].GetName()).To(Equal("submit"))
|
|
Expect(tcs[0].GetArguments()).To(Equal(`{"ok":true}`))
|
|
})
|
|
})
|