Files
LocalAI/pkg/grpc/rich_test.go
Richard Palethorpe 6a80e23733 feat(middleware): Model routing, PII filtering, Cloud model proxies (#9802)
Add a routing middleware stack and a cloud-proxy backend.

* cloud-proxy: a Go gRPC backend that forwards OpenAI- and
  Anthropic-shaped chat requests to upstream providers, with an
  optional translate mode (OpenAI request -> Anthropic /v1/messages
  -> OpenAI response) and full tool-calling support.

* routing: admission control, content-aware model routing
  (embedding cache + classifier + rerank + Arch-Router score),
  PII detection/redaction (regex + NER) with streaming filter and
  OpenAI/Anthropic adapters, and a per-user/per-key billing recorder
  backed by GORM or in-memory storage.

* middleware: UsageMiddleware records usage via the billing recorder,
  plus admission, route-model, usage-stamp and trace middlewares.

* observability: BackendTrace ring buffer stores full request bodies
  (capped), MITM proxy emits structured trace events, and router
  classifier decisions surface at /api/router/decide.

* gallery: Arch-Router-1.5B (Q4_K_M and Q8_0).

* UI: cloud-proxy model-editor fields, classifier system-prompt and
  score-normalization config, and a Traces page rendering request
  bodies.

Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-25 09:28:27 +02:00

130 lines
4.3 KiB
Go

package grpc
import (
"context"
"errors"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// richBackend is a test double that satisfies both AIModel and
// AIModelRich. Its legacy Predict/PredictStream methods always return a
// scripted error, so a test that reaches them by mistake (instead of
// going down the rich path) fails loudly rather than quietly yielding
// empty content.
type richBackend struct {
	base.SingleThread

	// predictRich is the per-test hook invoked by PredictRich.
	predictRich func(*pb.PredictOptions) (*pb.Reply, error)
	// predictStreamRich is the per-test hook invoked by PredictStreamRich.
	predictStreamRich func(*pb.PredictOptions, chan<- *pb.Reply) error
}
// Predict is the legacy string-returning entry point. It never produces
// content: it always returns an empty string together with an error
// stating that the legacy path should not have been taken.
func (*richBackend) Predict(_ *pb.PredictOptions) (string, error) {
	errLegacy := errors.New("richBackend: legacy Predict should not have been called")
	return "", errLegacy
}
// PredictStream is the legacy string-channel streaming entry point. It
// sends nothing on the channel and always returns an error stating that
// the legacy path should not have been taken.
func (*richBackend) PredictStream(_ *pb.PredictOptions, _ chan string) error {
	errLegacy := errors.New("richBackend: legacy PredictStream should not have been called")
	return errLegacy
}
// PredictRich delegates to the test-supplied predictRich hook and
// returns whatever reply and error the hook produces.
//
// Tests that only exercise streaming leave the hook unset. In that case
// a scripted error is returned instead of calling a nil function value,
// so an unexpected dispatch is reported as an ordinary error (matching
// the legacy methods on this type) rather than as a nil-dereference
// panic.
func (r *richBackend) PredictRich(opts *pb.PredictOptions) (*pb.Reply, error) {
	if r.predictRich == nil {
		return nil, errors.New("richBackend: PredictRich called but no predictRich hook was configured")
	}
	return r.predictRich(opts)
}
// PredictStreamRich delegates to the test-supplied predictStreamRich
// hook, handing it the request options and the send-only reply channel,
// and returns the hook's error.
//
// Tests that only exercise the unary path leave the hook unset. In that
// case a scripted error is returned instead of calling a nil function
// value, so an unexpected dispatch is reported as an ordinary error
// (matching the legacy methods on this type) rather than as a
// nil-dereference panic.
func (r *richBackend) PredictStreamRich(opts *pb.PredictOptions, out chan<- *pb.Reply) error {
	if r.predictStreamRich == nil {
		return errors.New("richBackend: PredictStreamRich called but no predictStreamRich hook was configured")
	}
	return r.predictStreamRich(opts, out)
}
// Compile-time check that *richBackend satisfies AIModelRich.
var _ AIModelRich = (*richBackend)(nil)
// Specs covering how Predict / PredictStream calls made through a client
// reach a backend: via the rich methods when the backend implements
// AIModelRich, and via legacy Predict when it does not.
var _ = Describe("AIModelRich dispatch", func() {
	It("server.Predict routes through PredictRich when implemented", func() {
		const address = "test://rich-predict"

		// The hook returns a reply carrying token counts and a tool call
		// in addition to the plain message.
		richReply := func(*pb.PredictOptions) (*pb.Reply, error) {
			toolCall := &pb.ToolCallDelta{
				Index: 0, Id: "call_1", Name: "ping", Arguments: "{}",
			}
			delta := &pb.ChatDelta{ToolCalls: []*pb.ToolCallDelta{toolCall}}
			return &pb.Reply{
				Message:      []byte("hello"),
				PromptTokens: 5,
				Tokens:       7,
				ChatDeltas:   []*pb.ChatDelta{delta},
			}, nil
		}
		Provide(address, &richBackend{predictRich: richReply})

		client := NewClient(address, true, nil, false)
		got, err := client.Predict(context.Background(), &pb.PredictOptions{})
		Expect(err).NotTo(HaveOccurred())
		Expect(string(got.GetMessage())).To(Equal("hello"))

		// The rich fields made it through the RPC marshal/unmarshal,
		// which shows the server went through PredictRich: the legacy
		// (string, error) wrapper would have dropped everything but
		// Message.
		Expect(got.GetPromptTokens()).To(BeEquivalentTo(5))
		Expect(got.GetTokens()).To(BeEquivalentTo(7))
		deltas := got.GetChatDeltas()
		Expect(deltas).To(HaveLen(1))
		toolCalls := deltas[0].GetToolCalls()
		Expect(toolCalls).To(HaveLen(1))
		Expect(toolCalls[0].GetName()).To(Equal("ping"))
	})

	It("server.PredictStream routes through PredictStreamRich when implemented", func() {
		const address = "test://rich-stream"

		// The hook streams three replies in order: content, a tool
		// call, then a token count.
		richStream := func(_ *pb.PredictOptions, out chan<- *pb.Reply) error {
			chunks := []*pb.Reply{
				{
					Message:    []byte("hi"),
					ChatDeltas: []*pb.ChatDelta{{Content: "hi"}},
				},
				{
					ChatDeltas: []*pb.ChatDelta{{ToolCalls: []*pb.ToolCallDelta{{
						Index: 0, Id: "call_x", Name: "search",
					}}}},
				},
				{Tokens: 9},
			}
			for _, chunk := range chunks {
				out <- chunk
			}
			return nil
		}
		Provide(address, &richBackend{predictStreamRich: richStream})

		client := NewClient(address, true, nil, false)
		var received []*pb.Reply
		onReply := func(reply *pb.Reply) {
			received = append(received, reply)
		}
		err := client.PredictStream(context.Background(), &pb.PredictOptions{}, onReply)
		Expect(err).NotTo(HaveOccurred())
		Expect(received).To(HaveLen(3))

		first, second, third := received[0], received[1], received[2]
		Expect(string(first.GetMessage())).To(Equal("hi"))
		secondDeltas := second.GetChatDeltas()
		Expect(secondDeltas).To(HaveLen(1))
		Expect(secondDeltas[0].GetToolCalls()).To(HaveLen(1))
		Expect(third.GetTokens()).To(BeEquivalentTo(9))
	})

	It("falls back to legacy Predict when AIModelRich is not implemented", func() {
		// A non-Rich model: base.SingleThread embedded in a minimal
		// wrapper. The legacy wrapper path stringifies the reply, so
		// ChatDeltas are lost — this fallback is the contract for
		// backends that haven't migrated.
		const address = "test://legacy-predict"
		Provide(address, &legacyOnlyBackend{response: "legacy hello"})

		client := NewClient(address, true, nil, false)
		got, err := client.Predict(context.Background(), &pb.PredictOptions{})
		Expect(err).NotTo(HaveOccurred())
		Expect(string(got.GetMessage())).To(Equal("legacy hello"))
		Expect(got.GetChatDeltas()).To(BeEmpty())
	})
})
// legacyOnlyBackend is a test double that implements AIModel but
// deliberately NOT AIModelRich.
type legacyOnlyBackend struct {
	base.SingleThread

	// response is the text its Predict method returns.
	response string
}
// Predict ignores the request options and returns the configured
// response text with a nil error.
func (b *legacyOnlyBackend) Predict(_ *pb.PredictOptions) (string, error) {
	text := b.response
	return text, nil
}