mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-16 04:38:50 -04:00
feat(depth): typed Depth RPC + REST endpoint exposing full DA3 data
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -24,6 +24,7 @@ service Backend {
|
||||
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
|
||||
rpc Status(HealthMessage) returns (StatusResponse) {}
|
||||
rpc Detect(DetectOptions) returns (DetectResponse) {}
|
||||
rpc Depth(DepthRequest) returns (DepthResponse) {}
|
||||
rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
|
||||
rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
|
||||
rpc VoiceVerify(VoiceVerifyRequest) returns (VoiceVerifyResponse) {}
|
||||
@@ -670,6 +671,35 @@ message DetectResponse {
|
||||
repeated Detection Detections = 1;
|
||||
}
|
||||
|
||||
// --- Depth estimation messages (Depth Anything 3) ---
|
||||
|
||||
message DepthRequest {
|
||||
string src = 1; // input image (filesystem path or base64-encoded payload)
|
||||
string dst = 2; // optional output directory for exports (glb/colmap)
|
||||
bool include_depth = 3; // return the per-pixel metric depth map
|
||||
bool include_confidence = 4; // return the per-pixel confidence map (DualDPT)
|
||||
bool include_pose = 5; // return camera extrinsics/intrinsics (DualDPT)
|
||||
bool include_sky = 6; // return the per-pixel sky map (mono models)
|
||||
bool include_points = 7; // back-project to a 3D point cloud (DualDPT)
|
||||
float points_conf_thresh = 8; // keep points with confidence >= this threshold
|
||||
repeated string exports = 9; // requested exports: "glb", "colmap"
|
||||
}
|
||||
|
||||
message DepthResponse {
|
||||
int32 width = 1; // processed depth-map width
|
||||
int32 height = 2; // processed depth-map height
|
||||
repeated float depth = 3; // width*height row-major metric depth
|
||||
repeated float confidence = 4; // width*height row-major confidence (DualDPT)
|
||||
repeated float sky = 5; // width*height row-major sky map (mono)
|
||||
repeated float extrinsics = 6; // 12 floats, 3x4 row-major (world-to-camera)
|
||||
repeated float intrinsics = 7; // 9 floats, 3x3 row-major
|
||||
int32 num_points = 8; // number of 3D points
|
||||
repeated float points = 9; // num_points*3 xyz, world space
|
||||
bytes point_colors = 10; // num_points*3 uint8 rgb
|
||||
repeated string export_paths = 11; // paths written for the requested exports
|
||||
bool is_metric = 12; // depth is in metric units
|
||||
}
|
||||
|
||||
// --- Face recognition messages ---
|
||||
|
||||
message FacialArea {
|
||||
|
||||
@@ -49,6 +49,26 @@ var (
|
||||
CapiFreeFloats func(p *float32)
|
||||
// da_capi_pose_path(ctx, image_path, out_ext[12], out_intr[9]) -> 0 ok, -1 err
|
||||
CapiPosePath func(handle uintptr, imagePath string, outExt *float32, outIntr *float32) int32
|
||||
// da_capi_depth_dense(ctx, image_path, out_h*, out_w*, out_depth**, out_conf**,
|
||||
// out_sky**, out_ext[12], out_intr[9], out_is_metric*) -> 0 ok, -1 err.
|
||||
// Each non-NULL out_depth/out_conf/out_sky receives a malloc'd float[H*W] (free
|
||||
// via da_capi_free_floats); buffers the model doesn't produce are set NULL.
|
||||
CapiDepthDense func(handle uintptr, imagePath string,
|
||||
outH, outW *int32,
|
||||
outDepth, outConf, outSky **float32,
|
||||
outExt, outIntr *float32,
|
||||
outIsMetric *int32) int32
|
||||
// da_capi_points(ctx, image_path, conf_thresh, out_n*, out_xyz**, out_rgb**) ->
|
||||
// 0 ok, -1 err. *out_xyz = malloc'd float[3*N] (free via da_capi_free_floats),
|
||||
// *out_rgb = malloc'd uint8[3*N] (free via da_capi_free_bytes).
|
||||
CapiPoints func(handle uintptr, imagePath string, confThresh float32,
|
||||
outN *int32, outXyz **float32, outRgb **byte) int32
|
||||
// da_capi_free_bytes(unsigned char* p)
|
||||
CapiFreeBytes func(p *byte)
|
||||
// da_capi_export_glb(ctx, image_path, out_glb) -> 0 ok, -1 err
|
||||
CapiExportGlb func(handle uintptr, imagePath string, outGlb string) int32
|
||||
// da_capi_export_colmap(ctx, image_path, out_dir, binary) -> 0 ok, -1 err
|
||||
CapiExportColmap func(handle uintptr, imagePath string, outDir string, binary int32) int32
|
||||
)
|
||||
|
||||
type DepthAnythingCpp struct {
|
||||
@@ -168,6 +188,188 @@ func (r *DepthAnythingCpp) GenerateImage(req *pb.GenerateImageRequest) error {
|
||||
return writeDepthPNG(req.GetDst(), depth, h, w)
|
||||
}
|
||||
|
||||
// Depth is the typed Depth RPC. It runs the Depth Anything 3 pipeline on the
|
||||
// request's src image and fills a DepthResponse honoring the include_* flags and
|
||||
// exports: per-pixel metric depth + confidence (DualDPT) or depth + sky (mono),
|
||||
// camera extrinsics/intrinsics, an optional back-projected 3D point cloud and
|
||||
// glb/COLMAP exports. The src may be a filesystem path or a base64 payload.
|
||||
func (r *DepthAnythingCpp) Depth(in *pb.DepthRequest) (pb.DepthResponse, error) {
|
||||
// Accumulate into locals and return a single composite literal at the end:
|
||||
// returning a named pb.DepthResponse value would copy its embedded mutex
|
||||
// (go vet copylocks).
|
||||
if r.handle == 0 {
|
||||
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: model not loaded")
|
||||
}
|
||||
if in.GetSrc() == "" {
|
||||
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: Depth requires src")
|
||||
}
|
||||
|
||||
imgPath, cleanup, err := materializeImage(in.GetSrc())
|
||||
if err != nil {
|
||||
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: %w", err)
|
||||
}
|
||||
defer cleanup()
|
||||
|
||||
// Dense per-pixel output + pose. Pass buffer pointers only for the
|
||||
// requested maps so the native side can skip unrequested work; ext/intr
|
||||
// must always point at 12/9 floats per the C ABI.
|
||||
var (
|
||||
h, w, isMetric int32
|
||||
depthPtr, confPtr *float32
|
||||
skyPtr *float32
|
||||
ext [12]float32
|
||||
intr [9]float32
|
||||
pDepth, pConf, pSky **float32
|
||||
)
|
||||
if in.GetIncludeDepth() {
|
||||
pDepth = &depthPtr
|
||||
}
|
||||
if in.GetIncludeConfidence() {
|
||||
pConf = &confPtr
|
||||
}
|
||||
if in.GetIncludeSky() {
|
||||
pSky = &skyPtr
|
||||
}
|
||||
|
||||
rc := CapiDepthDense(r.handle, imgPath, &h, &w, pDepth, pConf, pSky, &ext[0], &intr[0], &isMetric)
|
||||
if rc != 0 {
|
||||
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: da_capi_depth_dense failed (rc=%d): %s", rc, r.lastError())
|
||||
}
|
||||
|
||||
n := int(h) * int(w)
|
||||
var (
|
||||
depth, conf, sky []float32
|
||||
extrinsics, intrinsic []float32
|
||||
numPoints int32
|
||||
points []float32
|
||||
pointColors []byte
|
||||
exportPaths []string
|
||||
)
|
||||
|
||||
if depthPtr != nil {
|
||||
depth = copyFloats(depthPtr, n)
|
||||
CapiFreeFloats(depthPtr)
|
||||
}
|
||||
if confPtr != nil {
|
||||
conf = copyFloats(confPtr, n)
|
||||
CapiFreeFloats(confPtr)
|
||||
}
|
||||
if skyPtr != nil {
|
||||
sky = copyFloats(skyPtr, n)
|
||||
CapiFreeFloats(skyPtr)
|
||||
}
|
||||
if in.GetIncludePose() {
|
||||
extrinsics = append([]float32(nil), ext[:]...)
|
||||
intrinsic = append([]float32(nil), intr[:]...)
|
||||
}
|
||||
|
||||
// 3D point cloud (DualDPT / pose-capable models only).
|
||||
if in.GetIncludePoints() {
|
||||
var (
|
||||
np int32
|
||||
xyzPtr *float32
|
||||
rgbPtr *byte
|
||||
)
|
||||
if rc := CapiPoints(r.handle, imgPath, in.GetPointsConfThresh(), &np, &xyzPtr, &rgbPtr); rc != 0 {
|
||||
return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: da_capi_points failed (rc=%d): %s", rc, r.lastError())
|
||||
}
|
||||
numPoints = np
|
||||
if xyzPtr != nil {
|
||||
points = copyFloats(xyzPtr, int(np)*3)
|
||||
CapiFreeFloats(xyzPtr)
|
||||
}
|
||||
if rgbPtr != nil {
|
||||
pointColors = copyBytes(rgbPtr, int(np)*3)
|
||||
CapiFreeBytes(rgbPtr)
|
||||
}
|
||||
}
|
||||
|
||||
// Exports (glb / colmap). They are written under in.Dst (a directory); a
|
||||
// temp dir is used when Dst is empty.
|
||||
if len(in.GetExports()) > 0 {
|
||||
exportPaths, err = r.runExports(imgPath, in.GetDst(), in.GetExports())
|
||||
if err != nil {
|
||||
return pb.DepthResponse{}, err
|
||||
}
|
||||
}
|
||||
|
||||
return pb.DepthResponse{
|
||||
Width: w,
|
||||
Height: h,
|
||||
Depth: depth,
|
||||
Confidence: conf,
|
||||
Sky: sky,
|
||||
Extrinsics: extrinsics,
|
||||
Intrinsics: intrinsic,
|
||||
NumPoints: numPoints,
|
||||
Points: points,
|
||||
PointColors: pointColors,
|
||||
ExportPaths: exportPaths,
|
||||
IsMetric: isMetric != 0,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// runExports writes the requested exports for imgPath into dstDir and returns
|
||||
// the written paths. Supported exports: "glb", "colmap".
|
||||
func (r *DepthAnythingCpp) runExports(imgPath, dstDir string, exports []string) ([]string, error) {
|
||||
if dstDir == "" {
|
||||
tmp, err := os.MkdirTemp("", "depth-anything-export-*")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("depth-anything-cpp: mkdir export dir: %w", err)
|
||||
}
|
||||
dstDir = tmp
|
||||
} else if err := os.MkdirAll(dstDir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("depth-anything-cpp: mkdir %s: %w", dstDir, err)
|
||||
}
|
||||
|
||||
var paths []string
|
||||
for _, exp := range exports {
|
||||
switch exp {
|
||||
case "glb":
|
||||
out := filepath.Join(dstDir, "pointcloud.glb")
|
||||
if rc := CapiExportGlb(r.handle, imgPath, out); rc != 0 {
|
||||
return nil, fmt.Errorf("depth-anything-cpp: da_capi_export_glb failed (rc=%d): %s", rc, r.lastError())
|
||||
}
|
||||
paths = append(paths, out)
|
||||
case "colmap":
|
||||
out := filepath.Join(dstDir, "colmap")
|
||||
if err := os.MkdirAll(out, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("depth-anything-cpp: mkdir %s: %w", out, err)
|
||||
}
|
||||
if rc := CapiExportColmap(r.handle, imgPath, out, 1); rc != 0 {
|
||||
return nil, fmt.Errorf("depth-anything-cpp: da_capi_export_colmap failed (rc=%d): %s", rc, r.lastError())
|
||||
}
|
||||
paths = append(paths, out)
|
||||
default:
|
||||
return nil, fmt.Errorf("depth-anything-cpp: unknown export %q (want glb|colmap)", exp)
|
||||
}
|
||||
}
|
||||
return paths, nil
|
||||
}
|
||||
|
||||
// copyFloats returns a Go-owned copy of the n float32 values starting at p.
// The result shares no memory with the source, so a buffer allocated on the C
// heap can be released as soon as this function returns. A nil p or a
// non-positive n yields a nil slice.
func copyFloats(p *float32, n int) []float32 {
	if n > 0 && p != nil {
		dup := make([]float32, n)
		copy(dup, unsafe.Slice(p, n))
		return dup
	}
	return nil
}
|
||||
|
||||
// copyBytes returns a Go-owned copy of the n bytes starting at p. The result
// shares no memory with the source, so a buffer allocated on the C heap can be
// released as soon as this function returns. A nil p or a non-positive n
// yields a nil slice.
func copyBytes(p *byte, n int) []byte {
	if n > 0 && p != nil {
		dup := make([]byte, n)
		copy(dup, unsafe.Slice(p, n))
		return dup
	}
	return nil
}
|
||||
|
||||
// runDepthPose runs depth estimation then pose recovery on an image file. It
|
||||
// returns the row-major depth map (length h*w), its dimensions, the 3x4
|
||||
// extrinsics (12 floats) and 3x3 intrinsics (9 floats).
|
||||
|
||||
@@ -42,6 +42,11 @@ func main() {
|
||||
{&CapiDepthPath, "da_capi_depth_path"},
|
||||
{&CapiFreeFloats, "da_capi_free_floats"},
|
||||
{&CapiPosePath, "da_capi_pose_path"},
|
||||
{&CapiDepthDense, "da_capi_depth_dense"},
|
||||
{&CapiPoints, "da_capi_points"},
|
||||
{&CapiFreeBytes, "da_capi_free_bytes"},
|
||||
{&CapiExportGlb, "da_capi_export_glb"},
|
||||
{&CapiExportColmap, "da_capi_export_colmap"},
|
||||
}
|
||||
|
||||
for _, lf := range libFuncs {
|
||||
|
||||
66
core/backend/depth.go
Normal file
66
core/backend/depth.go
Normal file
@@ -0,0 +1,66 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/trace"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
// Depth runs depth estimation (Depth Anything 3) on the supplied image and
|
||||
// returns the full DepthResponse: per-pixel metric depth + confidence + sky,
|
||||
// camera pose (extrinsics/intrinsics), an optional 3D point cloud and any
|
||||
// requested exports (glb/colmap). The include_* flags and exports mirror the
|
||||
// DepthRequest proto so callers can ask for less work.
|
||||
func Depth(
|
||||
ctx context.Context,
|
||||
in *proto.DepthRequest,
|
||||
loader *model.ModelLoader,
|
||||
appConfig *config.ApplicationConfig,
|
||||
modelConfig config.ModelConfig,
|
||||
) (*proto.DepthResponse, error) {
|
||||
opts := ModelOptions(modelConfig, appConfig)
|
||||
depthModel, err := loader.Load(opts...)
|
||||
if err != nil {
|
||||
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if depthModel == nil {
|
||||
return nil, fmt.Errorf("could not load depth model")
|
||||
}
|
||||
|
||||
var startTime time.Time
|
||||
if appConfig.EnableTracing {
|
||||
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
|
||||
startTime = time.Now()
|
||||
}
|
||||
|
||||
res, err := depthModel.Depth(ctx, in)
|
||||
|
||||
if appConfig.EnableTracing {
|
||||
errStr := ""
|
||||
if err != nil {
|
||||
errStr = err.Error()
|
||||
}
|
||||
|
||||
trace.RecordBackendTrace(trace.BackendTrace{
|
||||
Timestamp: startTime,
|
||||
Duration: time.Since(startTime),
|
||||
Type: trace.BackendTraceDepth,
|
||||
ModelName: modelConfig.Name,
|
||||
Backend: modelConfig.Backend,
|
||||
Summary: trace.TruncateString(in.GetSrc(), 200),
|
||||
Error: errStr,
|
||||
Data: map[string]any{
|
||||
"exports": in.GetExports(),
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return res, err
|
||||
}
|
||||
@@ -21,6 +21,7 @@ const (
|
||||
UsecaseSoundGeneration = "sound_generation"
|
||||
UsecaseRerank = "rerank"
|
||||
UsecaseDetection = "detection"
|
||||
UsecaseDepth = "depth"
|
||||
UsecaseVAD = "vad"
|
||||
UsecaseAudioTransform = "audio_transform"
|
||||
UsecaseDiarization = "diarization"
|
||||
@@ -44,6 +45,7 @@ const (
|
||||
MethodSoundGeneration GRPCMethod = "SoundGeneration"
|
||||
MethodTokenizeString GRPCMethod = "TokenizeString"
|
||||
MethodDetect GRPCMethod = "Detect"
|
||||
MethodDepth GRPCMethod = "Depth"
|
||||
MethodRerank GRPCMethod = "Rerank"
|
||||
MethodVAD GRPCMethod = "VAD"
|
||||
MethodAudioTransform GRPCMethod = "AudioTransform"
|
||||
@@ -141,6 +143,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
|
||||
GRPCMethod: MethodDetect,
|
||||
Description: "Object detection via the Detect RPC with bounding boxes.",
|
||||
},
|
||||
UsecaseDepth: {
|
||||
Flag: FLAG_DEPTH,
|
||||
GRPCMethod: MethodDepth,
|
||||
Description: "Per-pixel metric depth, camera pose and 3D point cloud via the Depth RPC (Depth Anything 3).",
|
||||
},
|
||||
UsecaseVAD: {
|
||||
Flag: FLAG_VAD,
|
||||
GRPCMethod: MethodVAD,
|
||||
@@ -488,6 +495,13 @@ var BackendCapabilities = map[string]BackendCapability{
|
||||
DefaultUsecases: []string{UsecaseDetection},
|
||||
Description: "RF-DETR C++ object detection",
|
||||
},
|
||||
"depth-anything": {
|
||||
GRPCMethods: []GRPCMethod{MethodDepth, MethodPredict, MethodGenerateImage},
|
||||
PossibleUsecases: []string{UsecaseDepth},
|
||||
DefaultUsecases: []string{UsecaseDepth},
|
||||
AcceptsImages: true,
|
||||
Description: "Depth Anything 3 C++ — per-pixel metric depth, camera pose and 3D point cloud",
|
||||
},
|
||||
|
||||
// --- Face and speaker recognition backends ---
|
||||
"insightface": {
|
||||
|
||||
@@ -64,6 +64,7 @@ var UsecaseOptions = []FieldOption{
|
||||
{Value: "image", Label: "Image"},
|
||||
{Value: "vision", Label: "Vision"},
|
||||
{Value: "detection", Label: "Detection"},
|
||||
{Value: "depth", Label: "Depth"},
|
||||
{Value: "face_recognition", Label: "Face Recognition"},
|
||||
{Value: "transcript", Label: "Transcript"},
|
||||
{Value: "diarization", Label: "Diarization"},
|
||||
|
||||
@@ -1291,6 +1291,10 @@ const (
|
||||
// chat/completion/embeddings.
|
||||
FLAG_SCORE ModelConfigUsecase = 0b10000000000000000000
|
||||
|
||||
// Marks a model as wired for the Depth gRPC primitive (per-pixel
|
||||
// metric depth + camera pose + 3D point cloud via Depth Anything 3).
|
||||
FLAG_DEPTH ModelConfigUsecase = 0b100000000000000000000
|
||||
|
||||
// Common Subsets
|
||||
FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
|
||||
)
|
||||
@@ -1348,6 +1352,7 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
|
||||
"FLAG_DIARIZATION": FLAG_DIARIZATION,
|
||||
"FLAG_REALTIME_AUDIO": FLAG_REALTIME_AUDIO,
|
||||
"FLAG_SCORE": FLAG_SCORE,
|
||||
"FLAG_DEPTH": FLAG_DEPTH,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1491,6 +1496,13 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
|
||||
}
|
||||
}
|
||||
|
||||
if (u & FLAG_DEPTH) == FLAG_DEPTH {
|
||||
depthBackends := []string{"depth-anything"}
|
||||
if !slices.Contains(depthBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if (u & FLAG_FACE_RECOGNITION) == FLAG_FACE_RECOGNITION {
|
||||
faceBackends := []string{"insightface"}
|
||||
if !slices.Contains(faceBackends, c.Backend) {
|
||||
|
||||
95
core/http/endpoints/localai/depth.go
Normal file
95
core/http/endpoints/localai/depth.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package localai
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/http/middleware"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// DepthEndpoint is the LocalAI Depth endpoint exposing the full Depth Anything 3
|
||||
// output (per-pixel metric depth + confidence + sky, camera pose, 3D point cloud
|
||||
// and optional glb/COLMAP exports).
|
||||
// @Summary Estimates per-pixel depth (and optionally pose/points) from an image.
|
||||
// @Tags depth
|
||||
// @Param request body schema.DepthRequest true "query params"
|
||||
// @Success 200 {object} schema.DepthResponse "Response"
|
||||
// @Router /v1/depth [post]
|
||||
func DepthEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
|
||||
input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.DepthRequest)
|
||||
if !ok || input.Model == "" {
|
||||
return echo.ErrBadRequest
|
||||
}
|
||||
|
||||
cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
|
||||
if !ok || cfg == nil {
|
||||
return echo.ErrBadRequest
|
||||
}
|
||||
|
||||
xlog.Debug("Depth", "image", input.Image, "backend", cfg.Backend)
|
||||
|
||||
image, err := decodeImageInput(input.Image)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Default to returning everything the model can produce when the
|
||||
// caller hasn't asked for any specific subset, so a bare request is
|
||||
// still useful.
|
||||
includeDepth := input.IncludeDepth
|
||||
includeConfidence := input.IncludeConfidence
|
||||
includePose := input.IncludePose
|
||||
includeSky := input.IncludeSky
|
||||
includePoints := input.IncludePoints
|
||||
if !includeDepth && !includeConfidence && !includePose && !includeSky && !includePoints {
|
||||
includeDepth = true
|
||||
includeConfidence = true
|
||||
includePose = true
|
||||
includeSky = true
|
||||
}
|
||||
|
||||
req := &proto.DepthRequest{
|
||||
Src: image,
|
||||
Dst: input.Dst,
|
||||
IncludeDepth: includeDepth,
|
||||
IncludeConfidence: includeConfidence,
|
||||
IncludePose: includePose,
|
||||
IncludeSky: includeSky,
|
||||
IncludePoints: includePoints,
|
||||
PointsConfThresh: input.PointsConfThresh,
|
||||
Exports: input.Exports,
|
||||
}
|
||||
|
||||
res, err := backend.Depth(c.Request().Context(), req, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return mapBackendError(err)
|
||||
}
|
||||
|
||||
response := schema.DepthResponse{
|
||||
Width: res.GetWidth(),
|
||||
Height: res.GetHeight(),
|
||||
Depth: res.GetDepth(),
|
||||
Confidence: res.GetConfidence(),
|
||||
Sky: res.GetSky(),
|
||||
Extrinsics: res.GetExtrinsics(),
|
||||
Intrinsics: res.GetIntrinsics(),
|
||||
NumPoints: res.GetNumPoints(),
|
||||
Points: res.GetPoints(),
|
||||
ExportPaths: res.GetExportPaths(),
|
||||
IsMetric: res.GetIsMetric(),
|
||||
}
|
||||
if len(res.GetPointColors()) > 0 {
|
||||
response.PointColors = base64.StdEncoding.EncodeToString(res.GetPointColors())
|
||||
}
|
||||
|
||||
return c.JSON(200, response)
|
||||
}
|
||||
}
|
||||
@@ -98,6 +98,12 @@ func RegisterLocalAIRoutes(router *echo.Echo,
|
||||
requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_DETECTION)),
|
||||
requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.DetectionRequest) }))
|
||||
|
||||
depthHandler := localai.DepthEndpoint(cl, ml, appConfig)
|
||||
router.POST("/v1/depth",
|
||||
depthHandler,
|
||||
requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_DEPTH)),
|
||||
requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.DepthRequest) }))
|
||||
|
||||
// Face recognition endpoints
|
||||
faceMw := []echo.MiddlewareFunc{
|
||||
requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_FACE_RECOGNITION)),
|
||||
|
||||
@@ -181,6 +181,40 @@ type Detection struct {
|
||||
Mask string `json:"mask,omitempty"` // base64-encoded PNG segmentation mask
|
||||
}
|
||||
|
||||
// DepthRequest is the request body for the /v1/depth endpoint. It exposes the
|
||||
// full Depth Anything 3 output surface; the include_* flags and exports let a
|
||||
// caller ask for less work (e.g. depth only, or depth+pose without the point
|
||||
// cloud).
|
||||
type DepthRequest struct {
|
||||
BasicModelRequest
|
||||
Image string `json:"image"` // URL or base64-encoded image to analyze
|
||||
Dst string `json:"dst,omitempty"` // optional output directory for exports (glb/colmap)
|
||||
IncludeDepth bool `json:"include_depth,omitempty"` // return the per-pixel depth map
|
||||
IncludeConfidence bool `json:"include_confidence,omitempty"` // return the per-pixel confidence map (DualDPT)
|
||||
IncludePose bool `json:"include_pose,omitempty"` // return camera extrinsics/intrinsics (DualDPT)
|
||||
IncludeSky bool `json:"include_sky,omitempty"` // return the per-pixel sky map (mono models)
|
||||
IncludePoints bool `json:"include_points,omitempty"` // back-project to a 3D point cloud (DualDPT)
|
||||
PointsConfThresh float32 `json:"points_conf_thresh,omitempty"` // keep points with confidence >= this threshold
|
||||
Exports []string `json:"exports,omitempty"` // requested exports: "glb", "colmap"
|
||||
}
|
||||
|
||||
// DepthResponse is the JSON response for the /v1/depth endpoint, mirroring the
|
||||
// DepthResponse proto.
|
||||
type DepthResponse struct {
|
||||
Width int32 `json:"width"`
|
||||
Height int32 `json:"height"`
|
||||
Depth []float32 `json:"depth,omitempty"` // width*height row-major metric depth
|
||||
Confidence []float32 `json:"confidence,omitempty"` // width*height row-major confidence (DualDPT)
|
||||
Sky []float32 `json:"sky,omitempty"` // width*height row-major sky map (mono)
|
||||
Extrinsics []float32 `json:"extrinsics,omitempty"` // 12 floats, 3x4 row-major (world-to-camera)
|
||||
Intrinsics []float32 `json:"intrinsics,omitempty"` // 9 floats, 3x3 row-major
|
||||
NumPoints int32 `json:"num_points,omitempty"` // number of 3D points
|
||||
Points []float32 `json:"points,omitempty"` // num_points*3 xyz, world space
|
||||
PointColors string `json:"point_colors,omitempty"` // base64-encoded num_points*3 uint8 rgb
|
||||
ExportPaths []string `json:"export_paths,omitempty"` // paths written for the requested exports
|
||||
IsMetric bool `json:"is_metric"` // depth is in metric units
|
||||
}
|
||||
|
||||
// ─── Face recognition ──────────────────────────────────────────────
|
||||
//
|
||||
// FacialArea describes a bounding box for a detected face.
|
||||
|
||||
@@ -169,6 +169,9 @@ func (c *fakeBackendClient) SoundGeneration(_ context.Context, _ *pb.SoundGenera
|
||||
func (c *fakeBackendClient) Detect(_ context.Context, _ *pb.DetectOptions, _ ...ggrpc.CallOption) (*pb.DetectResponse, error) {
|
||||
return nil, nil
|
||||
}
|
||||
// Depth is the fakeBackendClient stub for the Depth RPC: it ignores all of its
// arguments and returns a nil response together with a nil error.
func (c *fakeBackendClient) Depth(_ context.Context, _ *pb.DepthRequest, _ ...ggrpc.CallOption) (*pb.DepthResponse, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (c *fakeBackendClient) FaceVerify(_ context.Context, _ *pb.FaceVerifyRequest, _ ...ggrpc.CallOption) (*pb.FaceVerifyResponse, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@@ -152,6 +152,12 @@ func (c *InFlightTrackingClient) Detect(ctx context.Context, in *pb.DetectOption
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) Depth(ctx context.Context, in *pb.DepthRequest, opts ...ggrpc.CallOption) (*pb.DepthResponse, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.Depth(ctx, in, opts...)
|
||||
return res, c.reconcile(err)
|
||||
}
|
||||
|
||||
func (c *InFlightTrackingClient) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...ggrpc.CallOption) (*pb.RerankResult, error) {
|
||||
defer c.track(ctx)()
|
||||
res, err := c.Backend.Rerank(ctx, in, opts...)
|
||||
|
||||
@@ -100,6 +100,10 @@ func (f *fakeGRPCBackend) Detect(_ context.Context, _ *pb.DetectOptions, _ ...gg
|
||||
return &pb.DetectResponse{}, nil
|
||||
}
|
||||
|
||||
// Depth is the fakeGRPCBackend stub for the Depth RPC: it ignores all of its
// arguments and returns an empty, non-nil DepthResponse with a nil error.
func (f *fakeGRPCBackend) Depth(_ context.Context, _ *pb.DepthRequest, _ ...ggrpc.CallOption) (*pb.DepthResponse, error) {
|
||||
return &pb.DepthResponse{}, nil
|
||||
}
|
||||
|
||||
func (f *fakeGRPCBackend) FaceVerify(_ context.Context, _ *pb.FaceVerifyRequest, _ ...ggrpc.CallOption) (*pb.FaceVerifyResponse, error) {
|
||||
return &pb.FaceVerifyResponse{}, nil
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ const (
|
||||
BackendTraceRerank BackendTraceType = "rerank"
|
||||
BackendTraceTokenize BackendTraceType = "tokenize"
|
||||
BackendTraceDetection BackendTraceType = "detection"
|
||||
BackendTraceDepth BackendTraceType = "depth"
|
||||
BackendTraceFaceVerify BackendTraceType = "face_verify"
|
||||
BackendTraceFaceAnalyze BackendTraceType = "face_analyze"
|
||||
BackendTraceVoiceVerify BackendTraceType = "voice_verify"
|
||||
|
||||
@@ -54,6 +54,7 @@ type Backend interface {
|
||||
TTSStream(ctx context.Context, in *pb.TTSRequest, f func(reply *pb.Reply), opts ...grpc.CallOption) error
|
||||
SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...grpc.CallOption) (*pb.Result, error)
|
||||
Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error)
|
||||
Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error)
|
||||
FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error)
|
||||
FaceAnalyze(ctx context.Context, in *pb.FaceAnalyzeRequest, opts ...grpc.CallOption) (*pb.FaceAnalyzeResponse, error)
|
||||
VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...grpc.CallOption) (*pb.VoiceVerifyResponse, error)
|
||||
|
||||
@@ -82,6 +82,10 @@ func (llm *Base) Detect(*pb.DetectOptions) (pb.DetectResponse, error) {
|
||||
return pb.DetectResponse{}, fmt.Errorf("unimplemented")
|
||||
}
|
||||
|
||||
// Depth is the Base placeholder for the Depth RPC: it always returns an empty
// DepthResponse and an "unimplemented" error.
func (llm *Base) Depth(*pb.DepthRequest) (pb.DepthResponse, error) {
|
||||
return pb.DepthResponse{}, fmt.Errorf("unimplemented")
|
||||
}
|
||||
|
||||
func (llm *Base) FaceVerify(*pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
|
||||
return pb.FaceVerifyResponse{}, fmt.Errorf("unimplemented")
|
||||
}
|
||||
|
||||
@@ -634,6 +634,24 @@ func (c *Client) Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.
|
||||
return client.Detect(ctx, in, opts...)
|
||||
}
|
||||
|
||||
func (c *Client) Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error) {
|
||||
if !c.parallel {
|
||||
c.opMutex.Lock()
|
||||
defer c.opMutex.Unlock()
|
||||
}
|
||||
c.setBusy(true)
|
||||
defer c.setBusy(false)
|
||||
c.wdMark()
|
||||
defer c.wdUnMark()
|
||||
conn, err := c.dial()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer conn.Close()
|
||||
client := pb.NewBackendClient(conn)
|
||||
return client.Depth(ctx, in, opts...)
|
||||
}
|
||||
|
||||
func (c *Client) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error) {
|
||||
if !c.parallel {
|
||||
c.opMutex.Lock()
|
||||
|
||||
@@ -73,6 +73,10 @@ func (e *embedBackend) Detect(ctx context.Context, in *pb.DetectOptions, opts ..
|
||||
return e.s.Detect(ctx, in)
|
||||
}
|
||||
|
||||
// Depth delegates the Depth RPC to e.s.Depth with the same context and
// request. The gRPC call options in opts are accepted but not forwarded.
func (e *embedBackend) Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error) {
|
||||
return e.s.Depth(ctx, in)
|
||||
}
|
||||
|
||||
func (e *embedBackend) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error) {
|
||||
return e.s.FaceVerify(ctx, in)
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ type AIModel interface {
|
||||
GenerateImage(*pb.GenerateImageRequest) error
|
||||
GenerateVideo(*pb.GenerateVideoRequest) error
|
||||
Detect(*pb.DetectOptions) (pb.DetectResponse, error)
|
||||
Depth(*pb.DepthRequest) (pb.DepthResponse, error)
|
||||
FaceVerify(*pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error)
|
||||
FaceAnalyze(*pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error)
|
||||
VoiceVerify(*pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error)
|
||||
|
||||
@@ -156,6 +156,18 @@ func (s *server) Detect(ctx context.Context, in *pb.DetectOptions) (*pb.DetectRe
|
||||
return &res, nil
|
||||
}
|
||||
|
||||
func (s *server) Depth(ctx context.Context, in *pb.DepthRequest) (*pb.DepthResponse, error) {
|
||||
if s.llm.Locking() {
|
||||
s.llm.Lock()
|
||||
defer s.llm.Unlock()
|
||||
}
|
||||
res, err := s.llm.Depth(in)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &res, nil
|
||||
}
|
||||
|
||||
func (s *server) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest) (*pb.FaceVerifyResponse, error) {
|
||||
if s.llm.Locking() {
|
||||
s.llm.Lock()
|
||||
|
||||
@@ -108,6 +108,12 @@ func (c *ConnectionEvictingClient) Detect(ctx context.Context, in *pb.DetectOpti
|
||||
return result, err
|
||||
}
|
||||
|
||||
func (c *ConnectionEvictingClient) Depth(ctx context.Context, in *pb.DepthRequest, opts ...ggrpc.CallOption) (*pb.DepthResponse, error) {
|
||||
result, err := c.Backend.Depth(ctx, in, opts...)
|
||||
c.checkErr(err)
|
||||
return result, err
|
||||
}
|
||||
|
||||
func (c *ConnectionEvictingClient) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...ggrpc.CallOption) (*pb.RerankResult, error) {
|
||||
result, err := c.Backend.Rerank(ctx, in, opts...)
|
||||
c.checkErr(err)
|
||||
|
||||
Reference in New Issue
Block a user