diff --git a/backend/backend.proto b/backend/backend.proto index 3dca83878..68db81e35 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -24,6 +24,7 @@ service Backend { rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {} rpc Status(HealthMessage) returns (StatusResponse) {} rpc Detect(DetectOptions) returns (DetectResponse) {} + rpc Depth(DepthRequest) returns (DepthResponse) {} rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {} rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {} rpc VoiceVerify(VoiceVerifyRequest) returns (VoiceVerifyResponse) {} @@ -670,6 +671,35 @@ message DetectResponse { repeated Detection Detections = 1; } +// --- Depth estimation messages (Depth Anything 3) --- + +message DepthRequest { + string src = 1; // input image (filesystem path or base64-encoded payload) + string dst = 2; // optional output directory for exports (glb/colmap) + bool include_depth = 3; // return the per-pixel metric depth map + bool include_confidence = 4; // return the per-pixel confidence map (DualDPT) + bool include_pose = 5; // return camera extrinsics/intrinsics (DualDPT) + bool include_sky = 6; // return the per-pixel sky map (mono models) + bool include_points = 7; // back-project to a 3D point cloud (DualDPT) + float points_conf_thresh = 8; // keep points with confidence >= this threshold + repeated string exports = 9; // requested exports: "glb", "colmap" +} + +message DepthResponse { + int32 width = 1; // processed depth-map width + int32 height = 2; // processed depth-map height + repeated float depth = 3; // width*height row-major metric depth + repeated float confidence = 4; // width*height row-major confidence (DualDPT) + repeated float sky = 5; // width*height row-major sky map (mono) + repeated float extrinsics = 6; // 12 floats, 3x4 row-major (world-to-camera) + repeated float intrinsics = 7; // 9 floats, 3x3 row-major + int32 num_points = 8; // number of 3D points + repeated float points 
= 9; // num_points*3 xyz, world space + bytes point_colors = 10; // num_points*3 uint8 rgb + repeated string export_paths = 11; // paths written for the requested exports + bool is_metric = 12; // depth is in metric units +} + // --- Face recognition messages --- message FacialArea { diff --git a/backend/go/depth-anything-cpp/godepthanythingcpp.go b/backend/go/depth-anything-cpp/godepthanythingcpp.go index f21fdb228..ae5b0d98b 100644 --- a/backend/go/depth-anything-cpp/godepthanythingcpp.go +++ b/backend/go/depth-anything-cpp/godepthanythingcpp.go @@ -49,6 +49,26 @@ var ( CapiFreeFloats func(p *float32) // da_capi_pose_path(ctx, image_path, out_ext[12], out_intr[9]) -> 0 ok, -1 err CapiPosePath func(handle uintptr, imagePath string, outExt *float32, outIntr *float32) int32 + // da_capi_depth_dense(ctx, image_path, out_h*, out_w*, out_depth**, out_conf**, + // out_sky**, out_ext[12], out_intr[9], out_is_metric*) -> 0 ok, -1 err. + // Each non-NULL out_depth/out_conf/out_sky receives a malloc'd float[H*W] (free + // via da_capi_free_floats); buffers the model doesn't produce are set NULL. + CapiDepthDense func(handle uintptr, imagePath string, + outH, outW *int32, + outDepth, outConf, outSky **float32, + outExt, outIntr *float32, + outIsMetric *int32) int32 + // da_capi_points(ctx, image_path, conf_thresh, out_n*, out_xyz**, out_rgb**) -> + // 0 ok, -1 err. *out_xyz = malloc'd float[3*N] (free via da_capi_free_floats), + // *out_rgb = malloc'd uint8[3*N] (free via da_capi_free_bytes). 
+	CapiPoints func(handle uintptr, imagePath string, confThresh float32,
+		outN *int32, outXyz **float32, outRgb **byte) int32
+	// da_capi_free_bytes(unsigned char* p)
+	CapiFreeBytes func(p *byte)
+	// da_capi_export_glb(ctx, image_path, out_glb) -> 0 ok, -1 err
+	CapiExportGlb func(handle uintptr, imagePath string, outGlb string) int32
+	// da_capi_export_colmap(ctx, image_path, out_dir, binary) -> 0 ok, -1 err
+	CapiExportColmap func(handle uintptr, imagePath string, outDir string, binary int32) int32
 )
 
 type DepthAnythingCpp struct {
@@ -168,6 +188,202 @@ func (r *DepthAnythingCpp) GenerateImage(req *pb.GenerateImageRequest) error {
 	return writeDepthPNG(req.GetDst(), depth, h, w)
 }
 
+// Depth is the typed Depth RPC. It runs the Depth Anything 3 pipeline on the
+// request's src image and fills a DepthResponse honoring the include_* flags and
+// exports: per-pixel metric depth + confidence (DualDPT) or depth + sky (mono),
+// camera extrinsics/intrinsics, an optional back-projected 3D point cloud and
+// glb/COLMAP exports. The src may be a filesystem path or a base64 payload.
+func (r *DepthAnythingCpp) Depth(in *pb.DepthRequest) (pb.DepthResponse, error) {
+	// Accumulate into locals and return a single composite literal at the end:
+	// returning a named pb.DepthResponse value would copy its embedded mutex
+	// (go vet copylocks).
+	if r.handle == 0 {
+		return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: model not loaded")
+	}
+	if in.GetSrc() == "" {
+		return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: Depth requires src")
+	}
+
+	imgPath, cleanup, err := materializeImage(in.GetSrc())
+	if err != nil {
+		return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: %w", err)
+	}
+	defer cleanup()
+
+	// Dense per-pixel output + pose. Pass buffer pointers only for the
+	// requested maps so the native side can skip unrequested work; ext/intr
+	// must always point at 12/9 floats per the C ABI.
+	var (
+		h, w, isMetric      int32
+		depthPtr, confPtr   *float32
+		skyPtr              *float32
+		ext                 [12]float32
+		intr                [9]float32
+		pDepth, pConf, pSky **float32
+	)
+	if in.GetIncludeDepth() {
+		pDepth = &depthPtr
+	}
+	if in.GetIncludeConfidence() {
+		pConf = &confPtr
+	}
+	if in.GetIncludeSky() {
+		pSky = &skyPtr
+	}
+
+	rc := CapiDepthDense(r.handle, imgPath, &h, &w, pDepth, pConf, pSky, &ext[0], &intr[0], &isMetric)
+	if rc != 0 {
+		return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: da_capi_depth_dense failed (rc=%d): %s", rc, r.lastError())
+	}
+
+	n := int(h) * int(w)
+	var (
+		depth, conf, sky       []float32
+		extrinsics, intrinsics []float32
+		numPoints              int32
+		points                 []float32
+		pointColors            []byte
+		exportPaths            []string
+	)
+
+	if depthPtr != nil {
+		depth = copyFloats(depthPtr, n)
+		CapiFreeFloats(depthPtr)
+	}
+	if confPtr != nil {
+		conf = copyFloats(confPtr, n)
+		CapiFreeFloats(confPtr)
+	}
+	if skyPtr != nil {
+		sky = copyFloats(skyPtr, n)
+		CapiFreeFloats(skyPtr)
+	}
+	if in.GetIncludePose() {
+		extrinsics = append([]float32(nil), ext[:]...)
+		intrinsics = append([]float32(nil), intr[:]...)
+	}
+
+	// 3D point cloud (DualDPT / pose-capable models only).
+	if in.GetIncludePoints() {
+		var (
+			np     int32
+			xyzPtr *float32
+			rgbPtr *byte
+		)
+		if rc := CapiPoints(r.handle, imgPath, in.GetPointsConfThresh(), &np, &xyzPtr, &rgbPtr); rc != 0 {
+			return pb.DepthResponse{}, fmt.Errorf("depth-anything-cpp: da_capi_points failed (rc=%d): %s", rc, r.lastError())
+		}
+		numPoints = np
+		if xyzPtr != nil {
+			points = copyFloats(xyzPtr, int(np)*3)
+			CapiFreeFloats(xyzPtr)
+		}
+		if rgbPtr != nil {
+			pointColors = copyBytes(rgbPtr, int(np)*3)
+			CapiFreeBytes(rgbPtr)
+		}
+	}
+
+	// Exports (glb / colmap). They are written under in.Dst (a directory); a
+	// temp dir is used when Dst is empty.
+	if len(in.GetExports()) > 0 {
+		exportPaths, err = r.runExports(imgPath, in.GetDst(), in.GetExports())
+		if err != nil {
+			return pb.DepthResponse{}, err
+		}
+	}
+
+	return pb.DepthResponse{
+		Width:       w,
+		Height:      h,
+		Depth:       depth,
+		Confidence:  conf,
+		Sky:         sky,
+		Extrinsics:  extrinsics,
+		Intrinsics:  intrinsics,
+		NumPoints:   numPoints,
+		Points:      points,
+		PointColors: pointColors,
+		ExportPaths: exportPaths,
+		IsMetric:    isMetric != 0,
+	}, nil
+}
+
+// runExports writes the requested exports for imgPath into dstDir and returns
+// the written paths. Supported exports: "glb", "colmap". Every export name is
+// validated before anything touches the filesystem, and a temp dir created
+// here (empty dstDir) is removed again when an export fails, so a rejected or
+// failed request leaves nothing behind. A caller-supplied dstDir is kept.
+func (r *DepthAnythingCpp) runExports(imgPath, dstDir string, exports []string) (paths []string, err error) {
+	// Fail fast on unknown names so a request that will be rejected anyway
+	// neither creates directories nor writes earlier exports.
+	for _, exp := range exports {
+		if exp != "glb" && exp != "colmap" {
+			return nil, fmt.Errorf("depth-anything-cpp: unknown export %q (want glb|colmap)", exp)
+		}
+	}
+
+	if dstDir == "" {
+		tmp, mkErr := os.MkdirTemp("", "depth-anything-export-*")
+		if mkErr != nil {
+			return nil, fmt.Errorf("depth-anything-cpp: mkdir export dir: %w", mkErr)
+		}
+		dstDir = tmp
+		// err is the named result, so this runs for every failing return below.
+		defer func() {
+			if err != nil {
+				_ = os.RemoveAll(tmp) // best effort; the export error is what gets reported
+			}
+		}()
+	} else if mkErr := os.MkdirAll(dstDir, 0o755); mkErr != nil {
+		return nil, fmt.Errorf("depth-anything-cpp: mkdir %s: %w", dstDir, mkErr)
+	}
+
+	for _, exp := range exports {
+		switch exp {
+		case "glb":
+			out := filepath.Join(dstDir, "pointcloud.glb")
+			if rc := CapiExportGlb(r.handle, imgPath, out); rc != 0 {
+				return nil, fmt.Errorf("depth-anything-cpp: da_capi_export_glb failed (rc=%d): %s", rc, r.lastError())
+			}
+			paths = append(paths, out)
+		case "colmap":
+			out := filepath.Join(dstDir, "colmap")
+			if mkErr := os.MkdirAll(out, 0o755); mkErr != nil {
+				return nil, fmt.Errorf("depth-anything-cpp: mkdir %s: %w", out, mkErr)
+			}
+			if rc := CapiExportColmap(r.handle, imgPath, out, 1); rc != 0 {
+				return nil, fmt.Errorf("depth-anything-cpp: da_capi_export_colmap failed (rc=%d): %s", rc, r.lastError())
+			}
+			paths = append(paths, out)
+		}
+	}
+	return paths, nil
+}
+
+// copyFloats copies n float32 values from a C heap pointer into a fresh Go
+// slice so the C buffer can be freed 
afterwards. +func copyFloats(p *float32, n int) []float32 { + if p == nil || n <= 0 { + return nil + } + src := unsafe.Slice(p, n) + out := make([]float32, n) + copy(out, src) + return out +} + +// copyBytes copies n bytes from a C heap pointer into a fresh Go slice. +func copyBytes(p *byte, n int) []byte { + if p == nil || n <= 0 { + return nil + } + src := unsafe.Slice(p, n) + out := make([]byte, n) + copy(out, src) + return out +} + // runDepthPose runs depth estimation then pose recovery on an image file. It // returns the row-major depth map (length h*w), its dimensions, the 3x4 // extrinsics (12 floats) and 3x3 intrinsics (9 floats). diff --git a/backend/go/depth-anything-cpp/main.go b/backend/go/depth-anything-cpp/main.go index 2ec0a4ff4..6ba43fbcc 100644 --- a/backend/go/depth-anything-cpp/main.go +++ b/backend/go/depth-anything-cpp/main.go @@ -42,6 +42,11 @@ func main() { {&CapiDepthPath, "da_capi_depth_path"}, {&CapiFreeFloats, "da_capi_free_floats"}, {&CapiPosePath, "da_capi_pose_path"}, + {&CapiDepthDense, "da_capi_depth_dense"}, + {&CapiPoints, "da_capi_points"}, + {&CapiFreeBytes, "da_capi_free_bytes"}, + {&CapiExportGlb, "da_capi_export_glb"}, + {&CapiExportColmap, "da_capi_export_colmap"}, } for _, lf := range libFuncs { diff --git a/core/backend/depth.go b/core/backend/depth.go new file mode 100644 index 000000000..ca41ae233 --- /dev/null +++ b/core/backend/depth.go @@ -0,0 +1,66 @@ +package backend + +import ( + "context" + "fmt" + "time" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" + "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/LocalAI/pkg/model" +) + +// Depth runs depth estimation (Depth Anything 3) on the supplied image and +// returns the full DepthResponse: per-pixel metric depth + confidence + sky, +// camera pose (extrinsics/intrinsics), an optional 3D point cloud and any +// requested exports (glb/colmap). 
The include_* flags and exports mirror the +// DepthRequest proto so callers can ask for less work. +func Depth( + ctx context.Context, + in *proto.DepthRequest, + loader *model.ModelLoader, + appConfig *config.ApplicationConfig, + modelConfig config.ModelConfig, +) (*proto.DepthResponse, error) { + opts := ModelOptions(modelConfig, appConfig) + depthModel, err := loader.Load(opts...) + if err != nil { + recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil) + return nil, err + } + + if depthModel == nil { + return nil, fmt.Errorf("could not load depth model") + } + + var startTime time.Time + if appConfig.EnableTracing { + trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) + startTime = time.Now() + } + + res, err := depthModel.Depth(ctx, in) + + if appConfig.EnableTracing { + errStr := "" + if err != nil { + errStr = err.Error() + } + + trace.RecordBackendTrace(trace.BackendTrace{ + Timestamp: startTime, + Duration: time.Since(startTime), + Type: trace.BackendTraceDepth, + ModelName: modelConfig.Name, + Backend: modelConfig.Backend, + Summary: trace.TruncateString(in.GetSrc(), 200), + Error: errStr, + Data: map[string]any{ + "exports": in.GetExports(), + }, + }) + } + + return res, err +} diff --git a/core/config/backend_capabilities.go b/core/config/backend_capabilities.go index 234873ffa..3e3a02a1a 100644 --- a/core/config/backend_capabilities.go +++ b/core/config/backend_capabilities.go @@ -21,6 +21,7 @@ const ( UsecaseSoundGeneration = "sound_generation" UsecaseRerank = "rerank" UsecaseDetection = "detection" + UsecaseDepth = "depth" UsecaseVAD = "vad" UsecaseAudioTransform = "audio_transform" UsecaseDiarization = "diarization" @@ -44,6 +45,7 @@ const ( MethodSoundGeneration GRPCMethod = "SoundGeneration" MethodTokenizeString GRPCMethod = "TokenizeString" MethodDetect GRPCMethod = "Detect" + MethodDepth GRPCMethod = "Depth" MethodRerank GRPCMethod = "Rerank" MethodVAD GRPCMethod = "VAD" 
MethodAudioTransform GRPCMethod = "AudioTransform" @@ -141,6 +143,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{ GRPCMethod: MethodDetect, Description: "Object detection via the Detect RPC with bounding boxes.", }, + UsecaseDepth: { + Flag: FLAG_DEPTH, + GRPCMethod: MethodDepth, + Description: "Per-pixel metric depth, camera pose and 3D point cloud via the Depth RPC (Depth Anything 3).", + }, UsecaseVAD: { Flag: FLAG_VAD, GRPCMethod: MethodVAD, @@ -488,6 +495,13 @@ var BackendCapabilities = map[string]BackendCapability{ DefaultUsecases: []string{UsecaseDetection}, Description: "RF-DETR C++ object detection", }, + "depth-anything": { + GRPCMethods: []GRPCMethod{MethodDepth, MethodPredict, MethodGenerateImage}, + PossibleUsecases: []string{UsecaseDepth}, + DefaultUsecases: []string{UsecaseDepth}, + AcceptsImages: true, + Description: "Depth Anything 3 C++ — per-pixel metric depth, camera pose and 3D point cloud", + }, // --- Face and speaker recognition backends --- "insightface": { diff --git a/core/config/meta/constants.go b/core/config/meta/constants.go index 9be49fec0..72da2f99a 100644 --- a/core/config/meta/constants.go +++ b/core/config/meta/constants.go @@ -64,6 +64,7 @@ var UsecaseOptions = []FieldOption{ {Value: "image", Label: "Image"}, {Value: "vision", Label: "Vision"}, {Value: "detection", Label: "Detection"}, + {Value: "depth", Label: "Depth"}, {Value: "face_recognition", Label: "Face Recognition"}, {Value: "transcript", Label: "Transcript"}, {Value: "diarization", Label: "Diarization"}, diff --git a/core/config/model_config.go b/core/config/model_config.go index 755280cc3..654bc89d7 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -1291,6 +1291,10 @@ const ( // chat/completion/embeddings. FLAG_SCORE ModelConfigUsecase = 0b10000000000000000000 + // Marks a model as wired for the Depth gRPC primitive (per-pixel + // metric depth + camera pose + 3D point cloud via Depth Anything 3). 
+ FLAG_DEPTH ModelConfigUsecase = 0b100000000000000000000 + // Common Subsets FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT ) @@ -1348,6 +1352,7 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase { "FLAG_DIARIZATION": FLAG_DIARIZATION, "FLAG_REALTIME_AUDIO": FLAG_REALTIME_AUDIO, "FLAG_SCORE": FLAG_SCORE, + "FLAG_DEPTH": FLAG_DEPTH, } } @@ -1491,6 +1496,13 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool { } } + if (u & FLAG_DEPTH) == FLAG_DEPTH { + depthBackends := []string{"depth-anything"} + if !slices.Contains(depthBackends, c.Backend) { + return false + } + } + if (u & FLAG_FACE_RECOGNITION) == FLAG_FACE_RECOGNITION { faceBackends := []string{"insightface"} if !slices.Contains(faceBackends, c.Backend) { diff --git a/core/http/endpoints/localai/depth.go b/core/http/endpoints/localai/depth.go new file mode 100644 index 000000000..a2d116d7d --- /dev/null +++ b/core/http/endpoints/localai/depth.go @@ -0,0 +1,95 @@ +package localai + +import ( + "encoding/base64" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/xlog" +) + +// DepthEndpoint is the LocalAI Depth endpoint exposing the full Depth Anything 3 +// output (per-pixel metric depth + confidence + sky, camera pose, 3D point cloud +// and optional glb/COLMAP exports). +// @Summary Estimates per-pixel depth (and optionally pose/points) from an image. 
+// @Tags depth
+// @Param request body schema.DepthRequest true "query params"
+// @Success 200 {object} schema.DepthResponse "Response"
+// @Router /v1/depth [post]
+func DepthEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+
+		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.DepthRequest)
+		if !ok || input.Model == "" {
+			return echo.ErrBadRequest
+		}
+
+		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+		if !ok || cfg == nil {
+			return echo.ErrBadRequest
+		}
+
+		xlog.Debug("Depth", "image_len", len(input.Image), "backend", cfg.Backend)
+
+		image, err := decodeImageInput(input.Image)
+		if err != nil {
+			return err
+		}
+
+		// With no include_* flag set, default to every dense output (depth,
+		// confidence, pose, sky) so a bare request is still useful. The 3D
+		// point cloud stays opt-in via include_points.
+		includeDepth := input.IncludeDepth
+		includeConfidence := input.IncludeConfidence
+		includePose := input.IncludePose
+		includeSky := input.IncludeSky
+		includePoints := input.IncludePoints
+		if !includeDepth && !includeConfidence && !includePose && !includeSky && !includePoints {
+			includeDepth = true
+			includeConfidence = true
+			includePose = true
+			includeSky = true
+		}
+
+		req := &proto.DepthRequest{
+			Src:               image,
+			Dst:               input.Dst,
+			IncludeDepth:      includeDepth,
+			IncludeConfidence: includeConfidence,
+			IncludePose:       includePose,
+			IncludeSky:        includeSky,
+			IncludePoints:     includePoints,
+			PointsConfThresh:  input.PointsConfThresh,
+			Exports:           input.Exports,
+		}
+
+		res, err := backend.Depth(c.Request().Context(), req, ml, appConfig, *cfg)
+		if err != nil {
+			return mapBackendError(err)
+		}
+
+		response := schema.DepthResponse{
+			Width:       res.GetWidth(),
+			Height:      res.GetHeight(),
+			Depth:       res.GetDepth(),
+			Confidence:  res.GetConfidence(),
+			Sky:         res.GetSky(),
+			Extrinsics:  res.GetExtrinsics(),
+			Intrinsics:  res.GetIntrinsics(),
+			NumPoints:   res.GetNumPoints(),
+			Points:      res.GetPoints(),
+			ExportPaths: res.GetExportPaths(),
+			IsMetric:    res.GetIsMetric(),
+		}
+		if len(res.GetPointColors()) > 0 {
+			response.PointColors = base64.StdEncoding.EncodeToString(res.GetPointColors())
+		}
+
+		return c.JSON(200, response)
+	}
+}
diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go
index 96baceaf8..a66801556 100644
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -98,6 +98,12 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 		requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_DETECTION)),
 		requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.DetectionRequest) }))
 
+	depthHandler := localai.DepthEndpoint(cl, ml, appConfig)
+	router.POST("/v1/depth",
+		depthHandler,
+		requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_DEPTH)),
+		requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.DepthRequest) }))
+
 	// Face recognition endpoints
 	faceMw := []echo.MiddlewareFunc{
 		requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_FACE_RECOGNITION)),
diff --git a/core/schema/localai.go b/core/schema/localai.go
index c7e1292fa..41b513ce9 100644
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -181,6 +181,40 @@ type Detection struct {
 	Mask string `json:"mask,omitempty"` // base64-encoded PNG segmentation mask
 }
 
+// DepthRequest is the request body for the /v1/depth endpoint. It exposes the
+// full Depth Anything 3 output surface; the include_* flags and exports let a
+// caller ask for less work (e.g. depth only, or depth+pose without the point
+// cloud). 
+type DepthRequest struct { + BasicModelRequest + Image string `json:"image"` // URL or base64-encoded image to analyze + Dst string `json:"dst,omitempty"` // optional output directory for exports (glb/colmap) + IncludeDepth bool `json:"include_depth,omitempty"` // return the per-pixel depth map + IncludeConfidence bool `json:"include_confidence,omitempty"` // return the per-pixel confidence map (DualDPT) + IncludePose bool `json:"include_pose,omitempty"` // return camera extrinsics/intrinsics (DualDPT) + IncludeSky bool `json:"include_sky,omitempty"` // return the per-pixel sky map (mono models) + IncludePoints bool `json:"include_points,omitempty"` // back-project to a 3D point cloud (DualDPT) + PointsConfThresh float32 `json:"points_conf_thresh,omitempty"` // keep points with confidence >= this threshold + Exports []string `json:"exports,omitempty"` // requested exports: "glb", "colmap" +} + +// DepthResponse is the JSON response for the /v1/depth endpoint, mirroring the +// DepthResponse proto. 
+type DepthResponse struct { + Width int32 `json:"width"` + Height int32 `json:"height"` + Depth []float32 `json:"depth,omitempty"` // width*height row-major metric depth + Confidence []float32 `json:"confidence,omitempty"` // width*height row-major confidence (DualDPT) + Sky []float32 `json:"sky,omitempty"` // width*height row-major sky map (mono) + Extrinsics []float32 `json:"extrinsics,omitempty"` // 12 floats, 3x4 row-major (world-to-camera) + Intrinsics []float32 `json:"intrinsics,omitempty"` // 9 floats, 3x3 row-major + NumPoints int32 `json:"num_points,omitempty"` // number of 3D points + Points []float32 `json:"points,omitempty"` // num_points*3 xyz, world space + PointColors string `json:"point_colors,omitempty"` // base64-encoded num_points*3 uint8 rgb + ExportPaths []string `json:"export_paths,omitempty"` // paths written for the requested exports + IsMetric bool `json:"is_metric"` // depth is in metric units +} + // ─── Face recognition ────────────────────────────────────────────── // // FacialArea describes a bounding box for a detected face. 
diff --git a/core/services/nodes/health_mock_test.go b/core/services/nodes/health_mock_test.go index fd8ec892d..f14dd133d 100644 --- a/core/services/nodes/health_mock_test.go +++ b/core/services/nodes/health_mock_test.go @@ -169,6 +169,9 @@ func (c *fakeBackendClient) SoundGeneration(_ context.Context, _ *pb.SoundGenera func (c *fakeBackendClient) Detect(_ context.Context, _ *pb.DetectOptions, _ ...ggrpc.CallOption) (*pb.DetectResponse, error) { return nil, nil } +func (c *fakeBackendClient) Depth(_ context.Context, _ *pb.DepthRequest, _ ...ggrpc.CallOption) (*pb.DepthResponse, error) { + return nil, nil +} func (c *fakeBackendClient) FaceVerify(_ context.Context, _ *pb.FaceVerifyRequest, _ ...ggrpc.CallOption) (*pb.FaceVerifyResponse, error) { return nil, nil } diff --git a/core/services/nodes/inflight.go b/core/services/nodes/inflight.go index 02b1fff60..85b10f71c 100644 --- a/core/services/nodes/inflight.go +++ b/core/services/nodes/inflight.go @@ -152,6 +152,12 @@ func (c *InFlightTrackingClient) Detect(ctx context.Context, in *pb.DetectOption return res, c.reconcile(err) } +func (c *InFlightTrackingClient) Depth(ctx context.Context, in *pb.DepthRequest, opts ...ggrpc.CallOption) (*pb.DepthResponse, error) { + defer c.track(ctx)() + res, err := c.Backend.Depth(ctx, in, opts...) + return res, c.reconcile(err) +} + func (c *InFlightTrackingClient) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...ggrpc.CallOption) (*pb.RerankResult, error) { defer c.track(ctx)() res, err := c.Backend.Rerank(ctx, in, opts...) 
diff --git a/core/services/nodes/inflight_test.go b/core/services/nodes/inflight_test.go index be18cb00c..85de0ac8e 100644 --- a/core/services/nodes/inflight_test.go +++ b/core/services/nodes/inflight_test.go @@ -100,6 +100,10 @@ func (f *fakeGRPCBackend) Detect(_ context.Context, _ *pb.DetectOptions, _ ...gg return &pb.DetectResponse{}, nil } +func (f *fakeGRPCBackend) Depth(_ context.Context, _ *pb.DepthRequest, _ ...ggrpc.CallOption) (*pb.DepthResponse, error) { + return &pb.DepthResponse{}, nil +} + func (f *fakeGRPCBackend) FaceVerify(_ context.Context, _ *pb.FaceVerifyRequest, _ ...ggrpc.CallOption) (*pb.FaceVerifyResponse, error) { return &pb.FaceVerifyResponse{}, nil } diff --git a/core/trace/backend_trace.go b/core/trace/backend_trace.go index e326db58b..2943dd7b6 100644 --- a/core/trace/backend_trace.go +++ b/core/trace/backend_trace.go @@ -25,6 +25,7 @@ const ( BackendTraceRerank BackendTraceType = "rerank" BackendTraceTokenize BackendTraceType = "tokenize" BackendTraceDetection BackendTraceType = "detection" + BackendTraceDepth BackendTraceType = "depth" BackendTraceFaceVerify BackendTraceType = "face_verify" BackendTraceFaceAnalyze BackendTraceType = "face_analyze" BackendTraceVoiceVerify BackendTraceType = "voice_verify" diff --git a/pkg/grpc/backend.go b/pkg/grpc/backend.go index ead95d195..44912c04b 100644 --- a/pkg/grpc/backend.go +++ b/pkg/grpc/backend.go @@ -54,6 +54,7 @@ type Backend interface { TTSStream(ctx context.Context, in *pb.TTSRequest, f func(reply *pb.Reply), opts ...grpc.CallOption) error SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...grpc.CallOption) (*pb.Result, error) Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error) + Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error) FaceAnalyze(ctx 
context.Context, in *pb.FaceAnalyzeRequest, opts ...grpc.CallOption) (*pb.FaceAnalyzeResponse, error) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...grpc.CallOption) (*pb.VoiceVerifyResponse, error) diff --git a/pkg/grpc/base/base.go b/pkg/grpc/base/base.go index 24417e4c2..c67c832a7 100644 --- a/pkg/grpc/base/base.go +++ b/pkg/grpc/base/base.go @@ -82,6 +82,10 @@ func (llm *Base) Detect(*pb.DetectOptions) (pb.DetectResponse, error) { return pb.DetectResponse{}, fmt.Errorf("unimplemented") } +func (llm *Base) Depth(*pb.DepthRequest) (pb.DepthResponse, error) { + return pb.DepthResponse{}, fmt.Errorf("unimplemented") +} + func (llm *Base) FaceVerify(*pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) { return pb.FaceVerifyResponse{}, fmt.Errorf("unimplemented") } diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go index b6a148186..b1e503780 100644 --- a/pkg/grpc/client.go +++ b/pkg/grpc/client.go @@ -634,6 +634,24 @@ func (c *Client) Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc. return client.Detect(ctx, in, opts...) } +func (c *Client) Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error) { + if !c.parallel { + c.opMutex.Lock() + defer c.opMutex.Unlock() + } + c.setBusy(true) + defer c.setBusy(false) + c.wdMark() + defer c.wdUnMark() + conn, err := c.dial() + if err != nil { + return nil, err + } + defer conn.Close() + client := pb.NewBackendClient(conn) + return client.Depth(ctx, in, opts...) +} + func (c *Client) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error) { if !c.parallel { c.opMutex.Lock() diff --git a/pkg/grpc/embed.go b/pkg/grpc/embed.go index b9f08ddb4..c7c6406ca 100644 --- a/pkg/grpc/embed.go +++ b/pkg/grpc/embed.go @@ -73,6 +73,10 @@ func (e *embedBackend) Detect(ctx context.Context, in *pb.DetectOptions, opts .. 
return e.s.Detect(ctx, in) } +func (e *embedBackend) Depth(ctx context.Context, in *pb.DepthRequest, opts ...grpc.CallOption) (*pb.DepthResponse, error) { + return e.s.Depth(ctx, in) +} + func (e *embedBackend) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...grpc.CallOption) (*pb.FaceVerifyResponse, error) { return e.s.FaceVerify(ctx, in) } diff --git a/pkg/grpc/interface.go b/pkg/grpc/interface.go index 31b9ab26d..888e36a0c 100644 --- a/pkg/grpc/interface.go +++ b/pkg/grpc/interface.go @@ -19,6 +19,7 @@ type AIModel interface { GenerateImage(*pb.GenerateImageRequest) error GenerateVideo(*pb.GenerateVideoRequest) error Detect(*pb.DetectOptions) (pb.DetectResponse, error) + Depth(*pb.DepthRequest) (pb.DepthResponse, error) FaceVerify(*pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) FaceAnalyze(*pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) VoiceVerify(*pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go index 5be668497..6ddb521ba 100644 --- a/pkg/grpc/server.go +++ b/pkg/grpc/server.go @@ -156,6 +156,18 @@ func (s *server) Detect(ctx context.Context, in *pb.DetectOptions) (*pb.DetectRe return &res, nil } +func (s *server) Depth(ctx context.Context, in *pb.DepthRequest) (*pb.DepthResponse, error) { + if s.llm.Locking() { + s.llm.Lock() + defer s.llm.Unlock() + } + res, err := s.llm.Depth(in) + if err != nil { + return nil, err + } + return &res, nil +} + func (s *server) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest) (*pb.FaceVerifyResponse, error) { if s.llm.Locking() { s.llm.Lock() diff --git a/pkg/model/connection_evicting_client.go b/pkg/model/connection_evicting_client.go index b101e8f82..0053df3dc 100644 --- a/pkg/model/connection_evicting_client.go +++ b/pkg/model/connection_evicting_client.go @@ -108,6 +108,12 @@ func (c *ConnectionEvictingClient) Detect(ctx context.Context, in *pb.DetectOpti return result, err } +func (c *ConnectionEvictingClient) Depth(ctx 
context.Context, in *pb.DepthRequest, opts ...ggrpc.CallOption) (*pb.DepthResponse, error) { + result, err := c.Backend.Depth(ctx, in, opts...) + c.checkErr(err) + return result, err +} + func (c *ConnectionEvictingClient) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...ggrpc.CallOption) (*pb.RerankResult, error) { result, err := c.Backend.Rerank(ctx, in, opts...) c.checkErr(err)