feat(depth): metric-large + nested metric model gallery entries (#10363)

* feat(depth): add depth-anything-3-metric-large gallery entry

DA3METRIC-LARGE (ViT-L) single-file metric-scale depth + sky, served by the
existing depth-anything backend (same single-GGUF path as mono-large). GGUF
published at mudler/depth-anything.cpp-gguf.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(depth): serve nested metric model (two-file load)

The DA3 nested model needs both branches (anyview GIANT + metric ViT-L) loaded
together. Wire it through the backend:
- Load reads a 'metric_model:<file>' entry from ModelOptions.Options and, when
  present, calls da_capi_load_nested(anyview, metric) instead of da_capi_load
  (registers the new abi-4 symbol; helper optionValue + unit test).
- gallery: depth-anything-3-nested (model=anyview, options=metric branch, both
  GGUFs fetched) for metric-scale depth + pose.
- bump depth-anything.cpp pin to cce5edc (abi 4 / da_capi_load_nested).

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
LocalAI [bot]
2026-06-16 22:03:58 +02:00
committed by GitHub
parent a6e1c6d0b3
commit 4c6750fe6b
6 changed files with 193 additions and 14 deletions

View File

@@ -8,9 +8,11 @@ JOBS?=$(shell nproc --ignore=1)
# depth-anything.cpp. Pin to a specific commit for a stable build; a squash
# merge upstream can orphan a branch, so the native version is pinned by SHA.
# The previously pinned SHA was kept alive by the v0.1.2 tag on the upstream repo.
# The current SHA adds the nested two-file metric C-API (abi_version 4,
# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
# it still needs its own upstream tag (e.g. v0.1.3) to keep the SHA alive.
DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
DEPTHANYTHING_VERSION?=442eea4f73e83ca9d9bc8e026b966cffa678ffc4
DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF

View File

@@ -24,6 +24,7 @@ import (
"math"
"os"
"path/filepath"
"strings"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
@@ -36,6 +37,10 @@ import (
var (
// da_capi_load(const char* gguf_path, int n_threads) -> da_ctx* (0 = fail)
CapiLoad func(gguf string, nThreads int32) uintptr
// da_capi_load_nested(const char* anyview_gguf, const char* metric_gguf,
// int n_threads) -> da_ctx* (0 = fail). The returned ctx serves the nested
// metric model: depth/pose calls produce final metric-scale depth + scaled pose.
CapiLoadNested func(anyview string, metric string, nThreads int32) uintptr
// da_capi_free(da_ctx* ctx) — safe on a 0 handle.
CapiFree func(handle uintptr)
// da_capi_last_error(da_ctx* ctx) -> const char* (owned by ctx, "" if none).
@@ -87,17 +92,24 @@ func (r *DepthAnythingCpp) Load(opts *pb.ModelOptions) error {
return fmt.Errorf("depth-anything-cpp: ModelFile is empty")
}
var modelPath string
if filepath.IsAbs(modelFile) {
modelPath = modelFile
} else {
modelPath = filepath.Join(opts.ModelPath, modelFile)
resolve := func(name string) string {
if filepath.IsAbs(name) {
return name
}
return filepath.Join(opts.ModelPath, name)
}
modelPath := resolve(modelFile)
if _, err := os.Stat(modelPath); err != nil {
return fmt.Errorf("depth-anything-cpp: model file not found: %s: %w", modelPath, err)
}
// Nested metric models are a two-file pair: the main model is the anyview
// (GIANT) branch and the metric (ViT-L + DPT/sky) branch is named via a
// "metric_model:<filename>" entry in opts.Options. When present we load both
// branches so the engine runs the nested metric alignment.
metricFile := optionValue(opts.Options, "metric_model")
threads := opts.Threads
if threads <= 0 {
threads = 4
@@ -109,19 +121,47 @@ func (r *DepthAnythingCpp) Load(opts *pb.ModelOptions) error {
r.handle = 0
}
h := CapiLoad(modelPath, threads)
if h == 0 {
// da_capi_last_error needs a ctx; on a failed load we have none (it
// returns "" for a null ctx), so the text is best-effort.
if msg := CapiLastError(0); msg != "" {
return fmt.Errorf("depth-anything-cpp: da_capi_load failed for %s: %s", modelPath, msg)
var h uintptr
if metricFile != "" {
metricPath := resolve(metricFile)
if _, err := os.Stat(metricPath); err != nil {
return fmt.Errorf("depth-anything-cpp: metric_model file not found: %s: %w", metricPath, err)
}
h = CapiLoadNested(modelPath, metricPath, threads)
if h == 0 {
if msg := CapiLastError(0); msg != "" {
return fmt.Errorf("depth-anything-cpp: da_capi_load_nested failed for %s + %s: %s", modelPath, metricPath, msg)
}
return fmt.Errorf("depth-anything-cpp: da_capi_load_nested failed for %s + %s", modelPath, metricPath)
}
} else {
h = CapiLoad(modelPath, threads)
if h == 0 {
// da_capi_last_error needs a ctx; on a failed load we have none (it
// returns "" for a null ctx), so the text is best-effort.
if msg := CapiLastError(0); msg != "" {
return fmt.Errorf("depth-anything-cpp: da_capi_load failed for %s: %s", modelPath, msg)
}
return fmt.Errorf("depth-anything-cpp: da_capi_load failed for %s", modelPath)
}
return fmt.Errorf("depth-anything-cpp: da_capi_load failed for %s", modelPath)
}
r.handle = h
return nil
}
// optionValue scans opts for entries shaped like "key:value" and returns the
// whitespace-trimmed value of the first entry whose key equals key exactly
// (the match is case-sensitive). It returns "" when no entry matches, so an
// entry with an empty value cannot be told apart from a missing one. This
// mirrors how other LocalAI backends read ModelOptions.Options.
func optionValue(opts []string, key string) string {
	want := key + ":"
	for i := range opts {
		entry := opts[i]
		if !strings.HasPrefix(entry, want) {
			continue
		}
		// Everything after the leading "key:" is the value, so the value
		// itself may contain further colons.
		return strings.TrimSpace(strings.TrimPrefix(entry, want))
	}
	return ""
}
// depthResult is the JSON payload returned by Predict.
type depthResult struct {
DepthW int `json:"depth_w"`
@@ -373,6 +413,10 @@ func copyBytes(p *byte, n int) []byte {
// runDepthPose runs depth estimation then pose recovery on an image file. It
// returns the row-major depth map (length h*w), its dimensions, the 3x4
// extrinsics (12 floats) and 3x3 intrinsics (9 floats).
// Depth and pose come from two separate C-API calls (depth first, then pose).
// For a nested metric model both calls run the full two-branch pipeline, so this
// path infers twice; the typed Depth RPC (single da_capi_depth_dense call) is the
// efficient path for nested models.
func (r *DepthAnythingCpp) runDepthPose(imagePath string) (depth []float32, h, w int, ext [12]float32, intr [9]float32, err error) {
if r.handle == 0 {
err = fmt.Errorf("depth-anything-cpp: model not loaded")

View File

@@ -37,6 +37,7 @@ func main() {
libFuncs := []LibFuncs{
{&CapiLoad, "da_capi_load"},
{&CapiLoadNested, "da_capi_load_nested"},
{&CapiFree, "da_capi_free"},
{&CapiLastError, "da_capi_last_error"},
{&CapiDepthPath, "da_capi_depth_path"},

View File

@@ -0,0 +1,64 @@
package main
// nested_e2e_test.go - e2e smoke for the nested two-file metric model. Loads the
// anyview branch as the main model and points the metric branch via the
// "metric_model:<file>" option (exactly as the depth-anything-3-nested gallery
// entry does), then exercises the typed Depth RPC and asserts a metric depth map.
//
// Skips cleanly unless both nested GGUFs are present under ./test-models/ and the
// backend binary + fallback .so are built.
import (
"context"
"fmt"
"path/filepath"
"time"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// End-to-end smoke spec for the nested two-file metric model. It starts the
// backend, loads the anyview GGUF as the main model with the metric branch
// named through the "metric_model:<file>" option, calls the typed Depth RPC and
// checks that a dense, metric depth map plus camera pose comes back.
var _ = Describe("depth-anything-cpp nested metric model", func() {
	It("loads the two-file pair via the metric_model option and returns metric depth", func() {
		anyviewPath := modelPathOrSkip("depth-anything-nested-anyview.gguf")
		// Only the presence of the metric branch is checked here. The option
		// below passes its bare filename, which relies on the file living next
		// to the anyview GGUF (ModelPath is the anyview file's directory).
		_ = modelPathOrSkip("depth-anything-nested-metric.gguf")
		imgB64 := loadTestImage()

		port := freePort()
		cleanup := startBackend(port)
		defer cleanup()
		client, closeConn := dialBackend(port)
		defer closeConn()

		// NOTE(review): the generous deadline is presumably sized for loading
		// and running both branches on CPU; confirm before tightening it.
		ctx, cancel := context.WithTimeout(context.Background(), 25*time.Minute)
		defer cancel()

		loadResp, err := client.LoadModel(ctx, &pb.ModelOptions{
			Model:     "depth-anything-nested-anyview.gguf",
			ModelFile: anyviewPath,
			ModelPath: filepath.Dir(anyviewPath),
			Options:   []string{"metric_model:depth-anything-nested-metric.gguf"},
			Threads:   8,
		})
		Expect(err).ToNot(HaveOccurred(), "LoadModel(nested)")
		Expect(loadResp.GetSuccess()).To(BeTrue(), "LoadModel reported failure: %s", loadResp.GetMessage())

		resp, err := client.Depth(ctx, &pb.DepthRequest{
			Src:          imgB64,
			IncludeDepth: true,
			IncludePose:  true,
		})
		Expect(err).ToNot(HaveOccurred(), "Depth(nested)")
		Expect(resp.GetWidth()).To(BeNumerically(">", 0), "depth width")
		Expect(resp.GetHeight()).To(BeNumerically(">", 0), "depth height")
		Expect(resp.GetIsMetric()).To(BeTrue(), "nested output must be metric")
		Expect(resp.GetDepth()).To(HaveLen(int(resp.GetWidth())*int(resp.GetHeight())), "dense depth length")
		Expect(resp.GetExtrinsics()).To(HaveLen(12), "extrinsics 3x4")
		// Guard the index below: without it an empty intrinsics slice would
		// panic with index-out-of-range instead of failing with a clear message.
		Expect(resp.GetIntrinsics()).ToNot(BeEmpty(), "intrinsics must be populated")
		Expect(resp.GetIntrinsics()[0]).To(BeNumerically(">", 0), "fx > 0")

		_, _ = fmt.Fprintf(GinkgoWriter, "nested depth OK: %dx%d is_metric=%v fx=%.2f\n",
			resp.GetWidth(), resp.GetHeight(), resp.GetIsMetric(), resp.GetIntrinsics()[0])
	})
})

View File

@@ -0,0 +1,20 @@
package main
import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Table-driven spec for optionValue. Each entry supplies the option list, the
// key to look up and the value the helper is expected to return.
var _ = DescribeTable("optionValue",
	func(options []string, name, expected string) {
		got := optionValue(options, name)
		Expect(got).To(Equal(expected))
	},
	// Lookup hits and misses.
	Entry("present", []string{"foo:bar", "metric_model:m.gguf"}, "metric_model", "m.gguf"),
	Entry("absent", []string{"foo:bar"}, "metric_model", ""),
	Entry("nil", []string(nil), "metric_model", ""),
	// Value handling: trimming, embedded colons, duplicates, empty values.
	Entry("trims space", []string{"metric_model: m.gguf "}, "metric_model", "m.gguf"),
	Entry("value with colon", []string{"metric_model:a:b.gguf"}, "metric_model", "a:b.gguf"),
	Entry("first wins", []string{"metric_model:first.gguf", "metric_model:second.gguf"}, "metric_model", "first.gguf"),
	Entry("empty value", []string{"metric_model:"}, "metric_model", ""),
	// A longer key that merely starts with the requested key must not match.
	Entry("prefix not key", []string{"metric_model_extra:x"}, "metric_model", ""),
)

View File

@@ -8162,6 +8162,54 @@
- filename: depth-anything-mono-large-f32.gguf
uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything-mono-large-f32.gguf
sha256: "291b1a554af907c3f79986ee225da8933be5f7a31d73c81d06784cda284535de"
# Single-file metric model: it is loaded through the plain single-GGUF path of
# the depth-anything backend, so no metric_model option is set for this entry.
- !!merge <<: *depth-anything-3-base
name: depth-anything-3-metric-large
description: |
Depth Anything 3 (metric large / vitl), f32 (~1.3 GB) — single-image
metric-scale depth (meters) + a sky mask. DPT single-head metric variant; use
GenerateImage (src -> normalized depth PNG) or Predict (JSON metric depth
stats, is_metric=true).
overrides:
backend: depth-anything
parameters:
model: depth-anything-metric-large-f32.gguf
files:
- filename: depth-anything-metric-large-f32.gguf
uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything-metric-large-f32.gguf
sha256: "d10b7450c2238244b2d72e2749537a1876255180149cd630a18bc1619c9286be"
- !!merge <<: *depth-anything-3-base
name: depth-anything-3-nested
description: |
Depth Anything 3 (nested giant+large), f32 — the recommended metric model. A
two-branch pipeline: the anyview GIANT (vitg) branch and a metric ViT-L branch
are run and aligned to recover true metric-scale depth (meters) + scaled camera
pose from a single image. Downloads both branches (~6 GB total); GPU strongly
recommended. Predict returns metric depth stats + pose (is_metric=true).
tags:
- depth-estimation
- camera-pose
- metric-depth
- depth-anything
- native
- cpp
- gpu
overrides:
backend: depth-anything
# The metric (ViT-L) branch is loaded alongside the anyview model via the
# metric_model option; both files are fetched below. The backend resolves a
# relative metric_model filename against the model directory, so the value
# has to name the metric GGUF listed under files.
options:
- "metric_model:depth-anything-nested-metric.gguf"
parameters:
model: depth-anything-nested-anyview.gguf
files:
- filename: depth-anything-nested-anyview.gguf
uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything-nested-anyview.gguf
sha256: "2a4cb4382aa8c4159fff10dfffa121f3c7a574551c4ff4ad130f235d5442f9ce"
- filename: depth-anything-nested-metric.gguf
uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything-nested-metric.gguf
sha256: "b54ed50cbc0b0c14fae1f8edd0fea8bd1cac0850485fd6e7eb2422c7a19e570e"
- name: rfdetr-cpp-base
url: github:mudler/LocalAI/gallery/virtual.yaml@master
urls: