Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
2aedd2cf44 fix(backends): enable ROCm/HIP GPU offload for ggml audio backends (#10666)
qwen3-tts-cpp, omnivoice-cpp, acestep-cpp and vibevoice-cpp shipped
rocm-* variants that silently ran on CPU ([Load] backend: CPU). Two
coupled defects:

- The Makefiles passed -DGGML_HIPBLAS=ON, but the vendored ggml only
  understands -DGGML_HIP=ON (GGML_HIPBLAS was removed upstream), so the
  ggml-hip backend target was never created and no GPU code was built.
- The CMake foreach that links the ggml GPU backends into the module
  listed blas/cuda/metal/vulkan but not hip, so even a built ggml-hip
  would not have been linked and its static backend registration would
  never run.

CUDA users were unaffected because cublas passes the correct GGML_CUDA=ON
and the foreach already links cuda. Mirror the proven llama-cpp hipblas
block (ROCm clang CC/CXX + AMDGPU_TARGETS) and add hip to each foreach.
Upstream picks the best device via ggml_backend_init_best(), so no
runtime flag is needed once HIP is compiled and linked.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8[1m] [Claude Code]
2026-07-03 20:28:11 +00:00
17 changed files with 224 additions and 133 deletions

View File

@@ -1,5 +1,5 @@
IK_LLAMA_VERSION?=bbc7de475178dd0535c16ad85f204a2529806c9d
IK_LLAMA_VERSION?=87fc8701ff4da81a7d2a91ec0695f95eb3066a47
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
CMAKE_ARGS?=

View File

@@ -25,7 +25,7 @@ target_include_directories(goacestepcpp PRIVATE ${ACESTEP_DIR}/src ${ACESTEP_DIR
target_include_directories(goacestepcpp SYSTEM PRIVATE ${ACESTEP_DIR}/ggml/include)
# Link GPU backends if available (mirrors link_ggml_backends macro)
foreach(backend blas cuda metal vulkan)
foreach(backend blas cuda hip metal vulkan)
if(TARGET ggml-${backend})
target_link_libraries(goacestepcpp PRIVATE ggml-${backend})
string(TOUPPER ${backend} BACKEND_UPPER)

View File

@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
ROCM_HOME ?= /opt/rocm
ROCM_PATH ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=ON
else ifeq ($(OS),Darwin)

View File

@@ -30,7 +30,7 @@ target_include_directories(gomnivoicecpp PRIVATE ${OMNIVOICE_DIR}/src)
target_include_directories(gomnivoicecpp SYSTEM PRIVATE ${OMNIVOICE_DIR}/ggml/include)
# Link GPU backends if the upstream ggml created them.
foreach(backend blas cuda metal vulkan sycl)
foreach(backend blas cuda hip metal vulkan sycl)
if(TARGET ggml-${backend})
target_link_libraries(gomnivoicecpp PRIVATE ggml-${backend})
if(backend STREQUAL "cuda")

View File

@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
ROCM_HOME ?= /opt/rocm
ROCM_PATH ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=ON
else ifeq ($(OS),Darwin)

View File

@@ -30,7 +30,7 @@ target_include_directories(goqwen3ttscpp PRIVATE ${QWENTTS_DIR}/src)
target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWENTTS_DIR}/ggml/include)
# Link GPU backends if the upstream ggml created them.
foreach(backend blas cuda metal vulkan sycl)
foreach(backend blas cuda hip metal vulkan sycl)
if(TARGET ggml-${backend})
target_link_libraries(goqwen3ttscpp PRIVATE ggml-${backend})
if(backend STREQUAL "cuda")

View File

@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
ROCM_HOME ?= /opt/rocm
ROCM_PATH ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=ON
else ifeq ($(OS),Darwin)

View File

@@ -50,7 +50,7 @@ target_include_directories(govibevoicecpp SYSTEM PRIVATE ${VIBEVOICE_DIR}/third_
# Link GPU backends if available — vibevoice's own CMake already links
# these to the libvibevoice STATIC library, but we re-link them on the
# MODULE so resolved symbols include all backend kernels.
foreach(backend blas cuda metal vulkan)
foreach(backend blas cuda hip metal vulkan)
if(TARGET ggml-${backend})
target_link_libraries(govibevoicecpp PRIVATE ggml-${backend})
string(TOUPPER ${backend} BACKEND_UPPER)

View File

@@ -29,7 +29,14 @@ else ifeq ($(BUILD_TYPE),openblas)
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DVIBEVOICE_GGML_HIPBLAS=ON
# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
ROCM_HOME ?= /opt/rocm
ROCM_PATH ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=ON -DVIBEVOICE_GGML_VULKAN=ON
else ifeq ($(OS),Darwin)

View File

@@ -67,6 +67,16 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
ApplyMTPDefaults(cfg, n)
}
// Sliding-window-attention models (Gemma 2/3, Cohere2, Llama 4, ...) ship
// with a reduced SWA KV cache by default, which cannot reuse a prompt
// prefix across requests and so defeats the cross-request prefix cache
// (cache_reuse) we enable in serving_defaults.go. Enable the full SWA cache
// for these models so the prefix survives; skipped for dense models and
// when the user already pinned an SWA cache option.
if w, ok := HasSlidingWindowAttention(f); ok {
ApplySWAFullDefault(cfg, w)
}
// Thinking support detection is done after model load via DetectThinkingSupportFromBackend
// template estimations

56
core/config/swa.go Normal file
View File

@@ -0,0 +1,56 @@
package config
import (
gguf "github.com/gpustack/gguf-parser-go"
"github.com/mudler/xlog"
)
// swaCacheOptionNames enumerates the backend option keys that govern the
// sliding-window-attention KV cache. When the user has already set any of
// them, the SWA cache configuration is left untouched rather than being
// forced to swa_full.
var swaCacheOptionNames = []string{
	"swa_full",
	"n_swa",
}
// HasSlidingWindowAttention tells the caller whether the parsed GGUF belongs
// to a sliding-window-attention (SWA) model such as Gemma 2/3, Cohere2 or
// Llama 4.
//
// The window is read from GGUFArchitecture.AttentionSlidingWindow, which the
// gguf-parser library derives from the per-architecture
// `<arch>.attention.sliding_window` metadata key using the same
// family-specific rules as llama.cpp (for instance Phi-3 carries the key but
// does not run SWA, so it is normalized to 0).
//
// Returns the window size and true when the window is non-zero, i.e. the
// model interleaves SWA layers; the size doubles as the value callers log for
// diagnostics. Returns (0, false) for a nil file or a zero window.
func HasSlidingWindowAttention(f *gguf.GGUFFile) (uint64, bool) {
	if f != nil {
		if window := f.Architecture().AttentionSlidingWindow; window != 0 {
			return window, true
		}
	}
	return 0, false
}
// ApplySWAFullDefault turns on the full-size SWA KV cache by appending
// "swa_full:true" to cfg.Options for a sliding-window model, except when the
// user has already pinned one of the SWA cache options.
//
// Rationale: llama.cpp's default is a reduced, memory-light SWA KV cache sized
// to the sliding window. That reduced cache cannot keep a prompt prefix alive
// across requests, so the cross-request prefix cache enabled in
// serving_defaults.go (cache_reuse) is silently defeated and every turn
// reprocesses the whole prompt. With swa_full:true llama.cpp keeps the full KV
// cache and the shared prefix can be reused.
//
// Cost: the full SWA cache grows with context_size. The default is therefore
// restricted to genuinely SWA models (a zero window — a dense model — is a
// no-op) and it never overrides an explicit user choice.
//
// Parameters:
//   - cfg: the model configuration to amend; nil is tolerated and ignored.
//   - slidingWindow: the window size read from the GGUF; besides gating on
//     zero it is only used in the diagnostic log lines.
func ApplySWAFullDefault(cfg *ModelConfig, slidingWindow uint64) {
	switch {
	case cfg == nil, slidingWindow == 0:
		// Nothing to configure, or not an SWA model.
		return
	case backendOptionSet(cfg.Options, swaCacheOptionNames...):
		// The user already made an SWA cache decision; respect it.
		xlog.Debug("[swa] sliding-window model but an SWA cache option is already set; leaving user choice intact",
			"name", cfg.Name, "sliding_window", slidingWindow)
		return
	}

	cfg.Options = append(cfg.Options, "swa_full:true")
	xlog.Debug("[swa] enabling swa_full for sliding-window model so the cross-request prompt-prefix cache survives (reduced SWA cache cannot reuse a prefix across requests)",
		"name", cfg.Name, "sliding_window", slidingWindow)
}

120
core/config/swa_test.go Normal file
View File

@@ -0,0 +1,120 @@
package config_test
import (
. "github.com/mudler/LocalAI/core/config"
gguf "github.com/gpustack/gguf-parser-go"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// ggufWithSlidingWindow builds a minimal in-memory GGUF whose metadata holds
// `general.architecture` set to arch and, when window is non-zero, an
// `<arch>.attention.sliding_window` entry set to window. It lets the SWA
// detection be tested without a real model file; passing a window of 0 leaves
// the key out, which models a dense (non-SWA) model.
func ggufWithSlidingWindow(arch string, window uint32) *gguf.GGUFFile {
	metadata := gguf.GGUFMetadataKVs{{
		Key:       "general.architecture",
		ValueType: gguf.GGUFMetadataValueTypeString,
		Value:     arch,
	}}

	if window != 0 {
		slidingWindowKV := gguf.GGUFMetadataKV{
			Key:       arch + ".attention.sliding_window",
			ValueType: gguf.GGUFMetadataValueTypeUint32,
			Value:     window,
		}
		metadata = append(metadata, slidingWindowKV)
	}

	header := gguf.GGUFHeader{MetadataKV: metadata}
	return &gguf.GGUFFile{Header: header}
}
// Specs for the SWA full-cache auto-default. HasSlidingWindowAttention decides
// whether a GGUF describes a sliding-window model, and ApplySWAFullDefault
// appends swa_full:true to the config options unless an SWA cache option is
// already present.
var _ = Describe("SWA full-cache auto-default", func() {
// Detection specs, driven by fabricated in-memory GGUF files.
Context("HasSlidingWindowAttention", func() {
It("returns false on a nil GGUF file", func() {
w, ok := HasSlidingWindowAttention(nil)
Expect(ok).To(BeFalse())
Expect(w).To(BeZero())
})
It("detects a sliding-window model (Gemma 3 style)", func() {
// An explicit sliding_window key is reported back as the window size.
w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma3", 1024))
Expect(ok).To(BeTrue())
Expect(w).To(Equal(uint64(1024)))
})
It("detects Gemma 2 even without an explicit key (family default window)", func() {
// gguf-parser applies llama.cpp's family rules: gemma2 defaults the
// sliding window to 4096 when the metadata key is absent.
w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma2", 0))
Expect(ok).To(BeTrue())
Expect(w).To(Equal(uint64(4096)))
})
It("reports a dense model as non-SWA", func() {
// Window 0 makes the helper omit the sliding_window key entirely.
w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("llama", 0))
Expect(ok).To(BeFalse())
Expect(w).To(BeZero())
})
It("treats Phi-3 as non-SWA even when the key is present", func() {
// Phi-3 carries attention.sliding_window but does not actually run
// SWA; gguf-parser normalizes it to 0 to match llama.cpp.
w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("phi3", 2048))
Expect(ok).To(BeFalse())
Expect(w).To(BeZero())
})
})
// Default-application specs: these operate on ModelConfig.Options only and
// pass the window size directly, so no GGUF file is involved.
Context("ApplySWAFullDefault", func() {
It("enables swa_full for a sliding-window model when unset", func() {
cfg := &ModelConfig{Name: "gemma3"}
ApplySWAFullDefault(cfg, 1024)
Expect(cfg.Options).To(ContainElement("swa_full:true"))
})
It("is a no-op for a dense model (window 0)", func() {
cfg := &ModelConfig{Name: "llama"}
ApplySWAFullDefault(cfg, 0)
Expect(cfg.Options).To(BeEmpty())
})
It("preserves an explicit swa_full:false", func() {
// An explicit opt-out must survive untouched.
cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:false"}}
ApplySWAFullDefault(cfg, 1024)
Expect(cfg.Options).To(Equal([]string{"swa_full:false"}))
})
It("preserves an explicit swa_full:true without duplicating it", func() {
cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:true"}}
ApplySWAFullDefault(cfg, 1024)
Expect(cfg.Options).To(Equal([]string{"swa_full:true"}))
})
It("respects the n_swa alias", func() {
// n_swa also counts as a pinned SWA cache option, so nothing is added.
cfg := &ModelConfig{Name: "gemma3", Options: []string{"n_swa:512"}}
ApplySWAFullDefault(cfg, 1024)
Expect(cfg.Options).To(Equal([]string{"n_swa:512"}))
})
It("preserves unrelated options already on the config", func() {
cfg := &ModelConfig{
Name: "gemma3",
Options: []string{"use_jinja:true", "cache_reuse:256"},
}
ApplySWAFullDefault(cfg, 1024)
// swa_full:true is appended after the existing options, in order.
Expect(cfg.Options).To(Equal([]string{
"use_jinja:true",
"cache_reuse:256",
"swa_full:true",
}))
})
It("tolerates a nil config", func() {
Expect(func() { ApplySWAFullDefault(nil, 1024) }).ToNot(Panic())
})
})
})

View File

@@ -15,35 +15,14 @@ import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/downloader"
"github.com/mudler/LocalAI/pkg/system"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/mudler/LocalAI/pkg/xsync"
"github.com/mudler/xlog"
"gopkg.in/yaml.v3"
)
// validateGalleryConfigURL is the SSRF guard for gallery config fetches.
//
// The URL of a gallery config can be attacker-controlled (for example,
// POST /models/apply with an empty id fetches it directly), so a plain
// http(s) URL must not be able to reach private, loopback, link-local or
// cloud-metadata addresses; such URLs are handed to utils.ValidateExternalURL
// and its verdict is returned. Every other scheme (huggingface://, github:,
// oci://, ollama://, file://) resolves to a fixed public service or a local
// file, is not a network-SSRF vector, and is accepted as-is (nil error).
//
// The scheme check is case-insensitive and ignores surrounding whitespace,
// but the URL passed on for validation is the caller's original string.
// See https://github.com/mudler/LocalAI/issues/10665
func validateGalleryConfigURL(rawURL string) error {
	normalized := strings.ToLower(strings.TrimSpace(rawURL))
	for _, scheme := range []string{"http://", "https://"} {
		if strings.HasPrefix(normalized, scheme) {
			return utils.ValidateExternalURL(rawURL)
		}
	}
	return nil
}
func GetGalleryConfigFromURL[T any](url string, basePath string) (T, error) {
var config T
if err := validateGalleryConfigURL(url); err != nil {
xlog.Error("refusing to fetch gallery config", "error", err, "url", url)
return config, err
}
uri := downloader.URI(url)
err := uri.ReadWithCallback(basePath, func(url string, d []byte) error {
return yaml.Unmarshal(d, &config)
@@ -57,10 +36,6 @@ func GetGalleryConfigFromURL[T any](url string, basePath string) (T, error) {
func GetGalleryConfigFromURLWithContext[T any](ctx context.Context, url string, basePath string) (T, error) {
var config T
if err := validateGalleryConfigURL(url); err != nil {
xlog.Error("refusing to fetch gallery config", "error", err, "url", url)
return config, err
}
uri := downloader.URI(url)
err := uri.ReadWithAuthorizationAndCallback(ctx, basePath, "", func(url string, d []byte) error {
return yaml.Unmarshal(d, &config)

View File

@@ -1,10 +1,6 @@
package gallery_test
import (
"context"
"net/http"
"net/http/httptest"
. "github.com/mudler/LocalAI/core/gallery"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
@@ -23,49 +19,4 @@ var _ = Describe("Gallery API tests", func() {
Expect(e.Name).To(Equal("gpt4all-j"))
})
})
// SSRF guard: a user-supplied gallery config URL (e.g. POST /models/apply
// with an empty id) must not be able to reach internal network addresses.
// See https://github.com/mudler/LocalAI/issues/10665
Context("SSRF protection on config URLs", func() {
var server *httptest.Server
BeforeEach(func() {
// A reachable internal server that would happily serve a valid
// gallery config. Without the SSRF guard the fetch succeeds; the
// guard must block it before the request ever leaves the process.
server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("name: internal-ssrf\nfiles: []\n"))
}))
})
AfterEach(func() {
server.Close()
})
It("blocks fetching a config from a loopback address", func() {
_, err := GetGalleryConfigFromURL[ModelConfig](server.URL, "")
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("not allowed"))
})
It("blocks fetching a config from a loopback address (context variant)", func() {
_, err := GetGalleryConfigFromURLWithContext[ModelConfig](context.Background(), server.URL, "")
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("not allowed"))
})
It("blocks well-known internal hostnames and metadata endpoints", func() {
for _, u := range []string{
"http://localhost/secret",
"http://10.0.0.1/config.yaml",
"http://192.168.1.1/config.yaml",
"http://169.254.169.254/latest/meta-data/",
} {
_, err := GetGalleryConfigFromURL[ModelConfig](u, "")
Expect(err).To(HaveOccurred(), "expected %s to be rejected", u)
}
})
})
})

View File

@@ -1,48 +0,0 @@
package pii
import (
"context"
"sync"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
)
// Prometheus counter for PII events.
//
// The EventStore ring buffer is capacity-bound and intended for browsing
// recent audit entries. Operators additionally want a monotonic,
// scrape-friendly signal ("how many detections/blocks per hour, did the
// filter stop firing after a deploy"). Record() is the single choke point
// that every producer already passes through (request middleware, response
// scrubbing, MITM proxy connects/intercepts), so a single counter here covers
// all paths without modifying the producers.
//
// The counter is created lazily on the first Record so the package works
// regardless of when (or whether) the Prometheus-backed global MeterProvider
// gets installed — the same pattern as core/services/routing/billing.

// metricsOnce guards the one-time creation of eventsCounter.
var metricsOnce sync.Once

// eventsCounter is the lazily created event counter; it stays nil when
// creation failed.
var eventsCounter metric.Int64Counter
// recordEventMetric increments the localai_pii_events_total counter by one
// for event e, labeling the sample with the event's kind, origin, action and
// direction.
//
// The counter is created on the first call (guarded by metricsOnce). If
// creation fails the counter stays nil and this call, like every later one,
// does nothing.
func recordEventMetric(e PIIEvent) {
	metricsOnce.Do(func() {
		counter, err := otel.Meter("github.com/mudler/LocalAI").Int64Counter(
			"localai_pii_events_total",
			metric.WithDescription("PII/audit events recorded, labeled by kind, origin, action and direction"),
		)
		if err != nil {
			// Leave eventsCounter nil; metrics are silently disabled.
			return
		}
		eventsCounter = counter
	})

	if eventsCounter != nil {
		labels := metric.WithAttributes(
			attribute.String("kind", string(e.Kind)),
			attribute.String("origin", string(e.Origin)),
			attribute.String("action", string(e.Action)),
			attribute.String("direction", string(e.Direction)),
		)
		eventsCounter.Add(context.Background(), 1, labels)
	}
}

View File

@@ -58,7 +58,6 @@ type memoryEventStore struct {
}
func (s *memoryEventStore) Record(_ context.Context, e PIIEvent) error {
recordEventMetric(e)
s.mu.Lock()
defer s.mu.Unlock()
s.ring[s.cursor] = e

View File

@@ -507,7 +507,7 @@ The `llama.cpp` backend supports additional configuration options that can be sp
| `fit_params_min_ctx` or `fit_ctx` | integer | Minimum context size that can be set by fit_params. Default: `4096`. | `fit_ctx:2048` |
| `n_cache_reuse` or `cache_reuse` | integer | Minimum chunk size to attempt reusing from the cache via KV shifting. Default: `0` (disabled). | `cache_reuse:256` |
| `slot_prompt_similarity` or `sps` | float | How much the prompt of a request must match the prompt of a slot to use that slot. Default: `0.1`. Set to `0` to disable. | `sps:0.5` |
| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Default: `false`. | `swa_full:true` |
| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Upstream default is `false` (a memory-light reduced cache), but that reduced cache cannot reuse a prompt prefix across requests, which defeats `cache_reuse` for SWA models (Gemma 2/3, Cohere2, Llama 4, ...). LocalAI therefore **auto-enables `swa_full:true` for GGUF models detected as SWA** so the cross-request prefix cache works; it is left off for dense models. The tradeoff is memory: the full SWA cache scales with `context_size`. Set `swa_full:false` explicitly to opt back out (e.g. to save memory at a large context). | `swa_full:true` |
| `cont_batching` or `continuous_batching` | boolean | Enable continuous batching for handling multiple sequences. Default: `true`. | `cont_batching:true` |
| `check_tensors` | boolean | Validate tensor data for invalid values during model loading. Default: `false`. | `check_tensors:true` |
| `warmup` | boolean | Enable warmup run after model loading. Default: `true`. | `warmup:false` |