mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-17 04:56:52 -04:00
* feat(vllm): expose AsyncEngineArgs via generic engine_args YAML map
LocalAI's vLLM backend wraps a small typed subset of vLLM's
AsyncEngineArgs (quantization, tensor_parallel_size, dtype, etc.).
Anything outside that subset -- pipeline/data/expert parallelism,
speculative_config, kv_transfer_config, all2all_backend, prefix
caching, chunked prefill, etc. -- requires a new protobuf field, a
Go struct field, an options.go line, and a backend.py mapping per
feature. That cadence is the bottleneck on shipping vLLM's
production feature set.
Add a generic `engine_args:` map on the model YAML that is
JSON-serialised into a new ModelOptions.EngineArgs proto field and
applied verbatim to AsyncEngineArgs at LoadModel time. Validation
is done by the Python backend via dataclasses.fields(); unknown
keys fail with the closest valid name as a hint.
dataclasses.replace() is used so vLLM's __post_init__ re-runs and
auto-converts dict values into nested config dataclasses
(CompilationConfig, AttentionConfig, ...). speculative_config and
kv_transfer_config flow through as dicts; vLLM converts them at
engine init.
Operators can now write:
engine_args:
data_parallel_size: 8
enable_expert_parallel: true
all2all_backend: deepep_low_latency
speculative_config:
method: deepseek_mtp
num_speculative_tokens: 3
kv_cache_dtype: fp8
without further proto/Go/Python plumbing per field.
Production defaults seeded by hooks_vllm.go: enable_prefix_caching
and enable_chunked_prefill default to true unless explicitly set.
Existing typed YAML fields (gpu_memory_utilization,
tensor_parallel_size, etc.) remain for back-compat; engine_args
overrides them when both are set.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>
* chore(vllm): pin cublas13 to vLLM 0.20.0 cu130 wheel
vLLM's PyPI wheel is built against CUDA 12 (libcudart.so.12) and won't
load on a cu130 host. Switch the cublas13 build to vLLM's per-tag cu130
simple-index (https://wheels.vllm.ai/0.20.0/cu130/) and pin
vllm==0.20.0. The cu130-flavoured wheel ships libcudart.so.13 and
includes the DFlash speculative-decoding method that landed in 0.20.0.
cublas13 install gets --index-strategy=unsafe-best-match so uv consults
both the cu130 index and PyPI when resolving — PyPI also publishes
vllm==0.20.0, but with cu12 binaries that error at import time.
Verified: Qwen3.5-4B + z-lab/Qwen3.5-4B-DFlash loads and serves chat
completions on RTX 5070 Ti (sm_120, cu130).
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>
* ci(vllm): bot job to bump cublas13 vLLM wheel pin
vLLM's cu130 wheel index URL is itself version-locked
(wheels.vllm.ai/<TAG>/cu130/, no /latest/ alias upstream), so a vLLM
bump means rewriting two values atomically — the URL segment and the
version constraint. bump_deps.sh handles git-sha-in-Makefile only;
add a sibling bump_vllm_wheel.sh and a matching workflow job that
mirrors the existing matrix's PR-creation pattern.
The bumper queries /releases/latest (which excludes prereleases),
strips the leading 'v', and seds both lines unconditionally. When the
file is already on the latest tag the rewrite is a no-op and
peter-evans/create-pull-request opens no PR.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>
* docs(vllm): document engine_args and speculative decoding
The new engine_args: map plumbs arbitrary AsyncEngineArgs through to
vLLM, but the public docs only covered the basic typed fields. Add a
short subsection in the vLLM section explaining the typed/generic
split and showing a worked DFlash speculative-decoding config, with
pointers to vLLM's SpeculativeConfig reference and z-lab's drafter
collection.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>
---------
Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
268 lines
7.4 KiB
Go
268 lines
7.4 KiB
Go
package config
|
|
|
|
import (
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
var _ = Describe("Test cases for config related functions", func() {
|
|
Context("Test Read configuration functions", func() {
|
|
It("Test Validate", func() {
	// A relative backend path ("../foo-bar") must be rejected by Validate,
	// while the mixed-case known_usecases list still parses into KnownUsecases.
	tmp, err := os.CreateTemp("", "config.yaml")
	Expect(err).To(BeNil())
	defer os.Remove(tmp.Name())
	_, err = tmp.WriteString(
		`backend: "../foo-bar"
name: "foo"
parameters:
  model: "foo-bar"
known_usecases:
- chat
- COMPLETION
`)
	Expect(err).ToNot(HaveOccurred())
	configs, err := readModelConfigsFromFile(tmp.Name())
	// Check the error and slice length BEFORE indexing: indexing first
	// would panic the suite on a parse failure instead of reporting a
	// clean assertion failure.
	Expect(err).To(BeNil())
	Expect(configs).ToNot(BeEmpty())
	config := configs[0]
	Expect(config).ToNot(BeNil())
	valid, err := config.Validate()
	Expect(err).To(HaveOccurred())
	Expect(valid).To(BeFalse())
	Expect(config.KnownUsecases).ToNot(BeNil())
})
|
|
It("Test Validate", func() {
	// First half: a minimal well-formed config must validate.
	tmp, err := os.CreateTemp("", "config.yaml")
	Expect(err).To(BeNil())
	defer os.Remove(tmp.Name())
	_, err = tmp.WriteString(
		`name: bar-baz
backend: "foo-bar"
parameters:
  model: "foo-bar"`)
	Expect(err).ToNot(HaveOccurred())
	configs, err := readModelConfigsFromFile(tmp.Name())
	// Check error and length before indexing so a parse failure reports
	// cleanly instead of panicking the suite.
	Expect(err).To(BeNil())
	Expect(configs).ToNot(BeEmpty())
	config := configs[0]
	Expect(config).ToNot(BeNil())
	Expect(config.Name).To(Equal("bar-baz"))
	valid, err := config.Validate()
	Expect(err).To(BeNil())
	Expect(valid).To(BeTrue())

	// Second half: a real published model config must also validate.
	// NOTE(review): this depends on the network; consider vendoring the
	// fixture so the suite passes offline.
	// download https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml
	httpClient := http.Client{}
	resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml")
	Expect(err).To(BeNil())
	defer resp.Body.Close()
	// Without this, a 404/500 body would surface later as a confusing
	// YAML parse error rather than a clear HTTP failure.
	Expect(resp.StatusCode).To(Equal(http.StatusOK))
	tmp, err = os.CreateTemp("", "config.yaml")
	Expect(err).To(BeNil())
	defer os.Remove(tmp.Name())
	_, err = io.Copy(tmp, resp.Body)
	Expect(err).To(BeNil())
	configs, err = readModelConfigsFromFile(tmp.Name())
	Expect(err).To(BeNil())
	Expect(configs).ToNot(BeEmpty())
	config = configs[0]
	Expect(config).ToNot(BeNil())
	Expect(config.Name).To(Equal("hermes-2-pro-mistral"))
	valid, err = config.Validate()
	Expect(err).To(BeNil())
	Expect(valid).To(BeTrue())
})
|
|
})
|
|
It("Properly handles backend usecase matching", func() {

	// A config with no backend still "exists", which is all FLAG_ANY
	// asks for.
	bareCfg := ModelConfig{
		Name: "a",
	}
	Expect(bareCfg.HasUsecases(FLAG_ANY)).To(BeTrue()) // FLAG_ANY just means the config _exists_ essentially.

	// An image backend advertises image generation but not chat.
	imageCfg := ModelConfig{
		Name:    "b",
		Backend: "stablediffusion",
	}
	Expect(imageCfg.HasUsecases(FLAG_ANY)).To(BeTrue())
	Expect(imageCfg.HasUsecases(FLAG_IMAGE)).To(BeTrue())
	Expect(imageCfg.HasUsecases(FLAG_CHAT)).To(BeFalse())

	// llama-cpp with only a chat template: chat, but not completion.
	chatOnlyCfg := ModelConfig{
		Name:    "c",
		Backend: "llama-cpp",
		TemplateConfig: TemplateConfig{
			Chat: "chat",
		},
	}
	Expect(chatOnlyCfg.HasUsecases(FLAG_ANY)).To(BeTrue())
	Expect(chatOnlyCfg.HasUsecases(FLAG_IMAGE)).To(BeFalse())
	Expect(chatOnlyCfg.HasUsecases(FLAG_COMPLETION)).To(BeFalse())
	Expect(chatOnlyCfg.HasUsecases(FLAG_CHAT)).To(BeTrue())

	// With both templates present, both text usecases are available.
	chatCompletionCfg := ModelConfig{
		Name:    "d",
		Backend: "llama-cpp",
		TemplateConfig: TemplateConfig{
			Chat:       "chat",
			Completion: "completion",
		},
	}
	Expect(chatCompletionCfg.HasUsecases(FLAG_ANY)).To(BeTrue())
	Expect(chatCompletionCfg.HasUsecases(FLAG_IMAGE)).To(BeFalse())
	Expect(chatCompletionCfg.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
	Expect(chatCompletionCfg.HasUsecases(FLAG_CHAT)).To(BeTrue())

	// The Embeddings pointer flag enables FLAG_EMBEDDINGS; no chat
	// template means no chat usecase.
	embeddingsEnabled := true
	embeddingsCfg := ModelConfig{
		Name:    "e",
		Backend: "llama-cpp",
		TemplateConfig: TemplateConfig{
			Completion: "completion",
		},
		Embeddings: &embeddingsEnabled,
	}

	Expect(embeddingsCfg.HasUsecases(FLAG_ANY)).To(BeTrue())
	Expect(embeddingsCfg.HasUsecases(FLAG_IMAGE)).To(BeFalse())
	Expect(embeddingsCfg.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
	Expect(embeddingsCfg.HasUsecases(FLAG_CHAT)).To(BeFalse())
	Expect(embeddingsCfg.HasUsecases(FLAG_EMBEDDINGS)).To(BeTrue())

	// piper is a text-to-speech backend.
	ttsCfg := ModelConfig{
		Name:    "f",
		Backend: "piper",
	}
	Expect(ttsCfg.HasUsecases(FLAG_ANY)).To(BeTrue())
	Expect(ttsCfg.HasUsecases(FLAG_TTS)).To(BeTrue())
	Expect(ttsCfg.HasUsecases(FLAG_CHAT)).To(BeFalse())

	// whisper transcribes audio but does not synthesise speech.
	transcriptCfg := ModelConfig{
		Name:    "g",
		Backend: "whisper",
	}
	Expect(transcriptCfg.HasUsecases(FLAG_ANY)).To(BeTrue())
	Expect(transcriptCfg.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
	Expect(transcriptCfg.HasUsecases(FLAG_TTS)).To(BeFalse())

	// musicgen covers TTS and sound generation, not transcription.
	soundCfg := ModelConfig{
		Name:    "h",
		Backend: "transformers-musicgen",
	}
	Expect(soundCfg.HasUsecases(FLAG_ANY)).To(BeTrue())
	Expect(soundCfg.HasUsecases(FLAG_TRANSCRIPT)).To(BeFalse())
	Expect(soundCfg.HasUsecases(FLAG_TTS)).To(BeTrue())
	Expect(soundCfg.HasUsecases(FLAG_SOUND_GENERATION)).To(BeTrue())

	// An explicit KnownUsecases mask adds chat/completion on top of the
	// backend-inferred transcript usecase (see assertions below).
	explicitUsecases := FLAG_CHAT | FLAG_COMPLETION
	maskedCfg := ModelConfig{
		Name:    "i",
		Backend: "whisper",
		// Earlier test checks parsing, this just needs to set final values
		KnownUsecases: &explicitUsecases,
	}
	Expect(maskedCfg.HasUsecases(FLAG_ANY)).To(BeTrue())
	Expect(maskedCfg.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
	Expect(maskedCfg.HasUsecases(FLAG_TTS)).To(BeFalse())
	Expect(maskedCfg.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
	Expect(maskedCfg.HasUsecases(FLAG_CHAT)).To(BeTrue())
})
|
|
It("Test Validate with invalid MCP config", func() {
	// The stdio JSON below is deliberately malformed — there is no comma
	// between the "ddg" and "weather" server entries — so Validate must
	// reject the MCP configuration.
	tmp, err := os.CreateTemp("", "config.yaml")
	Expect(err).To(BeNil())
	defer os.Remove(tmp.Name())
	_, err = tmp.WriteString(
		`name: test-mcp
backend: "llama-cpp"
mcp:
  stdio: |
    {
      "mcpServers": {
        "ddg": {
          "command": "/docker/docker",
          "args": ["run", "-i"]
        }
        "weather": {
          "command": "/docker/docker",
          "args": ["run", "-i"]
        }
      }
    }`)
	Expect(err).ToNot(HaveOccurred())
	configs, err := readModelConfigsFromFile(tmp.Name())
	// The YAML itself is valid (the bad JSON is just a string scalar), so
	// reading must succeed; check error and length before indexing.
	Expect(err).To(BeNil())
	Expect(configs).ToNot(BeEmpty())
	config := configs[0]
	Expect(config).ToNot(BeNil())
	valid, err := config.Validate()
	Expect(err).To(HaveOccurred())
	Expect(valid).To(BeFalse())
	Expect(err.Error()).To(ContainSubstring("invalid MCP configuration"))
})
|
|
It("Test Validate with valid MCP config", func() {
	// Same shape as the invalid-MCP test, but with the comma between the
	// two server entries in place, so Validate must accept it.
	tmp, err := os.CreateTemp("", "config.yaml")
	Expect(err).To(BeNil())
	defer os.Remove(tmp.Name())
	_, err = tmp.WriteString(
		`name: test-mcp-valid
backend: "llama-cpp"
mcp:
  stdio: |
    {
      "mcpServers": {
        "ddg": {
          "command": "/docker/docker",
          "args": ["run", "-i"]
        },
        "weather": {
          "command": "/docker/docker",
          "args": ["run", "-i"]
        }
      }
    }`)
	Expect(err).ToNot(HaveOccurred())
	configs, err := readModelConfigsFromFile(tmp.Name())
	// Check error and length before indexing so a parse failure reports
	// cleanly instead of panicking the suite.
	Expect(err).To(BeNil())
	Expect(configs).ToNot(BeEmpty())
	config := configs[0]
	Expect(config).ToNot(BeNil())
	valid, err := config.Validate()
	Expect(err).To(BeNil())
	Expect(valid).To(BeTrue())
})
|
|
It("Test Validate rejects unmarshalable engine_args", func() {
	// chan values cannot be JSON-marshalled. A valid YAML config could
	// not produce one, but a Go caller stuffing a bad value would, and
	// silently dropping it would change runtime behaviour.
	badArgs := map[string]any{
		"speculative_config": make(chan int),
	}
	modelCfg := &ModelConfig{
		Backend: "vllm",
		LLMConfig: LLMConfig{
			EngineArgs: badArgs,
		},
	}
	ok, validateErr := modelCfg.Validate()
	Expect(ok).To(BeFalse())
	Expect(validateErr).ToNot(BeNil())
	Expect(validateErr.Error()).To(ContainSubstring("engine_args is not JSON-serialisable"))
})
|
|
It("Test Validate accepts well-formed engine_args", func() {
	// A nested map of JSON-friendly values must pass validation untouched.
	speculativeCfg := map[string]any{
		"method":                 "ngram",
		"num_speculative_tokens": 4,
	}
	modelCfg := &ModelConfig{
		Backend: "vllm",
		LLMConfig: LLMConfig{
			EngineArgs: map[string]any{
				"data_parallel_size": 8,
				"speculative_config": speculativeCfg,
			},
		},
	}
	ok, validateErr := modelCfg.Validate()
	Expect(validateErr).To(BeNil())
	Expect(ok).To(BeTrue())
})
|
|
})
|