diff --git a/.github/bump_vllm_wheel.sh b/.github/bump_vllm_wheel.sh
new file mode 100755
index 000000000..8cb304768
--- /dev/null
+++ b/.github/bump_vllm_wheel.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Bump the cublas13 vLLM wheel pin in requirements-cublas13-after.txt.
+#
+# vLLM's PyPI wheel is built against CUDA 12 so the cublas13 build pulls a
+# cu130-flavoured wheel from vLLM's per-tag index at
+# https://wheels.vllm.ai/<version>/cu130/. That URL segment is itself version-locked
+# (no /latest/ alias upstream), so bumping vLLM means rewriting both the URL
+# segment and the version constraint atomically. bump_deps.sh handles git-sha
+# vars in Makefiles; this script handles the two-value rewrite specific to the
+# vLLM requirements file.
+set -xe
+REPO=$1  # vllm-project/vllm
+FILE=$2  # backend/python/vllm/requirements-cublas13-after.txt
+VAR=$3   # VLLM_VERSION (used for output file names so the workflow can read them)
+
+if [ -z "$FILE" ] || [ -z "$REPO" ] || [ -z "$VAR" ]; then
+    echo "usage: $0 <repo> <file> <var>" >&2
+    exit 1
+fi
+
+# /releases/latest returns the most recent non-prerelease tag.
+LATEST_TAG=$(curl -sS -H "Accept: application/vnd.github+json" \
+    "https://api.github.com/repos/$REPO/releases/latest" \
+    | python3 -c "import json,sys; print(json.load(sys.stdin)['tag_name'])")
+
+# Strip leading 'v' (vLLM tags are 'v0.20.0', the URL/version use '0.20.0').
+NEW_VERSION="${LATEST_TAG#v}"
+
+set +e
+CURRENT_VERSION=$(grep -oE '^vllm==[0-9]+\.[0-9]+\.[0-9]+' "$FILE" | head -1 | cut -d= -f3)
+set -e
+
+# sed both lines unconditionally — peter-evans/create-pull-request opens no PR
+# when the working tree is clean, so a no-op rewrite is safe.
+sed -i "$FILE" \
+    -e "s|wheels\.vllm\.ai/[^/]*/cu130|wheels.vllm.ai/$NEW_VERSION/cu130|g" \
+    -e "s|^vllm==.*|vllm==$NEW_VERSION|"
+
+if [ -z "$CURRENT_VERSION" ]; then
+    echo "Could not find vllm==X.Y.Z in $FILE."
+    exit 0
+fi
+
+echo "Changes: https://github.com/$REPO/compare/v${CURRENT_VERSION}...${LATEST_TAG}" >> "${VAR}_message.txt"
+echo "${NEW_VERSION}" >> "${VAR}_commit.txt"
diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml
index 7117950e0..676af410b 100644
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -80,5 +80,37 @@ jobs:
           body: ${{ steps.bump.outputs.message }}
           signoff: true
-
-
+  bump-vllm-wheel:
+    # vLLM's cu130 wheel comes from a per-tag index URL (no /latest/ alias),
+    # so the cublas13 requirements file pins both a URL segment and a version
+    # constraint. bump_deps.sh handles git-sha-in-Makefile only — this job
+    # rewrites both values atomically when a new vLLM stable tag ships.
+    if: github.repository == 'mudler/LocalAI'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - name: Bump vLLM cu130 wheel pin 🔧
+        id: bump
+        run: |
+          bash .github/bump_vllm_wheel.sh vllm-project/vllm backend/python/vllm/requirements-cublas13-after.txt VLLM_VERSION
+          {
+            echo 'message<<EOF'
+            cat VLLM_VERSION_message.txt
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+          {
+            echo 'commit<<EOF'
+            cat VLLM_VERSION_commit.txt
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+          rm -rfv VLLM_VERSION_message.txt VLLM_VERSION_commit.txt
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v8
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Update vllm-project/vllm cu130 wheel'
+          title: 'chore: :arrow_up: Update vllm-project/vllm cu130 wheel to `${{ steps.bump.outputs.commit }}`'
+          branch: "update/VLLM_VERSION"
+          body: ${{ steps.bump.outputs.message }}
+          signoff: true
diff --git a/backend/backend.proto b/backend/backend.proto
index 0c54d7307..43b6abe6c 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -310,6 +310,11 @@ message ModelOptions {
   bool Reranking = 71;
   repeated string Overrides = 72;
+
+  // EngineArgs carries a JSON-encoded map of backend-native engine arguments
+  // applied verbatim to the backend's engine constructor (e.g. vLLM AsyncEngineArgs).
+  // Unknown keys produce an error at LoadModel time.
+  string EngineArgs = 73;
 }
 
 message Result {
diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 95ae95a9d..fcdbb96cd 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 import asyncio
+import dataclasses
+import difflib
 from concurrent import futures
 import argparse
 import signal
@@ -101,6 +103,36 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 opts[key.strip()] = value.strip()
         return opts
 
+    def _apply_engine_args(self, engine_args, engine_args_json):
+        """Apply user-supplied engine_args (JSON object) onto an AsyncEngineArgs.
+
+        Returns a new AsyncEngineArgs with the typed fields preserved and the
+        user's overrides layered on top. Uses ``dataclasses.replace`` so vLLM's
+        ``__post_init__`` re-runs and auto-converts dict-valued fields like
+        ``compilation_config`` / ``attention_config`` into their dataclass form.
+        ``speculative_config`` and ``kv_transfer_config`` are accepted as dicts
+        directly (vLLM converts them at engine init).
+
+        Unknown keys raise ValueError with the closest valid field as a hint.
+        """
+        if not engine_args_json:
+            return engine_args
+        try:
+            extra = json.loads(engine_args_json)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"engine_args is not valid JSON: {e}") from e
+        if not isinstance(extra, dict):
+            raise ValueError(
+                f"engine_args must be a JSON object, got {type(extra).__name__}"
+            )
+        valid = {f.name for f in dataclasses.fields(type(engine_args))}
+        for key in extra:
+            if key not in valid:
+                suggestion = difflib.get_close_matches(key, valid, n=1)
+                hint = f" did you mean {suggestion[0]!r}?" if suggestion else ""
+                raise ValueError(f"unknown engine_args key {key!r}.{hint}")
+        return dataclasses.replace(engine_args, **extra)
+
     def _messages_to_dicts(self, messages):
         """Convert proto Messages to list of dicts suitable for apply_chat_template()."""
         result = []
@@ -176,6 +208,15 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 "audio": max(request.LimitAudioPerPrompt, 1)
             }
 
+        # engine_args from YAML overrides typed fields above so operators can
+        # tune anything the AsyncEngineArgs dataclass exposes without waiting
+        # on protobuf changes.
+        try:
+            engine_args = self._apply_engine_args(engine_args, request.EngineArgs)
+        except ValueError as err:
+            print(f"engine_args error: {err}", file=sys.stderr)
+            return backend_pb2.Result(success=False, message=str(err))
+
         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
         except Exception as err:
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 466095634..2313ff779 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -32,6 +32,14 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi
 
+# cublas13 pulls the vLLM wheel from a per-tag cu130 index (PyPI's vllm wheel
+# is built against CUDA 12 and won't load on cu130). uv's default per-package
+# first-match strategy would still pick the PyPI wheel, so allow it to consult
+# every configured index when resolving.
+if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
+fi
+
 # JetPack 7 / L4T arm64 wheels (torch, vllm, flash-attn) live on
 # pypi.jetson-ai-lab.io and are built for cp312, so bump the venv Python
 # accordingly. JetPack 6 keeps cp310 + USE_PIP=true. unsafe-best-match
diff --git a/backend/python/vllm/requirements-cublas13-after.txt b/backend/python/vllm/requirements-cublas13-after.txt
index 1644a5544..1c38d8d3b 100644
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -1,2 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu130
-vllm
+# vLLM's PyPI wheel is built against CUDA 12 (libcudart.so.12) and won't load
+# on a cu130 host. Pull the cu130-flavoured wheel from vLLM's per-tag index
+# instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
+# so uv consults this index alongside PyPI.
+--extra-index-url https://wheels.vllm.ai/0.20.0/cu130
+vllm==0.20.0
diff --git a/backend/python/vllm/test.py b/backend/python/vllm/test.py
index 21aaf4cf7..25a7f54e6 100644
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -168,6 +168,58 @@ class TestBackendServicer(unittest.TestCase):
         self.assertEqual(opts["key_with_colons"], "a:b:c")
         self.assertNotIn("invalid_no_colon", opts)
 
+    def test_apply_engine_args_known_keys(self):
+        """
+        Tests _apply_engine_args overlays user-supplied JSON onto AsyncEngineArgs.
+        """
+        import sys, os, json as _json
+        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+        from backend import BackendServicer
+        from vllm.engine.arg_utils import AsyncEngineArgs
+
+        servicer = BackendServicer()
+        base = AsyncEngineArgs(model="facebook/opt-125m")
+        extras = _json.dumps({
+            "trust_remote_code": True,
+            "max_num_seqs": 32,
+        })
+        out = servicer._apply_engine_args(base, extras)
+        self.assertTrue(out.trust_remote_code)
+        self.assertEqual(out.max_num_seqs, 32)
+        # untouched fields preserved
+        self.assertEqual(out.model, "facebook/opt-125m")
+
+    def test_apply_engine_args_unknown_key_raises(self):
+        """
+        Tests _apply_engine_args rejects unknown keys with a helpful suggestion.
+ """ + import sys, os, json as _json + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + from backend import BackendServicer + from vllm.engine.arg_utils import AsyncEngineArgs + + servicer = BackendServicer() + base = AsyncEngineArgs(model="facebook/opt-125m") + with self.assertRaises(ValueError) as ctx: + servicer._apply_engine_args(base, _json.dumps({"trustremotecode": True})) + self.assertIn("trustremotecode", str(ctx.exception)) + # close-match hint for the typo + self.assertIn("trust_remote_code", str(ctx.exception)) + + def test_apply_engine_args_empty_passthrough(self): + """ + Tests that empty engine_args returns the base unchanged. + """ + import sys, os + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + from backend import BackendServicer + from vllm.engine.arg_utils import AsyncEngineArgs + + servicer = BackendServicer() + base = AsyncEngineArgs(model="facebook/opt-125m") + self.assertIs(servicer._apply_engine_args(base, ""), base) + self.assertIs(servicer._apply_engine_args(base, None), base) + def test_tokenize_string(self): """ Tests the TokenizeString RPC returns valid tokens. diff --git a/core/backend/options.go b/core/backend/options.go index b09782ce2..9054bb39b 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -1,6 +1,8 @@ package backend import ( + "encoding/json" + "fmt" "math/rand/v2" "os" "path/filepath" @@ -159,6 +161,19 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { }) } + engineArgsJSON := "" + if len(c.EngineArgs) > 0 { + buf, err := json.Marshal(c.EngineArgs) + if err != nil { + // ModelConfig.Validate() rejects unmarshalable engine_args at + // config load, so reaching here means the validator was bypassed. + // Silently dropping user-set options would change runtime behaviour + // without warning — fail loud instead. + panic(fmt.Sprintf("engine_args marshal failed for model %q: %v (Validate() should have caught this)", c.Model, err)) + } + engineArgsJSON = string(buf) + } + opts := &pb.ModelOptions{ CUDA: c.CUDA || c.Diffusers.CUDA, SchedulerType: c.Diffusers.SchedulerType, @@ -176,6 +191,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { CLIPSubfolder: c.Diffusers.ClipSubFolder, Options: c.Options, Overrides: c.Overrides, + EngineArgs: engineArgsJSON, CLIPSkip: int32(c.Diffusers.ClipSkip), ControlNet: c.Diffusers.ControlNet, ContextSize: int32(ctxSize), diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go new file mode 100644 index 000000000..bdce828b3 --- /dev/null +++ b/core/backend/options_internal_test.go @@ -0,0 +1,44 @@ +package backend + +import ( + "encoding/json" + + "github.com/mudler/LocalAI/core/config" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("grpcModelOpts EngineArgs", func() { + It("serialises engine_args as JSON preserving nested values", func() { + threads := 1 + cfg := config.ModelConfig{ + Threads: &threads, + LLMConfig: config.LLMConfig{ + EngineArgs: map[string]any{ + "data_parallel_size": 8, + "enable_expert_parallel": true, + "speculative_config": map[string]any{ + "method": "ngram", + "num_speculative_tokens": 4, + }, + }, + }, + } + + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.EngineArgs).NotTo(BeEmpty()) + + var round map[string]any + Expect(json.Unmarshal([]byte(opts.EngineArgs), &round)).To(Succeed()) + Expect(round["data_parallel_size"]).To(BeEquivalentTo(8)) + Expect(round["enable_expert_parallel"]).To(BeTrue()) + Expect(round["speculative_config"]).To(HaveKeyWithValue("method", "ngram")) + }) + + It("leaves EngineArgs empty when unset", func() { + threads := 1 + opts := grpcModelOpts(config.ModelConfig{Threads: &threads}, "/tmp/models") + Expect(opts.EngineArgs).To(BeEmpty()) + }) +}) diff --git a/core/config/hooks_test.go b/core/config/hooks_test.go index b97077564..12aad2558 100644 --- a/core/config/hooks_test.go +++ b/core/config/hooks_test.go @@ -110,5 +110,30 @@ var _ = Describe("Backend hooks and parser defaults", func() { } Expect(count).To(Equal(1)) }) + + It("seeds production engine_args defaults", func() { + cfg := &ModelConfig{Backend: "vllm"} + cfg.SetDefaults() + + Expect(cfg.EngineArgs).NotTo(BeNil()) + Expect(cfg.EngineArgs["enable_prefix_caching"]).To(Equal(true)) + Expect(cfg.EngineArgs["enable_chunked_prefill"]).To(Equal(true)) + }) + + It("does not override user-set engine_args", func() { + cfg := &ModelConfig{ + Backend: "vllm", + LLMConfig: LLMConfig{ + EngineArgs: map[string]any{ + "enable_prefix_caching": false, + }, + }, + } + cfg.SetDefaults() + + Expect(cfg.EngineArgs["enable_prefix_caching"]).To(Equal(false)) + // chunked_prefill is still seeded since user didn't set it + Expect(cfg.EngineArgs["enable_chunked_prefill"]).To(Equal(true)) + }) }) }) diff --git a/core/config/hooks_vllm.go b/core/config/hooks_vllm.go index 3f7abd9b3..ffdd1a52a 100644 --- a/core/config/hooks_vllm.go +++ b/core/config/hooks_vllm.go @@ -45,8 +45,34 @@ func MatchParserDefaults(modelID string) map[string]string { return nil } +// productionEngineArgsDefaults are vLLM ≥ 0.6 features that production deployments +// almost always want. Applied at load time when the user hasn't set the key in +// engine_args. Anything user-supplied wins; we never silently override. +var productionEngineArgsDefaults = map[string]any{ + "enable_prefix_caching": true, + "enable_chunked_prefill": true, +} + func vllmDefaults(cfg *ModelConfig, modelPath string) { - // Check if user already set tool_parser or reasoning_parser in Options + applyEngineArgDefaults(cfg) + applyParserDefaults(cfg) +} + +// applyEngineArgDefaults seeds production-friendly engine_args without overwriting +// anything the user already set. 
+func applyEngineArgDefaults(cfg *ModelConfig) {
+	if cfg.EngineArgs == nil {
+		cfg.EngineArgs = map[string]any{}
+	}
+	for k, v := range productionEngineArgsDefaults {
+		if _, set := cfg.EngineArgs[k]; set {
+			continue
+		}
+		cfg.EngineArgs[k] = v
+	}
+}
+
+func applyParserDefaults(cfg *ModelConfig) {
 	hasToolParser := false
 	hasReasoningParser := false
 	for _, opt := range cfg.Options {
@@ -61,7 +87,6 @@
 		return
 	}
 
-	// Try matching against Model field, then Name
 	parsers := MatchParserDefaults(cfg.Model)
 	if parsers == nil {
 		parsers = MatchParserDefaults(cfg.Name)
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 1184d8452..f1f0b30d3 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1,6 +1,7 @@
 package config
 
 import (
+	"encoding/json"
 	"fmt"
 	"os"
 	"regexp"
@@ -241,7 +242,13 @@ type LLMConfig struct {
 	DisableLogStatus bool             `yaml:"disable_log_stats,omitempty" json:"disable_log_stats,omitempty"`     // vLLM
 	DType            string           `yaml:"dtype,omitempty" json:"dtype,omitempty"`                             // vLLM
 	LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt,omitempty" json:"limit_mm_per_prompt,omitempty"` // vLLM
-	MMProj           string           `yaml:"mmproj,omitempty" json:"mmproj,omitempty"`
+	// EngineArgs is a backend-native passthrough applied to the engine constructor
+	// (e.g. vLLM AsyncEngineArgs). Values may be primitives or nested maps; nested
+	// maps materialise into the backend's nested config dataclasses (e.g.
+	// SpeculativeConfig, KVTransferConfig, CompilationConfig). Unknown keys cause
+	// the backend to fail LoadModel with a list of valid names.
+	EngineArgs map[string]any `yaml:"engine_args,omitempty" json:"engine_args,omitempty"`
+	MMProj     string         `yaml:"mmproj,omitempty" json:"mmproj,omitempty"`
 	FlashAttention *string `yaml:"flash_attention,omitempty" json:"flash_attention,omitempty"`
 	NoKVOffloading bool    `yaml:"no_kv_offloading,omitempty" json:"no_kv_offloading,omitempty"`
@@ -545,6 +552,15 @@ func (c *ModelConfig) Validate() (bool, error) {
 		}
 	}
 
+	// engine_args crosses the gRPC boundary as a JSON-encoded string. Reject
+	// unmarshalable values here so a config that would silently lose user-set
+	// options at load time is rejected at parse time instead.
+	if len(c.EngineArgs) > 0 {
+		if _, err := json.Marshal(c.EngineArgs); err != nil {
+			return false, fmt.Errorf("engine_args is not JSON-serialisable: %w", err)
+		}
+	}
+
 	return true, nil
 }
diff --git a/core/config/model_config_test.go b/core/config/model_config_test.go
index 9926774b1..81b86fe3f 100644
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -230,4 +230,38 @@ mcp:
 		Expect(err).To(BeNil())
 		Expect(valid).To(BeTrue())
 	})
+	It("Test Validate rejects unmarshalable engine_args", func() {
+		// chan values cannot be JSON-marshalled. A valid YAML config could
+		// not produce one, but a Go caller stuffing a bad value would, and
+		// silently dropping it would change runtime behaviour.
+		cfg := &ModelConfig{
+			Backend: "vllm",
+			LLMConfig: LLMConfig{
+				EngineArgs: map[string]any{
+					"speculative_config": make(chan int),
+				},
+			},
+		}
+		valid, err := cfg.Validate()
+		Expect(valid).To(BeFalse())
+		Expect(err).ToNot(BeNil())
+		Expect(err.Error()).To(ContainSubstring("engine_args is not JSON-serialisable"))
+	})
+	It("Test Validate accepts well-formed engine_args", func() {
+		cfg := &ModelConfig{
+			Backend: "vllm",
+			LLMConfig: LLMConfig{
+				EngineArgs: map[string]any{
+					"data_parallel_size": 8,
+					"speculative_config": map[string]any{
+						"method":                 "ngram",
+						"num_speculative_tokens": 4,
+					},
+				},
+			},
+		}
+		valid, err := cfg.Validate()
+		Expect(err).To(BeNil())
+		Expect(valid).To(BeTrue())
+	})
 })
diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md
index fdc4edc72..6d051aada 100644
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -665,6 +665,46 @@ curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d
 }'
 ```
 
+#### Passing arbitrary vLLM options with `engine_args`
+
+A subset of `AsyncEngineArgs` is exposed as typed YAML fields
+(`tensor_parallel_size`, `gpu_memory_utilization`, `quantization`,
+`max_model_len`, `dtype`, `trust_remote_code`, `enforce_eager`, …).
+Anything else can be passed through the generic `engine_args:` map.
+Keys are forwarded verbatim to vLLM's engine; unknown keys fail at load
+time with the closest valid name as a hint. Nested maps materialise
+into vLLM's nested config dataclasses (`SpeculativeConfig`,
+`KVTransferConfig`, `CompilationConfig`, …).
+
+Speculative decoding (DFlash, ngram, eagle, deepseek_mtp, …) is
+configured this way:
+
+```yaml
+name: qwen3.5-4b-dflash
+backend: vllm
+parameters:
+  model: Qwen/Qwen3.5-4B
+context_size: 8192
+max_model_len: 8192
+trust_remote_code: true
+quantization: fp8
+template:
+  use_tokenizer_template: true
+engine_args:
+  speculative_config:
+    method: dflash
+    model: z-lab/Qwen3.5-4B-DFlash
+    num_speculative_tokens: 15
+```
+
+The shape of `speculative_config` follows vLLM's
+[`SpeculativeConfig`](https://docs.vllm.ai/en/latest/api/vllm/config/speculative.html)
+— `method` picks the algorithm, the remaining keys are method-specific.
+Drafters from [z-lab](https://huggingface.co/z-lab) are paired with
+specific target models; pick the one that matches your target. The
+drafter loads in its native precision regardless of the target's
+`quantization:` setting.
+
 ### Transformers
 
 [Transformers](https://huggingface.co/docs/transformers/index) is a State-of-the-art Machine Learning library for PyTorch, TensorFlow, and JAX.
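As a quick illustration of the non-speculative case, here is a minimal sketch of how the new passthrough reads in a model YAML once this change lands. It is not part of the diff above: the model name and the parallelism values are illustrative assumptions, the two `engine_args` keys mirror the ones exercised in `options_internal_test.go`, and `enable_prefix_caching` / `enable_chunked_prefill` are seeded automatically by `hooks_vllm.go` unless set explicitly.

```yaml
# Hypothetical config; names and values are examples, not defaults shipped by this change.
name: my-vllm-model
backend: vllm
parameters:
  model: Qwen/Qwen3.5-4B           # any vLLM-served model
engine_args:
  data_parallel_size: 8            # forwarded verbatim to AsyncEngineArgs
  enable_expert_parallel: true
  # A typo such as "trustremotecode" makes LoadModel fail with a
  # "did you mean 'trust_remote_code'?" hint instead of being silently ignored.
```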