Bring the sglang Python backend up to feature parity with vLLM by adding
the same engine_args map plumbing the vLLM backend already has. Any
ServerArgs field (~380 in sglang 0.5.11) becomes settable from a model
YAML, including the speculative-decoding flags needed for Multi-Token
Prediction. Validation matches the vLLM backend's: keys are checked
against dataclasses.fields(ServerArgs), unknown keys raise ValueError
with a difflib close-match suggestion at LoadModel time, and the typed
ModelOptions fields keep their existing meaning, with engine_args
overriding them.
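
For concreteness, here is a minimal sketch of that validation path. It is
consistent with the unit tests at the end of this message, but the
ServerArgs import path and the exact error wording are assumptions, not
the literal backend code:

    import dataclasses
    import difflib
    import json

    from sglang.srt.server_args import ServerArgs  # import path assumed

    # Method on BackendServicer; shown standalone for brevity.
    def _apply_engine_args(kwargs, engine_args):
        """Merge a JSON object of ServerArgs overrides into kwargs in place."""
        if not engine_args:
            return kwargs  # empty/None passes the dict through untouched
        try:
            extras = json.loads(engine_args)
        except json.JSONDecodeError as err:
            raise ValueError(f"engine_args is not valid JSON: {err}") from err
        if not isinstance(extras, dict):
            raise ValueError("engine_args must be a JSON object")
        valid = {f.name for f in dataclasses.fields(ServerArgs)}
        for key, value in extras.items():
            if key not in valid:
                close = difflib.get_close_matches(key, sorted(valid), n=1)
                hint = f"; did you mean {close[0]!r}?" if close else ""
                raise ValueError(f"unknown ServerArgs key {key!r}{hint}")
            kwargs[key] = value  # engine_args overrides typed fields
        return kwargs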
Backend code:
* backend/python/sglang/backend.py: add _apply_engine_args, import
dataclasses/difflib/ServerArgs, and call it from LoadModel; rename
Seed -> sampling_seed (sglang 0.5.11 renamed the SamplingParams field).
* backend/python/sglang/test.py + test.sh + Makefile: six unit tests
exercising the helper directly (no engine load required).
Build / CI / backend gallery (cuda13 + l4t13 paths are now first-class):
* backend/python/sglang/install.sh: add --prerelease=allow because
sglang 0.5.11 hard-pins flash-attn-4 which only ships beta wheels;
add --index-strategy=unsafe-best-match for cublas12 so the cu128
torch index wins over default-PyPI's cu130; new pyproject.toml-driven
l4t13 install path so [tool.uv.sources] can pin torch/torchvision/
torchaudio/sglang to the jetson-ai-lab index without forcing every
transitive PyPI dep through the L4T mirror's flaky proxy (mirrors the
equivalent fix in backend/python/vllm/install.sh).
* backend/python/sglang/pyproject.toml (new): L4T project spec with
explicit-source jetson-ai-lab index. Replaces requirements-l4t13.txt
for the l4t13 BUILD_PROFILE; other profiles still go through the
requirements-*.txt pipeline via libbackend.sh's installRequirements.
* backend/python/sglang/requirements-l4t13.txt: removed; superseded
by pyproject.toml.
* backend/python/sglang/requirements-cublas{12,13}{,-after}.txt: pin
sglang>=0.5.11 (Gemma 4 floor); add a cu130 torch index for cublas13
(new files) and a cu128 torch index for cublas12 (PyPI now ships cu130
torch wheels by default, which breaks cu12 hosts).
* backend/index.yaml: add cuda13-sglang and cuda13-sglang-development
capability mappings + image entries pointing at
quay.io/.../-gpu-nvidia-cuda-13-sglang.
* .github/workflows/backend.yml: new cublas13 sglang matrix entry,
mirroring vllm's cuda13 build.
Model gallery + docs:
* gallery/sglang.yaml: base sglang config template, mirrors vllm.yaml.
* gallery/sglang-gemma-4-{e2b,e4b}-mtp.yaml: Gemma 4 MTP demos
transcribed verbatim from the SGLang Gemma 4 cookbook MTP commands.
* gallery/sglang-mimo-7b-mtp.yaml: MiMo-7B-RL with built-in MTP heads
+ online fp8 weight quantization, verified end-to-end on a 16 GB
RTX 5070 Ti at ~88 tok/s. Uses mem_fraction_static: 0.7 because the
MTP draft worker's vocab embedding is loaded unquantised and OOMs
the static reservation at sglang's 0.85 default (see the sketch after
this list).
* gallery/index.yaml: three new entries (gemma-4-e2b-it:sglang-mtp,
gemma-4-e4b-it:sglang-mtp, mimo-7b-mtp:sglang).
* docs/content/features/text-generation.md: new SGLang section with
setup, engine_args reference, MTP demos, version requirements.
* .agents/sglang-backend.md (new): agent one-pager covering the flat
ServerArgs structure, the typed-vs-engine_args precedence, the
speculative-decoding cheatsheet, and the mem_fraction_static gotcha
documented above.
* AGENTS.md: index entry for the new agent doc.
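
To make the typed-vs-engine_args precedence and the mem_fraction_static
workaround concrete, here is a hypothetical trace for the MiMo entry.
The model path and field values come from the gallery entry above; the
exact kwargs the backend pre-populates are an assumption:

    import json

    # Typed ModelOptions fields populate the engine kwargs first...
    kwargs = {
        "model_path": "XiaomiMiMo/MiMo-7B-RL",  # assumed HF repo for the demo
        "mem_fraction_static": 0.85,            # sglang's default reservation
    }
    # ...then the model YAML's engine_args map, serialized to JSON, overrides.
    engine_args = json.dumps({
        "speculative_algorithm": "EAGLE",  # MTP via the model's built-in heads
        "speculative_num_steps": 1,
        "quantization": "fp8",             # online fp8 weight quantization
        "mem_fraction_static": 0.7,        # headroom for the unquantised
                                           # draft-worker vocab embedding
    })
    kwargs = servicer._apply_engine_args(kwargs, engine_args)
    assert kwargs["mem_fraction_static"] == 0.7  # engine_args wins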
Known limitation: the two Gemma 4 MTP gallery entries ship a recipe
that doesn't yet run on stock libraries. The drafter checkpoints
(google/gemma-4-{E2B,E4B}-it-assistant) declare
model_type: gemma4_assistant / Gemma4AssistantForCausalLM, which
neither transformers (<=5.6.0, including the SGLang cookbook's pinned
commit 91b1ab1f... and main HEAD) nor sglang's own model registry
(<=0.5.11) registers as of 2026-05-06. They will start working when
HF or sglang upstream registers the architecture -- no LocalAI
changes needed. The MiMo MTP demo and the non-MTP Gemma 4 paths work
today on this build (verified on RTX 5070 Ti, 16 GB).
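
A quick way to reproduce the limitation, assuming stock transformers
behavior for unregistered architectures:

    from transformers import AutoConfig

    # Raises because 'gemma4_assistant' is not a registered model_type in
    # the versions listed above; it will start succeeding once HF or
    # sglang upstream registers the architecture.
    AutoConfig.from_pretrained("google/gemma-4-E2B-it-assistant")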
Assisted-by: Claude:claude-opus-4-7 [Read] [Edit] [Bash] [WebFetch] [WebSearch]
Signed-off-by: Richard Palethorpe <io@richiejp.com>
backend/python/sglang/test.py:
"""Unit tests for the sglang backend.
|
|
|
|
Helper-level tests run without launching the gRPC server or loading model
|
|
weights — they only exercise the pure-Python helpers on
|
|
``BackendServicer``. They do still require ``sglang`` to be importable
|
|
because ``_apply_engine_args`` validates keys against
|
|
``ServerArgs``'s dataclass fields.
|
|
"""
|
|
import unittest
|
|
|
|
|
|
class TestSglangHelpers(unittest.TestCase):
|
|
"""Tests for the pure helpers on BackendServicer (no gRPC, no engine)."""
|
|
|
|
def _servicer(self):
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
from backend import BackendServicer # noqa: E402
|
|
return BackendServicer()
|
|
|
|
def test_parse_options(self):
|
|
servicer = self._servicer()
|
|
opts = servicer._parse_options([
|
|
"tool_parser:hermes",
|
|
"reasoning_parser:deepseek_r1",
|
|
"invalid_no_colon",
|
|
"key_with_colons:a:b:c",
|
|
])
|
|
self.assertEqual(opts["tool_parser"], "hermes")
|
|
self.assertEqual(opts["reasoning_parser"], "deepseek_r1")
|
|
self.assertEqual(opts["key_with_colons"], "a:b:c")
|
|
self.assertNotIn("invalid_no_colon", opts)
|
|
|
|
def test_apply_engine_args_known_keys(self):
|
|
"""User-supplied JSON merges into the kwargs dict; pre-set typed
|
|
fields stay put when not overridden."""
|
|
import json as _json
|
|
servicer = self._servicer()
|
|
base = {
|
|
"model_path": "facebook/opt-125m",
|
|
"mem_fraction_static": 0.7,
|
|
}
|
|
extras = _json.dumps({
|
|
"trust_remote_code": True,
|
|
"speculative_algorithm": "EAGLE",
|
|
"speculative_num_steps": 1,
|
|
})
|
|
out = servicer._apply_engine_args(base, extras)
|
|
self.assertIs(out, base) # in-place merge — same dict back
|
|
self.assertTrue(out["trust_remote_code"])
|
|
self.assertEqual(out["speculative_algorithm"], "EAGLE")
|
|
self.assertEqual(out["speculative_num_steps"], 1)
|
|
self.assertEqual(out["model_path"], "facebook/opt-125m")
|
|
self.assertEqual(out["mem_fraction_static"], 0.7)
|
|
|
|
def test_apply_engine_args_engine_args_overrides_typed_fields(self):
|
|
"""engine_args wins over previously-set typed kwargs (vLLM precedence)."""
|
|
import json as _json
|
|
servicer = self._servicer()
|
|
base = {"model_path": "facebook/opt-125m", "mem_fraction_static": 0.7}
|
|
out = servicer._apply_engine_args(
|
|
base, _json.dumps({"mem_fraction_static": 0.5}),
|
|
)
|
|
self.assertEqual(out["mem_fraction_static"], 0.5)
|
|
|
|
def test_apply_engine_args_unknown_key_raises(self):
|
|
"""Typo'd key raises ValueError with a close-match suggestion."""
|
|
import json as _json
|
|
servicer = self._servicer()
|
|
base = {"model_path": "facebook/opt-125m"}
|
|
with self.assertRaises(ValueError) as ctx:
|
|
servicer._apply_engine_args(
|
|
base, _json.dumps({"trust_remotecode": True}),
|
|
)
|
|
msg = str(ctx.exception)
|
|
self.assertIn("trust_remotecode", msg)
|
|
self.assertIn("trust_remote_code", msg)
|
|
|
|
def test_apply_engine_args_empty_passthrough(self):
|
|
"""Empty / None engine_args returns the kwargs dict untouched."""
|
|
servicer = self._servicer()
|
|
base = {"model_path": "facebook/opt-125m"}
|
|
self.assertIs(servicer._apply_engine_args(base, ""), base)
|
|
self.assertIs(servicer._apply_engine_args(base, None), base)
|
|
|
|
def test_apply_engine_args_invalid_json_raises(self):
|
|
servicer = self._servicer()
|
|
with self.assertRaises(ValueError) as ctx:
|
|
servicer._apply_engine_args({}, "not-json")
|
|
self.assertIn("not valid JSON", str(ctx.exception))
|
|
|
|
def test_apply_engine_args_non_object_raises(self):
|
|
servicer = self._servicer()
|
|
with self.assertRaises(ValueError) as ctx:
|
|
servicer._apply_engine_args({}, "[1,2,3]")
|
|
self.assertIn("must be a JSON object", str(ctx.exception))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|