feat(backends): add sglang

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-04-14 22:38:52 +00:00
parent ad3c8c4832
commit d47e2aa93f
21 changed files with 896 additions and 2 deletions

View File

@@ -66,6 +66,19 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-sglang'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'true'
backend: "sglang"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
@@ -411,6 +424,19 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-sglang'
runs-on: 'arc-runner-set'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "sglang"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
@@ -1427,6 +1453,19 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-sglang'
runs-on: 'arc-runner-set'
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
skip-drivers: 'false'
backend: "sglang"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
@@ -1689,6 +1728,19 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'intel'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sglang'
runs-on: 'arc-runner-set'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "sglang"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'intel'
cuda-major-version: ""
cuda-minor-version: ""

View File

@@ -33,6 +33,7 @@ jobs:
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
turboquant: ${{ steps.detect.outputs.turboquant }}
vllm: ${{ steps.detect.outputs.vllm }}
sglang: ${{ steps.detect.outputs.sglang }}
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
voxtral: ${{ steps.detect.outputs.voxtral }}
@@ -589,6 +590,32 @@ jobs:
# - name: Build vllm (cpu) backend image and run gRPC e2e tests
# run: |
# make test-extra-backend-vllm
tests-sglang-grpc:
needs: detect-changes
if: needs.detect-changes.outputs.sglang == 'true' || needs.detect-changes.outputs.run-all == 'true'
runs-on: ubuntu-latest
timeout-minutes: 90
steps:
- name: Clone
uses: actions/checkout@v6
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
make build-essential curl unzip ca-certificates git tar
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version: '1.25.4'
- name: Free disk space
run: |
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
df -h
- name: Build sglang (cpu) backend image and run gRPC e2e tests
run: |
make test-extra-backend-sglang
tests-acestep-cpp:
needs: detect-changes
if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'

View File

@@ -1,5 +1,5 @@
# Disable parallel execution for backend builds
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/tinygrad
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/tinygrad
GOCMD=go
GOTEST=$(GOCMD) test
@@ -419,6 +419,7 @@ prepare-test-extra: protogen-python
$(MAKE) -C backend/python/chatterbox
$(MAKE) -C backend/python/vllm
$(MAKE) -C backend/python/vllm-omni
$(MAKE) -C backend/python/sglang
$(MAKE) -C backend/python/vibevoice
$(MAKE) -C backend/python/moonshine
$(MAKE) -C backend/python/pocket-tts
@@ -602,6 +603,17 @@ test-extra-backend-tinygrad-all: \
test-extra-backend-tinygrad-sd \
test-extra-backend-tinygrad-whisper
## sglang mirrors the vllm setup: HuggingFace model id, same tiny Qwen,
## tool-call extraction via sglang's native qwen parser. CPU builds use
## sglang's upstream pyproject_cpu.toml recipe (see backend/python/sglang/install.sh).
test-extra-backend-sglang: docker-build-sglang
BACKEND_IMAGE=local-ai-backend:sglang \
BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
BACKEND_TEST_CAPS=health,load,predict,stream,tools \
BACKEND_TEST_OPTIONS=tool_parser:qwen \
$(MAKE) test-extra-backend
## mlx is Apple-Silicon-first — the MLX backend auto-detects the right tool
## parser from the chat template, so no tool_parser: option is needed (it
## would be ignored at runtime). Run this on macOS / arm64 with Metal; the
@@ -741,6 +753,7 @@ BACKEND_NEUTTS = neutts|python|.|false|true
BACKEND_KOKORO = kokoro|python|.|false|true
BACKEND_VLLM = vllm|python|.|false|true
BACKEND_VLLM_OMNI = vllm-omni|python|.|false|true
BACKEND_SGLANG = sglang|python|.|false|true
BACKEND_DIFFUSERS = diffusers|python|.|--progress=plain|true
BACKEND_CHATTERBOX = chatterbox|python|.|false|true
BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true
@@ -811,6 +824,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_NEUTTS)))
$(eval $(call generate-docker-build-target,$(BACKEND_KOKORO)))
$(eval $(call generate-docker-build-target,$(BACKEND_VLLM)))
$(eval $(call generate-docker-build-target,$(BACKEND_VLLM_OMNI)))
$(eval $(call generate-docker-build-target,$(BACKEND_SGLANG)))
$(eval $(call generate-docker-build-target,$(BACKEND_DIFFUSERS)))
$(eval $(call generate-docker-build-target,$(BACKEND_CHATTERBOX)))
$(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE)))
@@ -839,7 +853,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP)))
docker-save-%: backend-images
docker save local-ai-backend:$* -o backend-images/$*.tar
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp
########################################################
### Mock Backend for E2E Tests

View File

@@ -227,6 +227,28 @@
intel: "intel-vllm"
nvidia-cuda-12: "cuda12-vllm"
cpu: "cpu-vllm"
- &sglang
name: "sglang"
license: apache-2.0
urls:
- https://github.com/sgl-project/sglang
tags:
- text-to-text
- multimodal
icon: https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png
description: |
SGLang is a fast serving framework for large language models and vision language models.
It co-designs the backend runtime (RadixAttention, continuous batching, structured
decoding) and the frontend language to make interaction with models faster and more
controllable. Features include fast backend runtime, flexible frontend language,
extensive model support, and an active community.
alias: "sglang"
capabilities:
nvidia: "cuda12-sglang"
amd: "rocm-sglang"
intel: "intel-sglang"
nvidia-cuda-12: "cuda12-sglang"
cpu: "cpu-sglang"
- &vllm-omni
name: "vllm-omni"
license: apache-2.0
@@ -1766,6 +1788,54 @@
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-vllm"
mirrors:
- localai/localai-backends:master-cpu-vllm
# sglang
- !!merge <<: *sglang
name: "sglang-development"
capabilities:
nvidia: "cuda12-sglang-development"
amd: "rocm-sglang-development"
intel: "intel-sglang-development"
cpu: "cpu-sglang-development"
- !!merge <<: *sglang
name: "cuda12-sglang"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-sglang"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-sglang
- !!merge <<: *sglang
name: "rocm-sglang"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-sglang"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-sglang
- !!merge <<: *sglang
name: "intel-sglang"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sglang"
mirrors:
- localai/localai-backends:latest-gpu-intel-sglang
- !!merge <<: *sglang
name: "cpu-sglang"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-sglang"
mirrors:
- localai/localai-backends:latest-cpu-sglang
- !!merge <<: *sglang
name: "cuda12-sglang-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-sglang"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-sglang
- !!merge <<: *sglang
name: "rocm-sglang-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-sglang"
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-sglang
- !!merge <<: *sglang
name: "intel-sglang-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sglang"
mirrors:
- localai/localai-backends:master-gpu-intel-sglang
- !!merge <<: *sglang
name: "cpu-sglang-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-sglang"
mirrors:
- localai/localai-backends:master-cpu-sglang
# vllm-omni
- !!merge <<: *vllm-omni
name: "vllm-omni-development"

View File

@@ -0,0 +1,17 @@
# Build/run entry points for the sglang python backend.
# `make sglang` creates the venv and installs dependencies via install.sh;
# `make run` ensures the install happened, then starts the gRPC server.
.PHONY: sglang
sglang:
	bash install.sh

.PHONY: run
run: sglang
	@echo "Running sglang..."
	bash run.sh
	@echo "sglang run."

# Remove the generated gRPC python stubs only.
.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

# Full cleanup: generated stubs plus the virtualenv and bytecode caches.
.PHONY: clean
clean: protogen-clean
	rm -rf venv __pycache__

View File

@@ -0,0 +1,502 @@
#!/usr/bin/env python3
"""LocalAI gRPC backend for sglang.
Wraps sglang's async Engine API behind the Backend gRPC contract defined
in backend.proto. Mirrors the structure of backend/python/vllm/backend.py
so that the two backends stay behavior-equivalent at the protocol level.
The streaming path applies sglang's per-request FunctionCallParser and
ReasoningParser so tool_calls and reasoning_content are emitted
incrementally inside ChatDelta, which is a capability sglang exposes
natively and vLLM does not.
"""
import asyncio
from concurrent import futures
import argparse
import signal
import sys
import os
import json
import gc
import uuid
import base64
import io
from typing import Dict, List, Optional, Tuple
from PIL import Image
import backend_pb2
import backend_pb2_grpc
import grpc
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
from grpc_auth import get_auth_interceptors
# sglang imports. Engine is the stable public entry point; parser modules
# are wrapped in try/except so older / leaner installs that omit them
# still load the backend for plain text generation.
from sglang.srt.entrypoints.engine import Engine
try:
from sglang.srt.function_call.function_call_parser import FunctionCallParser
# sglang's FunctionCallParser expects a list of pydantic Tool objects
# (protocol.Tool with .function.name), not plain dicts. Wrap at the
# request boundary to match.
from sglang.srt.entrypoints.openai.protocol import Tool as SglTool
HAS_TOOL_PARSERS = True
except Exception:
FunctionCallParser = None # type: ignore
SglTool = None # type: ignore
HAS_TOOL_PARSERS = False
try:
from sglang.srt.parser.reasoning_parser import ReasoningParser
HAS_REASONING_PARSERS = True
except Exception:
ReasoningParser = None # type: ignore
HAS_REASONING_PARSERS = False
try:
from transformers import AutoTokenizer
HAS_TRANSFORMERS = True
except Exception:
AutoTokenizer = None # type: ignore
HAS_TRANSFORMERS = False
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """gRPC servicer implementing the Backend service for sglang.

    State established by LoadModel and read by the prediction paths:
      - ``self.llm``: the sglang ``Engine`` instance.
      - ``self.tokenizer``: optional transformers tokenizer used for chat
        templating and TokenizeString (may be None if loading failed).
      - ``self.tool_parser_name`` / ``self.reasoning_parser_name``: parser
        identifiers taken from the request Options; actual parser objects
        are built per request because sglang's parsers are stateful.
    """

    def _parse_options(self, options_list) -> Dict[str, str]:
        """Parse repeated ``key:value`` option strings into a dict.

        Entries without a ``:`` are ignored. Only the first ``:`` splits,
        so values may themselves contain colons. Keys/values are stripped.
        """
        opts: Dict[str, str] = {}
        for opt in options_list:
            if ":" not in opt:
                continue
            key, value = opt.split(":", 1)
            opts[key.strip()] = value.strip()
        return opts

    def _messages_to_dicts(self, messages) -> List[dict]:
        """Convert proto chat messages into OpenAI-style dicts.

        Optional fields (name, tool_call_id, reasoning_content, tool_calls)
        are included only when non-empty. ``tool_calls`` arrives as a JSON
        string; malformed JSON is silently dropped rather than failing the
        whole request.
        """
        result: List[dict] = []
        for msg in messages:
            d = {"role": msg.role, "content": msg.content or ""}
            if msg.name:
                d["name"] = msg.name
            if msg.tool_call_id:
                d["tool_call_id"] = msg.tool_call_id
            if msg.reasoning_content:
                d["reasoning_content"] = msg.reasoning_content
            if msg.tool_calls:
                try:
                    d["tool_calls"] = json.loads(msg.tool_calls)
                except json.JSONDecodeError:
                    pass
            result.append(d)
        return result

    def Health(self, request, context):
        """Liveness probe: always reports OK once the process is up."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    async def LoadModel(self, request, context):
        """Create the sglang Engine from proto LoadModel options.

        Maps the vllm-style proto fields onto sglang engine kwargs
        (e.g. GPUMemoryUtilization -> mem_fraction_static, EnforceEager ->
        disable_cuda_graph, MaxModelLen -> context_length). Returns a
        Result with success=False instead of raising on engine init
        failure so the caller gets a structured error.
        """
        engine_kwargs = {"model_path": request.Model}
        if request.Quantization:
            engine_kwargs["quantization"] = request.Quantization
        if request.LoadFormat:
            engine_kwargs["load_format"] = request.LoadFormat
        if request.GPUMemoryUtilization:
            engine_kwargs["mem_fraction_static"] = float(request.GPUMemoryUtilization)
        if request.TrustRemoteCode:
            engine_kwargs["trust_remote_code"] = True
        if request.EnforceEager:
            engine_kwargs["disable_cuda_graph"] = True
        if request.TensorParallelSize:
            engine_kwargs["tp_size"] = int(request.TensorParallelSize)
        if request.MaxModelLen:
            engine_kwargs["context_length"] = int(request.MaxModelLen)
        if request.DType:
            engine_kwargs["dtype"] = request.DType
        opts = self._parse_options(request.Options)
        # Cache parser names — actual parser instances are created per
        # request because sglang's parsers are stateful.
        self.tool_parser_name: Optional[str] = opts.get("tool_parser") or None
        self.reasoning_parser_name: Optional[str] = opts.get("reasoning_parser") or None
        # Also hand the parser names to sglang's engine so its HTTP/OAI
        # paths work identically if someone hits the engine directly.
        if self.tool_parser_name:
            engine_kwargs["tool_call_parser"] = self.tool_parser_name
        if self.reasoning_parser_name:
            engine_kwargs["reasoning_parser"] = self.reasoning_parser_name
        try:
            self.llm = Engine(**engine_kwargs)
        except Exception as err:
            print(f"sglang Engine init failed: {err!r}", file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"{err!r}")
        # sglang does not expose a uniform get_tokenizer() off Engine.
        # Use transformers directly — same path sglang uses internally.
        self.tokenizer = None
        if HAS_TRANSFORMERS:
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(
                    request.Model,
                    trust_remote_code=bool(request.TrustRemoteCode),
                )
            except Exception as err:
                # Non-fatal: generation still works; only chat templating
                # and TokenizeString degrade.
                print(f"AutoTokenizer load failed (non-fatal): {err!r}", file=sys.stderr)
        print("Model loaded successfully", file=sys.stderr)
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    async def Predict(self, request, context):
        """Non-streaming prediction: take the single final Reply that
        _predict yields in non-streaming mode."""
        gen = self._predict(request, context, streaming=False)
        res = await gen.__anext__()
        return res

    async def PredictStream(self, request, context):
        """Streaming prediction: forward every Reply _predict yields,
        closing the inner generator even if the client disconnects."""
        iterations = self._predict(request, context, streaming=True)
        try:
            async for iteration in iterations:
                yield iteration
        finally:
            try:
                await iterations.aclose()
            except Exception:
                pass

    async def TokenizeString(self, request, context):
        """Tokenize request.Prompt with the transformers tokenizer.

        Fails with FAILED_PRECONDITION when LoadModel never produced a
        tokenizer (see LoadModel's non-fatal tokenizer path).
        """
        if not getattr(self, "tokenizer", None):
            context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
            context.set_details("tokenizer not loaded")
            return backend_pb2.TokenizationResponse()
        try:
            tokens = self.tokenizer.encode(request.Prompt)
            return backend_pb2.TokenizationResponse(length=len(tokens), tokens=tokens)
        except Exception as e:
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(str(e))
            return backend_pb2.TokenizationResponse()

    async def Free(self, request, context):
        """Release the engine, tokenizer, and (when present) CUDA cache.

        Best-effort throughout: shutdown errors are swallowed so a broken
        engine can still be dropped and garbage-collected.
        """
        try:
            if hasattr(self, "llm"):
                try:
                    self.llm.shutdown()
                except Exception:
                    pass
                del self.llm
            if hasattr(self, "tokenizer"):
                del self.tokenizer
            self.tool_parser_name = None
            self.reasoning_parser_name = None
            gc.collect()
            # torch may be absent on lean installs; only then skip the
            # CUDA cache flush.
            try:
                import torch
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except ImportError:
                pass
            return backend_pb2.Result(success=True, message="Model freed")
        except Exception as e:
            return backend_pb2.Result(success=False, message=str(e))

    def _build_sampling_params(self, request) -> dict:
        """Translate proto sampling fields into sglang sampling_params.

        Defaults (temperature 0.7, max_new_tokens 200) apply when the
        corresponding proto field is unset/zero — note this also means an
        explicit 0 (e.g. Temperature=0) falls back to the default.
        """
        sampling_params: dict = {"temperature": 0.7, "max_new_tokens": 200}
        mapping = {
            "N": "n",
            "PresencePenalty": "presence_penalty",
            "FrequencyPenalty": "frequency_penalty",
            "RepetitionPenalty": "repetition_penalty",
            "Temperature": "temperature",
            "TopP": "top_p",
            "TopK": "top_k",
            "MinP": "min_p",
            "Seed": "seed",
            "StopPrompts": "stop",
            "StopTokenIds": "stop_token_ids",
            "IgnoreEOS": "ignore_eos",
            "Tokens": "max_new_tokens",
            "MinTokens": "min_new_tokens",
            "SkipSpecialTokens": "skip_special_tokens",
        }
        for proto_field, sgl_key in mapping.items():
            if not hasattr(request, proto_field):
                continue
            value = getattr(request, proto_field)
            # Skip unset/falsy scalars so engine defaults win.
            if value in (None, 0, 0.0, [], False, ""):
                continue
            # repeated fields come back as RepeatedScalarContainer — convert
            if hasattr(value, "__iter__") and not isinstance(value, (str, bytes)):
                value = list(value)
                if not value:
                    continue
            sampling_params[sgl_key] = value
        # Grammar → JSON schema or EBNF structured decoding.
        if getattr(request, "Grammar", ""):
            grammar = request.Grammar
            # Valid JSON is treated as a JSON schema; anything else is
            # handed to sglang as EBNF.
            try:
                json.loads(grammar)
                sampling_params["json_schema"] = grammar
            except json.JSONDecodeError:
                sampling_params["ebnf"] = grammar
        return sampling_params

    def _build_prompt(self, request) -> str:
        """Return the prompt string for generation.

        Precedence: an explicit Prompt always wins; otherwise, when
        UseTokenizerTemplate is set and Messages exist, apply the
        tokenizer's chat template (with tools / enable_thinking when
        provided). Falls back to naive newline-joined message contents if
        no tokenizer is available.
        """
        prompt = request.Prompt
        if prompt or not request.UseTokenizerTemplate or not request.Messages:
            return prompt
        if self.tokenizer is None:
            print(
                "UseTokenizerTemplate requested but tokenizer not loaded; "
                "falling back to naive concatenation",
                file=sys.stderr,
            )
            return "\n".join(m.content or "" for m in request.Messages)
        messages_dicts = self._messages_to_dicts(request.Messages)
        template_kwargs: dict = {"tokenize": False, "add_generation_prompt": True}
        if request.Tools:
            try:
                template_kwargs["tools"] = json.loads(request.Tools)
            except json.JSONDecodeError:
                pass
        if request.Metadata.get("enable_thinking", "").lower() == "true":
            template_kwargs["enable_thinking"] = True
        try:
            return self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs)
        except TypeError:
            # Older templates reject the extra kwargs — retry with the
            # minimal, universally supported set.
            return self.tokenizer.apply_chat_template(
                messages_dicts, tokenize=False, add_generation_prompt=True,
            )

    def _make_parsers(self, request):
        """Construct fresh per-request parser instances (stateful).

        Returns (tool_parser, reasoning_parser); either may be None when
        the corresponding parser name was not configured, the sglang
        module is unavailable, or construction failed (logged, non-fatal).
        """
        tool_parser = None
        reasoning_parser = None
        if HAS_TOOL_PARSERS and self.tool_parser_name and request.Tools:
            try:
                tools_raw = json.loads(request.Tools)
                tools = [SglTool.model_validate(t) for t in tools_raw] if SglTool else tools_raw
                tool_parser = FunctionCallParser(
                    tools=tools, tool_call_parser=self.tool_parser_name,
                )
            except Exception as e:
                print(f"FunctionCallParser init failed: {e!r}", file=sys.stderr)
        if HAS_REASONING_PARSERS and self.reasoning_parser_name:
            try:
                reasoning_parser = ReasoningParser(
                    model_type=self.reasoning_parser_name,
                    stream_reasoning=True,
                )
            except Exception as e:
                print(f"ReasoningParser init failed: {e!r}", file=sys.stderr)
        return tool_parser, reasoning_parser

    async def _predict(self, request, context, streaming: bool = False):
        """Shared generation core for Predict and PredictStream.

        Streaming mode yields incremental Replies (delta text + ChatDelta
        with reasoning/tool-call deltas) followed by a final empty Reply
        carrying token counts. Non-streaming mode yields exactly one Reply
        with the full text, re-parsed by fresh parsers.
        """
        sampling_params = self._build_sampling_params(request)
        prompt = self._build_prompt(request)
        tool_parser, reasoning_parser = self._make_parsers(request)
        image_data = list(request.Images) if request.Images else None
        video_data = list(request.Videos) if request.Videos else None
        # Kick off streaming generation. We always use stream=True so the
        # non-stream path still gets parser coverage on the final text.
        try:
            iterator = await self.llm.async_generate(
                prompt=prompt,
                sampling_params=sampling_params,
                image_data=image_data,
                video_data=video_data,
                stream=True,
            )
        except Exception as e:
            print(f"sglang async_generate failed: {e!r}", file=sys.stderr)
            yield backend_pb2.Reply(message=bytes(f"error: {e!r}", "utf-8"))
            return
        generated_text = ""
        last_chunk: Optional[dict] = None
        # Track tool call ids once per (request, tool_index) to match the
        # OpenAI streaming contract (id sent on first chunk for that tool).
        tool_ids_seen: Dict[int, str] = {}
        try:
            async for chunk in iterator:
                last_chunk = chunk
                # sglang chunks carry cumulative text; recover the delta by
                # prefix-stripping (fall back to the whole chunk text if the
                # prefix relation breaks).
                cumulative = chunk.get("text", "") if isinstance(chunk, dict) else ""
                delta_text = cumulative[len(generated_text):] if cumulative.startswith(generated_text) else cumulative
                generated_text = cumulative
                if not delta_text:
                    continue
                reasoning_delta = ""
                content_delta = delta_text
                if reasoning_parser is not None:
                    try:
                        r, n = reasoning_parser.parse_stream_chunk(delta_text)
                        reasoning_delta = r or ""
                        content_delta = n or ""
                    except Exception as e:
                        print(f"reasoning_parser.parse_stream_chunk: {e!r}", file=sys.stderr)
                tool_call_deltas: List[backend_pb2.ToolCallDelta] = []
                if tool_parser is not None and content_delta:
                    try:
                        normal_text, calls = tool_parser.parse_stream_chunk(content_delta)
                        content_delta = normal_text or ""
                        for tc in calls:
                            idx = int(getattr(tc, "tool_index", 0) or 0)
                            tc_id = tool_ids_seen.get(idx)
                            if tc_id is None:
                                # First delta for this tool index: mint and
                                # remember the call id.
                                tc_id = f"call_{uuid.uuid4().hex[:24]}"
                                tool_ids_seen[idx] = tc_id
                            tool_call_deltas.append(backend_pb2.ToolCallDelta(
                                index=idx,
                                id=tc_id,
                                name=getattr(tc, "name", "") or "",
                                arguments=getattr(tc, "parameters", "") or "",
                            ))
                    except Exception as e:
                        print(f"tool_parser.parse_stream_chunk: {e!r}", file=sys.stderr)
                if streaming and (content_delta or reasoning_delta or tool_call_deltas):
                    yield backend_pb2.Reply(
                        message=bytes(content_delta, "utf-8"),
                        chat_deltas=[backend_pb2.ChatDelta(
                            content=content_delta,
                            reasoning_content=reasoning_delta,
                            tool_calls=tool_call_deltas,
                        )],
                    )
        finally:
            try:
                await iterator.aclose()
            except Exception:
                pass
        # Extract token counts from the final chunk's meta_info.
        meta = {}
        if isinstance(last_chunk, dict):
            meta = last_chunk.get("meta_info") or {}
        prompt_tokens = int(meta.get("prompt_tokens", 0) or 0)
        completion_tokens = int(meta.get("completion_tokens", 0) or 0)
        # Non-streaming path: re-parse the full text with fresh parsers
        # so we return a clean, complete ChatDelta. Streaming parsers
        # used above have accumulated state we don't want to reuse.
        final_content = generated_text
        final_reasoning = ""
        final_tool_calls: List[backend_pb2.ToolCallDelta] = []
        if not streaming:
            final_reasoning_parser = None
            if HAS_REASONING_PARSERS and self.reasoning_parser_name:
                try:
                    final_reasoning_parser = ReasoningParser(
                        model_type=self.reasoning_parser_name,
                        stream_reasoning=False,
                    )
                except Exception:
                    final_reasoning_parser = None
            if final_reasoning_parser is not None:
                try:
                    r, n = final_reasoning_parser.parse_non_stream(generated_text)
                    final_reasoning = r or ""
                    final_content = n if n is not None else generated_text
                except Exception as e:
                    print(f"reasoning_parser.parse_non_stream: {e!r}", file=sys.stderr)
            if HAS_TOOL_PARSERS and self.tool_parser_name and request.Tools:
                try:
                    tools_raw = json.loads(request.Tools)
                    tools = [SglTool.model_validate(t) for t in tools_raw] if SglTool else tools_raw
                    fresh_tool_parser = FunctionCallParser(
                        tools=tools, tool_call_parser=self.tool_parser_name,
                    )
                    normal, calls = fresh_tool_parser.parse_non_stream(final_content)
                    if calls:
                        final_content = normal
                        for tc in calls:
                            idx = int(getattr(tc, "tool_index", 0) or 0)
                            final_tool_calls.append(backend_pb2.ToolCallDelta(
                                index=idx,
                                id=f"call_{uuid.uuid4().hex[:24]}",
                                name=getattr(tc, "name", "") or "",
                                arguments=getattr(tc, "parameters", "") or "",
                            ))
                except Exception as e:
                    print(f"tool_parser.parse_non_stream: {e!r}", file=sys.stderr)
        chat_delta = backend_pb2.ChatDelta(
            content=final_content if not streaming else "",
            reasoning_content=final_reasoning,
            tool_calls=final_tool_calls,
        )
        if streaming:
            # Terminal streaming frame: no text, just usage + final delta.
            yield backend_pb2.Reply(
                message=b"",
                prompt_tokens=prompt_tokens,
                tokens=completion_tokens,
                chat_deltas=[chat_delta],
            )
            return
        yield backend_pb2.Reply(
            message=bytes(final_content or "", "utf-8"),
            prompt_tokens=prompt_tokens,
            tokens=completion_tokens,
            chat_deltas=[chat_delta],
        )
async def serve(address):
    """Start the async gRPC server on ``address`` and block until shutdown."""
    # One 50 MiB cap applied uniformly to generic, send, and receive sizes.
    max_msg_bytes = 50 * 1024 * 1024
    channel_options = [
        (opt_name, max_msg_bytes)
        for opt_name in (
            'grpc.max_message_length',
            'grpc.max_send_message_length',
            'grpc.max_receive_message_length',
        )
    ]
    server = grpc.aio.server(
        migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
        options=channel_options,
        interceptors=get_auth_interceptors(aio=True),
    )
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    # Translate SIGINT/SIGTERM into a graceful stop with a 5s drain window.
    loop = asyncio.get_event_loop()
    for signum in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(signum, lambda: asyncio.ensure_future(server.stop(5)))
    await server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    await server.wait_for_termination()
if __name__ == "__main__":
    # CLI entry point: parse the bind address and run the server loop.
    cli = argparse.ArgumentParser(description="Run the sglang gRPC server.")
    cli.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to.",
    )
    parsed = cli.parse_args()
    asyncio.run(serve(parsed.addr))

View File

@@ -0,0 +1,72 @@
#!/bin/bash
# Install the sglang backend into its venv.
#
# Two paths:
#   - GPU profiles (BUILD_TYPE set, FROM_SOURCE unset): plain
#     installRequirements from the per-profile requirements files.
#   - CPU profile (BUILD_TYPE empty) or FROM_SOURCE=true: build sgl-kernel
#     and sglang from source using upstream's pyproject_cpu.toml recipe.
set -e

EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"

# Avoid overcommitting the CPU during builds that compile native code.
export NVCC_THREADS=2
export MAX_JOBS=1

# NOTE(review): $0 and $backend_dir are unquoted here — breaks on paths
# with spaces; matches the style of the sibling backends' install scripts.
backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# sglang's CPU path has no prebuilt wheel on PyPI — upstream publishes
# a separate pyproject_cpu.toml that must be swapped in before `pip install`.
# Reference: docker/xeon.Dockerfile in the sglang upstream repo.
#
# When BUILD_TYPE is empty (CPU profile) or FROM_SOURCE=true is forced,
# install torch/transformers/etc from requirements-cpu.txt, then clone
# sglang and install its python/ and sgl-kernel/ packages from source
# using the CPU pyproject.
if [ "x${BUILD_TYPE}" == "x" ] || [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
    # sgl-kernel's CPU build links against libnuma and libtbb. Install
    # them here (Docker builder stage) before running the source build.
    # Harmless no-op on runs outside the docker build since installRequirements
    # below still needs them only if we reach the source build branch.
    if command -v apt-get >/dev/null 2>&1 && [ "$(id -u)" = "0" ]; then
        apt-get update
        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
            libnuma-dev numactl libtbb-dev libgomp1 libomp-dev google-perftools \
            build-essential cmake ninja-build
    fi
    installRequirements
    # sgl-kernel's pyproject_cpu.toml uses scikit-build-core as its build
    # backend. With --no-build-isolation, that (and ninja/cmake) must be
    # present in the venv before we build from source.
    uv pip install --no-build-isolation "scikit-build-core>=0.10" ninja cmake
    # Clone into a throwaway dir that is cleaned even on failure (set -e).
    _sgl_src=$(mktemp -d)
    trap 'rm -rf "${_sgl_src}"' EXIT
    git clone --depth 1 https://github.com/sgl-project/sglang "${_sgl_src}/sglang"
    pushd "${_sgl_src}/sglang/sgl-kernel"
    if [ -f pyproject_cpu.toml ]; then
        cp pyproject_cpu.toml pyproject.toml
    fi
    uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} .
    popd
    pushd "${_sgl_src}/sglang/python"
    if [ -f pyproject_cpu.toml ]; then
        cp pyproject_cpu.toml pyproject.toml
    fi
    uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} .
    popd
else
    installRequirements
fi

View File

@@ -0,0 +1,63 @@
#!/bin/bash
# Package runtime shared libraries for the sglang backend.
#
# Dockerfile.python's final stage is FROM scratch — every system library
# the backend dlopens at runtime must be explicitly copied into
# ${BACKEND}/lib, which libbackend.sh adds to LD_LIBRARY_PATH.
#
# sglang's CPU kernel links against libnuma and libtbb; torch's CPU
# kernels use libgomp; tcmalloc + iomp5 are preloaded per sglang's
# docker/xeon.Dockerfile recipe for best CPU throughput. Missing any of
# these makes the engine crash on import.
set -e

CURDIR=$(dirname "$(realpath "$0")")
LIB_DIR="${CURDIR}/lib"
mkdir -p "${LIB_DIR}"

# copy_with_symlinks SONAME
# Find SONAME in the standard multiarch lib dirs, copy its resolved real
# file into LIB_DIR, and recreate the SONAME symlink next to it if the
# real file has a different basename. Missing libraries only warn — GPU
# builds legitimately lack the CPU-only ones.
copy_with_symlinks() {
    local soname="$1"
    local hit=""
    for dir in \
        /usr/lib/x86_64-linux-gnu \
        /usr/lib/aarch64-linux-gnu \
        /lib/x86_64-linux-gnu \
        /lib/aarch64-linux-gnu \
        /usr/lib \
        /lib; do
        if [ -e "${dir}/${soname}" ]; then
            hit="${dir}/${soname}"
            break
        fi
    done
    if [ -z "${hit}" ]; then
        echo "warning: ${soname} not found in standard lib paths" >&2
        return 0
    fi
    local real
    real=$(readlink -f "${hit}")
    cp -v "${real}" "${LIB_DIR}/"
    local real_base
    real_base=$(basename "${real}")
    if [ "${real_base}" != "${soname}" ]; then
        ln -sf "${real_base}" "${LIB_DIR}/${soname}"
    fi
}

copy_with_symlinks libnuma.so.1
copy_with_symlinks libgomp.so.1
copy_with_symlinks libtbb.so.12
copy_with_symlinks libtbbmalloc.so.2
copy_with_symlinks libtcmalloc.so.4

# intel-openmp ships libiomp5.so inside the venv under venv/lib/ — sglang's
# CPU kernel was compiled against its __kmpc_* symbols, so it must be on
# LD_LIBRARY_PATH at runtime. Copy it into the backend lib dir where
# libbackend.sh will pick it up.
if [ -f "${CURDIR}/venv/lib/libiomp5.so" ]; then
    cp -v "${CURDIR}/venv/lib/libiomp5.so" "${LIB_DIR}/"
fi

echo "sglang packaging completed successfully"
ls -liah "${LIB_DIR}/"

View File

@@ -0,0 +1,2 @@
# sglang is installed per-acceleration in requirements-{profile}-after.txt
# (cublas12, hipblas, intel, cpu)

View File

@@ -0,0 +1,3 @@
# sglang has no prebuilt CPU wheel on PyPI. install.sh performs a
# from-source build using the upstream pyproject_cpu.toml recipe from
# docker/xeon.Dockerfile when BUILD_TYPE is empty (CPU profile).

View File

@@ -0,0 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.9.0
torchvision
torchaudio
transformers
intel-openmp; platform_machine == 'x86_64'

View File

@@ -0,0 +1,3 @@
# Bump this pin deliberately — sglang releases weekly and API surfaces
# (FunctionCallParser, ReasoningParser) move between releases.
sglang[all]>=0.4.0

View File

@@ -0,0 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cu124
accelerate
torch==2.9.1
torchvision
torchaudio==2.9.1
transformers

View File

@@ -0,0 +1,2 @@
# sglang's ROCm build follows docker/rocm.Dockerfile upstream. Note that
# install.sh only builds from source when BUILD_TYPE is empty or
# FROM_SOURCE=true is exported — set FROM_SOURCE=true to force a source
# build for the hipblas profile.

View File

@@ -0,0 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/nightly/rocm7.0
accelerate
torch
torchvision
transformers

View File

@@ -0,0 +1,6 @@
# sglang and sgl-kernel do not declare full PEP517 build deps; install the
# basic build tooling into the venv before pulling the rest of the stack.
packaging
setuptools
wheel
setuptools-scm

View File

@@ -0,0 +1,2 @@
# sglang's Intel XPU build follows docker/xpu.Dockerfile upstream. Note that
# install.sh only builds from source when BUILD_TYPE is empty or
# FROM_SOURCE=true is exported — set FROM_SOURCE=true to force a source
# build for the intel profile.

View File

@@ -0,0 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/xpu
accelerate
torch
torchvision
transformers
optimum[openvino]
setuptools

View File

@@ -0,0 +1,4 @@
grpcio==1.80.0
protobuf
certifi
setuptools

29
backend/python/sglang/run.sh Executable file
View File

@@ -0,0 +1,29 @@
#!/bin/bash
# Launch the sglang gRPC backend via the shared python-backend bootstrap.
# libbackend.sh provides startBackend (venv activation, ./lib on
# LD_LIBRARY_PATH, arg forwarding) for all python backends.

# Quote the path expansion so paths containing spaces survive.
backend_dir=$(dirname "$(realpath "$0")")

if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
else
    source "$backend_dir/../common/libbackend.sh"
fi

# sglang's CPU kernel references LLVM OpenMP (__kmpc_*) symbols that are
# not declared in its NEEDED list — they get resolved through LD_PRELOAD
# of libiomp5.so in sglang's own docker/xeon.Dockerfile. Do the same here.
# The presence of lib/libiomp5.so is the marker of a CPU build (package.sh
# only copies it there on CPU profiles), so the CPU-engine switch lives in
# the same branch. Harmless on GPU builds where libiomp5.so is absent.
if [ -f "${backend_dir}/lib/libiomp5.so" ]; then
    if [ -n "${LD_PRELOAD:-}" ]; then
        export LD_PRELOAD="${backend_dir}/lib/libiomp5.so:${LD_PRELOAD}"
    else
        export LD_PRELOAD="${backend_dir}/lib/libiomp5.so"
    fi
    # sglang CPU engine requires this env var to switch to the CPU backend.
    # No-op on GPU builds. See docker/xeon.Dockerfile in sglang upstream.
    export SGLANG_USE_CPU_ENGINE=1
fi

# "$@" (quoted) forwards each CLI argument intact, even with spaces.
startBackend "$@"

View File

@@ -12,6 +12,7 @@ const BACKENDS = [
{ value: 'mlx-vlm', label: 'mlx-vlm' },
{ value: 'transformers', label: 'transformers' },
{ value: 'vllm', label: 'vllm' },
{ value: 'sglang', label: 'sglang' },
{ value: 'diffusers', label: 'diffusers' },
]