diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 68953d2f9..d8ff23691 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -66,6 +66,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-sglang' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'true' + backend: "sglang" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" @@ -411,6 +424,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-sglang' + runs-on: 'arc-runner-set' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "sglang" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "8" @@ -1427,6 +1453,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-sglang' + runs-on: 'arc-runner-set' + base-image: "rocm/dev-ubuntu-24.04:7.2.1" + skip-drivers: 'false' + backend: "sglang" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -1689,6 +1728,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'intel' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: 
'-gpu-intel-sglang' + runs-on: 'arc-runner-set' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "sglang" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 761fda665..d8b417f3a 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -33,6 +33,7 @@ jobs: ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }} turboquant: ${{ steps.detect.outputs.turboquant }} vllm: ${{ steps.detect.outputs.vllm }} + sglang: ${{ steps.detect.outputs.sglang }} acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }} qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }} voxtral: ${{ steps.detect.outputs.voxtral }} @@ -589,6 +590,32 @@ jobs: # - name: Build vllm (cpu) backend image and run gRPC e2e tests # run: | # make test-extra-backend-vllm + tests-sglang-grpc: + needs: detect-changes + if: needs.detect-changes.outputs.sglang == 'true' || needs.detect-changes.outputs.run-all == 'true' + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + make build-essential curl unzip ca-certificates git tar + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.25.4' + - name: Free disk space + run: | + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true + df -h + - name: Build sglang (cpu) backend image and run gRPC e2e tests + run: | + make test-extra-backend-sglang tests-acestep-cpp: needs: detect-changes if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true' diff --git a/Makefile b/Makefile index 61e51aad4..2a695ae28 100644 --- a/Makefile 
+++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/tinygrad +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/tinygrad GOCMD=go GOTEST=$(GOCMD) test @@ 
-419,6 +419,7 @@ prepare-test-extra: protogen-python $(MAKE) -C backend/python/chatterbox $(MAKE) -C backend/python/vllm $(MAKE) -C backend/python/vllm-omni + $(MAKE) -C backend/python/sglang $(MAKE) -C backend/python/vibevoice $(MAKE) -C backend/python/moonshine $(MAKE) -C backend/python/pocket-tts @@ -602,6 +603,17 @@ test-extra-backend-tinygrad-all: \ test-extra-backend-tinygrad-sd \ test-extra-backend-tinygrad-whisper +## sglang mirrors the vllm setup: HuggingFace model id, same tiny Qwen, +## tool-call extraction via sglang's native qwen parser. CPU builds use +## sglang's upstream pyproject_cpu.toml recipe (see backend/python/sglang/install.sh). +test-extra-backend-sglang: docker-build-sglang + BACKEND_IMAGE=local-ai-backend:sglang \ + BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \ + BACKEND_TEST_CAPS=health,load,predict,stream,tools \ + BACKEND_TEST_OPTIONS=tool_parser:qwen \ + $(MAKE) test-extra-backend + + ## mlx is Apple-Silicon-first — the MLX backend auto-detects the right tool ## parser from the chat template, so no tool_parser: option is needed (it ## would be ignored at runtime). 
Run this on macOS / arm64 with Metal; the @@ -741,6 +753,7 @@ BACKEND_NEUTTS = neutts|python|.|false|true BACKEND_KOKORO = kokoro|python|.|false|true BACKEND_VLLM = vllm|python|.|false|true BACKEND_VLLM_OMNI = vllm-omni|python|.|false|true +BACKEND_SGLANG = sglang|python|.|false|true BACKEND_DIFFUSERS = diffusers|python|.|--progress=plain|true BACKEND_CHATTERBOX = chatterbox|python|.|false|true BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true @@ -811,6 +824,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_NEUTTS))) $(eval $(call generate-docker-build-target,$(BACKEND_KOKORO))) $(eval $(call generate-docker-build-target,$(BACKEND_VLLM))) $(eval $(call generate-docker-build-target,$(BACKEND_VLLM_OMNI))) +$(eval $(call generate-docker-build-target,$(BACKEND_SGLANG))) $(eval $(call generate-docker-build-target,$(BACKEND_DIFFUSERS))) $(eval $(call generate-docker-build-target,$(BACKEND_CHATTERBOX))) $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE))) @@ -839,7 +853,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp +docker-build-backends: 
docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/index.yaml b/backend/index.yaml index f7dc72251..83ae8f78d 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -227,6 +227,28 @@ intel: "intel-vllm" nvidia-cuda-12: "cuda12-vllm" cpu: "cpu-vllm" +- &sglang + name: "sglang" + license: apache-2.0 + urls: + - https://github.com/sgl-project/sglang + tags: + - text-to-text + - multimodal + icon: https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png + description: | + SGLang is a fast serving framework for large language models and vision language models. + It co-designs the backend runtime (RadixAttention, continuous batching, structured + decoding) and the frontend language to make interaction with models faster and more + controllable. Features include fast backend runtime, flexible frontend language, + extensive model support, and an active community. 
+ alias: "sglang" + capabilities: + nvidia: "cuda12-sglang" + amd: "rocm-sglang" + intel: "intel-sglang" + nvidia-cuda-12: "cuda12-sglang" + cpu: "cpu-sglang" - &vllm-omni name: "vllm-omni" license: apache-2.0 @@ -1766,6 +1788,54 @@ uri: "quay.io/go-skynet/local-ai-backends:master-cpu-vllm" mirrors: - localai/localai-backends:master-cpu-vllm +# sglang +- !!merge <<: *sglang + name: "sglang-development" + capabilities: + nvidia: "cuda12-sglang-development" + amd: "rocm-sglang-development" + intel: "intel-sglang-development" + cpu: "cpu-sglang-development" +- !!merge <<: *sglang + name: "cuda12-sglang" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-sglang" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-sglang +- !!merge <<: *sglang + name: "rocm-sglang" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-sglang" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-sglang +- !!merge <<: *sglang + name: "intel-sglang" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sglang" + mirrors: + - localai/localai-backends:latest-gpu-intel-sglang +- !!merge <<: *sglang + name: "cpu-sglang" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-sglang" + mirrors: + - localai/localai-backends:latest-cpu-sglang +- !!merge <<: *sglang + name: "cuda12-sglang-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-sglang" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-sglang +- !!merge <<: *sglang + name: "rocm-sglang-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-sglang" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-sglang +- !!merge <<: *sglang + name: "intel-sglang-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sglang" + mirrors: + - localai/localai-backends:master-gpu-intel-sglang +- !!merge <<: *sglang + name: "cpu-sglang-development" + uri: 
"quay.io/go-skynet/local-ai-backends:master-cpu-sglang" + mirrors: + - localai/localai-backends:master-cpu-sglang # vllm-omni - !!merge <<: *vllm-omni name: "vllm-omni-development" diff --git a/backend/python/sglang/Makefile b/backend/python/sglang/Makefile new file mode 100644 index 000000000..e1933f41a --- /dev/null +++ b/backend/python/sglang/Makefile @@ -0,0 +1,17 @@ +.PHONY: sglang +sglang: + bash install.sh + +.PHONY: run +run: sglang + @echo "Running sglang..." + bash run.sh + @echo "sglang run." + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ diff --git a/backend/python/sglang/backend.py b/backend/python/sglang/backend.py new file mode 100644 index 000000000..8def22a4c --- /dev/null +++ b/backend/python/sglang/backend.py @@ -0,0 +1,502 @@ +#!/usr/bin/env python3 +"""LocalAI gRPC backend for sglang. + +Wraps sglang's async Engine API behind the Backend gRPC contract defined +in backend.proto. Mirrors the structure of backend/python/vllm/backend.py +so that the two backends stay behavior-equivalent at the protocol level. + +The streaming path applies sglang's per-request FunctionCallParser and +ReasoningParser so tool_calls and reasoning_content are emitted +incrementally inside ChatDelta, which is a capability sglang exposes +natively and vLLM does not. +""" +import asyncio +from concurrent import futures +import argparse +import signal +import sys +import os +import json +import gc +import uuid +import base64 +import io +from typing import Dict, List, Optional, Tuple + +from PIL import Image + +import backend_pb2 +import backend_pb2_grpc + +import grpc + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common')) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common')) +from grpc_auth import get_auth_interceptors + +# sglang imports. 
Engine is the stable public entry point; parser modules +# are wrapped in try/except so older / leaner installs that omit them +# still load the backend for plain text generation. +from sglang.srt.entrypoints.engine import Engine + +try: + from sglang.srt.function_call.function_call_parser import FunctionCallParser + # sglang's FunctionCallParser expects a list of pydantic Tool objects + # (protocol.Tool with .function.name), not plain dicts. Wrap at the + # request boundary to match. + from sglang.srt.entrypoints.openai.protocol import Tool as SglTool + HAS_TOOL_PARSERS = True +except Exception: + FunctionCallParser = None # type: ignore + SglTool = None # type: ignore + HAS_TOOL_PARSERS = False + +try: + from sglang.srt.parser.reasoning_parser import ReasoningParser + HAS_REASONING_PARSERS = True +except Exception: + ReasoningParser = None # type: ignore + HAS_REASONING_PARSERS = False + +try: + from transformers import AutoTokenizer + HAS_TRANSFORMERS = True +except Exception: + AutoTokenizer = None # type: ignore + HAS_TRANSFORMERS = False + + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + + +class BackendServicer(backend_pb2_grpc.BackendServicer): + """gRPC servicer implementing the Backend service for sglang.""" + + def _parse_options(self, options_list) -> Dict[str, str]: + opts: Dict[str, str] = {} + for opt in options_list: + if ":" not in opt: + continue + key, value = opt.split(":", 1) + opts[key.strip()] = value.strip() + return opts + + def _messages_to_dicts(self, messages) -> List[dict]: + result: List[dict] = [] + for msg in messages: + d = {"role": msg.role, "content": msg.content or ""} + if msg.name: + d["name"] = msg.name + if msg.tool_call_id: + d["tool_call_id"] = msg.tool_call_id + if msg.reasoning_content: + d["reasoning_content"] = msg.reasoning_content + if msg.tool_calls: + try: + d["tool_calls"] = json.loads(msg.tool_calls) + except json.JSONDecodeError: + pass + result.append(d) 
+ return result + + def Health(self, request, context): + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + async def LoadModel(self, request, context): + engine_kwargs = {"model_path": request.Model} + + if request.Quantization: + engine_kwargs["quantization"] = request.Quantization + if request.LoadFormat: + engine_kwargs["load_format"] = request.LoadFormat + if request.GPUMemoryUtilization: + engine_kwargs["mem_fraction_static"] = float(request.GPUMemoryUtilization) + if request.TrustRemoteCode: + engine_kwargs["trust_remote_code"] = True + if request.EnforceEager: + engine_kwargs["disable_cuda_graph"] = True + if request.TensorParallelSize: + engine_kwargs["tp_size"] = int(request.TensorParallelSize) + if request.MaxModelLen: + engine_kwargs["context_length"] = int(request.MaxModelLen) + if request.DType: + engine_kwargs["dtype"] = request.DType + + opts = self._parse_options(request.Options) + + # Cache parser names — actual parser instances are created per + # request because sglang's parsers are stateful. + self.tool_parser_name: Optional[str] = opts.get("tool_parser") or None + self.reasoning_parser_name: Optional[str] = opts.get("reasoning_parser") or None + + # Also hand the parser names to sglang's engine so its HTTP/OAI + # paths work identically if someone hits the engine directly. + if self.tool_parser_name: + engine_kwargs["tool_call_parser"] = self.tool_parser_name + if self.reasoning_parser_name: + engine_kwargs["reasoning_parser"] = self.reasoning_parser_name + + try: + self.llm = Engine(**engine_kwargs) + except Exception as err: + print(f"sglang Engine init failed: {err!r}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"{err!r}") + + # sglang does not expose a uniform get_tokenizer() off Engine. + # Use transformers directly — same path sglang uses internally. 
+ self.tokenizer = None + if HAS_TRANSFORMERS: + try: + self.tokenizer = AutoTokenizer.from_pretrained( + request.Model, + trust_remote_code=bool(request.TrustRemoteCode), + ) + except Exception as err: + print(f"AutoTokenizer load failed (non-fatal): {err!r}", file=sys.stderr) + + print("Model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="Model loaded successfully", success=True) + + async def Predict(self, request, context): + gen = self._predict(request, context, streaming=False) + res = await gen.__anext__() + return res + + async def PredictStream(self, request, context): + iterations = self._predict(request, context, streaming=True) + try: + async for iteration in iterations: + yield iteration + finally: + try: + await iterations.aclose() + except Exception: + pass + + async def TokenizeString(self, request, context): + if not getattr(self, "tokenizer", None): + context.set_code(grpc.StatusCode.FAILED_PRECONDITION) + context.set_details("tokenizer not loaded") + return backend_pb2.TokenizationResponse() + try: + tokens = self.tokenizer.encode(request.Prompt) + return backend_pb2.TokenizationResponse(length=len(tokens), tokens=tokens) + except Exception as e: + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(str(e)) + return backend_pb2.TokenizationResponse() + + async def Free(self, request, context): + try: + if hasattr(self, "llm"): + try: + self.llm.shutdown() + except Exception: + pass + del self.llm + if hasattr(self, "tokenizer"): + del self.tokenizer + self.tool_parser_name = None + self.reasoning_parser_name = None + gc.collect() + try: + import torch + if torch.cuda.is_available(): + torch.cuda.empty_cache() + except ImportError: + pass + return backend_pb2.Result(success=True, message="Model freed") + except Exception as e: + return backend_pb2.Result(success=False, message=str(e)) + + def _build_sampling_params(self, request) -> dict: + sampling_params: dict = {"temperature": 0.7, "max_new_tokens": 
200} + mapping = { + "N": "n", + "PresencePenalty": "presence_penalty", + "FrequencyPenalty": "frequency_penalty", + "RepetitionPenalty": "repetition_penalty", + "Temperature": "temperature", + "TopP": "top_p", + "TopK": "top_k", + "MinP": "min_p", + "Seed": "seed", + "StopPrompts": "stop", + "StopTokenIds": "stop_token_ids", + "IgnoreEOS": "ignore_eos", + "Tokens": "max_new_tokens", + "MinTokens": "min_new_tokens", + "SkipSpecialTokens": "skip_special_tokens", + } + for proto_field, sgl_key in mapping.items(): + if not hasattr(request, proto_field): + continue + value = getattr(request, proto_field) + if value in (None, 0, 0.0, [], False, ""): + continue + # repeated fields come back as RepeatedScalarContainer — convert + if hasattr(value, "__iter__") and not isinstance(value, (str, bytes)): + value = list(value) + if not value: + continue + sampling_params[sgl_key] = value + + # Grammar → JSON schema or EBNF structured decoding. + if getattr(request, "Grammar", ""): + grammar = request.Grammar + try: + json.loads(grammar) + sampling_params["json_schema"] = grammar + except json.JSONDecodeError: + sampling_params["ebnf"] = grammar + + return sampling_params + + def _build_prompt(self, request) -> str: + prompt = request.Prompt + if prompt or not request.UseTokenizerTemplate or not request.Messages: + return prompt + + if self.tokenizer is None: + print( + "UseTokenizerTemplate requested but tokenizer not loaded; " + "falling back to naive concatenation", + file=sys.stderr, + ) + return "\n".join(m.content or "" for m in request.Messages) + + messages_dicts = self._messages_to_dicts(request.Messages) + template_kwargs: dict = {"tokenize": False, "add_generation_prompt": True} + if request.Tools: + try: + template_kwargs["tools"] = json.loads(request.Tools) + except json.JSONDecodeError: + pass + if request.Metadata.get("enable_thinking", "").lower() == "true": + template_kwargs["enable_thinking"] = True + + try: + return 
self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs) + except TypeError: + return self.tokenizer.apply_chat_template( + messages_dicts, tokenize=False, add_generation_prompt=True, + ) + + def _make_parsers(self, request): + """Construct fresh per-request parser instances (stateful).""" + tool_parser = None + reasoning_parser = None + + if HAS_TOOL_PARSERS and self.tool_parser_name and request.Tools: + try: + tools_raw = json.loads(request.Tools) + tools = [SglTool.model_validate(t) for t in tools_raw] if SglTool else tools_raw + tool_parser = FunctionCallParser( + tools=tools, tool_call_parser=self.tool_parser_name, + ) + except Exception as e: + print(f"FunctionCallParser init failed: {e!r}", file=sys.stderr) + + if HAS_REASONING_PARSERS and self.reasoning_parser_name: + try: + reasoning_parser = ReasoningParser( + model_type=self.reasoning_parser_name, + stream_reasoning=True, + ) + except Exception as e: + print(f"ReasoningParser init failed: {e!r}", file=sys.stderr) + + return tool_parser, reasoning_parser + + async def _predict(self, request, context, streaming: bool = False): + sampling_params = self._build_sampling_params(request) + prompt = self._build_prompt(request) + + tool_parser, reasoning_parser = self._make_parsers(request) + + image_data = list(request.Images) if request.Images else None + video_data = list(request.Videos) if request.Videos else None + + # Kick off streaming generation. We always use stream=True so the + # non-stream path still gets parser coverage on the final text. 
+ try: + iterator = await self.llm.async_generate( + prompt=prompt, + sampling_params=sampling_params, + image_data=image_data, + video_data=video_data, + stream=True, + ) + except Exception as e: + print(f"sglang async_generate failed: {e!r}", file=sys.stderr) + yield backend_pb2.Reply(message=bytes(f"error: {e!r}", "utf-8")) + return + + generated_text = "" + last_chunk: Optional[dict] = None + # Track tool call ids once per (request, tool_index) to match the + # OpenAI streaming contract (id sent on first chunk for that tool). + tool_ids_seen: Dict[int, str] = {} + + try: + async for chunk in iterator: + last_chunk = chunk + cumulative = chunk.get("text", "") if isinstance(chunk, dict) else "" + delta_text = cumulative[len(generated_text):] if cumulative.startswith(generated_text) else cumulative + generated_text = cumulative + if not delta_text: + continue + + reasoning_delta = "" + content_delta = delta_text + + if reasoning_parser is not None: + try: + r, n = reasoning_parser.parse_stream_chunk(delta_text) + reasoning_delta = r or "" + content_delta = n or "" + except Exception as e: + print(f"reasoning_parser.parse_stream_chunk: {e!r}", file=sys.stderr) + + tool_call_deltas: List[backend_pb2.ToolCallDelta] = [] + if tool_parser is not None and content_delta: + try: + normal_text, calls = tool_parser.parse_stream_chunk(content_delta) + content_delta = normal_text or "" + for tc in calls: + idx = int(getattr(tc, "tool_index", 0) or 0) + tc_id = tool_ids_seen.get(idx) + if tc_id is None: + tc_id = f"call_{uuid.uuid4().hex[:24]}" + tool_ids_seen[idx] = tc_id + tool_call_deltas.append(backend_pb2.ToolCallDelta( + index=idx, + id=tc_id, + name=getattr(tc, "name", "") or "", + arguments=getattr(tc, "parameters", "") or "", + )) + except Exception as e: + print(f"tool_parser.parse_stream_chunk: {e!r}", file=sys.stderr) + + if streaming and (content_delta or reasoning_delta or tool_call_deltas): + yield backend_pb2.Reply( + message=bytes(content_delta, "utf-8"), + 
chat_deltas=[backend_pb2.ChatDelta( + content=content_delta, + reasoning_content=reasoning_delta, + tool_calls=tool_call_deltas, + )], + ) + finally: + try: + await iterator.aclose() + except Exception: + pass + + # Extract token counts from the final chunk's meta_info. + meta = {} + if isinstance(last_chunk, dict): + meta = last_chunk.get("meta_info") or {} + prompt_tokens = int(meta.get("prompt_tokens", 0) or 0) + completion_tokens = int(meta.get("completion_tokens", 0) or 0) + + # Non-streaming path: re-parse the full text with fresh parsers + # so we return a clean, complete ChatDelta. Streaming parsers + # used above have accumulated state we don't want to reuse. + final_content = generated_text + final_reasoning = "" + final_tool_calls: List[backend_pb2.ToolCallDelta] = [] + + if not streaming: + final_reasoning_parser = None + if HAS_REASONING_PARSERS and self.reasoning_parser_name: + try: + final_reasoning_parser = ReasoningParser( + model_type=self.reasoning_parser_name, + stream_reasoning=False, + ) + except Exception: + final_reasoning_parser = None + + if final_reasoning_parser is not None: + try: + r, n = final_reasoning_parser.parse_non_stream(generated_text) + final_reasoning = r or "" + final_content = n if n is not None else generated_text + except Exception as e: + print(f"reasoning_parser.parse_non_stream: {e!r}", file=sys.stderr) + + if HAS_TOOL_PARSERS and self.tool_parser_name and request.Tools: + try: + tools_raw = json.loads(request.Tools) + tools = [SglTool.model_validate(t) for t in tools_raw] if SglTool else tools_raw + fresh_tool_parser = FunctionCallParser( + tools=tools, tool_call_parser=self.tool_parser_name, + ) + normal, calls = fresh_tool_parser.parse_non_stream(final_content) + if calls: + final_content = normal + for tc in calls: + idx = int(getattr(tc, "tool_index", 0) or 0) + final_tool_calls.append(backend_pb2.ToolCallDelta( + index=idx, + id=f"call_{uuid.uuid4().hex[:24]}", + name=getattr(tc, "name", "") or "", + 
arguments=getattr(tc, "parameters", "") or "", + )) + except Exception as e: + print(f"tool_parser.parse_non_stream: {e!r}", file=sys.stderr) + + chat_delta = backend_pb2.ChatDelta( + content=final_content if not streaming else "", + reasoning_content=final_reasoning, + tool_calls=final_tool_calls, + ) + + if streaming: + yield backend_pb2.Reply( + message=b"", + prompt_tokens=prompt_tokens, + tokens=completion_tokens, + chat_deltas=[chat_delta], + ) + return + + yield backend_pb2.Reply( + message=bytes(final_content or "", "utf-8"), + prompt_tokens=prompt_tokens, + tokens=completion_tokens, + chat_deltas=[chat_delta], + ) + + +async def serve(address): + server = grpc.aio.server( + migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), + ('grpc.max_send_message_length', 50 * 1024 * 1024), + ('grpc.max_receive_message_length', 50 * 1024 * 1024), + ], + interceptors=get_auth_interceptors(aio=True), + ) + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + server.add_insecure_port(address) + + loop = asyncio.get_running_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler(sig, lambda: asyncio.ensure_future(server.stop(5))) + + await server.start() + print("Server started. 
Listening on: " + address, file=sys.stderr) + await server.wait_for_termination() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the sglang gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to.", + ) + args = parser.parse_args() + asyncio.run(serve(args.addr)) diff --git a/backend/python/sglang/install.sh b/backend/python/sglang/install.sh new file mode 100755 index 000000000..3b58ebcb0 --- /dev/null +++ b/backend/python/sglang/install.sh @@ -0,0 +1,72 @@ +#!/bin/bash +set -e + +EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation" + +# Avoid overcommitting the CPU during builds that compile native code. +export NVCC_THREADS=2 +export MAX_JOBS=1 + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +if [ "x${BUILD_PROFILE}" == "xintel" ]; then + EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" +fi + +if [ "x${BUILD_PROFILE}" == "xcpu" ]; then + EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" +fi + +# sglang's CPU path has no prebuilt wheel on PyPI — upstream publishes +# a separate pyproject_cpu.toml that must be swapped in before `pip install`. +# Reference: docker/xeon.Dockerfile in the sglang upstream repo. +# +# When BUILD_TYPE is empty (CPU profile) or FROM_SOURCE=true is forced, +# install torch/transformers/etc from requirements-cpu.txt, then clone +# sglang and install its python/ and sgl-kernel/ packages from source +# using the CPU pyproject. +if [ "x${BUILD_TYPE}" == "x" ] || [ "x${FROM_SOURCE:-}" == "xtrue" ]; then + # sgl-kernel's CPU build links against libnuma and libtbb. Install + # them here (Docker builder stage) before running the source build. + # Harmless no-op on runs outside the docker build since installRequirements + # below still needs them only if we reach the source build branch. 
+ if command -v apt-get >/dev/null 2>&1 && [ "$(id -u)" = "0" ]; then + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + libnuma-dev numactl libtbb-dev libgomp1 libomp-dev google-perftools \ + build-essential cmake ninja-build + fi + + installRequirements + + # sgl-kernel's pyproject_cpu.toml uses scikit-build-core as its build + # backend. With --no-build-isolation, that (and ninja/cmake) must be + # present in the venv before we build from source. + uv pip install --no-build-isolation "scikit-build-core>=0.10" ninja cmake + + _sgl_src=$(mktemp -d) + trap 'rm -rf "${_sgl_src}"' EXIT + git clone --depth 1 https://github.com/sgl-project/sglang "${_sgl_src}/sglang" + + pushd "${_sgl_src}/sglang/sgl-kernel" + if [ -f pyproject_cpu.toml ]; then + cp pyproject_cpu.toml pyproject.toml + fi + uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} . + popd + + pushd "${_sgl_src}/sglang/python" + if [ -f pyproject_cpu.toml ]; then + cp pyproject_cpu.toml pyproject.toml + fi + uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} . + popd +else + installRequirements +fi diff --git a/backend/python/sglang/package.sh b/backend/python/sglang/package.sh new file mode 100755 index 000000000..3a1557e9b --- /dev/null +++ b/backend/python/sglang/package.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Package runtime shared libraries for the sglang backend. +# +# Dockerfile.python's final stage is FROM scratch — every system library +# the backend dlopens at runtime must be explicitly copied into +# ${BACKEND}/lib, which libbackend.sh adds to LD_LIBRARY_PATH. +# +# sglang's CPU kernel links against libnuma and libtbb; torch's CPU +# kernels use libgomp; tcmalloc + iomp5 are preloaded per sglang's +# docker/xeon.Dockerfile recipe for best CPU throughput. Missing any of +# these makes the engine crash on import. 
+ +set -e + +CURDIR=$(dirname "$(realpath "$0")") +LIB_DIR="${CURDIR}/lib" +mkdir -p "${LIB_DIR}" + +copy_with_symlinks() { + local soname="$1" + local hit="" + for dir in \ + /usr/lib/x86_64-linux-gnu \ + /usr/lib/aarch64-linux-gnu \ + /lib/x86_64-linux-gnu \ + /lib/aarch64-linux-gnu \ + /usr/lib \ + /lib; do + if [ -e "${dir}/${soname}" ]; then + hit="${dir}/${soname}" + break + fi + done + if [ -z "${hit}" ]; then + echo "warning: ${soname} not found in standard lib paths" >&2 + return 0 + fi + local real + real=$(readlink -f "${hit}") + cp -v "${real}" "${LIB_DIR}/" + local real_base + real_base=$(basename "${real}") + if [ "${real_base}" != "${soname}" ]; then + ln -sf "${real_base}" "${LIB_DIR}/${soname}" + fi +} + +copy_with_symlinks libnuma.so.1 +copy_with_symlinks libgomp.so.1 +copy_with_symlinks libtbb.so.12 +copy_with_symlinks libtbbmalloc.so.2 +copy_with_symlinks libtcmalloc.so.4 + +# intel-openmp ships libiomp5.so inside the venv under venv/lib/ — sglang's +# CPU kernel was compiled against its __kmpc_* symbols, so it must be on +# LD_LIBRARY_PATH at runtime. Copy it into the backend lib dir where +# libbackend.sh will pick it up. +if [ -f "${CURDIR}/venv/lib/libiomp5.so" ]; then + cp -v "${CURDIR}/venv/lib/libiomp5.so" "${LIB_DIR}/" +fi + +echo "sglang packaging completed successfully" +ls -liah "${LIB_DIR}/" diff --git a/backend/python/sglang/requirements-after.txt b/backend/python/sglang/requirements-after.txt new file mode 100644 index 000000000..4caf8fe76 --- /dev/null +++ b/backend/python/sglang/requirements-after.txt @@ -0,0 +1,2 @@ +# sglang is installed per-acceleration in requirements-{profile}-after.txt +# (cublas12, hipblas, intel, cpu) diff --git a/backend/python/sglang/requirements-cpu-after.txt b/backend/python/sglang/requirements-cpu-after.txt new file mode 100644 index 000000000..6b783e2c9 --- /dev/null +++ b/backend/python/sglang/requirements-cpu-after.txt @@ -0,0 +1,3 @@ +# sglang has no prebuilt CPU wheel on PyPI. 
install.sh performs a +# from-source build using the upstream pyproject_cpu.toml recipe from +# docker/xeon.Dockerfile when BUILD_TYPE is empty (CPU profile). diff --git a/backend/python/sglang/requirements-cpu.txt b/backend/python/sglang/requirements-cpu.txt new file mode 100644 index 000000000..ad467ffa8 --- /dev/null +++ b/backend/python/sglang/requirements-cpu.txt @@ -0,0 +1,7 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +accelerate +torch==2.9.0 +torchvision +torchaudio +transformers +intel-openmp; platform_machine == 'x86_64' diff --git a/backend/python/sglang/requirements-cublas12-after.txt b/backend/python/sglang/requirements-cublas12-after.txt new file mode 100644 index 000000000..57203f125 --- /dev/null +++ b/backend/python/sglang/requirements-cublas12-after.txt @@ -0,0 +1,3 @@ +# Bump this version floor deliberately — sglang releases weekly and API surfaces +# (FunctionCallParser, ReasoningParser) move between releases. +sglang[all]>=0.4.0 diff --git a/backend/python/sglang/requirements-cublas12.txt b/backend/python/sglang/requirements-cublas12.txt new file mode 100644 index 000000000..a84b7e989 --- /dev/null +++ b/backend/python/sglang/requirements-cublas12.txt @@ -0,0 +1,6 @@ +--extra-index-url https://download.pytorch.org/whl/cu124 +accelerate +torch==2.9.1 +torchvision +torchaudio==2.9.1 +transformers diff --git a/backend/python/sglang/requirements-hipblas-after.txt b/backend/python/sglang/requirements-hipblas-after.txt new file mode 100644 index 000000000..0802cbcfe --- /dev/null +++ b/backend/python/sglang/requirements-hipblas-after.txt @@ -0,0 +1,2 @@ +# NOTE(review): install.sh only builds sglang from source when BUILD_TYPE is +# empty or FROM_SOURCE=true — confirm sglang actually installs for hipblas.
diff --git a/backend/python/sglang/requirements-hipblas.txt b/backend/python/sglang/requirements-hipblas.txt new file mode 100644 index 000000000..55670f4cb --- /dev/null +++ b/backend/python/sglang/requirements-hipblas.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/nightly/rocm7.0 +accelerate +torch +torchvision +transformers diff --git a/backend/python/sglang/requirements-install.txt b/backend/python/sglang/requirements-install.txt new file mode 100644 index 000000000..ea7076c1d --- /dev/null +++ b/backend/python/sglang/requirements-install.txt @@ -0,0 +1,6 @@ +# sglang and sgl-kernel do not declare full PEP517 build deps; install the +# basic build tooling into the venv before pulling the rest of the stack. +packaging +setuptools +wheel +setuptools-scm diff --git a/backend/python/sglang/requirements-intel-after.txt b/backend/python/sglang/requirements-intel-after.txt new file mode 100644 index 000000000..a729fc7e8 --- /dev/null +++ b/backend/python/sglang/requirements-intel-after.txt @@ -0,0 +1,2 @@ +# NOTE(review): install.sh only builds sglang from source when BUILD_TYPE is +# empty or FROM_SOURCE=true — confirm sglang actually installs for intel.
diff --git a/backend/python/sglang/requirements-intel.txt b/backend/python/sglang/requirements-intel.txt new file mode 100644 index 000000000..288607429 --- /dev/null +++ b/backend/python/sglang/requirements-intel.txt @@ -0,0 +1,7 @@ +--extra-index-url https://download.pytorch.org/whl/xpu +accelerate +torch +torchvision +transformers +optimum[openvino] +setuptools diff --git a/backend/python/sglang/requirements.txt b/backend/python/sglang/requirements.txt new file mode 100644 index 000000000..7c6b3143e --- /dev/null +++ b/backend/python/sglang/requirements.txt @@ -0,0 +1,4 @@ +grpcio==1.80.0 +protobuf +certifi +setuptools diff --git a/backend/python/sglang/run.sh b/backend/python/sglang/run.sh new file mode 100755 index 000000000..426d3eedb --- /dev/null +++ b/backend/python/sglang/run.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +backend_dir=$(dirname $(realpath $0)) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +# sglang's CPU kernel references LLVM OpenMP (__kmpc_*) symbols that are +# not declared in its NEEDED list — they get resolved through LD_PRELOAD +# of libiomp5.so in sglang's own docker/xeon.Dockerfile. Do the same here. +# Harmless on GPU builds where libiomp5.so is absent. +if [ -f "${backend_dir}/lib/libiomp5.so" ]; then + if [ -n "${LD_PRELOAD:-}" ]; then + export LD_PRELOAD="${backend_dir}/lib/libiomp5.so:${LD_PRELOAD}" + else + export LD_PRELOAD="${backend_dir}/lib/libiomp5.so" + fi +fi + +# sglang CPU engine requires this env var to switch to the CPU backend. +# No-op on GPU builds. See docker/xeon.Dockerfile in sglang upstream. 
+if [ -f "${backend_dir}/lib/libiomp5.so" ]; then + export SGLANG_USE_CPU_ENGINE=1 +fi + +startBackend $@ diff --git a/core/http/react-ui/src/pages/ImportModel.jsx b/core/http/react-ui/src/pages/ImportModel.jsx index 48fd04e42..f2267684d 100644 --- a/core/http/react-ui/src/pages/ImportModel.jsx +++ b/core/http/react-ui/src/pages/ImportModel.jsx @@ -12,6 +12,7 @@ const BACKENDS = [ { value: 'mlx-vlm', label: 'mlx-vlm' }, { value: 'transformers', label: 'transformers' }, { value: 'vllm', label: 'vllm' }, + { value: 'sglang', label: 'sglang' }, { value: 'diffusers', label: 'diffusers' }, ]