mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-16 12:59:33 -04:00
feat(backends): add sglang
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
52
.github/workflows/backend.yml
vendored
52
.github/workflows/backend.yml
vendored
@@ -66,6 +66,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-sglang'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'true'
|
||||
backend: "sglang"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
@@ -411,6 +424,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-12-sglang'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "sglang"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
@@ -1427,6 +1453,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-rocm-hipblas-sglang'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||
skip-drivers: 'false'
|
||||
backend: "sglang"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
@@ -1689,6 +1728,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'intel'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sglang'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "sglang"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'intel'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
|
||||
27
.github/workflows/test-extra.yml
vendored
27
.github/workflows/test-extra.yml
vendored
@@ -33,6 +33,7 @@ jobs:
|
||||
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
|
||||
turboquant: ${{ steps.detect.outputs.turboquant }}
|
||||
vllm: ${{ steps.detect.outputs.vllm }}
|
||||
sglang: ${{ steps.detect.outputs.sglang }}
|
||||
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
||||
qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
|
||||
voxtral: ${{ steps.detect.outputs.voxtral }}
|
||||
@@ -589,6 +590,32 @@ jobs:
|
||||
# - name: Build vllm (cpu) backend image and run gRPC e2e tests
|
||||
# run: |
|
||||
# make test-extra-backend-vllm
|
||||
tests-sglang-grpc:
|
||||
needs: detect-changes
|
||||
if: needs.detect-changes.outputs.sglang == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
make build-essential curl unzip ca-certificates git tar
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.25.4'
|
||||
- name: Free disk space
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
|
||||
df -h
|
||||
- name: Build sglang (cpu) backend image and run gRPC e2e tests
|
||||
run: |
|
||||
make test-extra-backend-sglang
|
||||
tests-acestep-cpp:
|
||||
needs: detect-changes
|
||||
if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
|
||||
18
Makefile
18
Makefile
@@ -1,5 +1,5 @@
|
||||
# Disable parallel execution for backend builds
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/tinygrad
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/tinygrad
|
||||
|
||||
GOCMD=go
|
||||
GOTEST=$(GOCMD) test
|
||||
@@ -419,6 +419,7 @@ prepare-test-extra: protogen-python
|
||||
$(MAKE) -C backend/python/chatterbox
|
||||
$(MAKE) -C backend/python/vllm
|
||||
$(MAKE) -C backend/python/vllm-omni
|
||||
$(MAKE) -C backend/python/sglang
|
||||
$(MAKE) -C backend/python/vibevoice
|
||||
$(MAKE) -C backend/python/moonshine
|
||||
$(MAKE) -C backend/python/pocket-tts
|
||||
@@ -602,6 +603,17 @@ test-extra-backend-tinygrad-all: \
|
||||
test-extra-backend-tinygrad-sd \
|
||||
test-extra-backend-tinygrad-whisper
|
||||
|
||||
## sglang mirrors the vllm setup: HuggingFace model id, same tiny Qwen,
|
||||
## tool-call extraction via sglang's native qwen parser. CPU builds use
|
||||
## sglang's upstream pyproject_cpu.toml recipe (see backend/python/sglang/install.sh).
|
||||
test-extra-backend-sglang: docker-build-sglang
|
||||
BACKEND_IMAGE=local-ai-backend:sglang \
|
||||
BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
|
||||
BACKEND_TEST_CAPS=health,load,predict,stream,tools \
|
||||
BACKEND_TEST_OPTIONS=tool_parser:qwen \
|
||||
$(MAKE) test-extra-backend
|
||||
|
||||
|
||||
## mlx is Apple-Silicon-first — the MLX backend auto-detects the right tool
|
||||
## parser from the chat template, so no tool_parser: option is needed (it
|
||||
## would be ignored at runtime). Run this on macOS / arm64 with Metal; the
|
||||
@@ -741,6 +753,7 @@ BACKEND_NEUTTS = neutts|python|.|false|true
|
||||
BACKEND_KOKORO = kokoro|python|.|false|true
|
||||
BACKEND_VLLM = vllm|python|.|false|true
|
||||
BACKEND_VLLM_OMNI = vllm-omni|python|.|false|true
|
||||
BACKEND_SGLANG = sglang|python|.|false|true
|
||||
BACKEND_DIFFUSERS = diffusers|python|.|--progress=plain|true
|
||||
BACKEND_CHATTERBOX = chatterbox|python|.|false|true
|
||||
BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true
|
||||
@@ -811,6 +824,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_NEUTTS)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_KOKORO)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_VLLM)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_VLLM_OMNI)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_SGLANG)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_DIFFUSERS)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_CHATTERBOX)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE)))
|
||||
@@ -839,7 +853,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP)))
|
||||
docker-save-%: backend-images
|
||||
docker save local-ai-backend:$* -o backend-images/$*.tar
|
||||
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp
|
||||
|
||||
########################################################
|
||||
### Mock Backend for E2E Tests
|
||||
|
||||
@@ -227,6 +227,28 @@
|
||||
intel: "intel-vllm"
|
||||
nvidia-cuda-12: "cuda12-vllm"
|
||||
cpu: "cpu-vllm"
|
||||
- &sglang
|
||||
name: "sglang"
|
||||
license: apache-2.0
|
||||
urls:
|
||||
- https://github.com/sgl-project/sglang
|
||||
tags:
|
||||
- text-to-text
|
||||
- multimodal
|
||||
icon: https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png
|
||||
description: |
|
||||
SGLang is a fast serving framework for large language models and vision language models.
|
||||
It co-designs the backend runtime (RadixAttention, continuous batching, structured
|
||||
decoding) and the frontend language to make interaction with models faster and more
|
||||
controllable. Features include fast backend runtime, flexible frontend language,
|
||||
extensive model support, and an active community.
|
||||
alias: "sglang"
|
||||
capabilities:
|
||||
nvidia: "cuda12-sglang"
|
||||
amd: "rocm-sglang"
|
||||
intel: "intel-sglang"
|
||||
nvidia-cuda-12: "cuda12-sglang"
|
||||
cpu: "cpu-sglang"
|
||||
- &vllm-omni
|
||||
name: "vllm-omni"
|
||||
license: apache-2.0
|
||||
@@ -1766,6 +1788,54 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-vllm
|
||||
# sglang
|
||||
- !!merge <<: *sglang
|
||||
name: "sglang-development"
|
||||
capabilities:
|
||||
nvidia: "cuda12-sglang-development"
|
||||
amd: "rocm-sglang-development"
|
||||
intel: "intel-sglang-development"
|
||||
cpu: "cpu-sglang-development"
|
||||
- !!merge <<: *sglang
|
||||
name: "cuda12-sglang"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-sglang"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-12-sglang
|
||||
- !!merge <<: *sglang
|
||||
name: "rocm-sglang"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-sglang"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-rocm-hipblas-sglang
|
||||
- !!merge <<: *sglang
|
||||
name: "intel-sglang"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sglang"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sglang
|
||||
- !!merge <<: *sglang
|
||||
name: "cpu-sglang"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-sglang"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-sglang
|
||||
- !!merge <<: *sglang
|
||||
name: "cuda12-sglang-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-sglang"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-12-sglang
|
||||
- !!merge <<: *sglang
|
||||
name: "rocm-sglang-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-sglang"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-rocm-hipblas-sglang
|
||||
- !!merge <<: *sglang
|
||||
name: "intel-sglang-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sglang"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sglang
|
||||
- !!merge <<: *sglang
|
||||
name: "cpu-sglang-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-sglang"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-sglang
|
||||
# vllm-omni
|
||||
- !!merge <<: *vllm-omni
|
||||
name: "vllm-omni-development"
|
||||
|
||||
17
backend/python/sglang/Makefile
Normal file
17
backend/python/sglang/Makefile
Normal file
@@ -0,0 +1,17 @@
|
||||
.PHONY: sglang
|
||||
sglang:
|
||||
bash install.sh
|
||||
|
||||
.PHONY: run
|
||||
run: sglang
|
||||
@echo "Running sglang..."
|
||||
bash run.sh
|
||||
@echo "sglang run."
|
||||
|
||||
.PHONY: protogen-clean
|
||||
protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
rm -rf venv __pycache__
|
||||
502
backend/python/sglang/backend.py
Normal file
502
backend/python/sglang/backend.py
Normal file
@@ -0,0 +1,502 @@
|
||||
#!/usr/bin/env python3
|
||||
"""LocalAI gRPC backend for sglang.
|
||||
|
||||
Wraps sglang's async Engine API behind the Backend gRPC contract defined
|
||||
in backend.proto. Mirrors the structure of backend/python/vllm/backend.py
|
||||
so that the two backends stay behavior-equivalent at the protocol level.
|
||||
|
||||
The streaming path applies sglang's per-request FunctionCallParser and
|
||||
ReasoningParser so tool_calls and reasoning_content are emitted
|
||||
incrementally inside ChatDelta, which is a capability sglang exposes
|
||||
natively and vLLM does not.
|
||||
"""
|
||||
import asyncio
|
||||
from concurrent import futures
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import gc
|
||||
import uuid
|
||||
import base64
|
||||
import io
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from PIL import Image
|
||||
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
|
||||
import grpc
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
|
||||
# sglang imports. Engine is the stable public entry point; parser modules
|
||||
# are wrapped in try/except so older / leaner installs that omit them
|
||||
# still load the backend for plain text generation.
|
||||
from sglang.srt.entrypoints.engine import Engine
|
||||
|
||||
try:
|
||||
from sglang.srt.function_call.function_call_parser import FunctionCallParser
|
||||
# sglang's FunctionCallParser expects a list of pydantic Tool objects
|
||||
# (protocol.Tool with .function.name), not plain dicts. Wrap at the
|
||||
# request boundary to match.
|
||||
from sglang.srt.entrypoints.openai.protocol import Tool as SglTool
|
||||
HAS_TOOL_PARSERS = True
|
||||
except Exception:
|
||||
FunctionCallParser = None # type: ignore
|
||||
SglTool = None # type: ignore
|
||||
HAS_TOOL_PARSERS = False
|
||||
|
||||
try:
|
||||
from sglang.srt.parser.reasoning_parser import ReasoningParser
|
||||
HAS_REASONING_PARSERS = True
|
||||
except Exception:
|
||||
ReasoningParser = None # type: ignore
|
||||
HAS_REASONING_PARSERS = False
|
||||
|
||||
try:
|
||||
from transformers import AutoTokenizer
|
||||
HAS_TRANSFORMERS = True
|
||||
except Exception:
|
||||
AutoTokenizer = None # type: ignore
|
||||
HAS_TRANSFORMERS = False
|
||||
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
|
||||
|
||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
"""gRPC servicer implementing the Backend service for sglang."""
|
||||
|
||||
def _parse_options(self, options_list) -> Dict[str, str]:
|
||||
opts: Dict[str, str] = {}
|
||||
for opt in options_list:
|
||||
if ":" not in opt:
|
||||
continue
|
||||
key, value = opt.split(":", 1)
|
||||
opts[key.strip()] = value.strip()
|
||||
return opts
|
||||
|
||||
def _messages_to_dicts(self, messages) -> List[dict]:
|
||||
result: List[dict] = []
|
||||
for msg in messages:
|
||||
d = {"role": msg.role, "content": msg.content or ""}
|
||||
if msg.name:
|
||||
d["name"] = msg.name
|
||||
if msg.tool_call_id:
|
||||
d["tool_call_id"] = msg.tool_call_id
|
||||
if msg.reasoning_content:
|
||||
d["reasoning_content"] = msg.reasoning_content
|
||||
if msg.tool_calls:
|
||||
try:
|
||||
d["tool_calls"] = json.loads(msg.tool_calls)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
result.append(d)
|
||||
return result
|
||||
|
||||
def Health(self, request, context):
|
||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
||||
|
||||
async def LoadModel(self, request, context):
|
||||
engine_kwargs = {"model_path": request.Model}
|
||||
|
||||
if request.Quantization:
|
||||
engine_kwargs["quantization"] = request.Quantization
|
||||
if request.LoadFormat:
|
||||
engine_kwargs["load_format"] = request.LoadFormat
|
||||
if request.GPUMemoryUtilization:
|
||||
engine_kwargs["mem_fraction_static"] = float(request.GPUMemoryUtilization)
|
||||
if request.TrustRemoteCode:
|
||||
engine_kwargs["trust_remote_code"] = True
|
||||
if request.EnforceEager:
|
||||
engine_kwargs["disable_cuda_graph"] = True
|
||||
if request.TensorParallelSize:
|
||||
engine_kwargs["tp_size"] = int(request.TensorParallelSize)
|
||||
if request.MaxModelLen:
|
||||
engine_kwargs["context_length"] = int(request.MaxModelLen)
|
||||
if request.DType:
|
||||
engine_kwargs["dtype"] = request.DType
|
||||
|
||||
opts = self._parse_options(request.Options)
|
||||
|
||||
# Cache parser names — actual parser instances are created per
|
||||
# request because sglang's parsers are stateful.
|
||||
self.tool_parser_name: Optional[str] = opts.get("tool_parser") or None
|
||||
self.reasoning_parser_name: Optional[str] = opts.get("reasoning_parser") or None
|
||||
|
||||
# Also hand the parser names to sglang's engine so its HTTP/OAI
|
||||
# paths work identically if someone hits the engine directly.
|
||||
if self.tool_parser_name:
|
||||
engine_kwargs["tool_call_parser"] = self.tool_parser_name
|
||||
if self.reasoning_parser_name:
|
||||
engine_kwargs["reasoning_parser"] = self.reasoning_parser_name
|
||||
|
||||
try:
|
||||
self.llm = Engine(**engine_kwargs)
|
||||
except Exception as err:
|
||||
print(f"sglang Engine init failed: {err!r}", file=sys.stderr)
|
||||
return backend_pb2.Result(success=False, message=f"{err!r}")
|
||||
|
||||
# sglang does not expose a uniform get_tokenizer() off Engine.
|
||||
# Use transformers directly — same path sglang uses internally.
|
||||
self.tokenizer = None
|
||||
if HAS_TRANSFORMERS:
|
||||
try:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
request.Model,
|
||||
trust_remote_code=bool(request.TrustRemoteCode),
|
||||
)
|
||||
except Exception as err:
|
||||
print(f"AutoTokenizer load failed (non-fatal): {err!r}", file=sys.stderr)
|
||||
|
||||
print("Model loaded successfully", file=sys.stderr)
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
async def Predict(self, request, context):
|
||||
gen = self._predict(request, context, streaming=False)
|
||||
res = await gen.__anext__()
|
||||
return res
|
||||
|
||||
async def PredictStream(self, request, context):
|
||||
iterations = self._predict(request, context, streaming=True)
|
||||
try:
|
||||
async for iteration in iterations:
|
||||
yield iteration
|
||||
finally:
|
||||
try:
|
||||
await iterations.aclose()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
async def TokenizeString(self, request, context):
|
||||
if not getattr(self, "tokenizer", None):
|
||||
context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
|
||||
context.set_details("tokenizer not loaded")
|
||||
return backend_pb2.TokenizationResponse()
|
||||
try:
|
||||
tokens = self.tokenizer.encode(request.Prompt)
|
||||
return backend_pb2.TokenizationResponse(length=len(tokens), tokens=tokens)
|
||||
except Exception as e:
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(str(e))
|
||||
return backend_pb2.TokenizationResponse()
|
||||
|
||||
async def Free(self, request, context):
|
||||
try:
|
||||
if hasattr(self, "llm"):
|
||||
try:
|
||||
self.llm.shutdown()
|
||||
except Exception:
|
||||
pass
|
||||
del self.llm
|
||||
if hasattr(self, "tokenizer"):
|
||||
del self.tokenizer
|
||||
self.tool_parser_name = None
|
||||
self.reasoning_parser_name = None
|
||||
gc.collect()
|
||||
try:
|
||||
import torch
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
except ImportError:
|
||||
pass
|
||||
return backend_pb2.Result(success=True, message="Model freed")
|
||||
except Exception as e:
|
||||
return backend_pb2.Result(success=False, message=str(e))
|
||||
|
||||
def _build_sampling_params(self, request) -> dict:
|
||||
sampling_params: dict = {"temperature": 0.7, "max_new_tokens": 200}
|
||||
mapping = {
|
||||
"N": "n",
|
||||
"PresencePenalty": "presence_penalty",
|
||||
"FrequencyPenalty": "frequency_penalty",
|
||||
"RepetitionPenalty": "repetition_penalty",
|
||||
"Temperature": "temperature",
|
||||
"TopP": "top_p",
|
||||
"TopK": "top_k",
|
||||
"MinP": "min_p",
|
||||
"Seed": "seed",
|
||||
"StopPrompts": "stop",
|
||||
"StopTokenIds": "stop_token_ids",
|
||||
"IgnoreEOS": "ignore_eos",
|
||||
"Tokens": "max_new_tokens",
|
||||
"MinTokens": "min_new_tokens",
|
||||
"SkipSpecialTokens": "skip_special_tokens",
|
||||
}
|
||||
for proto_field, sgl_key in mapping.items():
|
||||
if not hasattr(request, proto_field):
|
||||
continue
|
||||
value = getattr(request, proto_field)
|
||||
if value in (None, 0, 0.0, [], False, ""):
|
||||
continue
|
||||
# repeated fields come back as RepeatedScalarContainer — convert
|
||||
if hasattr(value, "__iter__") and not isinstance(value, (str, bytes)):
|
||||
value = list(value)
|
||||
if not value:
|
||||
continue
|
||||
sampling_params[sgl_key] = value
|
||||
|
||||
# Grammar → JSON schema or EBNF structured decoding.
|
||||
if getattr(request, "Grammar", ""):
|
||||
grammar = request.Grammar
|
||||
try:
|
||||
json.loads(grammar)
|
||||
sampling_params["json_schema"] = grammar
|
||||
except json.JSONDecodeError:
|
||||
sampling_params["ebnf"] = grammar
|
||||
|
||||
return sampling_params
|
||||
|
||||
def _build_prompt(self, request) -> str:
|
||||
prompt = request.Prompt
|
||||
if prompt or not request.UseTokenizerTemplate or not request.Messages:
|
||||
return prompt
|
||||
|
||||
if self.tokenizer is None:
|
||||
print(
|
||||
"UseTokenizerTemplate requested but tokenizer not loaded; "
|
||||
"falling back to naive concatenation",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return "\n".join(m.content or "" for m in request.Messages)
|
||||
|
||||
messages_dicts = self._messages_to_dicts(request.Messages)
|
||||
template_kwargs: dict = {"tokenize": False, "add_generation_prompt": True}
|
||||
if request.Tools:
|
||||
try:
|
||||
template_kwargs["tools"] = json.loads(request.Tools)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
if request.Metadata.get("enable_thinking", "").lower() == "true":
|
||||
template_kwargs["enable_thinking"] = True
|
||||
|
||||
try:
|
||||
return self.tokenizer.apply_chat_template(messages_dicts, **template_kwargs)
|
||||
except TypeError:
|
||||
return self.tokenizer.apply_chat_template(
|
||||
messages_dicts, tokenize=False, add_generation_prompt=True,
|
||||
)
|
||||
|
||||
def _make_parsers(self, request):
|
||||
"""Construct fresh per-request parser instances (stateful)."""
|
||||
tool_parser = None
|
||||
reasoning_parser = None
|
||||
|
||||
if HAS_TOOL_PARSERS and self.tool_parser_name and request.Tools:
|
||||
try:
|
||||
tools_raw = json.loads(request.Tools)
|
||||
tools = [SglTool.model_validate(t) for t in tools_raw] if SglTool else tools_raw
|
||||
tool_parser = FunctionCallParser(
|
||||
tools=tools, tool_call_parser=self.tool_parser_name,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"FunctionCallParser init failed: {e!r}", file=sys.stderr)
|
||||
|
||||
if HAS_REASONING_PARSERS and self.reasoning_parser_name:
|
||||
try:
|
||||
reasoning_parser = ReasoningParser(
|
||||
model_type=self.reasoning_parser_name,
|
||||
stream_reasoning=True,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"ReasoningParser init failed: {e!r}", file=sys.stderr)
|
||||
|
||||
return tool_parser, reasoning_parser
|
||||
|
||||
async def _predict(self, request, context, streaming: bool = False):
|
||||
sampling_params = self._build_sampling_params(request)
|
||||
prompt = self._build_prompt(request)
|
||||
|
||||
tool_parser, reasoning_parser = self._make_parsers(request)
|
||||
|
||||
image_data = list(request.Images) if request.Images else None
|
||||
video_data = list(request.Videos) if request.Videos else None
|
||||
|
||||
# Kick off streaming generation. We always use stream=True so the
|
||||
# non-stream path still gets parser coverage on the final text.
|
||||
try:
|
||||
iterator = await self.llm.async_generate(
|
||||
prompt=prompt,
|
||||
sampling_params=sampling_params,
|
||||
image_data=image_data,
|
||||
video_data=video_data,
|
||||
stream=True,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"sglang async_generate failed: {e!r}", file=sys.stderr)
|
||||
yield backend_pb2.Reply(message=bytes(f"error: {e!r}", "utf-8"))
|
||||
return
|
||||
|
||||
generated_text = ""
|
||||
last_chunk: Optional[dict] = None
|
||||
# Track tool call ids once per (request, tool_index) to match the
|
||||
# OpenAI streaming contract (id sent on first chunk for that tool).
|
||||
tool_ids_seen: Dict[int, str] = {}
|
||||
|
||||
try:
|
||||
async for chunk in iterator:
|
||||
last_chunk = chunk
|
||||
cumulative = chunk.get("text", "") if isinstance(chunk, dict) else ""
|
||||
delta_text = cumulative[len(generated_text):] if cumulative.startswith(generated_text) else cumulative
|
||||
generated_text = cumulative
|
||||
if not delta_text:
|
||||
continue
|
||||
|
||||
reasoning_delta = ""
|
||||
content_delta = delta_text
|
||||
|
||||
if reasoning_parser is not None:
|
||||
try:
|
||||
r, n = reasoning_parser.parse_stream_chunk(delta_text)
|
||||
reasoning_delta = r or ""
|
||||
content_delta = n or ""
|
||||
except Exception as e:
|
||||
print(f"reasoning_parser.parse_stream_chunk: {e!r}", file=sys.stderr)
|
||||
|
||||
tool_call_deltas: List[backend_pb2.ToolCallDelta] = []
|
||||
if tool_parser is not None and content_delta:
|
||||
try:
|
||||
normal_text, calls = tool_parser.parse_stream_chunk(content_delta)
|
||||
content_delta = normal_text or ""
|
||||
for tc in calls:
|
||||
idx = int(getattr(tc, "tool_index", 0) or 0)
|
||||
tc_id = tool_ids_seen.get(idx)
|
||||
if tc_id is None:
|
||||
tc_id = f"call_{uuid.uuid4().hex[:24]}"
|
||||
tool_ids_seen[idx] = tc_id
|
||||
tool_call_deltas.append(backend_pb2.ToolCallDelta(
|
||||
index=idx,
|
||||
id=tc_id,
|
||||
name=getattr(tc, "name", "") or "",
|
||||
arguments=getattr(tc, "parameters", "") or "",
|
||||
))
|
||||
except Exception as e:
|
||||
print(f"tool_parser.parse_stream_chunk: {e!r}", file=sys.stderr)
|
||||
|
||||
if streaming and (content_delta or reasoning_delta or tool_call_deltas):
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(content_delta, "utf-8"),
|
||||
chat_deltas=[backend_pb2.ChatDelta(
|
||||
content=content_delta,
|
||||
reasoning_content=reasoning_delta,
|
||||
tool_calls=tool_call_deltas,
|
||||
)],
|
||||
)
|
||||
finally:
|
||||
try:
|
||||
await iterator.aclose()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Extract token counts from the final chunk's meta_info.
|
||||
meta = {}
|
||||
if isinstance(last_chunk, dict):
|
||||
meta = last_chunk.get("meta_info") or {}
|
||||
prompt_tokens = int(meta.get("prompt_tokens", 0) or 0)
|
||||
completion_tokens = int(meta.get("completion_tokens", 0) or 0)
|
||||
|
||||
# Non-streaming path: re-parse the full text with fresh parsers
|
||||
# so we return a clean, complete ChatDelta. Streaming parsers
|
||||
# used above have accumulated state we don't want to reuse.
|
||||
final_content = generated_text
|
||||
final_reasoning = ""
|
||||
final_tool_calls: List[backend_pb2.ToolCallDelta] = []
|
||||
|
||||
if not streaming:
|
||||
final_reasoning_parser = None
|
||||
if HAS_REASONING_PARSERS and self.reasoning_parser_name:
|
||||
try:
|
||||
final_reasoning_parser = ReasoningParser(
|
||||
model_type=self.reasoning_parser_name,
|
||||
stream_reasoning=False,
|
||||
)
|
||||
except Exception:
|
||||
final_reasoning_parser = None
|
||||
|
||||
if final_reasoning_parser is not None:
|
||||
try:
|
||||
r, n = final_reasoning_parser.parse_non_stream(generated_text)
|
||||
final_reasoning = r or ""
|
||||
final_content = n if n is not None else generated_text
|
||||
except Exception as e:
|
||||
print(f"reasoning_parser.parse_non_stream: {e!r}", file=sys.stderr)
|
||||
|
||||
if HAS_TOOL_PARSERS and self.tool_parser_name and request.Tools:
|
||||
try:
|
||||
tools_raw = json.loads(request.Tools)
|
||||
tools = [SglTool.model_validate(t) for t in tools_raw] if SglTool else tools_raw
|
||||
fresh_tool_parser = FunctionCallParser(
|
||||
tools=tools, tool_call_parser=self.tool_parser_name,
|
||||
)
|
||||
normal, calls = fresh_tool_parser.parse_non_stream(final_content)
|
||||
if calls:
|
||||
final_content = normal
|
||||
for tc in calls:
|
||||
idx = int(getattr(tc, "tool_index", 0) or 0)
|
||||
final_tool_calls.append(backend_pb2.ToolCallDelta(
|
||||
index=idx,
|
||||
id=f"call_{uuid.uuid4().hex[:24]}",
|
||||
name=getattr(tc, "name", "") or "",
|
||||
arguments=getattr(tc, "parameters", "") or "",
|
||||
))
|
||||
except Exception as e:
|
||||
print(f"tool_parser.parse_non_stream: {e!r}", file=sys.stderr)
|
||||
|
||||
chat_delta = backend_pb2.ChatDelta(
|
||||
content=final_content if not streaming else "",
|
||||
reasoning_content=final_reasoning,
|
||||
tool_calls=final_tool_calls,
|
||||
)
|
||||
|
||||
if streaming:
|
||||
yield backend_pb2.Reply(
|
||||
message=b"",
|
||||
prompt_tokens=prompt_tokens,
|
||||
tokens=completion_tokens,
|
||||
chat_deltas=[chat_delta],
|
||||
)
|
||||
return
|
||||
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(final_content or "", "utf-8"),
|
||||
prompt_tokens=prompt_tokens,
|
||||
tokens=completion_tokens,
|
||||
chat_deltas=[chat_delta],
|
||||
)
|
||||
|
||||
|
||||
async def serve(address):
    """Start the async gRPC backend server and block until termination.

    Args:
        address: "host:port" string to bind an insecure listening port on.
    """
    server = grpc.aio.server(
        migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
        options=[
            # 50 MiB message caps so large prompts/generations fit in one message.
            ('grpc.max_message_length', 50 * 1024 * 1024),
            ('grpc.max_send_message_length', 50 * 1024 * 1024),
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),
        ],
        interceptors=get_auth_interceptors(aio=True),
    )
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)

    # get_running_loop() is the non-deprecated accessor: get_event_loop()
    # emits a DeprecationWarning when called from a coroutine (we are inside
    # asyncio.run() here, so a running loop is guaranteed).
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        # Schedule a graceful stop with a 5 s grace period so in-flight RPCs
        # can finish before shutdown.
        loop.add_signal_handler(sig, lambda: asyncio.ensure_future(server.stop(5)))

    await server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    await server.wait_for_termination()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: parse the bind address, then run the async server
    # until it terminates.
    arg_parser = argparse.ArgumentParser(description="Run the sglang gRPC server.")
    arg_parser.add_argument(
        "--addr",
        default="localhost:50051",
        help="The address to bind the server to.",
    )
    cli_args = arg_parser.parse_args()
    asyncio.run(serve(cli_args.addr))
|
||||
72
backend/python/sglang/install.sh
Executable file
72
backend/python/sglang/install.sh
Executable file
@@ -0,0 +1,72 @@
|
||||
#!/bin/bash
# Install the sglang backend's Python dependencies into its venv.
# CPU profile (empty BUILD_TYPE) or FROM_SOURCE=true triggers a from-source
# build of sglang's python/ and sgl-kernel/ packages using the upstream
# pyproject_cpu.toml recipe; all other profiles install prebuilt wheels.
set -e

EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"

# Avoid overcommitting the CPU during builds that compile native code.
export NVCC_THREADS=2
export MAX_JOBS=1

# Quote all path expansions so the script survives install roots that
# contain spaces (SC2086).
backend_dir=$(dirname "$0")

if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
else
    source "$backend_dir/../common/libbackend.sh"
fi

if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# sglang's CPU path has no prebuilt wheel on PyPI — upstream publishes
# a separate pyproject_cpu.toml that must be swapped in before `pip install`.
# Reference: docker/xeon.Dockerfile in the sglang upstream repo.
#
# When BUILD_TYPE is empty (CPU profile) or FROM_SOURCE=true is forced,
# install torch/transformers/etc from requirements-cpu.txt, then clone
# sglang and install its python/ and sgl-kernel/ packages from source
# using the CPU pyproject.
if [ "x${BUILD_TYPE}" == "x" ] || [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
    # sgl-kernel's CPU build links against libnuma and libtbb. Install
    # them here (Docker builder stage) before running the source build.
    # Harmless no-op on runs outside the docker build since installRequirements
    # below still needs them only if we reach the source build branch.
    if command -v apt-get >/dev/null 2>&1 && [ "$(id -u)" = "0" ]; then
        apt-get update
        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
            libnuma-dev numactl libtbb-dev libgomp1 libomp-dev google-perftools \
            build-essential cmake ninja-build
    fi

    installRequirements

    # sgl-kernel's pyproject_cpu.toml uses scikit-build-core as its build
    # backend. With --no-build-isolation, that (and ninja/cmake) must be
    # present in the venv before we build from source.
    uv pip install --no-build-isolation "scikit-build-core>=0.10" ninja cmake

    _sgl_src=$(mktemp -d)
    trap 'rm -rf "${_sgl_src}"' EXIT
    git clone --depth 1 https://github.com/sgl-project/sglang "${_sgl_src}/sglang"

    pushd "${_sgl_src}/sglang/sgl-kernel"
    if [ -f pyproject_cpu.toml ]; then
        cp pyproject_cpu.toml pyproject.toml
    fi
    uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} .
    popd

    pushd "${_sgl_src}/sglang/python"
    if [ -f pyproject_cpu.toml ]; then
        cp pyproject_cpu.toml pyproject.toml
    fi
    uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} .
    popd
else
    installRequirements
fi
|
||||
63
backend/python/sglang/package.sh
Executable file
63
backend/python/sglang/package.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
# Package runtime shared libraries for the sglang backend.
#
# Dockerfile.python's final stage is FROM scratch — every system library
# the backend dlopens at runtime must be explicitly copied into
# ${BACKEND}/lib, which libbackend.sh adds to LD_LIBRARY_PATH.
#
# sglang's CPU kernel links against libnuma and libtbb; torch's CPU
# kernels use libgomp; tcmalloc + iomp5 are preloaded per sglang's
# docker/xeon.Dockerfile recipe for best CPU throughput. Missing any of
# these makes the engine crash on import.

set -e

CURDIR=$(dirname "$(realpath "$0")")
LIB_DIR="${CURDIR}/lib"
mkdir -p "${LIB_DIR}"

# Resolve a library soname against the standard multiarch lib paths,
# copy the dereferenced file into LIB_DIR, and recreate the soname
# symlink when the real filename differs. Missing sonames only warn.
copy_with_symlinks() {
    local soname="$1"
    local found=""
    local candidate
    for candidate in \
        "/usr/lib/x86_64-linux-gnu/${soname}" \
        "/usr/lib/aarch64-linux-gnu/${soname}" \
        "/lib/x86_64-linux-gnu/${soname}" \
        "/lib/aarch64-linux-gnu/${soname}" \
        "/usr/lib/${soname}" \
        "/lib/${soname}"; do
        [ -e "${candidate}" ] || continue
        found="${candidate}"
        break
    done
    if [ -z "${found}" ]; then
        echo "warning: ${soname} not found in standard lib paths" >&2
        return 0
    fi
    local resolved
    resolved=$(readlink -f "${found}")
    cp -v "${resolved}" "${LIB_DIR}/"
    local resolved_name
    resolved_name=$(basename "${resolved}")
    # Keep the soname the dynamic linker looks up pointing at the real file.
    if [ "${resolved_name}" != "${soname}" ]; then
        ln -sf "${resolved_name}" "${LIB_DIR}/${soname}"
    fi
}

for lib in libnuma.so.1 libgomp.so.1 libtbb.so.12 libtbbmalloc.so.2 libtcmalloc.so.4; do
    copy_with_symlinks "${lib}"
done

# intel-openmp ships libiomp5.so inside the venv under venv/lib/ — sglang's
# CPU kernel was compiled against its __kmpc_* symbols, so it must be on
# LD_LIBRARY_PATH at runtime. Copy it into the backend lib dir where
# libbackend.sh will pick it up.
if [ -f "${CURDIR}/venv/lib/libiomp5.so" ]; then
    cp -v "${CURDIR}/venv/lib/libiomp5.so" "${LIB_DIR}/"
fi

echo "sglang packaging completed successfully"
ls -liah "${LIB_DIR}/"
|
||||
2
backend/python/sglang/requirements-after.txt
Normal file
2
backend/python/sglang/requirements-after.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
# sglang is installed per-acceleration in requirements-{profile}-after.txt
|
||||
# (cublas12, hipblas, intel, cpu)
|
||||
3
backend/python/sglang/requirements-cpu-after.txt
Normal file
3
backend/python/sglang/requirements-cpu-after.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
# sglang has no prebuilt CPU wheel on PyPI. install.sh performs a
|
||||
# from-source build using the upstream pyproject_cpu.toml recipe from
|
||||
# docker/xeon.Dockerfile when BUILD_TYPE is empty (CPU profile).
|
||||
7
backend/python/sglang/requirements-cpu.txt
Normal file
7
backend/python/sglang/requirements-cpu.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
accelerate
|
||||
torch==2.9.0
|
||||
torchvision
|
||||
torchaudio
|
||||
transformers
|
||||
intel-openmp; platform_machine == 'x86_64'
|
||||
3
backend/python/sglang/requirements-cublas12-after.txt
Normal file
3
backend/python/sglang/requirements-cublas12-after.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
# Bump this pin deliberately — sglang releases weekly and API surfaces
|
||||
# (FunctionCallParser, ReasoningParser) move between releases.
|
||||
sglang[all]>=0.4.0
|
||||
6
backend/python/sglang/requirements-cublas12.txt
Normal file
6
backend/python/sglang/requirements-cublas12.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu124
|
||||
accelerate
|
||||
torch==2.9.1
|
||||
torchvision
|
||||
torchaudio==2.9.1
|
||||
transformers
|
||||
2
backend/python/sglang/requirements-hipblas-after.txt
Normal file
2
backend/python/sglang/requirements-hipblas-after.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
# sglang's ROCm build is installed from source per docker/rocm.Dockerfile
|
||||
# upstream; install.sh handles the source build when BUILD_TYPE=hipblas.
|
||||
5
backend/python/sglang/requirements-hipblas.txt
Normal file
5
backend/python/sglang/requirements-hipblas.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/rocm7.0
|
||||
accelerate
|
||||
torch
|
||||
torchvision
|
||||
transformers
|
||||
6
backend/python/sglang/requirements-install.txt
Normal file
6
backend/python/sglang/requirements-install.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
# sglang and sgl-kernel do not declare full PEP517 build deps; install the
|
||||
# basic build tooling into the venv before pulling the rest of the stack.
|
||||
packaging
|
||||
setuptools
|
||||
wheel
|
||||
setuptools-scm
|
||||
2
backend/python/sglang/requirements-intel-after.txt
Normal file
2
backend/python/sglang/requirements-intel-after.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
# sglang's Intel XPU build is installed from source per docker/xpu.Dockerfile
|
||||
# upstream; install.sh handles the source build when BUILD_PROFILE=intel.
|
||||
7
backend/python/sglang/requirements-intel.txt
Normal file
7
backend/python/sglang/requirements-intel.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/xpu
|
||||
accelerate
|
||||
torch
|
||||
torchvision
|
||||
transformers
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
4
backend/python/sglang/requirements.txt
Normal file
4
backend/python/sglang/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
grpcio==1.80.0
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
||||
29
backend/python/sglang/run.sh
Executable file
29
backend/python/sglang/run.sh
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash
# Launch the sglang backend via libbackend.sh's startBackend helper,
# with the CPU-specific LD_PRELOAD / env setup applied when present.

# Quote path expansions so the launcher works from install roots with
# spaces (SC2086/SC2046).
backend_dir=$(dirname "$(realpath "$0")")

if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
else
    source "$backend_dir/../common/libbackend.sh"
fi

# sglang's CPU kernel references LLVM OpenMP (__kmpc_*) symbols that are
# not declared in its NEEDED list — they get resolved through LD_PRELOAD
# of libiomp5.so in sglang's own docker/xeon.Dockerfile. Do the same here.
# Harmless on GPU builds where libiomp5.so is absent.
if [ -f "${backend_dir}/lib/libiomp5.so" ]; then
    # Prepend ours; keep any pre-existing LD_PRELOAD entries after it.
    export LD_PRELOAD="${backend_dir}/lib/libiomp5.so${LD_PRELOAD:+:${LD_PRELOAD}}"

    # sglang CPU engine requires this env var to switch to the CPU backend.
    # See docker/xeon.Dockerfile in sglang upstream. libiomp5.so presence is
    # the CPU-build marker (package.sh only ships it for CPU profiles).
    export SGLANG_USE_CPU_ENGINE=1
fi

# "$@" (quoted) preserves argument boundaries; the original unquoted $@
# word-split any argument containing spaces.
startBackend "$@"
|
||||
@@ -12,6 +12,7 @@ const BACKENDS = [
|
||||
{ value: 'mlx-vlm', label: 'mlx-vlm' },
|
||||
{ value: 'transformers', label: 'transformers' },
|
||||
{ value: 'vllm', label: 'vllm' },
|
||||
{ value: 'sglang', label: 'sglang' },
|
||||
{ value: 'diffusers', label: 'diffusers' },
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user