feat(backends): add sglang (#9359)

* feat(backends): add sglang

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(sglang): force AVX-512 CXXFLAGS and disable CI e2e job

sgl-kernel's shm.cpp uses __m512 AVX-512 intrinsics unconditionally;
-march=native fails on CI runners without AVX-512 in /proc/cpuinfo.
Force -march=sapphirerapids so the build always succeeds, matching
sglang upstream's docker/xeon.Dockerfile recipe.

The resulting binary still requires an AVX-512 capable CPU at runtime,
so disable tests-sglang-grpc in test-extra.yml for the same reason
tests-vllm-grpc is disabled. Local runs with make test-extra-backend-sglang
still work on hosts with the right SIMD baseline.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(sglang): patch CMakeLists.txt instead of CXXFLAGS for AVX-512

CXXFLAGS with -march=sapphirerapids was being overridden by
add_compile_options(-march=native) in sglang's CPU CMakeLists.txt,
since CMake appends those flags after CXXFLAGS. Sed-patch the
CMakeLists.txt directly after cloning to replace -march=native.

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-04-16 22:40:56 +02:00
committed by GitHub
parent 61d34ccb11
commit b4e30692a2
21 changed files with 926 additions and 2 deletions

View File

@@ -66,6 +66,19 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-sglang'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'true'
backend: "sglang"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
@@ -411,6 +424,19 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-sglang'
runs-on: 'arc-runner-set'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "sglang"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
@@ -1427,6 +1453,19 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-sglang'
runs-on: 'arc-runner-set'
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
skip-drivers: 'false'
backend: "sglang"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
@@ -1689,6 +1728,19 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'intel'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sglang'
runs-on: 'arc-runner-set'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "sglang"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'intel'
cuda-major-version: ""
cuda-minor-version: ""

View File

@@ -33,6 +33,7 @@ jobs:
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
turboquant: ${{ steps.detect.outputs.turboquant }}
vllm: ${{ steps.detect.outputs.vllm }}
sglang: ${{ steps.detect.outputs.sglang }}
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
voxtral: ${{ steps.detect.outputs.voxtral }}
@@ -589,6 +590,48 @@ jobs:
# - name: Build vllm (cpu) backend image and run gRPC e2e tests
# run: |
# make test-extra-backend-vllm
# tests-sglang-grpc is currently disabled in CI for the same reason as
# tests-vllm-grpc: sglang's CPU kernel (sgl-kernel) uses __m512 AVX-512
# intrinsics unconditionally in shm.cpp, so the from-source build
# requires `-march=sapphirerapids` (already set in install.sh) and the
# resulting binary SIGILLs at import on CPUs without AVX-512 VNNI/BF16.
# The ubuntu-latest runner pool does not guarantee that ISA baseline.
#
# The test itself (tests/e2e-backends + make test-extra-backend-sglang)
# is fully working and validated locally on a host with the right
# SIMD baseline. Run it manually with:
#
# make test-extra-backend-sglang
#
# Re-enable this job once we have a self-hosted runner label with
# guaranteed AVX-512 VNNI/BF16 support.
#
# tests-sglang-grpc:
# needs: detect-changes
# if: needs.detect-changes.outputs.sglang == 'true' || needs.detect-changes.outputs.run-all == 'true'
# runs-on: bigger-runner
# timeout-minutes: 90
# steps:
# - name: Clone
# uses: actions/checkout@v6
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install -y --no-install-recommends \
# make build-essential curl unzip ca-certificates git tar
# - name: Setup Go
# uses: actions/setup-go@v5
# with:
# go-version: '1.25.4'
# - name: Free disk space
# run: |
# sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
# df -h
# - name: Build sglang (cpu) backend image and run gRPC e2e tests
# run: |
# make test-extra-backend-sglang
tests-acestep-cpp:
needs: detect-changes
if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'