mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-19 22:40:35 -04:00
feat(backends): add sglang (#9359)
* feat(backends): add sglang Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(sglang): force AVX-512 CXXFLAGS and disable CI e2e job sgl-kernel's shm.cpp uses __m512 AVX-512 intrinsics unconditionally; -march=native fails on CI runners without AVX-512 in /proc/cpuinfo. Force -march=sapphirerapids so the build always succeeds, matching sglang upstream's docker/xeon.Dockerfile recipe. The resulting binary still requires an AVX-512 capable CPU at runtime, so disable tests-sglang-grpc in test-extra.yml for the same reason tests-vllm-grpc is disabled. Local runs with make test-extra-backend-sglang still work on hosts with the right SIMD baseline. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(sglang): patch CMakeLists.txt instead of CXXFLAGS for AVX-512 CXXFLAGS with -march=sapphirerapids was being overridden by add_compile_options(-march=native) in sglang's CPU CMakeLists.txt, since CMake appends those flags after CXXFLAGS. Sed-patch the CMakeLists.txt directly after cloning to replace -march=native. --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
61d34ccb11
commit
b4e30692a2
52
.github/workflows/backend.yml
vendored
52
.github/workflows/backend.yml
vendored
@@ -66,6 +66,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-sglang'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'true'
|
||||
backend: "sglang"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
@@ -411,6 +424,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-12-sglang'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "sglang"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
@@ -1427,6 +1453,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-rocm-hipblas-sglang'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||
skip-drivers: 'false'
|
||||
backend: "sglang"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
@@ -1689,6 +1728,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'intel'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sglang'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "sglang"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'intel'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
|
||||
43
.github/workflows/test-extra.yml
vendored
43
.github/workflows/test-extra.yml
vendored
@@ -33,6 +33,7 @@ jobs:
|
||||
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
|
||||
turboquant: ${{ steps.detect.outputs.turboquant }}
|
||||
vllm: ${{ steps.detect.outputs.vllm }}
|
||||
sglang: ${{ steps.detect.outputs.sglang }}
|
||||
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
||||
qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
|
||||
voxtral: ${{ steps.detect.outputs.voxtral }}
|
||||
@@ -589,6 +590,48 @@ jobs:
|
||||
# - name: Build vllm (cpu) backend image and run gRPC e2e tests
|
||||
# run: |
|
||||
# make test-extra-backend-vllm
|
||||
# tests-sglang-grpc is currently disabled in CI for the same reason as
|
||||
# tests-vllm-grpc: sglang's CPU kernel (sgl-kernel) uses __m512 AVX-512
|
||||
# intrinsics unconditionally in shm.cpp, so the from-source build
|
||||
# requires `-march=sapphirerapids` (already set in install.sh) and the
|
||||
# resulting binary SIGILLs at import on CPUs without AVX-512 VNNI/BF16.
|
||||
# The ubuntu-latest runner pool does not guarantee that ISA baseline.
|
||||
#
|
||||
# The test itself (tests/e2e-backends + make test-extra-backend-sglang)
|
||||
# is fully working and validated locally on a host with the right
|
||||
# SIMD baseline. Run it manually with:
|
||||
#
|
||||
# make test-extra-backend-sglang
|
||||
#
|
||||
# Re-enable this job once we have a self-hosted runner label with
|
||||
# guaranteed AVX-512 VNNI/BF16 support.
|
||||
#
|
||||
# tests-sglang-grpc:
|
||||
# needs: detect-changes
|
||||
# if: needs.detect-changes.outputs.sglang == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
# runs-on: bigger-runner
|
||||
# timeout-minutes: 90
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install -y --no-install-recommends \
|
||||
# make build-essential curl unzip ca-certificates git tar
|
||||
# - name: Setup Go
|
||||
# uses: actions/setup-go@v5
|
||||
# with:
|
||||
# go-version: '1.25.4'
|
||||
# - name: Free disk space
|
||||
# run: |
|
||||
# sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
|
||||
# df -h
|
||||
# - name: Build sglang (cpu) backend image and run gRPC e2e tests
|
||||
# run: |
|
||||
# make test-extra-backend-sglang
|
||||
tests-acestep-cpp:
|
||||
needs: detect-changes
|
||||
if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
|
||||
Reference in New Issue
Block a user