diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index a7813393e..1349a8101 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -105,6 +105,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "9" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-qwen-asr' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "qwen-asr" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "9" @@ -366,6 +379,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-qwen-asr' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "qwen-asr" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -470,6 +496,19 @@ jobs: backend: "vibevoice" dockerfile: "./backend/Dockerfile.python" context: "./" + - build-type: 'l4t' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-qwen-asr' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + ubuntu-version: '2404' + backend: "qwen-asr" + dockerfile: "./backend/Dockerfile.python" + context: "./" - build-type: 'l4t' cuda-major-version: "13" cuda-minor-version: "0" @@ -732,6 +771,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-qwen-asr' + runs-on: 'arc-runner-set' + base-image: "rocm/dev-ubuntu-24.04:6.4.4" + skip-drivers: 'false' + backend: "qwen-asr" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -889,6 +941,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2204' + - build-type: 'l4t' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-qwen-asr' + runs-on: 'ubuntu-24.04-arm' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + skip-drivers: 'true' + backend: "qwen-asr" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2204' - build-type: 'l4t' cuda-major-version: "12" cuda-minor-version: "0" @@ -968,6 +1033,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'intel' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-qwen-asr' + runs-on: 'arc-runner-set' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "qwen-asr" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -1380,6 +1458,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-qwen-asr' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "qwen-asr" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" cuda-minor-version: "" diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 945beafb5..a473c8904 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -304,6 +304,25 @@ jobs: run: | make --jobs=5 --output-sync=target -C backend/python/qwen-tts make --jobs=5 --output-sync=target -C backend/python/qwen-tts test + tests-qwen-asr: + runs-on: ubuntu-latest + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install -y build-essential ffmpeg sox + sudo apt-get install -y ca-certificates cmake curl patch python3-pip + # Install UV + curl -LsSf https://astral.sh/uv/install.sh | sh + pip install --user --no-cache-dir grpcio-tools==1.64.1 + - name: Test qwen-asr + run: | + make --jobs=5 --output-sync=target -C backend/python/qwen-asr + make --jobs=5 --output-sync=target -C backend/python/qwen-asr test tests-voxcpm: runs-on: ubuntu-latest steps: diff --git a/Makefile b/Makefile index 6580d7486..7f4941532 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/voxcpm +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/voxcpm GOCMD=go GOTEST=$(GOCMD) test @@ -319,6 +319,7 @@ prepare-test-extra: protogen-python $(MAKE) -C backend/python/moonshine $(MAKE) -C backend/python/pocket-tts $(MAKE) -C backend/python/qwen-tts + $(MAKE) -C backend/python/qwen-asr $(MAKE) -C backend/python/voxcpm test-extra: prepare-test-extra @@ -331,6 +332,7 @@ test-extra: prepare-test-extra $(MAKE) -C backend/python/moonshine test $(MAKE) -C backend/python/pocket-tts test $(MAKE) -C backend/python/qwen-tts test + $(MAKE) -C backend/python/qwen-asr test $(MAKE) -C backend/python/voxcpm test DOCKER_IMAGE?=local-ai @@ -464,6 +466,7 @@ BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true BACKEND_MOONSHINE = moonshine|python|.|false|true BACKEND_POCKET_TTS = pocket-tts|python|.|false|true BACKEND_QWEN_TTS = qwen-tts|python|.|false|true +BACKEND_QWEN_ASR = qwen-asr|python|.|false|true BACKEND_VOXCPM = voxcpm|python|.|false|true # Helper function to build docker image for a backend @@ -510,13 +513,14 @@ $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE))) $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE))) $(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS))) $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS))) +$(eval $(call generate-docker-build-target,$(BACKEND_QWEN_ASR))) $(eval $(call generate-docker-build-target,$(BACKEND_VOXCPM))) # Pattern rule for docker-save targets docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-voxcpm +docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-voxcpm ######################################################## ### END Backends diff --git a/backend/index.yaml b/backend/index.yaml index 50c1b0288..e666cb14a 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -414,6 +414,28 @@ nvidia-l4t-cuda-12: "nvidia-l4t-qwen-tts" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-tts" icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png +- &qwen-asr + urls: + - https://github.com/QwenLM/Qwen3-ASR + description: | + Qwen3-ASR is an automatic speech recognition model supporting multiple languages and batch inference. + tags: + - speech-recognition + - ASR + license: apache-2.0 + name: "qwen-asr" + alias: "qwen-asr" + capabilities: + nvidia: "cuda12-qwen-asr" + intel: "intel-qwen-asr" + amd: "rocm-qwen-asr" + nvidia-l4t: "nvidia-l4t-qwen-asr" + default: "cpu-qwen-asr" + nvidia-cuda-13: "cuda13-qwen-asr" + nvidia-cuda-12: "cuda12-qwen-asr" + nvidia-l4t-cuda-12: "nvidia-l4t-qwen-asr" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-asr" + icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png - &voxcpm urls: - https://github.com/ModelBest/VoxCPM @@ -1671,6 +1693,89 @@ uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts" mirrors: - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts +## qwen-asr +- !!merge <<: *qwen-asr + name: "qwen-asr-development" + capabilities: + nvidia: "cuda12-qwen-asr-development" + intel: "intel-qwen-asr-development" + amd: "rocm-qwen-asr-development" + nvidia-l4t: "nvidia-l4t-qwen-asr-development" + default: "cpu-qwen-asr-development" + nvidia-cuda-13: "cuda13-qwen-asr-development" + nvidia-cuda-12: "cuda12-qwen-asr-development" + nvidia-l4t-cuda-12: "nvidia-l4t-qwen-asr-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-asr-development" +- !!merge <<: *qwen-asr + name: "cpu-qwen-asr" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-qwen-asr" + mirrors: + - localai/localai-backends:latest-cpu-qwen-asr +- !!merge <<: *qwen-asr + name: "cpu-qwen-asr-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-qwen-asr" + mirrors: + - localai/localai-backends:master-cpu-qwen-asr +- !!merge <<: *qwen-asr + name: "cuda12-qwen-asr" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-qwen-asr" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-qwen-asr +- !!merge <<: *qwen-asr + name: "cuda12-qwen-asr-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-qwen-asr" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-qwen-asr +- !!merge <<: *qwen-asr + name: "cuda13-qwen-asr" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-qwen-asr" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-qwen-asr +- !!merge <<: *qwen-asr + name: "cuda13-qwen-asr-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-qwen-asr" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-qwen-asr +- !!merge <<: *qwen-asr + name: "intel-qwen-asr" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-qwen-asr" + mirrors: + - localai/localai-backends:latest-gpu-intel-qwen-asr +- !!merge <<: *qwen-asr + name: "intel-qwen-asr-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-qwen-asr" + mirrors: + - localai/localai-backends:master-gpu-intel-qwen-asr +- !!merge <<: *qwen-asr + name: "rocm-qwen-asr" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-qwen-asr" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-qwen-asr +- !!merge <<: *qwen-asr + name: "rocm-qwen-asr-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-qwen-asr" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-qwen-asr +- !!merge <<: *qwen-asr + name: "nvidia-l4t-qwen-asr" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-qwen-asr" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-qwen-asr +- !!merge <<: *qwen-asr + name: "nvidia-l4t-qwen-asr-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-qwen-asr" + mirrors: + - localai/localai-backends:master-nvidia-l4t-qwen-asr +- !!merge <<: *qwen-asr + name: "cuda13-nvidia-l4t-arm64-qwen-asr" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-qwen-asr" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-qwen-asr +- !!merge <<: *qwen-asr + name: "cuda13-nvidia-l4t-arm64-qwen-asr-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-asr" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-asr ## voxcpm - !!merge <<: *voxcpm name: "voxcpm-development" diff --git a/backend/python/qwen-asr/Makefile b/backend/python/qwen-asr/Makefile new file mode 100644 index 000000000..5fd27aa3b --- /dev/null +++ b/backend/python/qwen-asr/Makefile @@ -0,0 +1,25 @@ +.DEFAULT_GOAL := install + +.PHONY: qwen-asr +qwen-asr: + bash install.sh + +.PHONY: run +run: qwen-asr + @echo "Running qwen-asr..." + bash run.sh + @echo "qwen-asr run." + +.PHONY: test +test: qwen-asr + @echo "Testing qwen-asr..." + bash test.sh + @echo "qwen-asr tested." + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py new file mode 100644 index 000000000..53660c82e --- /dev/null +++ b/backend/python/qwen-asr/backend.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +""" +gRPC server of LocalAI for Qwen3-ASR (transformers backend, non-vLLM). +""" +from concurrent import futures +import time +import argparse +import signal +import sys +import os +import backend_pb2 +import backend_pb2_grpc +import torch +from qwen_asr import Qwen3ASRModel + +import grpc + + +def is_float(s): + try: + float(s) + return True + except ValueError: + return False + + +def is_int(s): + try: + int(s) + return True + except ValueError: + return False + + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + + +class BackendServicer(backend_pb2_grpc.BackendServicer): + def Health(self, request, context): + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + def LoadModel(self, request, context): + if torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" + if not torch.cuda.is_available() and request.CUDA: + return backend_pb2.Result(success=False, message="CUDA is not available") + + self.device = device + self.options = {} + + for opt in request.Options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) + if is_float(value): + value = float(value) + elif is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + self.options[key] = value + + model_path = request.Model or "Qwen/Qwen3-ASR-1.7B" + default_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32 + load_dtype = default_dtype + if "torch_dtype" in self.options: + d = str(self.options["torch_dtype"]).lower() + if d == "fp16": + load_dtype = torch.float16 + elif d == "bf16": + load_dtype = torch.bfloat16 + elif d == "fp32": + load_dtype = torch.float32 + del self.options["torch_dtype"] + + self.max_inference_batch_size = self.options.get("max_inference_batch_size", 32) + self.max_new_tokens = self.options.get("max_new_tokens", 256) + + forced_aligner = self.options.get("forced_aligner") + if forced_aligner is not None and isinstance(forced_aligner, str): + forced_aligner = forced_aligner.strip() or None + attn_implementation = self.options.get("attn_implementation") + if attn_implementation is not None and isinstance(attn_implementation, str): + attn_implementation = attn_implementation.strip() or None + + if self.device == "mps": + device_map = None + elif self.device == "cuda": + device_map = "cuda:0" + else: + device_map = "cpu" + + load_kwargs = dict( + dtype=load_dtype, + device_map=device_map, + max_inference_batch_size=self.max_inference_batch_size, + max_new_tokens=self.max_new_tokens, + ) + if attn_implementation: + load_kwargs["attn_implementation"] = attn_implementation + if forced_aligner: + load_kwargs["forced_aligner"] = forced_aligner + forced_aligner_kwargs = dict( + dtype=load_dtype, + device_map=device_map, + ) + if attn_implementation: + forced_aligner_kwargs["attn_implementation"] = attn_implementation + load_kwargs["forced_aligner_kwargs"] = forced_aligner_kwargs + + try: + print(f"Loading Qwen3-ASR from {model_path}", file=sys.stderr) + if attn_implementation: + print(f"Using attn_implementation: {attn_implementation}", file=sys.stderr) + if forced_aligner: + print(f"Loading with forced_aligner: {forced_aligner}", file=sys.stderr) + self.model = Qwen3ASRModel.from_pretrained(model_path, **load_kwargs) + print("Qwen3-ASR model loaded successfully", file=sys.stderr) + except Exception as err: + print(f"[ERROR] LoadModel failed: {err}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return backend_pb2.Result(success=False, message=str(err)) + + return backend_pb2.Result(message="Model loaded successfully", success=True) + + def AudioTranscription(self, request, context): + result_segments = [] + text = "" + try: + audio_path = request.dst + if not audio_path or not os.path.exists(audio_path): + print(f"Error: Audio file not found: {audio_path}", file=sys.stderr) + return backend_pb2.TranscriptResult(segments=[], text="") + + language = None + if request.language and request.language.strip(): + language = request.language.strip() + + results = self.model.transcribe(audio=audio_path, language=language) + + if not results: + return backend_pb2.TranscriptResult(segments=[], text="") + + r = results[0] + text = r.text or "" + + if getattr(r, 'time_stamps', None) and len(r.time_stamps) > 0: + for idx, ts in enumerate(r.time_stamps): + start_ms = 0 + end_ms = 0 + seg_text = text + if isinstance(ts, (list, tuple)) and len(ts) >= 3: + start_ms = int(float(ts[0]) * 1000) if ts[0] is not None else 0 + end_ms = int(float(ts[1]) * 1000) if ts[1] is not None else 0 + seg_text = ts[2] if len(ts) > 2 and ts[2] is not None else "" + result_segments.append(backend_pb2.TranscriptSegment( + id=idx, start=start_ms, end=end_ms, text=seg_text + )) + else: + if text: + result_segments.append(backend_pb2.TranscriptSegment( + id=0, start=0, end=0, text=text + )) + except Exception as err: + print(f"Error in AudioTranscription: {err}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return backend_pb2.TranscriptResult(segments=[], text="") + + return backend_pb2.TranscriptResult(segments=result_segments, text=text) + + +def serve(address): + server = grpc.server( + futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), + ('grpc.max_send_message_length', 50 * 1024 * 1024), + ('grpc.max_receive_message_length', 50 * 1024 * 1024), + ]) + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + server.add_insecure_port(address) + server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + + def signal_handler(sig, frame): + print("Received termination signal. Shutting down...") + server.stop(0) + sys.exit(0) + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + server.stop(0) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.") + args = parser.parse_args() + serve(args.addr) diff --git a/backend/python/qwen-asr/install.sh b/backend/python/qwen-asr/install.sh new file mode 100644 index 000000000..71c9e79a9 --- /dev/null +++ b/backend/python/qwen-asr/install.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e + +EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation" + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +if [ "x${BUILD_PROFILE}" == "xintel" ]; then + EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" +fi + +PYTHON_VERSION="3.12" +PYTHON_PATCH="12" +PY_STANDALONE_TAG="20251120" + +installRequirements diff --git a/backend/python/qwen-asr/requirements-cpu.txt b/backend/python/qwen-asr/requirements-cpu.txt new file mode 100644 index 000000000..d14849103 --- /dev/null +++ b/backend/python/qwen-asr/requirements-cpu.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +torch +qwen-asr diff --git a/backend/python/qwen-asr/requirements-cublas12-after.txt b/backend/python/qwen-asr/requirements-cublas12-after.txt new file mode 100644 index 000000000..7bfe8efeb --- /dev/null +++ b/backend/python/qwen-asr/requirements-cublas12-after.txt @@ -0,0 +1 @@ +flash-attn \ No newline at end of file diff --git a/backend/python/qwen-asr/requirements-cublas12.txt b/backend/python/qwen-asr/requirements-cublas12.txt new file mode 100644 index 000000000..a09881656 --- /dev/null +++ b/backend/python/qwen-asr/requirements-cublas12.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +torch +qwen-asr diff --git a/backend/python/qwen-asr/requirements-cublas13.txt b/backend/python/qwen-asr/requirements-cublas13.txt new file mode 100644 index 000000000..0780883c9 --- /dev/null +++ b/backend/python/qwen-asr/requirements-cublas13.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +qwen-asr diff --git a/backend/python/qwen-asr/requirements-hipblas.txt b/backend/python/qwen-asr/requirements-hipblas.txt new file mode 100644 index 000000000..6871f93f5 --- /dev/null +++ b/backend/python/qwen-asr/requirements-hipblas.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/rocm6.3 +torch==2.7.1+rocm6.3 +qwen-asr diff --git a/backend/python/qwen-asr/requirements-intel-after.txt b/backend/python/qwen-asr/requirements-intel-after.txt new file mode 100644 index 000000000..d0f509936 --- /dev/null +++ b/backend/python/qwen-asr/requirements-intel-after.txt @@ -0,0 +1 @@ +flash-attn diff --git a/backend/python/qwen-asr/requirements-intel.txt b/backend/python/qwen-asr/requirements-intel.txt new file mode 100644 index 000000000..2c8e1bc44 --- /dev/null +++ b/backend/python/qwen-asr/requirements-intel.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/xpu +torch +qwen-asr diff --git a/backend/python/qwen-asr/requirements-l4t12.txt b/backend/python/qwen-asr/requirements-l4t12.txt new file mode 100644 index 000000000..c65ec756a --- /dev/null +++ b/backend/python/qwen-asr/requirements-l4t12.txt @@ -0,0 +1,3 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/ +torch +qwen-asr diff --git a/backend/python/qwen-asr/requirements-l4t13.txt b/backend/python/qwen-asr/requirements-l4t13.txt new file mode 100644 index 000000000..0780883c9 --- /dev/null +++ b/backend/python/qwen-asr/requirements-l4t13.txt @@ -0,0 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +torch +qwen-asr diff --git a/backend/python/qwen-asr/requirements-mps.txt b/backend/python/qwen-asr/requirements-mps.txt new file mode 100644 index 000000000..8bb38dee8 --- /dev/null +++ b/backend/python/qwen-asr/requirements-mps.txt @@ -0,0 +1,2 @@ +torch==2.7.1 +qwen-asr diff --git a/backend/python/qwen-asr/requirements.txt b/backend/python/qwen-asr/requirements.txt new file mode 100644 index 000000000..9ce0da738 --- /dev/null +++ b/backend/python/qwen-asr/requirements.txt @@ -0,0 +1,5 @@ +grpcio==1.71.0 +protobuf +certifi +packaging==24.1 +setuptools diff --git a/backend/python/qwen-asr/run.sh b/backend/python/qwen-asr/run.sh new file mode 100644 index 000000000..eae121f37 --- /dev/null +++ b/backend/python/qwen-asr/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ diff --git a/backend/python/qwen-asr/test.py b/backend/python/qwen-asr/test.py new file mode 100644 index 000000000..4968f238a --- /dev/null +++ b/backend/python/qwen-asr/test.py @@ -0,0 +1,94 @@ +""" +Tests for the Qwen3-ASR gRPC backend. +""" +import unittest +import subprocess +import time +import os +import tempfile +import shutil +import backend_pb2 +import backend_pb2_grpc + +import grpc + +# Skip heavy transcription test in CI (model download + inference) +SKIP_ASR_TESTS = os.environ.get("SKIP_ASR_TESTS", "false").lower() == "true" + + +class TestBackendServicer(unittest.TestCase): + def setUp(self): + self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"]) + time.sleep(15) + + def tearDown(self): + self.service.terminate() + self.service.wait() + + def test_server_startup(self): + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.Health(backend_pb2.HealthMessage()) + self.assertEqual(response.message, b'OK') + except Exception as err: + print(err) + self.fail("Server failed to start") + finally: + self.tearDown() + + def test_load_model(self): + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="Qwen/Qwen3-ASR-1.7B")) + self.assertTrue(response.success, response.message) + self.assertEqual(response.message, "Model loaded successfully") + except Exception as err: + print(err) + self.fail("LoadModel service failed") + finally: + self.tearDown() + + @unittest.skipIf(SKIP_ASR_TESTS, "ASR transcription test skipped (SKIP_ASR_TESTS=true)") + def test_audio_transcription(self): + temp_dir = tempfile.mkdtemp() + audio_file = os.path.join(temp_dir, 'audio.wav') + try: + url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav" + result = subprocess.run( + ["wget", "-q", url, "-O", audio_file], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + self.skipTest(f"Could not download sample audio: {result.stderr}") + if not os.path.exists(audio_file): + self.skipTest("Sample audio file not found after download") + + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + load_response = stub.LoadModel(backend_pb2.ModelOptions(Model="Qwen/Qwen3-ASR-0.6B")) + self.assertTrue(load_response.success, load_response.message) + + transcript_response = stub.AudioTranscription( + backend_pb2.TranscriptRequest(dst=audio_file) + ) + self.assertIsNotNone(transcript_response) + self.assertIsNotNone(transcript_response.text) + self.assertGreaterEqual(len(transcript_response.segments), 0) + all_text = "" + for segment in transcript_response.segments: + all_text += segment.text + print(f"All text: {all_text}") + self.assertIn("big", all_text) + if transcript_response.segments: + self.assertIsNotNone(transcript_response.segments[0].text) + finally: + self.tearDown() + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) diff --git a/backend/python/qwen-asr/test.sh b/backend/python/qwen-asr/test.sh new file mode 100644 index 000000000..eb59f2aaf --- /dev/null +++ b/backend/python/qwen-asr/test.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests diff --git a/gallery/index.yaml b/gallery/index.yaml index 33a2f8029..89f34c089 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -94,6 +94,34 @@ voice: Aiden # Available speakers: Vivian, Serena, Uncle_Fu, Dylan, Eric, Ryan, Aiden, Ono_Anna, Sohee parameters: model: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +- &qwen-asr + urls: + - https://huggingface.co/Qwen/Qwen3-ASR-1.7B + description: | + Qwen3-ASR is an automatic speech recognition model supporting multiple languages and batch inference. + tags: + - speech-recognition + - ASR + license: apache-2.0 + icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png + name: "qwen3-asr-1.7b" + url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + overrides: + backend: qwen-asr + known_usecases: + - transcript + parameters: + model: Qwen/Qwen3-ASR-1.7B +- !!merge <<: *qwen-asr + urls: + - https://huggingface.co/Qwen/Qwen3-ASR-0.6B + name: "qwen3-asr-0.6b" + overrides: + backend: qwen-asr + known_usecases: + - transcript + parameters: + model: Qwen/Qwen3-ASR-0.6B - name: "huihui-glm-4.7-flash-abliterated-i1" url: "github:mudler/LocalAI/gallery/virtual.yaml@master" urls: