From c69380e866389d37679f7aa55daf74ebad4bbfcd Mon Sep 17 00:00:00 2001 From: leslie2046 <253605712@qq.com> Date: Thu, 5 Feb 2026 11:38:07 +0800 Subject: [PATCH 1/4] FEAT: added support qwen3-asr models --- xinference/constants.py | 11 +- xinference/model/audio/core.py | 5 + xinference/model/audio/qwen3_asr.py | 155 ++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 5 deletions(-) create mode 100644 xinference/model/audio/qwen3_asr.py diff --git a/xinference/constants.py b/xinference/constants.py index 8db65ac13e..6b5758a86d 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -44,11 +44,12 @@ def get_xinference_home() -> str: home_path = os.environ.get(XINFERENCE_ENV_HOME_PATH) if home_path is None: home_path = str(Path.home() / ".xinference") - else: - # if user has already set `XINFERENCE_HOME` env, change huggingface and modelscope default download path - os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(home_path, "huggingface") - os.environ["MODELSCOPE_CACHE"] = os.path.join(home_path, "modelscope") - os.environ["XDG_CACHE_HOME"] = os.path.join(home_path, "openmind_hub") + # Always change huggingface, modelscope, and openmind_hub default download path + # to ensure xinference process has write permissions for downloading dependencies + # (e.g., Qwen3-ASR's forced aligner model downloaded from Hugging Face Hub) + os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(home_path, "huggingface") + os.environ["MODELSCOPE_CACHE"] = os.path.join(home_path, "modelscope") + os.environ["XDG_CACHE_HOME"] = os.path.join(home_path, "openmind_hub") # In multi-tenant mode, # gradio's temporary files are stored in their respective home directories, # to prevent insufficient permissions diff --git a/xinference/model/audio/core.py b/xinference/model/audio/core.py index 373500b882..38089121be 100644 --- a/xinference/model/audio/core.py +++ b/xinference/model/audio/core.py @@ -29,6 +29,7 @@ from .kokoro_zh import KokoroZHModel from .megatts import 
MegaTTSModel from .melotts import MeloTTSModel +from .qwen3_asr import Qwen3ASRModel from .whisper import WhisperModel from .whisper_mlx import WhisperMLXModel @@ -155,6 +156,7 @@ def create_audio_model_instance( KokoroZHModel, MegaTTSModel, Indextts2, + Qwen3ASRModel, ]: from ..cache_manager import CacheManager @@ -178,6 +180,7 @@ def create_audio_model_instance( KokoroZHModel, MegaTTSModel, Indextts2, + Qwen3ASRModel, ] if model_spec.model_family == "whisper": if not model_spec.engine: @@ -208,6 +211,8 @@ def create_audio_model_instance( model = MegaTTSModel(model_uid, model_path, model_spec, **kwargs) elif model_spec.model_family == "IndexTTS2": model = Indextts2(model_uid, model_path, model_spec, **kwargs) + elif model_spec.model_family == "qwen3_asr": + model = Qwen3ASRModel(model_uid, model_path, model_spec, **kwargs) else: raise Exception(f"Unsupported audio model family: {model_spec.model_family}") return model diff --git a/xinference/model/audio/qwen3_asr.py b/xinference/model/audio/qwen3_asr.py new file mode 100644 index 0000000000..a984dae279 --- /dev/null +++ b/xinference/model/audio/qwen3_asr.py @@ -0,0 +1,155 @@ +# Copyright 2022-2026 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging
import os
import tempfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

if TYPE_CHECKING:
    from .core import AudioModelFamilyV2

logger = logging.getLogger(__name__)


class Qwen3ASRModel:
    """Audio-to-text wrapper around the ``qwen_asr`` Qwen3-ASR implementation.

    The instance is cheap to construct; the underlying model is only created
    by :meth:`load`. :meth:`transcriptions` returns OpenAI-style ``json`` or
    ``verbose_json`` payloads, while :meth:`translations` always raises.
    """

    # Response formats that `transcriptions` knows how to build.
    _SUPPORTED_RESPONSE_FORMATS = ("json", "verbose_json")

    def __init__(
        self,
        model_uid: str,
        model_path: str,
        model_spec: "AudioModelFamilyV2",
        device: Optional[str] = None,
        **kwargs,
    ):
        """Store the configuration; no model is loaded here.

        Args:
            model_uid: Identifier of this model instance.
            model_path: Path handed to ``from_pretrained`` in :meth:`load`.
            model_spec: Model family spec; its ``model_ability`` and optional
                ``default_model_config`` / ``default_transcription_config``
                attributes are read by this class.
            device: Device to load on. When ``None`` one is picked in
                :meth:`load`.
            **kwargs: Extra ``from_pretrained`` arguments; they override the
                spec's ``default_model_config``.
        """
        self.model_family = model_spec
        self._model_uid = model_uid
        self._model_path = model_path
        self._model_spec = model_spec
        self._device = device
        self._model = None
        self._kwargs = kwargs

    @property
    def model_ability(self):
        """Abilities declared by the model spec."""
        return self._model_spec.model_ability

    def _build_init_kwargs(self, dtype: Any) -> Dict[str, Any]:
        """Assemble the keyword arguments passed to ``from_pretrained``.

        Precedence (lowest to highest): ``device_map`` / ``dtype`` fallbacks,
        the spec's ``default_model_config``, then the kwargs given to
        ``__init__``. When a ``forced_aligner`` is configured, its kwargs get
        the same ``device_map`` / ``dtype`` fallbacks.

        Args:
            dtype: Fallback dtype used when none was configured explicitly.

        Returns:
            A new dict; neither the spec's config nor the caller's kwargs are
            modified.
        """
        init_kwargs: Dict[str, Any] = dict(
            getattr(self._model_spec, "default_model_config", None) or {}
        )
        init_kwargs.update(self._kwargs)
        init_kwargs.setdefault("device_map", self._device)
        init_kwargs.setdefault("dtype", dtype)
        if "forced_aligner" in init_kwargs:
            # Copy before filling in defaults: the nested dict is shared with
            # the model spec (or the caller's kwargs) and must not be mutated.
            forced_aligner_kwargs = dict(
                init_kwargs.get("forced_aligner_kwargs") or {}
            )
            forced_aligner_kwargs.setdefault("device_map", self._device)
            forced_aligner_kwargs.setdefault("dtype", dtype)
            init_kwargs["forced_aligner_kwargs"] = forced_aligner_kwargs
        return init_kwargs

    def load(self):
        """Create the underlying Qwen3-ASR model.

        Raises:
            ImportError: If the ``qwen_asr`` package cannot be imported.
            ValueError: If an explicitly requested device is not available.
        """
        try:
            from qwen_asr import Qwen3ASRModel as QwenASR
        except ImportError as err:
            error_message = "Failed to import module 'qwen_asr'"
            installation_guide = [
                "Please make sure 'qwen-asr' is installed. ",
                "You can install it by `pip install qwen-asr`\n",
            ]
            raise ImportError(
                f"{error_message}\n\n{''.join(installation_guide)}"
            ) from err

        # Project-local device helpers are resolved at load time, next to the
        # optional `qwen_asr` dependency.
        from ...device_utils import (
            get_available_device,
            get_device_preferred_dtype,
            is_device_available,
        )

        if self._device is None:
            self._device = get_available_device()
        elif not is_device_available(self._device):
            raise ValueError(f"Device {self._device} is not available!")

        init_kwargs = self._build_init_kwargs(
            get_device_preferred_dtype(self._device)
        )
        logger.debug("Loading Qwen3-ASR model with kwargs: %s", init_kwargs)
        self._model = QwenASR.from_pretrained(self._model_path, **init_kwargs)

    def _extract_text_and_language(self, result) -> Tuple[str, Optional[str]]:
        """Normalize a transcription result into ``(text, language)``.

        A list is reduced to its first element (an empty list yields
        ``("", None)``). Objects with a ``text`` attribute and dicts with a
        ``text`` / ``transcript`` key are supported; anything else is
        stringified and reported without a language.
        """
        if isinstance(result, list):
            if not result:
                return "", None
            result = result[0]

        if hasattr(result, "text"):
            return result.text, getattr(result, "language", None)

        if isinstance(result, dict):
            text = result.get("text") or result.get("transcript") or ""
            return text, result.get("language")

        return str(result), None

    def transcriptions(
        self,
        audio: bytes,
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: str = "json",
        temperature: float = 0,
        timestamp_granularities: Optional[List[str]] = None,
        **kwargs,
    ):
        """Transcribe raw audio bytes.

        Args:
            audio: Encoded audio content; it is written to a temporary file
                whose path is handed to the model.
            language: Language passed to the model. When ``None``, a
                ``language`` entry from the spec's
                ``default_transcription_config`` is used if present.
            prompt: Ignored (a warning is logged when given).
            response_format: ``"json"`` or ``"verbose_json"``.
            temperature: Must be ``0``.
            timestamp_granularities: Must be ``None``.
            **kwargs: Extra arguments forwarded to the model's ``transcribe``;
                they override the spec's ``default_transcription_config``.

        Returns:
            ``{"text": ...}`` for ``json``; for ``verbose_json`` additionally
            ``task`` and the ``language`` reported by the model.

        Raises:
            RuntimeError: For unsupported ``temperature`` /
                ``timestamp_granularities`` or when the model is not loaded.
            ValueError: For an unsupported ``response_format``.
        """
        if temperature != 0:
            raise RuntimeError("`temperature` is not supported for Qwen3-ASR")
        if timestamp_granularities is not None:
            raise RuntimeError(
                "`timestamp_granularities` is not supported for Qwen3-ASR"
            )
        # Reject bad requests before doing any inference work.
        if response_format not in self._SUPPORTED_RESPONSE_FORMATS:
            raise ValueError(f"Unsupported response format: {response_format}")
        if self._model is None:
            raise RuntimeError("Qwen3-ASR model is not loaded, call `load()` first")
        if prompt is not None:
            logger.warning(
                "Prompt for Qwen3-ASR transcriptions will be ignored: %s", prompt
            )

        call_kwargs = dict(
            getattr(self._model_spec, "default_transcription_config", None) or {}
        )
        call_kwargs.update(kwargs)
        # Merge `language` into the kwargs so that a configured default cannot
        # collide with the explicit argument as a duplicate keyword.
        if language is not None:
            call_kwargs["language"] = language
        else:
            call_kwargs.setdefault("language", None)

        # The file is closed before the model reads it: reopening a still-open
        # NamedTemporaryFile by name is not portable (it fails on Windows).
        with tempfile.TemporaryDirectory() as tmp_dir:
            audio_path = os.path.join(tmp_dir, "audio")
            with open(audio_path, "wb") as f:
                f.write(audio)
            result = self._model.transcribe(audio=audio_path, **call_kwargs)

        text, detected_language = self._extract_text_and_language(result)
        if response_format == "json":
            return {"text": text}
        return {
            "task": "transcribe",
            "language": detected_language,
            "text": text,
        }

    def translations(
        self,
        audio: bytes,
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: str = "json",
        temperature: float = 0,
        timestamp_granularities: Optional[List[str]] = None,
        **kwargs,
    ):
        """Always raise: Qwen3-ASR has no translations API.

        ``**kwargs`` is accepted so that callers passing extra options get
        this ``RuntimeError`` rather than a ``TypeError``.
        """
        raise RuntimeError("Qwen3-ASR does not support translations API")