diff --git a/.cursorignore b/.cursorignore index 35b9ef8..96e1c6e 100644 --- a/.cursorignore +++ b/.cursorignore @@ -4,4 +4,5 @@ assets/ benchmarks/ examples/ openarc_bench.db -openarc.log \ No newline at end of file +openarc.log +scratchpad.md \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..9d0e5c7 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,29 @@ +name: Documentation +on: + push: + branches: + - master + - main +permissions: + contents: read + pages: write + id-token: write +jobs: + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - uses: actions/configure-pages@v5 + - uses: actions/checkout@v5 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - run: pip install zensical + - run: zensical build --clean + - uses: actions/upload-pages-artifact@v4 + with: + path: site + - uses: actions/deploy-pages@v4 + id: deployment diff --git a/.gitignore b/.gitignore index 4c3ef23..5fd3fce 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ __pycache__/ openarc_api.log openarc_config.json CONTRIBUTING-wip.md + +docs/superpowers docker-compose.override.yaml .pytest_cache/ .mypy_cache/ @@ -23,4 +25,7 @@ docker-compose.override.yaml openarc_bench.db gpt-oss.ipynb gpt_oss_convert.py -scratchpad.md \ No newline at end of file +scratchpad.md +site/ +.cache/ +scratchpad.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..0e04d93 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,7 @@ +OpenArc uses bleeding edge libraries and APIs you may not be familiar with. When working through a task, use the deepwiki mcp server to get *contextual* information, and the command line to investigate python surfaces. + +- When making changes, don't worry about backward compatibility; we use git for this. 
+- Use uv to install dependencies +- Respect existing patterns in the codebase +- When the backend is changed, make sure parameters in the frontend are updated. +- Don't rush, and ask clarifying questions. \ No newline at end of file diff --git a/README.md b/README.md index fa674dd..aba0831 100644 --- a/README.md +++ b/README.md @@ -8,42 +8,30 @@ > [!NOTE] > OpenArc is under active development. -**OpenArc** is an inference engine for Intel devices. Serve LLMs, VLMs, Whisper, Kokoro-TTS, Embedding and Reranker models over OpenAI compatible endpoints, powered by OpenVINO on your device. Local, private, open source AI. - -**OpenArc 2.0** arrives with more endpoints, better UX, pipeline paralell, NPU support and much more! +**OpenArc** is an inference engine for Intel devices. Serve LLMs, VLMs, Whisper, Kokoro-TTS, Qwen-TTS, Qwen-ASR, Embedding and Reranker models over OpenAI compatible endpoints, powered by OpenVINO on your device. Local, private, open source AI. Drawing on ideas from `llama.cpp`, `vLLM`, `transformers`, `OpenVINO Model Server`, `Ray`, `Lemonade`, and other projects cited below, OpenArc has been a way for me to learn about inference engines by trying to build one myself. -Along the way a Discord community has formed around this project, which was unexpected! If you are interested in using Intel devices for AI and machine learning, feel free to stop by. +Along the way a Discord community has formed around this project! If you are interested in using Intel devices for AI and machine learning, feel free to stop by. Thanks to everyone on Discord for their continued support! +> [!NOTE] +> Documentation has been ported to a Zensical site. It's still WIP, and the site isn't live. 
+> To build and serve the docs after install: +``` +zensical serve -a localhost:8004 +``` ## Table of Contents + + - [Features](#features) - [Quickstart](#quickstart) - [Linux](#linux) - [Windows](#windows) - [Docker](#docker) -- [OpenArc CLI](#openarc-cli) - - [openarc add](#openarc-add) - - [openarc list](#openarc-list) - - [openarc serve](#openarc-serve) - - [openarc load](#openarc-load) - - [openarc status](#openarc-status) - - [openarc bench](#openarc-bench) - - [openarc tool](#openarc-tool) -- [Model Sources](#model-sources) - - [LLMs](#llms) - - [VLMs](#vlms) - - [Whisper](#whisper) - - [Kokoro](#kokoro) - - [Embedding](#embedding) - - [Reranker](#reranker) -- [Converting Models to OpenVINO IR](#converting-models-to-openvino-ir) -- [Learning Resources](#learning-resources) -- [Acknowledgments](#acknowledgments) -- [Codebase Documentation](./docs/index.md) + ## Features - NEW! Containerization with Docker #60 by @meatposes @@ -56,7 +44,7 @@ Thanks to everyone on Discord for their continued support! - `/v1/models` - `/v1/completions`: `llm` only - `/v1/chat/completions` - - `/v1/audio/transcriptions`: `whisper` only + - `/v1/audio/transcriptions`: `whisper`, `qwen3_asr` - `/v1/audio/speech`: `kokoro` only - `/v1/embeddings`: `qwen3-embedding` #33 by @mwrothbe - `/v1/rerank`: `qwen3-reranker` #39 by @mwrothbe @@ -77,6 +65,7 @@ Thanks to everyone on Discord for their continued support! - stream mode - More OpenVINO [examples](examples/) - OpenVINO implementation of [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) + - OpenVINO implementation of Qwen3-TTS and Qwen3-ASR > [!NOTE] @@ -225,435 +214,6 @@ Take a look at the [Dockerfile](Dockerfile) and [docker-compose](docker-compose. > [!NOTE] > uv has a [pip interface](https://docs.astral.sh/uv/pip/) which is a drop in replacement for pip, but faster. Pretty cool, and a good place to start learning uv. -## OpenArc CLI - -This section documents the CLI commands available to you. 
- -OpenArc command line tool helps you manage the server by packaging requests; every operation the command line does can be scripted programmatically, but using the command tool will help you get a feel for what the server does and how you can use it. - -## Getting Started - -After choosing a model, use commands in this order: - -- Add model configurations with `openarc add`, - -Here's an example for Gemma 3 VLM on GPU: - -``` -openarc add --model-name --model-path --engine ovgenai --model-type vlm --device GPU.0 --vlm-type gemma3 -``` - -And all LLM on GPU: - -``` -openarc add --model-name --model-path --engine ovgenai --model-type llm --device GPU.0 -``` - -Next up: - -- Show added configurations with `openarc list`, -- Launch the server with `openarc serve`, -- Load models with `openarc load` -- Check a model's status using `openarc status`. -- Benchmark performance like llama-bench with `openarc-bench` -- Call utility scripts with `openarc tool` - -Each command has groups of options which offer fine-grained control of both server behavior and performance optimizations, which are documented here with examples to get you started. Remember to use this as reference. - -Use `openarc [OPTION] --help` to see available arguments at any time as you work through the reference. - - -## Reference - -
-openarc add - -
- -Add a model to `openarc_config.json` for easy loading with `openarc load`. - - -### Single device - -``` -openarc add --model-name --model-path --engine --model-type --device -``` - -To see what options you have for `--device`, use `openarc tool device-detect`. - -### VLM - -``` -openarc add --model-name --model-path --engine --model-type --device --vlm-type -``` -Getting VLM to work the way I wanted required using VLMPipeline in ways that are not well documented. You can look at the [code](src/engine/ov_genai/vlm.py#L33) to see where the magic happens. - -`vlm-type` maps a vision token for a given architecture using strings like `qwen25vl`, `phi4mm` and more. Use `openarc add --help` to see the available options. The server will complain if you get anything wrong, so it should be easy to figure out. - -### Whisper - -``` -openarc add --model-name --model-path --engine ovgenai --model-type whisper --device -``` - -### Kokoro (CPU only) - -``` -openarc add --model-name --model-path --engine openvino --model-type kokoro --device CPU -``` - -### Advanced Configuration Options - -`runtime-config` accepts many options to modify `openvino` runtime behavior for different inference scenarios. OpenArc reports c++ errors to the server when these fail, making experimentation easy. - -See OpenVINO documentation on [Inference Optimization](https://docs.openvino.ai/2025/openvino-workflow/running-inference/optimize-inference.html) to learn more about what can be customized. - -Most options get really deep into OpenVINO concepts that are way out of scope for the README; however `runtime-config` is the entrypoint for *all* of them. Broadly, what you set in `runtime-config` Unfortunately, not all options are designed for transformers, so `runtime-config` was implemented in a way where you immediately get feedback. Add a kwarg, load the model, get feedback from the server, run `openarc bench`. Overall, it's a clean way to handle the hardest part of OpenVINO documentation. 
- -Review [pipeline-paralellism preview](https://docs.openvino.ai/2025/openvino-workflow/running-inference/inference-devices-and-modes/hetero-execution.html#pipeline-parallelism-preview) to learn how you can customize multi device inference using HETERO device plugin. Some example commands are provided for a few difference scenarios: - -### Multi-GPU Pipeline Paralell - -``` -openarc add --model-name --model-path --engine ovgenai --model-type llm --device HETERO:GPU.0,GPU.1 --runtime-config "{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}" -``` - -### Tensor Paralell (CPU only) - -Requires more than one CPU socket in a single node. - -``` -openarc add --model-name --model-path --engine ovgenai --model-type llm --device CPU --runtime-config "{"MODEL_DISTRIBUTION_POLICY": "TENSOR_PARALLEL"}" -``` ---- - -### Hybrid Mode/CPU Offload - -``` -openarc add --model-name -model-path --engine ovgenai --model-type llm --device HETERO:GPU.0,CPU --runtime-config "{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}" -``` - -### Speculative Decoding - -``` -openarc add --model-name --model-path --engine ovgenai --model-type llm --device GPU.0 --draft-model-path --draft-device CPU --num-assistant-tokens 5 --assistant-confidence-threshold 0.5 -``` - - - -
- - -
-openarc list - -
- -Reads added configurations from `openarc_config.json`. - -Display all added models: -``` -openarc list -``` - -Display config metadata for a specific model: -``` -openarc list -v -``` - -Remove a configuration: -``` -openarc list --remove -``` - -
- - -
-openarc serve - -
- -Starts the server. - -``` -openarc serve start # defauls to 0.0.0.0:8000 -``` - -Configure host and port - -``` -openarc serve start --host --port -``` - -To load models on startup: - -``` -openarc serve start --load-models model1 model2 -``` - -
- - -
-openarc load - -
- -After using ```openarc add``` you can use ```openarc load``` to read the added configuration and load models onto the OpenArc server. - -OpenArc uses arguments from ```openarc add``` as metadata to make routing decisions internally; you are querying for correct inference code. - -``` -openarc load -``` - -To load multiple models at once, use: - -``` -openarc load -``` - -Be mindful of your resources; loading models can be resource intensive! On the first load, OpenVINO performs model compilation for the target `--device`. - -When `openarc load` fails, the CLI tool displays a full stack trace to help you figure out why. - - -
- - -
-openarc status - -
- -Calls /openarc/status endpoint and returns a report. Shows loaded models. - -``` -openarc status -``` - -
- - -
-openarc bench - -
- -Benchmark `llm` performance with pseudo-random input tokens. - -This approach follows [llama-bench](https://github.com/ggml-org/llama.cpp/blob/683fa6ba/tools/llama-bench/llama-bench.cpp#L1922), providing a baseline for the community to assess inference performance between `llama.cpp` backends and `openvino`. - -To support different `llm` tokenizers, we need to standardize how tokens are chosen for benchmark inference. When you set `--p` we select `512` pseudo-random tokens as input_ids from the set of all tokens in the vocabulary. - -`--n` controls the maximum amount of tokens we allow the model to generate; this bypasses `eos` and sets a hard upper limit. - -Default values are: -``` -openarc bench --p <512> --n <128> --r <5> -``` -Which gives: - - -![openarc bench](assets/openarc_bench_sample.png) - -`openarc bench` also records metrics in a sqlite database `openarc_bench.db` for easy analysis. - -
- - -
-openarc tool - -
- -Utility scripts. - -To see `openvino` properties your device supports use: - -``` -openarc tool device-props -``` - -To see available devices use: - -``` -openarc tool device-detect -``` - -![device-detect](assets/cli_tool_device-detect.png) - -
- ---- - -
- -
- -[↑ Top](#table-of-contents) - -
- -## Model Sources - -There are a few sources of preconverted models which can be used with OpenArc; - -[OpenVINO on HuggingFace](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd) - -[My HuggingFace repo](https://huggingface.co/Echo9Zulu) - -[LLMs optimized for NPU](https://huggingface.co/collections/OpenVINO/llms-optimized-for-npu-686e7f0bf7bc184bd71f8ba0) - - -### More models to get you started! - -
-LLMs - -
- -| **Models** | -| --- | -| [Echo9Zulu/Qwen3-1.7B-int8_asym-ov](https://huggingface.co/Echo9Zulu/Qwen3-1.7B-int8_asym-ov/tree/main) | -| [Echo9Zulu/Qwen3-4B-Instruct-2507-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Qwen3-4B-Instruct-2507-int4_asym-awq-ov) | -| [Gapeleon/Satyr-V0.1-4B-HF-int4_awq-ov](https://huggingface.co/Gapeleon/Satyr-V0.1-4B-HF-int4_awq-ov?not-for-all-audiences=true) | -| [Echo9Zulu/Dolphin-X1-8B-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Dolphin-X1-8B-int4_asym-awq-ov) | -| [Echo9Zulu/Qwen3-8B-ShiningValiant3-int4-asym-ov](https://huggingface.co/Echo9Zulu/Qwen3-8B-ShiningValiant3-int4-asym-ov) | -| [Echo9Zulu/Qwen3-14B-int4_sym-ov](https://huggingface.co/Echo9Zulu/Qwen3-14B-int4_sym-ov/tree/main) | -| [Echo9Zulu/Cydonia-24B-v4.2.0-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Cydonia-24B-v4.2.0-int4_asym-awq-ov) | -| [Echo9Zulu/Qwen2.5-Microsoft-NextCoder-Soar-Instruct-FUSED-CODER-Fast-11B-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Qwen2.5-Microsoft-NextCoder-Soar-Instruct-FUSED-CODER-Fast-11B-int4_asym-awq-ov) | -| [Echo9Zulu/Magistral-Small-2509-Text-Only-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Magistral-Small-2509-Text-Only-int4_asym-awq-ov) | -| [Echo9Zulu/Hermes-4-70B-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Hermes-4-70B-int4_asym-awq-ov) | -| [Echo9Zulu/Qwen2.5-Coder-32B-Instruct-int4_sym-awq-ov](https://huggingface.co/Echo9Zulu/Qwen2.5-Coder-32B-Instruct-int4_sym-awq-ov) | -| [Echo9Zulu/Qwen3-32B-Instruct-int4_sym-awq-ov](https://huggingface.co/Echo9Zulu/Qwen3-32B-Instruct-int4_sym-awq-ov) | - -
- -
-VLMs - -
- -| **Models** | -| --- | -| [Echo9Zulu/gemma-3-4b-it-int8_asym-ov](https://huggingface.co/Echo9Zulu/gemma-3-4b-it-int8_asym-ov) | -| [Echo9Zulu/Gemma-3-12b-it-qat-int4_asym-ov](https://huggingface.co/Echo9Zulu/Gemma-3-12b-it-qat-int4_asym-ov) | -| [Echo9Zulu/Qwen2.5-VL-7B-Instruct-int4_sym-ov](https://huggingface.co/Echo9Zulu/Qwen2.5-VL-7B-Instruct-int4_sym-ov/tree/main) | -| [Echo9Zulu/Nanonets-OCR2-3B-LM-INT4_ASYM-VE-FP16-ov](https://huggingface.co/Echo9Zulu/Nanonets-OCR2-3B-LM-INT4_ASYM-VE-FP16-ov) | - -
- -
-Whisper - -
- -| **Models** | -| --- | -| [OpenVINO/distil-whisper-large-v3-int8-ov](https://huggingface.co/OpenVINO/distil-whisper-large-v3-int8-ov) | -| [OpenVINO/distil-whisper-large-v3-fp16-ov](https://huggingface.co/OpenVINO/distil-whisper-large-v3-fp16-ov) | -| [OpenVINO/whisper-large-v3-int8-ov](https://huggingface.co/OpenVINO/whisper-large-v3-int8-ov/tree/main) | -| [OpenVINO/openai-whisper-large-v3-fp16-ov](https://huggingface.co/OpenVINO/openai-whisper-large-v3-fp16-ov/tree/main) | - -
- -
-Kokoro - -
- -| **Models** | -| --- | -| [Echo9Zulu/Kokoro-82M-FP16-OpenVINO](https://huggingface.co/Echo9Zulu/Kokoro-82M-FP16-OpenVINO) | - -
- -
-Embedding - -
- -| **Models** | -| --- | -| [Echo9Zulu/Qwen3-Embedding-0.6B-int8_asym-ov](https://huggingface.co/Echo9Zulu/Qwen3-Embedding-0.6B-int8_asym-ov) | - -
- -
-Reranker - -
- -| **Models** | -| --- | -| [OpenVINO/Qwen3-Reranker-0.6B-fp16-ov](https://huggingface.co/OpenVINO/Qwen3-Reranker-0.6B-fp16-ov) | - -
- -
- -[↑ Top](#table-of-contents) - -
- -### Converting Models to OpenVINO IR - -Optimum-Intel provides [a hands on primer](https://huggingface.co/docs/optimum/main/en/intel/openvino/optimization) you can use to build some intuition about quantization and post training optimization using OpenVINO. - -Intel provides a suite of tools you can use to apply different post training optimization techniques developed over at [Neural Network Compression Framwork](https://github.com/openvinotoolkit/nncf). - -- Use the [Optimum-CLI conversion tool](https://huggingface.co/docs/optimum/main/en/intel/openvino/export) to learn how you can convert models to OpenVINO IR from other formats. - -- Visit [Supported Architectures](https://huggingface.co/docs/optimum/main/en/intel/openvino/models) to see what models can be converted to OpenVINO using the tools described in this section. - -- If you use the CLI tool and get an error about an unsupported architecture or "missing export config" follow the link, [open an issue](https://github.com/huggingface/optimum-intel/issues) reference the model card and the maintainers will get back to you. - -
- -[↑ Top](#table-of-contents) - -
- -### Demos - -Demos help illustrate what you can do with OpenArc and are meant to be extended. I will continue adding to these, but for now they are a good start. - -[talk_to_llm.py](hackables/talk_to_llm.py) sets up a "chain" between whisper, an LLM, and kokoro. Talk with any LLM you can run on your PC from the command line. Accumulates context and does not filter reasoning (very interesting). - -[whisper_button.py](hackables/whisper_button.py) use spacebar to record audio with whisper and see the transcription right in the terminal. NPU users should probably start here. - - -### Resources - -Learn more about how to leverage your Intel devices for Machine Learning: - -[Install OpenVINO](https://docs.openvino.ai/2025/get-started/install-openvino.html?PACKAGE=OPENVINO_GENAI&VERSION=NIGHTLY&OP_SYSTEM=LINUX&DISTRIBUTION=PIP) - -[openvino_notebooks](https://github.com/openvinotoolkit/openvino_notebooks) - -[OpenVINO Python API](https://docs.openvino.ai/2025/api/ie_python_api/api.html) - -[OpenVINO GenAI Python API](https://docs.openvino.ai/2025/api/genai_api/_autosummary/openvino_genai.html) - -[Inference with Optimum-Intel](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb) - -[Optimum-Intel](https://huggingface.co/docs/optimum/main/en/intel/index) - -[NPU Devices](https://docs.openvino.ai/2025/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.html) - -[vllm with IPEX](https://docs.vllm.ai/en/v0.5.1/getting_started/xpu-installation.html) - -[Mutli GPU Pipeline Paralell with OpenVINO Model Server](https://docs.openvino.ai/2025/model-server/ovms_demos_continuous_batching_scaling.html#multi-gpu-configuration-loading-models-exceeding-a-single-card-vram) - -[Transformers Auto Classes](https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes) - -
- -[↑ Top](#table-of-contents) - -
## Acknowledgments diff --git a/demos/qwen3_asr_transcribe.py b/demos/qwen3_asr_transcribe.py new file mode 100644 index 0000000..5faecca --- /dev/null +++ b/demos/qwen3_asr_transcribe.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +""" +Qwen3 ASR demo for OpenArc's OpenAI-compatible transcription endpoint. + +Uses the OpenAI Python library. Assumes the server is already running. + +Usage: + OPENARC_API_KEY=sk-... python demos/qwen3_asr_transcribe.py /path/to/audio.wav --model qwen3_asr +""" + +import argparse +import json +import os +from pathlib import Path + +from openai import OpenAI + +# Qwen3 ASR config for openarc_asr.qwen3_asr (audio_base64 injected from file) +QWEN3_ASR_CONFIG = { + "language": None, + "max_tokens": 1024, + "max_chunk_sec": 30.0, + "search_expand_sec": 5.0, + "min_window_ms": 100.0, +} + + +def transcribe_audio( + base_url: str, api_key: str, model_name: str, wav_path: Path +) -> dict: + """Transcribe audio file using Qwen3 ASR. Returns response dict (text, metrics, etc.).""" + if not wav_path.exists() or not wav_path.is_file(): + raise FileNotFoundError(f"Audio file not found: {wav_path}") + + client = OpenAI(base_url=f"{base_url.rstrip('/')}/v1", api_key=api_key) + + with wav_path.open("rb") as f: + response = client.audio.transcriptions.create( + model=model_name, + file=f, + response_format="verbose_json", + extra_body={ + "openarc_asr": json.dumps({"qwen3_asr": QWEN3_ASR_CONFIG}), + }, + ) + + return response.model_dump() if hasattr(response, "model_dump") else dict(response) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Transcribe audio with a loaded Qwen3 ASR model in OpenArc." 
+ ) + parser.add_argument("audio_path", type=Path, help="Path to WAV/compatible audio file") + parser.add_argument( + "--model", default="qwen3_asr", help="Loaded OpenArc model name" + ) + parser.add_argument( + "--base-url", default="http://localhost:8003", help="OpenArc server base URL" + ) + args = parser.parse_args() + + api_key = os.environ.get("OPENARC_API_KEY") + if not api_key: + raise SystemExit("OPENARC_API_KEY environment variable must be set") + + payload = transcribe_audio(args.base_url, api_key, args.model, args.audio_path) + text = payload.get("text", "") + language = payload.get("language") + metrics = payload.get("metrics", {}) or {} + + print("\n=== Qwen3 ASR Transcription ===") + if language: + print(f"Language: {language}") + print(f"Text: {text}\n") + if metrics: + print("Metrics:") + for key, value in metrics.items(): + print(f" {key}: {value}") + + +if __name__ == "__main__": + main() diff --git a/demos/qwen3_tts_example.py b/demos/qwen3_tts_example.py new file mode 100644 index 0000000..233b502 --- /dev/null +++ b/demos/qwen3_tts_example.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +"""Example script using OpenArc /v1/audio/speech for Kokoro or Qwen3 TTS. + +Assumes the server is already running. Switch backends via OPENARC_TTS_BACKEND. +Uses the OpenAI Python library. Saves the generated audio to a WAV file. 
+""" + +import os +from pathlib import Path + +from openai import OpenAI + +# Configuration +API_KEY = os.getenv("OPENARC_API_KEY") +BASE_URL = os.getenv("OPENARC_BASE_URL", "http://localhost:8003/v1") +# "kokoro" or "qwen3" — determines model and payload +_backend = os.getenv("OPENARC_TTS_BACKEND", "qwen3").lower() +BACKEND = _backend if _backend in ("kokoro", "qwen3") else "kokoro" + +MODELS = { + "kokoro": "kokoro", + "qwen3": os.getenv("OPENARC_QWEN3_TTS_MODEL", "custom_voice"), +} + +# Kokoro config for openarc_tts.kokoro (voice/lang_code are KokoroVoice/KokoroLanguage enums) +KOKORO_CONFIG = { + "voice": "af_sky", + "lang_code": "a", # KokoroLanguage.AMERICAN_ENGLISH + "speed": 1.0, + "response_format": "wav", + "character_count_chunk": 100, +} + +# Qwen3 TTS config for openarc_tts.qwen3_tts +QWEN3_TTS_CONFIG = { + "speaker": "uncle_fu", + "instruct": "Whisper very softly, and giggle at the end.", + "language": "english", + "voice_description": None, + "ref_audio_b64": None, + "ref_text": None, + "x_vector_only": False, + "max_new_tokens": 2048, + "do_sample": True, + "top_k": 50, + "top_p": 1.0, + "temperature": 0.9, + "repetition_penalty": 1.05, + "non_streaming_mode": True, + "subtalker_do_sample": True, + "subtalker_top_k": 50, + "subtalker_top_p": 1.0, + "subtalker_temperature": 0.9, +} + + +def generate_speech(text: str, output_path: str | Path = "speech.wav") -> Path: + """Generate speech from text and save to WAV file. + + Uses Kokoro or Qwen3 TTS based on OPENARC_TTS_BACKEND. + + Raises: + RuntimeError: If OPENARC_API_KEY is not set. 
+ """ + if not API_KEY: + raise RuntimeError("OPENARC_API_KEY environment variable not set") + + client = OpenAI(base_url=BASE_URL, api_key=API_KEY) + model = MODELS[BACKEND] + + if BACKEND == "kokoro": + cfg = dict(KOKORO_CONFIG) + cfg["input"] = text + response = client.audio.speech.create( + model=model, + input=text, + voice=cfg["voice"], + extra_body={"openarc_tts": {"kokoro": cfg}}, + ) + else: + cfg = {k: v for k, v in QWEN3_TTS_CONFIG.items() if v is not None} + cfg["input"] = text + voice = cfg.get("speaker", "ryan") + response = client.audio.speech.create( + model=model, + input=text, + voice=voice, + extra_body={"openarc_tts": {"qwen3_tts": cfg}}, + ) + + out = Path(output_path) + out.write_bytes(response.content) + return out + + +if __name__ == "__main__": + text = os.getenv( + "OPENARC_TTS_TEXT", + "This is a test of OpenArc TTS over the API.", + ) + out = Path(os.getenv("OPENARC_TTS_OUTPUT", "speech.wav")) + + try: + print(f"Backend: {BACKEND} Model: {MODELS[BACKEND]}") + path = generate_speech(text, out) + print(f"Saved WAV to {path}") + except RuntimeError as e: + print(f"Error: {e}") + raise SystemExit(1) + except Exception as e: + print(f"API error: {e}") + raise SystemExit(1) diff --git a/demos/talk_to_llm.py b/demos/talk_to_llm.py index ad4c33e..7a3c279 100644 --- a/demos/talk_to_llm.py +++ b/demos/talk_to_llm.py @@ -1,6 +1,8 @@ import base64 import io +import json import os +import re import threading import time from typing import Optional @@ -9,22 +11,65 @@ import requests import sounddevice as sd import soundfile as sf -from openai import OpenAI # Configuration API_KEY = os.getenv("OPENARC_API_KEY") -BASE_URL = "http://localhost:8000/v1" +BASE_URL = "http://localhost:8003/v1" SAMPLE_RATE = 16000 +# Qwen3 streaming /audio/speech uses audio/L16 mono int16 LE (see server main.py) +TTS_STREAM_SAMPLE_RATE = 24000 MODELS = { - "whisper": "whisper", + "asr": "qwen3_asr", "llm": "Muse-12B", - "tts": "kokoro" + # Server-registered name for a 
ModelType.QWEN3_TTS_VOICE_CLONE model + "tts": os.getenv("OPENARC_QWEN3_TTS_MODEL", "voice_clone"), } -TTS_CONFIG = { - "voice": "af_heart", - "speed": 1.25, - "language": "a", - "response_format": "wav" +# Qwen3 ASR config for openarc_asr.qwen3_asr (audio_base64 injected from file) +QWEN3_ASR_CONFIG = { + "language": None, + "max_tokens": 4096, + "max_chunk_sec": 30.0, + "search_expand_sec": 5.0, + "min_window_ms": 100.0, +} +# Voice clone: reference WAV + transcript (ICL). Omit speaker (custom_voice only). +VOICE_CLONE_REF_WAV = "/home/echo/Projects/OpenArc/interstellar-tars_absolute-honesty-isn-t-always-the-most-diplomatic-nor-the.mp3" +VOICE_CLONE_REF_TEXT = """ +Absolute honesty isn't always the most diplomatic, nor the most tactful, nor the +""" + +_ref_audio_b64_cache: Optional[str] = None + + +def _get_ref_audio_b64() -> str: + """Lazy-load base64 reference WAV for qwen3_tts_voice_clone.""" + global _ref_audio_b64_cache + if _ref_audio_b64_cache is None: + with open(VOICE_CLONE_REF_WAV, "rb") as f: + _ref_audio_b64_cache = base64.b64encode(f.read()).decode("ascii") + return _ref_audio_b64_cache + + +# Qwen3 TTS config for openarc_tts.qwen3_tts (voice_clone mode); sampling matches OV_Qwen3TTSGenConfig +QWEN3_TTS_CONFIG = { + "ref_text": VOICE_CLONE_REF_TEXT, + "language": "english", + "instruct": None, + "x_vector_only": False, + "max_new_tokens": 2048, + "do_sample": True, + "top_k": 50, + "top_p": 1.0, + "temperature": 0.9, + "repetition_penalty": 1.05, + "non_streaming_mode": True, + "subtalker_do_sample": True, + "subtalker_top_k": 50, + "subtalker_top_p": 1.0, + "subtalker_temperature": 0.9, + "stream": True, + "stream_chunk_frames": 300, + "stream_left_context": 25, } LLM_CONFIG = { "temperature": 0.8, @@ -35,8 +80,8 @@ SYSTEM_PROMPT = """ # COMMISION: -- You're a masterful adventure gamemaster. -- ALways make the story interactive, and dont tell to much. +- You are Elmo from Sesame Street, now andventure gamemaster. 
+- Make the story interactive and engaging. - Use second person (you are) ## STYLE @@ -46,11 +91,10 @@ My name is Leandro. """ -def initialize_client() -> OpenAI: - """Initialize OpenAI client with OpenArc server.""" +def validate_api_key() -> None: + """Validate required API key for OpenArc calls.""" if not API_KEY: raise RuntimeError("OPENARC_API_KEY environment variable not set") - return OpenAI(api_key=API_KEY, base_url=BASE_URL) def record_audio() -> tuple[Optional[np.ndarray], bool]: """Record audio from microphone using terminal input (works in remote desktop). @@ -159,32 +203,34 @@ def input_thread(): print(f"Total audio length: {len(audio_data) / SAMPLE_RATE:.2f} seconds") return audio_data, exit_program -def encode_audio_to_base64(audio_data: np.ndarray) -> str: - """Convert audio data to base64-encoded WAV format.""" +def encode_audio_to_wav_bytes(audio_data: np.ndarray) -> bytes: + """Convert audio data to in-memory WAV bytes.""" wav_buffer = io.BytesIO() sf.write(wav_buffer, audio_data, SAMPLE_RATE, format='WAV') wav_buffer.seek(0) - audio_bytes = wav_buffer.read() - return base64.b64encode(audio_bytes).decode("utf-8") + return wav_buffer.read() + +def transcribe_audio(audio_bytes: bytes) -> tuple[str, dict]: + """Transcribe audio using Qwen3 ASR. -def transcribe_audio(client: OpenAI, audio_b64: str) -> tuple[str, dict]: - """Transcribe audio using Whisper model. 
- Returns: Tuple of (transcribed_text, metrics) """ - response = client.post( - "/audio/transcriptions", - cast_to=object, - body={ - "model": MODELS["whisper"], - "audio_base64": audio_b64 + response = requests.post( + f"{BASE_URL}/audio/transcriptions", + headers={"Authorization": f"Bearer {API_KEY}"}, + data={ + "model": MODELS["asr"], + "response_format": "verbose_json", + "openarc_asr": json.dumps({"qwen3_asr": QWEN3_ASR_CONFIG}), }, - options={"headers": {"Content-Type": "application/json"}} + files={"file": ("recording.wav", audio_bytes, "audio/wav")}, + timeout=120, ) - - text = response.get("text", "").strip() - metrics = response.get("metrics", {}) + response.raise_for_status() + payload = response.json() + text = payload.get("text", "").strip() + metrics = payload.get("metrics", {}) return text, metrics def get_llm_response(messages: list[dict]) -> str: @@ -213,39 +259,76 @@ def get_llm_response(messages: list[dict]) -> str: return full_response + +def _l16_rate_from_content_type(content_type: str) -> int: + m = re.search(r"rate=(\d+)", content_type, re.IGNORECASE) + if m: + return int(m.group(1)) + return TTS_STREAM_SAMPLE_RATE + + +def _play_streaming_l16(response: requests.Response, sample_rate: int) -> None: + """Play raw little-endian int16 mono PCM as chunks arrive (Qwen3 stream).""" + pending = bytearray() + with sd.OutputStream(samplerate=sample_rate, channels=1, dtype="float32") as out: + for chunk in response.iter_content(chunk_size=8192): + if not chunk: + continue + pending.extend(chunk) + n_bytes = (len(pending) // 2) * 2 + if n_bytes == 0: + continue + raw = bytes(pending[:n_bytes]) + del pending[:n_bytes] + samples = np.frombuffer(raw, dtype=" None: - """Generate speech from text using TTS and play it.""" - print("\n🔊 Generating speech...") + """Generate speech from text using Qwen3 TTS and play it (streams when server returns L16).""" url = f"{BASE_URL}/audio/speech" headers = { "Authorization": f"Bearer {API_KEY}", "Content-Type": 
"application/json" } + cfg = {k: v for k, v in QWEN3_TTS_CONFIG.items() if v is not None} + cfg["input"] = text + cfg["ref_audio_b64"] = _get_ref_audio_b64() data = { "model": MODELS["tts"], "input": text, - **TTS_CONFIG + "voice": cfg.get("speaker", MODELS["tts"]), + "openarc_tts": {"qwen3_tts": cfg}, } - - audio_buffer = io.BytesIO() + with requests.post(url, headers=headers, json=data, stream=True) as response: response.raise_for_status() + content_type = response.headers.get("Content-Type", "") + if "l16" in content_type.lower(): + print("\n🔊 Synthesizing (streaming playback)...") + sr = _l16_rate_from_content_type(content_type) + _play_streaming_l16(response, sr) + print("▶️ Playback finished.") + return + + print("\n🔊 Generating speech...") + audio_buffer = io.BytesIO() for chunk in response.iter_content(chunk_size=8192): if chunk: audio_buffer.write(chunk) - - audio_buffer.seek(0) - - # Play audio from memory - print("▶️ Playing response...") - audio_data, fs = sf.read(audio_buffer, dtype='float32') - sd.play(audio_data, fs) - sd.wait() + audio_buffer.seek(0) + print("▶️ Playing response...") + audio_data, fs = sf.read(audio_buffer, dtype="float32") + sd.play(audio_data, fs) + sd.wait() def talk_to_llm(): """Maintain a conversation: record -> transcribe -> LLM -> TTS -> repeat.""" try: - client = initialize_client() + validate_api_key() except RuntimeError as e: print(f"Error: {e}") return @@ -266,8 +349,8 @@ def talk_to_llm(): try: # Transcribe audio - audio_b64 = encode_audio_to_base64(audio_data) - text, metrics = transcribe_audio(client, audio_b64) + audio_bytes = encode_audio_to_wav_bytes(audio_data) + text, metrics = transcribe_audio(audio_bytes) if not text: print("No transcription, skipping...") diff --git a/demos/whisper_button.py b/demos/whisper_button.py index dec3bf0..16317a3 100644 --- a/demos/whisper_button.py +++ b/demos/whisper_button.py @@ -1,12 +1,11 @@ import os -import base64 import sounddevice as sd import soundfile as sf import numpy as 
np import threading import time import tempfile -from openai import OpenAI +import requests def transcribe_example(): @@ -16,11 +15,7 @@ def transcribe_example(): print("OPENARC_API_KEY is not set. Export it before running this test.") return - # Initialize OpenAI client with custom HTTP client for base64 audio - client = OpenAI( - api_key=api_key, - base_url="http://localhost:8000/v1" - ) + base_url = "http://localhost:8000/v1/audio/transcriptions" model_name = "whisper" @@ -119,23 +114,18 @@ def input_thread(): print("💾 Audio saved to temporary WAV file") try: - # Read WAV file and encode as base64 (OpenArc server expects this format) with open(tmp_path, "rb") as f: - audio_b64 = base64.b64encode(f.read()).decode("utf-8") - - # Use custom request since OpenArc expects audio_base64 in JSON - response = client.post( - "/audio/transcriptions", - cast_to=object, - body={ - "model": model_name, - "audio_base64": audio_b64 - }, - options={"headers": {"Content-Type": "application/json"}} - ) - - text = response.get("text", "") - metrics = response.get("metrics", {}) + response = requests.post( + base_url, + headers={"Authorization": f"Bearer {api_key}"}, + data={"model": model_name}, + files={"file": (os.path.basename(tmp_path), f, "audio/wav")}, + timeout=120, + ) + response.raise_for_status() + payload = response.json() + text = payload.get("text", "") + metrics = payload.get("metrics", {}) print("\n📝 Transcription:\n", text) diff --git a/docs/commands.md b/docs/commands.md new file mode 100644 index 0000000..b737910 --- /dev/null +++ b/docs/commands.md @@ -0,0 +1,497 @@ +--- +icon: lucide/terminal +--- + +# Commands + + +After installation run ```openarc --help``` to see focused usage documentation inside the openarc command line tool. + +This page contains example commands to help you choose models and configure OpenArc. + +=== "add" + + Add a model to `openarc_config.json` for easy loading with `openarc load`. 
+ + === "Single device" + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine \ + --model-type \ + --device + ``` + + To see what options you have for `--device`, use `openarc tool device-detect`. + + === "VLM" + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine \ + --model-type \ + --device \ + --vlm-type + ``` + + Getting VLM to work the way I wanted required using VLMPipeline in ways that are not well documented. You can look at the [code](src/engine/ov_genai/vlm.py#L33) to see how OpenArc's VLM backend passes images. Basically, it involves slicing the input sequence by scanning for when there's an image and injecting appropriate tokens. Honestly I have no idea why they built VLMPipeline this way, but to support all the architectures my approach was easier in the end. + + `vlm-type` maps a vision token for a given architecture. Use `openarc add --help` to see the available options. The server will complain if you get anything wrong, so it should be easy to figure out. + + + NOTE: you don't have to pass `Vision Token`; these are mapped to the `vlm-type` `openarc add` argument so use that instead. + + | `--vlm-type` | Vision token | + |----------------|-----------------------------------------------------| + | `internvl2` | `<image>` | + | `llava15` | `<image>` | + | `llavanext` | `<image>` | + | `minicpmv26` | `(<image>./</image>)` | + | `phi3vision` | `<\|image_{i}\|>` | + | `phi4mm` | `<\|image_{i}\|>` | + | `qwen2vl` | `<\|vision_start\|><\|image_pad\|><\|vision_end\|>` | + | `qwen25vl` | `<\|vision_start\|><\|image_pad\|><\|vision_end\|>` | + | `gemma3` | `<start_of_image>` | + + === "Whisper" + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine ovgenai \ + --model-type whisper \ + --device + ``` + + === "Kokoro" + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine openvino \ + --model-type kokoro \ + --device CPU + ``` + + === "Qwen3-TTS" + + Qwen3-TTS has three modes, each selected by `--model-type` at add time.
Inference parameters (speaker, voice description, reference audio, sampling settings) are supplied per-request via the API, not here. + + CPU and GPU device are supported. + + When GPU is selected as device, part of the model still runs on CPU. + + Supported languages: `english`, `chinese`, `japanese`, `korean`, `german`, `french`, `spanish`, `italian`, `portuguese`, `russian`, `beijing_dialect`, `sichuan_dialect`. Pass `None` to auto-detect. See `demos/qwen3_tts_example.py` for a full request example. + + === "Custom voice" + + Pick a predefined speaker at inference time (`serena`, `vivian`, `uncle_fu`, `ryan`, `aiden`, `ono_anna`, `sohee`, `eric`, `dylan`): + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine openvino \ + --model-type qwen3_tts_custom_voice \ + --device CPU + ``` + + ```python + import os + from openai import OpenAI + from pathlib import Path + + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key=os.environ["OPENARC_API_KEY"], + ) + + response = client.audio.speech.create( + model="", + input="Hello, this is a test.", + extra_body={ + "openarc_tts": { + "qwen3_tts": { + # --- content --- + "input": "Hello, this is a test.", + "speaker": "uncle_fu", # serena, vivian, uncle_fu, ryan, aiden, ono_anna, sohee, eric, dylan + "instruct": None, # optional style instruction e.g. "Speak slowly and clearly." 
+ "language": "english", # None to auto-detect + # --- sampling --- + "max_new_tokens": 2048, + "do_sample": True, + "top_k": 50, + "top_p": 1.0, + "temperature": 0.9, + "repetition_penalty": 1.05, + "non_streaming_mode": True, + "subtalker_do_sample": True, + "subtalker_top_k": 50, + "subtalker_top_p": 1.0, + "subtalker_temperature": 0.9, + # --- streaming --- + "stream": True, + "stream_chunk_frames": 50, + "stream_left_context": 25, + } + } + }, + ) + + Path("speech.wav").write_bytes(response.content) + ``` + + === "Voice design" + + Describe the voice in free-form text at inference time: + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine openvino \ + --model-type qwen3_tts_voice_design \ + --device CPU + ``` + + ```python + import os + from openai import OpenAI + from pathlib import Path + + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key=os.environ["OPENARC_API_KEY"], + ) + + response = client.audio.speech.create( + model="", + input="Hello, this is a test.", + voice="alloy", + extra_body={ + "openarc_tts": { + "qwen3_tts": { + # --- content --- + "input": "Hello, this is a test.", + "voice_description": "A calm, deep male voice with a slight British accent.", + "language": "english", # None to auto-detect + # --- sampling --- + "max_new_tokens": 2048, + "do_sample": True, + "top_k": 50, + "top_p": 1.0, + "temperature": 0.9, + "repetition_penalty": 1.05, + "subtalker_do_sample": True, + "subtalker_top_k": 50, + "subtalker_top_p": 1.0, + "subtalker_temperature": 0.9, + # --- streaming --- + "stream": True, + "stream_chunk_frames": 300, + "stream_left_context": 25, + } + } + }, + ) + + Path("speech.wav").write_bytes(response.content) + ``` + + === "Voice clone" + + Provide a reference WAV at inference time to clone a speaker: + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine openvino \ + --model-type qwen3_tts_voice_clone \ + --device CPU + ``` + + ```python + import base64 + import os + from openai import 
OpenAI + from pathlib import Path + + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key=os.environ["OPENARC_API_KEY"], + ) + + ref_audio_b64 = base64.b64encode(Path("reference.wav").read_bytes()).decode() + + response = client.audio.speech.create( + model="", + input="Hello, this is a test.", + voice="alloy", + extra_body={ + "openarc_tts": { + "qwen3_tts": { + # --- content --- + "ref_audio_b64": ref_audio_b64, + "ref_text": "Transcript of the reference audio.", # optional, enables ICL + "x_vector_only": False, # True = x-vector only, skips ICL even if ref_text is set + "instruct": None, # optional style instruction + "language": "english", # None to auto-detect + # --- sampling --- + "max_new_tokens": 2048, + "do_sample": True, + "top_k": 50, + "top_p": 1.0, + "temperature": 0.9, + "repetition_penalty": 1.05, + "subtalker_do_sample": True, + "subtalker_top_k": 50, + "subtalker_top_p": 1.0, + "subtalker_temperature": 0.9, + # --- streaming --- + "stream": True, + "stream_chunk_frames": 300, + "stream_left_context": 25, + } + } + }, + ) + + Path("speech.wav").write_bytes(response.content) + ``` + + === "Qwen3-ASR" + + Qwen3-ASR long-form transcription — supports Qwen3-ASR-0.6B. Audio is chunked automatically at silence boundaries up to `max_chunk_sec` (default `30s`). This is not a hard limit; chunk boundaries are chosen dynamically based on where silence is found in the audio. + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine openvino \ + --model-type qwen3_asr \ + --device CPU + ``` + + Use the `/v1/audio/transcriptions` endpoint with `openarc_asr` in the request body: + I have not tested our implementation with any community tooling yet; however, all tests using the openai python library are passing, and usually that's enough. + + For the options in `extra_body`, they will likely not have support in any third party tool you don't build from scratch. I'm working on improving how these can be configured.
Currently, the behavior is modified per request, so you can tinker with performance on CPU and GPU. At this time NPU device is unsupported. + + ```python + import json + import os + from pathlib import Path + from openai import OpenAI + + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key=os.environ["OPENARC_API_KEY"], + ) + + with Path("audio.wav").open("rb") as f: + response = client.audio.transcriptions.create( + model="", + file=f, + response_format="verbose_json", + extra_body={ + "openarc_asr": json.dumps({ + "qwen3_asr": { + "language": None, # auto-detect, or e.g. "english" + "max_tokens": 1024, # max tokens per chunk + "max_chunk_sec": 30.0, # max audio chunk length in seconds + "search_expand_sec": 5.0, # silence-search window expansion + "min_window_ms": 100.0, # minimum silence window in ms + } + }) + }, + ) + + print(response.text) + ``` + + === "Advanced" + + `runtime-config` accepts many options to modify `openvino` runtime behavior for different inference scenarios. OpenArc reports C++ errors to the server when these fail, making experimentation easy. + + See OpenVINO documentation on [Inference Optimization](https://docs.openvino.ai/2025/openvino-workflow/running-inference/optimize-inference.html) to learn more about what can be customized. + + Not all options are designed for transformers, so `runtime-config` was implemented in a way where you get immediate feedback from the OpenVINO runtime after loading a model. Add an argument, load that model, get feedback from the server, run `openarc bench`. This makes iterating faster in an area where the documentation is sparse. The options listed here have been validated. + + Review the [pipeline-parallelism preview](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/hetero-execution.html#pipeline-parallelism-preview) to learn how you can customize multi-device inference using the HETERO device plugin. 
+ + === "Multi-GPU Pipeline Parallel" + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine ovgenai \ + --model-type llm \ + --device HETERO:GPU.0,GPU.1 \ + --runtime-config '{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}' + ``` + + === "Tensor Parallel" + + Requires more than one CPU socket in a single node. + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine ovgenai \ + --model-type llm \ + --device CPU \ + --runtime-config '{"MODEL_DISTRIBUTION_POLICY": "TENSOR_PARALLEL"}' + ``` + + === "Hybrid / CPU Offload" + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine ovgenai \ + --model-type llm \ + --device HETERO:GPU.0,CPU \ + --runtime-config '{"MODEL_DISTRIBUTION_POLICY": "PIPELINE_PARALLEL"}' + ``` + + === "Speculative Decoding" + + ``` + openarc add \ + --model-name \ + --model-path \ + --engine ovgenai \ + --model-type llm \ + --device GPU.0 \ + --draft-model-path \ + --draft-device CPU \ + --num-assistant-tokens 5 \ + --assistant-confidence-threshold 0.5 + ``` + +=== "list" + + Reads added configurations from `openarc_config.json`. + + Display all added models: + ``` + openarc list + ``` + + Display config metadata for a specific model: + ``` + openarc list \ + \ + -v + ``` + + Remove a configuration: + ``` + openarc list \ + --remove + ``` + +=== "serve" + + Starts the server. + + ``` + openarc serve start # defaults to 0.0.0.0:8000 + ``` + + Configure host and port: + + ``` + openarc serve start \ + --host \ + --port + ``` + + To load models on startup: + + ``` + openarc serve start \ + --load-models model1 model2 + ``` + +=== "load" + + After using `openarc add` you can use `openarc load` to read the added configuration and load models onto the OpenArc server. + + OpenArc uses arguments from `openarc add` as metadata to make routing decisions internally; you are querying for correct inference code.
+ + ``` + openarc load + ``` + + To load multiple models at once: + + ``` + openarc load \ + \ + \ + + ``` + + Be mindful of your resources; loading models can be resource intensive! On the first load, OpenVINO performs model compilation for the target `--device`. + + When `openarc load` fails, the CLI tool displays a full stack trace to help you figure out why. + +=== "status" + + Calls `/openarc/status` endpoint and returns a report. Shows loaded models. + + ``` + openarc status + ``` + +=== "bench" + + Benchmark `llm` performance with pseudo-random input tokens. + + This approach follows [llama-bench](https://github.com/ggml-org/llama.cpp/blob/683fa6ba/tools/llama-bench/llama-bench.cpp#L1922), providing a baseline for the community to assess inference performance between `llama.cpp` backends and `openvino`. + + To support different `llm` tokenizers, we need to standardize how tokens are chosen for benchmark inference. When you set `--p` we select `512` pseudo-random tokens as input_ids from the set of all tokens in the vocabulary. + + `--n` controls the maximum amount of tokens we allow the model to generate; this bypasses `eos` and sets a hard upper limit. + + Default values are: + ``` + openarc bench \ + \ + --p <512> \ + --n <128> \ + --r <5> + ``` + + ![openarc bench](assets/openarc_bench_sample.png) + + `openarc bench` also records metrics in a sqlite database `openarc_bench.db` for easy analysis. + +=== "tool" + + Utility scripts. + + To see `openvino` properties your device supports: + + ``` + openarc tool device-props + ``` + + To see available devices: + + ``` + openarc tool device-detect + ``` + + ![device-detect](assets/cli_tool_device-detect.png) diff --git a/docs/home.md b/docs/home.md new file mode 100644 index 0000000..b9f3e4c --- /dev/null +++ b/docs/home.md @@ -0,0 +1,57 @@ +--- +icon: lucide/chess-rook +--- + + +# Start Here + +Welcome to the OpenArc documentation! These docs are under construction but cover everything in the 2.0.3 README. 
+ +[![Discord](https://img.shields.io/discord/1341627368581628004?logo=Discord&logoColor=%23ffffff&label=Discord&link=https%3A%2F%2Fdiscord.gg%2FmaMY7QjG)](https://discord.gg/Bzz9hax9Jq) +[![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Echo9Zulu-yellow)](https://huggingface.co/Echo9Zulu) +[![Devices](https://img.shields.io/badge/Devices-CPU%2FGPU%2FNPU-blue)](https://github.com/openvinotoolkit/openvino) +[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/SearchSavior/OpenArc) + + + + +## Installation + +- [Linux](install.md#linux) +- [Windows](install.md#windows) +- [Docker](install.md#docker) + +## Commands + +OpenArc includes a command line tool for controlling the server. + +- [openarc add](commands.md#add) — Add a model to the config. +- [openarc list](commands.md#list) — List models added to the config. +- [openarc serve](commands.md#serve) — Start the OpenArc server. +- [openarc load](commands.md#load) — Load a model from the config. +- [openarc status](commands.md#status) — Check loaded models. +- [openarc bench](commands.md#bench) — Benchmarking tool for LLMs. +- [openarc tool](commands.md#tool) — OpenVINO utilities. + +## Models + +Models to get you started and where to find more! + +OpenArc is deeply integrated with the Hugging Face ecosystem and has been written from the ground up to handle a ton of deployment complexity, but still demands some calories to choose what models to use. + +We are working on improving this process with experimental GGUF support coming, as well as a new frontend application similar to LM-Studio!
+ + + + +- [Model Sources](models.md#sources) +- [LLMs](models.md#llms) +- [VLMs](models.md#vlms) +- [Speech to Text](models.md#speech-to-text) + - [Whisper](models.md#whisper) + - [Qwen3-ASR](models.md#qwen3-asr) +- [Text to Speech](models.md#text-to-speech) + - [Kokoro](models.md#kokoro) + - [Qwen3-TTS](models.md#qwen3-tts) +- [Embedding](models.md#embedding) +- [Rerank](models.md#rerank) diff --git a/docs/index.md b/docs/index.md index 8ba52c9..fb0e18d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,78 +1,173 @@ -# OpenArc Documentation +--- +icon: lucide/rocket +--- -Welcome to OpenArc documentation! +# Get started -This document collects information about the codebase structure, APIs, architecture and design patterns to help you explore the codebase. +For full documentation visit [zensical.org](https://zensical.org/docs/). +## Commands -- **[Server](./server.md)** - FastAPI server documentation with endpoint details -- **[Model Registration](./model_registration.md)** - How models are registered, loaded, and managed -- **[Worker Orchestration](./worker_orchestration.md)** - Worker system architecture and request routing -- **[Inference](./inference.md)** - Inference engines, class structure, and implementation details +* [`zensical new`][new] - Create a new project +* [`zensical serve`][serve] - Start local web server +* [`zensical build`][build] - Build your site -### Architecture Overview + [new]: https://zensical.org/docs/usage/new/ + [serve]: https://zensical.org/docs/usage/preview/ + [build]: https://zensical.org/docs/usage/build/ +## Examples + +### Admonitions + +> Go to [documentation](https://zensical.org/docs/authoring/admonitions/) + +!!! note + + This is a **note** admonition. Use it to provide helpful information. + +!!! warning + + This is a **warning** admonition. Be careful! + +### Details + +> Go to [documentation](https://zensical.org/docs/authoring/admonitions/#collapsible-blocks) + +???
info "Click to expand for more info" + + This content is hidden until you click to expand it. + Great for FAQs or long explanations. + +## Code Blocks + +> Go to [documentation](https://zensical.org/docs/authoring/code-blocks/) + +``` python hl_lines="2" title="Code blocks" +def greet(name): + print(f"Hello, {name}!") # (1)! + +greet("Python") ``` -┌─────────────────┐ -│ FastAPI │ HTTP API Layer -│ Server │ (OpenAI-compatible endpoints) -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ WorkerRegistry │ Request Routing & Orchestration -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ ModelRegistry │ Model Lifecycle Management -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Inference │ Engine-specific implementations -│ Engines │ (OVGenAI, Optimum, OpenVINO) -└─────────────────┘ + +1. > Go to [documentation](https://zensical.org/docs/authoring/code-blocks/#code-annotations) + + Code annotations allow to attach notes to lines of code. + +Code can also be highlighted inline: `#!python print("Hello, Python!")`. + +## Content tabs + +> Go to [documentation](https://zensical.org/docs/authoring/content-tabs/) + +=== "Python" + + ``` python + print("Hello from Python!") + ``` + +=== "Rust" + + ``` rs + println!("Hello from Rust!"); + ``` + +## Diagrams + +> Go to [documentation](https://zensical.org/docs/authoring/diagrams/) + +``` mermaid +graph LR + A[Start] --> B{Error?}; + B -->|Yes| C[Hmm...]; + C --> D[Debug]; + D --> B; + B ---->|No| E[Yay!]; ``` -### Key Components +## Footnotes + +> Go to [documentation](https://zensical.org/docs/authoring/footnotes/) + +Here's a sentence with a footnote.[^1] + +Hover it, to see a tooltip. + +[^1]: This is the footnote. 
+ + +## Formatting + +> Go to [documentation](https://zensical.org/docs/authoring/formatting/) + +- ==This was marked (highlight)== +- ^^This was inserted (underline)^^ +- ~~This was deleted (strikethrough)~~ +- H~2~O +- A^T^A +- ++ctrl+alt+del++ + +## Icons, Emojis + +> Go to [documentation](https://zensical.org/docs/authoring/icons-emojis/) + +* :sparkles: `:sparkles:` +* :rocket: `:rocket:` +* :tada: `:tada:` +* :memo: `:memo:` +* :eyes: `:eyes:` + +## Maths -1. **Server** (`src/server/main.py`) - - FastAPI application with OpenAI-compatible endpoints - - Authentication middleware - - Request/response handling +> Go to [documentation](https://zensical.org/docs/authoring/math/) -2. **Model Registry** (`src/server/model_registry.py`) - - Model lifecycle management (load/unload) - - Status tracking - - Factory pattern for engine instantiation +$$ +\cos x=\sum_{k=0}^{\infty}\frac{(-1)^k}{(2k)!}x^{2k} +$$ -3. **Worker Registry** (`src/server/worker_registry.py`) - - Per-model worker queues - - Request routing and orchestration - - Async packet processing +!!! warning "Needs configuration" + Note that MathJax is included via a `script` tag on this page and is not + configured in the generated default configuration to avoid including it + in a pages that do not need it. See the documentation for details on how + to configure it on all your pages if they are more Maths-heavy than these + simple starter pages. -4. 
**Inference Engines** (`src/engine/`) - - **OVGenAI**: LLM, VLM, Whisper models - - **Optimum**: Embedding, Reranker models - - **OpenVINO**: Kokoro TTS models + + -- **LLM**: Text-to-text language models -- **VLM**: Vision-language models (image-to-text) -- **Whisper**: Automatic speech recognition -- **Kokoro**: Text-to-speech -- **Embedding**: Text-to-vector embeddings -- **Reranker**: Document reranking +## Task Lists -## Supported Libraries +> Go to [documentation](https://zensical.org/docs/authoring/lists/#using-task-lists) -- **OVGenAI**: OpenVINO GenAI pipeline (LLM, VLM, Whisper) -- **Optimum**: Optimum-Intel (Embedding, Reranker) -- **OpenVINO**: Native OpenVINO runtime (Kokoro TTS) +* [x] Install Zensical +* [x] Configure `zensical.toml` +* [x] Write amazing documentation +* [ ] Deploy anywhere -This project is about intel devices, so expect we may expand to other frameworks/libraries in the future. +## Tooltips +> Go to [documentation](https://zensical.org/docs/authoring/tooltips/) +[Hover me][example] + [example]: https://example.com "I'm a tooltip!" 
diff --git a/docs/inference.md b/docs/inference.md deleted file mode 100644 index 41762ae..0000000 --- a/docs/inference.md +++ /dev/null @@ -1,137 +0,0 @@ -# Inference Engines Documentation - - -OpenArc supports three inference engines, each optimized for different model types: - -- **OVGenAI**: OpenVINO GenAI pipeline (LLM, VLM, Whisper) -- **Optimum**: Optimum-Intel (Embedding, Reranker) -- **OpenVINO**: Native OpenVINO runtime (Kokoro TTS) - -## Engine Architecture - -``` -src/engine/ -├── ov_genai/ -│ ├── llm.py # OVGenAI_LLM -│ ├── vlm.py # OVGenAI_VLM -│ ├── whisper.py # OVGenAI_Whisper -│ ├── streamers.py # ChunkStreamer -│ ├── continuous_batch_llm.py -│ └── continuous_batch_vlm.py -├── optimum/ -│ ├── optimum_llm.py # Optimum_LLM -│ ├── optimum_vlm.py # Optimum_VLM -│ ├── optimum_emb.py # Optimum_EMB -│ └── optimum_rr.py # Optimum_RR -└── openvino/ - ├── kokoro.py # OV_Kokoro - └── kitten.py -``` - -## Class Hierarchy - -### OVGenAI Engine - -#### OVGenAI_LLM (`src/engine/ov_genai/llm.py`) - -Text-to-text language model using OpenVINO GenAI LLMPipeline. - -**Key Features:** -- Supports OpenAI-compatible chat message format with chat templates -- Tool calling support (tools parameter in messages) -- Streaming and non-streaming generation modes -- Multiple input formats: pre-encoded input_ids, raw prompts, and chat messages -- ChunkStreamer for batched token streaming (chunk_size > 1) -- Performance metrics collection (ttft, throughput, etc.) -- Uses AutoTokenizer for encoding, model tokenizer for decoding - -#### OVGenAI_VLM (`src/engine/ov_genai/vlm.py`) - -Vision-language model using OpenVINO GenAI VLMPipeline. 
- -**Key Features:** -- Supports OpenAI-compatible multimodal message format with embedded images -- Tool calling support (tools parameter in messages) -- Streaming and non-streaming generation modes -- Extracts base64-encoded images from OpenAI message format -- Converts images to OpenVINO tensors for inference -- Inserts model-specific vision tokens at image positions -- Supports multiple images per request with proper token indexing -- ChunkStreamer for batched token streaming (chunk_size > 1) -- Performance metrics collection (ttft, throughput, etc.) -- Uses chat templates with vision token insertion - -**Vision Token Types:** -- `internvl2`: `` -- `llava15`: `` -- `llavanext`: `` -- `minicpmv26`: `(./)` -- `phi3vision`: `<|image_{i}|>` -- `phi4mm`: `<|image_{i}|>` -- `qwen2vl`: `<|vision_start|><|image_pad|><|vision_end|>` -- `qwen25vl`: `<|vision_start|><|image_pad|><|vision_end|>` -- `gemma3`: `` - -#### OVGenAI_Whisper (`src/engine/ov_genai/whisper.py`) - -Automatic speech recognition using OpenVINO GenAI Whisper - -**Key Features:** -- Processes base64-encoded audio -- Returns transcribed text and metrics -- Non-streaming only (Whisper processes entire audio) - -#### ChunkStreamer (`src/engine/ov_genai/streamers.py`) - -Custom streamer for chunked token streaming. Uses OpenVINO tokenizer, not AutoTokenizer for decode. - -**Features:** -- Accumulates tokens into chunks -- Yields chunks when chunk_size reached -- Supports chunk_size > 1 for batched streaming - -### Optimum Engine - -#### Optimum_EMB (`src/engine/optimum/optimum_emb.py`) - -Text-to-vector embedding model using Optimum-Intel. 
- -**Key Features:** -- Uses `OVModelForFeatureExtraction` -- Implements last token pooling for embeddings -- Normalizes embeddings (L2 normalization) -- Supports flexible tokenizer configuration - -**Token Pooling:** -- Handles left-padding vs right-padding -- Extracts last non-padding token embedding -- Normalizes to unit vectors - -#### Optimum_RR (`src/engine/optimum/optimum_rr.py`) - -Document reranking model using Optimum-Intel. - -**Key Features:** -- Reranks documents based on query relevance -- Supports custom prefix/suffix/instruction -- Returns ranked document lists - -### OpenVINO Engine - -#### OV_Kokoro (`src/engine/openvino/kokoro.py`) - -Text-to-speech model using native OpenVINO runtime. - -**Key Features:** -- Processes text in chunks (character_count_chunk) -- Generates audio tensors per chunk -- Supports voice selection and language codes -- Speed control for speech generation -- Returns WAV audio format - -**Voice Support:** -- Multiple languages (English, Japanese, Chinese, Spanish, etc.) -- Multiple voices per language -- Gender-specific voices - -# \ No newline at end of file diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 0000000..c354a27 --- /dev/null +++ b/docs/install.md @@ -0,0 +1,117 @@ +--- +icon: lucide/cog +--- + + + +=== "Linux" + + 1. OpenVINO requires **device-specific drivers**. + + - Visit [OpenVINO System Requirements](https://docs.openvino.ai/2025/about-openvino/release-notes-openvino/system-requirements.html#cpu) for the latest information on drivers. + + 2. Install uv from [astral](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) + + 3. After cloning use: + + ``` + uv sync + ``` + + 4.
Activate your environment with: + + ``` + source .venv/bin/activate + ``` + + Build latest optimum + ``` + uv pip install "optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel" + ``` + + Build latest OpenVINO and OpenVINO GenAI from nightly wheels + ``` + uv pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + ``` + + 5. Set your API key as an environment variable: + ``` + export OPENARC_API_KEY=api-key + ``` + + 6. To get started, run: + + ``` + openarc --help + ``` + +=== "Windows" + + 1. OpenVINO requires **device-specific drivers**. + + - Visit [OpenVINO System Requirements](https://docs.openvino.ai/2025/about-openvino/release-notes-openvino/system-requirements.html#cpu) to get the latest information on drivers. + + 2. Install uv from [astral](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) + + 3. Clone OpenArc, enter the directory and run: + ``` + uv sync + ``` + + 4. Activate your environment with: + + ``` + .venv\Scripts\activate + ``` + + **Build latest optimum** + ``` + uv pip install "optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel" + ``` + + **Build latest OpenVINO and OpenVINO GenAI from nightly wheels** + ``` + uv pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + ``` + + 5. **Set your API key as an environment variable:** + ``` + setx OPENARC_API_KEY openarc-api-key + ``` + + 6. To get started, run: + + ``` + openarc --help + ``` + +=== "Docker" + + Instead of fighting with Intel's own docker images, we built our own which is as close to boilerplate as possible. For a primer on docker [check out this video](https://www.youtube.com/watch?v=DQdB7wFEygo).
+ + + **Build and run the container:** + ```bash + docker-compose up --build -d + ``` + + **Run the container:** + ```bash + docker run -d -p 8000:8000 openarc:latest + ``` + **Enter the container:** + ```bash + docker exec -it openarc /bin/bash + ``` + + ## Environment Variables + + ```bash + export OPENARC_API_KEY="openarc-api-key" # default, set it to whatever you want + export OPENARC_AUTOLOAD_MODEL="model_name" # model_name to load on startup + export MODEL_PATH="/path/to/your/models" # mount your models to `/models` inside the container + docker-compose up --build -d + ``` + + + Take a look at the [Dockerfile](Dockerfile) and [docker-compose](docker-compose.yaml) for more details. \ No newline at end of file diff --git a/docs/model_registration.md b/docs/model_registration.md deleted file mode 100644 index a001bc7..0000000 --- a/docs/model_registration.md +++ /dev/null @@ -1,101 +0,0 @@ -# Model Registration Documentation - -This document describes the model registration system, lifecycle management, and architectural patterns. - -## Overview - -The Model Registry (`src/server/model_registry.py`) manages the lifecycle of all models in OpenArc using a registry pattern with async background loading and a factory pattern for engine instantiation. - -## Architecture Patterns - -### Registry Pattern - -The `ModelRegistry` maintains a central dictionary of all loaded models, tracking their status and lifecycle state. It is a volatile in memory datastore used internally. - -**Key Components:** -- **ModelRecord**: Tracks model state (LOADING, LOADED, FAILED) -- **Async Lock**: Ensures thread-safe concurrent access -- **Event System**: Callbacks for lifecycle events - -### Factory Pattern - -Models are instantiated via a factory that maps `(engine, model_type)` tuples to concrete engine classes: - -The factory dynamically imports and instantiates the appropriate class based on configuration. 
- -### Event System - -The registry fires events when models are loaded or unloaded, allowing other components (like `WorkerRegistry`) to react: - -```python -# Subscribe to events -registry.add_on_loaded(on_model_loaded) -registry.add_on_unloaded(on_model_unloaded) -``` - -## Model Lifecycle - -``` -┌─────────────┐ -│ REQUEST │ -│ LOAD MODEL │ -└──────┬──────┘ - │ - ▼ -┌─────────────┐ -│ CREATE │ -│ MODEL RECORD│ -│ (LOADING) │ -└──────┬──────┘ - │ - ▼ -┌─────────────┐ -│ SPAWN │ -│ LOAD TASK │ -└──────┬──────┘ - │ - ▼ -┌─────────────┐ -│ FACTORY │ -│ INSTANTIATE │ -└──────┬──────┘ - │ - ▼ -┌─────────────┐ -│ UPDATE │ -│ STATUS TO │ -│ LOADED │ -└──────┬──────┘ - │ - ▼ -┌─────────────┐ -│ FIRE │ -│ CALLBACKS │ -└─────────────┘ -``` - -## Key Classes - -### ModelLoadConfig - -Pydantic model defining model configuration. - -### ModelRecord - -Dataclass tracking a registered model's state, instance, and metadata. Distinguishes between private (internal) and public (API-exposed) fields. - -### ModelRegistry - -Central registry implementing: -- **Async Loading**: Background tasks for model loading/unloading -- **Status Tracking**: LOADING → LOADED → FAILED states -- **Factory Integration**: Delegates instantiation to factory -- **Event Notifications**: Fires callbacks on lifecycle changes - -## Thread Safety - -All registry operations are protected by `asyncio.Lock` for thread-safe concurrent access. The registry maintains separate private model IDs while exposing public model names for API access. - -## Integration - -The `WorkerRegistry` subscribes to model lifecycle events to automatically spawn workers when models load and clean up when they unload. 
diff --git a/docs/models.md b/docs/models.md new file mode 100644 index 0000000..a1233ff --- /dev/null +++ b/docs/models.md @@ -0,0 +1,94 @@ +--- +icon: lucide/brain +--- + +## Sources + + +There are a few sources of preconverted models which can be used with OpenArc: + +- [OpenVINO on HuggingFace](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd) + +- [My HuggingFace repo](https://huggingface.co/Echo9Zulu) + +- [LLMs optimized for NPU](https://huggingface.co/collections/OpenVINO/llms-optimized-for-npu-686e7f0bf7bc184bd71f8ba0) + + +If you need help converting a particular model, join Discord and we can help you! + + +=== "LLMs" + + | **Models** | + | --- | + | [Qwen3-1.7B-int8_asym-ov](https://huggingface.co/Echo9Zulu/Qwen3-1.7B-int8_asym-ov/tree/main) | + | [Qwen3-4B-Instruct-2507-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Qwen3-4B-Instruct-2507-int4_asym-awq-ov) | + | [Satyr-V0.1-4B-HF-int4_awq-ov](https://huggingface.co/Gapeleon/Satyr-V0.1-4B-HF-int4_awq-ov?not-for-all-audiences=true) | + | [Dolphin-X1-8B-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Dolphin-X1-8B-int4_asym-awq-ov) | + | [Qwen3-8B-ShiningValiant3-int4-asym-ov](https://huggingface.co/Echo9Zulu/Qwen3-8B-ShiningValiant3-int4-asym-ov) | + | [Qwen3-14B-int4_sym-ov](https://huggingface.co/Echo9Zulu/Qwen3-14B-int4_sym-ov/tree/main) | + | [Cydonia-24B-v4.2.0-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Cydonia-24B-v4.2.0-int4_asym-awq-ov) | + | [Qwen2.5-Microsoft-NextCoder-Soar-Instruct-FUSED-CODER-Fast-11B-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Qwen2.5-Microsoft-NextCoder-Soar-Instruct-FUSED-CODER-Fast-11B-int4_asym-awq-ov) | + | [Magistral-Small-2509-Text-Only-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Magistral-Small-2509-Text-Only-int4_asym-awq-ov) | + | [Hermes-4-70B-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/Hermes-4-70B-int4_asym-awq-ov) | + | 
[Qwen2.5-Coder-32B-Instruct-int4_sym-awq-ov](https://huggingface.co/Echo9Zulu/Qwen2.5-Coder-32B-Instruct-int4_sym-awq-ov) | + | [Qwen3-32B-Instruct-int4_sym-awq-ov](https://huggingface.co/Echo9Zulu/Qwen3-32B-Instruct-int4_sym-awq-ov) | + | [Big-Tiger-Gemma-27B-v3-int4-asym-ov](https://huggingface.co/DudePls/Big-Tiger-Gemma-27B-v3-int4-asym-ov) | + | [Nanbeige4.1-3B-openvino](https://huggingface.co/DudePls/Nanbeige4.1-3B-openvino) | + | [Cydonia-24B-v4.3-OpenVINO-INT4](https://huggingface.co/DudePls/Cydonia-24B-v4.3-OpenVINO-INT4) | + | [Nemotron-Cascade-14B-Thinking-int4_asym-se-ov](https://huggingface.co/Echo9Zulu/Nemotron-Cascade-14B-Thinking-int4_asym-se-ov) | + | [NousCoder-14B-int4_sym-ov](https://huggingface.co/Echo9Zulu/NousCoder-14B-int4_sym-ov) | + +=== "VLMs" + + | **Models** | + | --- | + | [gemma-3-4b-it-int8_asym-ov](https://huggingface.co/Echo9Zulu/gemma-3-4b-it-int8_asym-ov) | + | [Gemma-3-12b-it-qat-int4_asym-ov](https://huggingface.co/Echo9Zulu/Gemma-3-12b-it-qat-int4_asym-ov) | + | [Qwen2.5-VL-7B-Instruct-int4_sym-ov](https://huggingface.co/Echo9Zulu/Qwen2.5-VL-7B-Instruct-int4_sym-ov/tree/main) | + | [Nanonets-OCR2-3B-LM-INT4_ASYM-VE-FP16-ov](https://huggingface.co/Echo9Zulu/Nanonets-OCR2-3B-LM-INT4_ASYM-VE-FP16-ov) | + +=== "ASR" + + **Whisper** + + | **Models** | + | --- | + | [distil-whisper-large-v3-int8-ov](https://huggingface.co/OpenVINO/distil-whisper-large-v3-int8-ov) | + | [distil-whisper-large-v3-fp16-ov](https://huggingface.co/OpenVINO/distil-whisper-large-v3-fp16-ov) | + | [whisper-large-v3-int8-ov](https://huggingface.co/OpenVINO/whisper-large-v3-int8-ov/tree/main) | + | [openai-whisper-large-v3-fp16-ov](https://huggingface.co/OpenVINO/openai-whisper-large-v3-fp16-ov/tree/main) | + + **Qwen3-ASR** + + | **Models** | + | --- | + | [Qwen3-ASR-0.6B-INT8_ASYM-OpenVINO](https://huggingface.co/Echo9Zulu/Qwen3-ASR-0.6B-INT8_ASYM-OpenVINO) | + +=== "TTS" + + **Kokoro** + + | **Models** | + | --- | + | 
[Kokoro-82M-FP16-OpenVINO](https://huggingface.co/Echo9Zulu/Kokoro-82M-FP16-OpenVINO) | + + **Qwen3-TTS** + + | **Models** | + | --- | + | [Qwen3-TTS-12Hz-CustomVoice-1.7B-INT8-OpenVINO](https://huggingface.co/Echo9Zulu/Qwen3-TTS-12Hz-CustomVoice-1.7B-INT8-OpenVINO) | + | [Qwen3-TTS-12Hz-VoiceDesign-1.7B-INT8-OpenVINO](https://huggingface.co/Echo9Zulu/Qwen3-TTS-12Hz-VoiceDesign-1.7B-INT8-OpenVINO) | + | [Qwen3-TTS-12Hz-Base-1.7B-INT8-OpenVINO](https://huggingface.co/Echo9Zulu/Qwen3-TTS-12Hz-Base-1.7B-INT8-OpenVINO) | + +=== "Embedding" + + | **Models** | + | --- | + | [Qwen3-Embedding-0.6B-int8_asym-ov](https://huggingface.co/Echo9Zulu/Qwen3-Embedding-0.6B-int8_asym-ov) | + +=== "Rerank" + + | **Models** | + | --- | + | [Qwen3-Reranker-0.6B-fp16-ov](https://huggingface.co/OpenVINO/Qwen3-Reranker-0.6B-fp16-ov) | \ No newline at end of file diff --git a/docs/openvino_ir.md b/docs/openvino_ir.md deleted file mode 100644 index e953e4e..0000000 --- a/docs/openvino_ir.md +++ /dev/null @@ -1,6 +0,0 @@ -### OpenVINO Model Format: Intermediate Representation - - -[OpenVINO Intermediate Representations](https://docs.openvino.ai/2025/documentation/openvino-ir-format.html) describe a set of standarsization techniques to format the operations of a neural network into a computational graph topology that a compiler can understand, stored in `openvino_model.bin` and `openvino_model.xml`. - -`openvino_model.xml` nodes represent [`opsets`](https://docs.openvino.ai/2025/documentation/openvino-ir-format/operation-sets.html#overview-of-artificial-neural-networks-representation) while edges represent data flow through the network a given IR describes. Together, these help OpenVINO's device plugin system determine what opsets are required vs which are *implemented* for a target device. 
diff --git a/docs/server.md b/docs/server.md deleted file mode 100644 index 9f6bd0a..0000000 --- a/docs/server.md +++ /dev/null @@ -1,588 +0,0 @@ -# OpenArc Server Documentation - -This document describes the FastAPI server implementation, endpoints, and API structure. - -## Table of Contents - -- [Overview](#overview) -- [Server Architecture](#server-architecture) - - [Key Components](#key-components) -- [Authentication](#authentication) -- [CORS Configuration](#cors-configuration) -- [Endpoints](#endpoints) - - [OpenArc Internal Endpoints](#openarc-internal-endpoints) - - [`POST /openarc/load`](#post-openarcload) - - [`POST /openarc/unload`](#post-openarcunload) - - [`GET /openarc/status`](#get-openarcstatus) - - [`POST /openarc/bench`](#post-openarcbench) - - [OpenAI-Compatible Endpoints](#openai-compatible-endpoints) - - [`GET /v1/models`](#get-v1models) - - [`POST /v1/chat/completions`](#post-v1chatcompletions) - - [`POST /v1/completions`](#post-v1completions) - - [`POST /v1/audio/transcriptions`](#post-v1audiotranscriptions) - - [`POST /v1/audio/speech`](#post-v1audiospeech) - - [`POST /v1/embeddings`](#post-v1embeddings) - - [`POST /v1/rerank`](#post-v1rerank) -- [Request Models](#request-models) - - [OpenAIChatCompletionRequest](#openaichatcompletionrequest) - - [OpenAICompletionRequest](#openaicompletionrequest) - - [OpenAIWhisperRequest](#openaiwhisperrequest) - - [OpenAIKokoroRequest](#openaikokororequest) - - [EmbeddingsRequest](#embeddingsrequest) - - [RerankRequest](#rerankrequest) -- [Tool Calling Support](#tool-calling-support) - - [Parser Implementation](#parser-implementation) -- [Metrics](#metrics) -- [Startup Models](#startup-models) - -## Overview - -The OpenArc server is built with FastAPI and provides OpenAI-compatible endpoints for inference. The server is located in `src/server/main.py`. 
- -## Server Architecture - - - -### Key Components - -- **FastAPI Application**: Main application instance with lifespan events -- **Model Registry**: Manages model lifecycle (load/unload) -- **Worker Registry**: Routes requests to appropriate workers -- **Authentication**: Bearer token authentication via `OPENARC_API_KEY` - -## Authentication - -All endpoints require authentication via Bearer token: - -```python -Authorization: Bearer -``` - -The API key is configured via the `OPENARC_API_KEY` environment variable. - -## Endpoints - -### OpenArc Internal Endpoints - -#### `POST /openarc/load` - -Load a model onto the server. - -**Request Body:** -```json -{ - "model_path": "/path/to/model", - "model_name": "my-model", - "model_type": "llm", - "engine": "ovgenai", - "device": "GPU.0", - "runtime_config": {}, - "vlm_type": null -} -``` - -**Response:** -```json -{ - "model_id": "unique-model-id", - "model_name": "my-model", - "status": "loaded" -} -``` - -**Status Codes:** -- `200`: Model loaded successfully -- `400`: Invalid request (e.g., model name already exists) -- `500`: Loading failed - -#### `POST /openarc/unload` - -Unload a model from the server. - -**Request Body:** -```json -{ - "model_name": "my-model" -} -``` - -**Response:** -```json -{ - "model_name": "my-model", - "status": "unloading" -} -``` - -**Status Codes:** -- `200`: Unload initiated -- `404`: Model not found -- `500`: Unload failed - -#### `GET /openarc/status` - -Get status of all loaded models. - -**Response:** -```json -{ - "total_loaded_models": 2, - "models": [ - { - "model_name": "my-model", - "model_type": "llm", - "engine": "ovgenai", - "device": "GPU.0", - "runtime_config": {}, - "status": "loaded", - "time_loaded": "2024-01-01T00:00:00" - } - ], - "openai_model_names": ["my-model"] -} -``` - -#### `POST /openarc/bench` - -Benchmark model performance with pre-encoded input IDs. 
- -**Request Body:** -```json -{ - "model": "my-model", - "input_ids": [1, 2, 3, ...], - "max_tokens": 512, - "temperature": 1.0, - "top_p": 1.0, - "top_k": 50, - "repetition_penalty": 1.0 -} -``` - -**Response:** -```json -{ - "metrics": { - "ttft": 0.123, - "prefill_throughput": 100.5, - "decode_throughput": 50.2, - "decode_duration": 2.5, - "tpot": 0.025, - "input_token": 512, - "new_token": 128, - "total_token": 640 - } -} -``` - -### OpenAI-Compatible Endpoints - -#### `GET /v1/models` - -List all available models. - -**Response:** -```json -{ - "object": "list", - "data": [ - { - "id": "my-model", - "object": "model", - "created": 1704067200, - "owned_by": "OpenArc" - } - ] -} -``` - -#### `POST /v1/chat/completions` - -Chat completions endpoint for LLM and VLM models. - -**Request Body:** -```json -{ - "model": "my-model", - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Hello!"} - ], - "tools": [], - "stream": false, - "temperature": 1.0, - "max_tokens": 512, - "top_p": 1.0, - "top_k": 50, - "repetition_penalty": 1.0 -} -``` - -**Response (non-streaming):** -```json -{ - "id": "ov-abc123...", - "object": "chat.completion", - "created": 1704067200, - "model": "my-model", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": "Hello! How can I help you?" - }, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 10, - "completion_tokens": 8, - "total_tokens": 18 - }, - "metrics": { - "ttft": 0.123, - "prefill_throughput": 100.5, - "decode_throughput": 50.2 - } -} -``` - -**Streaming Response:** -Server-Sent Events (SSE) format: -``` -data: {"id": "ov-abc123...", "object": "chat.completion.chunk", ...} -data: {"id": "ov-abc123...", "object": "chat.completion.chunk", ...} -data: [DONE] -``` - -#### `POST /v1/completions` - -Text completions endpoint for LLM models (legacy endpoint). 
- -**Request Body:** -```json -{ - "model": "my-model", - "prompt": "The capital of France is", - "stream": false, - "temperature": 1.0, - "max_tokens": 512 -} -``` - -**Response:** -```json -{ - "id": "ov-abc123...", - "object": "text_completion", - "created": 1704067200, - "model": "my-model", - "choices": [ - { - "index": 0, - "text": " Paris.", - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 5, - "completion_tokens": 2, - "total_tokens": 7 - } -} -``` - -#### `POST /v1/audio/transcriptions` - -Transcribe audio using Whisper models. - -**Request Body:** -```json -{ - "model": "whisper-model", - "audio_base64": "base64-encoded-audio-data" -} -``` - -**Response:** -```json -{ - "text": "Transcribed text here", - "metrics": { - "input_token": 100, - "new_token": 50, - "total_token": 150 - } -} -``` - -#### `POST /v1/audio/speech` - -Generate speech using Kokoro TTS models. - -**Request Body:** -```json -{ - "model": "kokoro-model", - "input": "Hello, world!", - "voice": "af_heart", - "speed": 1.0, - "language": "a", - "response_format": "wav" -} -``` - -**Response:** -Returns WAV audio file as binary stream with `Content-Type: audio/wav`. - -#### `POST /v1/embeddings` - -Generate text embeddings. - -**Request Body:** -```json -{ - "model": "embedding-model", - "input": "Text to embed", - "dimensions": null, - "encoding_format": "float", - "config": { - "max_length": 512, - "padding": true, - "truncation": true - } -} -``` - -**Response:** -```json -{ - "id": "ov-abc123...", - "object": "list", - "created": 1704067200, - "model": "embedding-model", - "data": [ - { - "index": 0, - "object": "embedding", - "embedding": [0.1, 0.2, ...] - } - ], - "usage": { - "prompt_tokens": 5, - "total_tokens": 5 - } -} -``` - -#### `POST /v1/rerank` - -Rerank documents based on a query. 
- -**Request Body:** -```json -{ - "model": "reranker-model", - "query": "search query", - "documents": ["doc1", "doc2", "doc3"], - "prefix": "<|im_start|>system\n...", - "suffix": "<|im_end|>\n...", - "instruction": "Given a search query..." -} -``` - -**Response:** -```json -{ - "id": "ov-abc123...", - "object": "list", - "created": 1704067200, - "model": "reranker-model", - "data": [ - { - "index": 0, - "object": "ranked_documents", - "ranked_documents": ["doc2", "doc1", "doc3"] - } - ], - "usage": { - "prompt_tokens": 50, - "total_tokens": 50 - } -} -``` - -## Request Models - -### OpenAIChatCompletionRequest -- `model`: str -- `messages`: List[Dict] -- `tools`: Optional[List[Dict]] -- `stream`: Optional[bool] -- `temperature`: Optional[float] -- `max_tokens`: Optional[int] -- `stop`: Optional[List[str]] -- `top_p`: Optional[float] -- `top_k`: Optional[int] -- `repetition_penalty`: Optional[float] -- `do_sample`: Optional[bool] -- `num_return_sequences`: Optional[int] - -### OpenAICompletionRequest -- `model`: str -- `prompt`: Union[str, List[str]] -- `stream`: Optional[bool] -- `temperature`: Optional[float] -- `max_tokens`: Optional[int] -- `stop`: Optional[List[str]] -- `top_p`: Optional[float] -- `top_k`: Optional[int] -- `repetition_penalty`: Optional[float] -- `do_sample`: Optional[bool] -- `num_return_sequences`: Optional[int] - -### OpenAIWhisperRequest -- `model`: str -- `audio_base64`: str - -### OpenAIKokoroRequest -- `model`: str -- `input`: str -- `voice`: Optional[str] -- `speed`: Optional[float] -- `language`: Optional[str] -- `response_format`: Optional[str] - -### EmbeddingsRequest -- `model`: str -- `input`: Union[str, List[str], List[List[str]]] -- `dimensions`: Optional[int] -- `encoding_format`: Optional[str] -- `user`: Optional[str] -- `config`: Optional[PreTrainedTokenizerConfig] - -### RerankRequest -- `model`: str -- `query`: str -- `documents`: List[str] -- `prefix`: Optional[str] -- `suffix`: Optional[str] -- `instruction`: 
Optional[str] - -## Tool Calling Support - -OpenArc supports OpenAI-compatible tool calling. Tools are parsed from model output using Hermes-style `...` tags containing JSON with `name` and `arguments` fields. - -Tool calls are detected in streaming and non-streaming modes: -- **Streaming**: Tool calls are detected incrementally and streamed as structured chunks -- **Non-streaming**: Tool calls are parsed from the final output - -### Parser Implementation - -The `parse_tool_calls()` function extracts payloads from `...` tags in the model's text output and converts them to OpenAI-compatible tool call format. - -**Input Format (Model Output):** - -The parser expects Hermes-style tagged payloads with the following structure: - -```json -{"name":"function_name","arguments":{"arg1":"value1","arg2":"value2"}} -``` - -**Input to the parser from a model:** - -``` -The user wants to know the weather. {"name":"get_weather","arguments":{"location":"San Francisco","units":"celsius"}} I'll check that for you. 
-``` - -**Output Format (OpenAI-Compatible):** - -Parser returns a list of tool call objects in OpenAI format: - -```json -[ - { - "id": "call_abc123def456...", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{\"location\": \"San Francisco\", \"units\": \"celsius\"}" - } - } -] -``` - -**Parser Behavior:** - -- Extracts JSON payloads from `...` tags -- Supports an EOS fallback when a `` start tag appears without a closing `` -- Validates that each payload contains both `name` and `arguments` fields -- Generates unique IDs in format `call_{24-char-hex}` -- Converts `arguments` to JSON string (required by OpenAI format) -- Returns `None` if no valid tool calls are found - -**Example Response (Non-Streaming):** - -When tool calls are detected, the response includes: - -```json -{ - "id": "ov-abc123...", - "object": "chat.completion", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": null, - "tool_calls": [ - { - "id": "call_abc123def456...", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{\"location\": \"San Francisco\", \"units\": \"celsius\"}" - } - } - ] - }, - "finish_reason": "tool_calls" - } - ] -} -``` - -**Example Response (Streaming):** - -Tool calls are streamed as structured chunks: - -``` -data: {"id": "ov-abc123...", "object": "chat.completion.chunk", "choices": [{"index": 0, "delta": {"tool_calls": [{"index": 0, "id": "call_abc123...", "type": "function", "function": {"name": "get_weather", "arguments": ""}}]}}]} -data: {"id": "ov-abc123...", "object": "chat.completion.chunk", "choices": [{"index": 0, "delta": {"tool_calls": [{"index": 0, "function": {"arguments": "{\"location\": \"San Francisco\"}"}}]}}]} -data: {"id": "ov-abc123...", "object": "chat.completion.chunk", "choices": [{"index": 0, "delta": {}, "finish_reason": "tool_calls"}]} -data: [DONE] -``` - -## Metrics - -All inference endpoints return performance metrics: -- `ttft`: Time to first token -- 
`prefill_throughput`: Prefill tokens per second -- `decode_throughput`: Decode tokens per second -- `decode_duration`: Total decode duration -- `tpot`: Time per output token -- `input_token`: Number of input tokens -- `new_token`: Number of generated tokens -- `total_token`: Total tokens processed - -## Startup Models - -Models can be automatically loaded on server startup via the `OPENARC_STARTUP_MODELS` environment variable: - -```bash -export OPENARC_STARTUP_MODELS="model1,model2,model3" -``` - -The server will read `openarc_config.json` and load the specified models during startup. - diff --git a/docs/worker_orchestration.md b/docs/worker_orchestration.md deleted file mode 100644 index e60e10a..0000000 --- a/docs/worker_orchestration.md +++ /dev/null @@ -1,59 +0,0 @@ -# Worker Orchestration Documentation - -This document describes the worker system architecture, request routing, and how inference requests are processed. - -## Architecture - -``` -Request → WorkerRegistry → Model Queue → Queue Worker → InferWorker → Model Instance -``` - -## WorkerPacket - -Dataclass representing an inference request packet flowing through the system. - - -## Error Handling - -### Inference Failures - -If inference fails (exception caught in InferWorker): -1. Error stored in `packet.response` as `"Error: ..."` -2. Metrics set to None -3. QueueWorker detects error response -4. Triggers model unload via `registry.register_unload()` -5. Worker loop exits -6. 
Server remains unblocked and no workers stall - -## Thread Safety - -- Queues are thread-safe (`asyncio.Queue`) -- WorkerRegistry uses `asyncio.Lock` for queue/task dictionary access -- Each model has its own queue and worker, ensuring isolation - -## Concurrency Model - -- **Per-Model Workers**: Each loaded model has its own dedicated worker -- **Async Queues**: Requests are queued and processed asynchronously -- **Parallel Processing**: Multiple models can process requests concurrently -- **Streaming Support**: Streaming uses separate queue mechanism - -## Design Patterns - -### Queue-Based Processing -- Decouples request submission from execution -- Enables backpressure handling -- Supports multiple concurrent requests per model - -### Worker Pattern -- Dedicated worker per model -- Long-running async loops -- Clean shutdown via None sentinel - -### Future-Based Communication -- Non-streaming uses `asyncio.Future` for result communication -- Enables async/await pattern - -### Queue-Based Streaming -- Streaming uses `asyncio.Queue` for token delivery -- Enables async iteration pattern diff --git a/examples/openvino_genai/ov_genai_AutoTokenizer.py b/examples/openvino_genai/ov_genai_AutoTokenizer.py index e6c53f3..b2f2f0a 100644 --- a/examples/openvino_genai/ov_genai_AutoTokenizer.py +++ b/examples/openvino_genai/ov_genai_AutoTokenizer.py @@ -2,7 +2,7 @@ from openvino_genai import GenerationConfig, LLMPipeline from transformers import AutoTokenizer -model_dir = "/mnt/Ironwolf-4TB/Models/OpenVINO/DeepSeek-V2-Lite-Chat-int4_asym-ov" +model_dir = "/mnt/Ironwolf-4TB/Models/OpenVINO/gpt-oss-nano-int4_sym-ov" pipe = LLMPipeline( model_dir, # Path to the model directory. Remember this will not pull from hub like in transformers @@ -12,14 +12,14 @@ tokenizer = AutoTokenizer.from_pretrained(model_dir) generation_config = GenerationConfig( - max_new_tokens=24 + max_new_tokens=2048 ) -prompt = "You're the fastest Llama this side of the equator. What's your favorite food? 
try to imagine" +prompt = "You're the fastest Llama this side of the equator. What's your favorite food? try to imagine a convincing answer." messages = [{"role": "user", "content": prompt}] # Build proper chat prompt for Qwen-style instruct models and get prompt_token_ids directly -prompt_token_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="np") +prompt_token_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="np", reasoning_effort="low") num_iterations = 3 # Number of generations to run diff --git a/pyproject.toml b/pyproject.toml index d730e87..63808fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "torch>2.6.0", "torchvision>=0.23.0", "uvicorn>=0.35.0", + "zensical>=0.0.31", ] [build-system] diff --git a/src/cli/groups/add.py b/src/cli/groups/add.py index c7ee458..9ed7cc3 100644 --- a/src/cli/groups/add.py +++ b/src/cli/groups/add.py @@ -21,9 +21,13 @@ required=True, help='Engine used to load the model (ovgenai, openvino, optimum)') @click.option('--model-type', '--mt', - type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb', 'rerank']), + type=click.Choice([ + 'llm', 'vlm', 'whisper', 'qwen3_asr', 'kokoro', + 'qwen3_tts_custom_voice', 'qwen3_tts_voice_design', 'qwen3_tts_voice_clone', + 'emb', 'rerank', + ]), required=True, - help='Model type (llm, vlm, whisper, kokoro, emb, rerank)') + help='Model type (llm, vlm, whisper, qwen3_asr, kokoro, qwen3_tts_custom_voice, qwen3_tts_voice_design, qwen3_tts_voice_clone, emb, rerank)') @click.option('--device', '--d', required=True, help='Device(s) to load the model on.') diff --git a/src/cli/groups/bench.py b/src/cli/groups/bench.py index e9e612d..013a915 100644 --- a/src/cli/groups/bench.py +++ b/src/cli/groups/bench.py @@ -21,6 +21,8 @@ help='Number of tokens to generate. Can be comma-separated or specified multiple times. 
Default: 128') @click.option('--runs', '--r', default=5, type=int, help='Number of times to repeat each benchmark. Default: 5') +@click.option('--depth', '-d', default=0, type=int, + help='Random vocab tokens prepended as synthetic prior context before the p-token segment. Total prompt length is d+p. Default: 0') @click.option('--temperature', '--temp', default=None, type=float, help='Sampling temperature (default: 1.0)') @click.option('--top-k', '--k', default=None, type=int, @@ -30,7 +32,7 @@ @click.option('--repetition-penalty', '--rep', default=None, type=float, help='Repetition penalty (default: 1.0)') @click.pass_context -def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, top_p, repetition_penalty): +def bench(ctx, model_name, input_tokens, max_tokens, runs, depth, temperature, top_k, top_p, repetition_penalty): """- Benchmark inference with pseudo-random input tokens. Examples: @@ -38,7 +40,12 @@ def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, t openarc bench Dolphin-X1 --p 512 --n 128 -r 10 openarc bench Dolphin-X1 --p 16,32,64 --n 128,256 openarc bench Dolphin-X1 -p 16 -p 32 -n 128 -n 256 + openarc bench Dolphin-X1 -d 2048 --p 512 --n 128 """ + if depth < 0: + console.print("[red]depth (-d) must be >= 0[/red]") + ctx.exit(1) + from ..modules.benchmark import OpenArcBenchmarks from ..main import OpenArcCLI @@ -96,6 +103,7 @@ def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, t ctx.exit(1) # Run benchmarks + console.print(f"depth (prior): {depth}") console.print(f"input tokens: {p_values}") console.print(f"max tokens: {n_values}") console.print(f"runs: {runs}\n") @@ -118,11 +126,17 @@ def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, t for n in n_values: for r in range(runs): run_count += 1 - progress.update(task, description=f"[dim]benching...[/dim] ({run_count}/{total_runs}) [p={p}, n={n}, r={r+1}/{runs}]") + progress.update( + task, + 
description=( + f"[dim]benching...[/dim] ({run_count}/{total_runs}) " + f"[d={depth}, p={p}, n={n}, r={r+1}/{runs}]" + ), + ) try: - # Generate random input tokens - input_ids = OpenArcBenchmarks.random_input_ids(model_path, p) + # Prior context (d) + swept prompt segment (p) + input_ids = OpenArcBenchmarks.random_input_ids(model_path, p, depth=depth) # Make benchmark request bench_url = f"{cli_instance.base_url}/openarc/bench" @@ -157,6 +171,7 @@ def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, t # Store individual result result = { + 'd': depth, 'p': p, 'n': n, 'run': r + 1, @@ -196,6 +211,7 @@ def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, t # Create results table with visible lines results_table = Table(show_header=True, header_style="bold") results_table.add_column("[cyan]run[/cyan]", justify="right") + results_table.add_column("[cyan]d[/cyan]", justify="right") results_table.add_column("[cyan]p[/cyan]", justify="right") results_table.add_column("[cyan]n[/cyan]", justify="right") results_table.add_column("[cyan]ttft(s)[/cyan]", justify="right") @@ -208,6 +224,7 @@ def bench(ctx, model_name, input_tokens, max_tokens, runs, temperature, top_k, t for result in results: results_table.add_row( str(result['run']), + str(result['d']), str(result['p']), str(result['n']), f"{result['ttft']:.2f}", diff --git a/src/cli/modules/benchmark.py b/src/cli/modules/benchmark.py index 566626d..6f8798d 100644 --- a/src/cli/modules/benchmark.py +++ b/src/cli/modules/benchmark.py @@ -35,6 +35,7 @@ def init_database(self) -> None: run_id TEXT NOT NULL, timestamp TEXT NOT NULL, model_name TEXT NOT NULL, + depth_tokens INTEGER NOT NULL DEFAULT 0, input_tokens INTEGER NOT NULL, max_tokens INTEGER NOT NULL, run_number INTEGER NOT NULL, @@ -51,7 +52,20 @@ def init_database(self) -> None: conn.commit() conn.close() - + self._ensure_depth_column() + + def _ensure_depth_column(self) -> None: + conn = sqlite3.connect(self.db_file) 
+ cursor = conn.cursor() + cursor.execute("PRAGMA table_info(benchmark_results)") + cols = {row[1] for row in cursor.fetchall()} + if "depth_tokens" not in cols: + cursor.execute( + "ALTER TABLE benchmark_results ADD COLUMN depth_tokens INTEGER NOT NULL DEFAULT 0" + ) + conn.commit() + conn.close() + def save_result(self, model_name: str, result: Dict[str, Any], run_id: str) -> None: """ Save a single benchmark result to the database. @@ -59,7 +73,7 @@ def save_result(self, model_name: str, result: Dict[str, Any], run_id: str) -> N Args: model_name: Name of the model being benchmarked. result: Dictionary containing benchmark results with keys: - 'p', 'n', 'run', 'ttft', 'tpot', 'prefill_throughput', + 'd', 'p', 'n', 'run', 'ttft', 'tpot', 'prefill_throughput', 'decode_throughput', 'decode_duration', 'input_token', 'new_token', 'total_token' run_id: Unique identifier for the benchmark run. @@ -69,14 +83,15 @@ def save_result(self, model_name: str, result: Dict[str, Any], run_id: str) -> N cursor.execute(""" INSERT INTO benchmark_results ( - run_id, timestamp, model_name, input_tokens, max_tokens, run_number, + run_id, timestamp, model_name, depth_tokens, input_tokens, max_tokens, run_number, ttft_s, tpot_ms, prefill_throughput_tokens_s, decode_throughput_tokens_s, decode_duration_s, input_token_count, new_token_count, total_token_count - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( run_id, datetime.now().isoformat(), model_name, + int(result.get("d", 0)), result['p'], result['n'], result['run'], @@ -98,29 +113,33 @@ class OpenArcBenchmarks: """Utilities for OpenArc benchmarking operations.""" @staticmethod - def random_input_ids(model_path: str, num_tokens: int) -> list: + def random_input_ids(model_path: str, num_tokens: int, *, depth: int = 0) -> list: """ Generate random input tokens for benchmarking. Follows llama.cpp approach. 
https://github.com/ggml-org/llama.cpp/blob/683fa6ba/tools/llama-bench/llama-bench.cpp#L1922 + + When ``depth`` > 0, that many tokens are sampled first as synthetic prior + context; ``num_tokens`` additional tokens follow (the swept prompt segment). Args: model_path: Path to the model. - num_tokens: Number of tokens to generate. + num_tokens: Number of prompt tokens after the optional prefix. + depth: Random vocab tokens prepended as fake prior context (default 0). Returns: - List of random token IDs. + List of random token IDs of length ``depth + num_tokens``. """ tokenizer = AutoTokenizer.from_pretrained(model_path) vocab_size = len(tokenizer) special_token_ids = set(tokenizer.all_special_ids) valid_token_ids = [i for i in range(vocab_size) if i not in special_token_ids] - - # Generate random tokens (not repeated) - input_ids = [random.choice(valid_token_ids) for _ in range(num_tokens)] - - return input_ids + + def sample(n: int) -> list: + return [random.choice(valid_token_ids) for _ in range(n)] + + return sample(depth) + sample(num_tokens) # Example usage: diff --git a/src/engine/openvino/kokoro.py b/src/engine/openvino/kokoro.py index 076588c..2b3b06f 100644 --- a/src/engine/openvino/kokoro.py +++ b/src/engine/openvino/kokoro.py @@ -20,7 +20,7 @@ from src.server.model_registry import ModelRegistry from src.server.models.registration import ModelLoadConfig -from src.server.models.openvino import KokoroLanguage, KokoroVoice, OV_KokoroGenConfig +from src.server.models.openvino import OV_KokoroGenConfig class StreamChunk(NamedTuple): @@ -132,7 +132,7 @@ async def chunk_forward_pass( from kokoro.pipeline import KPipeline pipeline = KPipeline(model=self, lang_code=config.lang_code.value) - text_chunks = self.make_chunks(config.kokoro_message, config.character_count_chunk) + text_chunks = self.make_chunks(config.input, config.character_count_chunk) total_chunks = len(text_chunks) for idx, chunk_text in enumerate(text_chunks): @@ -153,84 +153,3 @@ def 
infer_on_chunk(): chunk_index=idx, total_chunks=total_chunks, ) - -async def demo_entrypoint(): - """ - Demo entrypoint: Load OV_Kokoro model, generate speech, save to WAV, and unload. - """ - import sys - - # Add the project root to Python path for imports - project_root = Path(__file__).parent.parent.parent - sys.path.insert(0, str(project_root)) - - from src2.server.model_registry import EngineType, ModelLoadConfig, TaskType - - # Example configuration - adjust paths and parameters as needed - model_path = Path("/mnt/Ironwolf-4TB/Models/OpenVINO/Kokoro-82M-FP16-OpenVINO") # Replace with actual model path - if not model_path.exists(): - print(f"Error: Model path {model_path} does not exist") - print("Please update the model_path in demo_entrypoint()") - return - - load_config = ModelLoadConfig( - model_path=str(model_path), - model_name="kokoro-demo", - model_type=TaskType.KOKORO, - engine=EngineType.OPENVINO, - device="CPU", # or "GPU" if available - ) - - # Create model instance - kokoro_model = OV_Kokoro(load_config) - - try: - # Load the model - print("Loading Kokoro model...") - kokoro_model.load_model(load_config) - print("Model loaded successfully") - - # Configure generation - gen_config = OV_KokoroGenConfig( - kokoro_message="Hello world! 
This is a test of Kokoro text-to-speech synthesis.", - voice=KokoroVoice.AF_SARAH, # American English female voice - lang_code=KokoroLanguage.AMERICAN_ENGLISH, - speed=1.0, - character_count_chunk=100, - response_format="wav" - ) - - # Generate speech - print("Generating speech...") - audio_chunks = [] - async for chunk in kokoro_model.chunk_forward_pass(gen_config): - print(f"Generated chunk {chunk.chunk_index + 1}/{chunk.total_chunks}: '{chunk.chunk_text}'") - audio_chunks.append(chunk.audio) - - # Concatenate all audio chunks - if audio_chunks: - full_audio = torch.cat(audio_chunks, dim=0) - print(f"Generated audio with shape: {full_audio.shape}") - - # Save to WAV file - output_path = Path("kokoro_output.wav") - sf.write(str(output_path), full_audio.numpy(), samplerate=24000) # Kokoro uses 24kHz - print(f"Audio saved to {output_path}") - - except Exception as e: - print(f"Error during demo: {e}") - import traceback - traceback.print_exc() - - finally: - # Unload the model - print("Unloading model...") - await kokoro_model.unload_model() - print("Model unloaded") - - -if __name__ == "__main__": - asyncio.run(demo_entrypoint()) - - - diff --git a/src/engine/openvino/qwen3_asr/qwen3_asr.py b/src/engine/openvino/qwen3_asr/qwen3_asr.py new file mode 100644 index 0000000..4ab4279 --- /dev/null +++ b/src/engine/openvino/qwen3_asr/qwen3_asr.py @@ -0,0 +1,507 @@ +#!/usr/bin/env python3 +""" +OpenVINO inference script for Qwen3-ASR (0.6B / 1.7B). 
+ +Runs inference from OpenVINO IR models produced by qwen3_asr_ov_convert.py: + - audio_encoder.xml/.bin + - thinker_embeddings.xml/.bin + - decoder.xml/.bin + +Usage: + pip install torch openvino numpy soundfile + python qwen3_asr_ov_infer.py [--ov-dir ov_model] [--device CPU] +""" + +import os +import json +import argparse +import asyncio +import time +import base64 +from pathlib import Path +import logging +import gc +from typing import Any, AsyncIterator, Dict, Optional, Union + +import numpy as np +import openvino as ov +import torch +from src.engine.openvino.qwen3_asr.qwen3_asr_utils import ( + MAX_ASR_INPUT_SECONDS, + merge_languages, + normalize_audios, + normalize_language_name, + parse_asr_output, + split_audio_into_chunks, + validate_language, +) + +from src.server.models.openvino import OV_Qwen3ASRGenConfig +from src.server.model_registry import ModelRegistry +from src.server.models.registration import EngineType, ModelLoadConfig, ModelType + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +SAMPLE_RATE = 16000 +NUM_MEL_BINS = 128 +HOP_LENGTH = 160 +WINDOW_SIZE = 400 + +# Token IDs + +TOKEN_IM_START = 151644 +TOKEN_IM_END = 151645 +TOKEN_AUDIO_START = 151669 +TOKEN_AUDIO_END = 151670 +TOKEN_AUDIO_PAD = 151676 +TOKEN_ENDOFTEXT = 151643 +TOKEN_ASR_TEXT = 151704 +EOS_TOKEN_IDS = {TOKEN_ENDOFTEXT, TOKEN_IM_END} +PROMPT_PREFIX = [ + TOKEN_IM_START, 8948, 198, TOKEN_IM_END, 198, + TOKEN_IM_START, 872, 198, TOKEN_AUDIO_START, +] +PROMPT_SUFFIX = [ + TOKEN_AUDIO_END, TOKEN_IM_END, 198, + TOKEN_IM_START, 77091, 198, +] + + + + + +class Qwen3ASRHelpers: + @staticmethod + def hf_config(config_path: Path) -> dict: + with open(config_path) as f: + cfg = json.load(f) + if "dec_layers" in cfg: + return cfg + + tc = cfg["thinker_config"] + ac = tc["audio_config"] + txc = tc["text_config"] + return { + "enc_n_window": ac["n_window"], + "dec_layers": txc["num_hidden_layers"], + "dec_kv_heads": txc["num_key_value_heads"], + "dec_head_dim": 
txc["head_dim"], + } + + @staticmethod + def hertz_to_mel(freq): + mels = 3.0 * freq / 200.0 + if isinstance(freq, np.ndarray): + log_region = freq >= 1000.0 + mels[log_region] = 15.0 + np.log(freq[log_region] / 1000.0) * (27.0 / np.log(6.4)) + elif freq >= 1000.0: + mels = 15.0 + np.log(freq / 1000.0) * (27.0 / np.log(6.4)) + return mels + + @staticmethod + def mel_to_hertz(mels): + freq = 200.0 * mels / 3.0 + log_region = mels >= 15.0 + freq[log_region] = 1000.0 * np.exp((np.log(6.4) / 27.0) * (mels[log_region] - 15.0)) + return freq + + @staticmethod + def compute_mel_filters(): + num_freq = 1 + WINDOW_SIZE // 2 + fft_freqs = np.linspace(0, SAMPLE_RATE // 2, num_freq) + mel_freqs = np.linspace( + Qwen3ASRHelpers.hertz_to_mel(0.0), + Qwen3ASRHelpers.hertz_to_mel(8000.0), + NUM_MEL_BINS + 2, + ) + filter_freqs = Qwen3ASRHelpers.mel_to_hertz(mel_freqs) + fdiff = np.diff(filter_freqs) + slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1) + down = -slopes[:, :-2] / fdiff[:-1] + up = slopes[:, 2:] / fdiff[1:] + fb = np.maximum(0, np.minimum(down, up)) + enorm = 2.0 / (filter_freqs[2 : NUM_MEL_BINS + 2] - filter_freqs[:NUM_MEL_BINS]) + fb *= enorm[np.newaxis, :] + return fb.astype(np.float32) + + @staticmethod + def compute_mel_spectrogram(audio_np, mel_filters_np): + audio = torch.from_numpy(audio_np).float() + mel_filters = torch.from_numpy(mel_filters_np).float() + window = torch.hann_window(WINDOW_SIZE) + stft = torch.stft(audio, WINDOW_SIZE, HOP_LENGTH, window=window, return_complex=True) + mag2 = stft[..., :-1].abs() ** 2 + mel_spec = mel_filters.T @ mag2 + log_spec = torch.clamp(mel_spec, min=1e-10).log10() + log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + return log_spec.numpy() + + @staticmethod + def count_encoder_tokens(total_frames: int, chunk_size: int = 100) -> int: + count = 0 + for start in range(0, total_frames, chunk_size): + chunk_len = min(chunk_size, total_frames - start) + t = 
chunk_len + for _ in range(3): + t = (t + 1) // 2 + count += t + return count + + @staticmethod + def bytes_to_unicode(): + bs = list(range(ord("!"), ord("~") + 1)) + \ + list(range(ord("\xa1"), ord("\xac") + 1)) + \ + list(range(ord("\xae"), ord("\xff") + 1)) + cs = bs[:] + n = 0 + for b in range(256): + if b not in bs: + bs.append(b) + cs.append(256 + n) + n += 1 + return dict(zip(bs, [chr(c) for c in cs])) + + @staticmethod + def decode_tokens(token_ids, tokenizer_dir: str) -> str: + vocab_path = os.path.join(tokenizer_dir, "vocab.json") + with open(vocab_path, "r", encoding="utf-8") as f: + vocab = json.load(f) + id_to_token = {v: k for k, v in vocab.items()} + + special_tokens = set() + tc_path = os.path.join(tokenizer_dir, "tokenizer_config.json") + if os.path.exists(tc_path): + with open(tc_path) as f: + tc = json.load(f) + for tid_str in tc.get("added_tokens_decoder", {}): + special_tokens.add(int(tid_str)) + + byte_enc = Qwen3ASRHelpers.bytes_to_unicode() + byte_dec = {v: k for k, v in byte_enc.items()} + + pieces = [] + for tid in token_ids: + if tid in special_tokens: + if tid == TOKEN_ASR_TEXT: + pieces.append("") + continue + tok = id_to_token.get(tid, "") + if tok: + pieces.append(tok) + + text = "".join(pieces) + return bytearray([byte_dec[c] for c in text if c in byte_dec]).decode("utf-8", errors="replace") + + +class OVQwen3ASR: + + def __init__(self, load_config: ModelLoadConfig): + self.load_config = load_config + self.ov_dir = Path(load_config.model_path) + self.runtime_cfg = Qwen3ASRHelpers.hf_config(self.ov_dir / "config.json") + self.chunk_size = self.runtime_cfg["enc_n_window"] * 2 + self.mel_filters = Qwen3ASRHelpers.compute_mel_filters() + self.core = ov.Core() + self.t_model_load = 0.0 + self.enc_model = None + self.emb_model = None + self.dec_model = None + self.dec_request = None + + def load_model(self, load_config: ModelLoadConfig) -> None: + self.load_config = load_config + self.ov_dir = Path(load_config.model_path) + 
logger.info(f"[{load_config.model_name}] Loading OpenVINO Qwen3 ASR models") + t_load_start = time.perf_counter() + self.enc_model = self.core.compile_model( + str(self.ov_dir / "audio_encoder_model.xml"), + load_config.device, + ) + self.emb_model = self.core.compile_model( + str(self.ov_dir / "thinker_embeddings_model.xml"), + load_config.device, + ) + self.dec_model = self.core.compile_model( + str(self.ov_dir / "decoder_model.xml"), + load_config.device, + ) + self.dec_request = self.dec_model.create_infer_request() + self.t_model_load = time.perf_counter() - t_load_start + + def _embed_tokens(self, token_ids): + ids = np.asarray(token_ids, dtype=np.int64) + if ids.ndim == 1: + ids = ids[np.newaxis, :] + out = self.emb_model([ids]) + return out[self.emb_model.output(0)] + + def collect_metrics( + self, + *, + feature_sec: float, + encoder_sec: float, + prefill_sec: float, + decode_sec: float, + detok_sec: float, + prompt_tokens: int, + generated_tokens: int, + encoder_tokens: int, + ) -> dict: + prefill_tok_s = (prompt_tokens / prefill_sec) if prefill_sec > 0 else 0.0 + decode_tok_s = (generated_tokens / decode_sec) if decode_sec > 0 else 0.0 + return { + "feature_sec": feature_sec, + "encoder_sec": encoder_sec, + "prefill_sec": prefill_sec, + "prefill_tok_s": prefill_tok_s, + "decode_sec": decode_sec, + "decode_tok_s": decode_tok_s, + "detok_sec": detok_sec, + "prompt_tokens": prompt_tokens, + "generated_tokens": generated_tokens, + "encoder_tokens": encoder_tokens, + } + + async def audio_chunks(self, chunk_audio: np.ndarray, max_tokens: int): + t_feature_start = time.perf_counter() + mel = Qwen3ASRHelpers.compute_mel_spectrogram(chunk_audio, self.mel_filters) + t_feature = time.perf_counter() - t_feature_start + total_frames = mel.shape[1] + expected_tokens = Qwen3ASRHelpers.count_encoder_tokens(total_frames, self.chunk_size) + + pad_len = (self.chunk_size - total_frames % self.chunk_size) % self.chunk_size + if pad_len > 0: + mel = np.pad(mel, ((0, 0), (0, 
pad_len))) + mel_input = mel[np.newaxis, :, :].astype(np.float32) + + t_encoder_start = time.perf_counter() + enc_out = self.enc_model([mel_input]) + t_encoder = time.perf_counter() - t_encoder_start + audio_embeds = enc_out[self.enc_model.output(0)] + audio_embeds = audio_embeds[0, :expected_tokens, :] + n_audio = audio_embeds.shape[0] + + input_ids = PROMPT_PREFIX + [TOKEN_AUDIO_PAD] * n_audio + PROMPT_SUFFIX + input_embeds = self._embed_tokens(input_ids)[0].copy() + pad_start = len(PROMPT_PREFIX) + input_embeds[pad_start:pad_start + n_audio] = audio_embeds + input_embeds = input_embeds[np.newaxis, :, :].astype(np.float32) + + prompt_len = len(input_ids) + position_ids = np.arange(prompt_len, dtype=np.int64)[np.newaxis, :] + + + t_prefill_start = time.perf_counter() + self.dec_request.reset_state() + self.dec_request.set_input_tensor(0, ov.Tensor(input_embeds)) + self.dec_request.set_input_tensor(1, ov.Tensor(position_ids)) + self.dec_request.infer() + logits = self.dec_request.get_output_tensor(0).data + t_prefill = time.perf_counter() - t_prefill_start + + token = int(np.argmax(logits[0, 0])) + generated = [token] + + t_decode_start = time.perf_counter() + for step in range(max_tokens - 1): + if token in EOS_TOKEN_IDS: + break + pos = prompt_len + step + embed = self._embed_tokens([token]).astype(np.float32) + pos_id = np.array([[pos]], dtype=np.int64) + + self.dec_request.set_input_tensor(0, ov.Tensor(embed)) + self.dec_request.set_input_tensor(1, ov.Tensor(pos_id)) + self.dec_request.infer() + logits = self.dec_request.get_output_tensor(0).data + + token = int(np.argmax(logits[0, 0])) + generated.append(token) + t_decode = time.perf_counter() - t_decode_start + + while generated and generated[-1] in EOS_TOKEN_IDS: + generated.pop() + + t_detok_start = time.perf_counter() + raw = Qwen3ASRHelpers.decode_tokens(generated, str(self.ov_dir)) + t_detok = time.perf_counter() - t_detok_start + + metrics = self.collect_metrics( + feature_sec=t_feature, + 
encoder_sec=t_encoder, + prefill_sec=t_prefill, + decode_sec=t_decode, + detok_sec=t_detok, + prompt_tokens=prompt_len, + generated_tokens=len(generated), + encoder_tokens=expected_tokens, + ) + return raw, metrics + + async def transcribe(self, gen_config: OV_Qwen3ASRGenConfig) -> AsyncIterator[Union[Dict[str, Any], str]]: + t_transcribe_start = time.perf_counter() + audio_input = gen_config.audio_base64 + if not audio_input.startswith("data:audio"): + audio_input = f"data:audio/wav;base64,{audio_input}" + audio_array = (await asyncio.to_thread(normalize_audios, audio_input))[0] + language: Optional[str] = None + if gen_config.language: + language = normalize_language_name(gen_config.language) + validate_language(language) + + audio_seconds = len(audio_array) / SAMPLE_RATE + if audio_seconds <= 0: + yield {} + yield "" + return + + max_chunk_sec = min(float(gen_config.max_chunk_sec), float(MAX_ASR_INPUT_SECONDS)) + chunk_items = await asyncio.to_thread( + split_audio_into_chunks, + wav=audio_array, + sr=SAMPLE_RATE, + max_chunk_sec=max_chunk_sec, + search_expand_sec=gen_config.search_expand_sec, + min_window_ms=gen_config.min_window_ms, + ) + logger.info(f"[{self.load_config.model_name}] Running {len(chunk_items)} chunk(s)") + + langs = [] + texts = [] + agg = { + "feature_sec": 0.0, + "encoder_sec": 0.0, + "prefill_sec": 0.0, + "decode_sec": 0.0, + "detok_sec": 0.0, + "prompt_tokens": 0, + "generated_tokens": 0, + "encoder_tokens": 0, + } + for idx, (chunk_wav, chunk_offset_sec) in enumerate(chunk_items): + chunk_sec = len(chunk_wav) / SAMPLE_RATE + logger.info( + f"[{self.load_config.model_name}] Chunk {idx + 1}/{len(chunk_items)} " + f"offset={chunk_offset_sec:.2f}s duration={chunk_sec:.2f}s" + ) + raw, chunk_metrics = await self.audio_chunks(chunk_wav, gen_config.max_tokens) + lang, text = parse_asr_output(raw, language=language) + langs.append(lang) + if text: + texts.append(text) + agg["feature_sec"] += chunk_metrics["feature_sec"] + agg["encoder_sec"] += 
chunk_metrics["encoder_sec"] + agg["prefill_sec"] += chunk_metrics["prefill_sec"] + agg["decode_sec"] += chunk_metrics["decode_sec"] + agg["detok_sec"] += chunk_metrics["detok_sec"] + agg["prompt_tokens"] += chunk_metrics["prompt_tokens"] + agg["generated_tokens"] += chunk_metrics["generated_tokens"] + agg["encoder_tokens"] += chunk_metrics["encoder_tokens"] + + text = "".join(texts).strip() + merged_language = merge_languages(langs) + + t_total = time.perf_counter() - t_transcribe_start + end_to_end_rtf = (t_total / audio_seconds) if audio_seconds > 0 else 0.0 + metrics = self.collect_metrics( + feature_sec=agg["feature_sec"], + encoder_sec=agg["encoder_sec"], + prefill_sec=agg["prefill_sec"], + decode_sec=agg["decode_sec"], + detok_sec=agg["detok_sec"], + prompt_tokens=agg["prompt_tokens"], + generated_tokens=agg["generated_tokens"], + encoder_tokens=agg["encoder_tokens"], + ) + metrics["audio_duration_sec"] = audio_seconds + metrics["model_load_sec"] = self.t_model_load + metrics["end_to_end_sec"] = t_total + metrics["rtf"] = end_to_end_rtf + if merged_language: + metrics["language"] = merged_language + + yield metrics + yield text + + async def unload_model(self, registry: ModelRegistry, model_name: str) -> bool: + removed = await registry.register_unload(model_name) + self.enc_model = None + self.emb_model = None + self.dec_model = None + self.dec_request = None + gc.collect() + logger.info(f"[{self.load_config.model_name}] unloaded and memory cleaned up") + return removed + + +def main(): + parser = argparse.ArgumentParser(description="Qwen3-ASR OpenVINO inference") + parser.add_argument("wav_path") + parser.add_argument("--ov-dir", default="ov_model") + parser.add_argument("--device", default="CPU") + parser.add_argument( + "--language", + default=None, + help="Optional forced language (e.g. 
English, Chinese)", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=1024, + help="Maximum generated tokens per chunk", + ) + parser.add_argument( + "--max-chunk-sec", + type=float, + default=30.0, + help="Target audio chunk size in seconds for energy-based splitting", + ) + parser.add_argument( + "--search-expand-sec", + type=float, + default=5.0, + help="Boundary search window (seconds) around each tentative chunk cut", + ) + parser.add_argument( + "--min-window-ms", + type=float, + default=100.0, + help="Sliding energy window in ms for low-energy boundary detection", + ) + args = parser.parse_args() + + load_config = ModelLoadConfig( + model_path=args.ov_dir, + model_name="qwen3-asr-cli", + model_type=ModelType.QWEN3_ASR, + engine=EngineType.OPENVINO, + device=args.device, + runtime_config={}, + ) + config = OV_Qwen3ASRGenConfig( + audio_base64=base64.b64encode(Path(args.wav_path).read_bytes()).decode("utf-8"), + language=args.language, + max_tokens=args.max_tokens, + max_chunk_sec=args.max_chunk_sec, + search_expand_sec=args.search_expand_sec, + min_window_ms=args.min_window_ms, + ) + model = OVQwen3ASR(load_config) + model.load_model(load_config) + + async def _run() -> str: + final_text = "" + async for item in model.transcribe(config): + if isinstance(item, str): + final_text = item + return final_text + + text = asyncio.run(_run()) + print(text) + + +if __name__ == "__main__": + main() diff --git a/src/engine/openvino/qwen3_asr/qwen3_asr_utils.py b/src/engine/openvino/qwen3_asr/qwen3_asr_utils.py new file mode 100644 index 0000000..f1a4723 --- /dev/null +++ b/src/engine/openvino/qwen3_asr/qwen3_asr_utils.py @@ -0,0 +1,497 @@ +# coding=utf-8 +# Copyright 2026 The Alibaba Qwen team. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import base64 +import io +import urllib.request +from dataclasses import dataclass +from typing import Any, Iterable, List, Optional, Tuple, Union +from urllib.parse import urlparse + +import librosa +import numpy as np +import soundfile as sf + +AudioLike = Union[ + str, # wav path / URL / base64 + Tuple[np.ndarray, int], # (waveform, sr) +] +MaybeList = Union[Any, List[Any]] + +SAMPLE_RATE = 16000 +MAX_ASR_INPUT_SECONDS = 1200 +MAX_FORCE_ALIGN_INPUT_SECONDS = 180 +MIN_ASR_INPUT_SECONDS = 0.5 +SUPPORTED_LANGUAGES: List[str] = [ + "Chinese", + "English", + "Cantonese", + "Arabic", + "German", + "French", + "Spanish", + "Portuguese", + "Indonesian", + "Italian", + "Korean", + "Russian", + "Thai", + "Vietnamese", + "Japanese", + "Turkish", + "Hindi", + "Malay", + "Dutch", + "Swedish", + "Danish", + "Finnish", + "Polish", + "Czech", + "Filipino", + "Persian", + "Greek", + "Romanian", + "Hungarian", + "Macedonian" +] +_ASR_TEXT_TAG = "" +_LANG_PREFIX = "language " + + +def normalize_language_name(language: str) -> str: + """ + Normalize language name to the canonical format used by Qwen3-ASR: + first letter uppercase, the rest lowercase (e.g., 'cHINese' -> 'Chinese'). + + Args: + language (str): Input language name. + + Returns: + str: Normalized language name. + + Raises: + ValueError: If language is empty. 
+ """ + if language is None: + raise ValueError("language is None") + s = str(language).strip() + if not s: + raise ValueError("language is empty") + return s[:1].upper() + s[1:].lower() + + +def validate_language(language: str) -> None: + """ + Validate the language is supported. + + Args: + language (str): Canonical language name. + + Raises: + ValueError: If unsupported. + """ + if language not in SUPPORTED_LANGUAGES: + raise ValueError(f"Unsupported language: {language}. Supported: {SUPPORTED_LANGUAGES}") + + +def ensure_list(x: MaybeList) -> List[Any]: + return x if isinstance(x, list) else [x] + + +def is_url(s: str) -> bool: + try: + u = urlparse(s) + return u.scheme in ("http", "https") and bool(u.netloc) + except Exception: + return False + + +def is_probably_base64(s: str) -> bool: + if s.startswith("data:audio"): + return True + if ("/" not in s and "\\" not in s) and len(s) > 256: + return True + return False + + +def decode_base64_bytes(b64: str) -> bytes: + if "," in b64 and b64.strip().startswith("data:"): + b64 = b64.split(",", 1)[1] + return base64.b64decode(b64) + + +def load_audio_any(x: str) -> Tuple[np.ndarray, int]: + if is_url(x): + with urllib.request.urlopen(x) as resp: + audio_bytes = resp.read() + with io.BytesIO(audio_bytes) as f: + audio, sr = sf.read(f, dtype="float32", always_2d=False) + elif is_probably_base64(x): + audio_bytes = decode_base64_bytes(x) + with io.BytesIO(audio_bytes) as f: + audio, sr = sf.read(f, dtype="float32", always_2d=False) + else: + audio, sr = librosa.load(x, sr=None, mono=False) + + audio = np.asarray(audio, dtype=np.float32) + sr = int(sr) + return audio, sr + + +def to_mono(audio: np.ndarray) -> np.ndarray: + if audio.ndim == 1: + return audio + # soundfile can return shape (T, C); some pipelines use (C, T) + if audio.ndim == 2: + if audio.shape[0] <= 8 and audio.shape[1] > audio.shape[0]: + audio = audio.T + return np.mean(audio, axis=-1).astype(np.float32) + raise ValueError(f"Unsupported audio 
ndim={audio.ndim}") + + +def float_range_normalize(audio: np.ndarray) -> np.ndarray: + audio = audio.astype(np.float32) + if audio.size == 0: + return audio + peak = float(np.max(np.abs(audio))) + if peak == 0.0: + return audio + # If decoded audio is int-like scaled or out-of-range, normalize conservatively. + if peak > 1.0: + audio = audio / peak + audio = np.clip(audio, -1.0, 1.0) + return audio + + +def normalize_audio_input(a: AudioLike) -> np.ndarray: + """ + Normalize one audio input to mono 16k float32 waveform in [-1, 1]. + + Supported inputs: + - str: local file path / https URL / base64 audio string + - (np.ndarray, sr): waveform and sampling rate + + Returns: + np.ndarray: + Mono 16k float32 waveform in [-1, 1]. + """ + if isinstance(a, str): + audio, sr = load_audio_any(a) + elif isinstance(a, tuple) and len(a) == 2 and isinstance(a[0], np.ndarray): + audio, sr = a[0], int(a[1]) + else: + raise TypeError(f"Unsupported audio input type: {type(a)}") + + audio = to_mono(np.asarray(audio)) + if sr != SAMPLE_RATE: + audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE).astype(np.float32) + audio = float_range_normalize(audio) + return audio + + +def normalize_audios(audios: Union[AudioLike, List[AudioLike]]) -> List[np.ndarray]: + items = ensure_list(audios) + return [normalize_audio_input(a) for a in items] + + +def chunk_list(xs: List[Any], chunk_size: int) -> Iterable[List[Any]]: + """ + Yield chunks of a list. + + Args: + xs (List[Any]): Input list. + chunk_size (int): Chunk size. + + Yields: + List[Any]: Slices of xs. + """ + if chunk_size <= 0: + yield xs + return + for i in range(0, len(xs), chunk_size): + yield xs[i : i + chunk_size] + + +@dataclass(frozen=True) +class AudioChunk: + """ + One chunk cut from an original audio. + + Attributes: + orig_index: Index of the original sample in the input batch. + chunk_index: Index of this chunk within the original sample. + wav: Mono float32 waveform. + sr: Sampling rate. 
+ offset_sec: Start offset of this chunk in the original audio, in seconds. + """ + orig_index: int + chunk_index: int + wav: np.ndarray + sr: int + offset_sec: float + + +def split_audio_into_chunks( + wav: np.ndarray, + sr: int, + max_chunk_sec: float, + search_expand_sec: float = 5.0, + min_window_ms: float = 100.0, +) -> List[Tuple[np.ndarray, float]]: + """ + Split a long audio into chunks close to max_chunk_sec, using a low-energy boundary. + + This implementation guarantees: + - Concatenating all returned chunks reproduces the original audio exactly + (total number of samples is identical, no overlaps, no gaps). + + Args: + wav: Mono waveform float32. + sr: Sampling rate. + max_chunk_sec: Target max chunk duration in seconds. + search_expand_sec: Boundary search half-window in seconds. + min_window_ms: Sliding window in milliseconds for energy estimation. + + Returns: + List[Tuple[np.ndarray, float]]: List of (chunk_wav, offset_sec). + """ + wav = np.asarray(wav, dtype=np.float32) + if wav.ndim > 1: + wav = np.mean(wav, axis=-1).astype(np.float32) + + total_len = int(wav.shape[0]) + total_sec = total_len / float(sr) + if total_sec <= max_chunk_sec: + return [(wav, 0.0)] + + max_len = int(max_chunk_sec * sr) + expand = int(search_expand_sec * sr) + win = max(4, int((min_window_ms / 1000.0) * sr)) + + chunks: List[Tuple[np.ndarray, float]] = [] + + start = 0 + offset_sec = 0.0 + + while (total_len - start) > max_len: + cut = start + max_len + + left = max(start, cut - expand) + right = min(total_len, cut + expand) + + if right - left <= win: + boundary = cut + else: + seg = wav[left:right] + seg_abs = np.abs(seg) + + window_sums = np.convolve(seg_abs, np.ones(win, dtype=np.float32), mode="valid") + + min_pos = int(np.argmin(window_sums)) + + wstart = min_pos + wend = min_pos + win + local = seg_abs[wstart:wend] + inner = int(np.argmin(local)) + boundary = left + wstart + inner + + boundary = int(max(boundary, start + 1)) + boundary = int(min(boundary, 
total_len)) + + chunk = wav[start:boundary] + chunks.append((chunk, offset_sec)) + + offset_sec += (boundary - start) / float(sr) + start = boundary + + tail = wav[start:total_len] + chunks.append((tail, offset_sec)) + + # Pad too-short chunks to at least MIN_ASR_INPUT_SECONDS (zero-padding at tail) + min_len = int(MIN_ASR_INPUT_SECONDS * sr) + padded: List[Tuple[np.ndarray, float]] = [] + for c, off in chunks: + if c.shape[0] < min_len: + pad = min_len - int(c.shape[0]) + c = np.pad(c, (0, pad), mode="constant", constant_values=0.0).astype(np.float32) + padded.append((c, off)) + chunks = padded + + return chunks + + +def detect_and_fix_repetitions(text, threshold=20): + def fix_char_repeats(s, thresh): + res = [] + i = 0 + n = len(s) + while i < n: + count = 1 + while i + count < n and s[i + count] == s[i]: + count += 1 + + if count > thresh: + res.append(s[i]) + i += count + else: + res.append(s[i:i+count]) + i += count + return ''.join(res) + + def fix_pattern_repeats(s, thresh, max_len=20): + n = len(s) + min_repeat_chars = thresh * 2 + if n < min_repeat_chars: + return s + + i = 0 + result = [] + while i <= n - min_repeat_chars: + found = False + for k in range(1, max_len + 1): + if i + k * thresh > n: + break + + pattern = s[i:i+k] + valid = True + for rep in range(1, thresh): + start_idx = i + rep * k + if s[start_idx:start_idx+k] != pattern: + valid = False + break + + if valid: + total_rep = thresh + end_index = i + thresh * k + while end_index + k <= n and s[end_index:end_index+k] == pattern: + total_rep += 1 + end_index += k + result.append(pattern) + result.append(fix_pattern_repeats(s[end_index:], thresh, max_len)) + i = n + found = True + break + + if found: + break + else: + result.append(s[i]) + i += 1 + + if not found: + result.append(s[i:]) + return ''.join(result) + + text_raw = text + text = fix_char_repeats(text_raw, threshold) + text = fix_pattern_repeats(text, threshold) + return text + + +def parse_asr_output( + raw: str, + language: 
Optional[str] = None, +) -> Tuple[str, str]: + """ + Parse Qwen3-ASR raw output into (language, text). + + Cases: + - With tag: "language Chinese...." + - With newlines: "language Chinese\\n...\\n...." + - No tag: treat whole string as text. + - "language None": treat as empty audio -> ("", "") + + If language is provided, output language is forced and raw is treated as text-only + (the model is expected to output plain transcription without metadata). + + Args: + raw: Raw decoded string. + language: Canonical language name if user forced language. + + Returns: + Tuple[str, str]: (language, text) + """ + if raw is None: + return "", "" + s = str(raw).strip() + if not s: + return "", "" + + s = detect_and_fix_repetitions(s) + + if language: + # user explicitly forced language => model output is treated as pure text + return language, s + + meta_part = s + text_part = "" + has_tag = _ASR_TEXT_TAG in s + if has_tag: + meta_part, text_part = s.split(_ASR_TEXT_TAG, 1) + else: + # no tag => pure text + return "", s.strip() + + meta_lower = meta_part.lower() + + # empty audio heuristic + if "language none" in meta_lower: + t = text_part.strip() + if not t: + return "", "" + # if model still returned something, keep it but language unknown + return "", t + + # extract "language xxx" from meta + lang = "" + for line in meta_part.splitlines(): + line = line.strip() + if not line: + continue + low = line.lower() + if low.startswith(_LANG_PREFIX): + val = line[len(_LANG_PREFIX):].strip() + if val: + lang = normalize_language_name(val) + break + + return lang, text_part.strip() + + +def merge_languages(langs: List[str]) -> str: + """ + Merge per-chunk languages into a compact comma-separated string, + keeping order and removing consecutive duplicates and empty entries. + + Example: + ["Chinese", "English", "English"] -> "Chinese,English" + + Args: + langs: List of canonical language names. + + Returns: + str: Merged language string. 
+ """ + out: List[str] = [] + prev = None + for x in langs: + x = (x or "").strip() + if not x: + continue + if x == prev: + continue + out.append(x) + prev = x + return ",".join(out) diff --git a/src/engine/openvino/qwen3_tts/qwen3_tts.py b/src/engine/openvino/qwen3_tts/qwen3_tts.py new file mode 100644 index 0000000..fabcdd0 --- /dev/null +++ b/src/engine/openvino/qwen3_tts/qwen3_tts.py @@ -0,0 +1,1108 @@ +from __future__ import annotations + +import asyncio +import base64 +import gc +import logging +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Iterator + +import librosa +import numpy as np +import openvino as ov +import soundfile as sf +from transformers import AutoTokenizer + +from src.engine.openvino.qwen3_tts.qwen3_tts_helpers import ( + CODEC_BOS_ID, + CODEC_EOS_ID, + CODEC_NOTHINK_ID, + CODEC_PAD_ID, + CODEC_THINK_BOS_ID, + CODEC_THINK_EOS_ID, + CODEC_THINK_ID, + CP_HEAD_DIM, + CP_MAX_POS, + CP_ROPE_THETA, + ENC_INPUT_SR, + HEAD_DIM, + LANGUAGES, + NUM_CODE_GROUPS, + SPEAKERS, + SPEECH_DECODER_SR, + SUPPRESS_MASK, + TALKER_MAX_POS, + TALKER_ROPE_THETA, + TTS_BOS_TOKEN_ID, + TTS_EOS_TOKEN_ID, + TTS_PAD_TOKEN_ID, + H, + Language, + Speaker, + _INSTRUCT_TMPL, + _REF_TEXT_TMPL, + _SYNTH_TMPL, +) +from src.server.model_registry import ModelRegistry +from src.server.models.openvino import OV_Qwen3TTSGenConfig +from src.server.models.registration import EngineType, ModelLoadConfig, ModelType + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# ICL speech decoder: last N reference frames as left context (matches upstream chunked_decode). 
+ICL_DECODER_LEFT_CONTEXT_FRAMES = 25 + + +@dataclass +class TTSStreamChunk: + """One decoded PCM segment from streaming synthesis (float32 mono, SPEECH_DECODER_SR Hz).""" + + audio: np.ndarray + chunk_index: int + is_final: bool + + +def _perf_add(perf: dict | None, key: str, dt: float) -> None: + if perf is not None: + perf[key] = perf.get(key, 0.0) + dt + + +class OVQwen3TTS: + """Single engine serving all three Qwen3-TTS modes. + + The mode is determined by load_config.model_type: + ModelType.QWEN3_TTS_CUSTOM_VOICE — predefined speaker + optional instruct + ModelType.QWEN3_TTS_VOICE_DESIGN — free-form voice description + ModelType.QWEN3_TTS_VOICE_CLONE — reference audio + optional ICL transcript + """ + + def __init__(self, load_config: ModelLoadConfig): + self.load_config = load_config + self._text_model_c = None + self._codec_emb_c = None + self._cp_codec_emb_c = None + self._decoder_c = None + self._decoder_input_name = None + self._talker_req = None + self._cp_req = None + self._speaker_enc_c = None + self._speech_enc_c = None + self.tokenizer = None + self._mrope_cos = None + self._mrope_sin = None + self._cp_cos = None + self._cp_sin = None + self._loaded = False + + # ---- Lifecycle ---------------------------------------------------------- + + def load_model(self, load_config: ModelLoadConfig) -> None: + """Load and compile OV models. + + Core models (text_model, codec_embedding, talker, code_predictor, + speech_decoder) are loaded for every model type. When *device* is GPU, + talker (and text/codec stacks) use GPU; code_predictor, cp_codec_embedding, + and speech_decoder use CPU. Voice-clone models (speaker_encoder, speech_encoder) + follow *device*. model_type == ModelType.QWEN3_TTS_VOICE_CLONE loads encoders. 
+ """ + self.load_config = load_config + p = Path(load_config.model_path) + device = load_config.device + core = ov.Core() + core.set_property({"CACHE_DIR": str(p / ".ov_cache")}) + + self.tokenizer = AutoTokenizer.from_pretrained(str(p), trust_remote_code=True) + + self._mrope_cos, self._mrope_sin = H.precompute_mrope( + TALKER_MAX_POS, HEAD_DIM, TALKER_ROPE_THETA, + ) + self._cp_cos, self._cp_sin = H.precompute_standard_rope( + CP_MAX_POS, CP_HEAD_DIM, CP_ROPE_THETA, + ) + + self._text_model_c = core.compile_model(str(p / "text_model.xml"), device) + self._codec_emb_c = core.compile_model(str(p / "codec_embedding.xml"), device) + # Code predictor: many tiny inferences per frame; CPU avoids GPU launch/transfer overhead. + self._cp_codec_emb_c = core.compile_model(str(p / "cp_codec_embedding.xml"), "CPU") + # Speech decoder: single-shot vocoding; CPU fits typical sequence lengths without GPU overhead. + self._decoder_c = core.compile_model( + str(p / "speech_tokenizer" / "speech_decoder.xml"), "CPU", + ) + self._decoder_input_name = self._decoder_c.input(0).get_any_name() + + talker_c = core.compile_model(str(p / "talker.xml"), device) + self._talker_req = talker_c.create_infer_request() + cp_c = core.compile_model(str(p / "code_predictor.xml"), "CPU") + self._cp_req = cp_c.create_infer_request() + if "GPU" in device: + logger.info( + f"[{load_config.model_name}] talker on {device}; " + f"code_predictor, cp_codec_embedding, speech_decoder on CPU", + ) + + self._speaker_enc_c = None + self._speech_enc_c = None + if load_config.model_type == ModelType.QWEN3_TTS_VOICE_CLONE: + self._speaker_enc_c = core.compile_model( + str(p / "speaker_encoder.xml"), device, + ) + self._speech_enc_c = core.compile_model( + str(p / "speech_tokenizer" / "speech_encoder.xml"), device, + ) + + self._loaded = True + logger.info( + f"[{load_config.model_name}] loaded from {p} device={device} " + f"model_type={load_config.model_type.value}" + ) + + async def unload_model(self, registry: 
ModelRegistry, model_name: str) -> bool: + removed = await registry.register_unload(model_name) + self._text_model_c = None + self._codec_emb_c = None + self._cp_codec_emb_c = None + self._decoder_c = None + self._decoder_input_name = None + self._talker_req = None + self._cp_req = None + self._speaker_enc_c = None + self._speech_enc_c = None + self.tokenizer = None + self._mrope_cos = None + self._mrope_sin = None + self._cp_cos = None + self._cp_sin = None + self._loaded = False + gc.collect() + logger.info(f"[{model_name}] unloaded and memory cleaned up") + return removed + + @property + def loaded(self) -> bool: + return self._loaded + + # ---- Public API --------------------------------------------------------- + + async def generate(self, gen_config: OV_Qwen3TTSGenConfig) -> tuple[np.ndarray, int]: + """Synthesise speech from *gen_config*. Returns (wav: float32, sample_rate: int).""" + return await asyncio.to_thread(self._generate_sync, gen_config) + + def generate_stream(self, gen_config: OV_Qwen3TTSGenConfig) -> Iterator[TTSStreamChunk]: + """Synchronous generator of float32 mono PCM chunks at SPEECH_DECODER_SR (drip-fed text).""" + if not self._loaded: + raise RuntimeError("Call load_model() before generate_stream()") + gc = gen_config.model_copy(update={"non_streaming_mode": False}) + if self.load_config.model_type == ModelType.QWEN3_TTS_VOICE_CLONE: + yield from self._generate_voice_clone_stream(gc) + else: + yield from self._generate_standard_stream(gc) + + def _generate_sync(self, gen_config: OV_Qwen3TTSGenConfig) -> tuple[np.ndarray, int]: + if not self._loaded: + raise RuntimeError("Call load_model() before generate()") + if self.load_config.model_type == ModelType.QWEN3_TTS_VOICE_CLONE: + return self._generate_voice_clone(gen_config) + return self._generate_standard(gen_config) + + # ---- Internal: standard generation (custom_voice / voice_design) -------- + + def _generate_standard(self, gen_config: OV_Qwen3TTSGenConfig) -> tuple[np.ndarray, int]: 
+ t_total = time.perf_counter() + perf: dict = {} + speaker = Speaker(gen_config.speaker) if gen_config.speaker else None + language = Language(gen_config.language) if gen_config.language else None + + if self.load_config.model_type == ModelType.QWEN3_TTS_CUSTOM_VOICE: + build_kw = dict( + text=gen_config.input, + speaker=speaker, + language=language, + instruct=gen_config.instruct, + ) + else: # VOICE_DESIGN + build_kw = dict( + text=gen_config.input, + speaker=None, + language=language, + instruct=gen_config.voice_description, + ) + + t0 = time.perf_counter() + inp = self._build_inputs(**build_kw, non_streaming_mode=gen_config.non_streaming_mode, perf=perf) + logger.debug(f"[perf] build_inputs: {time.perf_counter() - t0:.3f}s") + + codes = self._run_loop(inp, gen_config, perf) + + if not codes: + self._log_pipeline_summary(perf, t_total, wav_seconds=0.0, voice_clone=False) + return np.zeros(0, dtype=np.float32), SPEECH_DECODER_SR + + wav = self._decode_codes(codes, perf) + + self._log_summary(codes, wav, t_total) + self._log_pipeline_summary(perf, t_total, wav_seconds=float(len(wav) / SPEECH_DECODER_SR), voice_clone=False) + return wav, SPEECH_DECODER_SR + + # ---- Internal: voice clone generation ----------------------------------- + + def _generate_voice_clone(self, gen_config: OV_Qwen3TTSGenConfig) -> tuple[np.ndarray, int]: + t_total = time.perf_counter() + perf: dict = {} + language = Language(gen_config.language) if gen_config.language else None + + t0 = time.perf_counter() + audio, audio_sr = H.decode_audio_b64(gen_config.ref_audio_b64) + dt_dec = time.perf_counter() - t0 + _perf_add(perf, "audio_decode", dt_dec) + logger.debug(f"[perf] audio_decode: {dt_dec:.3f}s") + logger.debug(f"[info] ref audio: {len(audio) / audio_sr:.3f}s @ {audio_sr} Hz") + + t0 = time.perf_counter() + speaker_embed = self._extract_speaker_embedding(audio, audio_sr, perf) + logger.debug(f"[perf] speaker encoder: {time.perf_counter() - t0:.3f}s") + + use_icl = gen_config.ref_text is 
not None and not gen_config.x_vector_only + ref_codes = None + if use_icl: + t0 = time.perf_counter() + ref_codes = self._encode_audio(audio, audio_sr, perf) + logger.debug(f"[perf] speech encoder (OV): {time.perf_counter() - t0:.3f}s") + logger.debug(f"[info] ref_codes shape: {ref_codes.shape}") + + t0 = time.perf_counter() + inp = self._build_inputs( + text=gen_config.input, + speaker_embed=speaker_embed, + language=language, + instruct=gen_config.instruct, + non_streaming_mode=gen_config.non_streaming_mode, + ref_text=gen_config.ref_text if use_icl else None, + ref_codes=ref_codes, + perf=perf, + ) + logger.debug(f"[perf] build_inputs: {time.perf_counter() - t0:.3f}s") + + codes = self._run_loop(inp, gen_config, perf) + + if not codes: + self._log_pipeline_summary(perf, t_total, wav_seconds=0.0, voice_clone=True) + return np.zeros(0, dtype=np.float32), SPEECH_DECODER_SR + + if use_icl and ref_codes is not None: + wav = self._decode_icl(codes, ref_codes, perf) + else: + wav = self._decode_codes(codes, perf) + + self._log_summary(codes, wav, t_total) + self._log_pipeline_summary(perf, t_total, wav_seconds=float(len(wav) / SPEECH_DECODER_SR), voice_clone=True) + return wav, SPEECH_DECODER_SR + + def _generate_standard_stream(self, gen_config: OV_Qwen3TTSGenConfig) -> Iterator[TTSStreamChunk]: + t_total = time.perf_counter() + perf: dict = {} + speaker = Speaker(gen_config.speaker) if gen_config.speaker else None + language = Language(gen_config.language) if gen_config.language else None + + if self.load_config.model_type == ModelType.QWEN3_TTS_CUSTOM_VOICE: + build_kw = dict( + text=gen_config.input, + speaker=speaker, + language=language, + instruct=gen_config.instruct, + ) + else: + build_kw = dict( + text=gen_config.input, + speaker=None, + language=language, + instruct=gen_config.voice_description, + ) + + t0 = time.perf_counter() + inp = self._build_inputs(**build_kw, non_streaming_mode=gen_config.non_streaming_mode, perf=perf) + logger.debug(f"[perf] 
build_inputs: {time.perf_counter() - t0:.3f}s") + + n_samples = 0 + n_chunks = 0 + for chunk in self._run_loop_streaming(inp, gen_config, perf): + n_samples += len(chunk.audio) + n_chunks += 1 + yield chunk + + wav_sec = n_samples / SPEECH_DECODER_SR + self._log_pipeline_summary(perf, t_total, wav_seconds=wav_sec, voice_clone=False) + if n_samples > 0: + logger.info(f"[info] streaming: {n_chunks} chunks -> {n_samples} samples ({wav_sec:.2f}s audio)") + + def _generate_voice_clone_stream(self, gen_config: OV_Qwen3TTSGenConfig) -> Iterator[TTSStreamChunk]: + t_total = time.perf_counter() + perf: dict = {} + language = Language(gen_config.language) if gen_config.language else None + + t0 = time.perf_counter() + audio, audio_sr = H.decode_audio_b64(gen_config.ref_audio_b64) + dt_dec = time.perf_counter() - t0 + _perf_add(perf, "audio_decode", dt_dec) + logger.debug(f"[perf] audio_decode: {dt_dec:.3f}s") + + t0 = time.perf_counter() + speaker_embed = self._extract_speaker_embedding(audio, audio_sr, perf) + logger.debug(f"[perf] speaker encoder: {time.perf_counter() - t0:.3f}s") + + use_icl = gen_config.ref_text is not None and not gen_config.x_vector_only + ref_codes = None + if use_icl: + t0 = time.perf_counter() + ref_codes = self._encode_audio(audio, audio_sr, perf) + logger.debug(f"[perf] speech encoder (OV): {time.perf_counter() - t0:.3f}s") + + t0 = time.perf_counter() + inp = self._build_inputs( + text=gen_config.input, + speaker_embed=speaker_embed, + language=language, + instruct=gen_config.instruct, + non_streaming_mode=gen_config.non_streaming_mode, + ref_text=gen_config.ref_text if use_icl else None, + ref_codes=ref_codes, + perf=perf, + ) + logger.debug(f"[perf] build_inputs: {time.perf_counter() - t0:.3f}s") + + n_samples = 0 + n_chunks = 0 + for chunk in self._run_loop_streaming(inp, gen_config, perf): + n_samples += len(chunk.audio) + n_chunks += 1 + yield chunk + + wav_sec = n_samples / SPEECH_DECODER_SR + self._log_pipeline_summary(perf, t_total, 
wav_seconds=wav_sec, voice_clone=True) + if n_samples > 0: + logger.info(f"[info] streaming (voice_clone): {n_chunks} chunks -> {n_samples} samples ({wav_sec:.2f}s audio)") + + def _decode_icl( + self, + gen_codes: list[list[int]], + ref_codes: np.ndarray, + perf: dict[str, float] | None = None, + ) -> np.ndarray: + """Decode with a short ref prefix (left context), then trim the context from output.""" + ref_2d = ref_codes[0] # (T_ref, n_q) + gen_2d = np.asarray(gen_codes, dtype=np.int64) + context_size = min(ICL_DECODER_LEFT_CONTEXT_FRAMES, ref_2d.shape[0]) + context = ref_2d[-context_size:] + combined = np.concatenate([context, gen_2d], axis=0) + decoder_in = combined.T[np.newaxis] # (1, n_q, T) + _b, n_q, t_frames = decoder_in.shape + if perf is not None: + perf["_decoder_in_shape"] = (_b, n_q, t_frames) + logger.debug( + f"[info] speech_decoder icl: ref_frames={ref_2d.shape[0]} " + f"context_frames={context_size} gen_frames={gen_2d.shape[0]} " + f"combined={combined.shape[0]}", + ) + logger.debug(f"[info] speech decoder input shape: ({_b}, {n_q}, {t_frames})") + t0 = time.perf_counter() + result = H.ov_call(self._decoder_c, {self._decoder_input_name: decoder_in}) + dt = time.perf_counter() - t0 + _perf_add(perf, "speech_decoder", dt) + logger.debug(f"[perf] speech decoder (OV): {dt:.3f}s") + full_wav = np.clip(result["waveform"].squeeze(), -1.0, 1.0).astype(np.float32) + cut = int(context_size / combined.shape[0] * len(full_wav)) + return full_wav[cut:] + + # ---- OV model wrappers -------------------------------------------------- + + def _text_model(self, ids: np.ndarray) -> np.ndarray: + return H.ov_call(self._text_model_c, {"token_ids": ids})["projected"] + + def _codec_embed(self, ids: np.ndarray) -> np.ndarray: + return H.ov_call(self._codec_emb_c, {"token_ids": ids})["embeddings"] + + def _cp_codec_embed(self, ids: np.ndarray, step_idx: int) -> np.ndarray: + return H.ov_call(self._cp_codec_emb_c, { + "token_ids": ids, + "step_idx": np.array(step_idx, 
dtype=np.int64), + })["embeddings"] + + def _talker_infer(self, embeds, cos, sin): + r = H.ov_stateful_infer(self._talker_req, { + "inputs_embeds": embeds, "cos": cos, "sin": sin, + "beam_idx": np.array([0], dtype=np.int32), + }) + return r["logits"], r["hidden"] + + def _cp_infer(self, embeds, cos, sin, gen_steps: int): + r = H.ov_stateful_infer(self._cp_req, { + "inputs_embeds": embeds, "cos": cos, "sin": sin, + "generation_steps": np.array(gen_steps, dtype=np.int64), + "beam_idx": np.array([0], dtype=np.int32), + }) + return r["logits"], r["hidden"] + + def _decode_codes(self, codes: list[list[int]], perf: dict[str, float] | None = None) -> np.ndarray: + arr = np.asarray(codes, dtype=np.int64) + decoder_in = arr.T[np.newaxis] + _b, n_q, t_frames = decoder_in.shape + if perf is not None: + perf["_decoder_in_shape"] = (_b, n_q, t_frames) + logger.debug(f"[info] speech decoder input shape: ({_b}, {n_q}, {t_frames})") + t0 = time.perf_counter() + r = H.ov_call(self._decoder_c, {self._decoder_input_name: decoder_in}) + dt = time.perf_counter() - t0 + _perf_add(perf, "speech_decoder", dt) + logger.debug(f"[perf] speech decoder (OV): {dt:.3f}s") + return np.clip(r["waveform"].squeeze(), -1.0, 1.0).astype(np.float32) + + def _chunked_decode( + self, + chunk_codes: list[list[int]], + prev_tail: list[list[int]] | None, + left_ctx: int, + perf: dict[str, float] | None = None, + ) -> np.ndarray: + """Decode codec frames with optional left context from the previous chunk.""" + if not chunk_codes: + return np.zeros(0, dtype=np.float32) + if prev_tail is None or left_ctx <= 0: + return self._decode_codes(chunk_codes, perf) + prev_arr = np.asarray(prev_tail, dtype=np.int64) + if prev_arr.size == 0: + return self._decode_codes(chunk_codes, perf) + cur = np.asarray(chunk_codes, dtype=np.int64) + ctx_n = min(left_ctx, prev_arr.shape[0]) + context = prev_arr[-ctx_n:] + combined = np.concatenate([context, cur], axis=0) + decoder_in = combined.T[np.newaxis] + _b, n_q, t_frames = 
decoder_in.shape + if perf is not None: + perf["_decoder_in_shape"] = (_b, n_q, t_frames) + logger.debug(f"[info] speech decoder (chunked) shape: ({_b}, {n_q}, {t_frames})") + t0 = time.perf_counter() + r = H.ov_call(self._decoder_c, {self._decoder_input_name: decoder_in}) + dt = time.perf_counter() - t0 + _perf_add(perf, "speech_decoder", dt) + logger.debug(f"[perf] speech decoder chunk (OV): {dt:.3f}s") + full_wav = np.clip(r["waveform"].squeeze(), -1.0, 1.0).astype(np.float32) + cut = int(ctx_n / combined.shape[0] * len(full_wav)) + return full_wav[cut:] + + # ---- Voice-clone specific OV calls -------------------------------------- + + def _extract_speaker_embedding( + self, audio: np.ndarray, sr: int, perf: dict[str, float] | None = None, + ) -> np.ndarray: + t0 = time.perf_counter() + mels = H.mel_spectrogram(audio, sr) # (n_mels, T) + t_mel = time.perf_counter() - t0 + _perf_add(perf, "speaker_mel", t_mel) + logger.debug(f"[perf] speaker mel_spectrogram: {t_mel:.3f}s") + mels_in = mels.T[np.newaxis].astype(np.float32) # (1, T, n_mels) + t0 = time.perf_counter() + r = H.ov_call(self._speaker_enc_c, {"mels": mels_in}) + t_ov = time.perf_counter() - t0 + _perf_add(perf, "speaker_ov", t_ov) + logger.debug(f"[perf] speaker encoder ov: {t_ov:.3f}s") + return r["embedding"][:, np.newaxis, :] # (1, 1, D) + + def _encode_audio(self, audio: np.ndarray, sr: int, perf: dict[str, float] | None = None) -> np.ndarray: + audio = audio.astype(np.float32) + t_rs = 0.0 + if sr != ENC_INPUT_SR: + t0 = time.perf_counter() + audio = librosa.resample(audio, orig_sr=sr, target_sr=ENC_INPUT_SR) + t_rs = time.perf_counter() - t0 + _perf_add(perf, "speech_resample", t_rs) + logger.debug(f"[perf] speech encoder resample: {t_rs:.3f}s ({sr} -> {ENC_INPUT_SR} Hz)") + t0 = time.perf_counter() + r = H.ov_call(self._speech_enc_c, {"audio": audio[np.newaxis]}) + t_ov = time.perf_counter() - t0 + _perf_add(perf, "speech_ov", t_ov) + logger.debug(f"[perf] speech encoder ov: {t_ov:.3f}s") + 
return r["codes"] # (1, T_ref, n_q) + + # ---- Prefill assembly --------------------------------------------------- + + def _get_special_embeds(self, perf: dict[str, float] | None = None): + ids = np.array([[TTS_BOS_TOKEN_ID, TTS_EOS_TOKEN_ID, TTS_PAD_TOKEN_ID]], dtype=np.int64) + t0 = time.perf_counter() + e = self._text_model(ids) + _perf_add(perf, "build_text_model", time.perf_counter() - t0) + return e[:, 0:1, :], e[:, 1:2, :], e[:, 2:3, :] + + def _resolve_language_id(self, language: Language | None, speaker: Speaker | None) -> int | None: + lang_id = LANGUAGES[language].codec_id if language is not None else None + if language in (Language.CHINESE, None) and speaker is not None: + dialect = SPEAKERS[speaker].dialect + if dialect is not None: + lang_id = LANGUAGES[dialect].codec_id + return lang_id + + def _build_codec_control( + self, + language_id: int | None, + speaker_embed: np.ndarray | None = None, + speaker: Speaker | None = None, + perf: dict[str, float] | None = None, + ) -> np.ndarray: + t0 = time.perf_counter() + if language_id is None: + prefix_ids = np.array( + [[CODEC_NOTHINK_ID, CODEC_THINK_BOS_ID, CODEC_THINK_EOS_ID]], dtype=np.int64, + ) + else: + prefix_ids = np.array( + [[CODEC_THINK_ID, CODEC_THINK_BOS_ID, language_id, CODEC_THINK_EOS_ID]], + dtype=np.int64, + ) + + emb_prefix = self._codec_embed(prefix_ids) + emb_suffix = self._codec_embed( + np.array([[CODEC_PAD_ID, CODEC_BOS_ID]], dtype=np.int64), + ) + + spk = None + if speaker_embed is not None: + spk = speaker_embed + elif speaker is not None: + spk = self._codec_embed( + np.array([[SPEAKERS[speaker].codec_id]], dtype=np.int64), + ) + + parts = [emb_prefix] + ([spk] if spk is not None else []) + [emb_suffix] + out = np.concatenate(parts, axis=1) + dt_cc = time.perf_counter() - t0 + _perf_add(perf, "build_codec_control", dt_cc) + logger.debug(f"[perf] build_codec_control: {dt_cc:.3f}s") + return out + + def _build_inputs( + self, + text: str, + speaker: Speaker | None = None, + 
speaker_embed: np.ndarray | None = None, + language: Language | None = None, + instruct: str | None = None, + non_streaming_mode: bool = True, + ref_text: str | None = None, + ref_codes: np.ndarray | None = None, + perf: dict[str, float] | None = None, + ) -> dict: + formatted = _SYNTH_TMPL.format(text=text) + t0 = time.perf_counter() + input_ids = self.tokenizer(formatted, return_tensors="np", padding=False)["input_ids"] + _perf_add(perf, "build_tokenizer", time.perf_counter() - t0) + + tts_bos, tts_eos, tts_pad = self._get_special_embeds(perf) + lang_id = self._resolve_language_id(language, speaker) + codec_ctrl = self._build_codec_control(lang_id, speaker_embed, speaker, perf) + + def _tm(ids: np.ndarray) -> np.ndarray: + t0 = time.perf_counter() + r = self._text_model(ids) + _perf_add(perf, "build_text_model", time.perf_counter() - t0) + return r + + def _ce_misc(ids: np.ndarray) -> np.ndarray: + t0 = time.perf_counter() + r = self._codec_embed(ids) + _perf_add(perf, "build_codec_embed_other", time.perf_counter() - t0) + return r + + # Role prefix: <|im_start|>assistant\n (first 3 tokens) + role = _tm(input_ids[:, :3]) + + # Control signal: text-side padding + bos summed with codec-side embeddings + n_codec = codec_ctrl.shape[1] + text_side = np.concatenate( + [np.tile(tts_pad, (1, n_codec - 2, 1)), tts_bos], axis=1, + ) + control = text_side + codec_ctrl[:, :-1, :] + talker = np.concatenate([role, control], axis=1) + + if instruct: + t0 = time.perf_counter() + inst_ids = self.tokenizer( + _INSTRUCT_TMPL.format(instruct=instruct), return_tensors="np", padding=False, + )["input_ids"] + _perf_add(perf, "build_tokenizer", time.perf_counter() - t0) + talker = np.concatenate([_tm(inst_ids), talker], axis=1) + + use_icl = ref_codes is not None and ref_text is not None + + if use_icl: + t0 = time.perf_counter() + ref_ids = self.tokenizer( + _REF_TEXT_TMPL.format(ref_text=ref_text), return_tensors="np", padding=False, + )["input_ids"] + _perf_add(perf, 
"build_tokenizer", time.perf_counter() - t0) + ref_text_ids = ref_ids[:, 3:-2] + target_ids = input_ids[:, 3:-5] + all_text_ids = np.concatenate([ref_text_ids, target_ids], axis=1) + + text_emb = _tm(all_text_ids) + text_eos = np.concatenate([text_emb, tts_eos], axis=1) + + codec_bos_emb = _ce_misc(np.array([[CODEC_BOS_ID]], dtype=np.int64)) + ref_emb = self._embed_ref_codes(ref_codes[0], perf) + codec_bos_ref = np.concatenate([codec_bos_emb, ref_emb], axis=1) + + text_block = text_eos + _ce_misc( + np.full((1, text_eos.shape[1]), CODEC_PAD_ID, dtype=np.int64), + ) + codec_block = codec_bos_ref + np.tile(tts_pad, (1, codec_bos_ref.shape[1], 1)) + + final_bos = tts_pad + _ce_misc( + np.array([[CODEC_BOS_ID]], dtype=np.int64), + ) + talker = np.concatenate([talker, text_block, codec_block, final_bos], axis=1) + trailing = tts_pad + + elif non_streaming_mode: + text_ids = input_ids[:, 3:-5] + text_emb = _tm(text_ids) + text_eos = np.concatenate([text_emb, tts_eos], axis=1) + codec_pad_seq = _ce_misc( + np.full((1, text_eos.shape[1]), CODEC_PAD_ID, dtype=np.int64), + ) + final_bos = tts_pad + _ce_misc( + np.array([[CODEC_BOS_ID]], dtype=np.int64), + ) + talker = np.concatenate([talker, text_eos + codec_pad_seq, final_bos], axis=1) + trailing = tts_pad + + else: + first = _tm(input_ids[:, 3:4]) + talker = np.concatenate([talker, first + codec_ctrl[:, -1:, :]], axis=1) + remaining = _tm(input_ids[:, 4:-5]) + trailing = np.concatenate([remaining, tts_eos], axis=1) + + _b, seq_len, _h = talker.shape + logger.debug(f"[info] inputs_embeds shape: batch={_b} seq_len={seq_len} hidden={_h}") + if perf is not None: + logger.debug( + f"[perf] build_inputs components: tokenizer={perf.get('build_tokenizer', 0):.3f}s " + f"text_model={perf.get('build_text_model', 0):.3f}s " + f"codec_control={perf.get('build_codec_control', 0):.3f}s " + f"codec_embed_other={perf.get('build_codec_embed_other', 0):.3f}s " + f"ref_embed={perf.get('build_ref_embed', 0):.3f}s", + ) + + return 
{"inputs_embeds": talker, "trailing_text_hidden": trailing, "tts_pad_embed": tts_pad} + + def _embed_ref_codes(self, codes_2d: np.ndarray, perf: dict[str, float] | None = None) -> np.ndarray: + t0 = time.perf_counter() + T = codes_2d.shape[0] + result = self._codec_embed(codes_2d[:, 0].reshape(1, T).astype(np.int64)) + for i in range(1, codes_2d.shape[1]): + result = result + self._cp_codec_embed( + codes_2d[:, i].reshape(1, T).astype(np.int64), step_idx=i - 1, + ) + dt = time.perf_counter() - t0 + _perf_add(perf, "build_ref_embed", dt) + logger.debug(f"[perf] embed_ref_codes: {dt:.3f}s") + logger.debug(f"[info] embed_ref_codes T_ref_frames={T}") + return result + + # ---- Sub-code generation ------------------------------------------------ + + def _generate_sub_codes( + self, + first_code_embed: np.ndarray, + past_hidden: np.ndarray, + gen_config: OV_Qwen3TTSGenConfig, + ) -> tuple[list[int], np.ndarray, float, float]: + num_sub = NUM_CODE_GROUPS - 1 + self._cp_req.reset_state() + + prefill = np.concatenate([past_hidden, first_code_embed], axis=1) + cos, sin = H.slice_rope(self._cp_cos, self._cp_sin, 0, 2) + t_pf0 = time.perf_counter() + logits, _ = self._cp_infer(prefill, cos, sin, gen_steps=0) + t_cp_prefill = time.perf_counter() - t_pf0 + + t_dc0 = time.perf_counter() + tid = H.sample_token( + logits[0, -1, :], + gen_config.subtalker_do_sample, gen_config.subtalker_top_k, + gen_config.subtalker_top_p, gen_config.subtalker_temperature, + ) + sub_codes = [tid] + + code_emb = self._cp_codec_embed(np.array([[tid]], dtype=np.int64), step_idx=0) + embeds_sum = first_code_embed + code_emb + cache_pos = 2 + + for step in range(1, num_sub): + cos, sin = H.slice_rope(self._cp_cos, self._cp_sin, cache_pos, 1) + logits, _ = self._cp_infer(code_emb, cos, sin, gen_steps=step) + + tid = H.sample_token( + logits[0, -1, :], + gen_config.subtalker_do_sample, gen_config.subtalker_top_k, + gen_config.subtalker_top_p, gen_config.subtalker_temperature, + ) + sub_codes.append(tid) + 
+ code_emb = self._cp_codec_embed(np.array([[tid]], dtype=np.int64), step_idx=step) + embeds_sum = embeds_sum + code_emb + cache_pos += 1 + + t_cp_decode = time.perf_counter() - t_dc0 + return sub_codes, embeds_sum, t_cp_prefill, t_cp_decode + + # ---- Core generation loop ----------------------------------------------- + + def _run_loop( + self, + inp: dict, + gen_config: OV_Qwen3TTSGenConfig, + perf: dict[str, float] | None = None, + ) -> list[list[int]]: + """Run the autoregressive talker + code-predictor loop. + + Returns: + List of codec frame codes (each frame is a list of NUM_CODE_GROUPS ints). + """ + embeds = inp["inputs_embeds"] + trailing = inp["trailing_text_hidden"] + pad_emb = inp["tts_pad_embed"] + + self._talker_req.reset_state() + S = embeds.shape[1] + cos, sin = H.slice_rope(self._mrope_cos, self._mrope_sin, 0, S) + + t0 = time.perf_counter() + logits, hidden = self._talker_infer(embeds, cos, sin) + dt_tp = time.perf_counter() - t0 + _perf_add(perf, "talker_prefill", dt_tp) + if perf is not None: + perf["_talker_prefill_S"] = S + logger.debug(f"[perf] talker prefill (S={S}): {dt_tp:.3f}s") + + cache_pos = S + first_logits = logits[0, -1, :].copy() + first_logits[SUPPRESS_MASK] = -np.inf + first_code = H.sample_token( + first_logits, gen_config.do_sample, gen_config.top_k, + gen_config.top_p, gen_config.temperature, + ) + + all_codes: list[list[int]] = [] + past_first: list[int] = [] + past_hidden = hidden[:, -1:, :] + t_cp = t_talk = 0.0 + t_cp_pf = t_cp_dc = 0.0 + + step = 0 + while step < gen_config.max_new_tokens: + if first_code == CODEC_EOS_ID: + break + + past_first.append(first_code) + fc_emb = self._codec_embed(np.array([[first_code]], dtype=np.int64)) + + t0 = time.perf_counter() + subs, emb_sum, t_pf, t_dc = self._generate_sub_codes(fc_emb, past_hidden, gen_config) + t_cp += time.perf_counter() - t0 + t_cp_pf += t_pf + t_cp_dc += t_dc + + all_codes.append([first_code] + subs) + + next_emb = emb_sum + if step < trailing.shape[1]: + 
next_emb = next_emb + trailing[:, step : step + 1, :] + else: + next_emb = next_emb + pad_emb + + cos, sin = H.slice_rope(self._mrope_cos, self._mrope_sin, cache_pos, 1) + t0 = time.perf_counter() + logits, hidden = self._talker_infer(next_emb, cos, sin) + t_talk += time.perf_counter() - t0 + + cache_pos += 1 + step += 1 + + sl = logits[0, -1, :].copy() + sl[SUPPRESS_MASK] = -np.inf + if gen_config.repetition_penalty != 1.0 and past_first: + sl = H.apply_repetition_penalty(sl, past_first, gen_config.repetition_penalty) + first_code = H.sample_token( + sl, gen_config.do_sample, gen_config.top_k, + gen_config.top_p, gen_config.temperature, + ) + past_hidden = hidden[:, -1:, :] + + n = step + if n > 0: + dt = t_cp + t_talk + pf = dt / n + if perf is not None: + perf["_num_frames"] = n + _perf_add(perf, "decode_talker", t_talk) + _perf_add(perf, "decode_cp_prefill", t_cp_pf) + _perf_add(perf, "decode_cp_decode", t_cp_dc) + _perf_add(perf, "decode_loop_total", dt) + logger.debug(f"[perf] decode loop ({n} frames):") + logger.debug(f"[perf] code predictor: total={t_cp:.3f}s avg={t_cp/n:.3f}s") + logger.debug(f"[perf] talker decode: total={t_talk:.3f}s avg={t_talk/n:.3f}s") + logger.debug(f"[perf] cp_prefill: total={t_cp_pf:.3f}s avg={t_cp_pf/n:.3f}s") + logger.debug(f"[perf] cp_decode: total={t_cp_dc:.3f}s avg={t_cp_dc/n:.3f}s") + logger.debug(f"[perf] per frame: {pf:.3f}s ({1/pf:.1f} fps)") + logger.debug(f"[perf] throughput: {n * NUM_CODE_GROUPS / dt:.1f} tokens/s") + + return all_codes + + def _run_loop_streaming( + self, + inp: dict, + gen_config: OV_Qwen3TTSGenConfig, + perf: dict[str, float] | None = None, + ) -> Iterator[TTSStreamChunk]: + """Autoregressive loop like `_run_loop`, yielding decoded PCM at chunk boundaries.""" + chunk_size = max(1, gen_config.stream_chunk_frames) + left_ctx = max(0, gen_config.stream_left_context) + + embeds = inp["inputs_embeds"] + trailing = inp["trailing_text_hidden"] + pad_emb = inp["tts_pad_embed"] + + 
self._talker_req.reset_state() + S = embeds.shape[1] + cos, sin = H.slice_rope(self._mrope_cos, self._mrope_sin, 0, S) + + t0 = time.perf_counter() + logits, hidden = self._talker_infer(embeds, cos, sin) + dt_tp = time.perf_counter() - t0 + _perf_add(perf, "talker_prefill", dt_tp) + if perf is not None: + perf["_talker_prefill_S"] = S + logger.debug(f"[perf] talker prefill (S={S}): {dt_tp:.3f}s") + + cache_pos = S + first_logits = logits[0, -1, :].copy() + first_logits[SUPPRESS_MASK] = -np.inf + first_code = H.sample_token( + first_logits, gen_config.do_sample, gen_config.top_k, + gen_config.top_p, gen_config.temperature, + ) + + buffer: list[list[int]] = [] + prev_tail: list[list[int]] | None = None + chunk_index = 0 + past_first: list[int] = [] + past_hidden = hidden[:, -1:, :] + t_cp = t_talk = 0.0 + t_cp_pf = t_cp_dc = 0.0 + + step = 0 + while step < gen_config.max_new_tokens: + if first_code == CODEC_EOS_ID: + break + + past_first.append(first_code) + fc_emb = self._codec_embed(np.array([[first_code]], dtype=np.int64)) + + t0 = time.perf_counter() + subs, emb_sum, t_pf, t_dc = self._generate_sub_codes(fc_emb, past_hidden, gen_config) + t_cp += time.perf_counter() - t0 + t_cp_pf += t_pf + t_cp_dc += t_dc + + buffer.append([first_code] + subs) + + if len(buffer) >= chunk_size: + to_decode = buffer[:chunk_size] + pcm = self._chunked_decode(to_decode, prev_tail, left_ctx, perf) + yield TTSStreamChunk(pcm, chunk_index, is_final=False) + take = min(left_ctx, len(to_decode)) + prev_tail = to_decode[-take:] if take > 0 else None + buffer = buffer[chunk_size:] + chunk_index += 1 + + next_emb = emb_sum + if step < trailing.shape[1]: + next_emb = next_emb + trailing[:, step : step + 1, :] + else: + next_emb = next_emb + pad_emb + + cos, sin = H.slice_rope(self._mrope_cos, self._mrope_sin, cache_pos, 1) + t0 = time.perf_counter() + logits, hidden = self._talker_infer(next_emb, cos, sin) + t_talk += time.perf_counter() - t0 + + cache_pos += 1 + step += 1 + + sl = logits[0, 
def _log_pipeline_summary(
    self,
    perf: dict,
    t_total_start: float,
    wav_seconds: float,
    voice_clone: bool,
) -> None:
    """Emit an INFO-level timing breakdown for one synthesis request.

    Args:
        perf: Timing accumulator. Stage timings are read with ``perf.get``
            and default to 0.0 when the key is absent.
        t_total_start: ``time.perf_counter()`` value captured when the
            request started; wall time is measured from it.
        wav_seconds: Duration of the produced audio, in seconds.
        voice_clone: When true, the reference-audio stages (audio decode,
            speaker encoder, speech encoder) are reported as well.
    """
    elapsed = time.perf_counter() - t_total_start

    def seconds(key: str) -> float:
        # A stage that never recorded a timing simply counts as zero.
        return float(perf.get(key, 0.0))

    tokenizer_s = seconds("build_tokenizer")
    text_model_s = seconds("build_text_model")
    codec_control_s = seconds("build_codec_control")
    codec_embed_s = seconds("build_codec_embed_other")
    ref_embed_s = seconds("build_ref_embed")
    build_s = tokenizer_s + text_model_s + codec_control_s + codec_embed_s + ref_embed_s

    logger.info("[perf] === PIPELINE SUMMARY ===")

    # Reference-audio stages are only reported for voice cloning, and only
    # when they actually took measurable time.
    if voice_clone:
        audio_decode_s = seconds("audio_decode")
        if audio_decode_s > 0:
            logger.info(f"[perf] audio_decode: {audio_decode_s:.3f}s")

        mel_s = seconds("speaker_mel")
        speaker_ov_s = seconds("speaker_ov")
        if mel_s > 0 or speaker_ov_s > 0:
            logger.info(f"[perf] speaker_encoder: {mel_s + speaker_ov_s:.3f}s (mel: {mel_s:.3f}s, ov: {speaker_ov_s:.3f}s)")

        resample_s = seconds("speech_resample")
        speech_ov_s = seconds("speech_ov")
        if speech_ov_s > 0 or resample_s > 0:
            logger.info(f"[perf] speech_encoder: {resample_s + speech_ov_s:.3f}s (resample: {resample_s:.3f}s, ov: {speech_ov_s:.3f}s)")

    logger.info(
        f"[perf] build_inputs: {build_s:.3f}s "
        f"(tokenizer: {tokenizer_s:.3f}s, text_model: {text_model_s:.3f}s, codec_control: {codec_control_s:.3f}s, "
        f"codec_embed: {codec_embed_s:.3f}s, ref_embed: {ref_embed_s:.3f}s)",
    )

    prefill_s = seconds("talker_prefill")
    prefill_len = perf.get("_talker_prefill_S")
    if prefill_s > 0:
        # The "(S=...)" suffix is appended only when an int was recorded.
        prefill_suffix = f" (S={prefill_len})" if isinstance(prefill_len, int) else ""
        logger.info(f"[perf] talker_prefill: {prefill_s:.3f}s{prefill_suffix}")

    frames = int(perf.get("_num_frames", 0) or 0)
    loop_s = seconds("decode_loop_total")
    talker_s = seconds("decode_talker")
    cp_prefill_s = seconds("decode_cp_prefill")
    cp_decode_s = seconds("decode_cp_decode")
    if frames > 0 and loop_s > 0:
        logger.info(f"[perf] decode_loop: {loop_s:.3f}s (N={frames} frames)")
        logger.info(f"[perf] talker_decode: {talker_s:.3f}s (avg {1000 * talker_s / frames:.2f}ms/frame)")
        logger.info(f"[perf] cp_prefill: {cp_prefill_s:.3f}s (avg {1000 * cp_prefill_s / frames:.2f}ms/frame)")
        logger.info(f"[perf] cp_decode: {cp_decode_s:.3f}s (avg {1000 * cp_decode_s / frames:.2f}ms/frame)")

    decoder_s = seconds("speech_decoder")
    decoder_shape = perf.get("_decoder_in_shape")
    if decoder_s > 0:
        if isinstance(decoder_shape, tuple) and len(decoder_shape) == 3:
            dim0, dim1, dim2 = decoder_shape
            shape_suffix = f" (input shape: {dim0}x{dim1}x{dim2})"
        else:
            shape_suffix = ""
        logger.info(f"[perf] speech_decoder: {decoder_s:.3f}s{shape_suffix}")

    # Guard against a zero/negative wall time before computing the ratio.
    realtime_factor = wav_seconds / elapsed if elapsed > 0 else 0.0
    logger.info(f"[perf] TOTAL: {elapsed:.3f}s -> {wav_seconds:.2f}s audio ({realtime_factor:.2f}x realtime)")


@staticmethod
def _log_summary(codes: list, wav: np.ndarray, t_total_start: float):
    """Log total elapsed time plus the frame count, sample count and audio length.

    The audio length is ``len(wav)`` divided by ``SPEECH_DECODER_SR``.
    """
    elapsed = time.perf_counter() - t_total_start
    n_samples = len(wav)
    logger.info(f"[perf] total: {elapsed:.3f}s")
    logger.info(f"[info] {len(codes)} frames -> {n_samples} samples ({n_samples/SPEECH_DECODER_SR:.2f}s audio)")
+ _output_wav = Path("voice_clone_out.wav") + _device = "GPU.2" + + if not _ref_wav.is_file(): + raise SystemExit(f"Reference audio not found: {_ref_wav}") + if not _ov_dir.is_dir(): + raise SystemExit(f"OpenVINO model directory not found: {_ov_dir}") + + _ref_b64 = base64.b64encode(_ref_wav.read_bytes()).decode("utf-8") + _load = ModelLoadConfig( + model_path=str(_ov_dir), + model_name="qwen3-tts-voice-clone-entry", + model_type=ModelType.QWEN3_TTS_VOICE_CLONE, + engine=EngineType.OPENVINO, + device=_device, + runtime_config={}, + ) + _gen = OV_Qwen3TTSGenConfig( + input=_synth_text, + ref_audio_b64=_ref_b64, + ref_text=_ref_text, + x_vector_only=False, + language=None, + instruct=None, + ) + + _engine = OVQwen3TTS(_load) + _engine.load_model(_load) + + async def _entry_run() -> tuple[np.ndarray, int]: + return await _engine.generate(_gen) + + _wav, _sr = asyncio.run(_entry_run()) + _output_wav.parent.mkdir(parents=True, exist_ok=True) + sf.write(str(_output_wav), _wav, _sr, subtype="PCM_16") + print(f"Wrote {_output_wav} ({len(_wav) / _sr:.2f}s @ {_sr} Hz)") diff --git a/src/engine/openvino/qwen3_tts/qwen3_tts_helpers.py b/src/engine/openvino/qwen3_tts/qwen3_tts_helpers.py new file mode 100644 index 0000000..d8dbf32 --- /dev/null +++ b/src/engine/openvino/qwen3_tts/qwen3_tts_helpers.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +import base64 +import io +import wave as wave_mod +from dataclasses import dataclass +from enum import StrEnum + +import librosa +import numpy as np +import soundfile as sf + + +# --------------------------------------------------------------------------- +# Constants — fixed for the Qwen3-TTS OpenVINO checkpoint family +# --------------------------------------------------------------------------- + +# Special token IDs +TTS_BOS_TOKEN_ID = 151672 +TTS_EOS_TOKEN_ID = 151673 +TTS_PAD_TOKEN_ID = 151671 +CODEC_BOS_ID = 2149 +CODEC_EOS_ID = 2150 +CODEC_PAD_ID = 2148 +CODEC_THINK_ID = 2154 +CODEC_NOTHINK_ID = 2155 
+CODEC_THINK_BOS_ID = 2156 +CODEC_THINK_EOS_ID = 2157 + +# Talker architecture +NUM_CODE_GROUPS = 16 +HIDDEN_SIZE = 2048 +HEAD_DIM = 128 +VOCAB_SIZE = 3072 +TALKER_MAX_POS = 32768 +TALKER_ROPE_THETA = 1_000_000.0 +MROPE_SECTION = (24, 20, 20) + +# Code predictor architecture +CP_HEAD_DIM = 128 +CP_MAX_POS = 65536 +CP_ROPE_THETA = 1_000_000.0 + +# Speech decoder +SPEECH_DECODER_SR = 24000 + +# Speaker encoder mel-spectrogram params +SE_SR = 24000 +SE_N_FFT = 1024 +SE_HOP = 256 +SE_WIN = 1024 +SE_N_MELS = 128 +SE_FMIN = 0.0 +SE_FMAX = 12000.0 + +# Speech encoder +ENC_INPUT_SR = 24000 + +# Prompt templates +_INSTRUCT_TMPL = "<|im_start|>user\n{instruct}<|im_end|>\n" +_SYNTH_TMPL = "<|im_start|>assistant\n{text}<|im_end|>\n<|im_start|>assistant\n" +_REF_TEXT_TMPL = "<|im_start|>assistant\n{ref_text}<|im_end|>\n" + +# Suppress mask: block last 1024 codec IDs except EOS +SUPPRESS_MASK = np.zeros(VOCAB_SIZE, dtype=bool) +for _i in range(VOCAB_SIZE - 1024, VOCAB_SIZE): + if _i != CODEC_EOS_ID: + SUPPRESS_MASK[_i] = True + + +# --------------------------------------------------------------------------- +# Language and Speaker enums + registries +# --------------------------------------------------------------------------- + + +class Language(StrEnum): + CHINESE = "chinese" + ENGLISH = "english" + GERMAN = "german" + ITALIAN = "italian" + PORTUGUESE = "portuguese" + SPANISH = "spanish" + JAPANESE = "japanese" + KOREAN = "korean" + FRENCH = "french" + RUSSIAN = "russian" + BEIJING_DIALECT = "beijing_dialect" + SICHUAN_DIALECT = "sichuan_dialect" + + +@dataclass(frozen=True, slots=True) +class LanguageInfo: + codec_id: int + + +LANGUAGES: dict[Language, LanguageInfo] = { + Language.CHINESE: LanguageInfo(codec_id=2055), + Language.ENGLISH: LanguageInfo(codec_id=2050), + Language.GERMAN: LanguageInfo(codec_id=2053), + Language.ITALIAN: LanguageInfo(codec_id=2070), + Language.PORTUGUESE: LanguageInfo(codec_id=2071), + Language.SPANISH: LanguageInfo(codec_id=2054), + 
Language.JAPANESE: LanguageInfo(codec_id=2058), + Language.KOREAN: LanguageInfo(codec_id=2064), + Language.FRENCH: LanguageInfo(codec_id=2061), + Language.RUSSIAN: LanguageInfo(codec_id=2069), + Language.BEIJING_DIALECT: LanguageInfo(codec_id=2074), + Language.SICHUAN_DIALECT: LanguageInfo(codec_id=2062), +} + + +class Speaker(StrEnum): + SERENA = "serena" + VIVIAN = "vivian" + UNCLE_FU = "uncle_fu" + RYAN = "ryan" + AIDEN = "aiden" + ONO_ANNA = "ono_anna" + SOHEE = "sohee" + ERIC = "eric" + DYLAN = "dylan" + + +@dataclass(frozen=True, slots=True) +class SpeakerInfo: + codec_id: int + dialect: Language | None = None + + +SPEAKERS: dict[Speaker, SpeakerInfo] = { + Speaker.SERENA: SpeakerInfo(codec_id=3066), + Speaker.VIVIAN: SpeakerInfo(codec_id=3065), + Speaker.UNCLE_FU: SpeakerInfo(codec_id=3010), + Speaker.RYAN: SpeakerInfo(codec_id=3061), + Speaker.AIDEN: SpeakerInfo(codec_id=2861), + Speaker.ONO_ANNA: SpeakerInfo(codec_id=2873), + Speaker.SOHEE: SpeakerInfo(codec_id=2864), + Speaker.ERIC: SpeakerInfo(codec_id=2875, dialect=Language.SICHUAN_DIALECT), + Speaker.DYLAN: SpeakerInfo(codec_id=2878, dialect=Language.BEIJING_DIALECT), +} + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +class OVQwen3TTSHelpers: + """Static utility methods for sampling, RoPE, OV dispatch, and audio I/O.""" + + # ---- Sampling ----------------------------------------------------------- + + @staticmethod + def softmax(x: np.ndarray) -> np.ndarray: + x = x - np.max(x) + e = np.exp(x) + return e / e.sum() + + @staticmethod + def sample_token( + logits: np.ndarray, + do_sample: bool = True, + top_k: int = 50, + top_p: float = 1.0, + temperature: float = 0.9, + ) -> int: + logits = logits.copy().astype(np.float32) + if not do_sample: + return int(np.argmax(logits)) + if temperature != 1.0: + logits /= temperature + if top_k > 0: + top_k = min(top_k, logits.shape[-1]) 
+ threshold = np.partition(logits, -top_k)[-top_k] + logits[logits < threshold] = -np.inf + if top_p < 1.0: + idx = np.argsort(logits)[::-1] + sl = logits[idx] + probs = OVQwen3TTSHelpers.softmax(sl) + cutoff = np.searchsorted(np.cumsum(probs), top_p) + 1 + sl[cutoff:] = -np.inf + logits[idx] = sl + probs = OVQwen3TTSHelpers.softmax(logits) + return int(np.random.choice(len(probs), p=probs)) + + @staticmethod + def apply_repetition_penalty( + logits: np.ndarray, past_tokens: list[int], penalty: float + ) -> np.ndarray: + for tid in set(past_tokens): + if logits[tid] > 0: + logits[tid] /= penalty + else: + logits[tid] *= penalty + return logits + + # ---- RoPE --------------------------------------------------------------- + + @staticmethod + def precompute_mrope(max_len: int, head_dim: int, theta: float = TALKER_ROPE_THETA): + inv = 1.0 / (theta ** (np.arange(0, head_dim, 2, dtype=np.float32) / head_dim)) + pos = np.arange(max_len, dtype=np.float32) + freqs = np.outer(pos, inv) + emb = np.concatenate([freqs, freqs], axis=-1) + return np.cos(emb).astype(np.float32), np.sin(emb).astype(np.float32) + + @staticmethod + def precompute_standard_rope(max_len: int, head_dim: int, theta: float = 10_000.0): + inv = 1.0 / (theta ** (np.arange(0, head_dim, 2, dtype=np.float32) / head_dim)) + pos = np.arange(max_len, dtype=np.float32) + freqs = np.outer(pos, inv) + emb = np.concatenate([freqs, freqs], axis=-1) + return np.cos(emb).astype(np.float32), np.sin(emb).astype(np.float32) + + @staticmethod + def slice_rope(cos, sin, start: int, length: int): + c = cos[start : start + length][np.newaxis, np.newaxis] + s = sin[start : start + length][np.newaxis, np.newaxis] + return c, s + + # ---- OV dispatch -------------------------------------------------------- + + @staticmethod + def ov_call(compiled_model, inputs: dict) -> dict: + result = compiled_model(inputs) + return {out.get_any_name(): result[out] for out in compiled_model.outputs} + + @staticmethod + def 
ov_stateful_infer(request, inputs: dict) -> dict: + request.infer(inputs) + return { + out.get_any_name(): request.get_tensor(out.get_any_name()).data.copy() + for out in request.model_outputs + } + + # ---- Audio I/O ---------------------------------------------------------- + + @staticmethod + def load_audio_wav(path: str) -> tuple[np.ndarray, int]: + with wave_mod.open(path, "r") as wf: + n_ch = wf.getnchannels() + sw = wf.getsampwidth() + sr = wf.getframerate() + raw = wf.readframes(wf.getnframes()) + if sw == 2: + samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0 + elif sw == 4: + samples = np.frombuffer(raw, dtype=np.int32).astype(np.float32) / 2147483648.0 + elif sw == 1: + samples = np.frombuffer(raw, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 + else: + raise ValueError(f"Unsupported sample width: {sw}") + if n_ch > 1: + samples = samples.reshape(-1, n_ch).mean(axis=1) + return samples, sr + + @staticmethod + def decode_audio_b64(b64: str) -> tuple[np.ndarray, int]: + data, sr = sf.read(io.BytesIO(base64.b64decode(b64)), dtype="float32") + if data.ndim > 1: + data = data.mean(axis=1) + return data, sr + + @staticmethod + def mel_spectrogram( + audio: np.ndarray, + sr: int, + target_sr: int = SE_SR, + n_fft: int = SE_N_FFT, + hop_size: int = SE_HOP, + win_size: int = SE_WIN, + n_mels: int = SE_N_MELS, + fmin: float = SE_FMIN, + fmax: float = SE_FMAX, + ) -> np.ndarray: + """Log-mel spectrogram -> (n_mels, T) float32.""" + audio = audio.astype(np.float32) + if sr != target_sr: + audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr) + pad = (n_fft - hop_size) // 2 + audio = np.pad(audio, (pad, pad), mode="reflect") + stft = librosa.stft( + audio, n_fft=n_fft, hop_length=hop_size, win_length=win_size, + window="hann", center=False, + ) + mag = np.sqrt(stft.real ** 2 + stft.imag ** 2 + 1e-9).astype(np.float32) + mel_basis = librosa.filters.mel( + sr=target_sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, + 
).astype(np.float32) + return np.log(np.clip(mel_basis @ mag, 1e-5, None)).astype(np.float32) + + +H = OVQwen3TTSHelpers # short alias used inside the engine diff --git a/src/server/main.py b/src/server/main.py index a8fefa8..10ae4b3 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -21,17 +21,16 @@ from starlette.middleware.base import BaseHTTPMiddleware from src.server.model_registry import ModelRegistry -from src.server.models.registration import ModelLoadConfig, ModelUnloadConfig -from src.server.models.openvino import OV_KokoroGenConfig +from src.server.models.registration import ModelLoadConfig, ModelType, ModelUnloadConfig from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig from src.server.models.optimum import PreTrainedTokenizerConfig, RerankerConfig from src.server.models.requests_internal import OpenArcBenchRequest from src.server.models.requests_openai import ( EmbeddingsRequest, + OpenArcASRConfig, OpenAIChatCompletionRequest, OpenAICompletionRequest, - OpenAIKokoroRequest, - OpenAIWhisperRequest, + OpenAISpeechRequest, RerankRequest, ) from src.server.worker_registry import WorkerRegistry @@ -637,24 +636,37 @@ async def event_stream() -> AsyncIterator[bytes]: async def openai_audio_transcriptions( file: UploadFile = File(..., description="The audio file to transcribe"), model: str = Form(..., description="ID of the model to use"), - language: Optional[str] = Form(None, description="Language of the input audio"), - prompt: Optional[str] = Form(None, description="Optional text to guide the model"), response_format: Optional[str] = Form("json", description="Format of output"), - temperature: Optional[float] = Form(0.0, description="Sampling temperature") + openarc_asr: Optional[str] = Form(None, description="JSON: OpenArcASRConfig with qwen3_asr params"), ): try: logger.info(f'"{model}" request received') - # Read the uploaded audio file audio_bytes = await file.read() - - # Convert to base64 for internal 
processing - audio_base64 = base64.b64encode(audio_bytes).decode('utf-8') - - # Create generation config with base64 audio - gen_config = OVGenAI_WhisperGenConfig(audio_base64=audio_base64) - - # Process transcription - result = await _workers.transcribe_whisper(model, gen_config) + audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") + + selected_model_type = None + async with _registry._lock: + for record in _registry._models.values(): + if record.model_name == model: + selected_model_type = record.model_type + break + + if selected_model_type is None: + raise ValueError(f"Model '{model}' is not loaded") + + normalized_model_type = ModelType(selected_model_type) + + if normalized_model_type == ModelType.QWEN3_ASR: + if not openarc_asr: + raise ValueError("openarc_asr required for Qwen3 ASR models") + cfg = OpenArcASRConfig.model_validate(json.loads(openarc_asr)) + if not cfg.qwen3_asr: + raise ValueError("openarc_asr.qwen3_asr required for Qwen3 ASR models") + gen_config = cfg.qwen3_asr.model_copy(update={"audio_base64": audio_base64}) + result = await _workers.transcribe_qwen3_asr(model, gen_config) + else: + gen_config = OVGenAI_WhisperGenConfig(audio_base64=audio_base64) + result = await _workers.transcribe_whisper(model, gen_config) metrics = result.get("metrics", {}) logger.info(f"[audio/transcriptions] model={model} metrics={metrics}") @@ -665,7 +677,7 @@ async def openai_audio_transcriptions( elif response_format == "verbose_json": return { "text": result.get("text", ""), - "language": language, + "language": metrics.get("language"), "duration": metrics.get("duration"), "metrics": metrics } @@ -680,35 +692,53 @@ async def openai_audio_transcriptions( @app.post("/v1/audio/speech", dependencies=[Depends(verify_api_key)]) -async def openai_audio_speech(request: OpenAIKokoroRequest): - """OpenAI-compatible endpoint for text-to-speech using Kokoro models. - - Returns a WAV file containing the synthesized speech. 
- """ +async def openai_audio_speech(request: OpenAISpeechRequest): + """OpenAI-compatible endpoint for text-to-speech. Routes to Kokoro or Qwen3 TTS based on model type.""" try: logger.info(f'"{request.model}" request received') - gen_config = OV_KokoroGenConfig( - kokoro_message=request.input, - voice=request.voice, - lang_code=request.language, - speed=request.speed, - response_format=request.response_format - ) + selected_model_type = None + async with _registry._lock: + for record in _registry._models.values(): + if record.model_name == request.model: + selected_model_type = record.model_type + break + + if selected_model_type is None: + raise ValueError(f"Model '{request.model}' is not loaded") + + normalized = ModelType(selected_model_type) + + if normalized in ( + ModelType.QWEN3_TTS_CUSTOM_VOICE, + ModelType.QWEN3_TTS_VOICE_DESIGN, + ModelType.QWEN3_TTS_VOICE_CLONE, + ): + if not request.openarc_tts or not request.openarc_tts.qwen3_tts: + raise ValueError("openarc_tts.qwen3_tts required for Qwen3 TTS models") + gen_config = request.openarc_tts.qwen3_tts + gen_config.input = request.input + if gen_config.stream: + return StreamingResponse( + _workers.stream_generate_speech_qwen3_tts(request.model, gen_config), + media_type="audio/L16;rate=24000;channels=1", + ) + result = await _workers.generate_speech_qwen3_tts(request.model, gen_config) + else: + if not request.openarc_tts or not request.openarc_tts.kokoro: + raise ValueError("openarc_tts.kokoro required for Kokoro models") + gen_config = request.openarc_tts.kokoro + gen_config.input = request.input + result = await _workers.generate_speech_kokoro(request.model, gen_config) - result = await _workers.generate_speech_kokoro(request.model, gen_config) metrics = result.get("metrics", {}) - logger.info(f"[audio/speech] model={request.model} voice={request.voice} metrics={metrics}") - # Decode base64 audio and return as WAV file - import base64 audio_bytes = base64.b64decode(result.get("audio_base64", "")) - 
return StreamingResponse( iter([audio_bytes]), media_type="audio/wav", - headers={"Content-Disposition": "attachment; filename=speech.wav"} + headers={"Content-Disposition": "attachment; filename=speech.wav"}, ) except ValueError as exc: diff --git a/src/server/model_registry.py b/src/server/model_registry.py index f926e90..f6824dc 100644 --- a/src/server/model_registry.py +++ b/src/server/model_registry.py @@ -225,7 +225,11 @@ async def status(self) -> dict: (EngineType.OV_GENAI, ModelType.LLM): "src.engine.ov_genai.llm.OVGenAI_LLM", (EngineType.OV_GENAI, ModelType.VLM): "src.engine.ov_genai.vlm.OVGenAI_VLM", (EngineType.OV_GENAI, ModelType.WHISPER): "src.engine.ov_genai.whisper.OVGenAI_Whisper", + (EngineType.OPENVINO, ModelType.QWEN3_ASR): "src.engine.openvino.qwen3_asr.qwen3_asr.OVQwen3ASR", (EngineType.OPENVINO, ModelType.KOKORO): "src.engine.openvino.kokoro.OV_Kokoro", + (EngineType.OPENVINO, ModelType.QWEN3_TTS_CUSTOM_VOICE): "src.engine.openvino.qwen3_tts.qwen3_tts.OVQwen3TTS", + (EngineType.OPENVINO, ModelType.QWEN3_TTS_VOICE_DESIGN): "src.engine.openvino.qwen3_tts.qwen3_tts.OVQwen3TTS", + (EngineType.OPENVINO, ModelType.QWEN3_TTS_VOICE_CLONE): "src.engine.openvino.qwen3_tts.qwen3_tts.OVQwen3TTS", (EngineType.OV_OPTIMUM, ModelType.EMB): "src.engine.optimum.optimum_emb.Optimum_EMB", (EngineType.OV_OPTIMUM, ModelType.RERANK): "src.engine.optimum.optimum_rr.Optimum_RR", } diff --git a/src/server/models/openvino.py b/src/server/models/openvino.py index b4f6202..e789eed 100644 --- a/src/server/models/openvino.py +++ b/src/server/models/openvino.py @@ -1,6 +1,7 @@ from enum import Enum -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator +from typing import Optional @@ -91,9 +92,76 @@ class KokoroVoice(str, Enum): PM_SANTA = "pm_santa" class OV_KokoroGenConfig(BaseModel): - kokoro_message: str = Field(..., description="Text to convert to speech") - voice: KokoroVoice = Field(..., description="Voice token from available 
class OV_Qwen3ASRGenConfig(BaseModel):
    """Request parameters for Qwen3 ASR transcription.

    ``max_tokens`` and the three chunking floats are validated to be strictly
    positive.
    """
    audio_base64: str | None = Field(default=None, description="Base64 encoded audio payload (injected from file when omitted)")
    language: Optional[str] = Field(default=None, description="Optional forced language")
    max_tokens: int = Field(default=1024, description="Maximum generated tokens per chunk")
    max_chunk_sec: float = Field(default=30.0, description="Chunk size upper bound in seconds")
    search_expand_sec: float = Field(default=5.0, description="Boundary search expansion in seconds")
    min_window_ms: float = Field(default=100.0, description="Energy window in milliseconds")

    @field_validator("max_tokens")
    @classmethod
    def _validate_max_tokens(cls, v: int) -> int:
        if v <= 0:
            raise ValueError("max_tokens must be positive")
        return v

    @field_validator("max_chunk_sec", "search_expand_sec", "min_window_ms")
    @classmethod
    def _validate_positive_float(cls, v: float) -> float:
        if v <= 0:
            raise ValueError("numeric values must be positive")
        return v

class OV_Qwen3TTSGenConfig(BaseModel):
    """Single source of truth for all OVQwen3TTS request parameters.

    The model_type on ModelLoadConfig determines which mode the engine runs;
    supply only the fields relevant to that mode:

    - qwen3_tts_custom_voice : input, speaker, language, instruct
    - qwen3_tts_voice_design : input, voice_description, language
    - qwen3_tts_voice_clone : input, ref_audio_b64, ref_text, x_vector_only, language, instruct

    All modes accept the sampling fields.

    The temperatures and the repetition penalty are used as divisors by the
    sampling helpers, so they are constrained to be strictly positive here and
    a bad request is rejected at validation time instead of failing during
    inference.
    """
    # --- content ---
    input: str = Field(..., description="Text to synthesise.")
    # [custom_voice]
    speaker: str | None = Field(default=None, description="[custom_voice] Predefined speaker name.")
    instruct: str | None = Field(default=None, description="[custom_voice, voice_clone] Optional style instruction.")
    # [all]
    language: str | None = Field(default=None, description="[all] Force output language. None = auto-detect.")
    # [voice_design]
    voice_description: str | None = Field(default=None, description="[voice_design] Free-form voice description.")
    # [voice_clone]
    ref_audio_b64: str | None = Field(default=None, description="[voice_clone] Base64-encoded reference WAV.")
    ref_text: str | None = Field(default=None, description="[voice_clone] Transcript of reference audio (enables ICL).")
    x_vector_only: bool = Field(default=False, description="[voice_clone] Use x-vector embedding only; skip ICL even if ref_text is set.")
    # --- sampling (all modes) ---
    max_new_tokens: int = Field(default=2048, description="Maximum codec frames to generate.")
    do_sample: bool = Field(default=True, description="Sample from logits. False = greedy.")
    top_k: int = Field(default=50, description="Top-k filter for talker logits.")
    top_p: float = Field(default=1.0, description="Nucleus filter for talker logits. 1.0 = off.")
    # Logits are divided by the temperature, so zero or negative values are invalid.
    temperature: float = Field(default=0.9, gt=0.0, description="Temperature scaling for talker logits.")
    # Positive logits are divided by the penalty, so it must be > 0 as well.
    repetition_penalty: float = Field(default=1.05, gt=0.0, description="Repetition penalty on first-codebook history. 1.0 = off.")
    non_streaming_mode: bool = Field(default=True, description="True = all text tokens in prefill; False = drip-fed during decode.")
    subtalker_do_sample: bool = Field(default=True, description="Sample sub-codebook logits.")
    subtalker_top_k: int = Field(default=50, description="Top-k for code predictor.")
    subtalker_top_p: float = Field(default=1.0, description="Nucleus filter for code predictor.")
    # NOTE(review): assumed to reach the same temperature-scaling sampler as
    # `temperature` — confirm against the engine's code-predictor loop.
    subtalker_temperature: float = Field(default=0.9, gt=0.0, description="Temperature for code predictor.")
    # --- streaming (HTTP: audio/L16 chunked response when stream=True) ---
    stream: bool = Field(default=True, description="Enable streaming audio output (chunked PCM).")
    stream_chunk_frames: int = Field(default=50, description="Codec frames per streaming chunk.")
    stream_left_context: int = Field(default=25, description="Left context frames for chunk boundary continuity.")
b/src/server/models/requests_openai.py @@ -2,9 +2,24 @@ from pydantic import BaseModel +from src.server.models.openvino import ( + OV_KokoroGenConfig, + OV_Qwen3ASRGenConfig, + OV_Qwen3TTSGenConfig, +) from src.server.models.optimum import PreTrainedTokenizerConfig +class OpenArcASRConfig(BaseModel): + """Backend config for /v1/audio/transcriptions. Only qwen3_asr extra params; audio_base64 from file.""" + qwen3_asr: Optional[OV_Qwen3ASRGenConfig] = None + + +class OpenArcTTSConfig(BaseModel): + kokoro: Optional[OV_KokoroGenConfig] = None + qwen3_tts: Optional[OV_Qwen3TTSGenConfig] = None + + class OpenAIChatCompletionRequest(BaseModel): model: str messages: Any @@ -46,13 +61,15 @@ class OpenAIWhisperRequest(BaseModel): -class OpenAIKokoroRequest(BaseModel): +class OpenAISpeechRequest(BaseModel): + """OpenAI-compatible request for /v1/audio/speech; backend config in openarc_tts.""" model: str input: str voice: Optional[str] = None - speed: Optional[float] = None + instructions: Optional[str] = None language: Optional[str] = None response_format: Optional[str] = "wav" + openarc_tts: Optional[OpenArcTTSConfig] = None # https://platform.openai.com/docs/api-reference/embeddings diff --git a/src/server/worker_registry.py b/src/server/worker_registry.py index 5797f12..2e54bd5 100644 --- a/src/server/worker_registry.py +++ b/src/server/worker_registry.py @@ -3,6 +3,7 @@ import uuid import base64 import io +import numpy as np import torch import soundfile as sf from dataclasses import dataclass @@ -12,10 +13,12 @@ from src.engine.ov_genai.vlm import OVGenAI_VLM from src.engine.ov_genai.whisper import OVGenAI_Whisper from src.engine.openvino.kokoro import OV_Kokoro +from src.engine.openvino.qwen3_asr.qwen3_asr import OVQwen3ASR +from src.engine.openvino.qwen3_tts.qwen3_tts import OVQwen3TTS from src.engine.optimum.optimum_emb import Optimum_EMB from src.engine.optimum.optimum_rr import Optimum_RR -from src.server.models.openvino import OV_KokoroGenConfig +from 
src.server.models.openvino import OV_KokoroGenConfig, OV_Qwen3ASRGenConfig, OV_Qwen3TTSGenConfig from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig from src.server.models.optimum import PreTrainedTokenizerConfig, RerankerConfig from src.server.model_registry import ModelRecord, ModelRegistry @@ -53,7 +56,14 @@ class WorkerPacket: """ request_id: str id_model: str # model_name - gen_config: Union[OVGenAI_GenConfig, OVGenAI_WhisperGenConfig, OV_KokoroGenConfig, PreTrainedTokenizerConfig] + gen_config: Union[ + OVGenAI_GenConfig, + OVGenAI_WhisperGenConfig, + OV_Qwen3ASRGenConfig, + OV_KokoroGenConfig, + OV_Qwen3TTSGenConfig, + PreTrainedTokenizerConfig, + ] response: Optional[str] = None metrics: Optional[Dict[str, Any]] = None # Orchestration plumbing @@ -177,6 +187,28 @@ async def infer_whisper(packet: WorkerPacket, whisper_model: OVGenAI_Whisper) -> return packet + @staticmethod + async def infer_qwen3_asr(packet: WorkerPacket, asr_model: OVQwen3ASR) -> WorkerPacket: + """Transcribe audio for a single packet using the OVQwen3ASR pipeline.""" + metrics = None + final_text = "" + + try: + async for item in asr_model.transcribe(packet.gen_config): + if isinstance(item, dict): + metrics = item + else: + final_text = item + + packet.response = final_text + packet.metrics = metrics + except Exception as e: + logger.error("Qwen3 ASR inference failed!", exc_info=True) + packet.response = f"Error: {str(e)}" + packet.metrics = None + + return packet + @staticmethod async def infer_kokoro(packet: WorkerPacket, kokoro_model: OV_Kokoro) -> WorkerPacket: """Generate speech audio for a single packet using the OV_Kokoro pipeline. 
@@ -222,6 +254,54 @@ async def infer_kokoro(packet: WorkerPacket, kokoro_model: OV_Kokoro) -> WorkerP return packet + @staticmethod + async def infer_qwen3_tts(packet: WorkerPacket, tts_model: OVQwen3TTS) -> WorkerPacket: + """Generate speech audio for a single packet using the OVQwen3TTS engine.""" + try: + wav, sr = await tts_model.generate(packet.gen_config) + + if len(wav) > 0: + wav_buffer = io.BytesIO() + sf.write(wav_buffer, wav, samplerate=sr, format='WAV') + audio_base64 = base64.b64encode(wav_buffer.getvalue()).decode('utf-8') + packet.response = audio_base64 + else: + packet.response = "" + + packet.metrics = { + "sample_rate": sr, + "samples": len(wav), + "duration_sec": len(wav) / sr if sr > 0 else 0, + } + except Exception as e: + logger.error("Qwen3 TTS inference failed!", exc_info=True) + packet.response = f"Error: {str(e)}" + packet.metrics = None + + return packet + + @staticmethod + async def infer_qwen3_tts_stream(packet: WorkerPacket, tts_model: OVQwen3TTS) -> WorkerPacket: + """Stream Qwen3 TTS PCM chunks (int16 LE bytes) onto packet.stream_queue; ends with None.""" + if packet.stream_queue is None: + raise RuntimeError("infer_qwen3_tts_stream requires stream_queue") + loop = asyncio.get_running_loop() + + def _run_sync_generator() -> None: + try: + for tchunk in tts_model.generate_stream(packet.gen_config): + pcm = np.clip(tchunk.audio * 32768.0, -32768.0, 32767.0).astype(np.int16).tobytes() + asyncio.run_coroutine_threadsafe(packet.stream_queue.put(pcm), loop).result() + except Exception: + logger.error("Qwen3 TTS streaming inference failed!", exc_info=True) + finally: + asyncio.run_coroutine_threadsafe(packet.stream_queue.put(None), loop).result() + + await asyncio.to_thread(_run_sync_generator) + packet.response = "" + packet.metrics = None + return packet + @staticmethod async def infer_emb(packet: WorkerPacket, emb_instance: Optimum_EMB) -> WorkerPacket: """Generate embeddings for a single packet using the optimum pipeline""" @@ -362,6 
+442,31 @@ async def queue_worker_whisper(model_name: str, model_queue: asyncio.Queue, whis model_queue.task_done() + @staticmethod + async def queue_worker_qwen3_asr(model_name: str, model_queue: asyncio.Queue, asr_model: OVQwen3ASR, registry: ModelRegistry): + """Qwen3 ASR model inference worker that processes packets from queue.""" + logger.info(f"[Qwen3ASR Worker: {model_name}] Started, waiting for packets...") + while True: + packet = await model_queue.get() + if packet is None: + logger.info(f"[Qwen3ASR Worker: {model_name}] Shutdown signal received.") + break + + completed_packet = await InferWorker.infer_qwen3_asr(packet, asr_model) + + if completed_packet.response and completed_packet.response.startswith("Error:"): + logger.error(f"[Qwen3ASR Worker: {model_name}] Inference failed, triggering model unload...") + asyncio.create_task(registry.register_unload(model_name)) + break + + if completed_packet.metrics: + logger.info(f"[Qwen3ASR Worker: {model_name}] Metrics: {completed_packet.metrics}") + + if packet.result_future is not None and not packet.result_future.done(): + packet.result_future.set_result(completed_packet) + + model_queue.task_done() + @staticmethod async def queue_worker_kokoro(model_name: str, model_queue: asyncio.Queue, kokoro_model: OV_Kokoro, registry: ModelRegistry): """Kokoro model inference worker that processes packets from queue""" @@ -390,6 +495,33 @@ async def queue_worker_kokoro(model_name: str, model_queue: asyncio.Queue, kokor model_queue.task_done() + @staticmethod + async def queue_worker_qwen3_tts(model_name: str, model_queue: asyncio.Queue, tts_model: OVQwen3TTS, registry: ModelRegistry): + """Qwen3 TTS model inference worker that processes packets from queue.""" + logger.info(f"[Qwen3TTS Worker: {model_name}] Started, waiting for packets...") + while True: + packet = await model_queue.get() + if packet is None: + logger.info(f"[Qwen3TTS Worker: {model_name}] Shutdown signal received.") + break + + if 
getattr(packet.gen_config, "stream", False) and packet.stream_queue is not None: + completed_packet = await InferWorker.infer_qwen3_tts_stream(packet, tts_model) + else: + completed_packet = await InferWorker.infer_qwen3_tts(packet, tts_model) + if completed_packet.response and completed_packet.response.startswith("Error:"): + logger.error(f"[Qwen3TTS Worker: {model_name}] Inference failed, triggering model unload...") + asyncio.create_task(registry.register_unload(model_name)) + break + + if completed_packet.metrics: + logger.info(f"[Qwen3TTS Worker: {model_name}] Metrics: {completed_packet.metrics}") + + if packet.result_future is not None and not packet.result_future.done(): + packet.result_future.set_result(completed_packet) + + model_queue.task_done() + @staticmethod async def queue_worker_emb(model_name: str, model_queue: asyncio.Queue, emb_model: Optimum_EMB, registry: ModelRegistry): """EMB model inference worker that processes packets from queue""" @@ -459,9 +591,15 @@ def __init__(self, model_registry: ModelRegistry): self._model_queues_whisper: Dict[str, asyncio.Queue] = {} self._model_tasks_whisper: Dict[str, asyncio.Task] = {} + self._model_queues_qwen3_asr: Dict[str, asyncio.Queue] = {} + self._model_tasks_qwen3_asr: Dict[str, asyncio.Task] = {} + self._model_queues_kokoro: Dict[str, asyncio.Queue] = {} self._model_tasks_kokoro: Dict[str, asyncio.Task] = {} - + + self._model_queues_qwen3_tts: Dict[str, asyncio.Queue] = {} + self._model_tasks_qwen3_tts: Dict[str, asyncio.Task] = {} + self._model_queues_emb: Dict[str, asyncio.Queue] = {} self._model_tasks_emb: Dict[str, asyncio.Task] = {} @@ -514,6 +652,15 @@ async def _on_model_loaded(self, record: ModelRecord) -> None: task = asyncio.create_task(QueueWorker.queue_worker_whisper(record.model_name, q, instance, self._model_registry)) self._model_tasks_whisper[record.model_name] = task + elif mt == ModelType.QWEN3_ASR and isinstance(instance, OVQwen3ASR): + if record.model_name not in 
self._model_queues_qwen3_asr: + q = asyncio.Queue() + self._model_queues_qwen3_asr[record.model_name] = q + task = asyncio.create_task( + QueueWorker.queue_worker_qwen3_asr(record.model_name, q, instance, self._model_registry) + ) + self._model_tasks_qwen3_asr[record.model_name] = task + elif mt == ModelType.KOKORO and isinstance(instance, OV_Kokoro): if record.model_name not in self._model_queues_kokoro: q: asyncio.Queue = asyncio.Queue() @@ -521,6 +668,19 @@ async def _on_model_loaded(self, record: ModelRecord) -> None: task = asyncio.create_task(QueueWorker.queue_worker_kokoro(record.model_name, q, instance, self._model_registry)) self._model_tasks_kokoro[record.model_name] = task + elif mt in ( + ModelType.QWEN3_TTS_CUSTOM_VOICE, + ModelType.QWEN3_TTS_VOICE_DESIGN, + ModelType.QWEN3_TTS_VOICE_CLONE, + ) and isinstance(instance, OVQwen3TTS): + if record.model_name not in self._model_queues_qwen3_tts: + q: asyncio.Queue = asyncio.Queue() + self._model_queues_qwen3_tts[record.model_name] = q + task = asyncio.create_task( + QueueWorker.queue_worker_qwen3_tts(record.model_name, q, instance, self._model_registry) + ) + self._model_tasks_qwen3_tts[record.model_name] = task + elif mt == ModelType.EMB and isinstance(instance, Optimum_EMB): if record.model_name not in self._model_queues_emb: q: asyncio.Queue = asyncio.Queue() @@ -563,6 +723,14 @@ async def _on_model_unloaded(self, record: ModelRecord) -> None: if t is not None and not t.done(): t.cancel() + # Try qwen3_asr dicts + q = self._model_queues_qwen3_asr.pop(record.model_name, None) + t = self._model_tasks_qwen3_asr.pop(record.model_name, None) + if q is not None: + await q.put(None) + if t is not None and not t.done(): + t.cancel() + # Try kokoro dicts q = self._model_queues_kokoro.pop(record.model_name, None) t = self._model_tasks_kokoro.pop(record.model_name, None) @@ -571,6 +739,14 @@ async def _on_model_unloaded(self, record: ModelRecord) -> None: if t is not None and not t.done(): t.cancel() + # Try 
qwen3_tts dicts + q = self._model_queues_qwen3_tts.pop(record.model_name, None) + t = self._model_tasks_qwen3_tts.pop(record.model_name, None) + if q is not None: + await q.put(None) + if t is not None and not t.done(): + t.cancel() + # Try emb dicts q = self._model_queues_emb.pop(record.model_name, None) t = self._model_tasks_emb.pop(record.model_name, None) @@ -602,12 +778,24 @@ def _get_whisper_queue(self, model_name: str) -> asyncio.Queue: return q raise ValueError(f"Whisper model '{model_name}' is not loaded or no worker is available") + def _get_qwen3_asr_queue(self, model_name: str) -> asyncio.Queue: + q = self._model_queues_qwen3_asr.get(model_name) + if q is not None: + return q + raise ValueError(f"Qwen3 ASR model '{model_name}' is not loaded or no worker is available") + def _get_kokoro_queue(self, model_name: str) -> asyncio.Queue: q = self._model_queues_kokoro.get(model_name) if q is not None: return q raise ValueError(f"Kokoro model '{model_name}' is not loaded or no worker is available") + def _get_qwen3_tts_queue(self, model_name: str) -> asyncio.Queue: + q = self._model_queues_qwen3_tts.get(model_name) + if q is not None: + return q + raise ValueError(f"Qwen3 TTS model '{model_name}' is not loaded or no worker is available") + def _get_emb_queue(self, model_name: str) -> asyncio.Queue: q = self._model_queues_emb.get(model_name) if q is not None: @@ -708,6 +896,61 @@ async def transcribe_whisper(self, model_name: str, gen_config: OVGenAI_WhisperG completed = await result_future return {"text": completed.response or "", "metrics": completed.metrics or {}} + async def transcribe_qwen3_asr(self, model_name: str, gen_config: OV_Qwen3ASRGenConfig) -> Dict[str, Any]: + """Transcribe audio using Qwen3 ASR model.""" + request_id = uuid.uuid4().hex + result_future: asyncio.Future = asyncio.get_running_loop().create_future() + packet = WorkerPacket( + request_id=request_id, + id_model=model_name, + gen_config=gen_config, + result_future=result_future, + ) + q 
= self._get_qwen3_asr_queue(model_name) + await q.put(packet) + completed = await result_future + return {"text": completed.response or "", "metrics": completed.metrics or {}} + + async def generate_speech_qwen3_tts(self, model_name: str, gen_config: OV_Qwen3TTSGenConfig) -> Dict[str, Any]: + """Generate speech using a loaded Qwen3 TTS model. + + Returns a dict with base64-encoded WAV audio and metrics. + """ + request_id = uuid.uuid4().hex + result_future: asyncio.Future = asyncio.get_running_loop().create_future() + packet = WorkerPacket( + request_id=request_id, + id_model=model_name, + gen_config=gen_config, + result_future=result_future, + ) + q = self._get_qwen3_tts_queue(model_name) + await q.put(packet) + completed = await result_future + return {"audio_base64": completed.response or "", "metrics": completed.metrics or {}} + + async def stream_generate_speech_qwen3_tts( + self, model_name: str, gen_config: OV_Qwen3TTSGenConfig, + ) -> AsyncIterator[bytes]: + """Stream raw int16 LE mono PCM chunks at 24 kHz (RFC 4856 audio/L16 on the HTTP layer).""" + request_id = uuid.uuid4().hex + stream_queue: asyncio.Queue = asyncio.Queue() + result_future: asyncio.Future = asyncio.get_running_loop().create_future() + packet = WorkerPacket( + request_id=request_id, + id_model=model_name, + gen_config=gen_config, + stream_queue=stream_queue, + result_future=result_future, + ) + q = self._get_qwen3_tts_queue(model_name) + await q.put(packet) + while True: + item = await stream_queue.get() + if item is None: + break + yield item + async def generate_speech_kokoro(self, model_name: str, gen_config: OV_KokoroGenConfig) -> Dict[str, Any]: """Generate speech using a loaded Kokoro model asynchronously via worker queue. 
diff --git a/src/tests/old_tests_from_dev/completions_test.py b/src/tests/old_tests_from_dev/completions_test.py deleted file mode 100644 index 244873f..0000000 --- a/src/tests/old_tests_from_dev/completions_test.py +++ /dev/null @@ -1,129 +0,0 @@ -import os -from openai import OpenAI - -model_name = "Gemma-3-12B" - -def completions_non_streaming_example(): - """Run a simple non-streaming completions request against localhost:8000.""" - try: - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=os.getenv("OPENARC_API_KEY"), - ) - except Exception as e: - print(f"Failed to initialize OpenAI client: {e}") - return - - try: - - resp = client.completions.create( - model=model_name, - prompt="The future of artificial intelligence is", - max_tokens=32, - temperature=0.7, - ) - - print("Non-streaming completions response:") - print(resp) - if resp and resp.choices: - print("Generated text:", resp.choices[0].text) - print(f"Tokens used - Prompt: {resp.usage.prompt_tokens}, Completion: {resp.usage.completion_tokens}") - except Exception as e: - print(f"completions (non-streaming) error: {e}") - - -def completions_streaming_example(): - """Run a streaming completions request against localhost:8000.""" - try: - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=os.getenv("OPENARC_API_KEY"), - ) - except Exception as e: - print(f"Failed to initialize OpenAI client: {e}") - return - - try: - - stream = client.completions.create( - model=model_name, - prompt="Explain quantum computing in simple terms:", - max_tokens=32, - temperature=0.8, - stream=True, - ) - - print("Streaming completions response:") - collected_text = "" - try: - for chunk in stream: - if not chunk or not chunk.choices: - continue - choice = chunk.choices[0] - text = choice.text - if text: - collected_text += text - print(text, end="", flush=True) - finally: - print() - print(f"Total collected text: {len(collected_text)} characters") - except Exception as e: - print(f"completions 
(streaming) error: {e}") - - -def completions_with_parameters(): - """Test completions with various generation parameters.""" - try: - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=os.getenv("OPENARC_API_KEY"), - ) - except Exception as e: - print(f"Failed to initialize OpenAI client: {e}") - return - - try: - - resp = client.completions.create( - model=model_name, - prompt="Translate this to French: Hello, how are you?", - max_tokens=32, - temperature=0.5, - top_p=0.9, - top_k=50, - ) - - print("Completions with parameters response:") - print(f"Model: {resp.model}") - print(f"Generated text: {resp.choices[0].text}") - print(f"Finish reason: {resp.choices[0].finish_reason}") - print(f"Usage: {resp.usage}") - except Exception as e: - print(f"completions (with parameters) error: {e}") - - -if __name__ == "__main__": - print("=" * 60) - print("Testing OpenAI-compatible /v1/completions endpoint") - print("Model: {model_name}") - print("=" * 60) - print() - - print("Test 1: Non-streaming completion") - print("-" * 60) - completions_non_streaming_example() - - print() - print("Test 2: Streaming completion") - print("-" * 60) - completions_streaming_example() - - print() - print("Test 3: Completions with parameters") - print("-" * 60) - completions_with_parameters() - - print() - print("=" * 60) - print("All tests completed!") - print("=" * 60) diff --git a/src/tests/old_tests_from_dev/debug_tool_output.py b/src/tests/old_tests_from_dev/debug_tool_output.py deleted file mode 100644 index 9ecc88e..0000000 --- a/src/tests/old_tests_from_dev/debug_tool_output.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 -""" -Debug script to see what the model actually outputs when given tools. -Run this to diagnose tool calling issues. 
-""" -import os -import json -from openai import OpenAI - -def debug_tool_output(): - """Check what the model outputs with and without tools.""" - - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=os.getenv("OPENARC_API_KEY"), - ) - - model_name = "Dolphin-X1" - - tools = [ - { - "type": "function", - "function": { - "name": "write_word", - "description": "Writes a specific word to output", - "parameters": { - "type": "object", - "properties": { - "word": { - "type": "string", - "description": "The word to write" - } - }, - "required": ["word"] - } - } - } - ] - - messages = [ - {"role": "system", "content": "You are a helpful assistant with access to tools. Always format tools with xml tags like JSON"}, - {"role": "user", "content": "Please use the write_word tool to write the word 'pirate'."} - ] - - print("="*70) - print("DEBUG: Raw model output with tools") - print("="*70) - - try: - response = client.chat.completions.create( - model=model_name, - messages=messages, - tools=tools, - temperature=0.1, # Lower temperature for more consistent output - ) - - msg = response.choices[0].message - - print(f"\nFinish reason: {response.choices[0].finish_reason}") - print(f"\nMessage role: {msg.role}") - print(f"\nMessage content:") - print(msg.content) - print(f"\nMessage tool_calls:") - print(msg.tool_calls) - - if msg.tool_calls: - print("\n✓ SUCCESS: Tool calls detected by server!") - for i, tc in enumerate(msg.tool_calls): - print(f"\nTool call {i+1}:") - print(f" ID: {tc.id}") - print(f" Function: {tc.function.name}") - print(f" Arguments: {tc.function.arguments}") - else: - print("\n✗ ISSUE: No tool calls detected") - print("\nThe model output above should contain tool call markers.") - print("Check your model's chat template and training.") - print("\nExpected formats:") - print(" 1. {\"name\": \"write_word\", \"arguments\": {\"word\": \"pirate\"}}") - print(" 2. 
<|python_tag|>{\"name\": \"write_word\", \"arguments\": {\"word\": \"pirate\"}}<|eom_id|>") - - except Exception as e: - print(f"\n✗ ERROR: {e}") - import traceback - traceback.print_exc() - -if __name__ == "__main__": - debug_tool_output() - diff --git a/src/tests/old_tests_from_dev/openai_llm_tool.py b/src/tests/old_tests_from_dev/openai_llm_tool.py deleted file mode 100644 index 25e143f..0000000 --- a/src/tests/old_tests_from_dev/openai_llm_tool.py +++ /dev/null @@ -1,153 +0,0 @@ -import os -import json -from openai import OpenAI - - -def test_tool_calling(): - """Test tool calling with Qwen3-4B-2507 model - runs 50 times.""" - - # Initialize client - try: - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=os.getenv("OPENARC_API_KEY"), - ) - except Exception as e: - print(f"Failed to initialize OpenAI client: {e}") - return - - model_name = "Dolphin-X1" - - # Track results - total_tests = 50 - successes = 0 - failures = 0 - - # Define a simple tool for writing a word - tools = [ - { - "type": "function", - "function": { - "name": "write_word", - "description": "Writes a specific word to output", - "parameters": { - "type": "object", - "properties": { - "word": { - "type": "string", - "description": "The word to write" - } - }, - "required": ["word"] - } - } - } - ] - - print(f"Running {total_tests} tool calling tests...\n") - - for test_num in range(1, total_tests + 1): - print(f"{'='*60}") - print(f"Test {test_num}/{total_tests}") - print(f"{'='*60}") - - # Initial conversation - messages = [ - {"role": "system", "content": "You are a helpful assistant with access to tools. 
always format tools with xml tags"}, - {"role": "user", "content": "Please use the write_word tool to write the word 'pirate'."} - ] - - try: - # First request - model should call the tool - print("Step 1: Sending request with tools...") - response = client.chat.completions.create( - model=model_name, - messages=messages, - tools=tools, - #tool_choice="auto" - ) - - assistant_message = response.choices[0].message - - # Check if model made a tool call - if assistant_message.tool_calls: - print("✓ Tool call detected!") - tool_call = assistant_message.tool_calls[0] - print(f" Function: {tool_call.function.name}") - print(f" Arguments: {tool_call.function.arguments}") - - # Parse the arguments - try: - args = json.loads(tool_call.function.arguments) - word_written = args.get("word", "") - - # Add the assistant's tool call to conversation - messages.append({ - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": tool_call.id, - "type": "function", - "function": { - "name": tool_call.function.name, - "arguments": tool_call.function.arguments - } - } - ] - }) - - # Add the tool result - messages.append({ - "role": "tool", - "tool_call_id": tool_call.id, - "content": f"Successfully wrote: {word_written}" - }) - - # Second request - get final response - print("Step 2: Sending tool result back to model...") - final_response = client.chat.completions.create( - model=model_name, - messages=messages - ) - - final_text = final_response.choices[0].message.content - print(f"Model final response: {final_text}") - - # Check if the response contains the expected phrase - if word_written.lower() == "pirate" and "pirate" in final_text.lower(): - print("✓ Test passed! LLM wrote pirate. Tool calling implementation confirmed\n") - successes += 1 - else: - print(f"✗ Test failed. 
Expected 'pirate' but got: {word_written}\n") - failures += 1 - - except json.JSONDecodeError as e: - print(f"✗ Failed to parse tool arguments: {e}\n") - failures += 1 - else: - print("✗ No tool calls detected in response") - if assistant_message.content: - print(f"Model response: {assistant_message.content}") - print() - failures += 1 - - except Exception as e: - print(f"✗ Error during tool calling test: {e}\n") - failures += 1 - - # Print final summary - print(f"\n{'='*60}") - print("FINAL RESULTS") - print(f"{'='*60}") - print(f"Total tests: {total_tests}") - print(f"Successes: {successes} ({successes/total_tests*100:.1f}%)") - print(f"Failures: {failures} ({failures/total_tests*100:.1f}%)") - print(f"{'='*60}") - - return successes == total_tests - - -if __name__ == "__main__": - test_tool_calling() - diff --git a/src/tests/old_tests_from_dev/openarc_bench_test.py b/src/tests/old_tests_from_dev/openarc_bench_test.py deleted file mode 100644 index 73305f3..0000000 --- a/src/tests/old_tests_from_dev/openarc_bench_test.py +++ /dev/null @@ -1,43 +0,0 @@ -import random -import requests -import os - -from transformers import AutoTokenizer - -model_path = r"/mnt/Ironwolf-4TB/Models/OpenVINO/Llama/dphn_Dolphin-X1-8B-int4_asym-awq-ov" -num_tokens = 512 -def get_input_tokens(model_path, num_tokens): - """ - Generate random input tokens for benchmarking. - Follows llama.cpp approach. 
- https://github.com/ggml-org/llama.cpp/blob/683fa6ba/tools/llama-bench/llama-bench.cpp#L1922 - """ - tokenizer = AutoTokenizer.from_pretrained(model_path) - vocab_size = len(tokenizer) - - special_token_ids = set(tokenizer.all_special_ids) - valid_token_ids = [i for i in range(vocab_size) if i not in special_token_ids] - - # Generate random tokens (not repeated) - input_ids = [random.choice(valid_token_ids) for _ in range(num_tokens)] - - return input_ids - - - - - -response = requests.post( - "http://localhost:8000/openarc/bench", - headers={"Authorization": f"Bearer {os.getenv('OPENARC_API_KEY')}"}, - json={ - "model": "Dolphin-X1", - "input_ids": get_input_tokens(model_path, num_tokens), # Pre-encoded token IDs - "max_tokens": 128, - "temperature": 0.7 - } -) - -metrics = response.json() -print(metrics) -# Output: {"metrics": {"input_token": ..., "new_token": ..., "ttft_ms": ..., ...}} \ No newline at end of file diff --git a/src/tests/old_tests_from_dev/test_concurrency_text.py b/src/tests/old_tests_from_dev/test_concurrency_text.py deleted file mode 100644 index 463c93a..0000000 --- a/src/tests/old_tests_from_dev/test_concurrency_text.py +++ /dev/null @@ -1,272 +0,0 @@ -import json -import os -import subprocess -import sys -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Dict, Any, List, Optional -from dataclasses import dataclass - -from urllib.request import Request, urlopen - - -BASE_URL = "http://127.0.0.1:8000" -MAIN_PATH = "/home/echo/Projects/OpenArc/src2/api/main.py" -REQUEST_TIMEOUT_S = int(os.getenv("OPENARC_TEST_REQUEST_TIMEOUT_S", "120")) -API_KEY = os.getenv("OPENARC_API_KEY") - - -@dataclass -class RequestTiming: - model_name: str - request_id: int - start_time: float - end_time: float - duration: float - success: bool - error_msg: Optional[str] = None - response_length: int = 0 - - -def http_get(path: str) -> Dict[str, Any]: - req = Request(f"{BASE_URL}{path}", method="GET") - 
req.add_header("Content-Type", "application/json") - if API_KEY: - req.add_header("Authorization", f"Bearer {API_KEY}") - with urlopen(req, timeout=REQUEST_TIMEOUT_S) as resp: - return json.loads(resp.read().decode("utf-8")) - - -def http_post(path: str, payload: Dict[str, Any]) -> Dict[str, Any]: - data = json.dumps(payload).encode("utf-8") - req = Request(f"{BASE_URL}{path}", data=data, method="POST") - req.add_header("Content-Type", "application/json") - if API_KEY: - req.add_header("Authorization", f"Bearer {API_KEY}") - with urlopen(req, timeout=REQUEST_TIMEOUT_S) as resp: - return json.loads(resp.read().decode("utf-8")) - - -def wait_for_server(timeout_s: int = 21600) -> None: - start = time.time() - last_err = None - while time.time() - start < timeout_s: - try: - status = http_get("/openarc/status") - if isinstance(status, dict): - return - except Exception as e: # noqa: BLE001 - last_err = e - time.sleep(0.5) - raise RuntimeError(f"Server did not become ready in {timeout_s}s: {last_err}") - - -def load_model(model_path: str, model_name: str, device: str) -> str: - payload = { - "model_path": model_path, - "model_name": model_name, - "model_type": "text_to_text", - "engine": "ovgenai", - "device": device, - "runtime_config": {} - } - resp = http_post("/openarc/load", payload) - return resp.get("model_id", "") - - -def wait_until_loaded(model_name: str, timeout_s: int = 21600) -> None: - start = time.time() - last_err = None - while time.time() - start < timeout_s: - try: - status = http_get("/openarc/status") - models = status.get("models", []) - for m in models: - if m.get("model_name") == model_name and m.get("status") == "loaded": - return - except Exception as e: # noqa: BLE001 - last_err = e - time.sleep(1.0) - raise RuntimeError(f"Model '{model_name}' did not reach loaded state within {timeout_s}s: {last_err}") - - -def generate_once(model_name: str, prompt: str, request_id: int, max_new_tokens: int = 64) -> RequestTiming: - """Generate text and 
return timing information.""" - start_time = time.time() - timing = RequestTiming( - model_name=model_name, - request_id=request_id, - start_time=start_time, - end_time=0.0, - duration=0.0, - success=False - ) - - try: - payload = { - "model_name": model_name, - "gen_config": { - "messages": [{"role": "user", "content": prompt}], - "max_new_tokens": max_new_tokens, - "stream": True - } - } - response = http_post("/openarc/generate", payload) - - timing.end_time = time.time() - timing.duration = timing.end_time - timing.start_time - timing.success = True - timing.response_length = len(response.get("text", "")) - - return timing - - except Exception as e: - timing.end_time = time.time() - timing.duration = timing.end_time - timing.start_time - timing.success = False - timing.error_msg = str(e) - return timing - - -def print_results(timings: List[RequestTiming], total_elapsed: float) -> None: - """Print basic test results.""" - successful = [t for t in timings if t.success] - failed = [t for t in timings if not t.success] - - print(f"\nTest Results:") - print(f" Total requests: {len(timings)}") - print(f" Successful: {len(successful)}") - print(f" Failed: {len(failed)}") - print(f" Total time: {total_elapsed:.2f}s") - - if successful: - avg_duration = sum(t.duration for t in successful) / len(successful) - total_request_time = sum(t.duration for t in successful) - print(f" Average request duration: {avg_duration:.2f}s") - print(f" Total request time (sequential): {total_request_time:.2f}s") - print(f" Speedup: {total_request_time / total_elapsed:.2f}x") - - # Show overlapping requests - overlaps = 0 - for i, t1 in enumerate(timings): - for t2 in timings[i+1:]: - if (t1.model_name != t2.model_name and - max(t1.start_time, t2.start_time) < min(t1.end_time, t2.end_time)): - overlaps += 1 - - print(f" Overlapping request pairs: {overlaps}") - - -def start_server() -> subprocess.Popen: - env = os.environ.copy() - return subprocess.Popen( - [sys.executable, "-u", 
MAIN_PATH], - env=env, - stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT, - ) - - -def stop_server(proc: subprocess.Popen) -> None: - if proc.poll() is None: - try: - proc.terminate() - try: - proc.wait(timeout=10) - except subprocess.TimeoutExpired: - proc.kill() - except Exception: # noqa: BLE001 - pass - - -def unload_model(model_name: str) -> Dict[str, Any]: - """Unload a model by name.""" - payload = {"model_name": model_name} - return http_post("/openarc/unload", payload) - - -def main() -> None: - # Update these paths for your environment if needed - model_path = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen/Qwen3-1.7B-int8_asym-ov" - model_a = {"name": "Qwen3-1.7B-int8_asym-ov-GPU1", "device": "GPU.1"} - model_b = {"name": "Qwen3-1.7B-int8_asym-ov-GPU2", "device": "GPU.2"} - - server = start_server() - try: - print("Waiting for server to become ready...") - status_timeout = int(os.getenv("OPENARC_TEST_STATUS_TIMEOUT_S", "21600")) - wait_for_server(status_timeout) - - print(f"Loading model {model_a['name']} on {model_a['device']}...") - load_model(model_path, model_a["name"], model_a["device"]) - print(f"Loading model {model_b['name']} on {model_b['device']}...") - load_model(model_path, model_b["name"], model_b["device"]) - - print("Waiting for models to be loaded...") - load_timeout = int(os.getenv("OPENARC_TEST_LOAD_TIMEOUT_S", "21600")) - wait_until_loaded(model_a["name"], load_timeout) - wait_until_loaded(model_b["name"], load_timeout) - print("Models loaded.") - - # Simple test prompts - num_requests_per_model = int(os.getenv("OPENARC_TEST_REQUESTS_PER_MODEL", "20")) - prompt = "Write a Python function to calculate the factorial of a number with the following requirements: the function should be named factorial, the function should take an integer as an argument, the function should return the factorial of the integer, the function should be a recursive function." 
- max_tokens = 64 - - print(f"\nStarting concurrency test with {num_requests_per_model} requests per model...") - - test_start_time = time.time() - with ThreadPoolExecutor(max_workers=20) as pool: - futures = [] - - # Submit requests for both models - for i in range(num_requests_per_model): - futures.append(pool.submit(generate_once, model_a["name"], prompt, i, max_tokens)) - futures.append(pool.submit(generate_once, model_b["name"], prompt, i + num_requests_per_model, max_tokens)) - - # Collect results - timings: List[RequestTiming] = [] - for i, fut in enumerate(as_completed(futures)): - try: - timing_result = fut.result() - timings.append(timing_result) - - if timing_result.success: - print(f"[{i+1:2d}/{len(futures)}] ✅ {timing_result.model_name}[{timing_result.request_id}]: " - f"{timing_result.duration:.2f}s") - else: - print(f"[{i+1:2d}/{len(futures)}] ❌ {timing_result.model_name}[{timing_result.request_id}]: " - f"ERROR: {timing_result.error_msg}") - - except Exception as e: # noqa: BLE001 - print(f"[{i+1:2d}/{len(futures)}] ❌ Future error: {e}") - - test_end_time = time.time() - total_elapsed = test_end_time - test_start_time - - print_results(timings, total_elapsed) - - # Unload models after test completion - print(f"\nUnloading models...") - try: - print(f"Unloading model {model_a['name']}...") - unload_response_a = unload_model(model_a["name"]) - print(f" Response: {unload_response_a}") - except Exception as e: - print(f" Failed to unload {model_a['name']}: {e}") - - try: - print(f"Unloading model {model_b['name']}...") - unload_response_b = unload_model(model_b["name"]) - print(f" Response: {unload_response_b}") - except Exception as e: - print(f" Failed to unload {model_b['name']}: {e}") - - finally: - stop_server(server) - - -if __name__ == "__main__": - main() - - diff --git a/src/tests/old_tests_from_dev/test_concurrency_vision.py b/src/tests/old_tests_from_dev/test_concurrency_vision.py deleted file mode 100644 index 0708269..0000000 --- 
a/src/tests/old_tests_from_dev/test_concurrency_vision.py +++ /dev/null @@ -1,284 +0,0 @@ -import json -import os -import subprocess -import sys -import time -import base64 -from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Dict, Any, List, Optional -from dataclasses import dataclass - -from urllib.request import Request, urlopen - - -BASE_URL = "http://127.0.0.1:8000" -MAIN_PATH = "/home/echo/Projects/OpenArc/src2/api/main.py" -REQUEST_TIMEOUT_S = int(os.getenv("OPENARC_TEST_REQUEST_TIMEOUT_S", "120")) -API_KEY = os.getenv("OPENARC_API_KEY") - - -@dataclass -class RequestTiming: - model_name: str - request_id: int - start_time: float - end_time: float - duration: float - success: bool - error_msg: Optional[str] = None - response_length: int = 0 - - -def http_get(path: str) -> Dict[str, Any]: - req = Request(f"{BASE_URL}{path}", method="GET") - req.add_header("Content-Type", "application/json") - if API_KEY: - req.add_header("Authorization", f"Bearer {API_KEY}") - with urlopen(req, timeout=REQUEST_TIMEOUT_S) as resp: - return json.loads(resp.read().decode("utf-8")) - - -def http_post(path: str, payload: Dict[str, Any]) -> Dict[str, Any]: - data = json.dumps(payload).encode("utf-8") - req = Request(f"{BASE_URL}{path}", data=data, method="POST") - req.add_header("Content-Type", "application/json") - if API_KEY: - req.add_header("Authorization", f"Bearer {API_KEY}") - with urlopen(req, timeout=REQUEST_TIMEOUT_S) as resp: - return json.loads(resp.read().decode("utf-8")) - - -def wait_for_server(timeout_s: int = 21600) -> None: - start = time.time() - last_err = None - while time.time() - start < timeout_s: - try: - status = http_get("/openarc/status") - if isinstance(status, dict): - return - except Exception as e: # noqa: BLE001 - last_err = e - time.sleep(0.5) - raise RuntimeError(f"Server did not become ready in {timeout_s}s: {last_err}") - - -def load_model(model_path: str, model_name: str, device: str) -> str: - payload = { - 
"model_path": model_path, - "model_name": model_name, - "model_type": "image_to_text", # Changed for vision models - "engine": "ovgenai", - "device": device, - "runtime_config": {} - } - resp = http_post("/openarc/load", payload) - return resp.get("model_id", "") - - -def wait_until_loaded(model_name: str, timeout_s: int = 21600) -> None: - start = time.time() - last_err = None - while time.time() - start < timeout_s: - try: - status = http_get("/openarc/status") - models = status.get("models", []) - for m in models: - if m.get("model_name") == model_name and m.get("status") == "loaded": - return - except Exception as e: # noqa: BLE001 - last_err = e - time.sleep(1.0) - raise RuntimeError(f"Model '{model_name}' did not reach loaded state within {timeout_s}s: {last_err}") - - -def load_image_as_base64(image_path: str) -> str: - """Load image file and encode as base64 data URL.""" - with open(image_path, "rb") as img_file: - img_data = img_file.read() - img_base64 = base64.b64encode(img_data).decode('utf-8') - return f"data:image/png;base64,{img_base64}" - - -def generate_once(model_name: str, prompt: str, image_data_url: str, request_id: int, max_new_tokens: int = 64) -> RequestTiming: - """Generate text from image and return timing information.""" - start_time = time.time() - timing = RequestTiming( - model_name=model_name, - request_id=request_id, - start_time=start_time, - end_time=0.0, - duration=0.0, - success=False - ) - - try: - payload = { - "model_name": model_name, - "gen_config": { - "messages": [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_data_url - } - }, - { - "type": "text", - "text": prompt - } - ] - } - ], - "max_new_tokens": max_new_tokens, - "stream": True - } - } - response = http_post("/openarc/generate", payload) - - timing.end_time = time.time() - timing.duration = timing.end_time - timing.start_time - timing.success = True - timing.response_length = len(response.get("text", "")) - - return 
timing - - except Exception as e: - timing.end_time = time.time() - timing.duration = timing.end_time - timing.start_time - timing.success = False - timing.error_msg = str(e) - return timing - - -def print_results(timings: List[RequestTiming], total_elapsed: float) -> None: - """Print basic test results.""" - successful = [t for t in timings if t.success] - failed = [t for t in timings if not t.success] - - print(f"\nTest Results:") - print(f" Total requests: {len(timings)}") - print(f" Successful: {len(successful)}") - print(f" Failed: {len(failed)}") - print(f" Total time: {total_elapsed:.2f}s") - - if successful: - avg_duration = sum(t.duration for t in successful) / len(successful) - total_request_time = sum(t.duration for t in successful) - print(f" Average request duration: {avg_duration:.2f}s") - print(f" Total request time (sequential): {total_request_time:.2f}s") - print(f" Speedup: {total_request_time / total_elapsed:.2f}x") - - # Show overlapping requests - overlaps = 0 - for i, t1 in enumerate(timings): - for t2 in timings[i+1:]: - if (t1.model_name != t2.model_name and - max(t1.start_time, t2.start_time) < min(t1.end_time, t2.end_time)): - overlaps += 1 - - print(f" Overlapping request pairs: {overlaps}") - - -def start_server() -> subprocess.Popen: - env = os.environ.copy() - return subprocess.Popen( - [sys.executable, "-u", MAIN_PATH], - env=env, - stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT, - ) - - -def stop_server(proc: subprocess.Popen) -> None: - if proc.poll() is None: - try: - proc.terminate() - try: - proc.wait(timeout=10) - except subprocess.TimeoutExpired: - proc.kill() - except Exception: # noqa: BLE001 - pass - - -def main() -> None: - # Vision model configuration - model_path = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen/Qwen2.5-VL-7B-Instruct-int4_sym-ov" - image_path = "/home/echo/Projects/OpenArc/src2/tests/dedication.png" - model_a = {"name": "Qwen2.5-VL-7B-Instruct-int4_sym-ov-GPU1", "device": "GPU.1"} - model_b = {"name": 
"Qwen2.5-VL-7B-Instruct-int4_sym-ov-GPU2", "device": "GPU.2"} - - # Check if image exists - if not os.path.exists(image_path): - print(f"Error: Image path does not exist: {image_path}") - return - - # Load image as base64 - print(f"Loading image: {image_path}") - image_data_url = load_image_as_base64(image_path) - print("Image loaded and encoded as base64") - - server = start_server() - try: - print("Waiting for server to become ready...") - status_timeout = int(os.getenv("OPENARC_TEST_STATUS_TIMEOUT_S", "21600")) - wait_for_server(status_timeout) - - print(f"Loading model {model_a['name']} on {model_a['device']}...") - load_model(model_path, model_a["name"], model_a["device"]) - print(f"Loading model {model_b['name']} on {model_b['device']}...") - load_model(model_path, model_b["name"], model_b["device"]) - - print("Waiting for models to be loaded...") - load_timeout = int(os.getenv("OPENARC_TEST_LOAD_TIMEOUT_S", "21600")) - wait_until_loaded(model_a["name"], load_timeout) - wait_until_loaded(model_b["name"], load_timeout) - print("Models loaded.") - - # Vision test prompts - num_requests_per_model = int(os.getenv("OPENARC_TEST_REQUESTS_PER_MODEL", "10")) - prompt = "Describe what you see in this image in detail." 
- max_tokens = 64 - - print(f"\nStarting vision concurrency test with {num_requests_per_model} requests per model...") - - test_start_time = time.time() - with ThreadPoolExecutor(max_workers=20) as pool: - futures = [] - - # Submit requests for both models - for i in range(num_requests_per_model): - futures.append(pool.submit(generate_once, model_a["name"], prompt, image_data_url, i, max_tokens)) - futures.append(pool.submit(generate_once, model_b["name"], prompt, image_data_url, i + num_requests_per_model, max_tokens)) - - # Collect results - timings: List[RequestTiming] = [] - for i, fut in enumerate(as_completed(futures)): - try: - timing_result = fut.result() - timings.append(timing_result) - - if timing_result.success: - print(f"[{i+1:2d}/{len(futures)}] ✅ {timing_result.model_name}[{timing_result.request_id}]: " - f"{timing_result.duration:.2f}s") - else: - print(f"[{i+1:2d}/{len(futures)}] ❌ {timing_result.model_name}[{timing_result.request_id}]: " - f"ERROR: {timing_result.error_msg}") - - except Exception as e: # noqa: BLE001 - print(f"[{i+1:2d}/{len(futures)}] ❌ Future error: {e}") - - test_end_time = time.time() - total_elapsed = test_end_time - test_start_time - - print_results(timings, total_elapsed) - - finally: - stop_server(server) - - -if __name__ == "__main__": - main() diff --git a/src/tests/old_tests_from_dev/test_generate.sh b/src/tests/old_tests_from_dev/test_generate.sh deleted file mode 100755 index a10ccaa..0000000 --- a/src/tests/old_tests_from_dev/test_generate.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# Usage: -# ./test_generate.sh [MODEL_NAME] [STREAM] -# MODEL_NAME: name used when loading (default: Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov) -# STREAM: true|false (default: false) - -MODEL_NAME=${1:-Impish_Nemo_12B-int4_asym-awq-ov} -STREAM=${2:-true} - -read -r -d '' DATA < list[dict]: - """Execute a DuckDuckGo search using ddgs library.""" - try: - - print(f" [SEARCH] Executing: query='{query}'") - - with ddgs.DDGS() as 
search_client: - results = list(search_client.text(query, max_results=5)) - - # Format results - formatted_results = [] - for i, result in enumerate(results, 1): - formatted_results.append({ - "position": i, - "title": result.get("title", ""), - "url": result.get("href", ""), - "snippet": result.get("body", "") - }) - - return formatted_results - - except Exception as e: - return [{"error": str(e)}] - - -def test_streaming_tool_call(): - """Test streaming tool calls with DuckDuckGo search.""" - - print("="*70) - print("STREAMING TOOL CALL TEST - DuckDuckGo Search") - print("="*70) - - # Initialize client - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=os.getenv("OPENARC_API_KEY"), - ) - - model_name = "Dolphin-X1" - - # Define DuckDuckGo search tool - tools = [ - { - "type": "function", - "function": { - "name": "search", - "description": """ - Search the web. Returns relevant web pages. Use this tool to get information from the internet. - """, - "parameters": { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "The search query string" - } - }, - "required": ["query"] - } - } - } - ] - - # Initial conversation - messages = [ - { - "role": "system", - "content": "You are a helpful research assistant with access to web search. If you decide to use a tool, call it first before anything else." - }, - { - "role": "user", - "content": "What's the most recent OpenVINO version?" 
- } - ] - - print("\nStep 1: Streaming request with tool...") - print(f"Model: {model_name}") - print("Stream: True") - print(f"Tools: {tools[0]['function']['name']}") - - try: - # Stream the response - accumulated_content = "" - accumulated_tool_calls = {} - finish_reason = None - - stream = client.chat.completions.create( - model=model_name, - messages=messages, - tools=tools, - stream=True, - temperature=0.7 - ) - - print("\nStreaming chunks:") - chunk_count = 0 - - for chunk in stream: - chunk_count += 1 - choice = chunk.choices[0] if chunk.choices else None - - if not choice: - continue - - delta = choice.delta - finish_reason = choice.finish_reason or finish_reason - - # Accumulate content - if delta.content: - accumulated_content += delta.content - print(f" Chunk {chunk_count}: content='{delta.content[:50]}...'") - - # Accumulate tool calls - if delta.tool_calls: - for tc_delta in delta.tool_calls: - idx = tc_delta.index - - if idx not in accumulated_tool_calls: - accumulated_tool_calls[idx] = { - "id": tc_delta.id, - "type": tc_delta.type or "function", - "function": { - "name": "", - "arguments": "" - } - } - - if tc_delta.id: - accumulated_tool_calls[idx]["id"] = tc_delta.id - - if tc_delta.function: - if tc_delta.function.name: - accumulated_tool_calls[idx]["function"]["name"] = tc_delta.function.name - print(f" Chunk {chunk_count}: tool_call[{idx}].name='{tc_delta.function.name}'") - - if tc_delta.function.arguments: - accumulated_tool_calls[idx]["function"]["arguments"] += tc_delta.function.arguments - print(f" Chunk {chunk_count}: tool_call[{idx}].arguments+='{tc_delta.function.arguments[:30]}...'") - - print(f"\nTotal chunks received: {chunk_count}") - print(f"Finish reason: {finish_reason}") - - # Check what we accumulated - if accumulated_tool_calls: - print("\n✓ Tool calls detected in stream!") - print(f"Number of tool calls: {len(accumulated_tool_calls)}") - - for idx, tc in accumulated_tool_calls.items(): - print(f"\n--- Tool Call {idx} ---") - 
print(f"ID: {tc['id']}") - print(f"Function: {tc['function']['name']}") - print(f"Arguments: {tc['function']['arguments']}") - - # Parse and execute the tool - try: - args = json.loads(tc['function']['arguments']) - - if tc['function']['name'] == 'search': - print("\nStep 2: Executing search...") - - # Execute search - results = ddg_search( - query=args['query'] - ) - - print(f" [SEARCH] Got {len(results)} results") - - # Add tool call to conversation - messages.append({ - "role": "assistant", - "content": None, - "tool_calls": [{ - "id": tc['id'], - "type": "function", - "function": { - "name": tc['function']['name'], - "arguments": tc['function']['arguments'] - } - }] - }) - - # Add tool results - messages.append({ - "role": "tool", - "tool_call_id": tc['id'], - "content": json.dumps(results, indent=2) - }) - - print("\nStep 3: Getting final response from model...") - final_response = client.chat.completions.create( - model=model_name, - messages=messages, - stream=False - ) - - final_text = final_response.choices[0].message.content - print("\n--- Model's Final Response ---") - print(final_text) - print("--- End Response ---") - - # Validate results - print("\n" + "="*70) - if results and not results[0].get("error"): - print("✓ TEST PASSED - Streaming tool call workflow successful!") - print(" - Streamed tool call detected") - print(" - Search executed successfully") - print(f" - Got {len(results)} search results") - print(" - Model provided summary") - return True - else: - print("✗ TEST FAILED - Search execution failed") - return False - - except json.JSONDecodeError as e: - print(f"\n✗ Failed to parse tool arguments: {e}") - print(f"Raw arguments: {tc['function']['arguments']}") - return False - - elif accumulated_content: - print("\n✗ No tool calls detected in stream") - print(f"Accumulated content: {accumulated_content[:200]}") - print("\nThe model responded with text instead of calling the tool.") - return False - - else: - print("\n✗ Empty response") - 
return False - - except Exception as e: - print(f"\n✗ ERROR: {e}") - import traceback - traceback.print_exc() - return False - - -if __name__ == "__main__": - success = test_streaming_tool_call() - exit(0 if success else 1) - diff --git a/src/tests/old_tests_from_dev/test_unload.sh b/src/tests/old_tests_from_dev/test_unload.sh deleted file mode 100755 index fbd806f..0000000 --- a/src/tests/old_tests_from_dev/test_unload.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# Example curl request to unload a model via OpenArc API -curl -X POST "http://localhost:8000/openarc/unload" \ - -H "Authorization: Bearer $OPENARC_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "model_name": "Impish_Nemo_12B-int4_asym-awq-ov" - }' diff --git a/src/tests/old_tests_from_dev/test_vision_tool_completion.py b/src/tests/old_tests_from_dev/test_vision_tool_completion.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/tests/old_tests_from_dev/test_whisper_llm_kokoro.py b/src/tests/old_tests_from_dev/test_whisper_llm_kokoro.py deleted file mode 100644 index f8b9904..0000000 --- a/src/tests/old_tests_from_dev/test_whisper_llm_kokoro.py +++ /dev/null @@ -1,310 +0,0 @@ -#!/usr/bin/env python3 -""" -Comprehensive test that chains Whisper transcription -> LLM chat completion -> Kokoro TTS -""" - -import os -import base64 -import requests -import json -import time -from pathlib import Path -from openai import OpenAI - - -def load_models(): - """Load all required models: Whisper to GPU.2, LLM to GPU.1, Kokoro to CPU""" - api_key = os.getenv("OPENARC_API_KEY") - if not api_key: - print("OPENARC_API_KEY is not set. 
Export it before running this test.") - return False - - base_url = "http://localhost:8000/openarc/load" - headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json" - } - - # Model configurations - models_config = [ - { - "model_path": "/mnt/Ironwolf-4TB/Models/OpenVINO/Whisper/distil-whisper-large-v3-int8-ov", - "model_name": "distil-whisper-large-v3-int8-ov", - "model_type": "whisper", - "engine": "ovgenai", - "device": "GPU.2", - "runtime_config": {} - }, - { - "model_path": "/mnt/Ironwolf-4TB/Models/OpenVINO/Mistral/Rocinante-12B-v1.1-int4_sym-awq-se-ov", - "model_name": "Hermes-4-14B-int4_sym-ov", - "model_type": "llm", - "engine": "ovgenai", - "device": "GPU.1", - "runtime_config": {} - }, - { - "model_path": "/mnt/Ironwolf-4TB/Models/OpenVINO/Kokoro-82M-FP16-OpenVINO", - "model_name": "kokoro-82m-fp16-ov", - "model_type": "kokoro", - "engine": "openvino", - "device": "CPU", - "runtime_config": {} - } - ] - - print("Loading models...") - for i, config in enumerate(models_config): - print(f"Loading model {i+1}/3: {config['model_name']} on {config['device']}") - try: - response = requests.post(base_url, headers=headers, json=config, timeout=300) - if response.status_code == 200: - print(f"✓ Successfully loaded {config['model_name']}") - else: - print(f"✗ Failed to load {config['model_name']}: {response.status_code} - {response.text}") - return False - except Exception as e: - print(f"✗ Error loading {config['model_name']}: {e}") - return False - - # Wait longer for LLM model (index 1) - if i == 1: # LLM model - print("⏳ Waiting 60 seconds for LLM model to fully initialize...") - time.sleep(60) - else: - # Brief pause between other model loads - time.sleep(5) - - print("All models loaded successfully!") - return True - - -def verify_models(): - """Verify all models are loaded and available via /v1/models endpoint""" - api_key = os.getenv("OPENARC_API_KEY") - - expected_models = [ - "distil-whisper-large-v3-int8-ov", - 
"Hermes-4-14B-int4_sym-ov", - "kokoro-82m-fp16-ov" - ] - - print("Verifying models via /v1/models endpoint...") - - try: - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=api_key, - ) - - response = client.models.list() - loaded_models = [model.id for model in response.data] - - print(f"Found {len(loaded_models)} loaded models:") - for model in sorted(loaded_models): - print(f" - {model}") - - print("\nVerification results:") - all_loaded = True - for expected in expected_models: - if expected in loaded_models: - print(f"✓ {expected} - LOADED") - else: - print(f"✗ {expected} - MISSING") - all_loaded = False - - if all_loaded: - print("🎉 All required models are loaded and ready!") - return True - else: - print("❌ Some models are missing. Check the loading process.") - return False - - except Exception as e: - print(f"Error verifying models: {e}") - return False - - -def transcribe_audio(audio_path): - """Transcribe audio file using Whisper""" - api_key = os.getenv("OPENARC_API_KEY") - - print(f"Transcribing audio file: {audio_path}") - - try: - with open(audio_path, "rb") as f: - audio_b64 = base64.b64encode(f.read()).decode("utf-8") - except Exception as e: - print(f"Failed to read audio file: {e}") - return None - - url = "http://localhost:8000/v1/audio/transcriptions" - headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - } - payload = { - "model": "distil-whisper-large-v3-int8-ov", - "audio_base64": audio_b64, - } - - try: - resp = requests.post(url, headers=headers, json=payload, timeout=120) - if resp.status_code != 200: - print(f"Transcription failed: {resp.status_code} - {resp.text}") - return None - - data = resp.json() - text = data.get("text", "") - metrics = data.get("metrics", {}) - - print("✓ Transcription completed:") - print(f"Text: {text}") - if metrics: - print("Metrics:") - for k, v in metrics.items(): - print(f" {k}: {v}") - - return text - - except Exception as e: - print(f"Transcription 
request failed: {e}") - return None - - -def chat_completion(transcribed_text): - """Send transcribed text to LLM for chat completion""" - api_key = os.getenv("OPENARC_API_KEY") - - print("Generating chat completion...") - - try: - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=api_key, - ) - - # Create a more interesting prompt based on the transcribed text - system_prompt = "You are a helpful assistant. Respond thoughtfully and conversationally to the user's input." - user_prompt = f"{transcribed_text}" - - resp = client.chat.completions.create( - model="Hermes-4-14B-int4_sym-ov", - messages=[ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ], - max_tokens=1024, - temperature=0.7 - ) - - if resp and resp.choices: - completion_text = resp.choices[0].message.content - print("✓ Chat completion generated:") - print(f"Response: {completion_text}") - return completion_text - else: - print("No completion generated") - return None - - except Exception as e: - print(f"Chat completion error: {e}") - return None - - -def text_to_speech(text, output_path): - """Convert text to speech using Kokoro TTS""" - api_key = os.getenv("OPENARC_API_KEY") - - print("Converting text to speech...") - - url = "http://localhost:8000/v1/audio/speech" - headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json" - } - - data = { - "model": "kokoro-82m-fp16-ov", - "input": text, - "voice": "af_sarah", - "speed": 1.0, - "language": "a", - "response_format": "wav" - } - - try: - with requests.post(url, headers=headers, json=data, stream=True) as response: - response.raise_for_status() - with open(output_path, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - print(f"✓ Speech synthesis completed. 
Audio saved to: {output_path}") - return True - - except Exception as e: - print(f"Text-to-speech error: {e}") - return False - - -def main(): - """Main test function that chains all operations""" - print("=== OpenArc Whisper -> LLM -> Kokoro Chain Test ===\n") - - # Configuration - sample_input_audio = "/home/echo/Projects/OpenArc/src/tests/litany_against_fear_dune.wav" - output_audio_path = Path(__file__).parent / "whisper_llm_kokoro_output.wav" - - # Step 1: Load all models - print("Step 1: Loading models...") - if not load_models(): - print("Failed to load models. Exiting.") - return - - print("\n" + "="*60 + "\n") - - # Step 2: Verify all models are loaded - print("Step 2: Verifying models...") - if not verify_models(): - print("Model verification failed. Exiting.") - return - - print("\n" + "="*60 + "\n") - - # Step 3: Transcribe audio - print("Step 3: Transcribing audio...") - transcribed_text = transcribe_audio(sample_input_audio) - if not transcribed_text: - print("Transcription failed. Exiting.") - return - - print("\n" + "="*60 + "\n") - - # Step 4: Generate chat completion - print("Step 4: Generating chat completion...") - completion_text = chat_completion(transcribed_text) - if not completion_text: - print("Chat completion failed. Exiting.") - return - - print("\n" + "="*60 + "\n") - - # Step 5: Convert to speech - print("Step 5: Converting to speech...") - if not text_to_speech(completion_text, output_audio_path): - print("Text-to-speech failed. Exiting.") - return - - print("\n" + "="*60) - print("🎉 SUCCESS! Complete pipeline executed:") - print(f" Input audio: {sample_input_audio}") - print(f" Transcribed: '{transcribed_text[:100]}{'...' if len(transcribed_text) > 100 else ''}'") - print(f" LLM response: '{completion_text[:100]}{'...' 
if len(completion_text) > 100 else ''}'") - print(f" Output audio: {output_audio_path}") - print("="*60) - - -if __name__ == "__main__": - main() diff --git a/src/tests/test_model_registry_unit.py b/src/tests/test_model_registry_unit.py index 6165c8b..797fbc1 100644 --- a/src/tests/test_model_registry_unit.py +++ b/src/tests/test_model_registry_unit.py @@ -134,3 +134,8 @@ async def _run(): message = asyncio.run(_run()) assert "not supported" in message + +def test_model_class_registry_includes_qwen3_asr() -> None: + key = (EngineType.OPENVINO, ModelType.QWEN3_ASR) + assert registry_module.MODEL_CLASS_REGISTRY[key] == "src.engine.openvino.qwen3_asr.infer.OVQwen3ASR" + diff --git a/src/tests/test_ov_genai_kokoro_integration.py b/src/tests/test_ov_genai_kokoro_integration.py index 891626a..e2159c1 100644 --- a/src/tests/test_ov_genai_kokoro_integration.py +++ b/src/tests/test_ov_genai_kokoro_integration.py @@ -60,7 +60,7 @@ def test_kokoro_chunk_forward_pass_cpu_integration() -> None: try: gen_config = OV_KokoroGenConfig( - kokoro_message="Hello world from Kokoro.", + input="Hello world from Kokoro.", voice=KokoroVoice.AF_SARAH, lang_code=KokoroLanguage.AMERICAN_ENGLISH, speed=1.0, diff --git a/src/tests/test_ov_genai_kokoro_unit.py b/src/tests/test_ov_genai_kokoro_unit.py index 6540615..23731c6 100644 --- a/src/tests/test_ov_genai_kokoro_unit.py +++ b/src/tests/test_ov_genai_kokoro_unit.py @@ -101,7 +101,7 @@ def __call__(self, text, voice, speed): monkeypatch.setattr(kokoro_module, "KPipeline", DummyPipeline) config = OV_KokoroGenConfig( - kokoro_message="ignored", + input="ignored", voice=KokoroVoice.AF_SARAH, lang_code=KokoroLanguage.AMERICAN_ENGLISH, speed=1.0, diff --git a/src/tests/test_worker_registry_integration.py b/src/tests/test_worker_registry_integration.py index c00fcbb..065d0e9 100644 --- a/src/tests/test_worker_registry_integration.py +++ b/src/tests/test_worker_registry_integration.py @@ -6,7 +6,7 @@ import src.server.worker_registry as worker_module 
from src.server.model_registry import ModelRegistry from src.server.models.registration import EngineType, ModelLoadConfig, ModelType -from src.server.models.openvino import KokoroLanguage, KokoroVoice, OV_KokoroGenConfig +from src.server.models.openvino import KokoroLanguage, KokoroVoice, OV_KokoroGenConfig, OV_Qwen3ASRGenConfig from src.server.models.optimum import PreTrainedTokenizerConfig, RerankerConfig from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig @@ -48,6 +48,10 @@ class DummyKokoro: async def unload_model(self, registry, model_name): return True + class DummyQwen3ASR: + async def unload_model(self, registry, model_name): + return True + class DummyEmb: async def unload_model(self, registry, model_name): return True @@ -60,6 +64,7 @@ async def unload_model(self, registry, model_name): monkeypatch.setattr(worker_module, "OVGenAI_VLM", DummyVLM) monkeypatch.setattr(worker_module, "OVGenAI_Whisper", DummyWhisper) monkeypatch.setattr(worker_module, "OV_Kokoro", DummyKokoro) + monkeypatch.setattr(worker_module, "OVQwen3ASR", DummyQwen3ASR) monkeypatch.setattr(worker_module, "Optimum_EMB", DummyEmb) monkeypatch.setattr(worker_module, "Optimum_RR", DummyRR) @@ -67,6 +72,7 @@ async def unload_model(self, registry, model_name): monkeypatch.setattr(worker_module.QueueWorker, "queue_worker_vlm", _make_worker("vlm-full", {"tokens": 2}, True)) monkeypatch.setattr(worker_module.QueueWorker, "queue_worker_whisper", _make_worker("whisper-text", {"words": 2})) monkeypatch.setattr(worker_module.QueueWorker, "queue_worker_kokoro", _make_worker("audio-base64", {"chunks_processed": 1})) + monkeypatch.setattr(worker_module.QueueWorker, "queue_worker_qwen3_asr", _make_worker("qwen3-text", {"chunks": 1})) monkeypatch.setattr(worker_module.QueueWorker, "queue_worker_emb", _make_worker([[0.1, 0.2]], {"dim": 2})) monkeypatch.setattr(worker_module.QueueWorker, "queue_worker_rr", _make_worker([{"doc": "A", "score": 0.9}], {"total": 1})) @@ -74,6 
+80,7 @@ async def unload_model(self, registry, model_name): ModelType.LLM: DummyLLM, ModelType.VLM: DummyVLM, ModelType.WHISPER: DummyWhisper, + ModelType.QWEN3_ASR: DummyQwen3ASR, ModelType.KOKORO: DummyKokoro, ModelType.EMB: DummyEmb, ModelType.RERANK: DummyRR, @@ -203,7 +210,7 @@ def test_worker_registry_kokoro_flow(worker_system) -> None: ) config = OV_KokoroGenConfig( - kokoro_message="Hello", + input="Hello", voice=KokoroVoice.AF_SARAH, lang_code=KokoroLanguage.AMERICAN_ENGLISH, speed=1.0, @@ -218,6 +225,32 @@ async def _run(): assert result == {"audio_base64": "audio-base64", "metrics": {"chunks_processed": 1}} +def test_worker_registry_qwen3_asr_flow(worker_system) -> None: + model_registry, worker_registry = worker_system + + load_config = ModelLoadConfig( + model_path="/models/mock", + model_name="integration-qwen3-asr", + model_type=ModelType.QWEN3_ASR, + engine=EngineType.OPENVINO, + device="CPU", + runtime_config={}, + ) + + config = OV_Qwen3ASRGenConfig(audio_base64="AAA") + + async def _run(): + return await _load_do_unload( + model_registry, + worker_registry, + load_config, + worker_registry.transcribe_qwen3_asr("integration-qwen3-asr", config), + ) + + result = asyncio.run(_run()) + assert result == {"text": "qwen3-text", "metrics": {"chunks": 1}} + + def test_worker_registry_embed_flow(worker_system) -> None: model_registry, worker_registry = worker_system diff --git a/src/tests/test_worker_registry_unit.py b/src/tests/test_worker_registry_unit.py index 8253e4a..06c0a8f 100644 --- a/src/tests/test_worker_registry_unit.py +++ b/src/tests/test_worker_registry_unit.py @@ -5,7 +5,7 @@ import src.server.worker_registry as worker_module from src.server.model_registry import ModelRecord, ModelRegistry from src.server.models.registration import ModelType -from src.server.models.openvino import KokoroLanguage, KokoroVoice, OV_KokoroGenConfig +from src.server.models.openvino import KokoroLanguage, KokoroVoice, OV_KokoroGenConfig, OV_Qwen3ASRGenConfig from 
src.server.models.optimum import PreTrainedTokenizerConfig, RerankerConfig from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig @@ -43,6 +43,9 @@ class DummyWhisper: # noqa: D401 class DummyKokoro: # noqa: D401 pass + class DummyQwen3ASR: # noqa: D401 + pass + class DummyEmb: # noqa: D401 pass @@ -53,6 +56,7 @@ class DummyRR: # noqa: D401 monkeypatch.setattr(worker_module, "OVGenAI_VLM", DummyVLM) monkeypatch.setattr(worker_module, "OVGenAI_Whisper", DummyWhisper) monkeypatch.setattr(worker_module, "OV_Kokoro", DummyKokoro) + monkeypatch.setattr(worker_module, "OVQwen3ASR", DummyQwen3ASR) monkeypatch.setattr(worker_module, "Optimum_EMB", DummyEmb) monkeypatch.setattr(worker_module, "Optimum_RR", DummyRR) @@ -76,6 +80,11 @@ class DummyRR: # noqa: D401 "queue_worker_kokoro", _make_worker("audio-base64", {"chunks_processed": 1}), ) + monkeypatch.setattr( + worker_module.QueueWorker, + "queue_worker_qwen3_asr", + _make_worker("qwen3-text", {"chunks": 1}), + ) monkeypatch.setattr( worker_module.QueueWorker, "queue_worker_emb", @@ -96,6 +105,7 @@ def _make_record(model_type: ModelType, model_name: str) -> ModelRecord: ModelType.LLM: "ov_genai", ModelType.VLM: "ov_genai", ModelType.WHISPER: "ov_genai", + ModelType.QWEN3_ASR: "openvino", ModelType.KOKORO: "openvino", ModelType.EMB: "ov_optimum", ModelType.RERANK: "ov_optimum", @@ -104,6 +114,7 @@ def _make_record(model_type: ModelType, model_name: str) -> ModelRecord: ModelType.LLM: worker_module.OVGenAI_LLM, ModelType.VLM: worker_module.OVGenAI_VLM, ModelType.WHISPER: worker_module.OVGenAI_Whisper, + ModelType.QWEN3_ASR: worker_module.OVQwen3ASR, ModelType.KOKORO: worker_module.OV_Kokoro, ModelType.EMB: worker_module.Optimum_EMB, ModelType.RERANK: worker_module.Optimum_RR, @@ -182,7 +193,7 @@ async def _run(): def test_kokoro_generate_speech(worker_registry: worker_module.WorkerRegistry) -> None: record = _make_record(ModelType.KOKORO, "kokoro-model") config = OV_KokoroGenConfig( - 
kokoro_message="Hello", + input="Hello", voice=KokoroVoice.AF_SARAH, lang_code=KokoroLanguage.AMERICAN_ENGLISH, speed=1.0, @@ -197,6 +208,17 @@ async def _run(): assert result == {"audio_base64": "audio-base64", "metrics": {"chunks_processed": 1}} +def test_qwen3_asr_transcribe(worker_registry: worker_module.WorkerRegistry) -> None: + record = _make_record(ModelType.QWEN3_ASR, "qwen3-asr-model") + config = OV_Qwen3ASRGenConfig(audio_base64="AAA") + + async def _run(): + return await _load_and_call(worker_registry, record, worker_registry.transcribe_qwen3_asr("qwen3-asr-model", config)) + + result = asyncio.run(_run()) + assert result == {"text": "qwen3-text", "metrics": {"chunks": 1}} + + def test_embed(worker_registry: worker_module.WorkerRegistry) -> None: record = _make_record(ModelType.EMB, "emb-model") config = PreTrainedTokenizerConfig(text=["embed me"]) diff --git a/uv.lock b/uv.lock index 45830fd..a39eb0c 100644 --- a/uv.lock +++ b/uv.lock @@ -862,6 +862,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190 }, ] +[[package]] +name = "deepmerge" +version = "2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a8/3a/b0ba594708f1ad0bc735884b3ad854d3ca3bdc1d741e56e40bbda6263499/deepmerge-2.0.tar.gz", hash = "sha256:5c3d86081fbebd04dd5de03626a0607b809a98fb6ccba5770b62466fe940ff20", size = 19890 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/82/e5d2c1c67d19841e9edc74954c827444ae826978499bde3dfc1d007c8c11/deepmerge-2.0-py3-none-any.whl", hash = "sha256:6de9ce507115cff0bed95ff0ce9ecc31088ef50cbdf09bc90a09349a318b3d00", size = 13475 }, +] + [[package]] name = "deprecated" version = "1.2.18" @@ -1849,6 +1858,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/9a/61/c4efc044141429e67e8fd5536be86d76303f250179c7f92b2cc0c72e8d0b/marisa_trie-1.3.1-cp314-cp314t-win_amd64.whl", hash = "sha256:9e6496bbad3068e3bbbb934b1e1307bf1a9cb4609f9ec47b57e8ea37f1b5ee40", size = 162564 }, ] +[[package]] +name = "markdown" +version = "3.10.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180 }, +] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -2458,6 +2476,7 @@ dependencies = [ { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, { name = "torchvision", version = "0.23.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "uvicorn" }, + { name = "zensical" }, ] [package.metadata] @@ -2487,6 +2506,7 @@ requires-dist = [ { name = "torch", specifier = ">2.6.0", index = "https://download.pytorch.org/whl/cpu" }, { name = "torchvision", specifier = ">=0.23.0", index = "https://download.pytorch.org/whl/cpu" }, { name = "uvicorn", specifier = ">=0.35.0" }, + { name = "zensical", specifier = ">=0.0.31" }, ] [package.metadata.requires-dev] @@ -3200,6 +3220,19 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pymdown-extensions" +version = "10.21.2" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/08/f1c908c581fd11913da4711ea7ba32c0eee40b0190000996bb863b0c9349/pymdown_extensions-10.21.2.tar.gz", hash = "sha256:c3f55a5b8a1d0edf6699e35dcbea71d978d34ff3fa79f3d807b8a5b3fa90fbdc", size = 853922 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/27/a2fc51a4a122dfd1015e921ae9d22fee3d20b0b8080d9a704578bf9deece/pymdown_extensions-10.21.2-py3-none-any.whl", hash = "sha256:5c0fd2a2bea14eb39af8ff284f1066d898ab2187d81b889b75d46d4348c01638", size = 268901 }, +] + [[package]] name = "pymoo" version = "0.6.1.5" @@ -4386,14 +4419,14 @@ dependencies = [ { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:49aa20e21f0c2bd458c71d7b449776cbd5f16693dd5807195a820612b8a229b7" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:01dc33ee24c79148aee7cdbcf34ae8a3c9da1674a591e781577b716d233b1fa6" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e0e2c04a91403e8dd3af9756c6a024a1d9c0ed9c0d592a8314ded8f4fe30d440" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6dd7c4d329a0e03157803031bc856220c6155ef08c26d4f5bbac938acecf0948" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = 
"sha256:2f7fd6c15f3697e80627b77934f77705f3bc0e98278b989b2655de01f6903e1d" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2a3299d2b1d5a7aed2d3b6ffb69c672ca8830671967eb1cee1497bacd82fe47b" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:49aa20e21f0c2bd458c71d7b449776cbd5f16693dd5807195a820612b8a229b7" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:01dc33ee24c79148aee7cdbcf34ae8a3c9da1674a591e781577b716d233b1fa6" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e0e2c04a91403e8dd3af9756c6a024a1d9c0ed9c0d592a8314ded8f4fe30d440" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6dd7c4d329a0e03157803031bc856220c6155ef08c26d4f5bbac938acecf0948" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2f7fd6c15f3697e80627b77934f77705f3bc0e98278b989b2655de01f6903e1d" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2a3299d2b1d5a7aed2d3b6ffb69c672ca8830671967eb1cee1497bacd82fe47b" }, 
] [[package]] @@ -4411,14 +4444,14 @@ dependencies = [ { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d83d8075db43b8ca89680bdeb2f100c832e2a3aa61ee42c038b1a146e5e511b6" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:51603eb071d0681abc4db98b10ff394ace31f425852e8de249b91c09c60eb19a" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ae459d4509d3b837b978dc6c66106601f916b6d2cda75c137e3f5f48324ce1da" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:a651ccc540cf4c87eb988730c59c2220c52b57adc276f044e7efb9830fa65a1d" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:dea90a67d60a5366b0358a0b8d6bf267805278697d6fd950cf0e31139e56d1be" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:82928788025170c62e7df1120dcdc0cd175bfc31c08374613ce6d1a040bc0cda" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:474d77adbbbed5166db3e5636b4b4ae3399c66ef5bfa12536e254b32259c90c0" }, - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:8d6a47e23d7896f0ef9aa7ea7179eb6324e82438aa66d19884c2020d0646b104" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = 
"sha256:d83d8075db43b8ca89680bdeb2f100c832e2a3aa61ee42c038b1a146e5e511b6" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:51603eb071d0681abc4db98b10ff394ace31f425852e8de249b91c09c60eb19a" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ae459d4509d3b837b978dc6c66106601f916b6d2cda75c137e3f5f48324ce1da" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:a651ccc540cf4c87eb988730c59c2220c52b57adc276f044e7efb9830fa65a1d" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:dea90a67d60a5366b0358a0b8d6bf267805278697d6fd950cf0e31139e56d1be" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:82928788025170c62e7df1120dcdc0cd175bfc31c08374613ce6d1a040bc0cda" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:474d77adbbbed5166db3e5636b4b4ae3399c66ef5bfa12536e254b32259c90c0" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:8d6a47e23d7896f0ef9aa7ea7179eb6324e82438aa66d19884c2020d0646b104" }, ] [[package]] @@ -4816,3 +4849,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003 }, { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542 }, ] + +[[package]] 
+name = "zensical" +version = "0.0.31" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "deepmerge" }, + { name = "markdown" }, + { name = "pygments" }, + { name = "pymdown-extensions" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d5/1a/9b6f5285c5aef648db38f9132f49a7059bd2c9d748f68ef0c52ed8afcff3/zensical-0.0.31.tar.gz", hash = "sha256:9c12f07bde70c4bfdb13d6cae1bedf8d18064d257a6e81128a152502b28a8fc3", size = 3891758 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/db/cc4e555d2e816f2d91304ff969d62cc3a401ee477dbb7c720b874bec67d6/zensical-0.0.31-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b489936d670733dd204f16b689a2acc0e45b69e42cc4901f5131ae57658b8fbc", size = 12419980 }, + { url = "https://files.pythonhosted.org/packages/e7/c1/6789f73164c7f5821f5defb8a80b1dba8d5af24bdec7db36876793c5afd9/zensical-0.0.31-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d9f678efc0d9918e45eeb8bc62847b2cce23db7393c8c59c1be6d1c064bbaacd", size = 12292301 }, + { url = "https://files.pythonhosted.org/packages/4f/9a/6a83ad209081a953e0285d5056e5452c4fbcabd2f104f3797d53e4bdd96f/zensical-0.0.31-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb2b50ecf674997f818e53f12f2a67875a21b0c79ed74c151dfaef2f1475e5bf", size = 12661472 }, + { url = "https://files.pythonhosted.org/packages/9c/4a/a82f5c81893b7a607cf9d439b75c3c3894b4ef4d3e92d5d818b4fa5c6f23/zensical-0.0.31-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6fb5c634fe88254770a2d4db5c05b06f1c3ee5e29d2ae3e7efdae8905e435b1d", size = 12603784 }, + { url = "https://files.pythonhosted.org/packages/f7/1c/79c198628b8e006be32dfb1c5b73561757a349a6cf3069600a67ffa62495/zensical-0.0.31-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:94e64630552793274db1ec66c971e49a15ad351536d5d12de67ec6da7358ac50", size = 12959832 }, + { url = 
"https://files.pythonhosted.org/packages/db/9d/45839d9ca0f69622e8a3b944f2d8d7f7d2b7c2da78201079c4feb275feb6/zensical-0.0.31-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738a2fd5832e3b3c10ff642eebaf89c89ca1d28e4451dad0f36fdac53c415577", size = 12704024 }, + { url = "https://files.pythonhosted.org/packages/df/5f/451d7f4d94092bc38bd8d514826fb7b0329c188db506795b1d20bd07d517/zensical-0.0.31-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:bd601f6132e285ef6c3e4c3852be2094fc0473295a8080003db76a79760f84fb", size = 12837788 }, + { url = "https://files.pythonhosted.org/packages/d8/39/390a8fc384fb174ebd4450343a0aa2877b3a31ddcedf5ef0b8d26944e12c/zensical-0.0.31-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:dc3b6a9dfb5903c0aa779ef65cd6185add2b8aa1db237be840874b8c9db761b8", size = 12876822 }, + { url = "https://files.pythonhosted.org/packages/d5/60/640da2f095782cf38974cd851fb7afa62651d09a36543a1d8942b31aabdc/zensical-0.0.31-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:ddd4321b275e82c4897aa45b05038ce204b88fb311ad55f8c2af572173a9b56c", size = 13024036 }, + { url = "https://files.pythonhosted.org/packages/3f/06/0564377cbfccea3653254adfa851c1b20d1696e4b16770c7b2e1dd1ef1d7/zensical-0.0.31-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:147ab4bc17f3088f703aa6c4b9c416411f4ea8ca64d26f6586beae49d97fd3c7", size = 12975505 }, + { url = "https://files.pythonhosted.org/packages/35/4b/b8a0c4e5937cb05882dcce667798403e00897135080a69f92363e5e3ff9f/zensical-0.0.31-cp310-abi3-win32.whl", hash = "sha256:03fa11e629a308507693489541f43e751697784e94365e7435b02104aefd1c2c", size = 12011233 }, + { url = "https://files.pythonhosted.org/packages/3e/99/0eacdb466d344c0c86596932201268517be42f3e0bb6c78b2b0cd84c55f6/zensical-0.0.31-cp310-abi3-win_amd64.whl", hash = "sha256:d6621d4bb46af4143560045d4a18c8c76302db56bf1dbb6e2ce107d7fb643e09", size = 12207545 }, +] diff --git a/zensical.toml b/zensical.toml new file mode 100644 index 0000000..76c1f61 --- /dev/null 
+++ b/zensical.toml @@ -0,0 +1,317 @@ +# ============================================================================ +# +# The configuration produced by default is meant to highlight the features +# that Zensical provides and to serve as a starting point for your own +# projects. +# +# ============================================================================ + +[project] + +# The site_name is shown in the page header and the browser window title +# +# Read more: https://zensical.org/docs/setup/basics/#site_name +site_name = "Documentation" + +# The site_description is included in the HTML head and should contain a +# meaningful description of the site content for use by search engines. +# +# Read more: https://zensical.org/docs/setup/basics/#site_description +site_description = "A new project generated from the default template project." + +# The site_author attribute. This is used in the HTML head element. +# +# Read more: https://zensical.org/docs/setup/basics/#site_author +site_author = "" + +# The site_url is the canonical URL for your site. When building online +# documentation you should set this. +# Read more: https://zensical.org/docs/setup/basics/#site_url +#site_url = "https://www.example.com/" + +# The copyright notice appears in the page footer and can contain an HTML +# fragment. +# +# Read more: https://zensical.org/docs/setup/basics/#copyright +copyright = """ +Copyright © 2026 The authors +""" + +# Zensical supports both implicit navigation and explicitly defined navigation. +# If you decide not to define a navigation here then Zensical will simply +# derive the navigation structure from the directory structure of your +# "docs_dir". The definition below demonstrates how a navigation structure +# can be defined using TOML syntax. 
+# +# Read more: https://zensical.org/docs/setup/navigation/ +# nav = [ +# { "Get started" = "index.md" }, +# { "Markdown in 5min" = "markdown.md" }, +# ] + +# With the "extra_css" option you can add your own CSS styling to customize +# your Zensical project according to your needs. You can add any number of +# CSS files. +# +# The path provided should be relative to the "docs_dir". +# +# Read more: https://zensical.org/docs/customization/#additional-css +# +#extra_css = ["stylesheets/extra.css"] + +# With the `extra_javascript` option you can add your own JavaScript to your +# project to customize the behavior according to your needs. +# +# The path provided should be relative to the "docs_dir". +# +# Read more: https://zensical.org/docs/customization/#additional-javascript +#extra_javascript = ["javascripts/extra.js"] + +# ---------------------------------------------------------------------------- +# Section for configuring theme options +# ---------------------------------------------------------------------------- +[project.theme] + +# change this to "classic" to use the traditional Material for MkDocs look. +#variant = "classic" + +# Zensical allows you to override specific blocks, partials, or whole +# templates as well as to define your own templates. To do this, uncomment +# the custom_dir setting below and set it to a directory in which you +# keep your template overrides. +# +# Read more: +# - https://zensical.org/docs/customization/#extending-the-theme +# +#custom_dir = "overrides" + +# With the "favicon" option you can set your own image to use as the icon +# browsers will use in the browser title bar or tab bar. The path provided +# must be relative to the "docs_dir". +# +# Read more: +# - https://zensical.org/docs/setup/logo-and-icons/#favicon +# - https://developer.mozilla.org/en-US/docs/Glossary/Favicon +# +#favicon = "images/favicon.png" + +# Zensical supports more than 60 different languages. 
This means that the +# labels and tooltips that Zensical's templates produce are translated. +# The "language" option allows you to set the language used. This language +# is also indicated in the HTML head element to help with accessibility +# and guide search engines and translation tools. +# +# The default language is "en" (English). It is possible to create +# sites with multiple languages and configure a language selector. See +# the documentation for details. +# +# Read more: +# - https://zensical.org/docs/setup/language/ +# +language = "en" + +# Zensical provides a number of feature toggles that change the behavior +# of the documentation site. +features = [ + # Zensical includes an announcement bar. This feature allows users to + # dismiss it when they have read the announcement. + # https://zensical.org/docs/setup/header/#announcement-bar + "announce.dismiss", + + # If you have a repository configured and turn on this feature, Zensical + # will generate an edit button for the page. This works for common + # repository hosting services. + # https://zensical.org/docs/setup/repository/#content-actions + #"content.action.edit", + + # If you have a repository configured and turn on this feature, Zensical + # will generate a button that allows the user to view the Markdown + # code for the current page. + # https://zensical.org/docs/setup/repository/#content-actions + #"content.action.view", + + # Code annotations allow you to add an icon with a tooltip to your + # code blocks to provide explanations at crucial points. + # https://zensical.org/docs/authoring/code-blocks/#code-annotations + "content.code.annotate", + + # This feature turns on a button in code blocks that allows users to + # copy the content to their clipboard without first selecting it. + # https://zensical.org/docs/authoring/code-blocks/#code-copy-button + "content.code.copy", + + # Code blocks can include a button to allow for the selection of line + # ranges by the user. 
+ # https://zensical.org/docs/authoring/code-blocks/#code-selection-button + "content.code.select", + + # Zensical can render footnotes as inline tooltips, so the user can read + # the footnote without leaving the context of the document. + # https://zensical.org/docs/authoring/footnotes/#footnote-tooltips + "content.footnote.tooltips", + + # If you have many content tabs that have the same titles (e.g., "Python", + # "JavaScript", "Cobol"), this feature causes all of them to switch + # at the same time when the user chooses their language in one. + # https://zensical.org/docs/authoring/content-tabs/#linked-content-tabs + "content.tabs.link", + + # With this feature enabled users can add tooltips to links that will be + # displayed when the mouse pointer hovers over the link. + # https://zensical.org/docs/authoring/tooltips/#improved-tooltips + "content.tooltips", + + # With this feature enabled, Zensical will automatically hide parts + # of the header when the user scrolls past a certain point. + # https://zensical.org/docs/setup/header/#automatic-hiding + # "header.autohide", + + # Turn on this feature to expand all collapsible sections in the + # navigation sidebar by default. + # https://zensical.org/docs/setup/navigation/#navigation-expansion + # "navigation.expand", + + # This feature turns on navigation elements in the footer that allow the + # user to navigate to a next or previous page. + # https://zensical.org/docs/setup/footer/#navigation + "navigation.footer", + + # When section index pages are enabled, documents can be directly attached + # to sections, which is particularly useful for providing overview pages. + # https://zensical.org/docs/setup/navigation/#section-index-pages + "navigation.indexes", + + # When instant navigation is enabled, clicks on all internal links will be + # intercepted and dispatched via XHR without fully reloading the page. 
+ # https://zensical.org/docs/setup/navigation/#instant-navigation + "navigation.instant", + + # With instant prefetching, your site will start to fetch a page once the + # user hovers over a link. This will reduce the perceived loading time + # for the user. + # https://zensical.org/docs/setup/navigation/#instant-prefetching + "navigation.instant.prefetch", + + # In order to provide a better user experience on slow connections when + # using instant navigation, a progress indicator can be enabled. + # https://zensical.org/docs/setup/navigation/#progress-indicator + #"navigation.instant.progress", + + # When navigation paths are activated, a breadcrumb navigation is rendered + # above the title of each page + # https://zensical.org/docs/setup/navigation/#navigation-path + "navigation.path", + + # When pruning is enabled, only the visible navigation items are included + # in the rendered HTML, reducing the size of the built site by 33% or more. + # https://zensical.org/docs/setup/navigation/#navigation-pruning + #"navigation.prune", + + # When sections are enabled, top-level sections are rendered as groups in + # the sidebar for viewports above 1220px, but remain as-is on mobile. + # https://zensical.org/docs/setup/navigation/#navigation-sections + "navigation.sections", + + # When tabs are enabled, top-level sections are rendered in a menu layer + # below the header for viewports above 1220px, but remain as-is on mobile. + # https://zensical.org/docs/setup/navigation/#navigation-tabs + #"navigation.tabs", + + # When sticky tabs are enabled, navigation tabs will lock below the header + # and always remain visible when scrolling down. + # https://zensical.org/docs/setup/navigation/#sticky-navigation-tabs + #"navigation.tabs.sticky", + + # A back-to-top button can be shown when the user, after scrolling down, + # starts to scroll up again. 
+ # https://zensical.org/docs/setup/navigation/#back-to-top-button + "navigation.top", + + # When anchor tracking is enabled, the URL in the address bar is + # automatically updated with the active anchor as highlighted in the table + # of contents. + # https://zensical.org/docs/setup/navigation/#anchor-tracking + "navigation.tracking", + + # When search highlighting is enabled and a user clicks on a search result, + # Zensical will highlight all occurrences after following the link. + # https://zensical.org/docs/setup/search/#search-highlighting + "search.highlight", + + # When anchor following for the table of contents is enabled, the sidebar + # is automatically scrolled so that the active anchor is always visible. + # https://zensical.org/docs/setup/navigation/#anchor-following + # "toc.follow", + + # When navigation integration for the table of contents is enabled, it is + # always rendered as part of the navigation sidebar on the left. + # https://zensical.org/docs/setup/navigation/#navigation-integration + #"toc.integrate", +] + +# ---------------------------------------------------------------------------- +# You can configure your own logo to be shown in the header using the "logo" +# option in the "theme" subsection. The logo must be a relative path to a file +# in your "docs_dir", e.g., to use `docs/assets/logo.png` you would set: +# ---------------------------------------------------------------------------- +#logo = "assets/logo.png" + +# ---------------------------------------------------------------------------- +# If you don't have a dedicated project logo, you can use a built-in icon from +# the icon sets shipped in Zensical. Please note that the setting lives in a +# different subsection, and that the above takes precedence over the icon. 
+# +# Read more: +# - https://zensical.org/docs/setup/logo-and-icons +# - https://github.com/zensical/ui/tree/master/dist/.icons +# ---------------------------------------------------------------------------- +#[project.theme.icon] +#logo = "lucide/smile" + +# ---------------------------------------------------------------------------- +# In the "font" subsection you can configure the fonts used. By default, fonts +# are loaded from Google Fonts, giving you a wide range of choices from a set +# of suitably licensed fonts. There are options for a normal text font and for +# a monospaced font used in code blocks. +# ---------------------------------------------------------------------------- +#[project.theme.font] +#text = "Inter" +#code = "JetBrains Mono" + +# ---------------------------------------------------------------------------- +# In the "palette" subsection you can configure options for the color scheme. +# You can configure different color schemes, e.g., to turn on dark mode, +# that the user can switch between. Each color scheme can be further +# customized. +# +# Read more: +# - https://zensical.org/docs/setup/colors/ +# ---------------------------------------------------------------------------- +[[project.theme.palette]] +scheme = "default" +toggle.icon = "lucide/sun" +toggle.name = "Switch to dark mode" + +[[project.theme.palette]] +scheme = "slate" +toggle.icon = "lucide/moon" +toggle.name = "Switch to light mode" + +# ---------------------------------------------------------------------------- +# The "extra" section contains miscellaneous settings. 
+# ---------------------------------------------------------------------------- +#[[project.extra.social]] +#icon = "fontawesome/brands/github" +#link = "https://github.com/user/repo" + +# ---------------------------------------------------------------------------- +# Markdown extensions +# ---------------------------------------------------------------------------- +[project.markdown_extensions.pymdownx.superfences] + +[project.markdown_extensions.pymdownx.tabbed] +alternate_style = true + +[project.markdown_extensions.pymdownx.tabbed.slugify] +object = "pymdownx.slugs.slugify" +kwds = { case = "lower" }