Skip to content
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
c6cb96e
qwen tts 1/n
knyazer Mar 17, 2026
86245d2
neater config loading
knyazer Mar 17, 2026
401b317
linters
knyazer Mar 18, 2026
8d5bdc5
latent tts + formatting fixes
knyazer Mar 18, 2026
316c4ec
more strict generation config
knyazer Mar 19, 2026
c11e071
update
knyazer Mar 21, 2026
9272bce
Merge branch 'main' into qwen-tts-new
knyazer Mar 21, 2026
be8212d
finalizing qwen-tts and latent tts
knyazer Mar 23, 2026
34d201e
Merge branch 'main' into qwen-tts-new
knyazer Mar 23, 2026
22709a7
fix up little issues
knyazer Mar 23, 2026
e915f6e
finishing up
knyazer Mar 24, 2026
bd5662c
fixing minor bugs
knyazer Mar 24, 2026
03dad34
update trans deps
knyazer Mar 24, 2026
ab49ecb
correct dep
knyazer Mar 24, 2026
e22ddd3
fix fish
knyazer Mar 25, 2026
42aaa66
Merge branch 'main' into qwen-tts-new
knyazer Mar 25, 2026
b364957
transformers version
knyazer Mar 25, 2026
bf02930
Addressing review
knyazer Mar 27, 2026
68e5469
fix
knyazer Mar 27, 2026
f750bc1
compat
knyazer Mar 27, 2026
745cd2b
Address PR review comments: remove slop and clean up TTS code
claude Mar 27, 2026
17de7f4
iterating
knyazer Mar 28, 2026
c8725ff
fix import
knyazer Mar 29, 2026
89e6e85
fix tests :( i wish it could be automatable
knyazer Mar 29, 2026
97b25e1
unslop pass
knyazer Mar 29, 2026
9c47a39
fix up broky imports
knyazer Mar 29, 2026
6535fe9
fix up tests
knyazer Mar 29, 2026
94e55dc
tweaks
knyazer Mar 30, 2026
a2cc5aa
fix nemo
knyazer Mar 30, 2026
d2c62e3
merge
knyazer Apr 1, 2026
3d18387
iterating
knyazer Apr 1, 2026
bb34a12
tweaks
knyazer Apr 2, 2026
28dd128
Merge branch 'main' into qwen-tts-new
knyazer Apr 3, 2026
ecbb8af
fix precommit
knyazer Apr 3, 2026
e66d401
add new models
knyazer Apr 5, 2026
a51e9a9
final clean up
knyazer Apr 7, 2026
00435ac
microfixes
knyazer Apr 7, 2026
48acaac
Merge branch 'main' into qwen-tts-new
knyazer Apr 7, 2026
d90c3a1
few fixes for the tests precision stuff
knyazer Apr 7, 2026
b05bb06
fill in defaults
knyazer Apr 7, 2026
e9ceb22
new style of origins
knyazer Apr 9, 2026
e188cb5
better origin structuring
knyazer Apr 9, 2026
2df469c
wip: origins cli
knyazer Apr 10, 2026
e3e5cc1
new origin system
knyazer Apr 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .github/actions/setup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ inputs:
description: "Install with development dependencies"
required: false
default: 'false'
extras:
description: "Space-separated list of extras to install (e.g. 'qwen3-tts')"
required: false
default: ''
install-dependencies:
description: "Install Python and project dependencies"
required: false
Expand All @@ -27,7 +31,12 @@ runs:
with:
enable-cache: ${{ inputs.install-dependencies == 'true' }}

- run: uv sync --frozen ${{ inputs.dev == 'true' && '--dev' || '--no-dev' }} ${{ inputs.cuda == 'true' && '--extra cuda' || '' }}
- run: |
EXTRAS_ARGS=""
for extra in ${{ inputs.cuda == 'true' && 'cuda' || '' }} ${{ inputs.extras }}; do
EXTRAS_ARGS="$EXTRAS_ARGS --extra $extra"
done
uv sync --frozen ${{ inputs.dev == 'true' && '--dev' || '--no-dev' }} $EXTRAS_ARGS
if: ${{ inputs.install-dependencies == 'true' }}
shell: bash

Expand Down
13 changes: 7 additions & 6 deletions lalamo/audio/tts_message_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,24 @@
from functools import cached_property
from typing import TypedDict

from jaxtyping import Array, Float
from jinja2 import Template
from tokenizers import Tokenizer


@dataclass(frozen=True)
class VoicePrompt:
"""
Current class is reserved for future usage of audio prompts
to condition style of generated audio
"""
waveform: Float[Array, "*"]
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

waveform: Float[Array, " audio_samples"]

sampling_rate: int


@dataclass(frozen=True)
class TTSMessage:
content: str
speaker_id: str
style: str
speaker_id: str | None = None
style: str | None = None
language: str | None = None
voice_prompt: VoicePrompt | None = None


class TTSRequest(TypedDict):
Expand Down
65 changes: 60 additions & 5 deletions lalamo/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import random
import re
import shutil
Expand All @@ -10,6 +11,7 @@
from pathlib import Path
from typing import Annotated

import jax.numpy as jnp
import jax.profiler
import requests
import soundfile as sf
Expand All @@ -33,6 +35,7 @@
from rich.table import Table
from typer import Argument, Context, Exit, Option, Typer

from lalamo.audio.tts_message_processor import VoicePrompt
from lalamo.audio.utils import play_mono_audio
from lalamo.commands import (
CollectTracesCallbacks,
Expand Down Expand Up @@ -62,9 +65,14 @@
from lalamo.model_import.common import FileSpec
from lalamo.model_import.remote_registry import RegistryModel, RegistryModelFile, fetch_available_models
from lalamo.model_registry import ModelRegistry
from lalamo.models import ClassifierModelConfig, LanguageModelConfig
from lalamo.models import (
ClassifierModelConfig,
LanguageModelConfig,
LatentTTSGenerator,
TTSGenerator,
)
from lalamo.models.common import BatchSizesComputedEvent
from lalamo.models.tts_model import TTSGenerator, TTSMessage
from lalamo.models.tts_model import TTSMessage
from lalamo.speculator.ngram import NGramSpeculator
from lalamo.speculator.utils import test_speculator

Expand Down Expand Up @@ -115,6 +123,15 @@ def convert(self, value: str, param: ClickParameter | None, ctx: ClickContext |
return model_spec


def _is_latent_tts_model(model_path: Path) -> bool:
    """Return whether the model directory holds a latent TTS model.

    Reads ``config.json`` inside ``model_path`` and checks its top-level
    ``"model_type"`` entry.

    Args:
        model_path: Directory expected to contain ``config.json``.

    Returns:
        ``True`` only when ``config.json`` exists, its top level is a JSON
        object, and its ``"model_type"`` value equals ``"latent_tts_model"``.
        ``False`` when the file is missing, the top level is not an object,
        or the value differs.

    Raises:
        json.JSONDecodeError: If ``config.json`` exists but is not valid JSON.
    """
    config_path = model_path / "config.json"
    if not config_path.exists():
        return False
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(config_path, encoding="utf-8") as f:
        config_json = json.load(f)
    # A non-object top level (list, string, number, ...) has no "model_type" key.
    if not isinstance(config_json, dict):
        return False
    return config_json.get("model_type") == "latent_tts_model"


def _error(message: str) -> None:
panel = Panel(message, box=box.ROUNDED, title="Error", title_align="left", border_style="red")
err_console.print(panel)
Expand Down Expand Up @@ -345,6 +362,26 @@ def tts(
help="Render synthesized speech into default audio interface.",
),
] = False,
speaker_id: Annotated[
str | None,
Option(
help="Speaker ID for speech synthesis.",
show_default="First available speaker from the model",
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

show_default should be "A pre-selected speaker available for the specified model" or something like that

),
] = None,
style: Annotated[
str | None,
Option(
help="Style instruction for speech synthesis (e.g. voice description or intonation hint).",
show_default="Default style from the model",
),
] = None,
reference: Annotated[
Path | None,
Option(
help="Path to reference audio file for voice cloning (WAV format).",
),
] = None,
) -> None:
if output_file is None:
output_file = Path.cwd() / "generated_speech.wav"
Expand All @@ -355,9 +392,27 @@ def tts(
raise Exit(1)

console.print(f"🤖 Loading model from specified path: {model_path}.")
model = TTSGenerator.load_model(model_path)

assert model is not None
voice_prompt: VoicePrompt | None = None
if reference is not None:
ref_audio, ref_sr = sf.read(str(reference), dtype="float32")
if ref_audio.ndim > 1:
ref_audio = ref_audio.mean(axis=1)
voice_prompt = VoicePrompt(waveform=jnp.array(ref_audio), sampling_rate=ref_sr)
console.print(f"🎤 Loaded reference audio from {reference} ({ref_sr}Hz, {len(ref_audio) / ref_sr:.1f}s)")

model: TTSGenerator | LatentTTSGenerator
if _is_latent_tts_model(model_path):
model = LatentTTSGenerator.load_model(model_path)
else:
model = TTSGenerator.load_model(model_path)

if isinstance(model, TTSGenerator):
if speaker_id is None:
speaker_id = model.default_speaker_id
if style is None:
style = model.default_style

_stop_word = "/stop"
while True:
user_text = console.input(f"[cyan]input text to generate speech({_stop_word} to exit)> [/cyan]")
Expand All @@ -367,7 +422,7 @@ def tts(
if user_text == "":
continue

user_message = TTSMessage(content=user_text, speaker_id="speaker:0", style="interleave")
user_message = TTSMessage(content=user_text, speaker_id=speaker_id, style=style, voice_prompt=voice_prompt)

tts_result = model.generate_speech([user_message])

Expand Down
Loading
Loading