-
Notifications
You must be signed in to change notification settings - Fork 15
Qwen tts #172
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Qwen tts #172
Changes from 35 commits
c6cb96e
86245d2
401b317
8d5bdc5
316c4ec
c11e071
9272bce
be8212d
34d201e
22709a7
e915f6e
bd5662c
03dad34
ab49ecb
e22ddd3
42aaa66
b364957
bf02930
68e5469
f750bc1
745cd2b
17de7f4
c8725ff
89e6e85
97b25e1
9c47a39
6535fe9
94e55dc
a2cc5aa
d2c62e3
3d18387
bb34a12
28dd128
ecbb8af
e66d401
a51e9a9
00435ac
48acaac
d90c3a1
b05bb06
e9ceb22
e188cb5
2df469c
e3e5cc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,23 +3,24 @@ | |
| from functools import cached_property | ||
| from typing import TypedDict | ||
|
|
||
| from jaxtyping import Array, Float | ||
| from jinja2 import Template | ||
| from tokenizers import Tokenizer | ||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class VoicePrompt: | ||
| """ | ||
| Current class is reserved for future usage of audio prompts | ||
| to condition style of generated audio | ||
| """ | ||
| waveform: Float[Array, "*"] | ||
| sampling_rate: int | ||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class TTSMessage: | ||
| content: str | ||
| speaker_id: str | ||
| style: str | ||
| speaker_id: str | None = None | ||
| style: str | None = None | ||
| language: str | None = None | ||
| voice_prompt: VoicePrompt | None = None | ||
|
|
||
|
|
||
| class TTSRequest(TypedDict): | ||
|
|
@@ -62,6 +63,9 @@ def render_request(self, messages: Iterable[TTSMessage]) -> str: | |
| prompt_text = prompt_text[1:] | ||
| return prompt_text | ||
|
|
||
| def preprocess(self, text: str, language: str = "en") -> str: # noqa: ARG002 | ||
|
||
| return text | ||
|
|
||
| def tokenize_text(self, text: str) -> list[int]: | ||
| return self.tokenizer.encode(text, add_special_tokens=False).ids | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,8 +19,6 @@ def play_mono_audio(audio: np.ndarray, samplerate: int, audio_chunk_size: int = | |
| audio = np.clip(audio, -1.0, 1.0) | ||
| # very dumb conversion to PCM16 | ||
| pcm_audio = (audio * np.iinfo(np.int16).max).astype(np.int16) | ||
|
|
||
| audio_chunk_size = 1024 | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. huh? why did we remove this line |
||
| num_chunks = int(np.ceil(n_samples / audio_chunk_size)) | ||
|
|
||
| # actual size of each chunk might not be exactly 'audio_chunk_size' but not critical here | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| import json | ||
| import random | ||
| import re | ||
| import shutil | ||
|
|
@@ -10,6 +11,7 @@ | |
| from pathlib import Path | ||
| from typing import Annotated | ||
|
|
||
| import jax.numpy as jnp | ||
| import jax.profiler | ||
| import requests | ||
| import soundfile as sf | ||
|
|
@@ -33,6 +35,7 @@ | |
| from rich.table import Table | ||
| from typer import Argument, Context, Exit, Option, Typer | ||
|
|
||
| from lalamo.audio.tts_message_processor import VoicePrompt | ||
| from lalamo.audio.utils import play_mono_audio | ||
| from lalamo.commands import ( | ||
| CollectTracesCallbacks, | ||
|
|
@@ -58,13 +61,18 @@ | |
| ) | ||
| from lalamo.data.lalamo_completions import LalamoCompletion | ||
| from lalamo.message_processor import UserMessage | ||
| from lalamo.model_import import ModelSpec | ||
| from lalamo.model_import import ModelSpec, ModelType | ||
| from lalamo.model_import.common import FileSpec | ||
| from lalamo.model_import.remote_registry import RegistryModel, RegistryModelFile, fetch_available_models | ||
| from lalamo.model_registry import ModelRegistry | ||
| from lalamo.models import ClassifierModelConfig, LanguageModelConfig | ||
| from lalamo.models import ( | ||
| ClassifierModelConfig, | ||
| LanguageModelConfig, | ||
| LatentTTSGenerator, | ||
| TTSGenerator, | ||
| ) | ||
| from lalamo.models.common import BatchSizesComputedEvent | ||
| from lalamo.models.tts_model import TTSGenerator, TTSMessage | ||
| from lalamo.models.tts_model import TTSMessage | ||
| from lalamo.speculator.ngram import NGramSpeculator | ||
| from lalamo.speculator.utils import test_speculator | ||
|
|
||
|
|
@@ -345,6 +353,26 @@ def tts( | |
| help="Render synthesized speech into default audio interface.", | ||
| ), | ||
| ] = False, | ||
| speaker_id: Annotated[ | ||
| str | None, | ||
| Option( | ||
| help="Speaker ID for speech synthesis.", | ||
| show_default="First available speaker from the model", | ||
|
||
| ), | ||
| ] = None, | ||
| style: Annotated[ | ||
| str | None, | ||
| Option( | ||
| help="Style instruction for speech synthesis (e.g. voice description or intonation hint).", | ||
| show_default="Default style from the model", | ||
| ), | ||
| ] = None, | ||
| reference: Annotated[ | ||
| Path | None, | ||
| Option( | ||
| help="Path to reference audio file for voice cloning (WAV format).", | ||
| ), | ||
| ] = None, | ||
| ) -> None: | ||
| if output_file is None: | ||
| output_file = Path.cwd() / "generated_speech.wav" | ||
|
|
@@ -355,9 +383,26 @@ def tts( | |
| raise Exit(1) | ||
|
|
||
| console.print(f"🤖 Loading model from specified path: {model_path}.") | ||
| model = TTSGenerator.load_model(model_path) | ||
|
|
||
| assert model is not None | ||
| voice_prompt: VoicePrompt | None = None | ||
| if reference is not None: | ||
| ref_audio, ref_sr = sf.read(str(reference), dtype="float32") | ||
| if ref_audio.ndim > 1: | ||
| ref_audio = ref_audio.mean(axis=1) | ||
| voice_prompt = VoicePrompt(waveform=jnp.array(ref_audio), sampling_rate=ref_sr) | ||
| console.print(f"🎤 Loaded reference audio from {reference} ({ref_sr}Hz, {len(ref_audio) / ref_sr:.1f}s)") | ||
|
|
||
| config_json = json.loads((model_path / "config.json").read_text()) | ||
| model_type = ModelType(config_json["model_type"]) | ||
| model: TTSGenerator | LatentTTSGenerator | ||
| match model_type: | ||
| case ModelType.TTS_MODEL: | ||
| model = TTSGenerator.load_model(model_path) | ||
| case ModelType.LATENT_TTS_MODEL: | ||
| model = LatentTTSGenerator.load_model(model_path) | ||
| case _: | ||
| raise ValueError(f"Expected a TTS model, got: {model_type}") | ||
|
|
||
| _stop_word = "/stop" | ||
| while True: | ||
| user_text = console.input(f"[cyan]input text to generate speech({_stop_word} to exit)> [/cyan]") | ||
|
|
@@ -367,7 +412,7 @@ def tts( | |
| if user_text == "": | ||
| continue | ||
|
|
||
| user_message = TTSMessage(content=user_text, speaker_id="speaker:0", style="interleave") | ||
| user_message = TTSMessage(content=user_text, speaker_id=speaker_id, style=style, voice_prompt=voice_prompt) | ||
|
|
||
| tts_result = model.generate_speech([user_message]) | ||
|
|
||
|
|
@@ -634,7 +679,7 @@ def list_models( | |
|
|
||
| if plain: | ||
| for spec in sorted_specs: | ||
| console.print(spec.repo) | ||
| console.print(spec.origin.description) | ||
| return | ||
|
|
||
| table = Table( | ||
|
|
@@ -654,7 +699,7 @@ def list_models( | |
| spec.family, | ||
| spec.size, | ||
| str(spec.quantization), | ||
| spec.repo, | ||
| spec.origin.description, | ||
| ) | ||
| console.print(table) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,8 @@ | ||
| from .common import ModelMetadata, ModelSpec, import_model | ||
| from .common import ModelMetadata, ModelSpec, ModelType, import_model | ||
|
|
||
| __all__ = [ | ||
| "ModelMetadata", | ||
| "ModelSpec", | ||
| "ModelType", | ||
| "import_model", | ||
| ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
waveform: Float[Array, " audio_samples"]