trymirai · knyazer · Mar 17, 2026 · Mar 17, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/lalamo/__init__.py b/lalamo/__init__.py
@@ -24,7 +24,7 @@
     UserMessage,
 )
 from lalamo.model_import import ModelSpec, import_model
-from lalamo.model_import.model_specs.common import ConfigMap, FileSpec, JSONFieldSpec, ModelType, UseCase, WeightsType
+from lalamo.model_import.model_specs.common import ConfigMap, FileSpec, JSONFieldSpec, ModelType, UseCase
 from lalamo.models import ClassifierModel, LanguageModel
 from lalamo.modules.common import ShardingConfig, pad_and_apply_data_sharding
 from lalamo.quantization import QuantizationMode
@@ -60,7 +60,6 @@
     "TrainCallbacks",
     "UseCase",
     "UserMessage",
-    "WeightsType",
     "collect_traces",
     "convert",
     "estimate_batchsize",

diff --git a/lalamo/audio/tts_message_processor.py b/lalamo/audio/tts_message_processor.py
@@ -3,23 +3,24 @@
 from functools import cached_property
 from typing import TypedDict
 
+from jaxtyping import Array, Float
 from jinja2 import Template
 from tokenizers import Tokenizer
 
 
 @dataclass(frozen=True)
 class VoicePrompt:
-    """
-    Current class is reserved for future usage of audio prompts
-    to condition style of generated audio
-    """
+    waveform: Float[Array, " audio_samples"]
+    sampling_rate: int
 
 
 @dataclass(frozen=True)
 class TTSMessage:
     content: str
-    speaker_id: str
-    style: str
+    speaker_id: str | None = None
+    style: str | None = None
+    language: str | None = None
+    voice_prompt: VoicePrompt | None = None
 
 
 class TTSRequest(TypedDict):
@@ -62,6 +63,9 @@ def render_request(self, messages: Iterable[TTSMessage]) -> str:
             prompt_text = prompt_text[1:]
         return prompt_text
 
+    def preprocess(self, text: str, language: str) -> str:  # noqa: ARG002
+        return text
+
     def tokenize_text(self, text: str) -> list[int]:
         return self.tokenizer.encode(text, add_special_tokens=False).ids
 

diff --git a/lalamo/audio/utils.py b/lalamo/audio/utils.py
@@ -19,8 +19,6 @@ def play_mono_audio(audio: np.ndarray, samplerate: int, audio_chunk_size: int =
     audio = np.clip(audio, -1.0, 1.0)
     # very dumb conversion to PCM16
     pcm_audio = (audio * np.iinfo(np.int16).max).astype(np.int16)
-
-    audio_chunk_size = 1024
     num_chunks = int(np.ceil(n_samples / audio_chunk_size))
 
     # actual size of each chunk might not be exactly 'audio_chunk_size' but not critical here

diff --git a/lalamo/commands.py b/lalamo/commands.py
@@ -226,7 +226,7 @@ def progress_callback(event: StatusEvent) -> None:
     weights = flatten_parameters(model.export_weights())
     del model
 
-    with Path(output_dir / "model.safetensors").open("wb") as fd:
+    with (output_dir / "model.safetensors").open("wb") as fd:
         safe_write(fd, weights)
 
     config_json = config_converter.unstructure(metadata, ModelMetadata)

diff --git a/lalamo/common.py b/lalamo/common.py
@@ -11,12 +11,15 @@
 
 from lalamo.utils import MapDictValues, MapSequence
 
+type WeightShard = tuple[Mapping[str, Array], Mapping[str, str]]
+
 __all__ = [
     "DEFAULT_PRECISION",
     "ArrayLike",
     "LalamoWarning",
     "ParameterPath",
     "ParameterTree",
+    "WeightShard",
     "dummy_array",
     "flatten_parameters",
     "require_array",

diff --git a/lalamo/main.py b/lalamo/main.py
@@ -1,15 +1,17 @@
+import json
 import random
 import re
 import shutil
 import sys
 from contextlib import ExitStack
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from functools import partial
 from importlib.util import find_spec
 from itertools import islice
 from pathlib import Path
 from typing import Annotated
 
+import jax.numpy as jnp
 import jax.profiler
 import requests
 import soundfile as sf
@@ -33,6 +35,7 @@
 from rich.table import Table
 from typer import Argument, Context, Exit, Option, Typer
 
+from lalamo.audio.tts_message_processor import VoicePrompt
 from lalamo.audio.utils import play_mono_audio
 from lalamo.commands import (
     CollectTracesCallbacks,
@@ -58,13 +61,19 @@
 )
 from lalamo.data.lalamo_completions import iter_completions
 from lalamo.message_processor import UserMessage
-from lalamo.model_import import ModelSpec
+from lalamo.model_import import ModelSpec, ModelType
 from lalamo.model_import.common import FileSpec
+from lalamo.model_import.model_specs.common import structure_origin
 from lalamo.model_import.remote_registry import RegistryModel, RegistryModelFile, fetch_available_models
 from lalamo.model_registry import ModelRegistry
-from lalamo.models import ClassifierModelConfig, LanguageModelConfig
+from lalamo.models import (
+    ClassifierModelConfig,
+    LanguageModelConfig,
+    LatentTTSGenerator,
+    TTSGenerator,
+)
 from lalamo.models.common import BatchSizesComputedEvent
-from lalamo.models.tts_model import TTSGenerator, TTSMessage
+from lalamo.models.tts_model import TTSMessage
 from lalamo.speculator.ngram import NGramSpeculator
 from lalamo.speculator.utils import test_speculator
 
@@ -339,12 +348,39 @@ def tts(
         ),
     ],
     output_file: Annotated[Path | None, Argument(help="Path to output WAV file with synthesized speech")] = None,
+    message: Annotated[
+        str | None,
+        Option(
+            help="Text to synthesize in non-interactive mode. Generates speech and exits.",
+            show_default="None, run interactively",
+        ),
+    ] = None,
     replay: Annotated[
         bool,
         Option(
             help="Render synthesized speech into default audio interface.",
         ),
     ] = False,
+    speaker_id: Annotated[
+        str | None,
+        Option(
+            help="Speaker ID for speech synthesis.",
+            show_default="A pre-selected speaker available for the specified model",
+        ),
+    ] = None,
+    style: Annotated[
+        str | None,
+        Option(
+            help="Style instruction for speech synthesis (e.g. voice description or intonation hint).",
+            show_default="Default style from the model",
+        ),
+    ] = None,
+    reference: Annotated[
+        Path | None,
+        Option(
+            help="Path to reference audio file for voice cloning (WAV format).",
+        ),
+    ] = None,
 ) -> None:
     if output_file is None:
         output_file = Path.cwd() / "generated_speech.wav"
@@ -355,9 +391,35 @@ def tts(
         raise Exit(1)
 
     console.print(f"🤖 Loading model from specified path: {model_path}.")
-    model = TTSGenerator.load_model(model_path)
 
-    assert model is not None
+    voice_prompt: VoicePrompt | None = None
+    if reference is not None:
+        ref_audio, ref_sr = sf.read(str(reference), dtype="float32")
+        if ref_audio.ndim > 1:
+            ref_audio = ref_audio.mean(axis=1)
+        voice_prompt = VoicePrompt(waveform=jnp.array(ref_audio), sampling_rate=ref_sr)
+        console.print(f"🎤 Loaded reference audio from {reference} ({ref_sr}Hz, {len(ref_audio) / ref_sr:.1f}s)")
+
+    config_json = json.loads((model_path / "config.json").read_text())
+    model_type = ModelType(config_json["model_type"])
+    model: TTSGenerator | LatentTTSGenerator
+    match model_type:
+        case ModelType.TTS_MODEL:
+            model = TTSGenerator.load_model(model_path)
+        case ModelType.LATENT_TTS_MODEL:
+            model = LatentTTSGenerator.load_model(model_path)
+        case _:
+            raise ValueError(f"Expected a TTS model, got: {model_type}")
+
+    if message is not None:
+        user_message = TTSMessage(content=message, speaker_id=speaker_id, style=style, voice_prompt=voice_prompt)
+        tts_result = model.generate_speech([user_message])
+        if replay:
+            play_mono_audio(tts_result.audio, tts_result.audio_params.samplerate)
+        sf.write(str(output_file), tts_result.audio, tts_result.audio_params.samplerate)
+        console.print(f"[green] ... saved generated audio to {output_file}[/green]")
+        return
+
     _stop_word = "/stop"
     while True:
         user_text = console.input(f"[cyan]input text to generate speech({_stop_word} to exit)> [/cyan]")
@@ -367,7 +429,7 @@ def tts(
         if user_text == "":
             continue
 
-        user_message = TTSMessage(content=user_text, speaker_id="speaker:0", style="interleave")
+        user_message = TTSMessage(content=user_text, speaker_id=speaker_id, style=style, voice_prompt=voice_prompt)
 
         tts_result = model.generate_speech([user_message])
 
@@ -430,13 +492,28 @@ def convert(
             show_default="Model's native maximum context length.",
         ),
     ] = None,
+    custom_origin: Annotated[
+        str | None,
+        Option(
+            "--custom-origin",
+            help=(
+                "Origin JSON to override the model's default origin."
+                ' Example: \'{"type": "LocalOrigin", "root": "/path/to/weights"}\''
+            ),
+            show_default="Use the model's default origin",
+        ),
+    ] = None,
     overwrite: Annotated[
         bool,
         Option(
             help="Overwrite existing model files.",
         ),
     ] = False,
 ) -> None:
+    if custom_origin is not None:
+        origin = structure_origin(json.loads(custom_origin))
+        model_repo = replace(model_repo, origin=origin)
+
     if output_dir is None:
         output_dir = DEFAULT_OUTPUT_DIR / model_repo.name
 
@@ -634,7 +711,7 @@ def list_models(
 
     if plain:
         for spec in sorted_specs:
-            console.print(spec.repo)
+            console.print(spec.origin.description)
         return
 
     table = Table(
@@ -654,7 +731,7 @@ def list_models(
             spec.family,
             spec.size,
             str(spec.quantization),
-            spec.repo,
+            spec.origin.description,
         )
     console.print(table)
 

diff --git a/lalamo/model_import/__init__.py b/lalamo/model_import/__init__.py
@@ -1,7 +1,8 @@
-from .common import ModelMetadata, ModelSpec, import_model
+from .common import ModelMetadata, ModelSpec, ModelType, import_model
 
 __all__ = [
     "ModelMetadata",
     "ModelSpec",
+    "ModelType",
     "import_model",
 ]