Skip to content

Commit 1482aec

Browse files
committed
feat: add everyvoice evaluate cli
currently uses TorchAudio-SQUIM
1 parent 4c8bf94 commit 1482aec

File tree

3 files changed

+202
-1
lines changed

3 files changed

+202
-1
lines changed

everyvoice/cli.py

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
import sys
33
from enum import Enum
44
from pathlib import Path
5-
from typing import Any, List
5+
from typing import Any, List, Optional
66

77
import typer
8+
from rich import print as rich_print
9+
from rich.panel import Panel
810

911
from everyvoice._version import VERSION
1012
from everyvoice.base_cli.checkpoint import inspect as inspect_checkpoint
@@ -80,10 +82,126 @@ def list_commands(self, ctx):
8082
## Synthesize
8183
8284
Once you have a trained model, generate some audio by running: everyvoice synthesize [text-to-spec|spec-to-wav] [OPTIONS]
85+
86+
## Evaluate
87+
88+
You can also try to evaluate your model by running: everyvoice evaluate [synthesized_audio.wav|folder_containing_wavs] [OPTIONS]
89+
8390
""",
8491
)
8592

8693

94+
@app.command(
    short_help="Evaluate your synthesized audio",
    name="evaluate",
    help="""
# Evaluation help

This command will evaluate an audio file, or a folder containing multiple audio files. Currently this is done by calculating the metrics from Kumar et. al. 2023.
We will report the predicted Wideband Perceptual Estimation of Speech Quality (PESQ), Short-Time Objective Intelligibility (STOI), and Scale-Invariant Signal-to-Distortion Ratio (SI-SDR) by default.
We will also report the estimation of subjective Mean Opinion Score (MOS) if a Non-Matching Reference is provided. Please refer to Kumar et. al. for more information.



Kumar, Anurag, et al. “TorchAudio-Squim: Reference-less Speech Quality and Intelligibility measures in TorchAudio.” ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2023.
""",
)
def evaluate(
    audio_file: Optional[Path] = typer.Option(
        None,
        "--audio-file",
        "-f",
        exists=True,
        dir_okay=False,
        file_okay=True,
        help="The path to a single audio file for evaluation.",
        autocompletion=complete_path,
    ),
    audio_directory: Optional[Path] = typer.Option(
        None,
        "--audio-directory",
        "-d",
        file_okay=False,
        dir_okay=True,
        help="The directory where multiple audio files are located for evaluation",
        autocompletion=complete_path,
    ),
    non_matching_reference: Optional[Path] = typer.Option(
        None,
        "--non-matching-reference",
        "-r",
        exists=True,
        dir_okay=False,
        file_okay=True,
        help="The path to a Non Matching Reference audio file, required for MOS prediction.",
        autocompletion=complete_path,
    ),
):
    """Evaluate a single wav file or every *.wav in a directory with the
    Torchaudio-Squim objective model (STOI/PESQ/SI-SDR), plus the subjective
    model (MOS) when a non-matching reference is supplied.

    Exits with status 1 on invalid argument combinations; prints a table of
    results and, for a directory, also writes them to evaluation.json.
    """
    # Heavy dependencies are imported lazily so `everyvoice --help` stays fast.
    import json

    from tabulate import tabulate
    from tqdm import tqdm

    from everyvoice.evaluation import (
        calculate_objective_metrics_from_single_path,
        calculate_subjective_metrics_from_single_path,
        load_squim_objective_model,
        load_squim_subjective_model,
    )

    # Validate the argument combination *before* loading any (large) models.
    if audio_file and audio_directory:
        print(
            "Sorry, please choose to evaluate either a single file or an entire directory. Got values for both."
        )
        sys.exit(1)
    if not audio_file and not audio_directory:
        print(
            "Sorry, please provide either an audio file (--audio-file) or a directory of audio files (--audio-directory) to evaluate."
        )
        sys.exit(1)

    HEADERS = ["STOI", "PESQ", "SI-SDR"]

    objective_model, o_sr = load_squim_objective_model()
    if non_matching_reference:
        # MOS prediction needs the subjective model and a reference signal.
        subjective_model, s_sr = load_squim_subjective_model()
        HEADERS.append("MOS")

    def calculate_row(single_audio):
        # One table row per audio file: objective metrics, then MOS if available.
        stoi, pesq, si_sdr = calculate_objective_metrics_from_single_path(
            single_audio, objective_model, o_sr
        )
        row = [stoi, pesq, si_sdr]
        if non_matching_reference:
            mos = calculate_subjective_metrics_from_single_path(
                single_audio, non_matching_reference, subjective_model, s_sr
            )
            row.append(mos)
        return row

    if audio_file:
        row = calculate_row(audio_file)
        rich_print(
            Panel(
                tabulate([row], HEADERS, tablefmt="simple"),
                title=f"Objective Metrics for {audio_file}:",
            )
        )
        sys.exit(0)

    if audio_directory:
        results = []
        for wav_file in tqdm(
            audio_directory.glob("*.wav"),
            desc=f"Evaluating files in {audio_directory}",
        ):
            results.append(calculate_row(wav_file))
        rich_print(
            Panel(
                tabulate(results, HEADERS, tablefmt="simple"),
                title=f"Objective Metrics for files in {audio_directory}:",
            )
        )
        print(f"Printing results to {audio_directory / 'evaluation.json'}")
        with open(audio_directory / "evaluation.json", "w") as f:
            json.dump(results, f)
203+
204+
87205
class ModelTypes(str, Enum):
88206
text_to_spec = "text-to-spec"
89207
spec_to_wav = "spec-to-wav"

everyvoice/evaluation.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from os import PathLike
2+
from typing import Any, BinaryIO, Union
3+
4+
5+
def load_squim_objective_model() -> tuple[Any, int]:
    """Load Torchaudio's objective SQUIM model (STOI/PESQ/SI-SDR predictor).

    See https://pytorch.org/audio/main/tutorials/squim_tutorial.html

    Returns:
        tuple[Any, int]: the loaded model and the sampling rate it expects
    """
    from torchaudio.pipelines import SQUIM_OBJECTIVE

    # The pretrained SQUIM pipelines operate on 16 kHz audio.
    squim_model = SQUIM_OBJECTIVE.get_model()
    return (squim_model, 16000)
16+
17+
18+
def load_squim_subjective_model() -> tuple[Any, int]:
    """Load Torchaudio's subjective SQUIM model (MOS predictor).

    See https://pytorch.org/audio/main/tutorials/squim_tutorial.html

    Returns:
        tuple[Any, int]: the loaded model and the sampling rate it expects
    """
    from torchaudio.pipelines import SQUIM_SUBJECTIVE

    # The pretrained SQUIM pipelines operate on 16 kHz audio.
    squim_model = SQUIM_SUBJECTIVE.get_model()
    return (squim_model, 16000)
29+
30+
31+
def process_audio(path: Union[BinaryIO, str, PathLike], sampling_rate: int):
    """Load an audio file and prepare it for a SQUIM model.

    The waveform is resampled to ``sampling_rate`` if needed and given a
    channel dimension when missing.

    Args:
        path: the audio file to load
        sampling_rate: the sampling rate the downstream model expects

    Returns:
        a (1, num_samples) waveform tensor

    Raises:
        ValueError: if the audio has more than one channel
    """
    import torchaudio

    waveform, source_sr = torchaudio.load(str(path))
    # Resample when the source rate differs from what the model expects.
    if source_sr != sampling_rate:
        waveform = torchaudio.functional.resample(waveform, source_sr, sampling_rate)
    # Guarantee a leading channel dimension.
    if waveform.dim() < 2:
        waveform = waveform.unsqueeze(0)
    # SQUIM models only accept single-channel input.
    if waveform.size(0) != 1:
        raise ValueError("Audio for evaluation must be mono (single channel)")
    return waveform
45+
46+
47+
def calculate_objective_metrics_from_single_path(
    audio_path, model, model_sampling_rate
) -> tuple[float, float, float]:
    """Predict objective quality metrics for one audio file.

    Args:
        audio_path: path to the audio file to score
        model: a loaded SQUIM objective model
        model_sampling_rate: the sampling rate the model expects

    Returns:
        tuple[float, float, float]: predicted (STOI, PESQ, SI-SDR)
    """
    import torch

    waveform = process_audio(audio_path, model_sampling_rate)
    # Inference only — no gradients needed.
    with torch.no_grad():
        predictions = model(waveform)
    # Convert the returned tensors to plain Python floats.
    stoi_hyp, pesq_hyp, si_sdr_hyp = (float(p) for p in predictions)
    return stoi_hyp, pesq_hyp, si_sdr_hyp
56+
57+
58+
def calculate_subjective_metrics_from_single_path(
    audio_path, non_matching_reference_path, model, model_sampling_rate
) -> float:
    """Predict the subjective Mean Opinion Score (MOS) for one audio file.

    Args:
        audio_path: path to the audio file to score
        non_matching_reference_path: path to a non-matching reference recording
        model: a loaded SQUIM subjective model
        model_sampling_rate: the sampling rate the model expects

    Returns:
        float: the predicted MOS
    """
    import torch

    waveform = process_audio(audio_path, model_sampling_rate)
    reference = process_audio(non_matching_reference_path, model_sampling_rate)
    # Inference only — no gradients needed.
    with torch.no_grad():
        mos_prediction = model(waveform, reference)
    return float(mos_prediction)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from everyvoice.evaluation import (
2+
calculate_objective_metrics_from_single_path,
3+
load_squim_objective_model,
4+
)
5+
from everyvoice.tests.basic_test_case import BasicTestCase
6+
7+
8+
class EvaluationTest(BasicTestCase):
    """Tests for the Torchaudio-Squim based evaluation helpers."""

    def test_squim_evaluation(self):
        # Score a known fixture file with the objective SQUIM model.
        model, sr = load_squim_objective_model()
        stoi, pesq, si_sdr = calculate_objective_metrics_from_single_path(
            self.data_dir / "LJ010-0008.wav", model, sr
        )
        # STOI is bounded above by 1.0.
        self.assertLess(stoi, 1)
        # Model outputs can drift slightly across torch/torchaudio versions and
        # hardware, so compare against the reference values with a tolerance
        # instead of exact equality on rounded floats.
        self.assertAlmostEqual(pesq, 3.88, delta=0.05)
        self.assertAlmostEqual(si_sdr, 28.64, delta=0.5)

0 commit comments

Comments
 (0)