|
2 | 2 | import sys |
3 | 3 | from enum import Enum |
4 | 4 | from pathlib import Path |
5 | | -from typing import Any, List |
| 5 | +from typing import Any, List, Optional |
6 | 6 |
|
7 | 7 | import typer |
| 8 | +from rich import print as rich_print |
| 9 | +from rich.panel import Panel |
8 | 10 |
|
9 | 11 | from everyvoice._version import VERSION |
10 | 12 | from everyvoice.base_cli.checkpoint import inspect as inspect_checkpoint |
@@ -80,10 +82,126 @@ def list_commands(self, ctx): |
80 | 82 | ## Synthesize |
81 | 83 |
|
82 | 84 | Once you have a trained model, generate some audio by running: everyvoice synthesize [text-to-spec|spec-to-wav] [OPTIONS] |
| 85 | +
|
| 86 | + ## Evaluate |
| 87 | +
|
| 88 | + You can also try to evaluate your model by running: everyvoice evaluate [synthesized_audio.wav|folder_containing_wavs] [OPTIONS] |
| 89 | +
|
83 | 90 | """, |
84 | 91 | ) |
85 | 92 |
|
86 | 93 |
|
@app.command(
    short_help="Evaluate your synthesized audio",
    name="evaluate",
    help="""
    # Evaluation help

    This command will evaluate an audio file, or a folder containing multiple audio files. Currently this is done by calculating the metrics from Kumar et. al. 2023.
    We will report the predicted Wideband Perceptual Estimation of Speech Quality (PESQ), Short-Time Objective Intelligibility (STOI), and Scale-Invariant Signal-to-Distortion Ratio (SI-SDR) by default.
    We will also report the estimation of subjective Mean Opinion Score (MOS) if a Non-Matching Reference is provided. Please refer to Kumar et. al. for more information.



    Kumar, Anurag, et al. “TorchAudio-Squim: Reference-less Speech Quality and Intelligibility measures in TorchAudio.” ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2023.
    """,
)
def evaluate(
    audio_file: Optional[Path] = typer.Option(
        None,
        "--audio-file",
        "-f",
        exists=True,
        dir_okay=False,
        file_okay=True,
        help="The path to a single audio file for evaluation.",
        autocompletion=complete_path,
    ),
    audio_directory: Optional[Path] = typer.Option(
        None,
        "--audio-directory",
        "-d",
        file_okay=False,
        dir_okay=True,
        help="The directory where multiple audio files are located for evaluation",
        autocompletion=complete_path,
    ),
    non_matching_reference: Optional[Path] = typer.Option(
        None,
        "--non-matching-reference",
        "-r",
        exists=True,
        dir_okay=False,
        file_okay=True,
        help="The path to a Non Matching Reference audio file, required for MOS prediction.",
        autocompletion=complete_path,
    ),
):
    """Evaluate a single audio file or every ``*.wav`` file in a directory.

    Reports STOI, PESQ, and SI-SDR predicted by the TorchAudio-Squim
    objective model; if a non-matching reference is supplied, a predicted
    MOS is reported as well. Directory results are also written to
    ``<audio_directory>/evaluation.json``.
    """
    # Local imports keep heavy dependencies (and torch, transitively) out of
    # CLI startup; json is needed below for the directory results dump.
    import json

    from tabulate import tabulate
    from tqdm import tqdm

    from everyvoice.evaluation import (
        calculate_objective_metrics_from_single_path,
        calculate_subjective_metrics_from_single_path,
        load_squim_objective_model,
        load_squim_subjective_model,
    )

    # Validate the mutually exclusive inputs *before* loading any models,
    # so bad invocations fail fast.
    if audio_file and audio_directory:
        print(
            "Sorry, please choose to evaluate either a single file or an entire directory. Got values for both."
        )
        sys.exit(1)
    if not audio_file and not audio_directory:
        print(
            "Sorry, please provide either a single audio file (--audio-file) or a directory of audio files (--audio-directory) to evaluate."
        )
        sys.exit(1)

    headers = ["STOI", "PESQ", "SI-SDR"]

    objective_model, o_sr = load_squim_objective_model()
    if non_matching_reference:
        subjective_model, s_sr = load_squim_subjective_model()
        headers.append("MOS")

    def calculate_row(single_audio):
        # One table row per audio file: objective metrics, plus MOS when a
        # non-matching reference was provided.
        stoi, pesq, si_sdr = calculate_objective_metrics_from_single_path(
            single_audio, objective_model, o_sr
        )
        row = [stoi, pesq, si_sdr]
        if non_matching_reference:
            mos = calculate_subjective_metrics_from_single_path(
                single_audio, non_matching_reference, subjective_model, s_sr
            )
            row.append(mos)
        return row

    if audio_file:
        row = calculate_row(audio_file)
        rich_print(
            Panel(
                tabulate([row], headers, tablefmt="simple"),
                title=f"Objective Metrics for {audio_file}:",
            )
        )
        sys.exit(0)

    if audio_directory:
        results = []
        for wav_file in tqdm(
            audio_directory.glob("*.wav"),
            desc=f"Evaluating files in {audio_directory}",
        ):
            results.append(calculate_row(wav_file))
        rich_print(
            Panel(
                tabulate(results, headers, tablefmt="simple"),
                title=f"Objective Metrics for files in {audio_directory}:",
            )
        )
        print(f"Printing results to {audio_directory / 'evaluation.json'}")
        with open(audio_directory / "evaluation.json", "w") as f:
            json.dump(results, f)
| 204 | + |
87 | 205 | class ModelTypes(str, Enum): |
88 | 206 | text_to_spec = "text-to-spec" |
89 | 207 | spec_to_wav = "spec-to-wav" |
|
0 commit comments