You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# Sortformer Diarizer model checkpoints (.ckpt) and NeMo files (.nemo) contain the Fast Conformer Encoder model (NEST Encoder); the pre-trained NEST model is loaded along with the Transformer Encoder layers.
weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5])
186
+
reduction: mean
187
+
188
+
lr: 0.0001
189
+
# Optimizer and learning-rate scheduler settings.
# NOTE(review): indentation was lost in this listing; the nesting below is
# reconstructed (`sched` is placed under `optim` because both define `name`,
# which would otherwise be a duplicate key) — confirm against the original file.
optim:
  name: adamw
  lr: ${model.lr}
  # optimizer arguments
  betas: [0.9, 0.98]
  # Written with an explicit decimal point so that YAML 1.1 loaders also
  # resolve the value as a float rather than a string.
  weight_decay: 1.0e-3

  sched:
    name: InverseSquareRootAnnealing
    warmup_steps: 500
    warmup_ratio: null
    # Explicit decimal point for the same reason as `weight_decay` above.
    min_lr: 1.0e-06
201
+
202
+
# Trainer settings.
# NOTE(review): indentation was lost in this listing; the keys below are
# assumed to be children of `trainer` — confirm against the original file.
trainer:
  devices: 1  # number of gpus (devices)
  accelerator: gpu
  max_epochs: 800
  max_steps: -1  # computed at runtime if not set
  num_nodes: 1
  strategy: ddp_find_unused_parameters_true  # Could be "ddp"
  accumulate_grad_batches: 1
  deterministic: True
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 1  # Interval of logging.
  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  val_check_interval: 1.0
215
+
216
+
# Experiment manager settings.
# NOTE(review): indentation was lost in this listing; the keys below are
# assumed to be children of `exp_manager` — confirm against the original file.
exp_manager:
  use_datetime_version: False
  exp_dir: null
  name: ${name}
  resume_if_exists: True
  # The path to a checkpoint file to continue the training; restores the whole
  # state including the epoch, step, LR schedulers, apex, etc.
  resume_from_checkpoint: null
# Postprocessing parameters for timestamp outputs from speaker diarization models.
# This speaker diarization postprocessing scheme is inspired by the
# postprocessing procedure in the following paper:
# Medennikov, Ivan, et al. "Target-Speaker Voice Activity Detection: a Novel
# Approach for Multi-Speaker Diarization in a Dinner Party Scenario." (2020).
# These parameters were optimized on CallHome Dataset from the NIST SRE 2000
# Disc8, especially from the part1 (callhome1) specified in:
# Kaldi, “Kaldi x-vector recipe v2,”
# https://github.com/kaldi-asr/kaldi/blob/master/egs/callhome_diarization/v2/run.sh
parameters:
  onset: 0.641  # Onset threshold for detecting the beginning and end of a speech
  offset: 0.561  # Offset threshold for detecting the end of a speech
  pad_onset: 0.229  # Adding durations before each speech segment
  pad_offset: 0.079  # Adding durations after each speech segment
  min_duration_on: 0.511  # Threshold for small non-speech deletion
  min_duration_off: 0.296  # Threshold for short speech segment deletion
# Postprocessing parameters for timestamp outputs from speaker diarization models.
# This speaker diarization postprocessing scheme is inspired by the
# postprocessing procedure in the following paper:
# Medennikov, Ivan, et al. "Target-Speaker Voice Activity Detection: a Novel
# Approach for Multi-Speaker Diarization in a Dinner Party Scenario." (2020).
# These parameters were optimized on the development split of DIHARD3 dataset
# (See https://arxiv.org/pdf/2012.01477).
parameters:
  onset: 0.56  # Onset threshold for detecting the beginning and end of a speech
  offset: 1.0  # Offset threshold for detecting the end of a speech
  pad_onset: 0.063  # Adding durations before each speech segment
  pad_offset: 0.002  # Adding durations after each speech segment
  min_duration_on: 0.007  # Threshold for small non-speech deletion
  min_duration_off: 0.151  # Threshold for short speech segment deletion
0 commit comments