-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathforce_alignment.yaml
More file actions
143 lines (124 loc) · 3.01 KB
/
force_alignment.yaml
File metadata and controls
143 lines (124 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Run identity and global numeric settings.
model_name: '1218_fa'
float32_matmul_precision: 'high'
random_seed: 123456

# Extra phonemes are not enabled yet, so the list stays empty.
extra_phonemes: []

# Labels listed as silent phonemes (includes the empty string).
silent_phonemes:
  - 'SP'
  - '<SP>'
  - ''
  - 'pau'
  - 'cl'
  - 'CL'
  - 'AP'
  - 'EP'
  - 'GS'
# When enabled, dictionary keys automatically get a language prefix and
# multiple dictionaries are kept isolated from each other.
language_prefix: true

# One dictionary file per language key.
# Dictionary line format: word <TAB> phoneme_1 <SPACE> phoneme_2
dictionaries:
  zh: dictionaries/opencpop-extension.txt
  ja: dictionaries/japanese_dict_full.txt
  en: dictionaries/ds_cmudict-07b.txt
  # yue: dictionaries/jyutping_dict.txt
# Dataset definition files.
datasets_config_paths:
  - configs/datasets_config.yaml

# Folder that receives the preprocessing results.
binary_folder: data/binary_fa

# When valid_set_size is greater than 0, validation items are picked at random
# by quantity; otherwise the validation set defined in the dataset is used.
valid_set_size: 20

# Maximum duration of a single audio clip.
# NOTE(review): unit is not stated here - presumably seconds; confirm.
max_length: 45

# Preprocessing worker count and temporary queue sizes.
# Enabling multiprocessing is not recommended when the data volume is small
# (within about 5 hours). Budget roughly 6-8 GB of RAM and 6 GB of video
# memory per worker.
multiprocess_works: 0
# size * num_works. Do not set this too high; make sure memory is sufficient.
multiprocess_max_size: 200
# Minimum amount of data required before startup.
multiprocess_start_size: 100

# Number of training data loading processes.
dataloader_workers: 2
dataloader_prefetch_factor: 2
batch_max_length: 200
binning_length: 1000
drop_last: false

# Maximum number of plots drawn for validation and evaluation items.
num_valid_plots: 20
draw_evaluate: true
# Data augmentation. Known code defect: do not enable.
augmentation_args:
  enabled: false
  random_pitch_shifting:
    range: [-5.0, 5.0]
    num: 0
  blank_padding:
    range: [0, 5]  # seconds
    num: 2
# Hyperparameters grouped under fa_arg.
# NOTE(review): nesting inferred from key order - confirm that all six keys
# belong to fa_arg in the consumer's schema.
fa_arg:
  hidden_dims: 192
  down_sampling_factor: 2
  down_sampling_times: 3
  channels_scaleup_factor: 1.5
  dropout: 0.1
  curves_attention_dropout: 0.1
# Optimizer and learning-rate settings.
# muon_args and adamw_args each carry their own weight_decay, so they must be
# separate nested mappings (at one level the key would be duplicated).
optimizer_config:
  lr: 0.0003
  gamma: 0.9999
  total_steps: 10000
  muon_args:
    weight_decay: 0.1
  adamw_args:
    weight_decay: 0.0
# Loss settings.
# weights and enable_RampUpScheduler are parallel five-element lists
# (presumably one entry per loss term - confirm against the consumer).
loss_config:
  losses:
    weights: [8.0, 0.1, 1.0, 6.0, 5.0]
    enable_RampUpScheduler: [false, false, false, true, false]
  function:
    num_bins: 10
    alpha: 0.999
    label_smoothing: 0.08
# Trainer settings.
accelerator: 'auto'
# Number of devices.
devices: 'auto'
# Choices noted by the authors: bf16-mixed, 32-true.
precision: 'bf16-mixed'
gradient_clip_val: 0.5
gradient_clip_algorithm: 'norm'
val_check_interval: 1000
save_top_k: 5
save_every_steps: 1000
# Normally not changed.
# sample_rate and hop_size are scoped to this mapping; the same key names
# appear again under mel_spec_config with different values.
hubert_config:
  encoder: cn_hubert
  model_path: dependencies/chinese-hubert-base
  sample_rate: 16000
  hop_size: 320
  channel: 768
# Mel-spectrogram settings. sample_rate and hop_size here are scoped to this
# mapping and are independent of the same-named keys in hubert_config.
mel_spec_config:
  n_mels: 128
  sample_rate: 44100
  window_size: 882
  hop_size: 441
  n_fft: 1764
  f_min: 40
  f_max: 16000
  clamp: 0.00001
# Phoneme merging. Not recommended to enable.
merged_phoneme: false

# Groups of language-prefixed phonemes to merge. Each inner list is one group.
merged_phoneme_groups:
  - [zh/f, en/f]
  - [zh/m, en/m, ja/m]
  - [zh/w, en/w, ja/w]
  - [zh/s, en/s, ja/s]
  - [zh/n, en/n, ja/n]
  - [zh/c, ja/ts]
  - [zh/y, en/y, ja/y]
  - [zh/l, en/l]
  - [zh/x, ja/sh]
  - [ja/z, en/z]