HubertFA/configs/force_alignment.yaml at main · wolfgitpr/HubertFA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Run identity and global numeric settings.
# Quoted so that the digit-leading name is always read as a string.
model_name: '1218_fa'

# NOTE(review): presumably forwarded to torch.set_float32_matmul_precision - confirm.
float32_matmul_precision: high
# Fixed seed value; where it is applied is not visible in this file.
random_seed: 123456

extra_phonemes: []  # Not yet enabled.
# Labels in the silent-phoneme set; note that the empty string is one of them.
silent_phonemes:
  - 'SP'
  - '<SP>'
  - ''
  - 'pau'
  - 'cl'
  - 'CL'
  - 'AP'
  - 'EP'
  - 'GS'

# When enabled, language prefixes are added automatically to dictionary keys,
# which keeps the entries of multiple dictionaries isolated from each other.
language_prefix: true
# Maps a language code to its dictionary file.
# Dictionary line format: word <TAB> phoneme_1 <SPACE> phoneme_2
dictionaries:
  zh: dictionaries/opencpop-extension.txt
  ja: dictionaries/japanese_dict_full.txt
  en: dictionaries/ds_cmudict-07b.txt
#  yue: dictionaries/jyutping_dict.txt

# Dataset configuration files.
datasets_config_paths:
  - configs/datasets_config.yaml

# Folder that receives the preprocessing results.
binary_folder: data/binary_fa
# When valid_set_size is greater than 0, that many validation items are
# selected at random; otherwise the validation set defined in the dataset is used.
valid_set_size: 20
# Maximum duration of a single audio clip.
# NOTE(review): unit is presumably seconds - confirm.
max_length: 45

# Multiprocess preprocessing: number of workers and temporary queue sizes.
# Not recommended when the dataset is small (roughly 5 hours or less).
# Rule of thumb: one worker per 6-8 GB of RAM plus 6 GB of video memory.
multiprocess_works: 0
# size * num_works.
# NOTE(review): presumably large values require correspondingly more memory - confirm.
multiprocess_max_size: 200
# Minimum amount of data required before startup.
multiprocess_start_size: 100

# Training data loading.
# Number of training data loading processes.
dataloader_workers: 2
# NOTE(review): the names below resemble torch DataLoader options
# (prefetch_factor, drop_last) - confirm against the loader code.
dataloader_prefetch_factor: 2
batch_max_length: 200
binning_length: 1000
drop_last: false

# Maximum number of plots drawn for validation and evaluation items.
num_valid_plots: 20
draw_evaluate: true

# Data augmentation. Known code defect: do not enable.
augmentation_args:
  enabled: false
  random_pitch_shifting:
    range: [-5.0, 5.0]
    num: 0
  blank_padding:
    range: [0, 5]  # seconds
    num: 2

# Model hyperparameters (presumably for the forced-alignment network).
# NOTE(review): the consuming model code is not visible here; confirm the
# meaning of each key there before changing it.
fa_arg:
  hidden_dims: 192
  down_sampling_factor: 2
  down_sampling_times: 3
  channels_scaleup_factor: 1.5
  dropout: 0.1

  curves_attention_dropout: 0.1

# Optimizer settings.
optimizer_config:
  lr: 0.0003
  # NOTE(review): presumably a multiplicative learning-rate decay factor - confirm.
  gamma: 0.9999
  total_steps: 10000
  # Weight decay is set separately under muon_args and adamw_args.
  muon_args:
    weight_decay: 0.1
  adamw_args:
    weight_decay: 0.0

# Loss configuration.
loss_config:
  losses:
    # Both lists have five entries.
    # NOTE(review): presumably index-aligned, one entry per loss term - confirm.
    weights: [8.0, 0.1, 1.0, 6.0, 5.0]
    enable_RampUpScheduler: [false, false, false, true, false]
  function:
    num_bins: 10
    alpha: 0.999
    label_smoothing: 0.08

# Trainer settings.
# NOTE(review): key names resemble PyTorch Lightning Trainer arguments - confirm.
accelerator: auto
# Number of devices, or auto.
devices: auto
# Values noted for this option: bf16-mixed, 32-true.
precision: bf16-mixed
gradient_clip_val: 0.5
gradient_clip_algorithm: norm
val_check_interval: 1000

# NOTE(review): presumably checkpoint-saving options (inferred from key names) - confirm.
save_top_k: 5
save_every_steps: 1000

# Normally not changed
hubert_config:
  encoder: cn_hubert
  model_path: dependencies/chinese-hubert-base
  sample_rate: 16000
  # 320 / 16000 = 0.02, i.e. 20 ms per hop if hop_size is in samples and
  # sample_rate is in Hz (TODO confirm units).
  hop_size: 320
  channel: 768

# Mel-spectrogram settings.
mel_spec_config:
  n_mels: 128
  sample_rate: 44100
  # window_size equals 2 * hop_size, and n_fft equals 2 * window_size.
  window_size: 882
  # 441 / 44100 = 0.01, i.e. 10 ms per hop if hop_size is in samples and
  # sample_rate is in Hz (TODO confirm units).
  hop_size: 441
  n_fft: 1764
  f_min: 40
  f_max: 16000
  clamp: 0.00001

# Not recommended to enable.
merged_phoneme: false
# Phoneme groups to merge. Each member is written as language/phoneme, and each
# inner list is one group.
merged_phoneme_groups:
  - [zh/f, en/f]
  - [zh/m, en/m, ja/m]
  - [zh/w, en/w, ja/w]
  - [zh/s, en/s, ja/s]
  - [zh/n, en/n, ja/n]
  - [zh/c, ja/ts]
  - [zh/y, en/y, ja/y]
  - [zh/l, en/l]
  - [zh/x, ja/sh]
  - [ja/z, en/z]