forked from wolfgitpr/HubertFA
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake_data_config.py
More file actions
70 lines (56 loc) · 2.33 KB
/
make_data_config.py
File metadata and controls
70 lines (56 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pathlib
import click
import pandas as pd
import yaml
@click.command()
@click.option("--data_folder",
              type=click.Path(exists=True, file_okay=False),
              required=True,
              help="Root directory containing dataset folders")
@click.option("--language",
              default="zh",
              show_default=True,
              help="Language identifier for the dataset")
@click.option("--abs", "absolute",
              is_flag=True,
              help="Use absolute paths (default: relative)")
@click.option("--output",
              default=None,
              help="Output config filename (auto-generated if not specified)")
def main(data_folder, language, absolute, output):
    """Generate DiffSinger dataset config from directory structure
    \f

    Everything after the form feed above is hidden from ``--help`` output
    (click truncates help text at that marker).

    Recursively searches ``data_folder`` for ``transcriptions.csv`` files.
    Each one that sits next to a ``wavs`` directory, has at least two data
    rows, and has both ``name`` and ``ph_seq`` columns becomes one entry
    in the ``datasets`` list of the output YAML file. Files failing any of
    these checks are reported on stderr and skipped.

    Args:
        data_folder: Existing directory to search.
        language: Value stored under the ``language`` key of every entry.
        absolute: When true, store ``raw_data_dir`` as an absolute path;
            otherwise store the path as found under ``data_folder``.
        output: Output YAML filename. When empty or None,
            ``datasets_config_{language}.yaml`` is used.
    """
    # Handle default output filename
    out_data_config = output or f"datasets_config_{language}.yaml"

    datasets = []
    # rglob yields in filesystem order; sort so the generated config is
    # reproducible across runs and machines.
    for csv_path in sorted(pathlib.Path(data_folder).rglob("transcriptions.csv")):
        parent_dir = csv_path.parent
        wavs_dir = parent_dir / "wavs"
        if not wavs_dir.is_dir():
            click.echo(f"wavs folder does not exist: {wavs_dir}", err=True)
            continue

        # Deliberately broad: any unreadable or malformed CSV is reported
        # and skipped so that one bad dataset does not abort the whole scan.
        try:
            df = pd.read_csv(csv_path, encoding="utf-8")
            if len(df) < 2:
                raise ValueError(f"{csv_path} must contain at least 2 rows")
            if "ph_seq" not in df.columns:
                raise ValueError(f"{csv_path} is missing 'ph_seq' column")
            if "name" not in df.columns:
                raise ValueError(f"{csv_path} is missing 'name' column")

            # Determine label type
            label_type = "full" if "ph_dur" in df.columns else "weak"

            # Build path representation
            raw_path = str(parent_dir.absolute()) if absolute else str(parent_dir)

            datasets.append({
                "raw_data_dir": raw_path,
                "label_type": label_type,
                "language": language,
                # Positional access: the first row's name is used as the
                # test prefix regardless of how the index is labelled.
                "test_prefixes": [str(df["name"].iloc[0])],
            })
        except Exception as e:
            click.echo(f"Error processing {csv_path}: {str(e)}", err=True)
            continue

    if not datasets:
        # The file is still written, but make the empty result visible.
        click.echo(f"Warning: no valid datasets found under {data_folder}", err=True)

    try:
        with open(out_data_config, "w", encoding="utf-8") as f:
            yaml.dump({"datasets": datasets}, f, allow_unicode=True)
    except OSError as e:
        click.echo(f"Failed to write config: {str(e)}", err=True)
    else:
        click.echo(f"Config successfully generated: {out_data_config}")
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()