This repository has been archived by the owner on Feb 17, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_dataset.py
126 lines (108 loc) · 3.73 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import random
import subprocess
# Paths and processing parameters.

# Character / speaker identifiers. speaker_id is written into every filelist
# line unless it is None.
character_name = "taki"
character_id = 40
speaker_id = 4

# Source mp3 clips are read from a per-character folder; converted wav files
# are written to output_dir.
output_dir = "dataset/"
input_dir = f"voice/{character_id}/"

# Filelists produced by this script.
train_file, val_file = f"{character_id}_train.txt", f"{character_id}_val.txt"

# Target audio format.
channels = 1
bit_depth = 16
sample_rate = 22050

# Bounds compared against the ffprobe-reported duration of each clip.
min_duration, max_duration = 1, 10

# Ratio of the number of train samples to the number of val samples.
train_val_ratio = 20
# Create the output directory. exist_ok=True makes this a no-op when the
# directory is already there and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
# Collect every mp3 file name in the input directory.
# os.listdir() returns entries in arbitrary order, so the list is sorted: the
# wav names below are derived from the position in this list, and the
# "already converted" check would otherwise be able to pair a clip with a wav
# that was produced from a different mp3 on an earlier run (for an unchanged
# directory the mapping is now reproducible).
mp3_files = sorted(file for file in os.listdir(input_dir) if file.endswith(".mp3"))

# Maps the mp3 file name without its ".mp3" suffix to the wav base name
# "<character_name>_<idx>" (no extension).
transcript_dict = {}

# Clips whose conversion failed; they are removed from mp3_files afterwards.
unconverted = set()

# Convert every mp3 file to a wav file.
for idx, file in enumerate(mp3_files):
    mp3_path = os.path.join(input_dir, file)
    wav_filename = f"{character_name}_{idx}.wav"
    wav_path = os.path.join(output_dir, wav_filename)
    transcript_dict[file[:-4]] = f"{character_name}_{idx}"

    # Skip clips that were converted on a previous run.
    if os.path.exists(wav_path):
        continue

    # Use ffmpeg to convert the mp3 to wav. The codec is fixed to signed
    # 16-bit little-endian PCM; channel count and sample rate come from the
    # configuration above.
    exit_code = subprocess.call(
        [
            "ffmpeg",
            "-i",
            mp3_path,
            "-acodec",
            "pcm_s16le",
            "-ac",
            str(channels),
            "-ar",
            str(sample_rate),
            wav_path,
        ]
    )
    if exit_code != 0:
        print(f"ffmpeg failed for {mp3_path} (exit code {exit_code}); skipping it")
        # ffmpeg may leave a truncated file behind; remove it so that the
        # existence check above retries the conversion on the next run.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        unconverted.add(file)

# Only clips that actually have a wav file take part in the later steps.
mp3_files = [file for file in mp3_files if file not in unconverted]
def _probe_duration(wav_path):
    """Return the duration ffprobe reports for *wav_path*, or None.

    None is returned when ffprobe exits with an error or prints something
    that cannot be parsed as a float (for example an empty string).
    """
    try:
        raw_output = subprocess.check_output(
            [
                "ffprobe",
                "-i",
                wav_path,
                "-show_entries",
                "format=duration",
                "-v",
                "quiet",
                "-of",
                "csv=p=0",
            ]
        )
    except subprocess.CalledProcessError:
        return None
    try:
        return float(raw_output.decode("utf-8"))
    except ValueError:
        return None


def _write_split(list_path, files):
    """Write the filelist *list_path* for *files* and return the clips kept.

    Each kept clip produces one line, "wav_path|speaker_id|transcript", or
    "wav_path|transcript" when speaker_id is None. The transcript is the mp3
    file name without its ".mp3" suffix. Clips whose duration is outside
    [min_duration, max_duration], or whose duration cannot be determined,
    are left out.
    """
    kept = []
    with open(list_path, "w", encoding="UTF-8") as list_txt:
        for file in files:
            transcript = file[:-4]
            wav_path = os.path.join(output_dir, transcript_dict[transcript] + ".wav")
            # Get the duration of the wav file.
            duration = _probe_duration(wav_path)
            if duration is None:
                print(f"Skipping {wav_path}: ffprobe could not report a duration")
                continue
            # Filter out audio files that violate the duration limits.
            if not min_duration <= duration <= max_duration:
                continue
            if speaker_id is not None:
                list_txt.write(f"{wav_path}|{speaker_id}|{transcript}\n")
            else:
                list_txt.write(f"{wav_path}|{transcript}\n")
            kept.append(file)
    return kept


# Randomly select part of the files as training candidates:
# train_val_ratio / (train_val_ratio + 1) of all clips, rounded down.
num_train_samples = int(len(mp3_files) / (train_val_ratio + 1) * train_val_ratio)
train_candidates = random.sample(mp3_files, num_train_samples)

# Every clip that was not drawn for training is a validation candidate.
# Filtering the original list (instead of a set difference) keeps the
# validation filelist in the same order as mp3_files.
train_lookup = set(train_candidates)
val_candidates = [file for file in mp3_files if file not in train_lookup]

# Write both filelists. train_files / val_files hold only the clips that were
# actually written, so counts taken from them match the filelists.
train_files = _write_split(train_file, train_candidates)
val_files = _write_split(val_file, val_candidates)
# Report completion: a success message, the train / val sample counts, and
# the names of the two filelists.
summary_lines = (
    "数据集创建成功!",
    f"训练样本数: {len(train_files)}",
    f"验证样本数: {len(val_files)}",
    f"训练样本文件: {train_file}",
    f"验证样本文件: {val_file}",
)
for summary_line in summary_lines:
    print(summary_line)