-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathanon_sttts.yaml
72 lines (65 loc) · 2.58 KB
/
anon_sttts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Base locations and pipeline identity for this anonymization config.
data_dir: data
results_dir: wav # output example ./data/IEMOCAP_ims_sttts_pc/wav
# Pipeline name; other values in this file reference it as <pipeline>.
pipeline: sttts
# Built from the pipeline key via !ref ("_" followed by the pipeline name).
# NOTE(review): presumably appended to anonymized dataset names - confirm against the consumer.
anon_suffix: !ref _<pipeline>
# Datasets handled by this configuration. Every entry carries a `name` and a
# `data` identifier; the two libri entries additionally list `enrolls` and
# `trials` suffixes.
# NOTE(review): the suffixes presumably select enrollment/trial subsets of the
# dataset - confirm against the code that reads this list.
datasets:
  - name: IEMOCAP_dev
    data: IEMOCAP_dev
  - name: IEMOCAP_test
    data: IEMOCAP_test
  - name: libri_dev
    data: libri_dev
    enrolls: [_enrolls]
    trials: [_trials_f, _trials_m]
  - name: libri_test
    data: libri_test
    enrolls: [_enrolls]
    trials: [_trials_f, _trials_m]
  - name: train-clean-360
    data: train-clean-360
# Root directory for model files; module paths reference it as <models_dir>.
models_dir: exp/sttts_models
save_intermediate: true
# Root directory for intermediate results; <pipeline> is filled in from the pipeline key.
intermediate_dir: !ref exp/anon_pipeline_<pipeline>
# For faster inference, download precomputed prosody/speaker_embedding/phn_transcript (libri only)
download_precomputed_intermediate_repr: true
# Per-module settings of the pipeline: speech recognition (asr), speaker
# embedding extraction/anonymization, prosody extraction/anonymization, and
# speech synthesis (tts). Model paths are built from <models_dir>, result paths
# from <intermediate_dir>.
# NOTE(review): nesting was reconstructed from key semantics and from the
# <modules[speaker_embeddings][vec_type]> references; confirm that
# extraction_results_path, anon_results_path and the anon_level_* keys belong
# directly under speaker_embeddings rather than under anon_settings.
modules:
  asr:
    recognizer: ims
    force_compute_recognition: false
    model_path: !ref <models_dir>/asr/asr_branchformer_tts-phn_en.zip
    ctc_weight: 0.2
    # Quoted on purpose: a bare ~ would be parsed as null instead of a string.
    utt_start_token: "~"
    utt_end_token: "~#"
    results_path: !ref <intermediate_dir>/transcription/asr_branchformer_tts-phn_en

  speaker_embeddings:
    anonymizer: ims
    force_compute_extraction: false
    force_compute_anonymization: false
    # Interpolated into the anonymization model paths and result paths below.
    vec_type: style-embed
    emb_model_path: !ref <models_dir>/tts/Embedding/embedding_function.pt
    anon_settings:
      method: gan
      vectors_file: !ref <models_dir>/anonymization/<modules[speaker_embeddings][vec_type]>_wgan.pt
      gan_model_path: !ref <models_dir>/anonymization/gan_<modules[speaker_embeddings][vec_type]>/<modules[speaker_embeddings][vec_type]>_wgan.pt
      num_sampled: 5000
      sim_threshold: 0.7
    extraction_results_path: !ref <intermediate_dir>/original_speaker_embeddings/<modules[speaker_embeddings][vec_type]>
    anon_results_path: !ref <intermediate_dir>/anon_speaker_embeddings/<modules[speaker_embeddings][vec_type]>
    # NOTE(review): presumably dataset names anonymized per speaker vs. per
    # utterance; every dataset of this config is listed under utt - confirm.
    anon_level_spk: []
    anon_level_utt: [IEMOCAP_test, IEMOCAP_dev, libri_dev, libri_test, train-clean-360]

  prosody:
    extractor_type: ims
    force_compute_extraction: false
    aligner_model_path: !ref <models_dir>/tts/Aligner/aligner.pt
    extraction_results_path: !ref <intermediate_dir>/original_prosody/ims_extractor
    anonymizer_type: ims
    # NOTE(review): bounds of the random prosody offset; unit not visible here
    # (looks like percent) - confirm.
    random_offset_lower: 60
    random_offset_higher: 140
    anon_results_path: !ref <intermediate_dir>/anon_prosody/random_offsets

  tts:
    synthesizer: ims
    fastspeech_path: !ref <models_dir>/tts/FastSpeech2_Multi/prosody_cloning.pt
    hifigan_path: !ref <models_dir>/tts/HiFiGAN_combined/best.pt
    # Same file as speaker_embeddings.emb_model_path.
    embeddings_path: !ref <models_dir>/tts/Embedding/embedding_function.pt
    output_sr: 16000
    results_path: !ref <intermediate_dir>/anon_speech/ims_sttts_pc