-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
Copy pathaudio_magnet_16khz.yaml
104 lines (89 loc) · 2.42 KB
/
audio_magnet_16khz.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# @package __global__
# This is the training loop solver
# for the base audio-MAGNeT model (text-to-sound)
# on monophonic audio sampled at 16 kHz
# using a similar EnCodec+LM setup to MAGNeT
# Hydra defaults list: the configs composed into this one, in order.
# `_self_` is listed last, so values set in this file are merged after
# the entries above it.
defaults:
  - audiogen/default
  - /model: lm/audiogen_lm
  - override /dset: audio/default
  - _self_
# Language-model variant and solver selected for this run; both names are
# resolved by the training framework, outside this file.
lm_model: transformer_lm_magnet
solver: audio_magnet
# Autocast is switched on, with float16 as its dtype.
autocast: true
autocast_dtype: float16
# EnCodec large trained on mono-channel music audio sampled at 16 kHz
# with a total stride of 320, leading to 50 frames/s (16000 / 320).
# rvq.n_q=4, rvq.bins=2048, no quantization dropout
# (transformer_lm card and n_q must be compatible)
# NOTE(review): the file header describes a text-to-sound model, while this
# comment says the codec was trained on music audio - confirm which is right.
compression_model_checkpoint: //reference/bd44a852/checkpoint.th
# Mono audio at 16 kHz, as stated in the file header.
channels: 1
sample_rate: 16000
# Deadlock detection.
deadlock:
  use: true
# Dataset settings shared by all splits, plus train-only overrides.
dataset:
  batch_size: 128  # matching AudioGen paper setup (256 * mix_p=0.5 = 128)
  num_workers: 10
  segment_duration: 10
  min_segment_ratio: 1.0
  sample_on_weight: false  # uniform sampling all the way
  sample_on_duration: false  # uniform sampling all the way
  external_metadata_source: null
  # Sample-mixing augmentation applied at train time.
  train:
    batch_size: 256  # matching AudioGen paper setup
    aug_p: 0.5  # perform audio mixing 50% of the time
    # Proportion of batch items mixed together. Important: this reduces the
    # actual batch size used at train time, which will be equal to
    # mix_p * batch_size.
    mix_p: 0.5
    mix_snr_low: -5
    mix_snr_high: 5
    mix_min_overlap: 0.5
# Optimizer settings, with the EMA options grouped under `ema`.
optim:
  epochs: 100
  optimizer: adamw
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # read the dot-less form `5e-4` as a string rather than a float.
  lr: 5.0e-4
  ema:
    use: true
    updates: 10
    device: cuda
# Logging backends.
logging:
  log_tensorboard: true
# Learning-rate schedule. The `inverse_sqrt` sub-section carries the same
# name as the value chosen for `lr_scheduler`.
schedule:
  lr_scheduler: inverse_sqrt
  inverse_sqrt:
    warmup: 3000
    warmup_init_lr: 0.0
# Codebook pattern. The `parallel` sub-section carries the same name as the
# value chosen for `modeling`.
codebooks_pattern:
  modeling: parallel
  parallel:
    empty_initial: -1
# Transformer LM settings.
transformer_lm:
  # Equal to the rvq.bins=2048 quoted for the compression model; card and
  # n_q must be compatible with it.
  card: 2048
  causal: false
  subcodes_context: 5
  # NOTE: must match the actual frame rate of the compression model in use.
  compression_model_framerate: 50
  segment_duration: 0
  span_len: -1
# Masking settings; this `span_len` is a separate key from
# `transformer_lm.span_len`.
masking:
  span_len: 3
# Generation-time settings for the language model.
generate:
  lm:
    max_prompt_len: null
    max_gen_len: null
    remove_prompts: false
    use_sampling: true
    temp: 3.5
    top_k: 0
    top_p: 0.8
    max_cfg_coef: 20.0
    min_cfg_coef: 1.0
    # One entry per codebook level, presumably (rvq.n_q=4 is quoted for the
    # compression model) - TODO confirm.
    decoding_steps: [20, 10, 10, 10]
    anneal_temp: true
    span_scoring: 'max'
    span_arrangement: 'nonoverlap'
    prompted_samples: false
    # NOTE(review): `samples` is assumed to sit under `lm`, next to
    # `prompted_samples` - confirm against the solver that reads it.
    samples:
      prompted: false
      unprompted: true