diff --git a/data/librispeech.py b/data/librispeech.py
index 06a3a92..b381291 100644
--- a/data/librispeech.py
+++ b/data/librispeech.py
@@ -4,57 +4,63 @@ import numpy as np
 from torch.utils.data import Dataset
 from collections import defaultdict
+from glob import glob
+import pandas as pd
+
+csv_input = pd.read_csv(filepath_or_buffer='/groups/1/gcc50521/furukawa/musicnet_metadata.csv', sep=",")
+genre_to_id = {
+    'Solo Piano': 0, 'String Quartet': 1, 'Accompanied Violin': 2, 'Piano Quartet': 3, 'Accompanied Cello': 4,
+    'String Sextet': 5, 'Piano Trio': 6, 'Piano Quintet': 7, 'Wind Quintet': 8, 'Horn Piano Trio': 9, 'Wind Octet': 10,
+    'Clarinet-Cello-Piano Trio': 11, 'Pairs Clarinet-Horn-Bassoon': 12, 'Clarinet Quintet': 13, 'Solo Cello': 14,
+    'Accompanied Clarinet': 15, 'Solo Violin': 16, 'Violin and Harpsichord': 17, 'Viola Quintet': 18, 'Solo Flute': 19,
+    'Wind and Strings Octet': 20
+}
+id_to_genre = {}
+for idx, row in csv_input.iterrows():
+    genre = row['ensemble']
+    song_id = str(row['id'])
+    id_to_genre[song_id] = genre
 
 
 def default_loader(path):
     return torchaudio.load(path, normalization=False)
 
 
-def default_flist_reader(flist):
-    item_list = []
+def default_flist_reader(root_dir):
     speaker_dict = defaultdict(list)
-    index = 0
-    with open(flist, "r") as rf:
-        for line in rf.readlines():
-            speaker_id, dir_id, sample_id = line.replace("\n", "").split("-")
-            item_list.append((speaker_id, dir_id, sample_id))
-            speaker_dict[speaker_id].append(index)
-            index += 1
+    item_list = []
+    for index, x in enumerate(sorted(glob(os.path.join(root_dir, '*.npy')))):
+        filename = x.split('/')[-1]
+        speaker_id = id_to_genre[filename[:4]]
+        item_list.append(speaker_id)
+        speaker_dict[speaker_id].append(index)
 
-    return item_list, speaker_dict
+    return speaker_dict, item_list
 
 
 class LibriDataset(Dataset):
     def __init__(
-        self,
-        opt,
-        root,
-        flist,
-        audio_length=20480,
-        flist_reader=default_flist_reader,
-        loader=default_loader,
+            self,
+            opt,
+            root,
+            flist,
+            audio_length=20480,
+            flist_reader=default_flist_reader,
+            loader=default_loader,
     ):
         self.root = root
         self.opt = opt
 
-        self.file_list, self.speaker_dict = flist_reader(flist)
+        self.file_list = sorted(glob(os.path.join(root, '*.npy')))
+        self.speaker_dict, self.item_list = flist_reader(root)
 
         self.loader = loader
         self.audio_length = audio_length
 
-        self.mean = -1456218.7500
-        self.std = 135303504.0
-
     def __getitem__(self, index):
-        speaker_id, dir_id, sample_id = self.file_list[index]
-        filename = "{}-{}-{}".format(speaker_id, dir_id, sample_id)
-        audio, samplerate = self.loader(
-            os.path.join(self.root, speaker_id, dir_id, "{}.flac".format(filename))
-        )
-
-        assert (
-            samplerate == 16000
-        ), "Watch out, samplerate is not consistent throughout the dataset!"
+        filename = self.file_list[index]
+        audio = torch.from_numpy(np.load(filename)).unsqueeze(0)
+        speaker_id = self.item_list[index]
 
         # discard last part that is not a full 10ms
         max_length = audio.size(1) // 160 * 160
@@ -63,10 +69,9 @@ def __getitem__(self, index):
             np.arange(160, max_length - self.audio_length - 0, 160)
         )
-        audio = audio[:, start_idx : start_idx + self.audio_length]
+        audio = audio[:, start_idx: start_idx + self.audio_length]
 
         # normalize the audio samples
-        audio = (audio - self.mean) / self.std
 
         return audio, filename, speaker_id, start_idx
 
     def __len__(self):
@@ -87,20 +92,11 @@ def get_full_size_test_item(self, index):
         get audio samples that cover the full length of the input files
         used for testing the phone classification performance
         """
-        speaker_id, dir_id, sample_id = self.file_list[index]
-        filename = "{}-{}-{}".format(speaker_id, dir_id, sample_id)
-        audio, samplerate = self.loader(
-            os.path.join(self.root, speaker_id, dir_id, "{}.flac".format(filename))
-        )
-
-        assert (
-            samplerate == 16000
-        ), "Watch out, samplerate is not consistent throughout the dataset!"
+        filename = self.file_list[index]
+        audio = torch.from_numpy(np.load(filename)).unsqueeze(0)
 
         ## discard last part that is not a full 10ms
         max_length = audio.size(1) // 160 * 160
         audio = audio[:max_length]
 
-        audio = (audio - self.mean) / self.std
-
         return audio, filename
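Note (illustrative, not part of the patch): the loader is repointed from LibriSpeech .flac files to pre-cut MusicNet .npy clips, and the "speaker" label is reinterpreted as the ensemble/genre from musicnet_metadata.csv. A minimal sketch of the lookup the new code performs; the clip name is a made-up example, and the only assumption is that filenames start with the 4-digit MusicNet recording id (importing data.librispeech also reads the hard-coded CSV path at module load):

    from data.librispeech import genre_to_id, id_to_genre  # module-level dicts added by this patch

    filename = "1788_0.npy"       # hypothetical clip name inside the *.npy data directory
    song_id = filename[:4]        # first four characters = MusicNet recording id
    genre = id_to_genre[song_id]  # ensemble string from the CSV, e.g. "String Quartet"
    label = genre_to_id[genre]    # integer class in [0, 20], used as the "speaker" target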
diff --git a/data/loaders.py b/data/loaders.py
index a24da38..f92df7b 100644
--- a/data/loaders.py
+++ b/data/loaders.py
@@ -9,10 +9,7 @@ def librispeech_loader(opt, num_workers=16):
         print("Using Train / Val Split")
         train_dataset = LibriDataset(
             opt,
-            os.path.join(
-                opt.data_input_dir,
-                "LibriSpeech/train-clean-100",
-            ),
+            opt.data_input_dir,
             os.path.join(
                 opt.data_input_dir, "LibriSpeech100_labels_split/train_val_train.txt"
             ),
@@ -20,10 +17,7 @@ def librispeech_loader(opt, num_workers=16):
 
         test_dataset = LibriDataset(
             opt,
-            os.path.join(
-                opt.data_input_dir,
-                "LibriSpeech/train-clean-100",
-            ),
+            opt.data_input_dir,
             os.path.join(
                 opt.data_input_dir, "LibriSpeech100_labels_split/train_val_val.txt"
             ),
@@ -33,10 +27,7 @@ def librispeech_loader(opt, num_workers=16):
         print("Using Train+Val / Test Split")
         train_dataset = LibriDataset(
             opt,
-            os.path.join(
-                opt.data_input_dir,
-                "LibriSpeech/train-clean-100",
-            ),
+            opt.data_input_dir,
             os.path.join(
                 opt.data_input_dir, "LibriSpeech100_labels_split/train_split.txt"
             ),
@@ -44,10 +35,7 @@ def librispeech_loader(opt, num_workers=16):
 
         test_dataset = LibriDataset(
             opt,
-            os.path.join(
-                opt.data_input_dir,
-                "LibriSpeech/train-clean-100",
-            ),
+            opt.data_input_dir,
             os.path.join(
                 opt.data_input_dir, "LibriSpeech100_labels_split/test_split.txt"
             ),
diff --git a/main.py b/main.py
index 51cc927..d07d9be 100644
--- a/main.py
+++ b/main.py
@@ -29,7 +29,7 @@ def train(args, model, optimizer, writer):
     )
 
     total_step = len(train_loader)
-    print_idx = 100
+    print_idx = 10
 
     # at which step to validate training
     validation_idx = 1000
@@ -113,7+113,8 @@ def train(args, model, optimizer, writer):
             save_model(args, model, optimizer, best=True)
 
         # save current model state
-        save_model(args, model, optimizer)
+        if args.current_epoch % 50 == 0:
+            save_model(args, model, optimizer)
 
         args.current_epoch += 1
 
@@ -132,7 +133,7 @@ def main(_run, _log):
     args.time = time.ctime()
 
     # Device configuration
-    args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     args.current_epoch = args.start_epoch
 
diff --git a/modules/audio/cpc.py b/modules/audio/cpc.py
index 611f4d8..8f2a457 100644
--- a/modules/audio/cpc.py
+++ b/modules/audio/cpc.py
@@ -58,7 +58,9 @@ def get_latent_representations(self, x):
 
     def forward(self, x):
+        # x: (b, 1, 20480)
        z, c = self.get_latent_representations(x)
+        # z: (b, 128, 512) c: (b, 128, 256)
 
         loss, accuracy = self.loss.get(x, z, c)
 
         return loss, accuracy, z, c
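Note (illustrative, not part of the patch): the shape comments added to cpc.py follow from the encoder's total temporal downsampling. A small sketch of the arithmetic, assuming the standard CPC audio encoder strides of 5, 4, 2, 2, 2; the 512 and 256 dimensions are the encoder and autoregressive hidden sizes quoted in the comment:

    # Each latent frame covers 160 input samples (the same factor as the "full 10ms"
    # trimming in librispeech.py), so a 20480-sample clip yields 128 frames.
    strides = [5, 4, 2, 2, 2]
    downsampling = 1
    for s in strides:
        downsampling *= s               # 160
    frames = 20480 // downsampling      # 128  ->  z: (b, 128, 512), c: (b, 128, 256)
    print(downsampling, frames)         # 160 128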
diff --git a/modules/audio/infonce.py b/modules/audio/infonce.py
index e889058..35d98f3 100644
--- a/modules/audio/infonce.py
+++ b/modules/audio/infonce.py
@@ -3,7 +3,7 @@ Calculates the 'Info Noise-Contrastive-Estimation' as explained by Van den Oord
 et al. (2018), implementation by Bas Veeling & Sindy Lowe
 """
 
-
+import numpy as np
 import torch
 import torch.nn as nn
 
diff --git a/modules/audio/resnet.py b/modules/audio/resnet.py
new file mode 100644
index 0000000..399ede8
--- /dev/null
+++ b/modules/audio/resnet.py
@@ -0,0 +1,27 @@
+import torch.nn as nn
+import torchvision.models as models
+
+
+class ResNetSimCLR(nn.Module):
+
+    def __init__(self, base_model, out_dim):
+        super(ResNetSimCLR, self).__init__()
+        self.resnet_dict = {"resnet18": models.resnet18(pretrained=False, num_classes=out_dim),
+                            "resnet50": models.resnet50(pretrained=True)}
+
+        self.backbone = self._get_basemodel(base_model)
+        self.backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
+                                        bias=False)
+        num_features = self.backbone.fc.in_features
+        self.backbone.fc = nn.Linear(num_features, out_dim)
+        dim_mlp = self.backbone.fc.in_features
+
+        # add mlp projection head
+        self.backbone.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.backbone.fc)
+
+    def _get_basemodel(self, model_name):
+        model = self.resnet_dict[model_name]
+        return model
+
+    def forward(self, x):
+        return self.backbone(x)
\ No newline at end of file
diff --git a/modules/audio/speaker_loss.py b/modules/audio/speaker_loss.py
index 18becc3..8e8a005 100644
--- a/modules/audio/speaker_loss.py
+++ b/modules/audio/speaker_loss.py
@@ -2,6 +2,7 @@ import torch
 
 from data import loaders
+from data.librispeech import genre_to_id, id_to_genre
 
 
 class Speaker_Loss(nn.Module):
     def __init__(self, args, hidden_dim, calc_accuracy):
@@ -38,7 +39,7 @@ def calc_supervised_speaker_loss(self, c, filename):
         targets = torch.zeros(len(filename)).long()
         for idx, _ in enumerate(filename):
-            targets[idx] = self.speaker_id_dict[filename[idx].split("-")[0]]
+            targets[idx] = torch.tensor(genre_to_id[id_to_genre[filename[idx].split("/")[-1][:4]]])
 
         targets = targets.to(self.args.device).squeeze()
 
         # forward pass
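Note (illustrative, not part of the patch): resnet.py adds a SimCLR-style backbone whose first convolution accepts a single channel and whose classifier is replaced by a two-layer projection head. A minimal usage sketch; the spectrogram-like input shape is an assumption, and note that constructing the model also builds a pretrained resnet50 for the internal dict:

    import torch
    from modules.audio.resnet import ResNetSimCLR

    model = ResNetSimCLR(base_model="resnet18", out_dim=128)
    x = torch.randn(4, 1, 128, 128)     # hypothetical batch: 4 one-channel spectrograms
    h = model(x)
    print(h.shape)                      # torch.Size([4, 128]) -- projection-head output of size out_dim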
diff --git a/train_classifier.sh b/train_classifier.sh
index 9182787..85675ae 100755
--- a/train_classifier.sh
+++ b/train_classifier.sh
@@ -2,8 +2,9 @@
 
 python -m testing.logistic_regression_speaker \
     with \
-    model_path=./logs/cpc_audio_baseline \
-    model_num=299 \
+    data_input_dir=/groups/1/gcc50521/furukawa/musicnet_npy_10sec \
+    model_path=/groups/1/gcc50521/furukawa/cpc_logs/26 \
+    model_num=450 \
     fp16=False
 
 # python -m testing.logistic_regression_phones \
diff --git a/validation/validate_speakers.py b/validation/validate_speakers.py
index 7d601a9..a7801cd 100644
--- a/validation/validate_speakers.py
+++ b/validation/validate_speakers.py
@@ -25,7 +25,7 @@ def tsne(args, features):
 
 
 def validate_speakers(args, dataset, model, optimizer, epoch, step, global_step, writer):
-    max_speakers = 10
+    max_speakers = 20
     batch_size = 40
     input_size = (args.batch_size, 1, 20480)
 
@@ -40,7 +40,7 @@ def validate_speakers(args, dataset, model, optimizer, epoch, step, global_step,
     labels = torch.zeros(max_speakers, batch_size).to(args.device)
 
     for idx, speaker_idx in enumerate(dataset.speaker_dict):
-        if idx == 10:
+        if idx == 20:
             break
 
         model_in = dataset.get_audio_by_speaker(speaker_idx, batch_size=batch_size)
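Note (illustrative, not part of the patch): with genres standing in for speakers, dataset.speaker_dict maps each ensemble name to the indices of its clips, and validate_speakers now embeds batch_size clips for up to 20 of those groups per t-SNE plot. A rough sketch of the bookkeeping; the dictionary contents are invented:

    from collections import defaultdict

    speaker_dict = defaultdict(list)     # ensemble name -> indices into the sorted *.npy file list
    speaker_dict["Solo Piano"].extend([0, 1, 2])
    speaker_dict["String Quartet"].extend([3, 4])

    max_speakers = 20                    # genre groups visited per validation pass
    batch_size = 40                      # clips embedded per group
    print(max_speakers * batch_size)     # 800 embeddings per t-SNE plot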