From b7f5b1009cef571c43a37f85ee60b6424f0ce70e Mon Sep 17 00:00:00 2001 From: root Date: Sat, 11 Feb 2023 02:53:34 +0000 Subject: [PATCH 01/44] initial vqa_fine_tune --- src/training/vqa_fine_tune.py | 261 ++++++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 src/training/vqa_fine_tune.py diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py new file mode 100644 index 000000000..625d3126c --- /dev/null +++ b/src/training/vqa_fine_tune.py @@ -0,0 +1,261 @@ +import argparse + +import pandas as pd +import torch +from PIL import Image +from torch import nn +from torch.utils.data import Dataset, DataLoader +from tqdm import tqdm + +import open_clip +from open_clip.factory import get_tokenizer +from training.scheduler import cosine_lr +from training.train import AverageMeter +import evaluate +from sklearn import preprocessing + +from datasets import load_dataset_builder +from datasets import load_dataset +ds_builder = load_dataset_builder("HuggingFaceM4/VQAv2") + +dataset_train = load_dataset("HuggingFaceM4/VQAv2", split="train", cache_dir = "./sample_data") + +dataset_df = dataset_train.to_pandas() + +labelencoder = preprocessing.LabelEncoder() +labelencoder.fit(dataset_train['multiple_choice_answer']) +num_classes = len(list(labelencoder.classes_)) +new_labels = labelencoder.transform(dataset_train['multiple_choice_answer']) + +class VQATextDataset(Dataset): + def __init__(self, df, split, transforms, tokenizer=None): + self.df = df + self.transforms = transforms + self.tokenize = tokenizer + self.labels = labelencoder.transform(df['multiple_choice_answer']) + def __len__(self): + return len(self.df) + + def __getitem__(self, idx): + item = self.df.iloc[idx] + img_path = item["image"]["path"] + image = Image.open(str(img_path)) + text = item["question"] + label = self.labels[idx] + return { + 'image': self.transforms(image), + 'text': self.tokenize([text])[0], + 'label': label + } + +def get_task_dataloaders(df, transforms, args): + tokenizer = get_tokenizer(args.model) + + dataloaders = {} + dataset = VQATextDataset(df, + "train", + transforms, + tokenizer=tokenizer, + ) + dataloader = DataLoader( + dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.workers, + pin_memory=True, + drop_last=True, + ) + dataloaders["train"] = dataloader + + return dataloaders + +class CLIPMultimodalClassifier(nn.Module): + def __init__(self, encoder, embed_dim, num_labels): + super().__init__() + + self.encoder = encoder + self.logits_proj = nn.Linear(embed_dim * 2, num_labels) #size of answer space + + def forward(self, image, text): + # CLIP doesn't have a multimodal encoder, so we concatenate the features + text_features = self.encoder.encode_text(text) + image_features = self.encoder.encode_image(image) + multimodal_features = torch.cat([image_features, text_features], dim=-1) + logits = nn.functional.softmax(self.logits_proj(multimodal_features), dim=-1) + return logits + +class EarlyStopping: + + def __init__(self, patience=5, threshold=0.0, metric_name="accuracy"): + self.patience = patience + self.threshold = threshold + self.patience_counter = 0 + self.best_score = None + self.best_metrics = None + self.metric_name = metric_name + + def step(self, metrics): + score = metrics[self.metric_name] + if self.best_score is None: + self.best_score = score + self.best_metrics = metrics + elif score < self.best_score + self.threshold: + self.patience_counter += 1 + else: + self.best_score = score + self.best_metrics = metrics + 
self.patience_counter = 0 + + if self.patience_counter >= self.patience: + return True + return False + +def compute_metrics(model, dataloader, device, args): + model.eval() + metric = evaluate.load("accuracy") + val_loss = 0 + samples_seen = 0 + for batch in dataloader: + with torch.no_grad(): + image = batch["image"].to(device) + text = batch["text"].to(device) + label = batch["label"].to(device) + samples_seen += text.shape[0] + logits = model(image, text) + logits = logits.view(-1) + label = label.view(-1).float() + predictions = torch.argmax(logits) + batch_val_loss = nn.CrossEntropyLoss(logits, label) + val_loss += batch_val_loss.item() + metric.add_batch( + predictions=predictions.cpu().numpy(), + references=label.cpu().numpy(), + ) + model.train() + metrics = metric.compute() + metrics["loss"] = val_loss / samples_seen + return metrics + +def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device, args): + model.train() + progress_bar = tqdm(total=len(data["train"])) + for i, batch in enumerate(data["train"]): + step = epoch * len(data["train"]) + i + scheduler(step) + + image = batch["image"].to(device) + text = batch["text"].to(device) + label = batch["label"].to(device) + logits = model(image, text) + logits = logits.view(-1) + label = label.view(-1).float() + loss = nn.CrossEntropyLoss(logits, label) #should be cross entropy + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + progress_bar.set_description(f"Loss: {loss.item():.4f}") + progress_bar.update(1) + + if (i % args.val_frequency) == 0 and i > 0: + metrics = compute_metrics(model, data["validation"], device, args) + end_training = early_stop.step(metrics) + if end_training: + progress_bar.close() + return metrics, end_training + + progress_bar.close() + metrics = compute_metrics(model, data["validation"], device, args) + end_training = early_stop.step(metrics) + return metrics, end_training + +def parse_args(args): + parser = argparse.ArgumentParser() + parser.add_argument( + "--hateful-memes", + type=str, + default=None, + help="Path to Hateful Memes dataset directory.", + ) + parser.add_argument( + "--workers", type=int, default=2, help="Number of dataloader workers per GPU." + ) + parser.add_argument( + "--batch-size", type=int, default=32, help="Batch size per GPU." + ) + parser.add_argument( + "--epochs", type=int, default=10, help="Number of epochs to train for." + ) + parser.add_argument("--lr", type=float, default=3e-5, help="Learning rate.") + parser.add_argument("--beta1", type=float, default=0.9, help="Adam beta 1.") + parser.add_argument("--beta2", type=float, default=0.999, help="Adam beta 2.") + parser.add_argument("--eps", type=float, default=1e-8, help="Adam epsilon.") + parser.add_argument("--wd", type=float, default=0.0, help="Weight decay.") + parser.add_argument( + "--warmup", type=int, default=200, help="Number of steps to warmup for." + ) + parser.add_argument( + "--val-frequency", type=int, default=30, help="How often to run evaluation with val data." + ) + parser.add_argument( + "--early-stop-patience", type=int, default=5, help="Early stopping patience." + ) + parser.add_argument( + "--early-stop-threshold", type=float, default=0.0, help="Early stopping threshold." + ) + parser.add_argument( + "--early-stop-metric-name", type=str, default="accuracy", help="Early stopping metric name." + ) + parser.add_argument( + "--precision", + choices=["amp", "amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"], + default="amp", + help="Floating point precision." 
+ ) + parser.add_argument( + "--model", + type=str, + default="ViT-B-32-quickgelu", + help="Name of the vision backbone to use.", + ) + parser.add_argument( + "--pretrained", + default='laion400m_e32', + type=str, + help="Use a pretrained CLIP model weights with the specified tag or file path.", + ) + parser.add_argument( + "--seed", type=int, default=0, help="Default random seed." + ) + + args = parser.parse_args(args) + return args + +args = parse_args([]) +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + +model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms( + args.model, + args.pretrained, + precision=args.precision, + device=device, +) +model_cfg = open_clip.factory.get_model_config(args.model) +embed_dim = model_cfg["embed_dim"] + +data = get_task_dataloaders(dataset_df, preprocess_val, args) + +clf_cls = CLIPMultimodalClassifier +clf = clf_cls(model, embed_dim, num_classes).to(device) +optim = torch.optim.AdamW(clf.parameters(), lr=args.lr, weight_decay=args.wd) + +total_steps = len(data["train"]) * args.epochs +scheduler = cosine_lr(optim, args.lr, args.warmup, total_steps) +early_stop = EarlyStopping( # greater metric value is better + patience=args.early_stop_patience, + threshold=args.early_stop_threshold, + metric_name=args.early_stop_metric_name, +) + +val_metrics, end_training = train_one_epoch(clf, data, 1, optim, scheduler, early_stop, device, args) From 8cb655fd4d178ad27315117ca43cf5acf2fc08d5 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 14:42:54 -0500 Subject: [PATCH 02/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 625d3126c..d97011adb 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -136,8 +136,25 @@ def compute_metrics(model, dataloader, device, args): metrics["loss"] = val_loss / samples_seen return metrics +def train_single_epoch(model, data, optimizer, args): + model.train() + for i, batch in enumerate(data["train"]): + image = batch["image"].to(device) + text = batch["text"].to(device) + label = batch["label"].to(device) + + logits = model(image, text) + print(label.shape) + print(logits.shape) + loss_fn = nn.CrossEntropyLoss() + loss = loss_fn(logits, label) + + loss.backward() + + def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device, args): model.train() + loss_fn = nn.CrossEntropyLoss() progress_bar = tqdm(total=len(data["train"])) for i, batch in enumerate(data["train"]): step = epoch * len(data["train"]) + i @@ -147,9 +164,8 @@ def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device text = batch["text"].to(device) label = batch["label"].to(device) logits = model(image, text) - logits = logits.view(-1) - label = label.view(-1).float() - loss = nn.CrossEntropyLoss(logits, label) #should be cross entropy + + loss = loss_fn(logits, label) #should be cross entropy optimizer.zero_grad() loss.backward() From ed39483fed8b625c4b66cc0af2c4ae902180c3b2 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 14:50:10 -0500 Subject: [PATCH 03/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/training/vqa_fine_tune.py 
b/src/training/vqa_fine_tune.py index d97011adb..f01b3b304 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -74,14 +74,17 @@ def __init__(self, encoder, embed_dim, num_labels): super().__init__() self.encoder = encoder - self.logits_proj = nn.Linear(embed_dim * 2, num_labels) #size of answer space + + self.logits_proj = nn.Linear(embed_dim * 2, 1536) #size of answer space + self.logits_2 = nn.Linear(1536, num_classes) def forward(self, image, text): # CLIP doesn't have a multimodal encoder, so we concatenate the features text_features = self.encoder.encode_text(text) image_features = self.encoder.encode_image(image) multimodal_features = torch.cat([image_features, text_features], dim=-1) - logits = nn.functional.softmax(self.logits_proj(multimodal_features), dim=-1) + layer1 = nn.GeLU(self.logits_proj(multimodal_features), dim=-1) + logits = self.logits_2(layer1) return logits class EarlyStopping: From e58dd4dc499bca675491d1262f65ab8305d0528e Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 15:06:26 -0500 Subject: [PATCH 04/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index f01b3b304..a7a45a270 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -22,10 +22,33 @@ dataset_df = dataset_train.to_pandas() +answer_space = [] +with open('answers_vqa.txt') as f: + for line in f: + answer_space.append(line.strip()) +answer_space = np.array(answer_space) + labelencoder = preprocessing.LabelEncoder() -labelencoder.fit(dataset_train['multiple_choice_answer']) +labelencoder.fit(answer_space) num_classes = len(list(labelencoder.classes_)) -new_labels = labelencoder.transform(dataset_train['multiple_choice_answer']) + +answer_set = set(labelencoder.classes_) +class_id = [] +questions = [] +images = [] +answers = [] +for index, row in dataset_df.iterrows(): + if(row['multiple_choice_answer'] in answer_set): + class_id.append(row['question_id']) + questions.append(row['question']) + images.append(row['image']) + answers.append(row['multiple_choice_answer']) +class_id = np.array(class_id) +questions = np.array(questions) +images = np.array(images) +answers = np.array(answers) + +dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) class VQATextDataset(Dataset): def __init__(self, df, split, transforms, tokenizer=None): @@ -45,7 +68,7 @@ def __getitem__(self, idx): return { 'image': self.transforms(image), 'text': self.tokenize([text])[0], - 'label': label + 'label': torch.tensor(label) } def get_task_dataloaders(df, transforms, args): From b98cc8719abd102805642f1990fdaf4c18b7bfcb Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 22:34:32 -0500 Subject: [PATCH 05/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 100 +++++++++++++++++----------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index a7a45a270..b43a6a8c2 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -16,39 +16,6 @@ from datasets import load_dataset_builder from datasets import load_dataset -ds_builder = load_dataset_builder("HuggingFaceM4/VQAv2") - -dataset_train = 
load_dataset("HuggingFaceM4/VQAv2", split="train", cache_dir = "./sample_data") - -dataset_df = dataset_train.to_pandas() - -answer_space = [] -with open('answers_vqa.txt') as f: - for line in f: - answer_space.append(line.strip()) -answer_space = np.array(answer_space) - -labelencoder = preprocessing.LabelEncoder() -labelencoder.fit(answer_space) -num_classes = len(list(labelencoder.classes_)) - -answer_set = set(labelencoder.classes_) -class_id = [] -questions = [] -images = [] -answers = [] -for index, row in dataset_df.iterrows(): - if(row['multiple_choice_answer'] in answer_set): - class_id.append(row['question_id']) - questions.append(row['question']) - images.append(row['image']) - answers.append(row['multiple_choice_answer']) -class_id = np.array(class_id) -questions = np.array(questions) -images = np.array(images) -answers = np.array(answers) - -dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) class VQATextDataset(Dataset): def __init__(self, df, split, transforms, tokenizer=None): @@ -141,6 +108,7 @@ def compute_metrics(model, dataloader, device, args): metric = evaluate.load("accuracy") val_loss = 0 samples_seen = 0 + loss_fn = nn.CrossEntropyLoss() for batch in dataloader: with torch.no_grad(): image = batch["image"].to(device) @@ -148,17 +116,17 @@ def compute_metrics(model, dataloader, device, args): label = batch["label"].to(device) samples_seen += text.shape[0] logits = model(image, text) - logits = logits.view(-1) - label = label.view(-1).float() - predictions = torch.argmax(logits) - batch_val_loss = nn.CrossEntropyLoss(logits, label) + + #predictions = torch.argmax(logits) + batch_val_loss = loss_fn(logits, label) val_loss += batch_val_loss.item() - metric.add_batch( - predictions=predictions.cpu().numpy(), - references=label.cpu().numpy(), - ) + #metric.add_batch( + #predictions=predictions.cpu().numpy(), + #references=label.cpu().numpy(), + #) model.train() - metrics = metric.compute() + #metrics = metric.compute() + metrics = {} metrics["loss"] = val_loss / samples_seen return metrics @@ -174,7 +142,7 @@ def train_single_epoch(model, data, optimizer, args): print(logits.shape) loss_fn = nn.CrossEntropyLoss() loss = loss_fn(logits, label) - + print(loss) loss.backward() @@ -201,14 +169,16 @@ def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device progress_bar.update(1) if (i % args.val_frequency) == 0 and i > 0: - metrics = compute_metrics(model, data["validation"], device, args) - end_training = early_stop.step(metrics) - if end_training: - progress_bar.close() - return metrics, end_training + print(loss) + metrics = compute_metrics(model, data["train"], device, args) + + #end_training = early_stop.step(metrics) + #if end_training: + #progress_bar.close() + #return metrics, end_training progress_bar.close() - metrics = compute_metrics(model, data["validation"], device, args) + metrics = compute_metrics(model, data["train"], device, args) end_training = early_stop.step(metrics) return metrics, end_training @@ -274,6 +244,38 @@ def parse_args(args): args = parser.parse_args(args) return args +dataset_train = load_dataset("HuggingFaceM4/VQAv2", split="train", cache_dir = "./sample_data") + +dataset_df = dataset_train.to_pandas() + +answer_space = [] +with open('answers_vqa.txt') as f: + for line in f: + answer_space.append(line.strip()) +answer_space = np.array(answer_space) + +labelencoder = preprocessing.LabelEncoder() +labelencoder.fit(answer_space) +num_classes = 
len(list(labelencoder.classes_)) + +answer_set = set(labelencoder.classes_) +class_id = [] +questions = [] +images = [] +answers = [] +for index, row in dataset_df.iterrows(): + if(row['multiple_choice_answer'] in answer_set): + class_id.append(row['question_id']) + questions.append(row['question']) + images.append(row['image']) + answers.append(row['multiple_choice_answer']) +class_id = np.array(class_id) +questions = np.array(questions) +images = np.array(images) +answers = np.array(answers) + +dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) + args = parse_args([]) device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") From dc5b58a99e3c1407e8138e77d6d1b10fabe96a31 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 23:04:12 -0500 Subject: [PATCH 06/44] Add files via upload --- src/training/answers_vqa.txt | 3129 ++++++++++++++++++++++++++++++++++ 1 file changed, 3129 insertions(+) create mode 100644 src/training/answers_vqa.txt diff --git a/src/training/answers_vqa.txt b/src/training/answers_vqa.txt new file mode 100644 index 000000000..4c418b192 --- /dev/null +++ b/src/training/answers_vqa.txt @@ -0,0 +1,3129 @@ + +0 +000 +1 +1 4 +1 foot +1 hour +1 in back +1 in front +1 in middle +1 inch +1 on left +1 on right +1 way +1 world +1 year +1.00 +10 +10 feet +10 inches +10 years +100 +100 feet +100 year party ct +1000 +101 +106 +10:00 +10:05 +10:08 +10:10 +10:15 +10:20 +10:25 +10:30 +10:35 +10:40 +10:45 +10:50 +10:55 +11 +11:00 +11:05 +11:10 +11:15 +11:20 +11:25 +11:30 +11:35 +11:45 +11:50 +11:55 +12 +12 feet +120 +12:00 +12:05 +12:10 +12:15 +12:20 +12:25 +12:28 +12:30 +12:35 +12:40 +12:45 +12:50 +12:55 +13 +14 +15 +15 feet +150 +16 +17 +18 +19 +193 +1950 +1950s +1980 +1990 +1:00 +1:05 +1:10 +1:15 +1:20 +1:25 +1:30 +1:35 +1:40 +1:45 +1:50 +1:55 +1st +2 +2 feet +2 hours +2 men +2 people +2 years +2.00 +20 +20 feet +20 ft +200 +2000 +2007 +2008 +2009 +2010 +2011 +2012 +2013 +2015 +2016 +21 +22 +23 +24 +25 +26 +27 +28 +29 +2:00 +2:05 +2:10 +2:15 +2:20 +2:25 +2:30 +2:35 +2:40 +2:45 +2:50 +2:55 +2nd +3 +3 feet +3 inches +30 +30 mph +300 +31 +32 +33 +34 +35 +350 +36 +37 +38 +39 +3:00 +3:10 +3:15 +3:20 +3:25 +3:30 +3:45 +3:50 +3:55 +3rd +4 +4 feet +4 ft +4 inches +4 way +40 +400 +41 +42 +43 +44 +45 +46 +47 +48 +49 +4:00 +4:05 +4:15 +4:20 +4:30 +4:35 +4:40 +4:45 +4:50 +4:55 +4th of july +5 +5 feet +5 ft +5 star +5 years +50 +50 feet +500 +51 +52 +53 +54 +55 +56 +59 +5:00 +5:05 +5:10 +5:15 +5:18 +5:25 +5:30 +5:40 +5:45 +5:50 +5:55 +6 +6 feet +6 inches +60 +600 +61 +64 +65 +66 +68 +6:00 +6:05 +6:20 +6:25 +6:30 +6:35 +6:40 +6:45 +7 +7 eleven +70 +700 +72 +75 +7:00 +7:05 +7:10 +7:25 +7:35 +7:45 +7:55 +8 +8 feet +80 +870 +88 +8:00 +8:05 +8:35 +8:50 +8:55 +9 +90 +99 +9:05 +9:12 +9:15 +9:20 +9:25 +9:30 +9:35 +9:45 +9:50 +9:55 +aa +above +above door +above sink +above stove +above toilet +abstract +accident +acer +across street +adidas +adult +adults +advertisement +africa +african +african american +after +afternoon +against wall +age +ahead +air +air canada +air conditioner +air force +air france +airplane +airplanes +airport +alaska +alcohol +alive +all +all of them +all way +alligator +almonds +alps +aluminum +am +amazon +ambulance +america +american +american airlines +american flag +amtrak +ana +analog +angel +angels +angry +animal +animals +ankle +anniversary +antelope +antenna +antique +apartment +apartments +apple +apple and 
banana +apples +apron +arabic +arch +arizona +arm +army +around neck +arriving +arrow +arrows +art +ascending +asia +asian +asics +asleep +asparagus +asphalt +at camera +at table +at&t +athletics +atv +audi +australia +avocado +awake +away +b +babies +baby +baby's breath +back +back left +background +backhand +backpack +backward +backwards +backyard +bacon +bad +badminton +bag +bagel +bagels +baggage claim +bags +baked +baker +bakery +baking +balance +balcony +bald +ball +balloon +balloons +balls +bamboo +banana +banana bread +banana peel +banana split +bananas +band +bandana +bank +bank of america +bar +barbed wire +barber shop +bark +barn +barrel +barrier +bars +base +baseball +baseball bat +baseball cap +baseball field +baseball game +baseball glove +baseball player +baseball uniform +basil +basket +basketball +baskets +bat +bathing +bathing suit +bathroom +bathtub +batman +bats +batter +batting +beach +beads +beagle +beanie +beans +bear +beard +bears +bed +bedroom +beef +beer +beets +before +behind +behind bench +behind bus +behind clouds +behind fence +behind woman +beige +beijing +bell +below +belt +bench +benches +bending +berries +best buy +bib +bible +bicycle +bicycles +bidet +big +big ben +bike +bike rack +biker +bikers +bikes +biking +bikini +billabong +bin +biplane +bird +bird feeder +birds +birthday +birthday cake +birthday party +black +black and blue +black and brown +black and gray +black and orange +black and pink +black and red +black and silver +black and white +black and yellow +black white +blackberry +blanket +blankets +bleachers +blender +blending +blinders +blinds +blonde +blood +blt +blue +blue and black +blue and gray +blue and green +blue and orange +blue and pink +blue and red +blue and white +blue and yellow +blue jay +blue team +blueberries +blueberry +blurry +bmw +bnsf +board +boarding +boardwalk +boat +boating +boats +bob +bone +boogie board +book +books +bookshelf +boot +boots +bored +boredom +boston +both +bottle +bottles +bottom +bottom left +bottom right +boundaries +bow +bow tie +bowl +bowling +bowls +bowtie +box +boxer +boxes +boxing +boy +boys +brace +bracelet +braid +branch +branches +brand +brass +braves +brazil +bread +breakfast +brewers +brick +bricks +bride +bridge +bridle +briefcase +bright +britain +british +british airways +broadway +broccoli +broccoli and carrots +broke +broken +bronze +broom +brown +brown and black +brown and white +brush +brushing +brushing hair +brushing her teeth +brushing his teeth +brushing teeth +bucket +bud light +budweiser +buffalo +building +buildings +bull +bulldog +bun +bundt +bunk +bunny +bunt +buoy +buoys +burger +burgers +burrito +burton +bus +bus driver +bus station +bus stop +buses +bush +bushes +business +busy +butt +butter +butterfly +button +button up +buttons +by window +c +cabbage +cabinet +cabinets +cactus +cadillac +cafe +cage +cake +cakes +calendar +calico +california +calm +camel +camera +cameraman +cameras +camo +camouflage +camper +camping +can +can't see +can't see it +can't tell +canada +candle +candles +candy +cane +cannot tell +canoe +canon +canopy +cantaloupe +cap +captivity +car +caramel +cardboard +cardinal +cardinals +cargo +carnation +carnations +carpet +carriage +carrot +carrot cake +carrots +cars +cart +cartoon +case +casserole +cast iron +castle +casual +cat +cat and dog +cat food +catch +catch ball +catch frisbee +catcher +catching +catching frisbee +catholic +cats +caucasian +cauliflower +caution +cd +cds +ceiling +celery +cell +cell phone +cell phones +cement +center +ceramic +cereal 
+cessna +chain +chain link +chains +chair +chairs +chalk +champagne +chandelier +charging +chase +checkerboard +checkered +checkers +cheddar +cheese +cheesecake +chef +cherries +cherry +chest +chevrolet +chevron +chevy +chicago +chicken +chihuahua +child +children +chili +chimney +china +china airlines +chinese +chips +chiquita +chocolate +choppy +chopsticks +christian +christmas +christmas tree +chrome +church +cigarette +cigarettes +cilantro +cinnamon +circle +circles +circus +cirrus +citizen +city +city bus +clams +classic +classroom +clay +clean +cleaner +cleaning +clear +cleats +climbing +clip +clock +clock tower +clocks +close +close up +closed +closet +cloth +clothes +clothing +cloud +clouds +cloudy +club +cluttered +clydesdale +cnn +coach +coal +coaster +coat +coats +cobblestone +coca cola +cocker spaniel +coconut +coffee +coffee cup +coffee maker +coffee pot +coffee table +coins +coke +cold +coleslaw +colgate +collage +collar +collie +color +colorado +colored +comcast +comfort +comforter +coming +commercial +commuter +compaq +competition +computer +computers +concentration +concert +concrete +condiments +conductor +cone +cones +conference +conference room +confused +congratulations +construction +container +continental +control +controller +controllers +converse +cook +cooked +cookie +cookies +cooking +cool +cooler +copper +copyright +cord +corgi +corn +corner +corona +cosmo +costume +cotton +couch +counter +country +countryside +couple +court +cover +cow +cowboy +cows +crafts +crane +cranes +crates +cream +crest +crib +crocs +croissant +cross +cross country +crossing +crosstown +crosswalk +crow +crown +crows +cruise ship +csx +cubs +cucumber +cucumbers +cuddling +cumulus +cup +cupcake +cupcakes +cups +curb +curious +curly +current +curtain +curtains +curved +cushion +cut +cute +cutting +cutting board +cutting cake +cutting hair +cycling +cylinder +d +dachshund +dad +daffodil +daffodils +dairy +dairy queen +daisies +daisy +dalmatian +dancing +dandelions +dark +dawn +day +day time +daytime +db +dc +dead +dead end +deck +decoration +decorative +deep +deer +defense +deli +delivery +dell +delta +denim +descending +desert +design +desk +desktop +dessert +desserts +detroit +diamond +diamonds +diesel +diet coke +different teams +digital +dim +dining +dining room +dinner +dinosaur +dip +direction +directions +dirt +dirt bike +dirty +dishes +dishwasher +disney +display +distance +do not enter +dock +dodge +dodgers +dog +dog and cat +dog bed +dog food +dog show +dogs +dole +doll +dome +domestic +don't know +don't walk +donkey +donut +donut shop +donuts +door +doorway +dots +double +double decker +doubles +dough +doughnut +doughnuts +down +down street +downhill +downtown +dr pepper +dragon +drain +drawer +drawing +dreadlocks +dress +dresser +drink +drinking +drinking water +drinks +drive +driver +driveway +driving +drums +dry +drying +drywall +ducati +duck +ducks +dugout +dump +dump truck +dunkin donuts +dusk +e +each other +eagle +ear +earbuds +earring +earrings +ears +east +easter +easton +easy +easyjet +eat +eaten +eating +egg +egg salad +eggs +eiffel tower +electric +electricity +electronics +elephant +elephants +elm +elmo +email +emergency +emirates +empty +enclosure +end +engine +england +english +entering +equestrian +europe +evening +evergreen +exhaust +exit +eyes +f +fabric +face +facebook +factory +fair +fake +fall +falling +family +fan +fancy +fans +fanta +far +far right +farm +farmer +farmers +farmers market +fashion +fast +fast food +father +faucet +feathers +fedex +fedora 
+feeder +feeding +feeding giraffe +feet +fell +female +fence +fern +ferris wheel +ferry +festival +feta +few +field +fighter +fighting +finch +finger +fire +fire extinguisher +fire hydrant +fire truck +firefighter +fireman +fireplace +fires +first +first base +fish +fisheye +fishing +fishing boat +flag +flags +flamingo +flashlight +flat +flat screen +flats +flickr +flip +flip flops +flip phone +floating +flood +floor +floral +florida +flour +flower +flowers +fluffy +fluorescent +fly +fly kite +flying +flying kite +flying kites +foam +focus +fog +foggy +foil +food +food processor +food truck +foot +football +footprints +for balance +for fun +for photo +for sale +ford +foreground +forehand +forest +fork +fork and knife +fork and spoon +forks +formal +formica +forward +fountain +fox +frame +france +free +freezer +freight +freightliner +french +french fries +fresh +fridge +fried +friend +friends +fries +frisbee +frisbees +frog +front +frosted +frosting +fruit +fruit salad +fruits +full +fun +fur +furniture +futon +g +game +game controller +gaming +garage +garbage +garden +garlic +gas +gas station +gate +gatorade +gazebo +ge +geese +genetics +german +german shepherd +germany +ghost +giants +ginger +giraffe +giraffe and zebra +giraffes +girl +girl on right +girls +give way +glass +glasses +glaze +glazed +globe +glove +gloves +gmc +go +goal +goalie +goat +goatee +goats +goggles +going +gold +golden gate +golden retriever +golf +gone +good +google +goose +gothic +graduation +graffiti +grandfather +granite +grape +grapefruit +grapes +grass +grassy +gravel +gravy +gray +gray and black +gray and red +gray and white +grazing +green +green and black +green and blue +green and brown +green and orange +green and red +green and white +green and yellow +green beans +greyhound +grill +grilled +grilled cheese +grind +grinding +grizzly +grocery +grocery store +ground +guitar +guitar hero +gun +gym +h +hair +hair dryer +haircut +half +half full +halloween +hallway +ham +ham and cheese +hamburger +hammer time +hammock +hand +handicap +handle +handlebars +hands +hanger +hanging +happiness +happy +happy birthday +harbor +hard +hardwood +harley +harley davidson +harness +harry potter +hat +hats +hauling +hawaii +hawaiian +hawk +hay +hazy +he isn't +he's not +head +headband +headphones +healthy +heart +hearts +heat +heater +heavy +heels +heineken +heinz +helicopter +hello kitty +helmet +helmets +herd +herding +herself +hexagon +hiding +high +high chair +high heels +highway +hiking +hill +hills +hilly +himself +hispanic +hit +hit ball +hitting +hitting ball +hockey +holding +holding baby +holding it +holding phone +holding umbrella +hollywood +home +home plate +homemade +honda +honey +hood +hoodie +horizontal +horizontally +horns +horse +horse racing +horseback riding +horses +hose +hospital +hot +hot dog +hot dogs +hot sauce +hotel +hotel room +house +houses +hp +hsbc +htc +huge +hugging +human +humans +hummingbird +hundreds +hungry +husky +hydrant +i +i don't know +ibm +ice +ice cream +icing +identification +illinois +in +in air +in back +in background +in basket +in bowl +in box +in cabbage town +in car +in corner +in cup +in field +in front +in grass +in hand +in her hand +in his hand +in middle +in motion +in sink +in sky +in snow +in stands +in street +in suitcase +in vase +in water +index +india +indian +indians +indoor +indoors +information +inside +intersection +iphone +ipod +ireland +iris +iron +island +it isn't +it's not +it's raining +italian +italy +ivory +ivy +j +jacket +jackets +jal +japan +japanese 
+jar +jeans +jeep +jelly +jesus +jet +jet ski +jetblue +jets +jockey +john +jones +joshua +jp morgan +juice +jump +jumped +jumping +jungle +junk +k +kangaroo +kawasaki +kayak +kayaking +kenmore +ketchup +ketchup and mustard +kettle +keyboard +keys +khaki +kia +kicking +kickstand +kid +kids +king +kissing +kitchen +kitchenaid +kite +kite flying +kite string +kites +kitesurfing +kiting +kitten +kiwi +klm +knee pads +kneeling +knife +knife and fork +knives +kodak +korean air +krispy kreme +l +la +lab +labrador +lace +lacoste +ladder +lady +ladybug +lake +lamb +lamp +lamps +land +landing +landscape +lanes +lanyard +lap +laptop +laptops +large +laughing +laundry +laying +laying down +lays +leaf +leaning +learning +leash +leather +leaves +leaving +left +left 1 +left and right +left side +leg +lego +legos +legs +lemon +lemonade +lemons +leopard +letters +lettuce +lexus +lg +library +license plate +licking +lid +life +life jacket +life vest +lifeguard +lift +light +lighter +lighthouse +lighting +lights +lilac +lilies +lily +lime +limes +lines +linoleum +lion +liquor +listening +listening to music +little +little girl +living +living room +lizard +loading +lobster +log +logitech +logo +logs +london +long +long sleeve +long time +looking +looking at camera +looking at phone +looking out window +los angeles +lot +lotion +lots +love +low +lufthansa +luggage +lunch +lying down +m +mac +macaroni +machine +mack +magazine +magazines +magnet +magnets +mailbox +main +main st +main street +makeup +male +males +mall +man +man in middle +man made +man on left +man on right +man's +mane +mango +mantle +many +map +maple +maple leaf +marble +marina +mariners +mario +marker +market +maroon +married +marshmallows +mask +mat +mattress +mayo +mayonnaise +mcdonald's +me +meat +meatballs +medium +meeting +men +men's +menu +meow +mercedes +mercedes benz +messy +metal +meter +metro +mets +mexican +mexico +miami +michigan +mickey mouse +microphone +microsoft +microwave +middle +middle 1 +military +milk +millions +minnie mouse +mint +mirror +mirrors +mississippi +mitsubishi +mitt +mixer +model +modern +mohawk +mom +monday +money +monitor +monkey +monster +moon +moped +more +morning +mosaic +moss +motel +mother +mother and child +motion +motocross +motor +motorbike +motorcycle +motorcycles +motorola +mound +mountain +mountain dew +mountainous +mountains +mouse +mouse pad +mouth +mouthwash +movement +movie +moving +mozzarella +mt airy +mud +muffin +muffins +mug +multi +multi colored +multicolored +multiple +mural +museum +mushroom +mushrooms +music +mustache +mustard +mutt +n +name +name tag +napkin +napkins +nasa +nathan's +national express +natural +nature +navy +neck +necklace +neither +neon +nest +net +never +new +new orleans +new york +news +newspaper +next to toilet +night +night time +nightstand +nighttime +nike +nikon +nintendo +nissan +no +no 1 +no cat +no clock +no dog +no flag +no grass +no hat +no left turn +no light +no man +no number +no parking +no plate +no shirt +no sign +no smoking +no train +no water +nobody +nokia +noodles +noon +normal +north +north america +north face +nose +not +not at all +not here +not high +not in service +not likely +not long +not possible +not sure +not there +not very +notebook +notes +nothing +now +nowhere +numbers +nursing +nuts +ny +o +oak +oar +oars +obama +ocean +octagon +octopus +off +office +oil +old +older +olives +ollie +olympics +omelet +on +on beach +on bed +on bench +on bike +on boat +on building +on bus +on car +on chair +on couch +on counter +on desk +on dresser +on 
elephant +on floor +on fridge +on grass +on ground +on his face +on his head +on horse +on laptop +on left +on man +on motorcycle +on napkin +on phone +on pizza +on plane +on plate +on pole +on rack +on right +on road +on rock +on runway +on shelf +on shore +on sidewalk +on sign +on sink +on skateboard +on stove +on street +on suitcase +on table +on toilet +on top +on tower +on track +on tracks +on train +on tray +on tree +on wall +on water +on woman +onion +onion rings +onions +only +opaque +open +opponent +orange +orange and black +orange and blue +orange and white +orange and yellow +orange juice +oranges +orchid +oregon +organic +oriental +orioles +ostrich +ottoman +out +out of focus +outdoor +outdoors +outfield +outside +oval +oven +over +over easy +overalls +overcast +owl +owner +p +pacific +pacifier +packing +paddle +paddle boarding +paddling +paint +painted +painting +paisley +pajamas +palm +palm tree +palm trees +pan +pancake +pancakes +panda +pans +pants +paper +paper towels +papers +parachute +parade +parakeet +parasailing +pare +paris +park +parked +parking +parking garage +parking lot +parking meter +parking meters +parmesan +parmesan cheese +parrot +parrots +parsley +partly cloudy +party +passenger +passengers +pasta +pastries +pastry +pasture +patio +patterned +paved +pavement +paw +pc +peace +peach +peaches +peacock +peanut butter +peanuts +pear +pearl +peas +pedestal +pedestrian +pedestrian crossing +pedestrians +pee +peeing +pelican +pelicans +pen +pencil +penguin +penne +pens +people +pepper +pepperoni +peppers +pepsi +persian +person +petting +petting horse +philadelphia +phillies +phone +phones +photo +photograph +photographer +photography +photoshop +piano +pickle +pickles +pickup +picnic +picnic table +picture +pictures +pie +pier +pig +pigeon +pigeons +pigtails +pillow +pillows +pilot +pine +pineapple +ping pong +pink +pink and black +pink and blue +pink and white +pink and yellow +pipe +pipes +pirate +pirates +pitbull +pitch +pitcher +pitching +pizza +pizza box +pizza cutter +pizza hut +placemat +plaid +plain +plane +planes +plant +planter +plants +plaster +plastic +plastic wrap +plate +plates +platform +play +play tennis +player +players +playing +playing baseball +playing frisbee +playing game +playing soccer +playing tennis +playing video game +playing video games +playing wii +playstation +plow +plunger +pm +pocket +pockets +pointing +polar +polar bear +polar bears +pole +poles +police +police officer +polka dot +polka dots +polo +pomeranian +pond +pony +ponytail +poodle +pool +poop +pooping +poor +porcelain +porch +pork +posing +post +poster +posts +pot +potato +potato salad +potatoes +pots +pottery +powdered +powdered sugar +power +power lines +practice +prince +print +printer +privacy +private +produce +professional +prom +propeller +protection +protest +public +public market center +pug +pull +puma +pumpkin +puppy +purple +purple and white +purse +qantas +qatar +queen +quilt +r +rabbit +race +racing +rack +racket +rackets +racquet +radiator +radio +radish +raft +rail +railing +railroad crossing +rain +rainbow +raining +rainy +ram +ramp +ranch +raspberries +raspberry +raw +rays +reading +real +rear +recently +recessed +recliner +rectangle +rectangles +red +red and black +red and blue +red and gray +red and green +red and silver +red and white +red and yellow +red bull +red light +red sox +red velvet +red white and blue +red white blue +reds +referee +reflection +refrigerator +refrigerators +regular +reins +relaxing +relish +remodeling +remote +remote control 
+remotes +residential +restaurant +resting +ribbon +rice +ride +riding +riding bike +riding bikes +riding elephant +riding horse +riding horses +riding motorcycle +right +right 1 +right hand +right side +ring +ring finger +ripe +river +road +roast beef +robe +robin +robot +rock +rocks +rocky +rodeo +rolex +roll +roman +roman numerals +roof +room +rooster +rope +rose +roses +rottweiler +rough +round +roundabout +rowing +rubber +rug +rugby +run +running +runway +rural +russia +russian +rust +rv +rye +s +sad +saddle +safari +safe +safety +sail +sailboat +sailboats +sailing +salad +salmon +salon +salt +salt and pepper +samsung +san diego +san francisco +sand +sandals +sandwich +sandwiches +santa +santa hat +sas +sauce +sauerkraut +sausage +savannah +savory +scale +scania +scarf +scenery +schnauzer +school +school bus +scissors +scooter +scrambled +scratching +screen +seafood +seagull +seagulls +seat +seattle +seaweed +second +security +sedan +seeds +selfie +selling +semi +sepia +serious +serve +serving +sesame +sesame seeds +setting +several +sewing +shade +shadow +shadows +shaking hands +shallow +shampoo +shape +shark +shaved +shearing +shed +sheep +sheepdog +sheet +sheets +shelf +shell +shells +shelter +shelves +shepherd +shih tzu +shingles +ship +shirt +shirt and tie +shirts +shoe +shoes +shop +shopping +shopping cart +shore +short +shorter +shorts +shoulder +show +shower +shower curtain +shower head +shrimp +shut +siamese +siblings +side +side of road +sidecar +sidewalk +sideways +sign +signs +silk +silver +silver and black +silver and red +silverware +singapore +singing +single +single engine +singles +sink +sitting +size +skate +skate park +skateboard +skateboarder +skateboarding +skateboards +skatepark +skating +skeleton +ski +ski boots +ski lift +ski pole +ski poles +ski resort +ski slope +skier +skiers +skiing +skirt +skis +skull +skull and crossbones +sky +skyscraper +skyscrapers +slacks +sled +sleep +sleeping +sleeve +sliced +slide +sliding +slippers +slope +slow +slow down +small +smaller +smartphone +smile +smiley face +smiling +smoke +smoking +smooth +smoothie +snake +sneakers +sniffing +snow +snowboard +snowboarder +snowboarding +snowboards +snowflakes +snowing +snowsuit +snowy +soap +soccer +soccer ball +soccer field +socks +soda +sofa +soft +softball +soldier +soldiers +solid +someone +sony +sony ericsson +soon +soup +south +southwest +space +space needle +space shuttle +spaghetti +spanish +sparrow +spatula +speaker +speakers +spectators +speed limit +spices +spider +spiderman +spinach +spiral +spoon +spoons +sports +spots +spotted +spray paint +spring +sprinkles +sprint +sprite +square +squares +squash +squatting +squirrel +st patrick's day +stability +stadium +stagecoach +stained glass +stainless steel +stairs +stand +standing +standing still +stands +star +star alliance +star wars +starbucks +staring +stars +state farm +station +statue +statues +steak +steam +steamed +steel +steeple +steering wheel +steps +stew +stick +sticker +stickers +sticks +still +stir fry +stomach +stone +stones +stool +stop +stop light +stop sign +stopped +stopping +storage +store +stork +storm +stove +straight +straight ahead +strap +straw +strawberries +strawberry +street +street light +street name +street sign +stretching +strike +string +stripe +striped +stripes +stroller +stucco +student +students +stuffed +stuffed animal +stuffed animals +style +styrofoam +sub +subway +sugar +suit +suitcase +suitcases +suits +summer +sun +sun hat +sunbathing +sunflower +sunflowers +sunglasses +sunlight +sunny 
+sunrise +sunset +supreme +surf +surfboard +surfboards +surfer +surfers +surfing +surprise +surprised +sushi +suspenders +suv +suzuki +swan +swans +sweat +sweatband +sweater +sweatshirt +sweet +sweet potato +swim +swim trunks +swimming +swimsuit +swing +swinging +swinging bat +swirls +swiss +switzerland +sydney +syrup +t +t shirt +t shirt and jeans +tabby +table +tablecloth +tables +tablet +tag +tags +tail +take off +taking off +taking photo +taking picture +taking pictures +taking selfie +talking +talking on phone +tall +taller +tam +tan +tank +tank top +tape +target +tarmac +tarp +tater tots +tattoo +tattoos +taxi +tea +teacher +teal +team +teddy +teddy bear +teddy bears +teeth +telephone +television +tell time +telling time +tennis +tennis ball +tennis court +tennis player +tennis racket +tennis rackets +tennis racquet +tennis shoes +tent +tents +terrier +texas +texting +thai +thailand +thanksgiving +theater +they aren't +thick +thin +thomas +thoroughbred +thousands +throw +throw ball +throw frisbee +throwing +throwing frisbee +thumb +thumbs up +tiara +tie +tie dye +ties +tiger +tigers +tile +tiled +tiles +tim hortons +time +tinkerbell +tire +tired +tires +tissue +tissues +to catch ball +to catch frisbee +to dry +to eat +to get to other side +to hit ball +to left +to right +to see +toast +toasted +toaster +toaster oven +toilet +toilet brush +toilet paper +toiletries +toilets +tokyo +tomato +tomatoes +tongs +tongue +tools +toothbrush +toothbrushes +toothpaste +toothpick +toothpicks +top +top hat +top left +top right +toronto +toshiba +tour +tourist +tow +tow truck +toward +towards +towel +towels +tower +towing +town +toy +toyota +toys +track +tracks +tractor +traffic +traffic light +traffic lights +trailer +train +train car +train station +train tracks +trains +transport +transportation +trash +trash can +travel +traveling +tray +tree +tree branch +trees +triangle +triangles +trick +tripod +triumph +trolley +tropical +tropicana +truck +trucks +trunk +trunks +tub +tube +tugboat +tulip +tulips +tuna +tunnel +turkey +turn +turn right +turning +turtle +tusks +tuxedo +tv +tv stand +twin +twins +tying tie +typing +uk +umbrella +umbrellas +umpire +unclear +under +under armour +under sink +under table +under tree +uniform +uniforms +union station +united +united states +unknown +unsure +up +uphill +upright +ups +upside down +urban +urinal +urinals +us +us air force +us airways +us airways express +us open +usa +used +using computer +using laptop +utensils +v +vacation +vaio +valentine's day +van +vanilla +vans +vase +vases +vegetable +vegetables +vegetarian +veggie +veggies +vehicles +venice +vent +verizon +vertical +very +very big +very deep +very fast +very high +very long +very old +very tall +vest +vests +victoria +victorian +video +video game +vines +virgin +virgin atlantic +visibility +visilab +visor +volkswagen +volleyball +volvo +w +waffle +wagon +waiting +wakeboard +walgreens +walk +walking +wall +wall st +wallet +wallpaper +war +warm +warmth +warning +washing +washington +washington dc +washington monument +watch +watch tv +watching +watching tv +water +water bottle +water ski +water skiing +water skis +watermark +watermelon +wave +waves +waving +wavy +wax +wax paper +weather vane +website +wedding +weeds +welcome +west +western +westin +westjet +wet +wetsuit +wetsuits +whale +wheat +wheel +wheelchair +wheelie +wheels +whipped cream +whirlpool +white +white and black +white and blue +white and brown +white and gray +white and green +white and orange +white and pink +white and red 
+white and yellow +white house +whole +wicker +wide +wii +wii controller +wii controllers +wii remote +wii remotes +wiimote +wild +wildebeest +willow +wilson +wind +windmill +window +window sill +windows +windowsill +windsor +windsurfing +windy +wine +wine bottle +wine glass +wine glasses +wine tasting +wing +wings +winnie pooh +winter +wire +wireless +wires +wisconsin +woman +woman's +women +women's +wood +wooden +woodpecker +woods +wool +words +work +working +worms +wreath +wrist +wristband +writing +x +xbox +y +yacht +yamaha +yankees +yard +yarn +years +yellow +yellow and black +yellow and blue +yellow and green +yellow and orange +yellow and red +yellow and white +yes +yield +yogurt +young +younger +zebra +zebra and giraffe +zebras +zig zag +zipper +zoo +zucchini From 0d8fc63ee1be004ca42af3feabe49dd428031557 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 23:10:48 -0500 Subject: [PATCH 07/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index b43a6a8c2..39c3e9bed 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -65,16 +65,15 @@ def __init__(self, encoder, embed_dim, num_labels): self.encoder = encoder - self.logits_proj = nn.Linear(embed_dim * 2, 1536) #size of answer space - - self.logits_2 = nn.Linear(1536, num_classes) + self.fc1 = nn.Linear(embed_dim * 2, 1536) #size of answer space + self.fc2 = nn.Linear(1536, num_classes) def forward(self, image, text): # CLIP doesn't have a multimodal encoder, so we concatenate the features text_features = self.encoder.encode_text(text) image_features = self.encoder.encode_image(image) multimodal_features = torch.cat([image_features, text_features], dim=-1) - layer1 = nn.GeLU(self.logits_proj(multimodal_features), dim=-1) - logits = self.logits_2(layer1) + layer = F.relu(self.fc1(multimodal_features)) + logits = self.fc2(layer) return logits class EarlyStopping: @@ -194,7 +193,7 @@ def parse_args(args): "--workers", type=int, default=2, help="Number of dataloader workers per GPU." ) parser.add_argument( - "--batch-size", type=int, default=32, help="Batch size per GPU." + "--batch-size", type=int, default=64, help="Batch size per GPU." ) parser.add_argument( "--epochs", type=int, default=10, help="Number of epochs to train for." From ba74bd29fc707f0d1050886c090d200078cacfdc Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 23:21:58 -0500 Subject: [PATCH 08/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 39c3e9bed..6742c8ca2 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -198,7 +198,7 @@ def parse_args(args): parser.add_argument( "--epochs", type=int, default=10, help="Number of epochs to train for." 
) - parser.add_argument("--lr", type=float, default=3e-5, help="Learning rate.") + parser.add_argument("--lr", type=float, default=3e-3, help="Learning rate.") parser.add_argument("--beta1", type=float, default=0.9, help="Adam beta 1.") parser.add_argument("--beta2", type=float, default=0.999, help="Adam beta 2.") parser.add_argument("--eps", type=float, default=1e-8, help="Adam epsilon.") From 5e0e093c9179c85a1e30adbcc2fe8288b35b4d6a Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 23:24:10 -0500 Subject: [PATCH 09/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 6742c8ca2..6cf010296 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -13,6 +13,7 @@ from training.train import AverageMeter import evaluate from sklearn import preprocessing +import numpy as np from datasets import load_dataset_builder from datasets import load_dataset From 3f91879fbb489d2c2b39b52a1a2cafb0b1d8b69b Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 23:27:57 -0500 Subject: [PATCH 10/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 6cf010296..3e9e9ac00 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -5,6 +5,7 @@ from PIL import Image from torch import nn from torch.utils.data import Dataset, DataLoader +import torch.nn.functional as F from tqdm import tqdm import open_clip @@ -302,4 +303,5 @@ def parse_args(args): metric_name=args.early_stop_metric_name, ) -val_metrics, end_training = train_one_epoch(clf, data, 1, optim, scheduler, early_stop, device, args) +for epoch in range(10): + val_metrics, end_training = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) From f00bff25f585a55d16b62f16d99873cc1240dd8a Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 23:33:46 -0500 Subject: [PATCH 11/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 3e9e9ac00..d19b3a709 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -276,6 +276,8 @@ def parse_args(args): answers = np.array(answers) dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) +dataset_df = dataset_df[0:12800] + args = parse_args([]) device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") From ca8d44ea4b791d950ca0a0a39df14eddc01b1231 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 23:40:29 -0500 Subject: [PATCH 12/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index d19b3a709..164082bae 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -200,7 +200,7 @@ def parse_args(args): parser.add_argument( "--epochs", type=int, default=10, help="Number of epochs to train for." 
) - parser.add_argument("--lr", type=float, default=3e-3, help="Learning rate.") + parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.") parser.add_argument("--beta1", type=float, default=0.9, help="Adam beta 1.") parser.add_argument("--beta2", type=float, default=0.999, help="Adam beta 2.") parser.add_argument("--eps", type=float, default=1e-8, help="Adam epsilon.") @@ -276,7 +276,7 @@ def parse_args(args): answers = np.array(answers) dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) -dataset_df = dataset_df[0:12800] +dataset_df = dataset_df[0:128000] args = parse_args([]) From ba822ee09be24363d06bb0739af003bdaa6165af Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 13 Feb 2023 23:44:49 -0500 Subject: [PATCH 13/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 164082bae..b19b3579e 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -129,6 +129,7 @@ def compute_metrics(model, dataloader, device, args): #metrics = metric.compute() metrics = {} metrics["loss"] = val_loss / samples_seen + return metrics def train_single_epoch(model, data, optimizer, args): @@ -180,8 +181,8 @@ def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device progress_bar.close() metrics = compute_metrics(model, data["train"], device, args) - end_training = early_stop.step(metrics) - return metrics, end_training + #end_training = early_stop.step(metrics) + return metrics def parse_args(args): parser = argparse.ArgumentParser() @@ -306,4 +307,4 @@ def parse_args(args): ) for epoch in range(10): - val_metrics, end_training = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) + val_metrics = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) From ff8aadafafa2e4270b024608c1e5909d9711b698 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Tue, 14 Feb 2023 00:42:59 -0500 Subject: [PATCH 14/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index b19b3579e..8ca897488 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -68,13 +68,14 @@ def __init__(self, encoder, embed_dim, num_labels): self.encoder = encoder self.fc1 = nn.Linear(embed_dim * 2, 1536) #size of answer space + self.lnorm = nn.LayerNorm(1536) self.fc2 = nn.Linear(1536, num_classes) def forward(self, image, text): # CLIP doesn't have a multimodal encoder, so we concatenate the features text_features = self.encoder.encode_text(text) image_features = self.encoder.encode_image(image) multimodal_features = torch.cat([image_features, text_features], dim=-1) - layer = F.relu(self.fc1(multimodal_features)) + layer = self.lnorm(F.relu(self.fc1(multimodal_features))) logits = self.fc2(layer) return logits @@ -277,7 +278,7 @@ def parse_args(args): answers = np.array(answers) dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) -dataset_df = dataset_df[0:128000] +dataset_df = dataset_df[0:12800] args = parse_args([]) @@ -306,5 +307,5 @@ def parse_args(args): 
metric_name=args.early_stop_metric_name, ) -for epoch in range(10): +for epoch in range(20): val_metrics = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) From cfe6b72916d6644f4441a9afd2d018f0f44c6dc8 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Wed, 15 Feb 2023 20:56:22 -0500 Subject: [PATCH 15/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 155 +++++++++++++++++----------------- 1 file changed, 78 insertions(+), 77 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 8ca897488..2f6cfe982 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -41,23 +41,57 @@ def __getitem__(self, idx): } def get_task_dataloaders(df, transforms, args): - tokenizer = get_tokenizer(args.model) + answer_space = [] + with open('answers_vqa.txt') as f: + for line in f: + answer_space.append(line.strip()) + answer_space = np.array(answer_space) + + labelencoder = preprocessing.LabelEncoder() + labelencoder.fit(answer_space) + num_classes = len(list(labelencoder.classes_)) + answer_set = set(labelencoder.classes_) + + tokenizer = get_tokenizer(args.model) dataloaders = {} - dataset = VQATextDataset(df, - "train", - transforms, - tokenizer=tokenizer, - ) - dataloader = DataLoader( - dataset, - batch_size=args.batch_size, - shuffle=True, - num_workers=args.workers, - pin_memory=True, - drop_last=True, - ) - dataloaders["train"] = dataloader + + for split in ["train", "validation", "test"] + dataset_train = load_dataset("HuggingFaceM4/VQAv2", split=split, cache_dir = "./sample_data") + dataset_df = dataset_train.to_pandas() + + class_id = [] + questions = [] + images = [] + answers = [] + for index, row in dataset_df.iterrows(): + if(row['multiple_choice_answer'] in answer_set): + class_id.append(row['question_id']) + questions.append(row['question']) + images.append(row['image']) + answers.append(row['multiple_choice_answer']) + class_id = np.array(class_id) + questions = np.array(questions) + images = np.array(images) + answers = np.array(answers) + + dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) + dataset_df = dataset_df[0:12800] + + dataset = VQATextDataset(df, + spliot, + transforms, + tokenizer=tokenizer, + ) + dataloader = DataLoader( + dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.workers, + pin_memory=True, + drop_last=True, + ) + dataloaders[split] = dataloader return dataloaders @@ -247,65 +281,32 @@ def parse_args(args): args = parser.parse_args(args) return args -dataset_train = load_dataset("HuggingFaceM4/VQAv2", split="train", cache_dir = "./sample_data") - -dataset_df = dataset_train.to_pandas() - -answer_space = [] -with open('answers_vqa.txt') as f: - for line in f: - answer_space.append(line.strip()) -answer_space = np.array(answer_space) - -labelencoder = preprocessing.LabelEncoder() -labelencoder.fit(answer_space) -num_classes = len(list(labelencoder.classes_)) - -answer_set = set(labelencoder.classes_) -class_id = [] -questions = [] -images = [] -answers = [] -for index, row in dataset_df.iterrows(): - if(row['multiple_choice_answer'] in answer_set): - class_id.append(row['question_id']) - questions.append(row['question']) - images.append(row['image']) - answers.append(row['multiple_choice_answer']) -class_id = np.array(class_id) -questions = np.array(questions) -images = np.array(images) -answers = np.array(answers) - 
-dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) -dataset_df = dataset_df[0:12800] - - -args = parse_args([]) -device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - -model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms( - args.model, - args.pretrained, - precision=args.precision, - device=device, -) -model_cfg = open_clip.factory.get_model_config(args.model) -embed_dim = model_cfg["embed_dim"] - -data = get_task_dataloaders(dataset_df, preprocess_val, args) - -clf_cls = CLIPMultimodalClassifier -clf = clf_cls(model, embed_dim, num_classes).to(device) -optim = torch.optim.AdamW(clf.parameters(), lr=args.lr, weight_decay=args.wd) - -total_steps = len(data["train"]) * args.epochs -scheduler = cosine_lr(optim, args.lr, args.warmup, total_steps) -early_stop = EarlyStopping( # greater metric value is better - patience=args.early_stop_patience, - threshold=args.early_stop_threshold, - metric_name=args.early_stop_metric_name, -) - -for epoch in range(20): - val_metrics = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) +if __name__ == "__main__": + args = parse_args([]) + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + + model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms( + args.model, + args.pretrained, + precision=args.precision, + device=device, + ) + model_cfg = open_clip.factory.get_model_config(args.model) + embed_dim = model_cfg["embed_dim"] + + data = get_task_dataloaders(dataset_df, preprocess_val, args) + + clf_cls = CLIPMultimodalClassifier + clf = clf_cls(model, embed_dim, num_classes).to(device) + optim = torch.optim.AdamW(clf.parameters(), lr=args.lr, weight_decay=args.wd) + + total_steps = len(data["train"]) * args.epochs + scheduler = cosine_lr(optim, args.lr, args.warmup, total_steps) + early_stop = EarlyStopping( # greater metric value is better + patience=args.early_stop_patience, + threshold=args.early_stop_threshold, + metric_name=args.early_stop_metric_name, + ) + + for epoch in range(20): + val_metrics = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) From 3d809364cc6dbbe3e98960254e017531825b2d03 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:08:39 -0500 Subject: [PATCH 16/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 2f6cfe982..f17a10634 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -40,7 +40,7 @@ def __getitem__(self, idx): 'label': torch.tensor(label) } -def get_task_dataloaders(df, transforms, args): +def get_task_dataloaders(path, transforms, args): answer_space = [] with open('answers_vqa.txt') as f: for line in f: @@ -57,7 +57,7 @@ def get_task_dataloaders(df, transforms, args): dataloaders = {} for split in ["train", "validation", "test"] - dataset_train = load_dataset("HuggingFaceM4/VQAv2", split=split, cache_dir = "./sample_data") + dataset_train = load_dataset(path, split=split, cache_dir = "./sample_data") dataset_df = dataset_train.to_pandas() class_id = [] @@ -294,7 +294,7 @@ def parse_args(args): model_cfg = open_clip.factory.get_model_config(args.model) embed_dim = model_cfg["embed_dim"] - data = 
get_task_dataloaders(dataset_df, preprocess_val, args) + data = get_task_dataloaders("HuggingFaceM4/VQAv2", preprocess_val, args) clf_cls = CLIPMultimodalClassifier clf = clf_cls(model, embed_dim, num_classes).to(device) From 57a91d743eed44f2dd7441d7f42f001a56b9cf10 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:18:21 -0500 Subject: [PATCH 17/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index f17a10634..2e7a15c45 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -56,7 +56,7 @@ def get_task_dataloaders(path, transforms, args): tokenizer = get_tokenizer(args.model) dataloaders = {} - for split in ["train", "validation", "test"] + for split in ["train", "validation", "test"]: dataset_train = load_dataset(path, split=split, cache_dir = "./sample_data") dataset_df = dataset_train.to_pandas() From 49420267c7713498f03198cef04715431ee87298 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:20:43 -0500 Subject: [PATCH 18/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 2e7a15c45..20cfc21e9 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -78,7 +78,7 @@ def get_task_dataloaders(path, transforms, args): dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) dataset_df = dataset_df[0:12800] - dataset = VQATextDataset(df, + dataset = VQATextDataset(dataset_df, spliot, transforms, tokenizer=tokenizer, From 2a43247c5899fa8189bc57419b38c77da8e34375 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:21:46 -0500 Subject: [PATCH 19/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 20cfc21e9..430edaebe 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -79,7 +79,7 @@ def get_task_dataloaders(path, transforms, args): dataset_df = dataset_df[0:12800] dataset = VQATextDataset(dataset_df, - spliot, + split, transforms, tokenizer=tokenizer, ) From b17e5bc32e211eed580ee94258a9b32671cbd839 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:23:52 -0500 Subject: [PATCH 20/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 430edaebe..1b2c951df 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -20,7 +20,7 @@ from datasets import load_dataset class VQATextDataset(Dataset): - def __init__(self, df, split, transforms, tokenizer=None): + def __init__(self, df, split, transforms, labelencoder, tokenizer=None): self.df = df self.transforms = transforms self.tokenize = tokenizer @@ -81,6 +81,7 @@ def get_task_dataloaders(path, transforms, args): dataset = VQATextDataset(dataset_df, split, transforms, + labelencoder, tokenizer=tokenizer, ) dataloader = DataLoader( From 
fe65a7d53732bb169d8a24ef923083c118a79cfe Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:31:25 -0500 Subject: [PATCH 21/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 1b2c951df..8f5b5645e 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -56,7 +56,7 @@ def get_task_dataloaders(path, transforms, args): tokenizer = get_tokenizer(args.model) dataloaders = {} - for split in ["train", "validation", "test"]: + for split in ["train", "validation"]: dataset_train = load_dataset(path, split=split, cache_dir = "./sample_data") dataset_df = dataset_train.to_pandas() From ea4c0a805264ab1acef089e65298b0c79fa9538c Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:34:00 -0500 Subject: [PATCH 22/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 8f5b5645e..ce09adbbb 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -40,19 +40,7 @@ def __getitem__(self, idx): 'label': torch.tensor(label) } -def get_task_dataloaders(path, transforms, args): - answer_space = [] - with open('answers_vqa.txt') as f: - for line in f: - answer_space.append(line.strip()) - answer_space = np.array(answer_space) - - labelencoder = preprocessing.LabelEncoder() - labelencoder.fit(answer_space) - num_classes = len(list(labelencoder.classes_)) - - answer_set = set(labelencoder.classes_) - +def get_task_dataloaders(path, transforms, labelencoder, args): tokenizer = get_tokenizer(args.model) dataloaders = {} @@ -294,8 +282,20 @@ def parse_args(args): ) model_cfg = open_clip.factory.get_model_config(args.model) embed_dim = model_cfg["embed_dim"] + + answer_space = [] + with open('answers_vqa.txt') as f: + for line in f: + answer_space.append(line.strip()) + answer_space = np.array(answer_space) + + labelencoder = preprocessing.LabelEncoder() + labelencoder.fit(answer_space) + num_classes = len(list(labelencoder.classes_)) - data = get_task_dataloaders("HuggingFaceM4/VQAv2", preprocess_val, args) + answer_set = set(labelencoder.classes_) + + data = get_task_dataloaders("HuggingFaceM4/VQAv2", preprocess_val, labelencoder, args) clf_cls = CLIPMultimodalClassifier clf = clf_cls(model, embed_dim, num_classes).to(device) From a042ea6c633d3393467265f827f7aad1c1293f33 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:41:48 -0500 Subject: [PATCH 23/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index ce09adbbb..08844d26d 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -142,17 +142,17 @@ def compute_metrics(model, dataloader, device, args): samples_seen += text.shape[0] logits = model(image, text) - #predictions = torch.argmax(logits) + predictions = torch.argmax(logits) batch_val_loss = loss_fn(logits, label) val_loss += batch_val_loss.item() - #metric.add_batch( - #predictions=predictions.cpu().numpy(), - #references=label.cpu().numpy(), - #) + 
metric.add_batch( + predictions=predictions.cpu().numpy(), + references=label.cpu().numpy(), + ) model.train() - #metrics = metric.compute() + metrics = metric.compute() metrics = {} - metrics["loss"] = val_loss / samples_seen + metrics["accuracy"] = val_loss / samples_seen return metrics @@ -196,17 +196,17 @@ def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device if (i % args.val_frequency) == 0 and i > 0: print(loss) - metrics = compute_metrics(model, data["train"], device, args) + metrics = compute_metrics(model, data["validation"], device, args) - #end_training = early_stop.step(metrics) - #if end_training: - #progress_bar.close() - #return metrics, end_training + end_training = early_stop.step(metrics) + if end_training: + progress_bar.close() + return metrics, end_training progress_bar.close() - metrics = compute_metrics(model, data["train"], device, args) - #end_training = early_stop.step(metrics) - return metrics + metrics = compute_metrics(model, data["validation"], device, args) + end_training = early_stop.step(metrics) + return metrics, end_training def parse_args(args): parser = argparse.ArgumentParser() @@ -310,4 +310,4 @@ def parse_args(args): ) for epoch in range(20): - val_metrics = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) + val_metrics, end_training = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) From c862b83cf4313e9f1f241ae2a9793c6018741b89 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:52:09 -0500 Subject: [PATCH 24/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 08844d26d..702b95d51 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -143,6 +143,7 @@ def compute_metrics(model, dataloader, device, args): logits = model(image, text) predictions = torch.argmax(logits) + print(predictions) batch_val_loss = loss_fn(logits, label) val_loss += batch_val_loss.item() metric.add_batch( From 12ca1ea57d7cdc113aae12df50b0b673c21d26bc Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:54:30 -0500 Subject: [PATCH 25/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 702b95d51..05f756b41 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -141,7 +141,8 @@ def compute_metrics(model, dataloader, device, args): label = batch["label"].to(device) samples_seen += text.shape[0] logits = model(image, text) - + print(logits) + print(label) predictions = torch.argmax(logits) print(predictions) batch_val_loss = loss_fn(logits, label) From 7f31b1a52f53e1da6f29e7e2b8d589e67beb94e4 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Thu, 16 Feb 2023 00:58:00 -0500 Subject: [PATCH 26/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 05f756b41..cb8b40327 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -143,7 +143,7 @@ def compute_metrics(model, dataloader, device, args): logits = model(image, text) print(logits) 
             print(label)
-            predictions = torch.argmax(logits)
+            predictions = torch.argmax(logits, dim=-1)
             print(predictions)
             batch_val_loss = loss_fn(logits, label)
             val_loss += batch_val_loss.item()

From 1889ab448a6b6854deb04d489e52e9ac73e3c128 Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Thu, 16 Feb 2023 01:02:13 -0500
Subject: [PATCH 27/44] Update vqa_fine_tune.py

---
 src/training/vqa_fine_tune.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py
index cb8b40327..4ee975ea9 100644
--- a/src/training/vqa_fine_tune.py
+++ b/src/training/vqa_fine_tune.py
@@ -64,7 +64,7 @@ def get_task_dataloaders(path, transforms, labelencoder, args):
         answers = np.array(answers)
 
         dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers})
-        dataset_df = dataset_df[0:12800]
+        #dataset_df = dataset_df[0:12800]
 
@@ -141,10 +141,7 @@ def compute_metrics(model, dataloader, device, args):
             label = batch["label"].to(device)
             samples_seen += text.shape[0]
             logits = model(image, text)
-            print(logits)
-            print(label)
             predictions = torch.argmax(logits, dim=-1)
-            print(predictions)
             batch_val_loss = loss_fn(logits, label)
             val_loss += batch_val_loss.item()
             metric.add_batch(

From 5305992d33191bb405f3c19350bbd39e3f742dd2 Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Thu, 16 Feb 2023 01:17:22 -0500
Subject: [PATCH 28/44] Update vqa_fine_tune.py

---
 src/training/vqa_fine_tune.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py
index 4ee975ea9..f665b8752 100644
--- a/src/training/vqa_fine_tune.py
+++ b/src/training/vqa_fine_tune.py
@@ -196,7 +196,7 @@ def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device
         if (i % args.val_frequency) == 0 and i > 0:
             print(loss)
             metrics = compute_metrics(model, data["validation"], device, args)
-
+            print(metrics["accuracy"])
             end_training = early_stop.step(metrics)
             if end_training:
                 progress_bar.close()

From 4c89bb119709db041eb925a69f1ae2321bde81ea Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Thu, 16 Feb 2023 01:23:38 -0500
Subject: [PATCH 29/44] Update vqa_fine_tune.py

---
 src/training/vqa_fine_tune.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py
index f665b8752..9a02d1640 100644
--- a/src/training/vqa_fine_tune.py
+++ b/src/training/vqa_fine_tune.py
@@ -193,7 +193,7 @@ def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device
         progress_bar.set_description(f"Loss: {loss.item():.4f}")
         progress_bar.update(1)
 
-        if (i % args.val_frequency) == 0 and i > 0:
+        if (i % args.val_frequency) == 0 and i > 5:
             print(loss)
             metrics = compute_metrics(model, data["validation"], device, args)
             print(metrics["accuracy"])
@@ -233,7 +233,7 @@ def parse_args(args):
         "--warmup", type=int, default=200, help="Number of steps to warmup for."
     )
     parser.add_argument(
-        "--val-frequency", type=int, default=30, help="How often to run evaluation with val data."
+        "--val-frequency", type=int, default=300, help="How often to run evaluation with val data."
     )
     parser.add_argument(
         "--early-stop-patience", type=int, default=5, help="Early stopping patience."
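Note on the dim fix in PATCH 26 above: torch.argmax(logits) with no dim argument flattens the whole [batch_size, num_classes] tensor and returns a single scalar index, so the per-question predictions (and the accuracy computed from them) were wrong; dim=-1 takes the argmax over the answer classes for each row. A minimal sketch of the difference, on toy values rather than repo code:

import torch

# Toy logits for 2 questions over 3 answer classes.
logits = torch.tensor([[0.1, 2.0, 0.3],
                       [1.5, 0.2, 0.1]])

print(torch.argmax(logits))          # tensor(1): index into the flattened tensor
print(torch.argmax(logits, dim=-1))  # tensor([1, 0]): one class id per question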
From b9fcfc2f4664210c999a3031fc08e559cccc3cb0 Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Thu, 16 Feb 2023 01:43:40 -0500
Subject: [PATCH 30/44] Update vqa_fine_tune.py

---
 src/training/vqa_fine_tune.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py
index 9a02d1640..5e24f766e 100644
--- a/src/training/vqa_fine_tune.py
+++ b/src/training/vqa_fine_tune.py
@@ -144,6 +144,7 @@ def compute_metrics(model, dataloader, device, args):
             predictions = torch.argmax(logits, dim=-1)
             batch_val_loss = loss_fn(logits, label)
             val_loss += batch_val_loss.item()
+            print(val_loss)
             metric.add_batch(
                 predictions=predictions.cpu().numpy(),
                 references=label.cpu().numpy(),
@@ -219,7 +220,7 @@ def parse_args(args):
         "--workers", type=int, default=2, help="Number of dataloader workers per GPU."
     )
     parser.add_argument(
-        "--batch-size", type=int, default=64, help="Batch size per GPU."
+        "--batch-size", type=int, default=1024, help="Batch size per GPU."
     )
     parser.add_argument(
         "--epochs", type=int, default=10, help="Number of epochs to train for."

From 55e1501ae854d9e5a4824df2be93ed1f5bd3c078 Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Thu, 16 Feb 2023 01:45:29 -0500
Subject: [PATCH 31/44] Update vqa_fine_tune.py

---
 src/training/vqa_fine_tune.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py
index 5e24f766e..6459ba709 100644
--- a/src/training/vqa_fine_tune.py
+++ b/src/training/vqa_fine_tune.py
@@ -220,7 +220,7 @@ def parse_args(args):
         "--workers", type=int, default=2, help="Number of dataloader workers per GPU."
     )
     parser.add_argument(
-        "--batch-size", type=int, default=1024, help="Batch size per GPU."
+        "--batch-size", type=int, default=256, help="Batch size per GPU."
     )
     parser.add_argument(
         "--epochs", type=int, default=10, help="Number of epochs to train for."
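The batch-size churn in the two patches above (64 -> 1024 -> 256) reads like a search for the largest batch that fits in GPU memory. If the larger effective batch is actually wanted, gradient accumulation is one standard workaround; the following is only a sketch in the spirit of train_one_epoch, where accum_steps is a hypothetical knob rather than a flag in this script:

import torch
from torch import nn

def train_with_accumulation(model, loader, optimizer, device, accum_steps=4):
    # Emulate an effective batch of accum_steps * batch_size (e.g. 4 * 256 = 1024)
    # by summing gradients over several micro-batches before each optimizer step.
    loss_fn = nn.CrossEntropyLoss()
    model.train()
    optimizer.zero_grad()
    for i, batch in enumerate(loader):
        logits = model(batch["image"].to(device), batch["text"].to(device))
        # Scale so the accumulated gradient matches the large-batch average.
        loss = loss_fn(logits, batch["label"].to(device)) / accum_steps
        loss.backward()  # gradients accumulate in .grad across micro-batches
        if (i + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()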
From 008bfc59698b94129ea1c108ccace8fe15755e56 Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Thu, 16 Feb 2023 15:05:46 -0500
Subject: [PATCH 32/44] Update vqa_fine_tune.py

---
 src/training/vqa_fine_tune.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py
index 6459ba709..29abf3c8a 100644
--- a/src/training/vqa_fine_tune.py
+++ b/src/training/vqa_fine_tune.py
@@ -65,7 +65,9 @@ def get_task_dataloaders(path, transforms, labelencoder, args):
         dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers})
         #dataset_df = dataset_df[0:12800]
-
+        b_size = args.batch_size
+        if(split == "validation"):
+            b_size = args.batch_size
         dataset = VQATextDataset(dataset_df,
             split,
             transforms,
             labelencoder,
             tokenizer=tokenizer,
         )
         dataloader = DataLoader(
             dataset,
-            batch_size=args.batch_size,
+            batch_size=b_size,
             shuffle=True,
             num_workers=args.workers,
             pin_memory=True,

From cd009bd8c7af2c5132b62b952a7ff4593b76c1dd Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Thu, 23 Feb 2023 00:25:10 -0500
Subject: [PATCH 33/44] Update vqa_fine_tune.py

---
 src/training/vqa_fine_tune.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py
index 29abf3c8a..741398d0e 100644
--- a/src/training/vqa_fine_tune.py
+++ b/src/training/vqa_fine_tune.py
@@ -67,13 +67,14 @@ def get_task_dataloaders(path, transforms, labelencoder, args):
         #dataset_df = dataset_df[0:12800]
         b_size = args.batch_size
         if(split == "validation"):
-            b_size = args.batch_size
-            dataset = VQATextDataset(dataset_df,
-                split,
-                transforms,
-                labelencoder,
-                tokenizer=tokenizer,
-            )
+            b_size = args.batch_size * 20
+            dataset_df = dataset_df[0:12800]
+        dataset = VQATextDataset(dataset_df,
+            split,
+            transforms,
+            labelencoder,
+            tokenizer=tokenizer,
+        )
         dataloader = DataLoader(
             dataset,
             batch_size=b_size,
@@ -222,7 +223,7 @@ def parse_args(args):
         "--workers", type=int, default=2, help="Number of dataloader workers per GPU."
     )
     parser.add_argument(
-        "--batch-size", type=int, default=256, help="Batch size per GPU."
+        "--batch-size", type=int, default=128, help="Batch size per GPU."
     )
     parser.add_argument(
         "--epochs", type=int, default=10, help="Number of epochs to train for."
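PATCH 33 above evaluates with a validation batch 20x the training batch (b_size = args.batch_size * 20) and trims the validation split to 12,800 rows to keep evaluation cheap. The larger batch is safe because compute_metrics runs the model under torch.no_grad(), which skips building the autograd graph, so activations are not retained for a backward pass. A sketch of that pattern, assuming the model, loader, and device objects from this script:

import torch

def predict_validation(model, val_loader, device):
    # Inference-only pass: no autograd graph is built inside torch.no_grad(),
    # so a much larger batch fits in the same GPU memory as training.
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in val_loader:
            logits = model(batch["image"].to(device), batch["text"].to(device))
            all_preds.append(logits.argmax(dim=-1).cpu())  # answer id per question
    model.train()
    return torch.cat(all_preds)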
From 586cf0ac86568b6205e8218c59f15e70be1f972b Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Mon, 27 Feb 2023 13:50:21 -0500 Subject: [PATCH 34/44] bce_loss --- src/training/vqa_fine_tune.py | 92 ++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 35 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index 741398d0e..f4a3c0bd4 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -15,16 +15,17 @@ import evaluate from sklearn import preprocessing import numpy as np +import sys from datasets import load_dataset_builder from datasets import load_dataset class VQATextDataset(Dataset): - def __init__(self, df, split, transforms, labelencoder, tokenizer=None): + def __init__(self, df, split, transforms, answer_set, tokenizer=None): self.df = df self.transforms = transforms self.tokenize = tokenizer - self.labels = labelencoder.transform(df['multiple_choice_answer']) + self.num_classes = len(answer_set) def __len__(self): return len(self.df) @@ -33,14 +34,19 @@ def __getitem__(self, idx): img_path = item["image"]["path"] image = Image.open(str(img_path)) text = item["question"] - label = self.labels[idx] + target = np.zeros(self.num_classes) + for index, row in self.df.iterrows(): + target[row['answer_list']] = row['answer_weights'] return { 'image': self.transforms(image), 'text': self.tokenize([text])[0], - 'label': torch.tensor(label) + 'target': torch.tensor(target) } -def get_task_dataloaders(path, transforms, labelencoder, args): +def get_score(count: int) -> float: + return min(1.0, count / 3) + +def get_task_dataloaders(path, transforms, labelencoder, answer_set, args): tokenizer = get_tokenizer(args.model) dataloaders = {} @@ -52,29 +58,43 @@ def get_task_dataloaders(path, transforms, labelencoder, args): questions = [] images = [] answers = [] + weights = [] for index, row in dataset_df.iterrows(): - if(row['multiple_choice_answer'] in answer_set): + answer_count = {} + for answer in row['answers']: + answer_ = answer["answer"] + answer_count[answer_] = answer_count.get(answer_, 0) + 1 + labels = [] + scores = [] + for answer in answer_count: + if answer not in answer_set: + continue + labels.append(labelencoder.transform([answer])[0]) + score = get_score(answer_count[answer]) + scores.append(score) + if(len(labels) == 0): + continue class_id.append(row['question_id']) questions.append(row['question']) images.append(row['image']) - answers.append(row['multiple_choice_answer']) + answers.append(labels) + weights.append(scores) + class_id = np.array(class_id) questions = np.array(questions) images = np.array(images) - answers = np.array(answers) - - dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'multiple_choice_answer': answers}) + dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'answer_list': answers, 'answer_weights': weights}) #dataset_df = dataset_df[0:12800] b_size = args.batch_size if(split == "validation"): b_size = args.batch_size * 20 dataset_df = dataset_df[0:12800] - dataset = VQATextDataset(dataset_df, - split, - transforms, - labelencoder, - tokenizer=tokenizer, - ) + dataset = VQATextDataset(dataset_df, + split, + transforms, + answer_set, + tokenizer=tokenizer, + ) dataloader = DataLoader( dataset, batch_size=b_size, @@ -92,17 +112,19 @@ def __init__(self, encoder, embed_dim, num_labels): super().__init__() self.encoder = encoder + self.layers = nn.Sequential( + nn.Linear(embed_dim * 2, 1536), #size of 
answer space + nn.ReLU(inplace=True), + nn.LayerNorm(1536), + nn.Linear(1536, num_labels) + ) - self.fc1 = nn.Linear(embed_dim * 2, 1536) #size of answer space - self.lnorm = nn.LayerNorm(1536) - self.fc2 = nn.Linear(1536, num_classes) def forward(self, image, text): # CLIP doesn't have a multimodal encoder, so we concatenate the features text_features = self.encoder.encode_text(text) image_features = self.encoder.encode_image(image) multimodal_features = torch.cat([image_features, text_features], dim=-1) - layer = self.lnorm(F.relu(self.fc1(multimodal_features))) - logits = self.fc2(layer) + logits = self.layers(multimodal_features) return logits class EarlyStopping: @@ -136,16 +158,15 @@ def compute_metrics(model, dataloader, device, args): metric = evaluate.load("accuracy") val_loss = 0 samples_seen = 0 - loss_fn = nn.CrossEntropyLoss() for batch in dataloader: with torch.no_grad(): image = batch["image"].to(device) text = batch["text"].to(device) - label = batch["label"].to(device) + label = batch["target"].to(device) samples_seen += text.shape[0] logits = model(image, text) predictions = torch.argmax(logits, dim=-1) - batch_val_loss = loss_fn(logits, label) + batch_val_loss = nn.functional.binary_cross_entropy_with_logits(logits, label, reduction="mean") val_loss += batch_val_loss.item() print(val_loss) metric.add_batch( @@ -164,20 +185,18 @@ def train_single_epoch(model, data, optimizer, args): for i, batch in enumerate(data["train"]): image = batch["image"].to(device) text = batch["text"].to(device) - label = batch["label"].to(device) + label = batch["target"].to(device) logits = model(image, text) print(label.shape) print(logits.shape) - loss_fn = nn.CrossEntropyLoss() - loss = loss_fn(logits, label) + loss = nn.functional.binary_cross_entropy_with_logits(logits, label, reduction="mean") print(loss) loss.backward() def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device, args): model.train() - loss_fn = nn.CrossEntropyLoss() progress_bar = tqdm(total=len(data["train"])) for i, batch in enumerate(data["train"]): step = epoch * len(data["train"]) + i @@ -185,10 +204,10 @@ def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device image = batch["image"].to(device) text = batch["text"].to(device) - label = batch["label"].to(device) + label = batch["target"].to(device) logits = model(image, text) - loss = loss_fn(logits, label) #should be cross entropy + loss = nn.functional.binary_cross_entropy_with_logits(logits, label, reduction = "mean") #should be cross entropy optimizer.zero_grad() loss.backward() @@ -228,7 +247,7 @@ def parse_args(args): parser.add_argument( "--epochs", type=int, default=10, help="Number of epochs to train for." 
) - parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.") + parser.add_argument("--lr", type=float, default=1e-5, help="Learning rate.") parser.add_argument("--beta1", type=float, default=0.9, help="Adam beta 1.") parser.add_argument("--beta2", type=float, default=0.999, help="Adam beta 2.") parser.add_argument("--eps", type=float, default=1e-8, help="Adam epsilon.") @@ -273,8 +292,8 @@ def parse_args(args): args = parser.parse_args(args) return args -if __name__ == "__main__": - args = parse_args([]) +def main(args): + args = parse_args(args) device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms( @@ -287,7 +306,7 @@ def parse_args(args): embed_dim = model_cfg["embed_dim"] answer_space = [] - with open('answers_vqa.txt') as f: + with open('src/training/answers_vqa.txt') as f: for line in f: answer_space.append(line.strip()) answer_space = np.array(answer_space) @@ -298,7 +317,7 @@ def parse_args(args): answer_set = set(labelencoder.classes_) - data = get_task_dataloaders("HuggingFaceM4/VQAv2", preprocess_val, labelencoder, args) + data = get_task_dataloaders("HuggingFaceM4/VQAv2", preprocess_val, labelencoder, answer_set, args) clf_cls = CLIPMultimodalClassifier clf = clf_cls(model, embed_dim, num_classes).to(device) @@ -314,3 +333,6 @@ def parse_args(args): for epoch in range(20): val_metrics, end_training = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) + +if __name__ == "__main__": + main(sys.argv[1:]) \ No newline at end of file From b8452cd2c26f4d70f51485ac786afc0bd88852fc Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Tue, 28 Feb 2023 16:23:52 -0500 Subject: [PATCH 35/44] Update vqa_fine_tune.py --- src/training/vqa_fine_tune.py | 95 ++++++++++++++++------------------- 1 file changed, 43 insertions(+), 52 deletions(-) diff --git a/src/training/vqa_fine_tune.py b/src/training/vqa_fine_tune.py index f4a3c0bd4..e003136ff 100644 --- a/src/training/vqa_fine_tune.py +++ b/src/training/vqa_fine_tune.py @@ -21,11 +21,13 @@ from datasets import load_dataset class VQATextDataset(Dataset): - def __init__(self, df, split, transforms, answer_set, tokenizer=None): + def __init__(self, df, split, transforms, label_encoder, answer_set, tokenizer=None): self.df = df self.transforms = transforms self.tokenize = tokenizer self.num_classes = len(answer_set) + self.label_encoder = label_encoder + self.answer_set = answer_set def __len__(self): return len(self.df) @@ -34,9 +36,23 @@ def __getitem__(self, idx): img_path = item["image"]["path"] image = Image.open(str(img_path)) text = item["question"] + + answer_count = {} + for answer in item['answers']: + answer_ = answer["answer"] + answer_count[answer_] = answer_count.get(answer_, 0) + 1 + labels = [] + scores = [] + for answer in answer_count: + if answer not in self.answer_set: + continue + labels.append(self.label_encoder.transform([answer])[0]) + score = get_score(answer_count[answer]) + scores.append(score) target = np.zeros(self.num_classes) - for index, row in self.df.iterrows(): - target[row['answer_list']] = row['answer_weights'] + for label, score in zip(labels, scores): + target[label] = score + return { 'image': self.transforms(image), 'text': self.tokenize([text])[0], @@ -44,54 +60,25 @@ def __getitem__(self, idx): } def get_score(count: int) -> float: - return min(1.0, count / 3) + return min(1.0, count / 3.0) def 
get_task_dataloaders(path, transforms, labelencoder, answer_set, args): tokenizer = get_tokenizer(args.model) dataloaders = {} for split in ["train", "validation"]: - dataset_train = load_dataset(path, split=split, cache_dir = "./sample_data") + dataset_train = load_dataset(path, split=split, cache_dir = "./vqa_data") dataset_df = dataset_train.to_pandas() - - class_id = [] - questions = [] - images = [] - answers = [] - weights = [] - for index, row in dataset_df.iterrows(): - answer_count = {} - for answer in row['answers']: - answer_ = answer["answer"] - answer_count[answer_] = answer_count.get(answer_, 0) + 1 - labels = [] - scores = [] - for answer in answer_count: - if answer not in answer_set: - continue - labels.append(labelencoder.transform([answer])[0]) - score = get_score(answer_count[answer]) - scores.append(score) - if(len(labels) == 0): - continue - class_id.append(row['question_id']) - questions.append(row['question']) - images.append(row['image']) - answers.append(labels) - weights.append(scores) - - class_id = np.array(class_id) - questions = np.array(questions) - images = np.array(images) - dataset_df = pd.DataFrame({'question_id': class_id, 'question': questions, 'image': images, 'answer_list': answers, 'answer_weights': weights}) - #dataset_df = dataset_df[0:12800] + dataset_df = dataset_df[dataset_df.apply(lambda item: True if item['multiple_choice_answer'] in answer_set else False, axis=1)] + b_size = args.batch_size if(split == "validation"): - b_size = args.batch_size * 20 + b_size = args.batch_size * 10 dataset_df = dataset_df[0:12800] dataset = VQATextDataset(dataset_df, split, - transforms, + transforms, + labelencoder, answer_set, tokenizer=tokenizer, ) @@ -158,6 +145,8 @@ def compute_metrics(model, dataloader, device, args): metric = evaluate.load("accuracy") val_loss = 0 samples_seen = 0 + total_correct = 0 + for batch in dataloader: with torch.no_grad(): image = batch["image"].to(device) @@ -167,19 +156,21 @@ def compute_metrics(model, dataloader, device, args): logits = model(image, text) predictions = torch.argmax(logits, dim=-1) batch_val_loss = nn.functional.binary_cross_entropy_with_logits(logits, label, reduction="mean") + predictions=predictions.cpu().numpy() + references= label.cpu().numpy() val_loss += batch_val_loss.item() - print(val_loss) - metric.add_batch( - predictions=predictions.cpu().numpy(), - references=label.cpu().numpy(), - ) + for i, pred in enumerate(predictions): + total_correct += references[i][pred] + #print("total correct", total_correct, "seen", samples_seen) + model.train() - metrics = metric.compute() metrics = {} - metrics["accuracy"] = val_loss / samples_seen + metrics["accuracy"] = total_correct/ samples_seen + metrics["loss"] = val_loss / samples_seen return metrics +#Remove in final commit def train_single_epoch(model, data, optimizer, args): model.train() for i, batch in enumerate(data["train"]): @@ -219,11 +210,11 @@ def train_one_epoch(model, data, epoch, optimizer, scheduler, early_stop, device if (i % args.val_frequency) == 0 and i > 5: print(loss) metrics = compute_metrics(model, data["validation"], device, args) - print(metrics["accuracy"]) + print("accuracy", metrics["accuracy"]) end_training = early_stop.step(metrics) - if end_training: - progress_bar.close() - return metrics, end_training + #if end_training: + # progress_bar.close() + # return metrics, end_training progress_bar.close() metrics = compute_metrics(model, data["validation"], device, args) @@ -256,7 +247,7 @@ def parse_args(args): "--warmup", type=int, 
default=200, help="Number of steps to warmup for." ) parser.add_argument( - "--val-frequency", type=int, default=300, help="How often to run evaluation with val data." + "--val-frequency", type=int, default=100, help="How often to run evaluation with val data." ) parser.add_argument( "--early-stop-patience", type=int, default=5, help="Early stopping patience." @@ -335,4 +326,4 @@ def main(args): val_metrics, end_training = train_one_epoch(clf, data, epoch, optim, scheduler, early_stop, device, args) if __name__ == "__main__": - main(sys.argv[1:]) \ No newline at end of file + main(sys.argv[1:]) From d45ab6f8ec8eb1668b1962b82a8824627f1fa54e Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Tue, 28 Feb 2023 16:24:35 -0500 Subject: [PATCH 36/44] Update vqa_fine_tune.py From 956232f69bedabefc19e75f5eca191d6427b3ef0 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Fri, 24 Mar 2023 12:13:02 -0400 Subject: [PATCH 37/44] Create kilogram.py --- src/open_clip/kilogram.py | 84 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 src/open_clip/kilogram.py diff --git a/src/open_clip/kilogram.py b/src/open_clip/kilogram.py new file mode 100644 index 000000000..8ba55b58d --- /dev/null +++ b/src/open_clip/kilogram.py @@ -0,0 +1,84 @@ +import argparse + +import numpy as np +import pandas as pd +import torch +from PIL import Image +from torch import nn +from torch.utils.data import Dataset, DataLoader +from tqdm import tqdm + +import open_clip +from open_clip.factory import get_tokenizer +from training.scheduler import cosine_lr +from training.train import AverageMeter +import evaluate +import random +import json + + +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +with open("kilogram/development/texts/controlled/whole+black.json") as f: + file_contents = json.load(f) +print(file_contents.keys()) + +def sample(texts, images, r, k = 10): + sampled_images = [images[r]] + sampled_text = [texts[r]] + + while(len(sampled_images) < k): + r = random.randint(0, len(images)-1) + if(images[r] in sampled_images or texts[r] in sampled_text): + continue + sampled_images.append(images[r]) + sampled_text.append(texts[r]) + return (sampled_text, sampled_images) + +class CLIPMultimodalClassifier(nn.Module): + def __init__(self, encoder): + super().__init__() + + self.encoder = encoder + def forward(self, image, text): + # CLIP doesn't have a multimodal encoder, so we concatenate the features + text_features = self.encoder.encode_text(text) + image_features = self.encoder.encode_image(image) + + return torch.dot(text_features, image_features) +model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms( + "ViT-B-32-quickgelu", + 'laion400m_e32', + precision="amp", + device=device, +) +clf_cls = CLIPMultimodalClassifier +clf = clf_cls(model).to(device) +transforms = preprocess_val +tokenizer = get_tokenizer("ViT-B-32-quickgelu") + +total_correct = 0 +total_games = 0 +for i in range(len(file_contents['texts'])): + sample_first = sample(file_contents['texts'], file_contents['images'], i) + images = [] + texts = [] + + for i in range(len(sample_first[0])): + image = Image.open("kilogram/development/images/black/" + sample_first[1][i] + ".png") + + images.append(transforms(image)) + texts.append(sample_first[0][i]) + image_input = torch.tensor(np.stack(images)).to(device) + text_tokens = tokenizer(texts).to(device) + with 
torch.no_grad(): + image_features = clf.encoder.encode_image(image_input).float() + text_features = clf.encoder.encode_text(text_tokens).float() + similarity = np.dot(text_features.cpu().numpy(), image_features.cpu().numpy().T) + + k = 10 + total_games += k + for i in range(k): + if(np.argmax(similarity[i]) == i): + total_correct += 1 + + print(total_correct/total_games) From 08913c971e2418669a86d5c223dbef925f53e969 Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Wed, 12 Apr 2023 13:55:16 -0400 Subject: [PATCH 38/44] Create kilogram.py --- src/training/kilogram.py | 111 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 src/training/kilogram.py diff --git a/src/training/kilogram.py b/src/training/kilogram.py new file mode 100644 index 000000000..0e3b15b07 --- /dev/null +++ b/src/training/kilogram.py @@ -0,0 +1,111 @@ +import argparse + +import numpy as np +import pandas as pd +import torch +from PIL import Image +from torch import nn +from torch.utils.data import Dataset, DataLoader +from tqdm import tqdm + +import open_clip +from open_clip.factory import get_tokenizer +from training.scheduler import cosine_lr +from training.train import AverageMeter +import evaluate +import random +import os +import json + +from torch.utils.data import Dataset, DataLoader, Sampler + +#Download data: +#!git lfs install +#!git clone https://huggingface.co/datasets/lil-lab/kilogram + +class ValidationDataSet(Dataset): + def __init__(self, image_path, data, tokenizer, image_processor): + ''' + Requires: + [image_path]: path to images folder + [data]: contains targets, texts, image filenames + ''' + self.image_path = image_path + self.images_n = data['images'] + self.texts = data['texts'] + self.targets = data['targets'] + self.tokenizer = tokenizer + self.image_processor = image_processor + def __len__(self): + '''__len__ returns the number of samples in the dataset. 
+
+        :returns: number of (image, annotation) pairs in dataset
+        :rtype: int
+        '''
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        '''
+        __getitem__ returns the tensor, output pair for a given index
+        :param idx: index within dataset to return
+        :type idx: int
+        :returns: image tensor, text tensor
+        :rtype: tensor, tensor
+
+        '''
+        image_file = self.images_n[idx]
+        image_path = os.path.join(self.image_path, image_file) + '.png' if not image_file.endswith('.png') else os.path.join(self.image_path, image_file)
+        texts = self.texts[idx]
+        text_tokens = torch.squeeze(self.tokenizer(texts))
+        images = self.image_processor(Image.open(image_path))
+        target = self.targets[idx]
+        return text_tokens, images
+
+class CLIPMultimodalClassifier(nn.Module):
+    def __init__(self, encoder):
+        super().__init__()
+        self.encoder = encoder
+    def forward(self, image, text):
+        # CLIP doesn't have a multimodal encoder, so we concatenate the features
+        text_features = self.encoder.encode_text(text)
+        image_features = self.encoder.encode_image(image)
+
+        return torch.dot(text_features, image_features)
+
+def main(args):
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    with open("kilogram/development/texts/controlled/whole+black.json") as f:
+        file_contents = json.load(f)
+    model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms(
+        "ViT-B-32-quickgelu",
+        'openai',
+        precision="amp",
+        device=device,
+    )
+    clf_cls = CLIPMultimodalClassifier
+    clf = clf_cls(model).to(device)
+
+    transforms = preprocess_val
+    tokenizer = get_tokenizer("ViT-B-32-quickgelu")
+
+    dsval = ValidationDataSet("kilogram/development/images/black/", file_contents, tokenizer, transforms)
+    dlval = DataLoader(dsval, batch_size=10, shuffle=False, drop_last=True)
+
+    total_games = 0
+    total_correct = 0
+    for text_tokens, image_input in dlval:
+        text_tokens = torch.tensor(text_tokens).to(device)
+        image_input = torch.tensor(image_input).to(device)
+        clf.eval()
+        with torch.no_grad():
+            image_features = clf.encoder.encode_image(image_input).float()
+            text_features = clf.encoder.encode_text(text_tokens).float()
+        similarity = np.dot(text_features.cpu().numpy(), image_features.cpu().numpy().T)
+        total_games += 2
+        if(np.argmax(similarity, axis = 0)[0] == 0):
+            total_correct += 1
+        if(np.argmax(similarity, axis = 1)[0] == 0):
+            total_correct += 1
+    print(total_correct/total_games)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])

From 34d628cb0a6627122349d852e139ed2734ee9a54 Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Wed, 12 Apr 2023 14:34:12 -0400
Subject: [PATCH 39/44] Update kilogram.py

---
 src/training/kilogram.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/training/kilogram.py b/src/training/kilogram.py
index 0e3b15b07..921e1ea80 100644
--- a/src/training/kilogram.py
+++ b/src/training/kilogram.py
@@ -108,4 +108,4 @@ def main(args):
     print(total_correct/total_games)
 
 if __name__ == "__main__":
-    main(sys.argv[1:])
+    main(sys.argv[1:])

From 767d5016df2ad659c15a463260077a475c15c18a Mon Sep 17 00:00:00 2001
From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com>
Date: Wed, 12 Apr 2023 14:41:21 -0400
Subject: [PATCH 40/44] Kilogram Whole+Black

---
 src/training/kilogram.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/training/kilogram.py b/src/training/kilogram.py
index 921e1ea80..6634fa3ed 100644
--- a/src/training/kilogram.py
+++ b/src/training/kilogram.py
@@ -20,8 +20,8 @@ from torch.utils.data import Dataset, DataLoader, Sampler #Download data: -#!git lfs install -#!git clone https://huggingface.co/datasets/lil-lab/kilogram +#git lfs install +#git clone https://huggingface.co/datasets/lil-lab/kilogram class ValidationDataSet(Dataset): def __init__(self, image_path, data, tokenizer, image_processor): From 8e83ad4ea1a7f1c34029f51eebe5f6179cf1493a Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Wed, 12 Apr 2023 14:43:59 -0400 Subject: [PATCH 41/44] moved. --- src/open_clip/kilogram.py | 84 --------------------------------------- 1 file changed, 84 deletions(-) delete mode 100644 src/open_clip/kilogram.py diff --git a/src/open_clip/kilogram.py b/src/open_clip/kilogram.py deleted file mode 100644 index 8ba55b58d..000000000 --- a/src/open_clip/kilogram.py +++ /dev/null @@ -1,84 +0,0 @@ -import argparse - -import numpy as np -import pandas as pd -import torch -from PIL import Image -from torch import nn -from torch.utils.data import Dataset, DataLoader -from tqdm import tqdm - -import open_clip -from open_clip.factory import get_tokenizer -from training.scheduler import cosine_lr -from training.train import AverageMeter -import evaluate -import random -import json - - -device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -with open("kilogram/development/texts/controlled/whole+black.json") as f: - file_contents = json.load(f) -print(file_contents.keys()) - -def sample(texts, images, r, k = 10): - sampled_images = [images[r]] - sampled_text = [texts[r]] - - while(len(sampled_images) < k): - r = random.randint(0, len(images)-1) - if(images[r] in sampled_images or texts[r] in sampled_text): - continue - sampled_images.append(images[r]) - sampled_text.append(texts[r]) - return (sampled_text, sampled_images) - -class CLIPMultimodalClassifier(nn.Module): - def __init__(self, encoder): - super().__init__() - - self.encoder = encoder - def forward(self, image, text): - # CLIP doesn't have a multimodal encoder, so we concatenate the features - text_features = self.encoder.encode_text(text) - image_features = self.encoder.encode_image(image) - - return torch.dot(text_features, image_features) -model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms( - "ViT-B-32-quickgelu", - 'laion400m_e32', - precision="amp", - device=device, -) -clf_cls = CLIPMultimodalClassifier -clf = clf_cls(model).to(device) -transforms = preprocess_val -tokenizer = get_tokenizer("ViT-B-32-quickgelu") - -total_correct = 0 -total_games = 0 -for i in range(len(file_contents['texts'])): - sample_first = sample(file_contents['texts'], file_contents['images'], i) - images = [] - texts = [] - - for i in range(len(sample_first[0])): - image = Image.open("kilogram/development/images/black/" + sample_first[1][i] + ".png") - - images.append(transforms(image)) - texts.append(sample_first[0][i]) - image_input = torch.tensor(np.stack(images)).to(device) - text_tokens = tokenizer(texts).to(device) - with torch.no_grad(): - image_features = clf.encoder.encode_image(image_input).float() - text_features = clf.encoder.encode_text(text_tokens).float() - similarity = np.dot(text_features.cpu().numpy(), image_features.cpu().numpy().T) - - k = 10 - total_games += k - for i in range(k): - if(np.argmax(similarity[i]) == i): - total_correct += 1 - - print(total_correct/total_games) From 1032d6620d9775f56ed17408b8eb9ac017ca0f31 Mon Sep 17 00:00:00 2001 From: Andrew Wang 
<69259721+andrewwangva@users.noreply.github.com> Date: Mon, 24 Apr 2023 19:37:02 -0400 Subject: [PATCH 42/44] Update kilogram.py --- src/training/kilogram.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/training/kilogram.py b/src/training/kilogram.py index 6634fa3ed..0c6798f1d 100644 --- a/src/training/kilogram.py +++ b/src/training/kilogram.py @@ -70,22 +70,24 @@ def forward(self, image, text): image_features = self.encoder.encode_image(image) return torch.dot(text_features, image_features) - -def main(args): - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - with open("kilogram/development/texts/controlled/whole+black.json") as f: - file_contents = json.load(f) +def get_model(device): model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms( - "ViT-B-32-quickgelu", + "ViT-B-32", 'openai', precision="amp", device=device, ) clf_cls = CLIPMultimodalClassifier clf = clf_cls(model).to(device) - transforms = preprocess_val - tokenizer = get_tokenizer("ViT-B-32-quickgelu") + tokenizer = get_tokenizer("ViT-B-32") + return clf, transforms, tokenizer +def main(args): + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + with open("kilogram/development/texts/controlled/whole+black.json") as f: + file_contents = json.load(f) + + clf, transforms, tokenizer = get_model(device) dsval = ValidationDataSet("kilogram/development/images/black/", file_contents, tokenizer, transforms) dlval = DataLoader(dsval, batch_size=10, shuffle=False, drop_last=True) From 15625725cb5d6e50e1a30b8de1b62304eb8643db Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 24 Apr 2023 19:57:05 -0400 Subject: [PATCH 43/44] Update kilogram.py --- src/training/kilogram.py | 94 ++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 17 deletions(-) diff --git a/src/training/kilogram.py b/src/training/kilogram.py index 0c6798f1d..46f9b51e2 100644 --- a/src/training/kilogram.py +++ b/src/training/kilogram.py @@ -70,6 +70,7 @@ def forward(self, image, text): image_features = self.encoder.encode_image(image) return torch.dot(text_features, image_features) + def get_model(device): model, preprocess_train, preprocess_val = open_clip.factory.create_model_and_transforms( "ViT-B-32", @@ -82,32 +83,91 @@ def get_model(device): transforms = preprocess_val tokenizer = get_tokenizer("ViT-B-32") return clf, transforms, tokenizer + +def compute_norm(features): + return features / features.norm(dim=-1, keepdim=True).float() + +def evaluate(dlval, clf, device): + clf.eval() + num_correct = 0 + with torch.no_grad(): + for i, data in enumerate(tqdm(dlval)): + text_tokens, image_input = data[0], data[1] + text_tokens = torch.tensor(text_tokens).to(device) + image_input = torch.tensor(image_input).to(device) + + I_e = clf.encoder.encode_image(image_input).float() + T_e = clf.encoder.encode_text(text_tokens).float() + + image_features = compute_norm(I_e) + text_features = compute_norm(T_e) + + similarity = (image_features @ text_features.t()).cpu().numpy() + + if(np.argmax(similarity, axis = -1)[0] == 0): + num_correct += 1 + n_contexts = len(dlval) + acc_i = num_correct / n_contexts + return acc_i + def main(args): device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + + clf, transforms, tokenizer = get_model(device) + + #Development Whole + Black with 
open("kilogram/development/texts/controlled/whole+black.json") as f: file_contents = json.load(f) + dsval = ValidationDataSet("kilogram/development/images/black/", file_contents, tokenizer, transforms) + dlval = DataLoader(dsval, batch_size=10, shuffle=False, drop_last=True) + + accuracy = evaluate(dlval, clf, device) + print("Development Whole + Black:", accuracy) - clf, transforms, tokenizer = get_model(device) + #Heldout Whole + Black + with open("kilogram/heldout/texts/controlled/whole+black.json") as f: + file_contents = json.load(f) + dsval = ValidationDataSet("kilogram/heldout/images/black/", file_contents, tokenizer, transforms) + dlval = DataLoader(dsval, batch_size=10, shuffle=False, drop_last=True) + + accuracy = evaluate(dlval, clf, device) + print("Heldout Whole + Black:", accuracy) + + #Development Parts + Black + with open("kilogram/development/texts/controlled/part+black.json") as f: + file_contents = json.load(f) + dsval = ValidationDataSet("kilogram/development/images/black/", file_contents, tokenizer, transforms) + dlval = DataLoader(dsval, batch_size=10, shuffle=False, drop_last=True) + + accuracy = evaluate(dlval, clf, device) + print("Development Parts + Black:", accuracy) + + #Heldout Parts + Black + with open("kilogram/heldout/texts/controlled/part+black.json") as f: + file_contents = json.load(f) + dsval = ValidationDataSet("kilogram/heldout/images/black/", file_contents, tokenizer, transforms) + dlval = DataLoader(dsval, batch_size=10, shuffle=False, drop_last=True) + accuracy = evaluate(dlval, clf, device) + print("Heldout Parts + Black:", accuracy) + + #Development Whole + Color + with open("kilogram/development/texts/controlled/whole+color.json") as f: + file_contents = json.load(f) dsval = ValidationDataSet("kilogram/development/images/black/", file_contents, tokenizer, transforms) dlval = DataLoader(dsval, batch_size=10, shuffle=False, drop_last=True) - total_games = 0 - total_correct = 0 - for text_tokens, image_input in dlval: - text_tokens = torch.tensor(text_tokens).to(device) - image_input = torch.tensor(image_input).to(device) - clf.eval() - with torch.no_grad(): - image_features = clf.encoder.encode_image(image_input).float() - text_features = clf.encoder.encode_text(text_tokens).float() - similarity = np.dot(text_features.cpu().numpy(), image_features.cpu().numpy().T) - total_games += 2 - if(np.argmax(similarity, axis = 0)[0] == 0): - total_correct += 1 - if(np.argmax(similarity, axis = 1)[0] == 0): - total_correct += 1 - print(total_correct/total_games) + accuracy = evaluate(dlval, clf, device) + print("Development Whole + Color:", accuracy) + + #Heldout Whole + Color + with open("kilogram/heldout/texts/controlled/whole+color.json") as f: + file_contents = json.load(f) + dsval = ValidationDataSet("kilogram/heldout/images/black/", file_contents, tokenizer, transforms) + dlval = DataLoader(dsval, batch_size=10, shuffle=False, drop_last=True) + + accuracy = evaluate(dlval, clf, device) + print("Heldout Whole + Color:", accuracy) if __name__ == "__main__": main(sys.argv[1:]) From 149090667b9e38dabbb0c18868c83829afbfb74b Mon Sep 17 00:00:00 2001 From: Andrew Wang <69259721+andrewwangva@users.noreply.github.com> Date: Mon, 24 Apr 2023 20:11:28 -0400 Subject: [PATCH 44/44] Update kilogram.py --- src/training/kilogram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/kilogram.py b/src/training/kilogram.py index 46f9b51e2..85973e66d 100644 --- a/src/training/kilogram.py +++ b/src/training/kilogram.py @@ -105,7 +105,7 @@ def 
evaluate(dlval, clf, device):
             similarity = (image_features @ text_features.t()).cpu().numpy()
 
             if(np.argmax(similarity, axis = -1)[0] == 0):
-                num_correct += 1
+                num_correct += 1
     n_contexts = len(dlval)
     acc_i = num_correct / n_contexts
     return acc_i
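For reference, the accuracy reported by evaluate above is 10-way retrieval within each KILOGRAM context: the DataLoader yields batches of 10 (text, image) pairs with the ground-truth pairing in position 0, both feature sets are L2-normalized, and a context counts as correct when the true pair is the best match. A standalone sketch of the same check on precomputed features (the [10, d] shapes are assumptions, not repo code):

import torch

def context_accuracy(image_features, text_features):
    # image_features, text_features: [10, d] tensors for one 10-way context,
    # with row 0 holding the ground-truth (text, image) pairing.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    similarity = image_features @ text_features.t()  # [10, 10] cosine scores
    # Correct iff the true text (column 0) best matches the true image (row 0).
    return int(similarity[0].argmax().item() == 0)

Averaging context_accuracy over every batch in dlval reproduces the acc_i that evaluate returns.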