adversarial_examples.py
import json
import argparse
import nltk
import torch
import wandb
from nltk.corpus import movie_reviews, subjectivity, stopwords
from sklearn.metrics import accuracy_score, f1_score
from baseline import BaselineExperiment
from experiment import Experiment
from models import *
from settings import *
nameToModel = {
    "BiGRU": BiGRU,
    "BiGRUAttention": BiGRU,  # note: both GRU variants map to the same class here
    "TextCNN": TextCNN
}
# Subjective sentences generated by ChatGPT using only tokens from the objective-only lexicon
subj_sentences = [
    "I was shocked to discover that the financial webcams we had been using were actually part of a scheme known as 'frodes', and I couldn't believe that Daddy's client would scoff at the idea of being caught up in such a bale of trouble.",
    "I felt betrayed and stunned, but I knew I had to move on and find a new situation-based opportunity, even if it meant leaving behind the familiar Composers' Castle and the territorial Marjorie and Margaret"
]
obj_sentences = [
    "The widely reserved, self-determination and simplicity of the 12-step program have proven to be an effective life-affirming method for those seeking to overcome addiction and achieve reconciliation with themselves and others.",
    "The artist-agent's creative approach to marketing and promotion has helped to boost the success and stylishness of numerous music and entertainment projects."
]
sentences = obj_sentences + subj_sentences
def baseline(task):
    exp_subjectivity = BaselineExperiment(task=task)
    classifier, vectorizer = exp_subjectivity.run()
    vectors = vectorizer.transform(sentences)
    preds = classifier.predict(vectors)
    print(preds)
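    # Minimal scoring sketch (an assumption, not in the original script): compare
    # against the expected labels, assuming the baseline classifier emits the same
    # 0/1 convention (0 = objective, 1 = subjective) as y_gt in the main block below.
    print(accuracy_score([0, 0, 1, 1], preds))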
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "model", choices=["Baseline", "BiGRU", "BiGRUAttention", "TextCNN"],
        help="Specify the model type, e.g. 'BiGRU'.")
    parser.add_argument("task", choices=["subjectivity"],
                        help="Specify which task to perform.")
    parser.add_argument("--fold_index", type=int, choices=[0, 1, 2, 3, 4],
                        help="Specify the fold index to load the correct train/test split.")
    parser.add_argument("-pe", "--pretrained_embeddings", action="store_true",
                        help="Use pretrained embeddings.")
    args = parser.parse_args()

    sjv_classifier = None
    sjv_vectorizer = None

    if args.model == "Baseline":
        baseline(args.task)
        exit(0)
    # download the trained checkpoint from the Weights & Biases artifact registry
    api = wandb.Api()
    pe_string = "_pe" if args.pretrained_embeddings else ""
    name = f"{args.task}_{args.model}{pe_string}_fold_{args.fold_index:02d}"
    artifact_name = f'{WANDB_ENTITY}/{WANDB_PROJECT}/{name}:latest'
    print(artifact_name)
    checkpoint_file = f"{name}.pth"
    print(checkpoint_file)
    artifact = api.artifact(artifact_name)
    artifact.download(root=WEIGHTS_SAVE_PATH)
    print(artifact.metadata)

    # rebuild the model from the configuration stored with the artifact
    model_config = artifact.metadata
    if model_config.get("vocab_size"):
        model = nameToModel[args.model](model_config["vocab_size"], model_config)
    else:
        raise Exception("Config does not specify vocab_size.")
    checkpoint = torch.load(f"{WEIGHTS_SAVE_PATH}/{checkpoint_file}", map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    # rebuild the same vocabulary (lang) the model was trained on
    exp = Experiment(args.task, sjv_classifier, sjv_vectorizer)
    exp.model_config = model_config
    exp.prepare_data()
    exp.create_folds()
    exp.create_dataloaders(args.fold_index)
    # tokenize and lowercase the probe sentences
    tokenized = [nltk.WordPunctTokenizer().tokenize(sent) for sent in sentences]
    tokenized = [[t.lower() for t in sent] for sent in tokenized]
    print(tokenized)

    # map tokens to vocabulary ids; out-of-vocabulary tokens fall back to <unk>
    # (no padding is needed, since sentences are fed to the model one at a time)
    ids = [[exp.lang.word2id.get(t, exp.lang.word2id['<unk>']) for t in sent] for sent in tokenized]
    ids = [torch.tensor(sent) for sent in ids]
    # ground truth: the two objective sentences (0) come first, then the two subjective ones (1)
    y_gt = [0, 0, 1, 1]
    print(y_gt)
    # predict, one sentence per batch
    y_pred = []
    model.eval()
    with torch.no_grad():
        for sent in ids:
            # take the sequence length before adding the batch dimension,
            # otherwise len(sent) would be the batch size (1)
            text_len = torch.tensor(len(sent)).unsqueeze(0).to(DEVICE)
            sent = sent.unsqueeze(0).to(DEVICE)
            out = model({"document": sent, "text_len": text_len})
            if args.model == "BiGRUAttention":
                # the attention model also returns attention weights; keep the logits only
                out = out[0]
            prediction = torch.sigmoid(out).round().int()
            y_pred.append(prediction.item())
    print(y_pred)
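    # Minimal scoring sketch (an assumption, not in the original script): the
    # accuracy_score/f1_score imports above suggest the adversarial probes were
    # meant to be scored against y_gt rather than only printed.
    print(f"accuracy: {accuracy_score(y_gt, y_pred):.2f}")
    print(f"f1: {f1_score(y_gt, y_pred):.2f}")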