# bumblebee.py - 105 lines (86 loc), 3.32 KB
import torch
from transformer_lens import HookedTransformer
import random
import numpy as np
import json
import time
import gc
import argparse
from datetime import datetime
from difflib import SequenceMatcher
# CLI setup
# Two options: which pretrained model to load, and whether to switch the
# prompts to their FRAGMENT variants.
parser = argparse.ArgumentParser(
    description="Bumblebee - Transformer fuzzing toolkit",
)
parser.add_argument(
    "--model",
    type=str,
    default="gpt2-small",
    help="Name of the model to load",
)
parser.add_argument(
    "--fragment",
    action="store_true",
    help="Use Frontier Reasoning Attack on Generative Models via Entity Normalization Tampering (FRAGMENT)",
)
args = parser.parse_args()
# Set device
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load model
# The model name comes straight from the --model command-line option.
model = HookedTransformer.from_pretrained(args.model, device=device)
# Prompts
# (clean, corrupted) prompt pairs keyed by the --fragment flag. Within a pair
# only the profession differs. The --fragment pair replaces "patient" with a
# word containing non-ASCII look-alike letters; keep those literals
# byte-for-byte.
_PROMPT_PAIRS = {
    True: (
        "The doctor treated the Ρhοenιx with a",
        "The butcher treated the Ρhοenιx with a",
    ),
    False: (
        "The doctor treated the patient with a",
        "The butcher treated the patient with a",
    ),
}
clean_prompt, corrupted_prompt = _PROMPT_PAIRS[bool(args.fragment)]

# Tokenize both prompts with a BOS token prepended.
tokens_clean, tokens_corrupt = (
    model.to_tokens(prompt_text, prepend_bos=True)
    for prompt_text in (clean_prompt, corrupted_prompt)
)
# Output setup
# The results file name embeds the model name (with "/" replaced so it cannot
# introduce a directory separator) and a local-time timestamp of the run.
run_started = datetime.now()
timestamp = run_started.strftime("%Y%m%d_%H%M%S")
safe_model_name = args.model.replace("/", "_")
output_path = f"buzzlens_results_{safe_model_name}_{timestamp}.json"
# Accumulates one record per fuzzing run; serialized to JSON at the end.
results = []
# Similarity function
def similarity(a, b):
    """Return the difflib similarity ratio between two sequences.

    Args:
        a: First sequence (the script passes decoded prediction strings).
        b: Second sequence.

    Returns:
        A float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()
# Fuzzing functions
def gaussian_fuzz(activation, hook, noise_strength=0.05):
    """Add scaled standard-normal noise to an activation tensor.

    Args:
        activation: Tensor to perturb. It is not modified in place.
        hook: Hook-point object passed by the hooking framework; unused here.
        noise_strength: Multiplier applied to the standard-normal noise before
            it is added. Defaults to 0.05, the value that used to be
            hard-coded, so existing two-argument calls behave as before. Bind
            another value with ``functools.partial`` when registering the hook.

    Returns:
        A new tensor, ``activation + noise_strength * N(0, 1)``, with the same
        shape as ``activation``.
    """
    noise = noise_strength * torch.randn_like(activation)
    return activation + noise
def bitflip_fuzz(activation, hook, flip_prob=0.01):
    """Negate a random subset of an activation tensor's elements.

    Despite the name, no individual bits are flipped: each selected element
    has its sign inverted and its magnitude left unchanged.

    Args:
        activation: Tensor to perturb. It is not modified in place.
        hook: Hook-point object passed by the hooking framework; unused here.
        flip_prob: Independent per-element probability of negation. Defaults
            to 0.01, the value that used to be hard-coded, so existing
            two-argument calls behave as before.

    Returns:
        A new tensor where each element is ``-activation`` with probability
        ``flip_prob`` and ``activation`` otherwise.
    """
    # Uniform samples lie in [0, 1), so flip_prob=0 selects nothing and
    # flip_prob=1 selects every element.
    mask = torch.rand_like(activation) < flip_prob
    return torch.where(mask, -activation, activation)
def dropout_fuzz(activation, hook, dropout_prob=0.1):
    """Apply dropout to an activation tensor.

    ``torch.nn.functional.dropout`` is called with its default
    ``training=True``, so dropout is active here regardless of the model's
    train/eval mode; per the PyTorch documentation the surviving elements are
    rescaled by ``1 / (1 - dropout_prob)``.

    Args:
        activation: Tensor to perturb.
        hook: Hook-point object passed by the hooking framework; unused here.
        dropout_prob: Probability of zeroing each element. Defaults to 0.1,
            the value that used to be hard-coded, so existing two-argument
            calls behave as before.

    Returns:
        The tensor produced by dropout, with the same shape as ``activation``.
    """
    return torch.nn.functional.dropout(activation, p=dropout_prob)
# Registry of fuzzing hooks; each key is the method name written to the
# results, each value the hook function that perturbs an activation.
fuzz_methods = dict(
    gaussian=gaussian_fuzz,
    bitflip=bitflip_fuzz,
    dropout=dropout_fuzz,
)
# Hook-point names within a block; the fuzzing loop prefixes each one with
# "blocks.<layer>." to form the full hook name.
hook_targets = [
    "hook_resid_pre",
    "hook_mlp_out",
    "hook_attn_out",
]
# Run fuzzing
# Sweep every (layer, fuzzing method, hook point) combination. For each one,
# install a single fuzzing hook, run the clean and corrupted prompts through
# the model, and record how similar the two decoded predictions are.
for layer in range(model.cfg.n_layers):
    for method_name, fuzz_fn in fuzz_methods.items():
        for hook_target in hook_targets:
            hook_name = f"blocks.{layer}.{hook_target}"
            # Clear leftovers first so only this iteration's hook is active.
            model.reset_hooks()
            model.add_hook(hook_name, fuzz_fn)

            # Run inference
            try:
                # Nothing here is ever back-propagated, so disable autograd:
                # otherwise each forward pass records a graph and keeps
                # intermediate activations alive, inflating memory use.
                with torch.no_grad():
                    logits_clean = model(tokens_clean)
                    # argmax over the last dimension, then the first entry
                    # along dim 0 -- assumes logits are
                    # (batch, position, vocab); TODO confirm.
                    pred_clean = model.to_string(logits_clean.argmax(dim=-1)[0])
                    logits_corrupt = model(tokens_corrupt)
                    pred_corrupt = model.to_string(logits_corrupt.argmax(dim=-1)[0])
                sim_score = similarity(pred_clean, pred_corrupt)
            except RuntimeError as e:
                # NOTE(review): every RuntimeError is recorded with the "OOM:"
                # prefix, not only out-of-memory errors. The string is left
                # unchanged in case downstream consumers match on it.
                pred_clean = pred_corrupt = f"OOM: {str(e)}"
                sim_score = 0.0

            results.append({
                "layer": layer,
                "hook": hook_target,
                "method": method_name,
                "clean_prediction": pred_clean,
                "corrupted_prediction": pred_corrupt,
                "similarity_score": sim_score,
            })

            # Remove the hook and release memory before the next combination.
            model.reset_hooks()
            torch.cuda.empty_cache()
            gc.collect()
            time.sleep(0.2)
# Save
# Write all collected records as indented JSON, then report the file path.
with open(output_path, "w") as out_file:
    out_file.write(json.dumps(results, indent=2))
print(f"Results saved to {output_path}")