Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions config.default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,27 @@ full_normalization_lora_rank = 3
# of the components, then clamps the magnitudes of all components to that quantile.
winsorization_quantile = 1.0

# Whether to use a multidirectional Self-Organizing Map (SOM).
# Requires the 'minisom' package to be installed.
multidirectional_som = false

# Number of SOM neurons in the x-axis.
som_x = 4

# Number of SOM neurons in the y-axis.
som_y = 4

# Number of SOM training iterations.
som_iterations = 10000

# SOM learning rate.
som_lr = 0.01

# SOM neighborhood radius.
som_sigma = 0.5

# Number of top neurons to use for multidirectional SOM.
som_k = 4

# Number of abliteration trials to run during optimization.
n_trials = 200

Expand Down
29 changes: 29 additions & 0 deletions src/heretic/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,35 @@ class Settings(BaseSettings):
),
)

multidirectional_som: bool = Field(
default=False,
description="Use multidirectional Self-Organising Maps. Requires 'minisom' package to be installed.",
)

som_x: int = Field(
default=4, description="Number of SOM neurons in the x-axis."
)

som_y: int = Field(
default=4, description="Number of SOM neurons in the y-axis."
)
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to be able to control the X/Y dimensions separately? On average, we should expect the flattened topology to approximate a circle over many runs, so it seems one setting for both dimensions should be enough.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This (x, y) setting comes from the original project, so I kept it. https://github.com/pralab/som-refusal-directions#3-run-self-organizing-map


som_iterations: int = Field(
default=10000, description="Number of SOM training iterations."
)

som_lr: float = Field(
default=0.01, description="SOM learning rate."
)

som_sigma: float = Field(
default=0.5, description="SOM neighborhood radius."
)

som_k: int = Field(
default=4, description="Number of top neurons to use for multidirectional SOM."
)

n_trials: int = Field(
default=200,
description="Number of abliteration trials to run during optimization.",
Expand Down
89 changes: 72 additions & 17 deletions src/heretic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,13 @@ def run():
)
return

if settings.multidirectional_som:
try:
from minisom import MiniSom
except ModuleNotFoundError as _:
print("Self-Organizing Map is selected, but 'minisom' module not installed.\nPlease install it with 'pip install minisom'.")
return

# Adapted from https://github.com/huggingface/accelerate/blob/main/src/accelerate/commands/env.py
if torch.cuda.is_available():
count = torch.cuda.device_count()
Expand Down Expand Up @@ -418,22 +425,70 @@ def run():
print("* Obtaining residuals for bad prompts...")
bad_residuals = model.get_residuals_batched(bad_prompts)

good_means = good_residuals.mean(dim=0)
bad_means = bad_residuals.mean(dim=0)
good_means = good_residuals.mean(dim=0) # N_layers, hidden_dim

if settings.multidirectional_som:

refusal_directions = F.normalize(bad_means - good_means, p=2, dim=1)
from .som import SOMCalculator
bad_means = []

# bad_residuals shape: (num_bad_prompts, num_layers, hidden_dim)
num_layers = bad_residuals.shape[1]

print(f" - Retrieving multi-directions through self-organizing map...")

for layer_idx in range(num_layers):
print(f" - Processing Layer {layer_idx + 1}/{num_layers}...")
# Extract residuals for the current layer
# Shape: (num_bad_prompts, hidden_dim)
layer_residuals = bad_residuals[:, layer_idx, :].cpu().float().numpy()

# Initialize and fit the SOM for this layer's residuals
som_calc = SOMCalculator(
som_x=settings.som_x,
som_y=settings.som_y,
iterations=settings.som_iterations,
lr=settings.som_lr,
sigma=settings.som_sigma
)
som_calc.fit(layer_residuals)

# Get the weights of the top-k neurons as our "bad means"
top_k_weights = som_calc.get_top_k_neuron_weights(k=settings.som_k)

# The SOM de-duplicates neurons that are too close to each other, so fewer
# than som_k weights may be returned. As a temporary workaround, repeat the
# found weights until there are som_k rows again.
t = torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device)
num_found_neurons = t.shape[0]

if num_found_neurons < settings.som_k:
t = t.repeat(settings.som_k // num_found_neurons, 1)
if t.shape[0] < settings.som_k:
t = torch.cat([t, t[0:settings.som_k - t.shape[0]]], dim=0)

# Convert back to tensor and add to our list
# Shape: (k, hidden_dim)
bad_means.append(t)

bad_means = torch.stack(bad_means, dim=0).permute(1, 0, 2) # N_directions, N_layers, hidden_dim
else:
bad_means = bad_residuals.mean(dim=0).unsqueeze(0) # (1, N_layers, hidden_dim)

refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means]

if settings.orthogonalize_direction:
# Implements https://huggingface.co/blog/grimjim/projected-abliteration
# Adjust the refusal directions so that only the component that is
# orthogonal to the good direction is subtracted during abliteration.
good_directions = F.normalize(good_means, p=2, dim=1)
projection_vector = torch.sum(refusal_directions * good_directions, dim=1)
refusal_directions = (
refusal_directions - projection_vector.unsqueeze(1) * good_directions
)
refusal_directions = F.normalize(refusal_directions, p=2, dim=1)

projection_vectors = [torch.sum(ref * good_directions, dim=1) for ref in refusal_directions]
refusal_directions = [(
refusal_direction - projection_vector.unsqueeze(1) * good_directions
) for (refusal_direction, projection_vector) in zip(refusal_directions, projection_vectors)]
refusal_directions = [F.normalize(refusal_direction, p=2, dim=1) for refusal_direction in refusal_directions]

refusal_directions = torch.stack(refusal_directions, dim=0)
refusal_directions = refusal_directions.permute(1, 0, 2) # layers, directions, hidden_dim
analyzer = Analyzer(settings, model, good_residuals, bad_residuals)

if settings.print_residual_geometry:
Expand Down Expand Up @@ -487,11 +542,11 @@ def objective(trial: Trial) -> tuple[float, float]:
# The parameter ranges are based on experiments with various models
# and much wider ranges. They are not set in stone and might have to be
# adjusted for future models.
max_weight = trial.suggest_float(
f"{component}.max_weight",
max_weights = [trial.suggest_float(
f"{component}.max_weight.{i}",
0.8,
1.5,
)
) for i in range(refusal_directions.shape[1])]
max_weight_position = trial.suggest_float(
f"{component}.max_weight_position",
0.6 * last_layer_index,
Expand All @@ -500,21 +555,21 @@ def objective(trial: Trial) -> tuple[float, float]:
# For sampling purposes, min_weight is expressed as a fraction of max_weight,
# again because multivariate TPE doesn't support variable-range parameters.
# The value is transformed into the actual min_weight value below.
min_weight = trial.suggest_float(
f"{component}.min_weight",
min_weights = [trial.suggest_float(
f"{component}.min_weight.{i}",
0.0,
1.0,
)
) for i in range(refusal_directions.shape[1])]
min_weight_distance = trial.suggest_float(
f"{component}.min_weight_distance",
1.0,
0.6 * last_layer_index,
)

parameters[component] = AbliterationParameters(
max_weight=max_weight,
max_weights=max_weights,
max_weight_position=max_weight_position,
min_weight=(min_weight * max_weight),
min_weights=[(min_weight * max_weight) for (min_weight, max_weight) in zip(min_weights, max_weights)],
min_weight_distance=min_weight_distance,
)

Expand Down
Loading