From 6ebabc79431fc218edd6adcdb9d06d0b4658de58 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 21:12:32 +0300 Subject: [PATCH 01/21] obtain multiple resid directions with SOMs TODO: make main logic work with lists --- src/heretic/config.py | 29 +++++++++++++++ src/heretic/main.py | 55 ++++++++++++++++++++++++---- src/heretic/som.py | 84 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 7 deletions(-) create mode 100644 src/heretic/som.py diff --git a/src/heretic/config.py b/src/heretic/config.py index 8ed3f80c..750a7814 100644 --- a/src/heretic/config.py +++ b/src/heretic/config.py @@ -215,6 +215,35 @@ class Settings(BaseSettings): ), ) + multidirectional_som: bool = Field( + default=False, + description="Use multidirectional Self-Organising Maps. Requires 'minisom' package to be installed.", + ) + + som_x: int = Field( + default=4, description="Number of SOM neurons in the x-axis." + ) + + som_y: int = Field( + default=4, description="Number of SOM neurons in the y-axis." + ) + + som_iterations: int = Field( + default=10000, description="Number of SOM training iterations." + ) + + som_lr: float = Field( + default=0.01, description="SOM learning rate." + ) + + som_sigma: float = Field( + default=0.5, description="SOM neighborhood radius." + ) + + som_k: int = Field( + default=4, description="Number of top neurons to use for multidirectional SOM." + ) + n_trials: int = Field( default=200, description="Number of abliteration trials to run during optimization.", diff --git a/src/heretic/main.py b/src/heretic/main.py index 016c3920..9a5a71f4 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -171,6 +171,13 @@ def run(): ) return + if settings.multidirectional_som: + try: + from minisom import MiniSom + except ModuleNotFoundError as _: + print("Self-Organizing Map is selected, but 'minisom' module not installed.\nPlease install it with 'pip install minisom'.") + return + # Adapted from https://github.com/huggingface/accelerate/blob/main/src/accelerate/commands/env.py if torch.cuda.is_available(): count = torch.cuda.device_count() @@ -419,20 +426,54 @@ def run(): bad_residuals = model.get_residuals_batched(bad_prompts) good_means = good_residuals.mean(dim=0) - bad_means = bad_residuals.mean(dim=0) - refusal_directions = F.normalize(bad_means - good_means, p=2, dim=1) + if settings.multidirectional_som: + + from som import SOMCalculator + bad_means = [] + + # bad_residuals shape: (num_bad_prompts, num_layers, hidden_dim) + num_layers = bad_residuals.shape[1] + + print(f" - Retrieving multi-directions through self-organizing map...") + + for layer_idx in range(num_layers): + print(f" - Processing Layer {layer_idx + 1}/{num_layers}...") + # Extract residuals for the current layer + # Shape: (num_bad_prompts, hidden_dim) + layer_residuals = bad_residuals[:, layer_idx, :].numpy() + + # Initialize and fit the SOM for this layer's residuals + som_calc = SOMCalculator( + som_x=settings.som_x, + som_y=settings.som_y, + iterations=settings.som_iterations, + lr=settings.som_lr, + sigma=settings.som_sigma + ) + som_calc.fit(layer_residuals) + + # Get the weights of the top-k neurons as our "bad means" + top_k_weights = som_calc.get_top_k_neuron_weights(k=settings.som_k) + + # Convert back to tensor and add to our list + # Shape: (k, hidden_dim) + bad_means.append(torch.tensor(top_k_weights, dtype=torch.float32)) + else: + bad_means = [bad_residuals.mean(dim=0)] + + refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means] if settings.orthogonalize_direction: # Implements https://huggingface.co/blog/grimjim/projected-abliteration # Adjust the refusal directions so that only the component that is # orthogonal to the good direction is subtracted during abliteration. good_directions = F.normalize(good_means, p=2, dim=1) - projection_vector = torch.sum(refusal_directions * good_directions, dim=1) - refusal_directions = ( - refusal_directions - projection_vector.unsqueeze(1) * good_directions - ) - refusal_directions = F.normalize(refusal_directions, p=2, dim=1) + projection_vectors = [torch.sum(ref * good_directions, dim=1) for ref in refusal_directions] + refusal_directions = [( + refusal_direction - projection_vector.unsqueeze(1) * good_directions + ) for (refusal_direction, projection_vector) in zip(refusal_directions, projection_vectors)] + refusal_directions = [F.normalize(refusal_direction, p=2, dim=1) for refusal_direction in refusal_directions] analyzer = Analyzer(settings, model, good_residuals, bad_residuals) diff --git a/src/heretic/som.py b/src/heretic/som.py new file mode 100644 index 00000000..6fa012d6 --- /dev/null +++ b/src/heretic/som.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# Copyright (C) 2025-2026 Philipp Emanuel Weidmann + contributors + +import numpy as np +from minisom import MiniSom +from collections import defaultdict + +class SOMCalculator: + """ + A simplified class to train a Self-Organizing Map (SOM) and extract neuron weights. + """ + def __init__(self, som_x, som_y, iterations, lr, sigma): + """ + Initializes the SOM calculator with training parameters. + + Args: + som_x (int): Number of neurons in the x-axis of the SOM grid. + som_y (int): Number of neurons in the y-axis of the SOM grid. + iterations (int): Number of training iterations for the SOM. + lr (float): Learning rate for the SOM. + sigma (float): Radius of the neighborhood function. + """ + self.som_x = som_x + self.som_y = som_y + self.iterations = iterations + self.lr = lr + self.sigma = sigma + self.som = None + + def fit(self, data: np.ndarray): + """ + Trains the SOM on the provided 2D data. + + Args: + data (np.ndarray): A 2D NumPy array of shape (n_samples, n_features). + """ + # Ensure data is 2D + if len(data.shape) != 2: + raise ValueError(f"Data must be a 2D array, but got shape {data.shape}") + + n_samples, n_features = data.shape + + # Initialize and train the SOM using MiniSom + self.som = MiniSom( + x_size=self.som_x, + y_size=self.som_y, + input_len=n_features, + sigma=self.sigma, + learning_rate=self.lr, + random_seed=0, # For reproducibility + topology='hexagonal' + ) + self.som.random_weights_init(data) + self.som.train_random(data, num_iteration=self.iterations) + + def get_top_k_neuron_weights(self, k: int): + """ + Gets the weights of the top-k neurons based on their frequency of being winners. + + Args: + k (int): The number of top neurons to return. + + Returns: + np.ndarray: A 2D array of shape (k, n_features) containing the weights of the top-k neurons. + """ + if self.som is None: + raise RuntimeError("SOM has not been trained yet. Call `fit()` first.") + + winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])]) + counts = defaultdict(int) + for w in winners: + counts[tuple(w)] += 1 + + # Sort neurons by their count (descending) and get the top-k + sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k] + + # Get the coordinates of the top-k neurons + top_k_coords = [coord for coord, _ in sorted_neurons] + + # Fetch the weights for these top-k neurons + # self.som.get_weights() has shape (som_x, som_y, n_features) + top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords]) + + return top_k_weights From 3ad2917acf6f406d755379a723a9681a58cb5af5 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 21:23:46 +0300 Subject: [PATCH 02/21] use multidirection in optuna optimizer --- src/heretic/main.py | 16 ++++++++-------- src/heretic/model.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index 9a5a71f4..75a57244 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -528,11 +528,11 @@ def objective(trial: Trial) -> tuple[float, float]: # The parameter ranges are based on experiments with various models # and much wider ranges. They are not set in stone and might have to be # adjusted for future models. - max_weight = trial.suggest_float( - f"{component}.max_weight", + max_weights = [trial.suggest_float( + f"{component}.max_weight.{i}", 0.8, 1.5, - ) + ) for i in range(len(refusal_directions))] max_weight_position = trial.suggest_float( f"{component}.max_weight_position", 0.6 * last_layer_index, @@ -541,11 +541,11 @@ def objective(trial: Trial) -> tuple[float, float]: # For sampling purposes, min_weight is expressed as a fraction of max_weight, # again because multivariate TPE doesn't support variable-range parameters. # The value is transformed into the actual min_weight value below. - min_weight = trial.suggest_float( - f"{component}.min_weight", + min_weights = [trial.suggest_float( + f"{component}.min_weight.{i}", 0.0, 1.0, - ) + ) for i in range(len(refusal_directions))] min_weight_distance = trial.suggest_float( f"{component}.min_weight_distance", 1.0, @@ -553,9 +553,9 @@ def objective(trial: Trial) -> tuple[float, float]: ) parameters[component] = AbliterationParameters( - max_weight=max_weight, + max_weights=max_weights, max_weight_position=max_weight_position, - min_weight=(min_weight * max_weight), + min_weights=[(min_weight * max_weight) for (min_weight, max_weight) in zip(min_weights, max_weights)], min_weight_distance=min_weight_distance, ) diff --git a/src/heretic/model.py b/src/heretic/model.py index 58300b16..0da4f807 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -46,9 +46,9 @@ def get_model_class( @dataclass class AbliterationParameters: - max_weight: float + max_weights: list[float] max_weight_position: float - min_weight: float + min_weights: list[float] min_weight_distance: float From bf5f2acdfbf8200d6109e53aa79a1bd92840f062 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 22:12:06 +0300 Subject: [PATCH 03/21] multi-directional ablation --- src/heretic/main.py | 3 + src/heretic/model.py | 192 ++++++++++++++++++++++--------------------- 2 files changed, 102 insertions(+), 93 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index 75a57244..b9153e21 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -475,6 +475,9 @@ def run(): ) for (refusal_direction, projection_vector) in zip(refusal_directions, projection_vectors)] refusal_directions = [F.normalize(refusal_direction, p=2, dim=1) for refusal_direction in refusal_directions] + refusal_directions = torch.stack(refusal_directions, dim=0) + refusal_directions = refusal_directions.permute(1, 0, -1) # layers, directions, hidden_dim + analyzer = Analyzer(settings, model, good_residuals, bad_residuals) if settings.print_residual_geometry: diff --git a/src/heretic/model.py b/src/heretic/model.py index 0da4f807..46c90ea2 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -378,146 +378,152 @@ def get_abliterable_components(self) -> list[str]: def abliterate( self, - refusal_directions: Tensor, + refusal_directions: torch.Tensor, # Shape: (layers, num_directions, hidden_dim) direction_index: float | None, parameters: dict[str, AbliterationParameters], ): - if direction_index is None: - refusal_direction = None - else: - # The index must be shifted by 1 because the first element - # of refusal_directions is the direction for the embeddings. - weight, index = math.modf(direction_index + 1) - refusal_direction = F.normalize( - refusal_directions[int(index)].lerp( - refusal_directions[int(index) + 1], + """ + Abliterates the model using multidirectional refusal vectors. + + Args: + refusal_directions: A 3D tensor where the first dimension corresponds to layers, + the second to the number of SOM-derived directions, and the third + to the hidden dimension. + direction_index: An optional float to interpolate between two specific refusal directions + (across all layers). If None, all directions are used. + parameters: A dictionary mapping component names to AbliterationParameters. + Each AbliterationParameters instance must have max_weights and min_weights + as lists of length equal to the number of directions. + """ + num_layers, num_directions, _ = refusal_directions.shape + + if direction_index is not None: + # If a specific direction is requested, interpolate across the direction dimension. + # For example, if direction_index=0.5, it will blend direction 0 and 1. + weight, index = math.modf(direction_index) + # Clamp index to be within the valid range [0, num_directions - 1] + idx1 = int(index) % num_directions + idx2 = (idx1 + 1) % num_directions + # Interpolate between the two chosen directions. The result has shape (layers, hidden_dim) + interpolated_directions = F.normalize( + refusal_directions[:, idx1].lerp( + refusal_directions[:, idx2], weight, ), p=2, - dim=0, + dim=1, ) + # We will use this single "blended" set of directions for all layers. + # Shape: (layers, hidden_dim) + layer_refusal_directions = interpolated_directions + else: + # If no specific direction is given, we use all directions for each layer. + # Shape is already (layers, num_directions, hidden_dim) + layer_refusal_directions = refusal_directions - # Note that some implementations of abliteration also orthogonalize - # the embedding matrix, but it's unclear if that has any benefits. - for layer_index in range(len(self.get_layers())): + # Now, iterate through each layer to apply the ablation. + for layer_index in range(num_layers): for component, modules in self.get_layer_modules(layer_index).items(): params = parameters[component] - # Type inference fails here for some reason. - distance = cast(float, abs(layer_index - params.max_weight_position)) + # Ensure the number of weights matches the number of directions + if len(params.max_weights) != num_directions or len(params.min_weights) != num_directions: + raise ValueError( + f"Mismatch in number of directions and weights for component '{component}'. " + f"Found {num_directions} directions but {len(params.max_weights)} max_weights." + ) - # Don't orthogonalize layers that are more than - # min_weight_distance away from max_weight_position. + # Get the layer-specific position scaling factor. + # This factor is the same for all directions in this layer. + distance = cast(float, abs(layer_index - params.max_weight_position)) if distance > params.min_weight_distance: + # If this layer is too far from the optimal position, skip it entirely. continue - # Interpolate linearly between max_weight and min_weight - # over min_weight_distance. - weight = params.max_weight + (distance / params.min_weight_distance) * ( - params.min_weight - params.max_weight + # Calculate the base weight scaling factor for this layer. + # This factor will be applied to all directions in this layer. + layer_base_weight = params.max_weights[0] + (distance / params.min_weight_distance) * ( + params.min_weights[0] - params.max_weights[0] ) + # Note: In the original single-direction logic, this `layer_base_weight` was the + # final weight. Now, it's a base factor that can be further modified per direction. - if refusal_direction is None: - # The index must be shifted by 1 because the first element - # of refusal_directions is the direction for the embeddings. - layer_refusal_direction = refusal_directions[layer_index + 1] - else: - layer_refusal_direction = refusal_direction + # Get the refusal directions for the current layer. + # Shape: (num_directions, hidden_dim) + current_layer_directions = layer_refusal_directions[layer_index] for module in modules: - # FIXME: This cast is potentially invalid, because the program logic - # does not guarantee that the module is of type Linear, and in fact - # the retrieved modules might not conform to the interface assumed - # below (though they do in practice). However, this is difficult - # to fix cleanly, because get_layer_modules is called twice on - # different model configurations, and PEFT employs different - # module types depending on the chosen quantization. module = cast(Linear, module) - - # LoRA abliteration: delta W = -lambda * v * (v^T W) - # lora_B = -lambda * v - # lora_A = v^T W - - # Use the FP32 refusal direction directly (no downcast/upcast) - # and move to the correct device. - v = layer_refusal_direction.to(module.weight.device) - - # Get W (dequantize if necessary). - # - # FIXME: This cast is valid only under the assumption that the original - # module wrapped by the LoRA adapter has a weight attribute. - # See the comment above for why this is currently not guaranteed. + # Get the weight matrix W (dequantized if necessary) in FP32. base_weight = cast(Tensor, module.base_layer.weight) quant_state = getattr(base_weight, "quant_state", None) - if quant_state is None: W = base_weight.to(torch.float32) else: - # 4-bit quantization. - # This cast is always valid. Type inference fails here because the - # bnb.functional module is not found by ty for some reason. - W = cast( - Tensor, - bnb.functional.dequantize_4bit( # ty:ignore[possibly-missing-attribute] - base_weight.data, - quant_state, - ).to(torch.float32), - ) + W = cast(Tensor, bnb.functional.dequantize_4bit(base_weight.data, quant_state)).to(torch.float32) - # Flatten weight matrix to (out_features, in_features). - W = W.view(W.shape[0], -1) + W = W.view(W.shape[0], -1) # Flatten to (out_features, in_features) if self.settings.row_normalization != RowNormalization.NONE: - # Keep a reference to the original weight matrix so we can subtract it later. W_org = W - # Get the row norms. W_row_norms = LA.vector_norm(W, dim=1, keepdim=True) - # Normalize the weight matrix along the rows. W = F.normalize(W, p=2, dim=1) - # Calculate lora_A = v^T W - # v is (d_out,), W is (d_out, d_in) - # v @ W -> (d_in,) - lora_A = (v @ W).view(1, -1) + # --- The core change is here --- + # We will calculate a delta for each direction and sum them up. + total_delta_W = torch.zeros_like(W, device=W.device) + + for i in range(num_directions): + # Get the specific refusal direction for this component. + v = current_layer_directions[i].to(module.weight.device) - # Calculate lora_B = -weight * v - # v is (d_out,) - lora_B = (-weight * v).view(-1, 1) + # Get the optimized weight for this specific direction (i). + # It's a combination of the layer's base weight and the direction's specific weight. + direction_specific_weight = layer_base_weight * (params.max_weights[i] / params.max_weights[0] if params.max_weights[0] != 0 else 1.0) + # LoRA abliteration: delta W = -lambda * v * (v^T W) + # lora_B = -lambda * v + # lora_A = v^T W + v = v # Shape: (hidden_dim,) + W_matrix = W # Shape: (hidden_dim, in_features) + + # Calculate lora_A = v^T W -> (in_features,) + lora_A = (v @ W_matrix).view(1, -1) + # Calculate lora_B = -direction_specific_weight * v -> (hidden_dim, 1) + lora_B = (-direction_specific_weight * v).view(-1, 1) + + # The delta for this direction is lora_B @ lora_A + delta_W = lora_B @ lora_A + total_delta_W += delta_W + + # Now, apply the combined delta to W based on the row normalization setting. if self.settings.row_normalization == RowNormalization.PRE: - # Make the LoRA adapter apply to the original weight matrix. - lora_B = W_row_norms * lora_B + total_delta_W = W_row_norms * total_delta_W elif self.settings.row_normalization == RowNormalization.FULL: - # Approximates https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration - W = W + lora_B @ lora_A - # Normalize the adjusted weight matrix along the rows. + W = W + total_delta_W W = F.normalize(W, p=2, dim=1) - # Restore the original row norms of the weight matrix. W = W * W_row_norms - # Subtract the original matrix to turn W into a delta. W = W - W_org - # Use a low-rank SVD to get an approximation of the matrix. + # Use low-rank SVD for the full delta r = self.peft_config.r U, S, Vh = torch.svd_lowrank(W, q=2 * r + 4, niter=6) - # Truncate it to the part we want to store in the LoRA adapter. - # Note: svd_lowrank actually returns V, so transpose it to get Vh. U = U[:, :r] S = S[:r] Vh = Vh[:, :r].T - # Transfer it into the LoRA adapter components. Split the singular values - # evenly between the two components to keep their norms balanced and avoid - # potential issues with numerical stability. sqrt_S = torch.sqrt(S) lora_B = U @ torch.diag(sqrt_S) lora_A = torch.diag(sqrt_S) @ Vh - - # Assign to adapters. The adapter name is "default", because that's - # what PEFT uses when no name is explicitly specified, as above. - # These casts are therefore valid. - weight_A = cast(Tensor, module.lora_A["default"].weight) - weight_B = cast(Tensor, module.lora_B["default"].weight) - weight_A.data = lora_A.to(weight_A.dtype) - weight_B.data = lora_B.to(weight_B.dtype) + # Assign the SVD result to the LoRA weights + weight_A = cast(Tensor, module.lora_A["default"].weight) + weight_B = cast(Tensor, module.lora_B["default"].weight) + weight_A.data = lora_A.to(weight_A.dtype) + weight_B.data = lora_B.to(weight_B.dtype) + else: # RowNormalization.NONE or others + W = W + total_delta_W + + # Assign the new weight matrix back to the base layer. + # Reshape W back to its original 2D shape. + module.base_layer.weight.data = W.view_as(module.base_layer.weight).to(module.base_layer.weight.dtype) def generate( self, From 46c320a7d112ee9f34ab7c662d04b949da6b4b12 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 22:15:21 +0300 Subject: [PATCH 04/21] add SOM params to config template --- config.default.toml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/config.default.toml b/config.default.toml index abfa0fc7..67fb31a3 100644 --- a/config.default.toml +++ b/config.default.toml @@ -85,6 +85,27 @@ full_normalization_lora_rank = 3 # of the components, then clamps the magnitudes of all components to that quantile. winsorization_quantile = 1.0 +# Use multidirectional Self-Organizing Map +multidirectional_som = false + +# Number of SOM neurons in the x-axis. +som_x = 4 + +# Number of SOM neurons in the y-axis. +som_y = 4 + +# Number of SOM training iterations. +som_iterations = 5000 + +# SOM learning rate. +som_lr = 0.01 + +# SOM neighborhood radius. +som_sigma = 0.5 + +# Number of top neurons to use for multidirectional SOM. +som_k = 4 + # Number of abliteration trials to run during optimization. n_trials = 200 From 85a6eb0376ee1af3ff45151d06bb05a771c4f68d Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 22:26:59 +0300 Subject: [PATCH 05/21] som is relative import --- src/heretic/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index b9153e21..5212e9d9 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -429,7 +429,7 @@ def run(): if settings.multidirectional_som: - from som import SOMCalculator + from .som import SOMCalculator bad_means = [] # bad_residuals shape: (num_bad_prompts, num_layers, hidden_dim) From e012bb99813ea1d243fc15b9f9046de800872714 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 22:40:24 +0300 Subject: [PATCH 06/21] tensor to cpu for numpy --- src/heretic/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index 5212e9d9..0544ee1b 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -441,7 +441,7 @@ def run(): print(f" - Processing Layer {layer_idx + 1}/{num_layers}...") # Extract residuals for the current layer # Shape: (num_bad_prompts, hidden_dim) - layer_residuals = bad_residuals[:, layer_idx, :].numpy() + layer_residuals = bad_residuals[:, layer_idx, :].cpu().float().numpy() # Initialize and fit the SOM for this layer's residuals som_calc = SOMCalculator( @@ -458,7 +458,7 @@ def run(): # Convert back to tensor and add to our list # Shape: (k, hidden_dim) - bad_means.append(torch.tensor(top_k_weights, dtype=torch.float32)) + bad_means.append(torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device)) else: bad_means = [bad_residuals.mean(dim=0)] From 7f3b941edf6d57af75df89eaf204632109342141 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 22:40:50 +0300 Subject: [PATCH 07/21] match minisom args to reference code --- src/heretic/som.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/heretic/som.py b/src/heretic/som.py index 6fa012d6..f2e83370 100644 --- a/src/heretic/som.py +++ b/src/heretic/som.py @@ -42,16 +42,17 @@ def fit(self, data: np.ndarray): # Initialize and train the SOM using MiniSom self.som = MiniSom( - x_size=self.som_x, - y_size=self.som_y, - input_len=n_features, + self.som_x, + self.som_y, + n_features, sigma=self.sigma, learning_rate=self.lr, random_seed=0, # For reproducibility + activation_distance='euclidean', topology='hexagonal' ) self.som.random_weights_init(data) - self.som.train_random(data, num_iteration=self.iterations) + self.som.train_random(data, self.iterations) def get_top_k_neuron_weights(self, k: int): """ From ce1c5891a33eee7541411e5e7360dc38ea8a5139 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 22:53:02 +0300 Subject: [PATCH 08/21] permute layers and directions --- src/heretic/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index 0544ee1b..17eb1dfa 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -425,7 +425,7 @@ def run(): print("* Obtaining residuals for bad prompts...") bad_residuals = model.get_residuals_batched(bad_prompts) - good_means = good_residuals.mean(dim=0) + good_means = good_residuals.mean(dim=0) # N_layers, hidden_dim if settings.multidirectional_som: @@ -462,6 +462,8 @@ def run(): else: bad_means = [bad_residuals.mean(dim=0)] + bad_means = torch.stack(bad_means).permute(1, 0, 2) # N_directions, N_layers, hidden_dim + refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means] if settings.orthogonalize_direction: @@ -476,8 +478,7 @@ def run(): refusal_directions = [F.normalize(refusal_direction, p=2, dim=1) for refusal_direction in refusal_directions] refusal_directions = torch.stack(refusal_directions, dim=0) - refusal_directions = refusal_directions.permute(1, 0, -1) # layers, directions, hidden_dim - + refusal_directions = refusal_directions.permute(1, 0, 2) # layers, directions, hidden_dim analyzer = Analyzer(settings, model, good_residuals, bad_residuals) if settings.print_residual_geometry: From 94b858c11297d048b1a324ef4a7e58209b6563ee Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 23:08:36 +0300 Subject: [PATCH 09/21] duplicate de-duplicated neurons --- src/heretic/main.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index 17eb1dfa..c85e9233 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -456,13 +456,19 @@ def run(): # Get the weights of the top-k neurons as our "bad means" top_k_weights = som_calc.get_top_k_neuron_weights(k=settings.som_k) + # TODO: SOM de-duplicates neurons if they are too close to each other + # The logic can be reworked for this fact later + # For now we will duplicate them manually + t = torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device) + t = t.expand(settings.som_k, t.shape[-1]) + # Convert back to tensor and add to our list # Shape: (k, hidden_dim) - bad_means.append(torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device)) + bad_means.append(t) else: bad_means = [bad_residuals.mean(dim=0)] - bad_means = torch.stack(bad_means).permute(1, 0, 2) # N_directions, N_layers, hidden_dim + bad_means = torch.stack(bad_means, dim=0).permute(1, 0, 2) # N_directions, N_layers, hidden_dim refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means] From a2a209c9892e75cb1d42beb316089a92784fc943 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 23:12:36 +0300 Subject: [PATCH 10/21] fixup suggest float count --- src/heretic/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index c85e9233..98672e08 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -542,7 +542,7 @@ def objective(trial: Trial) -> tuple[float, float]: f"{component}.max_weight.{i}", 0.8, 1.5, - ) for i in range(len(refusal_directions))] + ) for i in range(refusal_directions.shape[1])] max_weight_position = trial.suggest_float( f"{component}.max_weight_position", 0.6 * last_layer_index, @@ -555,7 +555,7 @@ def objective(trial: Trial) -> tuple[float, float]: f"{component}.min_weight.{i}", 0.0, 1.0, - ) for i in range(len(refusal_directions))] + ) for i in range(refusal_directions.shape[1])] min_weight_distance = trial.suggest_float( f"{component}.min_weight_distance", 1.0, From dad69dfc00fdd3008217205e011e364e7313034b Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 23:29:55 +0300 Subject: [PATCH 11/21] print parameter lists --- src/heretic/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/heretic/utils.py b/src/heretic/utils.py index a0d5f35f..145dbba8 100644 --- a/src/heretic/utils.py +++ b/src/heretic/utils.py @@ -260,8 +260,11 @@ def get_trial_parameters(trial: Trial) -> dict[str, str]: for component, parameters in trial.user_attrs["parameters"].items(): for name, value in parameters.items(): - params[f"{component}.{name}"] = f"{value:.2f}" - + if isinstance(value, list): + for direction, direction_value in enumerate(value): + params[f"{component}.{name}.{direction}"] = f"{direction}: {direction_value:.2f}" + else: + params[f"{component}.{name}"] = f"{value:.2f}" return params From 07655cdfc86f9f2fd778e51189adf8ef9127702a Mon Sep 17 00:00:00 2001 From: kabachuha Date: Thu, 26 Feb 2026 23:34:14 +0300 Subject: [PATCH 12/21] correspond the layers count to vanilla code --- src/heretic/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/heretic/model.py b/src/heretic/model.py index 46c90ea2..0ad975b9 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -395,7 +395,7 @@ def abliterate( Each AbliterationParameters instance must have max_weights and min_weights as lists of length equal to the number of directions. """ - num_layers, num_directions, _ = refusal_directions.shape + _, num_directions, _ = refusal_directions.shape if direction_index is not None: # If a specific direction is requested, interpolate across the direction dimension. @@ -422,7 +422,7 @@ def abliterate( layer_refusal_directions = refusal_directions # Now, iterate through each layer to apply the ablation. - for layer_index in range(num_layers): + for layer_index in range(len(self.get_layers())): for component, modules in self.get_layer_modules(layer_index).items(): params = parameters[component] From 31378cb7cf8bb5a83d9a91eaf6ca48161cfa24b9 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Fri, 27 Feb 2026 00:02:58 +0300 Subject: [PATCH 13/21] layerwise direction interpolation --- src/heretic/model.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/heretic/model.py b/src/heretic/model.py index 0ad975b9..6f2f01fd 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -400,21 +400,21 @@ def abliterate( if direction_index is not None: # If a specific direction is requested, interpolate across the direction dimension. # For example, if direction_index=0.5, it will blend direction 0 and 1. - weight, index = math.modf(direction_index) - # Clamp index to be within the valid range [0, num_directions - 1] - idx1 = int(index) % num_directions - idx2 = (idx1 + 1) % num_directions - # Interpolate between the two chosen directions. The result has shape (layers, hidden_dim) - interpolated_directions = F.normalize( - refusal_directions[:, idx1].lerp( - refusal_directions[:, idx2], + weight, index = math.modf(direction_index + 1) + # Clamp index to be within the valid range [0, num_layers - 1] + idx1 = int(index) + idx2 = (idx1 + 1) + # Interpolate between the two chosen directions. The result has shape (n_directions, hidden_dim) + interpolated_directions = torch.stack([F.normalize( + r_dir[idx1].lerp( + r_dir[idx2], weight, ), p=2, - dim=1, - ) + dim=0, + ) for r_dir in refusal_directions.permute(1, 0, 2)], dim=0) # We will use this single "blended" set of directions for all layers. - # Shape: (layers, hidden_dim) + # Shape: (n_directions, hidden_dim) layer_refusal_directions = interpolated_directions else: # If no specific direction is given, we use all directions for each layer. @@ -450,7 +450,10 @@ def abliterate( # Get the refusal directions for the current layer. # Shape: (num_directions, hidden_dim) - current_layer_directions = layer_refusal_directions[layer_index] + if direction_index is None: + current_layer_directions = layer_refusal_directions[layer_index+1] + else: + current_layer_directions = layer_refusal_directions for module in modules: module = cast(Linear, module) From 3b111e86d543c652ae20b58f9b8ae01fcb699161 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Fri, 27 Feb 2026 10:39:34 +0300 Subject: [PATCH 14/21] fixup cases with not full normalization --- src/heretic/model.py | 46 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/heretic/model.py b/src/heretic/model.py index 6f2f01fd..ebeb6b90 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -501,7 +501,24 @@ def abliterate( # Now, apply the combined delta to W based on the row normalization setting. if self.settings.row_normalization == RowNormalization.PRE: - total_delta_W = W_row_norms * total_delta_W + # Use low-rank SVD to decompose the total_delta_W into LoRA matrices. + r = self.peft_config.r + # Perform SVD on the total_delta_W matrix + U, S, Vh = torch.svd_lowrank(total_delta_W, q=min(2 * r + 4, min(total_delta_W.shape)), niter=6) + + # Truncate to rank 'r' + U = U[:, :r] + S = S[:r] + Vh = Vh[:, :r].T # Vh is (r, in_features) + + # Split the singular values between lora_B and lora_A + sqrt_S = torch.sqrt(S) + lora_B = U @ torch.diag(sqrt_S) # Shape: (out_features, r) + lora_A = torch.diag(sqrt_S) @ Vh # Shape: (r, in_features) + + # Apply PRE normalization: scale lora_B by the original row norms. + # lora_B already has shape (out_features, r), W_row_norms is (out_features, 1) + lora_B = W_row_norms * lora_B elif self.settings.row_normalization == RowNormalization.FULL: W = W + total_delta_W W = F.normalize(W, p=2, dim=1) @@ -516,17 +533,26 @@ def abliterate( sqrt_S = torch.sqrt(S) lora_B = U @ torch.diag(sqrt_S) lora_A = torch.diag(sqrt_S) @ Vh - # Assign the SVD result to the LoRA weights - weight_A = cast(Tensor, module.lora_A["default"].weight) - weight_B = cast(Tensor, module.lora_B["default"].weight) - weight_A.data = lora_A.to(weight_A.dtype) - weight_B.data = lora_B.to(weight_B.dtype) else: # RowNormalization.NONE or others - W = W + total_delta_W + # In NONE mode, the delta is simply total_delta_W. + # We decompose this delta directly using SVD. + r = self.peft_config.r + U, S, Vh = torch.svd_lowrank(total_delta_W, q=min(2 * r + 4, min(total_delta_W.shape)), niter=6) + + U = U[:, :r] + S = S[:r] + Vh = Vh[:, :r].T + + sqrt_S = torch.sqrt(S) + lora_B = U @ torch.diag(sqrt_S) + lora_A = torch.diag(sqrt_S) @ Vh - # Assign the new weight matrix back to the base layer. - # Reshape W back to its original 2D shape. - module.base_layer.weight.data = W.view_as(module.base_layer.weight).to(module.base_layer.weight.dtype) + + # Assign the SVD result to the LoRA weights + weight_A = cast(Tensor, module.lora_A["default"].weight) + weight_B = cast(Tensor, module.lora_B["default"].weight) + weight_A.data = lora_A.to(weight_A.dtype) + weight_B.data = lora_B.to(weight_B.dtype) def generate( self, From 001006dcb393b0c1a699339a53dd4866f6ddd708 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Fri, 27 Feb 2026 11:04:24 +0300 Subject: [PATCH 15/21] config template consistent with fields --- config.default.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.default.toml b/config.default.toml index 67fb31a3..d21c68b6 100644 --- a/config.default.toml +++ b/config.default.toml @@ -95,7 +95,7 @@ som_x = 4 som_y = 4 # Number of SOM training iterations. -som_iterations = 5000 +som_iterations = 10000 # SOM learning rate. som_lr = 0.01 From c37a0462bcbeb4294bb37dac8bd6c4d279d45a29 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Fri, 27 Feb 2026 11:07:21 +0300 Subject: [PATCH 16/21] style guide --- config.default.toml | 2 +- src/heretic/som.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config.default.toml b/config.default.toml index d21c68b6..d0b19662 100644 --- a/config.default.toml +++ b/config.default.toml @@ -85,7 +85,7 @@ full_normalization_lora_rank = 3 # of the components, then clamps the magnitudes of all components to that quantile. winsorization_quantile = 1.0 -# Use multidirectional Self-Organizing Map +# Use multidirectional Self-Organizing Map. multidirectional_som = false # Number of SOM neurons in the x-axis. diff --git a/src/heretic/som.py b/src/heretic/som.py index f2e83370..776c0293 100644 --- a/src/heretic/som.py +++ b/src/heretic/som.py @@ -9,7 +9,7 @@ class SOMCalculator: """ A simplified class to train a Self-Organizing Map (SOM) and extract neuron weights. """ - def __init__(self, som_x, som_y, iterations, lr, sigma): + def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float) -> None: """ Initializes the SOM calculator with training parameters. @@ -27,7 +27,7 @@ def __init__(self, som_x, som_y, iterations, lr, sigma): self.sigma = sigma self.som = None - def fit(self, data: np.ndarray): + def fit(self, data: np.ndarray) -> None: """ Trains the SOM on the provided 2D data. @@ -54,7 +54,7 @@ def fit(self, data: np.ndarray): self.som.random_weights_init(data) self.som.train_random(data, self.iterations) - def get_top_k_neuron_weights(self, k: int): + def get_top_k_neuron_weights(self, k: int) -> np.ndarray: """ Gets the weights of the top-k neurons based on their frequency of being winners. From c02869ab37a8762d98fb8e41dd2e5dcf08fd90e3 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Fri, 27 Feb 2026 11:16:35 +0300 Subject: [PATCH 17/21] use repeat instead of expand for duplication --- src/heretic/main.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index 98672e08..6fa8a90c 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -456,11 +456,15 @@ def run(): # Get the weights of the top-k neurons as our "bad means" top_k_weights = som_calc.get_top_k_neuron_weights(k=settings.som_k) - # TODO: SOM de-duplicates neurons if they are too close to each other - # The logic can be reworked for this fact later - # For now we will duplicate them manually + # SOM de-duplicates neurons if they are too close to each other + # Temporary solution is to repeat them back t = torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device) - t = t.expand(settings.som_k, t.shape[-1]) + num_found_neurons = t.shape[0] + + if num_found_neurons < settings.som_k: + t = t.repeat(settings.som_k // num_found_neurons, 1) + if t.shape[0] < settings.som_k: + t = torch.cat([t, t[0:settings.som_k - t.shape[0]]], dim=0) # Convert back to tensor and add to our list # Shape: (k, hidden_dim) From 9b2d068712f8f424156f05288b6790b6e5407c7d Mon Sep 17 00:00:00 2001 From: kabachuha Date: Fri, 27 Feb 2026 11:20:13 +0300 Subject: [PATCH 18/21] layer index clamping --- src/heretic/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/heretic/model.py b/src/heretic/model.py index ebeb6b90..0b2faadf 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -403,7 +403,7 @@ def abliterate( weight, index = math.modf(direction_index + 1) # Clamp index to be within the valid range [0, num_layers - 1] idx1 = int(index) - idx2 = (idx1 + 1) + idx2 = min(idx1 + 1, refusal_directions.shape[0] - 1) # Interpolate between the two chosen directions. The result has shape (n_directions, hidden_dim) interpolated_directions = torch.stack([F.normalize( r_dir[idx1].lerp( From 0be6b8722efae711f4307086603c733204e43629 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Fri, 27 Feb 2026 13:03:04 +0300 Subject: [PATCH 19/21] som neurons retrieval through win map --- config.default.toml | 3 ++ src/heretic/config.py | 5 ++++ src/heretic/main.py | 3 +- src/heretic/som.py | 65 +++++++++++++++++++++++++++++++++---------- 4 files changed, 61 insertions(+), 15 deletions(-) diff --git a/config.default.toml b/config.default.toml index d0b19662..4fc105c4 100644 --- a/config.default.toml +++ b/config.default.toml @@ -106,6 +106,9 @@ som_sigma = 0.5 # Number of top neurons to use for multidirectional SOM. som_k = 4 +# Use win map for SOM neurons retrieval. +som_use_win_map = false + # Number of abliteration trials to run during optimization. n_trials = 200 diff --git a/src/heretic/config.py b/src/heretic/config.py index 750a7814..62ef9460 100644 --- a/src/heretic/config.py +++ b/src/heretic/config.py @@ -244,6 +244,11 @@ class Settings(BaseSettings): default=4, description="Number of top neurons to use for multidirectional SOM." ) + som_use_win_map: bool = Field( + default=False, + description="Use win map for SOM neurons retrieval.", + ) + n_trials: int = Field( default=200, description="Number of abliteration trials to run during optimization.", diff --git a/src/heretic/main.py b/src/heretic/main.py index 6fa8a90c..93fa5f70 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -449,7 +449,8 @@ def run(): som_y=settings.som_y, iterations=settings.som_iterations, lr=settings.som_lr, - sigma=settings.som_sigma + sigma=settings.som_sigma, + use_win_map=settings.som_use_win_map, ) som_calc.fit(layer_residuals) diff --git a/src/heretic/som.py b/src/heretic/som.py index 776c0293..babdfbb2 100644 --- a/src/heretic/som.py +++ b/src/heretic/som.py @@ -9,7 +9,7 @@ class SOMCalculator: """ A simplified class to train a Self-Organizing Map (SOM) and extract neuron weights. """ - def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float) -> None: + def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float, use_win_map: bool = False) -> None: """ Initializes the SOM calculator with training parameters. @@ -26,6 +26,8 @@ def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: fl self.lr = lr self.sigma = sigma self.som = None + self.data = None # Store the data used for training + self.use_win_map = use_win_map def fit(self, data: np.ndarray) -> None: """ @@ -38,7 +40,8 @@ def fit(self, data: np.ndarray) -> None: if len(data.shape) != 2: raise ValueError(f"Data must be a 2D array, but got shape {data.shape}") - n_samples, n_features = data.shape + self.data = data # Store the data + _, n_features = data.shape # Initialize and train the SOM using MiniSom self.som = MiniSom( @@ -67,19 +70,53 @@ def get_top_k_neuron_weights(self, k: int) -> np.ndarray: if self.som is None: raise RuntimeError("SOM has not been trained yet. Call `fit()` first.") - winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])]) - counts = defaultdict(int) - for w in winners: - counts[tuple(w)] += 1 + if self.use_win_map: - # Sort neurons by their count (descending) and get the top-k - sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k] + win_map = self.som.win_map(self.data) - # Get the coordinates of the top-k neurons - top_k_coords = [coord for coord, _ in sorted_neurons] + # The win_map only contains neurons that won at least one data point. + # We need to account for all neurons in the grid (som_x x som_y) and + # give a count of 0 to those that didn't win anything. + all_neurons = [(i, j) for i in range(self.som_x) for j in range(self.som_y)] - # Fetch the weights for these top-k neurons - # self.som.get_weights() has shape (som_x, som_y, n_features) - top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords]) + counts = defaultdict(int) + for neuron_coords, data_indices in win_map.items(): + counts[neuron_coords] = len(data_indices) - return top_k_weights + # For neurons not in win_map, their count is 0. + # This ensures all neurons are considered in the ranking. + for neuron in all_neurons: + if neuron not in counts: + counts[neuron] = 0 + + # Sort neurons by their count (descending) and get the top-k + sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True) + top_k_neurons_with_counts = sorted_neurons[:k] + + # Get the coordinates of the top-k neurons + top_k_coords = [coord for coord, _ in top_k_neurons_with_counts] + + # Fetch the weights for these top-k neurons from the SOM's weight matrix. + # self.som.get_weights() has shape (som_x, som_y, n_features) + top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords]) + + return top_k_weights + + else: + + winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])]) + counts = defaultdict(int) + for w in winners: + counts[tuple(w)] += 1 + + # Sort neurons by their count (descending) and get the top-k + sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k] + + # Get the coordinates of the top-k neurons + top_k_coords = [coord for coord, _ in sorted_neurons] + + # Fetch the weights for these top-k neurons + # self.som.get_weights() has shape (som_x, som_y, n_features) + top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords]) + + return top_k_weights From e8338ca90cfce3167e0d978d408d96945c298bf9 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Fri, 27 Feb 2026 16:44:42 +0300 Subject: [PATCH 20/21] gemini is stupid This reverts commit 0be6b8722efae711f4307086603c733204e43629. Win map results in worse results than without it and it hasn't been in the original. --- config.default.toml | 3 -- src/heretic/config.py | 5 ---- src/heretic/main.py | 3 +- src/heretic/som.py | 65 ++++++++++--------------------------------- 4 files changed, 15 insertions(+), 61 deletions(-) diff --git a/config.default.toml b/config.default.toml index 4fc105c4..d0b19662 100644 --- a/config.default.toml +++ b/config.default.toml @@ -106,9 +106,6 @@ som_sigma = 0.5 # Number of top neurons to use for multidirectional SOM. som_k = 4 -# Use win map for SOM neurons retrieval. -som_use_win_map = false - # Number of abliteration trials to run during optimization. n_trials = 200 diff --git a/src/heretic/config.py b/src/heretic/config.py index 62ef9460..750a7814 100644 --- a/src/heretic/config.py +++ b/src/heretic/config.py @@ -244,11 +244,6 @@ class Settings(BaseSettings): default=4, description="Number of top neurons to use for multidirectional SOM." ) - som_use_win_map: bool = Field( - default=False, - description="Use win map for SOM neurons retrieval.", - ) - n_trials: int = Field( default=200, description="Number of abliteration trials to run during optimization.", diff --git a/src/heretic/main.py b/src/heretic/main.py index 93fa5f70..6fa8a90c 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -449,8 +449,7 @@ def run(): som_y=settings.som_y, iterations=settings.som_iterations, lr=settings.som_lr, - sigma=settings.som_sigma, - use_win_map=settings.som_use_win_map, + sigma=settings.som_sigma ) som_calc.fit(layer_residuals) diff --git a/src/heretic/som.py b/src/heretic/som.py index babdfbb2..776c0293 100644 --- a/src/heretic/som.py +++ b/src/heretic/som.py @@ -9,7 +9,7 @@ class SOMCalculator: """ A simplified class to train a Self-Organizing Map (SOM) and extract neuron weights. """ - def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float, use_win_map: bool = False) -> None: + def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float) -> None: """ Initializes the SOM calculator with training parameters. @@ -26,8 +26,6 @@ def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: fl self.lr = lr self.sigma = sigma self.som = None - self.data = None # Store the data used for training - self.use_win_map = use_win_map def fit(self, data: np.ndarray) -> None: """ @@ -40,8 +38,7 @@ def fit(self, data: np.ndarray) -> None: if len(data.shape) != 2: raise ValueError(f"Data must be a 2D array, but got shape {data.shape}") - self.data = data # Store the data - _, n_features = data.shape + n_samples, n_features = data.shape # Initialize and train the SOM using MiniSom self.som = MiniSom( @@ -70,53 +67,19 @@ def get_top_k_neuron_weights(self, k: int) -> np.ndarray: if self.som is None: raise RuntimeError("SOM has not been trained yet. Call `fit()` first.") - if self.use_win_map: + winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])]) + counts = defaultdict(int) + for w in winners: + counts[tuple(w)] += 1 - win_map = self.som.win_map(self.data) + # Sort neurons by their count (descending) and get the top-k + sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k] - # The win_map only contains neurons that won at least one data point. - # We need to account for all neurons in the grid (som_x x som_y) and - # give a count of 0 to those that didn't win anything. - all_neurons = [(i, j) for i in range(self.som_x) for j in range(self.som_y)] + # Get the coordinates of the top-k neurons + top_k_coords = [coord for coord, _ in sorted_neurons] - counts = defaultdict(int) - for neuron_coords, data_indices in win_map.items(): - counts[neuron_coords] = len(data_indices) + # Fetch the weights for these top-k neurons + # self.som.get_weights() has shape (som_x, som_y, n_features) + top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords]) - # For neurons not in win_map, their count is 0. - # This ensures all neurons are considered in the ranking. - for neuron in all_neurons: - if neuron not in counts: - counts[neuron] = 0 - - # Sort neurons by their count (descending) and get the top-k - sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True) - top_k_neurons_with_counts = sorted_neurons[:k] - - # Get the coordinates of the top-k neurons - top_k_coords = [coord for coord, _ in top_k_neurons_with_counts] - - # Fetch the weights for these top-k neurons from the SOM's weight matrix. - # self.som.get_weights() has shape (som_x, som_y, n_features) - top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords]) - - return top_k_weights - - else: - - winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])]) - counts = defaultdict(int) - for w in winners: - counts[tuple(w)] += 1 - - # Sort neurons by their count (descending) and get the top-k - sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k] - - # Get the coordinates of the top-k neurons - top_k_coords = [coord for coord, _ in sorted_neurons] - - # Fetch the weights for these top-k neurons - # self.som.get_weights() has shape (som_x, som_y, n_features) - top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords]) - - return top_k_weights + return top_k_weights From 444edece46741c6eb92cf7a9fe33a2929384cce8 Mon Sep 17 00:00:00 2001 From: kabachuha Date: Sat, 28 Feb 2026 14:57:20 +0300 Subject: [PATCH 21/21] fix non-som path --- src/heretic/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index 6fa8a90c..348e1be4 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -469,10 +469,10 @@ def run(): # Convert back to tensor and add to our list # Shape: (k, hidden_dim) bad_means.append(t) + + bad_means = torch.stack(bad_means, dim=0).permute(1, 0, 2) # N_directions, N_layers, hidden_dim else: - bad_means = [bad_residuals.mean(dim=0)] - - bad_means = torch.stack(bad_means, dim=0).permute(1, 0, 2) # N_directions, N_layers, hidden_dim + bad_means = bad_residuals.mean(dim=0).unsqueeze(0) # (1, N_layers, hidden_dim) refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means]