From 6ebabc79431fc218edd6adcdb9d06d0b4658de58 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 21:12:32 +0300
Subject: [PATCH 01/21] obtain multiple residual directions with SOMs

TODO: make main logic work with lists
---
 src/heretic/config.py | 29 +++++++++++++++
 src/heretic/main.py   | 55 ++++++++++++++++++++++++----
 src/heretic/som.py    | 84 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 161 insertions(+), 7 deletions(-)
 create mode 100644 src/heretic/som.py

diff --git a/src/heretic/config.py b/src/heretic/config.py
index 8ed3f80c..750a7814 100644
--- a/src/heretic/config.py
+++ b/src/heretic/config.py
@@ -215,6 +215,35 @@ class Settings(BaseSettings):
         ),
     )
 
+    multidirectional_som: bool = Field(
+        default=False,
+        description="Use multidirectional Self-Organising Maps. Requires 'minisom' package to be installed.",
+    )
+
+    som_x: int = Field(
+        default=4, description="Number of SOM neurons in the x-axis."
+    )
+
+    som_y: int = Field(
+        default=4, description="Number of SOM neurons in the y-axis."
+    )
+
+    som_iterations: int = Field(
+        default=10000, description="Number of SOM training iterations."
+    )
+
+    som_lr: float = Field(
+        default=0.01, description="SOM learning rate."
+    )
+
+    som_sigma: float = Field(
+        default=0.5, description="SOM neighborhood radius."
+    )
+
+    som_k: int = Field(
+        default=4, description="Number of top neurons to use for multidirectional SOM."
+    )
+
     n_trials: int = Field(
         default=200,
         description="Number of abliteration trials to run during optimization.",
diff --git a/src/heretic/main.py b/src/heretic/main.py
index 016c3920..9a5a71f4 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -171,6 +171,13 @@ def run():
         )
         return
 
+    if settings.multidirectional_som:
+        try:
+            from minisom import MiniSom
+        except ModuleNotFoundError as _:
+            print("Self-Organizing Map is selected, but 'minisom' module not installed.\nPlease install it with 'pip install minisom'.")
+            return
+
     # Adapted from https://github.com/huggingface/accelerate/blob/main/src/accelerate/commands/env.py
     if torch.cuda.is_available():
         count = torch.cuda.device_count()
@@ -419,20 +426,54 @@ def run():
     bad_residuals = model.get_residuals_batched(bad_prompts)
 
     good_means = good_residuals.mean(dim=0)
-    bad_means = bad_residuals.mean(dim=0)
 
-    refusal_directions = F.normalize(bad_means - good_means, p=2, dim=1)
+    if settings.multidirectional_som:
+
+        from som import SOMCalculator
+        bad_means = []
+
+        # bad_residuals shape: (num_bad_prompts, num_layers, hidden_dim)
+        num_layers = bad_residuals.shape[1]
+
+        print(f"  - Retrieving multi-directions through self-organizing map...")
+
+        for layer_idx in range(num_layers):
+            print(f"  - Processing Layer {layer_idx + 1}/{num_layers}...")
+            # Extract residuals for the current layer
+            # Shape: (num_bad_prompts, hidden_dim)
+            layer_residuals = bad_residuals[:, layer_idx, :].numpy()
+
+            # Initialize and fit the SOM for this layer's residuals
+            som_calc = SOMCalculator(
+                som_x=settings.som_x,
+                som_y=settings.som_y,
+                iterations=settings.som_iterations,
+                lr=settings.som_lr,
+                sigma=settings.som_sigma
+            )
+            som_calc.fit(layer_residuals)
+
+            # Get the weights of the top-k neurons as our "bad means"
+            top_k_weights = som_calc.get_top_k_neuron_weights(k=settings.som_k)
+
+            # Convert back to tensor and add to our list
+            # Shape: (k, hidden_dim)
+            bad_means.append(torch.tensor(top_k_weights, dtype=torch.float32))
+    else:
+        bad_means = [bad_residuals.mean(dim=0)]
+
+    refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means]
 
     if settings.orthogonalize_direction:
         # Implements https://huggingface.co/blog/grimjim/projected-abliteration
         # Adjust the refusal directions so that only the component that is
         # orthogonal to the good direction is subtracted during abliteration.
         good_directions = F.normalize(good_means, p=2, dim=1)
-        projection_vector = torch.sum(refusal_directions * good_directions, dim=1)
-        refusal_directions = (
-            refusal_directions - projection_vector.unsqueeze(1) * good_directions
-        )
-        refusal_directions = F.normalize(refusal_directions, p=2, dim=1)
+        projection_vectors = [torch.sum(ref * good_directions, dim=1) for ref in refusal_directions]
+        refusal_directions = [(
+            refusal_direction - projection_vector.unsqueeze(1) * good_directions
+        ) for (refusal_direction, projection_vector) in zip(refusal_directions, projection_vectors)]
+        refusal_directions = [F.normalize(refusal_direction, p=2, dim=1) for refusal_direction in refusal_directions]
 
     analyzer = Analyzer(settings, model, good_residuals, bad_residuals)
 
diff --git a/src/heretic/som.py b/src/heretic/som.py
new file mode 100644
index 00000000..6fa012d6
--- /dev/null
+++ b/src/heretic/som.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# Copyright (C) 2025-2026  Philipp Emanuel Weidmann <pew@worldwidemann.com> + contributors
+
+import numpy as np
+from minisom import MiniSom
+from collections import defaultdict
+
+class SOMCalculator:
+    """
+    A simplified class to train a Self-Organizing Map (SOM) and extract neuron weights.
+    """
+    def __init__(self, som_x, som_y, iterations, lr, sigma):
+        """
+        Initializes the SOM calculator with training parameters.
+
+        Args:
+            som_x (int): Number of neurons in the x-axis of the SOM grid.
+            som_y (int): Number of neurons in the y-axis of the SOM grid.
+            iterations (int): Number of training iterations for the SOM.
+            lr (float): Learning rate for the SOM.
+            sigma (float): Radius of the neighborhood function.
+        """
+        self.som_x = som_x
+        self.som_y = som_y
+        self.iterations = iterations
+        self.lr = lr
+        self.sigma = sigma
+        self.som = None
+
+    def fit(self, data: np.ndarray):
+        """
+        Trains the SOM on the provided 2D data.
+
+        Args:
+            data (np.ndarray): A 2D NumPy array of shape (n_samples, n_features).
+        """
+        # Ensure data is 2D
+        if len(data.shape) != 2:
+            raise ValueError(f"Data must be a 2D array, but got shape {data.shape}")
+
+        n_samples, n_features = data.shape
+
+        # Initialize and train the SOM using MiniSom
+        self.som = MiniSom(
+            x_size=self.som_x,
+            y_size=self.som_y,
+            input_len=n_features,
+            sigma=self.sigma,
+            learning_rate=self.lr,
+            random_seed=0,  # For reproducibility
+            topology='hexagonal'
+        )
+        self.som.random_weights_init(data)
+        self.som.train_random(data, num_iteration=self.iterations)
+
+    def get_top_k_neuron_weights(self, k: int):
+        """
+        Gets the weights of the top-k neurons based on their frequency of being winners.
+
+        Args:
+            k (int): The number of top neurons to return.
+
+        Returns:
+            np.ndarray: A 2D array of shape (k, n_features) containing the weights of the top-k neurons.
+        """
+        if self.som is None:
+            raise RuntimeError("SOM has not been trained yet. Call `fit()` first.")
+
+        winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])])
+        counts = defaultdict(int)
+        for w in winners:
+            counts[tuple(w)] += 1
+
+        # Sort neurons by their count (descending) and get the top-k
+        sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k]
+
+        # Get the coordinates of the top-k neurons
+        top_k_coords = [coord for coord, _ in sorted_neurons]
+
+        # Fetch the weights for these top-k neurons
+        # self.som.get_weights() has shape (som_x, som_y, n_features)
+        top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords])
+
+        return top_k_weights

From 3ad2917acf6f406d755379a723a9681a58cb5af5 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 21:23:46 +0300
Subject: [PATCH 02/21] use multidirection in optuna optimizer

---
 src/heretic/main.py  | 16 ++++++++--------
 src/heretic/model.py |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index 9a5a71f4..75a57244 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -528,11 +528,11 @@ def objective(trial: Trial) -> tuple[float, float]:
             # The parameter ranges are based on experiments with various models
             # and much wider ranges. They are not set in stone and might have to be
             # adjusted for future models.
-            max_weight = trial.suggest_float(
-                f"{component}.max_weight",
+            max_weights = [trial.suggest_float(
+                f"{component}.max_weight.{i}",
                 0.8,
                 1.5,
-            )
+            ) for i in range(len(refusal_directions))]
             max_weight_position = trial.suggest_float(
                 f"{component}.max_weight_position",
                 0.6 * last_layer_index,
@@ -541,11 +541,11 @@ def objective(trial: Trial) -> tuple[float, float]:
             # For sampling purposes, min_weight is expressed as a fraction of max_weight,
             # again because multivariate TPE doesn't support variable-range parameters.
             # The value is transformed into the actual min_weight value below.
-            min_weight = trial.suggest_float(
-                f"{component}.min_weight",
+            min_weights = [trial.suggest_float(
+                f"{component}.min_weight.{i}",
                 0.0,
                 1.0,
-            )
+            ) for i in range(len(refusal_directions))]
             min_weight_distance = trial.suggest_float(
                 f"{component}.min_weight_distance",
                 1.0,
@@ -553,9 +553,9 @@ def objective(trial: Trial) -> tuple[float, float]:
             )
 
             parameters[component] = AbliterationParameters(
-                max_weight=max_weight,
+                max_weights=max_weights,
                 max_weight_position=max_weight_position,
-                min_weight=(min_weight * max_weight),
+                min_weights=[(min_weight * max_weight) for (min_weight, max_weight) in zip(min_weights, max_weights)],
                 min_weight_distance=min_weight_distance,
             )
 
diff --git a/src/heretic/model.py b/src/heretic/model.py
index 58300b16..0da4f807 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -46,9 +46,9 @@ def get_model_class(
 
 @dataclass
 class AbliterationParameters:
-    max_weight: float
+    max_weights: list[float]
     max_weight_position: float
-    min_weight: float
+    min_weights: list[float]
     min_weight_distance: float
 
 

From bf5f2acdfbf8200d6109e53aa79a1bd92840f062 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 22:12:06 +0300
Subject: [PATCH 03/21] multi-directional ablation

---
 src/heretic/main.py  |   3 +
 src/heretic/model.py | 192 ++++++++++++++++++++++---------------------
 2 files changed, 102 insertions(+), 93 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index 75a57244..b9153e21 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -475,6 +475,9 @@ def run():
         ) for (refusal_direction, projection_vector) in zip(refusal_directions, projection_vectors)]
         refusal_directions = [F.normalize(refusal_direction, p=2, dim=1) for refusal_direction in refusal_directions]
 
+    refusal_directions = torch.stack(refusal_directions, dim=0)
+    refusal_directions = refusal_directions.permute(1, 0, -1) # layers, directions, hidden_dim
+
     analyzer = Analyzer(settings, model, good_residuals, bad_residuals)
 
     if settings.print_residual_geometry:
diff --git a/src/heretic/model.py b/src/heretic/model.py
index 0da4f807..46c90ea2 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -378,146 +378,152 @@ def get_abliterable_components(self) -> list[str]:
 
     def abliterate(
         self,
-        refusal_directions: Tensor,
+        refusal_directions: torch.Tensor, # Shape: (layers, num_directions, hidden_dim)
         direction_index: float | None,
         parameters: dict[str, AbliterationParameters],
     ):
-        if direction_index is None:
-            refusal_direction = None
-        else:
-            # The index must be shifted by 1 because the first element
-            # of refusal_directions is the direction for the embeddings.
-            weight, index = math.modf(direction_index + 1)
-            refusal_direction = F.normalize(
-                refusal_directions[int(index)].lerp(
-                    refusal_directions[int(index) + 1],
+        """
+        Abliterates the model using multidirectional refusal vectors.
+
+        Args:
+            refusal_directions: A 3D tensor where the first dimension corresponds to layers,
+                                the second to the number of SOM-derived directions, and the third
+                                to the hidden dimension.
+            direction_index: An optional float to interpolate between two specific refusal directions
+                            (across all layers). If None, all directions are used.
+            parameters: A dictionary mapping component names to AbliterationParameters.
+                        Each AbliterationParameters instance must have max_weights and min_weights
+                        as lists of length equal to the number of directions.
+        """
+        num_layers, num_directions, _ = refusal_directions.shape
+
+        if direction_index is not None:
+            # If a specific direction is requested, interpolate across the direction dimension.
+            # For example, if direction_index=0.5, it will blend direction 0 and 1.
+            weight, index = math.modf(direction_index)
+            # Clamp index to be within the valid range [0, num_directions - 1]
+            idx1 = int(index) % num_directions
+            idx2 = (idx1 + 1) % num_directions
+            # Interpolate between the two chosen directions. The result has shape (layers, hidden_dim)
+            interpolated_directions = F.normalize(
+                refusal_directions[:, idx1].lerp(
+                    refusal_directions[:, idx2],
                     weight,
                 ),
                 p=2,
-                dim=0,
+                dim=1,
             )
+            # We will use this single "blended" set of directions for all layers.
+            # Shape: (layers, hidden_dim)
+            layer_refusal_directions = interpolated_directions
+        else:
+            # If no specific direction is given, we use all directions for each layer.
+            # Shape is already (layers, num_directions, hidden_dim)
+            layer_refusal_directions = refusal_directions
 
-        # Note that some implementations of abliteration also orthogonalize
-        # the embedding matrix, but it's unclear if that has any benefits.
-        for layer_index in range(len(self.get_layers())):
+        # Now, iterate through each layer to apply the ablation.
+        for layer_index in range(num_layers):
             for component, modules in self.get_layer_modules(layer_index).items():
                 params = parameters[component]
 
-                # Type inference fails here for some reason.
-                distance = cast(float, abs(layer_index - params.max_weight_position))
+                # Ensure the number of weights matches the number of directions
+                if len(params.max_weights) != num_directions or len(params.min_weights) != num_directions:
+                    raise ValueError(
+                        f"Mismatch in number of directions and weights for component '{component}'. "
+                        f"Found {num_directions} directions but {len(params.max_weights)} max_weights."
+                    )
 
-                # Don't orthogonalize layers that are more than
-                # min_weight_distance away from max_weight_position.
+                # Get the layer-specific position scaling factor.
+                # This factor is the same for all directions in this layer.
+                distance = cast(float, abs(layer_index - params.max_weight_position))
                 if distance > params.min_weight_distance:
+                    # If this layer is too far from the optimal position, skip it entirely.
                     continue
 
-                # Interpolate linearly between max_weight and min_weight
-                # over min_weight_distance.
-                weight = params.max_weight + (distance / params.min_weight_distance) * (
-                    params.min_weight - params.max_weight
+                # Calculate the base weight scaling factor for this layer.
+                # This factor will be applied to all directions in this layer.
+                layer_base_weight = params.max_weights[0] + (distance / params.min_weight_distance) * (
+                    params.min_weights[0] - params.max_weights[0]
                 )
+                # Note: In the original single-direction logic, this `layer_base_weight` was the
+                # final weight. Now, it's a base factor that can be further modified per direction.
 
-                if refusal_direction is None:
-                    # The index must be shifted by 1 because the first element
-                    # of refusal_directions is the direction for the embeddings.
-                    layer_refusal_direction = refusal_directions[layer_index + 1]
-                else:
-                    layer_refusal_direction = refusal_direction
+                # Get the refusal directions for the current layer.
+                # Shape: (num_directions, hidden_dim)
+                current_layer_directions = layer_refusal_directions[layer_index]
 
                 for module in modules:
-                    # FIXME: This cast is potentially invalid, because the program logic
-                    #        does not guarantee that the module is of type Linear, and in fact
-                    #        the retrieved modules might not conform to the interface assumed
-                    #        below (though they do in practice). However, this is difficult
-                    #        to fix cleanly, because get_layer_modules is called twice on
-                    #        different model configurations, and PEFT employs different
-                    #        module types depending on the chosen quantization.
                     module = cast(Linear, module)
-
-                    # LoRA abliteration: delta W = -lambda * v * (v^T W)
-                    # lora_B = -lambda * v
-                    # lora_A = v^T W
-
-                    # Use the FP32 refusal direction directly (no downcast/upcast)
-                    # and move to the correct device.
-                    v = layer_refusal_direction.to(module.weight.device)
-
-                    # Get W (dequantize if necessary).
-                    #
-                    # FIXME: This cast is valid only under the assumption that the original
-                    #        module wrapped by the LoRA adapter has a weight attribute.
-                    #        See the comment above for why this is currently not guaranteed.
+                    # Get the weight matrix W (dequantized if necessary) in FP32.
                     base_weight = cast(Tensor, module.base_layer.weight)
                     quant_state = getattr(base_weight, "quant_state", None)
-
                     if quant_state is None:
                         W = base_weight.to(torch.float32)
                     else:
-                        # 4-bit quantization.
-                        # This cast is always valid. Type inference fails here because the
-                        # bnb.functional module is not found by ty for some reason.
-                        W = cast(
-                            Tensor,
-                            bnb.functional.dequantize_4bit(  # ty:ignore[possibly-missing-attribute]
-                                base_weight.data,
-                                quant_state,
-                            ).to(torch.float32),
-                        )
+                        W = cast(Tensor, bnb.functional.dequantize_4bit(base_weight.data, quant_state)).to(torch.float32)
 
-                    # Flatten weight matrix to (out_features, in_features).
-                    W = W.view(W.shape[0], -1)
+                    W = W.view(W.shape[0], -1) # Flatten to (out_features, in_features)
 
                     if self.settings.row_normalization != RowNormalization.NONE:
-                        # Keep a reference to the original weight matrix so we can subtract it later.
                         W_org = W
-                        # Get the row norms.
                         W_row_norms = LA.vector_norm(W, dim=1, keepdim=True)
-                        # Normalize the weight matrix along the rows.
                         W = F.normalize(W, p=2, dim=1)
 
-                    # Calculate lora_A = v^T W
-                    # v is (d_out,), W is (d_out, d_in)
-                    # v @ W -> (d_in,)
-                    lora_A = (v @ W).view(1, -1)
+                    # --- The core change is here ---
+                    # We will calculate a delta for each direction and sum them up.
+                    total_delta_W = torch.zeros_like(W, device=W.device)
+
+                    for i in range(num_directions):
+                        # Get the specific refusal direction for this component.
+                        v = current_layer_directions[i].to(module.weight.device)
 
-                    # Calculate lora_B = -weight * v
-                    # v is (d_out,)
-                    lora_B = (-weight * v).view(-1, 1)
+                        # Get the optimized weight for this specific direction (i).
+                        # It's a combination of the layer's base weight and the direction's specific weight.
+                        direction_specific_weight = layer_base_weight * (params.max_weights[i] / params.max_weights[0] if params.max_weights[0] != 0 else 1.0)
 
+                        # LoRA abliteration: delta W = -lambda * v * (v^T W)
+                        # lora_B = -lambda * v
+                        # lora_A = v^T W
+                        v = v # Shape: (hidden_dim,)
+                        W_matrix = W # Shape: (hidden_dim, in_features)
+
+                        # Calculate lora_A = v^T W -> (in_features,)
+                        lora_A = (v @ W_matrix).view(1, -1)
+                        # Calculate lora_B = -direction_specific_weight * v -> (hidden_dim, 1)
+                        lora_B = (-direction_specific_weight * v).view(-1, 1)
+
+                        # The delta for this direction is lora_B @ lora_A
+                        delta_W = lora_B @ lora_A
+                        total_delta_W += delta_W
+
+                    # Now, apply the combined delta to W based on the row normalization setting.
                     if self.settings.row_normalization == RowNormalization.PRE:
-                        # Make the LoRA adapter apply to the original weight matrix.
-                        lora_B = W_row_norms * lora_B
+                        total_delta_W = W_row_norms * total_delta_W
                     elif self.settings.row_normalization == RowNormalization.FULL:
-                        # Approximates https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration
-                        W = W + lora_B @ lora_A
-                        # Normalize the adjusted weight matrix along the rows.
+                        W = W + total_delta_W
                         W = F.normalize(W, p=2, dim=1)
-                        # Restore the original row norms of the weight matrix.
                         W = W * W_row_norms
-                        # Subtract the original matrix to turn W into a delta.
                         W = W - W_org
-                        # Use a low-rank SVD to get an approximation of the matrix.
+                        # Use low-rank SVD for the full delta
                         r = self.peft_config.r
                         U, S, Vh = torch.svd_lowrank(W, q=2 * r + 4, niter=6)
-                        # Truncate it to the part we want to store in the LoRA adapter.
-                        # Note: svd_lowrank actually returns V, so transpose it to get Vh.
                         U = U[:, :r]
                         S = S[:r]
                         Vh = Vh[:, :r].T
-                        # Transfer it into the LoRA adapter components. Split the singular values
-                        # evenly between the two components to keep their norms balanced and avoid
-                        # potential issues with numerical stability.
                         sqrt_S = torch.sqrt(S)
                         lora_B = U @ torch.diag(sqrt_S)
                         lora_A = torch.diag(sqrt_S) @ Vh
-
-                    # Assign to adapters. The adapter name is "default", because that's
-                    # what PEFT uses when no name is explicitly specified, as above.
-                    # These casts are therefore valid.
-                    weight_A = cast(Tensor, module.lora_A["default"].weight)
-                    weight_B = cast(Tensor, module.lora_B["default"].weight)
-                    weight_A.data = lora_A.to(weight_A.dtype)
-                    weight_B.data = lora_B.to(weight_B.dtype)
+                        # Assign the SVD result to the LoRA weights
+                        weight_A = cast(Tensor, module.lora_A["default"].weight)
+                        weight_B = cast(Tensor, module.lora_B["default"].weight)
+                        weight_A.data = lora_A.to(weight_A.dtype)
+                        weight_B.data = lora_B.to(weight_B.dtype)
+                    else: # RowNormalization.NONE or others
+                        W = W + total_delta_W
+
+                        # Assign the new weight matrix back to the base layer.
+                        # Reshape W back to its original 2D shape.
+                        module.base_layer.weight.data = W.view_as(module.base_layer.weight).to(module.base_layer.weight.dtype)
 
     def generate(
         self,

From 46c320a7d112ee9f34ab7c662d04b949da6b4b12 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 22:15:21 +0300
Subject: [PATCH 04/21] add SOM params to config template

---
 config.default.toml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/config.default.toml b/config.default.toml
index abfa0fc7..67fb31a3 100644
--- a/config.default.toml
+++ b/config.default.toml
@@ -85,6 +85,27 @@ full_normalization_lora_rank = 3
 # of the components, then clamps the magnitudes of all components to that quantile.
 winsorization_quantile = 1.0
 
+# Use multidirectional Self-Organizing Map
+multidirectional_som = false
+
+# Number of SOM neurons in the x-axis.
+som_x = 4
+
+# Number of SOM neurons in the y-axis.
+som_y = 4
+
+# Number of SOM training iterations.
+som_iterations = 5000
+
+# SOM learning rate.
+som_lr = 0.01
+
+# SOM neighborhood radius.
+som_sigma = 0.5
+
+# Number of top neurons to use for multidirectional SOM.
+som_k = 4
+
 # Number of abliteration trials to run during optimization.
 n_trials = 200
 

From 85a6eb0376ee1af3ff45151d06bb05a771c4f68d Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 22:26:59 +0300
Subject: [PATCH 05/21] import som module via relative import

---
 src/heretic/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index b9153e21..5212e9d9 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -429,7 +429,7 @@ def run():
 
     if settings.multidirectional_som:
 
-        from som import SOMCalculator
+        from .som import SOMCalculator
         bad_means = []
 
         # bad_residuals shape: (num_bad_prompts, num_layers, hidden_dim)

From e012bb99813ea1d243fc15b9f9046de800872714 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 22:40:24 +0300
Subject: [PATCH 06/21] move tensor to CPU before converting to NumPy

---
 src/heretic/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index 5212e9d9..0544ee1b 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -441,7 +441,7 @@ def run():
             print(f"  - Processing Layer {layer_idx + 1}/{num_layers}...")
             # Extract residuals for the current layer
             # Shape: (num_bad_prompts, hidden_dim)
-            layer_residuals = bad_residuals[:, layer_idx, :].numpy()
+            layer_residuals = bad_residuals[:, layer_idx, :].cpu().float().numpy()
 
             # Initialize and fit the SOM for this layer's residuals
             som_calc = SOMCalculator(
@@ -458,7 +458,7 @@ def run():
 
             # Convert back to tensor and add to our list
             # Shape: (k, hidden_dim)
-            bad_means.append(torch.tensor(top_k_weights, dtype=torch.float32))
+            bad_means.append(torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device))
     else:
         bad_means = [bad_residuals.mean(dim=0)]
 

From 7f3b941edf6d57af75df89eaf204632109342141 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 22:40:50 +0300
Subject: [PATCH 07/21] match minisom args to reference code

---
 src/heretic/som.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/heretic/som.py b/src/heretic/som.py
index 6fa012d6..f2e83370 100644
--- a/src/heretic/som.py
+++ b/src/heretic/som.py
@@ -42,16 +42,17 @@ def fit(self, data: np.ndarray):
 
         # Initialize and train the SOM using MiniSom
         self.som = MiniSom(
-            x_size=self.som_x,
-            y_size=self.som_y,
-            input_len=n_features,
+            self.som_x,
+            self.som_y,
+            n_features,
             sigma=self.sigma,
             learning_rate=self.lr,
             random_seed=0,  # For reproducibility
+            activation_distance='euclidean',
             topology='hexagonal'
         )
         self.som.random_weights_init(data)
-        self.som.train_random(data, num_iteration=self.iterations)
+        self.som.train_random(data, self.iterations)
 
     def get_top_k_neuron_weights(self, k: int):
         """

From ce1c5891a33eee7541411e5e7360dc38ea8a5139 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 22:53:02 +0300
Subject: [PATCH 08/21] permute layers and directions

---
 src/heretic/main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index 0544ee1b..17eb1dfa 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -425,7 +425,7 @@ def run():
     print("* Obtaining residuals for bad prompts...")
     bad_residuals = model.get_residuals_batched(bad_prompts)
 
-    good_means = good_residuals.mean(dim=0)
+    good_means = good_residuals.mean(dim=0) # N_layers, hidden_dim
 
     if settings.multidirectional_som:
 
@@ -462,6 +462,8 @@ def run():
     else:
         bad_means = [bad_residuals.mean(dim=0)]
 
+    bad_means = torch.stack(bad_means).permute(1, 0, 2) # N_directions, N_layers, hidden_dim
+
     refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means]
 
     if settings.orthogonalize_direction:
@@ -476,8 +478,7 @@ def run():
         refusal_directions = [F.normalize(refusal_direction, p=2, dim=1) for refusal_direction in refusal_directions]
 
     refusal_directions = torch.stack(refusal_directions, dim=0)
-    refusal_directions = refusal_directions.permute(1, 0, -1) # layers, directions, hidden_dim
-
+    refusal_directions = refusal_directions.permute(1, 0, 2) # layers, directions, hidden_dim
     analyzer = Analyzer(settings, model, good_residuals, bad_residuals)
 
     if settings.print_residual_geometry:

From 94b858c11297d048b1a324ef4a7e58209b6563ee Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 23:08:36 +0300
Subject: [PATCH 09/21] duplicate de-duplicated neurons

---
 src/heretic/main.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index 17eb1dfa..c85e9233 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -456,13 +456,19 @@ def run():
             # Get the weights of the top-k neurons as our "bad means"
             top_k_weights = som_calc.get_top_k_neuron_weights(k=settings.som_k)
 
+            # TODO: SOM de-duplicates neurons if they are too close to each other
+            # The logic can be reworked for this fact later
+            # For now we will duplicate them manually
+            t = torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device)
+            t = t.expand(settings.som_k, t.shape[-1])
+
             # Convert back to tensor and add to our list
             # Shape: (k, hidden_dim)
-            bad_means.append(torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device))
+            bad_means.append(t)
     else:
         bad_means = [bad_residuals.mean(dim=0)]
 
-    bad_means = torch.stack(bad_means).permute(1, 0, 2) # N_directions, N_layers, hidden_dim
+    bad_means = torch.stack(bad_means, dim=0).permute(1, 0, 2) # N_directions, N_layers, hidden_dim
 
     refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means]
 

From a2a209c9892e75cb1d42beb316089a92784fc943 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 23:12:36 +0300
Subject: [PATCH 10/21] fixup suggest float count

---
 src/heretic/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index c85e9233..98672e08 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -542,7 +542,7 @@ def objective(trial: Trial) -> tuple[float, float]:
                 f"{component}.max_weight.{i}",
                 0.8,
                 1.5,
-            ) for i in range(len(refusal_directions))]
+            ) for i in range(refusal_directions.shape[1])]
             max_weight_position = trial.suggest_float(
                 f"{component}.max_weight_position",
                 0.6 * last_layer_index,
@@ -555,7 +555,7 @@ def objective(trial: Trial) -> tuple[float, float]:
                 f"{component}.min_weight.{i}",
                 0.0,
                 1.0,
-            ) for i in range(len(refusal_directions))]
+            ) for i in range(refusal_directions.shape[1])]
             min_weight_distance = trial.suggest_float(
                 f"{component}.min_weight_distance",
                 1.0,

From dad69dfc00fdd3008217205e011e364e7313034b Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 23:29:55 +0300
Subject: [PATCH 11/21] print parameter lists

---
 src/heretic/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/heretic/utils.py b/src/heretic/utils.py
index a0d5f35f..145dbba8 100644
--- a/src/heretic/utils.py
+++ b/src/heretic/utils.py
@@ -260,8 +260,11 @@ def get_trial_parameters(trial: Trial) -> dict[str, str]:
 
     for component, parameters in trial.user_attrs["parameters"].items():
         for name, value in parameters.items():
-            params[f"{component}.{name}"] = f"{value:.2f}"
-
+            if isinstance(value, list):
+                for direction, direction_value in enumerate(value):
+                    params[f"{component}.{name}.{direction}"] = f"{direction}: {direction_value:.2f}"
+            else:
+                params[f"{component}.{name}"] = f"{value:.2f}"
     return params
 
 

From 07655cdfc86f9f2fd778e51189adf8ef9127702a Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Thu, 26 Feb 2026 23:34:14 +0300
Subject: [PATCH 12/21] match the layer count to the vanilla code

---
 src/heretic/model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/heretic/model.py b/src/heretic/model.py
index 46c90ea2..0ad975b9 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -395,7 +395,7 @@ def abliterate(
                         Each AbliterationParameters instance must have max_weights and min_weights
                         as lists of length equal to the number of directions.
         """
-        num_layers, num_directions, _ = refusal_directions.shape
+        _, num_directions, _ = refusal_directions.shape
 
         if direction_index is not None:
             # If a specific direction is requested, interpolate across the direction dimension.
@@ -422,7 +422,7 @@ def abliterate(
             layer_refusal_directions = refusal_directions
 
         # Now, iterate through each layer to apply the ablation.
-        for layer_index in range(num_layers):
+        for layer_index in range(len(self.get_layers())):
             for component, modules in self.get_layer_modules(layer_index).items():
                 params = parameters[component]
 

From 31378cb7cf8bb5a83d9a91eaf6ca48161cfa24b9 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Fri, 27 Feb 2026 00:02:58 +0300
Subject: [PATCH 13/21] layerwise direction interpolation

---
 src/heretic/model.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/heretic/model.py b/src/heretic/model.py
index 0ad975b9..6f2f01fd 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -400,21 +400,21 @@ def abliterate(
         if direction_index is not None:
             # If a specific direction is requested, interpolate across the direction dimension.
             # For example, if direction_index=0.5, it will blend direction 0 and 1.
-            weight, index = math.modf(direction_index)
-            # Clamp index to be within the valid range [0, num_directions - 1]
-            idx1 = int(index) % num_directions
-            idx2 = (idx1 + 1) % num_directions
-            # Interpolate between the two chosen directions. The result has shape (layers, hidden_dim)
-            interpolated_directions = F.normalize(
-                refusal_directions[:, idx1].lerp(
-                    refusal_directions[:, idx2],
+            weight, index = math.modf(direction_index + 1)
+            # Clamp index to be within the valid range [0, num_layers - 1]
+            idx1 = int(index)
+            idx2 = (idx1 + 1)
+            # Interpolate between the two chosen directions. The result has shape (n_directions, hidden_dim)
+            interpolated_directions = torch.stack([F.normalize(
+                r_dir[idx1].lerp(
+                    r_dir[idx2],
                     weight,
                 ),
                 p=2,
-                dim=1,
-            )
+                dim=0,
+            ) for r_dir in refusal_directions.permute(1, 0, 2)], dim=0)
             # We will use this single "blended" set of directions for all layers.
-            # Shape: (layers, hidden_dim)
+            # Shape: (n_directions, hidden_dim)
             layer_refusal_directions = interpolated_directions
         else:
             # If no specific direction is given, we use all directions for each layer.
@@ -450,7 +450,10 @@ def abliterate(
 
                 # Get the refusal directions for the current layer.
                 # Shape: (num_directions, hidden_dim)
-                current_layer_directions = layer_refusal_directions[layer_index]
+                if direction_index is None:
+                    current_layer_directions = layer_refusal_directions[layer_index+1]
+                else:
+                    current_layer_directions = layer_refusal_directions
 
                 for module in modules:
                     module = cast(Linear, module)

From 3b111e86d543c652ae20b58f9b8ae01fcb699161 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Fri, 27 Feb 2026 10:39:34 +0300
Subject: [PATCH 14/21] fix up the cases without full row normalization

---
 src/heretic/model.py | 46 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/src/heretic/model.py b/src/heretic/model.py
index 6f2f01fd..ebeb6b90 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -501,7 +501,24 @@ def abliterate(
 
                     # Now, apply the combined delta to W based on the row normalization setting.
                     if self.settings.row_normalization == RowNormalization.PRE:
-                        total_delta_W = W_row_norms * total_delta_W
+                        # Use low-rank SVD to decompose the total_delta_W into LoRA matrices.
+                        r = self.peft_config.r
+                        # Perform SVD on the total_delta_W matrix
+                        U, S, Vh = torch.svd_lowrank(total_delta_W, q=min(2 * r + 4, min(total_delta_W.shape)), niter=6)
+
+                        # Truncate to rank 'r'
+                        U = U[:, :r]
+                        S = S[:r]
+                        Vh = Vh[:, :r].T # Vh is (r, in_features)
+
+                        # Split the singular values between lora_B and lora_A
+                        sqrt_S = torch.sqrt(S)
+                        lora_B = U @ torch.diag(sqrt_S)  # Shape: (out_features, r)
+                        lora_A = torch.diag(sqrt_S) @ Vh  # Shape: (r, in_features)
+
+                        # Apply PRE normalization: scale lora_B by the original row norms.
+                        # lora_B already has shape (out_features, r), W_row_norms is (out_features, 1)
+                        lora_B = W_row_norms * lora_B
                     elif self.settings.row_normalization == RowNormalization.FULL:
                         W = W + total_delta_W
                         W = F.normalize(W, p=2, dim=1)
@@ -516,17 +533,26 @@ def abliterate(
                         sqrt_S = torch.sqrt(S)
                         lora_B = U @ torch.diag(sqrt_S)
                         lora_A = torch.diag(sqrt_S) @ Vh
-                        # Assign the SVD result to the LoRA weights
-                        weight_A = cast(Tensor, module.lora_A["default"].weight)
-                        weight_B = cast(Tensor, module.lora_B["default"].weight)
-                        weight_A.data = lora_A.to(weight_A.dtype)
-                        weight_B.data = lora_B.to(weight_B.dtype)
                     else: # RowNormalization.NONE or others
-                        W = W + total_delta_W
+                        # In NONE mode, the delta is simply total_delta_W.
+                        # We decompose this delta directly using SVD.
+                        r = self.peft_config.r
+                        U, S, Vh = torch.svd_lowrank(total_delta_W, q=min(2 * r + 4, min(total_delta_W.shape)), niter=6)
+
+                        U = U[:, :r]
+                        S = S[:r]
+                        Vh = Vh[:, :r].T
+
+                        sqrt_S = torch.sqrt(S)
+                        lora_B = U @ torch.diag(sqrt_S)
+                        lora_A = torch.diag(sqrt_S) @ Vh
 
-                        # Assign the new weight matrix back to the base layer.
-                        # Reshape W back to its original 2D shape.
-                        module.base_layer.weight.data = W.view_as(module.base_layer.weight).to(module.base_layer.weight.dtype)
+                        
+                    # Assign the SVD result to the LoRA weights
+                    weight_A = cast(Tensor, module.lora_A["default"].weight)
+                    weight_B = cast(Tensor, module.lora_B["default"].weight)
+                    weight_A.data = lora_A.to(weight_A.dtype)
+                    weight_B.data = lora_B.to(weight_B.dtype)
 
     def generate(
         self,

From 001006dcb393b0c1a699339a53dd4866f6ddd708 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Fri, 27 Feb 2026 11:04:24 +0300
Subject: [PATCH 15/21] config template consistent with fields

---
 config.default.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.default.toml b/config.default.toml
index 67fb31a3..d21c68b6 100644
--- a/config.default.toml
+++ b/config.default.toml
@@ -95,7 +95,7 @@ som_x = 4
 som_y = 4
 
 # Number of SOM training iterations.
-som_iterations = 5000
+som_iterations = 10000
 
 # SOM learning rate.
 som_lr = 0.01

From c37a0462bcbeb4294bb37dac8bd6c4d279d45a29 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Fri, 27 Feb 2026 11:07:21 +0300
Subject: [PATCH 16/21] style guide

---
 config.default.toml | 2 +-
 src/heretic/som.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/config.default.toml b/config.default.toml
index d21c68b6..d0b19662 100644
--- a/config.default.toml
+++ b/config.default.toml
@@ -85,7 +85,7 @@ full_normalization_lora_rank = 3
 # of the components, then clamps the magnitudes of all components to that quantile.
 winsorization_quantile = 1.0
 
-# Use multidirectional Self-Organizing Map
+# Use multidirectional Self-Organizing Map.
 multidirectional_som = false
 
 # Number of SOM neurons in the x-axis.
diff --git a/src/heretic/som.py b/src/heretic/som.py
index f2e83370..776c0293 100644
--- a/src/heretic/som.py
+++ b/src/heretic/som.py
@@ -9,7 +9,7 @@ class SOMCalculator:
     """
     A simplified class to train a Self-Organizing Map (SOM) and extract neuron weights.
     """
-    def __init__(self, som_x, som_y, iterations, lr, sigma):
+    def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float) -> None:
         """
         Initializes the SOM calculator with training parameters.
 
@@ -27,7 +27,7 @@ def __init__(self, som_x, som_y, iterations, lr, sigma):
         self.sigma = sigma
         self.som = None
 
-    def fit(self, data: np.ndarray):
+    def fit(self, data: np.ndarray) -> None:
         """
         Trains the SOM on the provided 2D data.
 
@@ -54,7 +54,7 @@ def fit(self, data: np.ndarray):
         self.som.random_weights_init(data)
         self.som.train_random(data, self.iterations)
 
-    def get_top_k_neuron_weights(self, k: int):
+    def get_top_k_neuron_weights(self, k: int) -> np.ndarray:
         """
         Gets the weights of the top-k neurons based on their frequency of being winners.
 

From c02869ab37a8762d98fb8e41dd2e5dcf08fd90e3 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Fri, 27 Feb 2026 11:16:35 +0300
Subject: [PATCH 17/21] use repeat instead of expand for duplication

---
 src/heretic/main.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index 98672e08..6fa8a90c 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -456,11 +456,15 @@ def run():
             # Get the weights of the top-k neurons as our "bad means"
             top_k_weights = som_calc.get_top_k_neuron_weights(k=settings.som_k)
 
-            # TODO: SOM de-duplicates neurons if they are too close to each other
-            # The logic can be reworked for this fact later
-            # For now we will duplicate them manually
+            # The SOM can yield fewer than som_k neurons because it de-duplicates ones that are too close to each other.
+            # As a temporary workaround, repeat the returned neurons until there are som_k rows again.
             t = torch.tensor(top_k_weights, dtype=good_means.dtype, device=good_means.device)
-            t = t.expand(settings.som_k, t.shape[-1])
+            num_found_neurons = t.shape[0]
+
+            if num_found_neurons < settings.som_k:
+                t = t.repeat(settings.som_k // num_found_neurons, 1)
+                if t.shape[0] < settings.som_k:
+                    t = torch.cat([t, t[0:settings.som_k - t.shape[0]]], dim=0)
 
             # Convert back to tensor and add to our list
             # Shape: (k, hidden_dim)

From 9b2d068712f8f424156f05288b6790b6e5407c7d Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Fri, 27 Feb 2026 11:20:13 +0300
Subject: [PATCH 18/21] layer index clamping

---
 src/heretic/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/heretic/model.py b/src/heretic/model.py
index ebeb6b90..0b2faadf 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -403,7 +403,7 @@ def abliterate(
             weight, index = math.modf(direction_index + 1)
             # Clamp index to be within the valid range [0, num_layers - 1]
             idx1 = int(index)
-            idx2 = (idx1 + 1)
+            idx2 = min(idx1 + 1, refusal_directions.shape[0] - 1)
             # Interpolate between the two chosen directions. The result has shape (n_directions, hidden_dim)
             interpolated_directions = torch.stack([F.normalize(
                 r_dir[idx1].lerp(

From 0be6b8722efae711f4307086603c733204e43629 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Fri, 27 Feb 2026 13:03:04 +0300
Subject: [PATCH 19/21] som neurons retrieval through win map

---
 config.default.toml   |  3 ++
 src/heretic/config.py |  5 ++++
 src/heretic/main.py   |  3 +-
 src/heretic/som.py    | 65 +++++++++++++++++++++++++++++++++----------
 4 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/config.default.toml b/config.default.toml
index d0b19662..4fc105c4 100644
--- a/config.default.toml
+++ b/config.default.toml
@@ -106,6 +106,9 @@ som_sigma = 0.5
 # Number of top neurons to use for multidirectional SOM.
 som_k = 4
 
+# Use win map for SOM neurons retrieval.
+som_use_win_map = false
+
 # Number of abliteration trials to run during optimization.
 n_trials = 200
 
diff --git a/src/heretic/config.py b/src/heretic/config.py
index 750a7814..62ef9460 100644
--- a/src/heretic/config.py
+++ b/src/heretic/config.py
@@ -244,6 +244,11 @@ class Settings(BaseSettings):
         default=4, description="Number of top neurons to use for multidirectional SOM."
     )
 
+    som_use_win_map: bool = Field(
+        default=False,
+        description="Use win map for SOM neurons retrieval.",
+    )
+
     n_trials: int = Field(
         default=200,
         description="Number of abliteration trials to run during optimization.",
diff --git a/src/heretic/main.py b/src/heretic/main.py
index 6fa8a90c..93fa5f70 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -449,7 +449,8 @@ def run():
                 som_y=settings.som_y,
                 iterations=settings.som_iterations,
                 lr=settings.som_lr,
-                sigma=settings.som_sigma
+                sigma=settings.som_sigma,
+                use_win_map=settings.som_use_win_map,
             )
             som_calc.fit(layer_residuals)
 
diff --git a/src/heretic/som.py b/src/heretic/som.py
index 776c0293..babdfbb2 100644
--- a/src/heretic/som.py
+++ b/src/heretic/som.py
@@ -9,7 +9,7 @@ class SOMCalculator:
     """
     A simplified class to train a Self-Organizing Map (SOM) and extract neuron weights.
     """
-    def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float) -> None:
+    def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float, use_win_map: bool = False) -> None:
         """
         Initializes the SOM calculator with training parameters.
 
@@ -26,6 +26,8 @@ def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: fl
         self.lr = lr
         self.sigma = sigma
         self.som = None
+        self.data = None # Store the data used for training
+        self.use_win_map = use_win_map
 
     def fit(self, data: np.ndarray) -> None:
         """
@@ -38,7 +40,8 @@ def fit(self, data: np.ndarray) -> None:
         if len(data.shape) != 2:
             raise ValueError(f"Data must be a 2D array, but got shape {data.shape}")
 
-        n_samples, n_features = data.shape
+        self.data = data # Store the data
+        _, n_features = data.shape
 
         # Initialize and train the SOM using MiniSom
         self.som = MiniSom(
@@ -67,19 +70,53 @@ def get_top_k_neuron_weights(self, k: int) -> np.ndarray:
         if self.som is None:
             raise RuntimeError("SOM has not been trained yet. Call `fit()` first.")
 
-        winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])])
-        counts = defaultdict(int)
-        for w in winners:
-            counts[tuple(w)] += 1
+        if self.use_win_map:
 
-        # Sort neurons by their count (descending) and get the top-k
-        sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k]
+            win_map = self.som.win_map(self.data)
 
-        # Get the coordinates of the top-k neurons
-        top_k_coords = [coord for coord, _ in sorted_neurons]
+            # The win_map only contains neurons that won at least one data point.
+            # We need to account for all neurons in the grid (som_x x som_y) and
+            # give a count of 0 to those that didn't win anything.
+            all_neurons = [(i, j) for i in range(self.som_x) for j in range(self.som_y)]
 
-        # Fetch the weights for these top-k neurons
-        # self.som.get_weights() has shape (som_x, som_y, n_features)
-        top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords])
+            counts = defaultdict(int)
+            for neuron_coords, data_indices in win_map.items():
+                counts[neuron_coords] = len(data_indices)
 
-        return top_k_weights
+            # For neurons not in win_map, their count is 0.
+            # This ensures all neurons are considered in the ranking.
+            for neuron in all_neurons:
+                if neuron not in counts:
+                    counts[neuron] = 0
+
+            # Sort neurons by their count (descending) and get the top-k
+            sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)
+            top_k_neurons_with_counts = sorted_neurons[:k]
+
+            # Get the coordinates of the top-k neurons
+            top_k_coords = [coord for coord, _ in top_k_neurons_with_counts]
+
+            # Fetch the weights for these top-k neurons from the SOM's weight matrix.
+            # self.som.get_weights() has shape (som_x, som_y, n_features)
+            top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords])
+
+            return top_k_weights
+
+        else:
+
+            winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])])
+            counts = defaultdict(int)
+            for w in winners:
+                counts[tuple(w)] += 1
+
+            # Sort neurons by their count (descending) and get the top-k
+            sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k]
+
+            # Get the coordinates of the top-k neurons
+            top_k_coords = [coord for coord, _ in sorted_neurons]
+
+            # Fetch the weights for these top-k neurons
+            # self.som.get_weights() has shape (som_x, som_y, n_features)
+            top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords])
+
+            return top_k_weights

From e8338ca90cfce3167e0d978d408d96945c298bf9 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Fri, 27 Feb 2026 16:44:42 +0300
Subject: [PATCH 20/21] gemini is stupid

This reverts commit 0be6b8722efae711f4307086603c733204e43629. Retrieving the neurons through the win map gave worse results than the previous approach, and it was not part of the original implementation.
---
 config.default.toml   |  3 --
 src/heretic/config.py |  5 ----
 src/heretic/main.py   |  3 +-
 src/heretic/som.py    | 65 ++++++++++---------------------------------
 4 files changed, 15 insertions(+), 61 deletions(-)

diff --git a/config.default.toml b/config.default.toml
index 4fc105c4..d0b19662 100644
--- a/config.default.toml
+++ b/config.default.toml
@@ -106,9 +106,6 @@ som_sigma = 0.5
 # Number of top neurons to use for multidirectional SOM.
 som_k = 4
 
-# Use win map for SOM neurons retrieval.
-som_use_win_map = false
-
 # Number of abliteration trials to run during optimization.
 n_trials = 200
 
diff --git a/src/heretic/config.py b/src/heretic/config.py
index 62ef9460..750a7814 100644
--- a/src/heretic/config.py
+++ b/src/heretic/config.py
@@ -244,11 +244,6 @@ class Settings(BaseSettings):
         default=4, description="Number of top neurons to use for multidirectional SOM."
     )
 
-    som_use_win_map: bool = Field(
-        default=False,
-        description="Use win map for SOM neurons retrieval.",
-    )
-
     n_trials: int = Field(
         default=200,
         description="Number of abliteration trials to run during optimization.",
diff --git a/src/heretic/main.py b/src/heretic/main.py
index 93fa5f70..6fa8a90c 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -449,8 +449,7 @@ def run():
                 som_y=settings.som_y,
                 iterations=settings.som_iterations,
                 lr=settings.som_lr,
-                sigma=settings.som_sigma,
-                use_win_map=settings.som_use_win_map,
+                sigma=settings.som_sigma
             )
             som_calc.fit(layer_residuals)
 
diff --git a/src/heretic/som.py b/src/heretic/som.py
index babdfbb2..776c0293 100644
--- a/src/heretic/som.py
+++ b/src/heretic/som.py
@@ -9,7 +9,7 @@ class SOMCalculator:
     """
     A simplified class to train a Self-Organizing Map (SOM) and extract neuron weights.
     """
-    def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float, use_win_map: bool = False) -> None:
+    def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: float) -> None:
         """
         Initializes the SOM calculator with training parameters.
 
@@ -26,8 +26,6 @@ def __init__(self, som_x: int, som_y: int, iterations: int, lr: float, sigma: fl
         self.lr = lr
         self.sigma = sigma
         self.som = None
-        self.data = None # Store the data used for training
-        self.use_win_map = use_win_map
 
     def fit(self, data: np.ndarray) -> None:
         """
@@ -40,8 +38,7 @@ def fit(self, data: np.ndarray) -> None:
         if len(data.shape) != 2:
             raise ValueError(f"Data must be a 2D array, but got shape {data.shape}")
 
-        self.data = data # Store the data
-        _, n_features = data.shape
+        n_samples, n_features = data.shape
 
         # Initialize and train the SOM using MiniSom
         self.som = MiniSom(
@@ -70,53 +67,19 @@ def get_top_k_neuron_weights(self, k: int) -> np.ndarray:
         if self.som is None:
             raise RuntimeError("SOM has not been trained yet. Call `fit()` first.")
 
-        if self.use_win_map:
+        winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])])
+        counts = defaultdict(int)
+        for w in winners:
+            counts[tuple(w)] += 1
 
-            win_map = self.som.win_map(self.data)
+        # Sort neurons by their count (descending) and get the top-k
+        sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k]
 
-            # The win_map only contains neurons that won at least one data point.
-            # We need to account for all neurons in the grid (som_x x som_y) and
-            # give a count of 0 to those that didn't win anything.
-            all_neurons = [(i, j) for i in range(self.som_x) for j in range(self.som_y)]
+        # Get the coordinates of the top-k neurons
+        top_k_coords = [coord for coord, _ in sorted_neurons]
 
-            counts = defaultdict(int)
-            for neuron_coords, data_indices in win_map.items():
-                counts[neuron_coords] = len(data_indices)
+        # Fetch the weights for these top-k neurons
+        # self.som.get_weights() has shape (som_x, som_y, n_features)
+        top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords])
 
-            # For neurons not in win_map, their count is 0.
-            # This ensures all neurons are considered in the ranking.
-            for neuron in all_neurons:
-                if neuron not in counts:
-                    counts[neuron] = 0
-
-            # Sort neurons by their count (descending) and get the top-k
-            sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)
-            top_k_neurons_with_counts = sorted_neurons[:k]
-
-            # Get the coordinates of the top-k neurons
-            top_k_coords = [coord for coord, _ in top_k_neurons_with_counts]
-
-            # Fetch the weights for these top-k neurons from the SOM's weight matrix.
-            # self.som.get_weights() has shape (som_x, som_y, n_features)
-            top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords])
-
-            return top_k_weights
-
-        else:
-
-            winners = np.array([self.som.winner(x) for x in self.som._weights.reshape(-1, self.som._weights.shape[2])])
-            counts = defaultdict(int)
-            for w in winners:
-                counts[tuple(w)] += 1
-
-            # Sort neurons by their count (descending) and get the top-k
-            sorted_neurons = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:k]
-
-            # Get the coordinates of the top-k neurons
-            top_k_coords = [coord for coord, _ in sorted_neurons]
-
-            # Fetch the weights for these top-k neurons
-            # self.som.get_weights() has shape (som_x, som_y, n_features)
-            top_k_weights = np.array([self.som.get_weights()[i, j] for i, j in top_k_coords])
-
-            return top_k_weights
+        return top_k_weights

From 444edece46741c6eb92cf7a9fe33a2929384cce8 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Sat, 28 Feb 2026 14:57:20 +0300
Subject: [PATCH 21/21] fix non-som path

---
 src/heretic/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index 6fa8a90c..348e1be4 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -469,10 +469,10 @@ def run():
             # Convert back to tensor and add to our list
             # Shape: (k, hidden_dim)
             bad_means.append(t)
+        
+        bad_means = torch.stack(bad_means, dim=0).permute(1, 0, 2) # N_directions, N_layers, hidden_dim
     else:
-        bad_means = [bad_residuals.mean(dim=0)]
-
-    bad_means = torch.stack(bad_means, dim=0).permute(1, 0, 2) # N_directions, N_layers, hidden_dim
+        bad_means = bad_residuals.mean(dim=0).unsqueeze(0)  # (1, N_layers, hidden_dim)
 
     refusal_directions = [F.normalize(bad_mean - good_means, p=2, dim=1) for bad_mean in bad_means]