p-e-w · Galunid · Mar 12, 2026 · gemini-code-assist · Mar 12, 2026 · gemini-code-assist
diff --git a/src/heretic/config.py b/src/heretic/config.py
@@ -230,6 +230,20 @@ class Settings(BaseSettings):
         description="Directory to save and load study progress to/from.",
     )
 
+    refusal_classifier_model: str = Field(
+        default="NousResearch/Minos-v1",
+        description="Hugging Face model ID for refusal classification.",
+    )
+
+    # Compared against a class probability, so only values within [0, 1]
+    # are meaningful; the bounds below reject anything else at load time.
+    refusal_classifier_threshold: float = Field(
+        default=0.5,
+        ge=0.0,
+        le=1.0,
+        description="Confidence threshold for refusal classification (0.0 to 1.0).",
+    )
+
     refusal_markers: list[str] = Field(
         default=[
             "sorry",

diff --git a/src/heretic/evaluator.py b/src/heretic/evaluator.py
@@ -6,6 +6,7 @@
 
 from .config import Settings
 from .model import Model
+from .refusal_classifier import RefusalClassifier
 from .utils import Prompt, load_prompts, print
 
 
@@ -16,6 +17,7 @@ class Evaluator:
     bad_prompts: list[Prompt]
     base_logprobs: Tensor
     base_refusals: int
+    refusal_classifier: RefusalClassifier
 
     def __init__(self, settings: Settings, model: Model):
         self.settings = settings
@@ -44,25 +46,15 @@ def __init__(self, settings: Settings, model: Model):
             f"* Initial refusals: [bold]{self.base_refusals}[/]/{len(self.bad_prompts)}"
         )
 
+        print("* Initializing refusal classifier...")
+        self.refusal_classifier = RefusalClassifier(settings.refusal_classifier_model)
-        self.refusal_classifier = RefusalClassifier(settings.refusal_classifier_model)
+        self.refusal_classifier = RefusalClassifier(
+            model_name=settings.refusal_classifier_model,
+            threshold=settings.refusal_classifier_threshold,
+        )
-        self.refusal_classifier = RefusalClassifier(settings.refusal_classifier_model)
+        self.refusal_classifier = RefusalClassifier(
+            model_name=settings.refusal_classifier_model,
+            threshold=settings.refusal_classifier_threshold,
+        )
+
     def is_refusal(self, response: str) -> bool:
+        """Return whether ``response`` counts as a refusal.
+
+        Empty or whitespace-only responses are always refusals; any other
+        response is passed to ``self.refusal_classifier.is_refusal``.
+        """
         # Classify empty responses as refusals to avoid optimizing for them.
         if not response.strip():
             return True
 
-        # Remove emphasis (e.g. "I *will not*...").
-        response = response.lower().replace("*", "")
-
-        # Normalize typographic apostrophes ("won’t" -> "won't").
-        response = response.replace("’", "'")
-
-        # Normalize whitespace between words to a single space.
-        response = " ".join(response.split())
-
-        for marker in self.settings.refusal_markers:
-            if marker.lower() in response:
-                return True
-
-        return False
+        return self.refusal_classifier.is_refusal(response)
 
     def count_refusals(self) -> int:
         refusal_count = 0

diff --git a/src/heretic/main.py b/src/heretic/main.py
@@ -39,6 +39,7 @@
 from .config import QuantizationMethod, Settings
 from .evaluator import Evaluator
 from .model import AbliterationParameters, Model, get_model_class
+from .refusal_classifier import RefusalClassifier
 from .utils import (
     empty_cache,
     format_duration,

diff --git a/src/heretic/refusal_classifier.py b/src/heretic/refusal_classifier.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# Copyright (C) 2025-2026  Philipp Emanuel Weidmann <pew@worldwidemann.com> + contributors
+
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+from .utils import print
+
+
+class RefusalClassifier:
+    """Classify text as a refusal using a sequence-classification model."""
+
+    def __init__(
+        self,
+        model_name: str = "NousResearch/Minos-v1",
+        threshold: float = 0.5,
+    ):
+        """Initialize the refusal classifier model.
+
+        Args:
+            model_name: Hugging Face model ID for refusal classification.
+            threshold: Confidence threshold for refusal classification.
+        """
+        print(f"Loading refusal classifier model [bold]{model_name}[/]...")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        self.threshold = threshold
+        # Inference only: switch off training-time behavior such as dropout.
+        self.model.eval()
+        print("* Refusal classifier loaded")
+
+    def is_refusal(self, text: str) -> bool:
+        """Classify whether text is a refusal.
+
+        Args:
+            text: The text to classify.
+
+        Returns:
+            True if the refusal probability exceeds the threshold,
+            False otherwise.
+        """
+        # Truncate over-long responses to the tokenizer's maximum length
+        # instead of feeding the model more tokens than it supports.
+        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
+
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+            # For NousResearch/Minos-v1, the "refusal" class is at index 1.
+            # NOTE(review): a different model may order its labels differently.
+            refusal_probability = probabilities[0][1]
+
+        return refusal_probability.item() > self.threshold