Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions config.default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,24 @@ n_trials = 200
# Number of trials that use random sampling for the purpose of exploration.
n_startup_trials = 60

# Constraints for the optimization search space (layers and weights).
#
# NOTE: These settings deliberately use dotted keys rather than [constraints]
# table headers. A table header captures every key that follows it until the
# next header, so top-level settings placed after this section (such as
# study_checkpoint_dir) would otherwise end up inside the last constraints
# table instead of at the top level.

# Fraction of layers (from 0.0 to 1.0) where the direction search starts.
constraints.layer_start_fraction = 0.4
# Fraction of layers (from 0.0 to 1.0) where the direction search ends.
constraints.layer_end_fraction = 0.9

# Search constraints for Attention components (e.g., o_proj).
constraints.attention.max_weight_min = 0.8
constraints.attention.max_weight_max = 1.5

# Search constraints for MLP components (e.g., down_proj).
constraints.mlp.max_weight_min = 0.8
constraints.mlp.max_weight_max = 1.5


# Directory to save and load study progress to/from.
study_checkpoint_dir = "checkpoints"

Expand Down
35 changes: 35 additions & 0 deletions src/heretic/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,36 @@ class DatasetSpecification(BaseModel):
)


class ComponentConstraints(BaseModel):
    # Search range for the max_weight parameter of a single component type.
    # The two fields are the lower and upper bound of that range.

    # Lower bound of the range.
    max_weight_min: float = Field(
        0.8,
        description="Minimum value for the max_weight parameter search range.",
    )

    # Upper bound of the range.
    max_weight_max: float = Field(
        1.5,
        description="Maximum value for the max_weight parameter search range.",
    )


class OptimizationConstraints(BaseModel):
    # Bounds of the optimization search space: the layer window used for the
    # direction search, plus a max_weight range per component type.
    #
    # NOTE(review): nothing here checks that layer_start_fraction is less than
    # or equal to layer_end_fraction; an inverted pair would produce an
    # inverted search range downstream. Consider a model-level validator.

    # The layer fractions are scaled by the last layer index to obtain the
    # bounds of the direction index search, so values outside the documented
    # [0.0, 1.0] interval are rejected at validation time instead of yielding
    # an index outside the layer range.
    layer_start_fraction: float = Field(
        default=0.4,
        ge=0.0,
        le=1.0,
        description="Fraction of layers (from 0.0 to 1.0) where the direction search starts.",
    )
    layer_end_fraction: float = Field(
        default=0.9,
        ge=0.0,
        le=1.0,
        description="Fraction of layers (from 0.0 to 1.0) where the direction search ends.",
    )

    # Per-component max_weight ranges. default_factory builds a fresh
    # ComponentConstraints (with its own defaults) when the section is omitted.
    attention: ComponentConstraints = Field(
        default_factory=ComponentConstraints,
        description="Search constraints for Attention components (e.g., o_proj).",
    )
    mlp: ComponentConstraints = Field(
        default_factory=ComponentConstraints,
        description="Search constraints for MLP components (e.g., down_proj).",
    )


class Settings(BaseSettings):
model: str = Field(description="Hugging Face model ID, or path to model on disk.")

Expand Down Expand Up @@ -225,6 +255,11 @@ class Settings(BaseSettings):
description="Number of trials that use random sampling for the purpose of exploration.",
)

constraints: OptimizationConstraints = Field(
default_factory=OptimizationConstraints,
description="Constraints for the optimization search space (layers and weights).",
)

study_checkpoint_dir: str = Field(
default="checkpoints",
description="Directory to save and load study progress to/from.",
Expand Down
13 changes: 9 additions & 4 deletions src/heretic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,8 +474,8 @@ def objective(trial: Trial) -> tuple[float, float]:
# work with conditional or variable-range parameters.
direction_index = trial.suggest_float(
"direction_index",
0.4 * last_layer_index,
0.9 * last_layer_index,
settings.constraints.layer_start_fraction * last_layer_index,
settings.constraints.layer_end_fraction * last_layer_index,
)

if direction_scope == "per layer":
Expand All @@ -487,10 +487,15 @@ def objective(trial: Trial) -> tuple[float, float]:
# The parameter ranges are based on experiments with various models
# and much wider ranges. They are not set in stone and might have to be
# adjusted for future models.

if "down_proj" in component:
constraints = settings.constraints.mlp
else:
constraints = settings.constraints.attention
Comment on lines +491 to +494
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The logic to differentiate between MLP and attention components is brittle as it only checks for the presence of "down_proj" to identify an MLP component. For many models, such as the Llama family, MLP layers also include gate_proj and up_proj. With the current logic, these components would incorrectly receive attention-specific constraints, which could lead to suboptimal optimization or unexpected behavior. A more robust approach would be to check against a more comprehensive set of known MLP component names.

Suggested change
if "down_proj" in component:
constraints = settings.constraints.mlp
else:
constraints = settings.constraints.attention
if any(c in component for c in ("gate_proj", "up_proj", "down_proj")):
constraints = settings.constraints.mlp
else:
constraints = settings.constraints.attention

Copy link
Copy Markdown
Author

@KaraKaraWitch KaraKaraWitch Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my understanding, we only capture down_proj in the get_layer_modules() function. Unless we specifically want to do gate & up proj, I doubt this is a show stopper.

(Yes, I know I'm replying to a code assistant bot, but it's for everyone to note)

max_weight = trial.suggest_float(
f"{component}.max_weight",
0.8,
1.5,
constraints.max_weight_min,
constraints.max_weight_max,
)
max_weight_position = trial.suggest_float(
f"{component}.max_weight_position",
Expand Down
Loading