108 changes: 92 additions & 16 deletions xbooster/lgb_constructor.py
@@ -36,6 +36,7 @@
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from ._utils import calculate_information_value, calculate_weight_of_evidence

# Note: These will be needed when implementing the methods:
# from typing import Optional
@@ -171,18 +172,9 @@ def get_leafs(
# Note: This is an approximation - pred_contrib gives feature contributions
# For now, we'll use predict with num_iteration to get cumulative scores
for i in range(n_trees):
-            if i == 0:
-                # First tree contribution is the raw score from just that tree
-                tree_margin = (
-                    self.model.predict(X, raw_score=True, num_iteration=1) - self.base_score
-                )
-            else:
-                # Subsequent trees: difference between cumulative scores
-                curr_score = self.model.predict(X, raw_score=True, num_iteration=i + 1)
-                prev_score = self.model.predict(X, raw_score=True, num_iteration=i)
-                tree_margin = curr_score - prev_score
-
-            df_leafs[f"tree_{i}"] = tree_margin
+            df_leafs[f"tree_{i}"] = self.model.predict(
+                X, raw_score=True, start_iteration=i, num_iteration=1
+            ) - self.base_score * (i == 0)

return df_leafs
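The one-call form above assumes that slicing a single tree with start_iteration reproduces the cumulative-difference margin the old loop computed. A minimal check of that assumption (a sketch, not part of the PR; margins_match, model, and X are hypothetical names standing in for the fitted LGBMClassifier and feature frame used above):

import numpy as np

def margins_match(model, X, i: int) -> bool:
    """For i >= 1, compare the old cumulative-difference margin with the sliced one.

    Tree 0 is excluded: num_iteration <= 0 means "use all iterations" in LightGBM,
    and the first tree additionally carries the base score, which the code above
    subtracts separately.
    """
    curr = model.predict(X, raw_score=True, num_iteration=i + 1)
    prev = model.predict(X, raw_score=True, num_iteration=i)
    sliced = model.predict(X, raw_score=True, start_iteration=i, num_iteration=1)
    return np.allclose(curr - prev, sliced)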

@@ -220,6 +212,8 @@ def extract_leaf_weights(self) -> pd.DataFrame:
leaf_nodes = tree_df[tree_df["split_feature"].isna()][
["tree_index", "node_index", "value"]
].copy()
# Make leaf index relative within each tree
leaf_nodes["relative_leaf_index"] = leaf_nodes.groupby("tree_index").cumcount()
Comment on lines 220 to +229

suggestion (bug_risk): Using cumcount for relative_leaf_index assumes leaf_nodes are sorted as intended.

Ensure leaf_nodes is sorted correctly within each tree before using cumcount to prevent incorrect relative indices.

Suggested change
-        leaf_nodes = tree_df[tree_df["split_feature"].isna()][
-            ["tree_index", "node_index", "value"]
-        ].copy()
-        # Make leaf index relative within each tree
-        leaf_nodes["relative_leaf_index"] = leaf_nodes.groupby("tree_index").cumcount()
+        leaf_nodes = tree_df[tree_df["split_feature"].isna()][
+            ["tree_index", "node_index", "value"]
+        ].copy()
+        # Ensure leaf_nodes are sorted by tree_index and node_index before assigning relative_leaf_index
+        leaf_nodes = leaf_nodes.sort_values(["tree_index", "node_index"]).reset_index(drop=True)
+        leaf_nodes["relative_leaf_index"] = leaf_nodes.groupby("tree_index").cumcount()


# Helper function to merge decision nodes with leaf values
def merge_and_format(decisions, leafs, child_column, sign):
@@ -232,7 +226,7 @@
)
result = merged.rename(
columns={
"node_index_y": "Node", # Leaf node index
"relative_leaf_index": "Node", # Leaf node index
"split_feature": "Feature",
"threshold": "Split",
"value": "XAddEvidence",
@@ -266,10 +260,92 @@ def construct_scorecard(self) -> pd.DataFrame:
- Use get_leafs() to map observations to leaf nodes
- Calculate event rates per leaf
- Apply WOE/IV calculations from _utils

-        TODO: Implement this method following XGBoost pattern
"""
-        raise NotImplementedError("construct_scorecard() method needs to be implemented")
n_trees = self.booster_.num_trees()
labels = self.y
tree_leaf_idx = self.booster_.predict(self.X, pred_leaf=True)
if tree_leaf_idx.shape != (len(labels), n_trees):
raise ValueError(
f"Invalid leaf index shape {tree_leaf_idx.shape}. Expected {(len(labels), n_trees)}"
)

df_binning_table = pd.DataFrame()
for i in range(n_trees):
index_and_label = pd.concat(
[
pd.Series(tree_leaf_idx[:, i], name="leaf_idx"),
pd.Series(labels, name="label"),
],
axis=1,
)
# Create a binning table
binning_table = (
index_and_label.groupby("leaf_idx").agg(["sum", "count"]).reset_index()
).astype(float)
binning_table.columns = ["leaf_idx", "Events", "Count"] # type: ignore
binning_table["tree"] = i
binning_table["NonEvents"] = binning_table["Count"] - binning_table["Events"]
binning_table["EventRate"] = binning_table["Events"] / binning_table["Count"]
binning_table = binning_table[
["tree", "leaf_idx", "Events", "NonEvents", "Count", "EventRate"]
]
Comment on lines +294 to +304

suggestion: Directly renaming columns after groupby/agg may be fragile if the aggregation changes.

To avoid issues if aggregation changes, assign column names based on the aggregation output or reference columns by name rather than relying on order.

Suggested change
-            # Create a binning table
-            binning_table = (
-                index_and_label.groupby("leaf_idx").agg(["sum", "count"]).reset_index()
-            ).astype(float)
-            binning_table.columns = ["leaf_idx", "Events", "Count"]  # type: ignore
-            binning_table["tree"] = i
-            binning_table["NonEvents"] = binning_table["Count"] - binning_table["Events"]
-            binning_table["EventRate"] = binning_table["Events"] / binning_table["Count"]
-            binning_table = binning_table[
-                ["tree", "leaf_idx", "Events", "NonEvents", "Count", "EventRate"]
-            ]
+            # Create a binning table
+            binning_table = (
+                index_and_label.groupby("leaf_idx").agg({"label": ["sum", "count"]}).reset_index()
+            ).astype(float)
+            # Flatten MultiIndex columns
+            binning_table.columns = ["leaf_idx", "Events", "Count"]
+            binning_table["tree"] = i
+            binning_table["NonEvents"] = binning_table["Count"] - binning_table["Events"]
+            binning_table["EventRate"] = binning_table["Events"] / binning_table["Count"]
+            binning_table = binning_table[
+                ["tree", "leaf_idx", "Events", "NonEvents", "Count", "EventRate"]
+            ]
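One way to realize this suggestion even more robustly (a sketch, not code from the PR; make_binning_table is a hypothetical helper): pandas named aggregation yields flat, explicitly named columns, so no positional rename is needed at all.

import pandas as pd

def make_binning_table(index_and_label: pd.DataFrame, tree: int) -> pd.DataFrame:
    """Per-leaf event counts without relying on the order of aggregated columns."""
    table = (
        index_and_label.groupby("leaf_idx")["label"]
        .agg(Events="sum", Count="count")  # named aggregation: columns keyed by name
        .reset_index()
        .astype(float)
    )
    table["tree"] = tree
    table["NonEvents"] = table["Count"] - table["Events"]
    table["EventRate"] = table["Events"] / table["Count"]
    return table[["tree", "leaf_idx", "Events", "NonEvents", "Count", "EventRate"]]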

# Aggregate indices, leafs, and counts of events and non-events
df_binning_table = pd.concat([df_binning_table, binning_table], axis=0)
# Extract leaf weights (XAddEvidence)
df_x_add_evidence = self.extract_leaf_weights()
self.lgb_scorecard = df_x_add_evidence.merge(
df_binning_table,
left_on=["Tree", "Node"],
right_on=["tree", "leaf_idx"],
how="left",
).drop(["tree", "leaf_idx"], axis=1)

self.lgb_scorecard = self.lgb_scorecard[
[
"Tree",
"Node",
"Feature",
"Sign",
"Split",
"Count",
"NonEvents",
"Events",
"EventRate",
"XAddEvidence",
]
]

# Sort by Tree and Node
self.lgb_scorecard = self.lgb_scorecard.sort_values(by=["Tree", "Node"]).reset_index(
drop=True
)
# Get WOE and IV scores
self.lgb_scorecard["WOE"] = calculate_weight_of_evidence(self.lgb_scorecard)["WOE"]
self.lgb_scorecard["IV"] = calculate_information_value(self.lgb_scorecard)["IV"]

# Get % of observation counts in a Split
self.lgb_scorecard["CountPct"] = self.lgb_scorecard["Count"] / self.lgb_scorecard.groupby(
"Tree"
)["Count"].transform("sum")

self.lgb_scorecard = self.lgb_scorecard[
[
"Tree",
"Node",
"Feature",
"Sign",
"Split",
"Count",
"CountPct",
"NonEvents",
"Events",
"EventRate",
"WOE",
"IV",
"XAddEvidence",
]
]
return self.lgb_scorecard
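For context on the two derived columns at the end: _utils is not part of this diff, so calculate_weight_of_evidence and calculate_information_value are shown only by name. A sketch of the textbook definitions their names suggest (an assumption about the helpers, not their actual source):

import numpy as np
import pandas as pd

def woe_iv(table: pd.DataFrame) -> pd.DataFrame:
    """Conventional WOE/IV for a binning table with Events/NonEvents columns.

    Assumes WOE = ln(share of non-events / share of events) per bin; the real
    _utils implementation may differ (e.g., smoothing to avoid division by zero).
    """
    out = table.copy()
    dist_nonevent = out["NonEvents"] / out["NonEvents"].sum()
    dist_event = out["Events"] / out["Events"].sum()
    out["WOE"] = np.log(dist_nonevent / dist_event)
    out["IV"] = (dist_nonevent - dist_event) * out["WOE"]
    return out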

def create_points(
self,