Skip to content

Commit

Permalink
648/add excluded features (#688)
Browse files Browse the repository at this point in the history
* fix: add a condition to check empty dataframe

* add messages to inform excluded zero features in the report

* Address PR comments

* Address PR comment

* Minor fix

---------

Co-authored-by: Zhaoyang Xie <[email protected]>
  • Loading branch information
damien2012eng and Zhaoyang Xie authored Sep 20, 2024
1 parent e561d9a commit 1a4afea
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 12 deletions.
11 changes: 11 additions & 0 deletions rsmtool/notebooks/data_description.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@
"## Description of the data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print the excluded features in the report, if there are any.\n",
"if excluded_features:\n",
" display(Markdown(f\"The following feature(s) were excluded because their standard deviation in the training set is equal to 0: **{', '.join(excluded_features)}**.\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
3 changes: 2 additions & 1 deletion rsmtool/notebooks/header.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@
"use_thumbnails = environ_config.get('USE_THUMBNAILS')\n",
"predict_expected_scores = environ_config.get('PREDICT_EXPECTED_SCORES')\n",
"rater_error_variance = environ_config.get(\"RATER_ERROR_VARIANCE\")\n",
"excluded_features = environ_config.get(\"EXCLUDED_ZERO_FEATURE\")\n",
"\n",
"# groups for analysis by prompt or subgroup.\n",
"groups_desc = environ_config.get('GROUPS_FOR_DESCRIPTIVES') \n",
Expand Down Expand Up @@ -233,7 +234,7 @@
"metadata": {},
"outputs": [],
"source": [
"Markdown('''This report presents the analysis for **{}**: {}'''.format(experiment_id, description))"
"display(Markdown('''This report presents the analysis for **{}**: {}'''.format(experiment_id, description)))"
]
},
{
Expand Down
29 changes: 18 additions & 11 deletions rsmtool/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ class FeaturePreprocessor:
def __init__(self, logger: Optional[logging.Logger] = None):
    """
    Initialize the FeaturePreprocessor object.

    Parameters
    ----------
    logger : Optional[logging.Logger]
        Logger object to attach to this instance. If no logger is
        given (or the given value is falsy), a logger named after
        this module is used instead.
        Defaults to ``None``.
    """
    # Starts out empty; ``filter_on_column()`` appends a column name here
    # when it flags that column to be dropped for having a standard
    # deviation of 0.
    self.excluded_features = []

    # Fall back to the module-level logger when none was supplied.
    self.logger = logger or logging.getLogger(__name__)

def check_model_name(self, model_name: str) -> str:
"""
Expand Down Expand Up @@ -830,6 +831,7 @@ def filter_on_column(
f"training set is equal to 0."
)
drop_column = True
self.excluded_features.append(column)

# if `drop_column` is true, then we need to drop the column
if drop_column:
Expand Down Expand Up @@ -1522,17 +1524,19 @@ def filter_data(
# and also replace any non-numeric feature values in already
# excluded data with NaNs for consistency
for feat in feature_names:
df_excluded[feat] = pd.to_numeric(df_excluded[feat], errors="coerce").astype(float)
newdf, newdf_excluded = self.filter_on_column(
df_filtered,
feat,
exclude_zeros=False,
exclude_zero_sd=exclude_zero_sd,
)
del df_filtered
df_filtered = newdf
with np.errstate(divide="ignore"):
df_excluded = pd.concat([df_excluded, newdf_excluded], sort=True)
# only filter on this feature if `df_filtered` still contains data after the filtering done in previous iterations.
if len(df_filtered) != 0:
df_excluded[feat] = pd.to_numeric(df_excluded[feat], errors="coerce").astype(float)
newdf, newdf_excluded = self.filter_on_column(
df_filtered,
feat,
exclude_zeros=False,
exclude_zero_sd=exclude_zero_sd,
)
del df_filtered
df_filtered = newdf
with np.errstate(divide="ignore"):
df_excluded = pd.concat([df_excluded, newdf_excluded], sort=True)

# make sure that the remaining data frame is not empty
if len(df_filtered) == 0:
Expand Down Expand Up @@ -2059,6 +2063,9 @@ def process_data_rsmtool(

for key, value in internal_options_dict.items():
new_config_obj[key] = value

# include the excluded features in the configuration
new_config_obj["excluded_features"] = self.excluded_features

new_container_datasets = [
DatasetDict({"name": "train_features", "frame": df_train_features}),
Expand Down
1 change: 1 addition & 0 deletions rsmtool/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,7 @@ def create_report(
"JAVASCRIPT_PATH": javascript_path,
"OUTPUT_DIR": csvdir,
"FIGURE_DIR": figdir,
"EXCLUDED_ZERO_FEATURE": config.get("excluded_features", None)
}

# get the report directory which is at the same level
Expand Down

0 comments on commit 1a4afea

Please sign in to comment.