648/add excluded features (#688)

* fix: add a condition to check for an empty dataframe
* add messages to the report to list the excluded zero-standard-deviation features
* Address PR comments
* Address PR comment
* Minor fix

---------

Co-authored-by: Zhaoyang Xie <[email protected]>
EducationalTestingService · Sep 20, 2024 · 1a4afea · 1a4afea
1 parent e561d9a
commit 1a4afea
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 12 deletions.
diff --git a/rsmtool/notebooks/data_description.ipynb b/rsmtool/notebooks/data_description.ipynb
@@ -7,6 +7,17 @@
     "## Description of the data"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print the excluded features in the report, if there are any.\n",
+    "if excluded_features:\n",
+    "    display(Markdown(f\"The following feature(s) were excluded because their standard deviation in the training set is equal to 0: **{', '.join(excluded_features)}**.\"))"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/rsmtool/notebooks/header.ipynb b/rsmtool/notebooks/header.ipynb
@@ -185,6 +185,7 @@
     "use_thumbnails = environ_config.get('USE_THUMBNAILS')\n",
     "predict_expected_scores = environ_config.get('PREDICT_EXPECTED_SCORES')\n",
     "rater_error_variance = environ_config.get(\"RATER_ERROR_VARIANCE\")\n",
+    "excluded_features = environ_config.get(\"EXCLUDED_ZERO_FEATURE\")\n",
     "\n",
     "# groups for analysis by prompt or subgroup.\n",
     "groups_desc = environ_config.get('GROUPS_FOR_DESCRIPTIVES') \n",
@@ -233,7 +234,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "Markdown('''This report presents the analysis for **{}**: {}'''.format(experiment_id, description))"
+    "display(Markdown('''This report presents the analysis for **{}**: {}'''.format(experiment_id, description)))"
    ]
   },
   {

diff --git a/rsmtool/preprocessor.py b/rsmtool/preprocessor.py
@@ -363,6 +363,7 @@ class FeaturePreprocessor:
     def __init__(self, logger: Optional[logging.Logger] = None):
         """Initialize the FeaturePreprocessor object."""
         self.logger = logger if logger else logging.getLogger(__name__)
+        self.excluded_features = []
 
     def check_model_name(self, model_name: str) -> str:
         """
@@ -830,6 +831,7 @@ def filter_on_column(
                     f"training set is equal to 0."
                 )
                 drop_column = True
+                self.excluded_features.append(column)
 
         # if `drop_column` is true, then we need to drop the column
         if drop_column:
@@ -1522,17 +1524,19 @@ def filter_data(
             # and also replace any non-numeric feature values in already
             # excluded data with NaNs for consistency
             for feat in feature_names:
-                df_excluded[feat] = pd.to_numeric(df_excluded[feat], errors="coerce").astype(float)
-                newdf, newdf_excluded = self.filter_on_column(
-                    df_filtered,
-                    feat,
-                    exclude_zeros=False,
-                    exclude_zero_sd=exclude_zero_sd,
-                )
-                del df_filtered
-                df_filtered = newdf
-                with np.errstate(divide="ignore"):
-                    df_excluded = pd.concat([df_excluded, newdf_excluded], sort=True)
+                # only filter on this feature if `df_filtered` still has rows left after the previous iterations.
+                if len(df_filtered) != 0:
+                    df_excluded[feat] = pd.to_numeric(df_excluded[feat], errors="coerce").astype(float)
+                    newdf, newdf_excluded = self.filter_on_column(
+                        df_filtered,
+                        feat,
+                        exclude_zeros=False,
+                        exclude_zero_sd=exclude_zero_sd,
+                    )
+                    del df_filtered
+                    df_filtered = newdf
+                    with np.errstate(divide="ignore"):
+                        df_excluded = pd.concat([df_excluded, newdf_excluded], sort=True)
 
             # make sure that the remaining data frame is not empty
             if len(df_filtered) == 0:
@@ -2059,6 +2063,9 @@ def process_data_rsmtool(
 
         for key, value in internal_options_dict.items():
             new_config_obj[key] = value
+
+        # include the excluded features in the configuration
+        new_config_obj["excluded_features"] = self.excluded_features
 
         new_container_datasets = [
             DatasetDict({"name": "train_features", "frame": df_train_features}),

diff --git a/rsmtool/reporter.py b/rsmtool/reporter.py
@@ -680,6 +680,7 @@ def create_report(
             "JAVASCRIPT_PATH": javascript_path,
             "OUTPUT_DIR": csvdir,
             "FIGURE_DIR": figdir,
+            "EXCLUDED_ZERO_FEATURE": config.get("excluded_features", None)
         }
 
         # get the report directory which is at the same level