Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions q2_sample_classifier/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,13 @@ def fit_regressor(table: biom.Table,
n_jobs, optimize_feature_selection, parameter_tuning,
missing_samples=missing_samples, classification=False)

# this is sorted by the first column rather than 'importance' because
# the column name isn't consistent across methods - so this is the least
# invasive way to preserve order with the first column (which does
# contain the importance values)
importance = importance.sort_values(by=importance.columns[0],
ascending=False, kind='mergesort')

return estimator, importance


Expand Down Expand Up @@ -361,6 +368,14 @@ def regress_samples_ncv(
table, metadata, cv, random_state, n_jobs, n_estimators, estimator,
stratify, parameter_tuning, classification=False,
scoring=mean_squared_error, missing_samples=missing_samples)

# this is sorted by the first column rather than 'importance' because
# the column name isn't consistent across methods - so this is the least
# invasive way to preserve order with the first column (which does
# contain the importance values)
importances = importances.sort_values(by=importances.columns[0],
ascending=False, kind='mergesort')

return y_pred, importances


Expand Down
92 changes: 62 additions & 30 deletions q2_sample_classifier/tests/test_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,47 +37,79 @@ def setUp(self):
self.X = np.random.rand(50, 20)
self.y = np.random.randint(0, 2, 50)

self.exp1 = pd.Series([
0.4999999999999999, 0.52, 0.52, 0.5399999999999999,
0.44000000000000006, 0.52, 0.4600000000000001,
0.5599999999999998, 0.52, 0.52, 0.5, 0.5399999999999999, 0.54,
0.5599999999999999, 0.47999999999999987, 0.6199999999999999,
0.5399999999999999, 0.5, 0.4999999999999999, 0.45999999999999996],
index=pd.Index(range(1, 21)), name='Accuracy')
self.exp2 = pd.Series([
0.5000000000000001, 0.52, 0.48, 0.5599999999999998, 0.5,
0.5799999999999998, 0.54, 0.4600000000000001, 0.6,
0.45999999999999996, 0.45999999999999996],
index=pd.Index([1] + [i for i in range(2, 21, 2)]),
name='Accuracy')
self.exp3 = pd.Series({1: 0.4600000000000001, 20: 0.45999999999999996},
name='Accuracy')

def _fit_selector(self, step):
    """Fit an RFECV on the random fixture data at the given RFE step.

    Uses a tiny, seeded forest (2 trees, random_state=123) so repeated
    fits with the same step are deterministic and fast.
    """
    selector = RFECV(RandomForestClassifier(
        random_state=123, n_estimators=2), step=step, cv=10
    )

    # ravel() flattens y in case it arrives as a column vector
    return selector.fit(self.X, self.y.ravel())

def _assert_basic_contract(self, selector):
    """Assert the generic invariants of _extract_rfe_scores output.

    Checks that the extracted series is a pd.Series named 'Accuracy',
    contains exactly the CV mean test scores, and is indexed by a
    strictly increasing integer feature count running from 1 up to the
    total number of features. Returns the extracted series for further
    assertions by the caller.
    """
    obs = _extract_rfe_scores(selector)

    self.assertIsInstance(obs, pd.Series)
    self.assertEqual(obs.name, 'Accuracy')

    # every CV score must appear exactly once, regardless of order
    scores = selector.cv_results_['mean_test_score']
    self.assertEqual(len(obs), len(scores))

    np.testing.assert_array_equal(
        np.sort(obs.to_numpy()),
        np.sort(np.asarray(scores)))

    # index must be strictly increasing feature counts
    index = obs.index.to_numpy()
    self.assertTrue(np.all(np.diff(index) > 0))

    n_features = len(selector.ranking_)

    # the smallest point is 1 feature; the largest is the full set
    self.assertEqual(index[0], 1)
    self.assertEqual(index[-1], n_features)
    self.assertTrue(np.issubdtype(index.dtype, np.integer))

    return obs

def test_extract_rfe_scores_step_int_one(self):
    # step=1 eliminates one feature per iteration; only the generic
    # contract is checked (scores come from a stochastic forest)
    self._assert_basic_contract(self._fit_selector(1))

def test_extract_rfe_scores_step_float_one(self):
    self._assert_basic_contract(self._fit_selector(0.05))
    # for 20 features, 0.05 * 20 = 1, so this should match step=1 index
    step_int = self._fit_selector(1)
    step_float = self._fit_selector(0.05)

    obs_int = _extract_rfe_scores(step_int)
    obs_float = _extract_rfe_scores(step_float)

    np.testing.assert_array_equal(
        obs_int.index.to_numpy(), obs_float.index.to_numpy()
    )

def test_extract_rfe_scores_step_int_two(self):
    # step=2 eliminates two features per iteration; only the generic
    # contract is checked (scores come from a stochastic forest)
    self._assert_basic_contract(self._fit_selector(2))

def test_extract_rfe_scores_step_float_two(self):
    self._assert_basic_contract(self._fit_selector(0.1))
    # for 20 features, 0.1 * 20 = 2, so this should match step=2 index
    step_int = self._fit_selector(2)
    step_float = self._fit_selector(0.1)

    obs_int = _extract_rfe_scores(step_int)
    obs_float = _extract_rfe_scores(step_float)

    np.testing.assert_array_equal(
        obs_int.index.to_numpy(), obs_float.index.to_numpy()
    )

def test_extract_rfe_scores_step_full_range_out_of_range(self):
    # step >= n_features collapses RFE to a single elimination; a step
    # larger than the feature count must satisfy the same contract as
    # step == n_features
    self._assert_basic_contract(self._fit_selector(20))
    self._assert_basic_contract(self._fit_selector(21))

def test_extract_rfe_scores_step_full_range(self):
    obs_full = _extract_rfe_scores(self._fit_selector(20))
    obs_oor = _extract_rfe_scores(self._fit_selector(21))

    # an out-of-range step should be equal to the full-range result
    np.testing.assert_array_equal(
        obs_full.index.to_numpy(), obs_oor.index.to_numpy())
    pdt.assert_series_equal(obs_full, obs_oor)


# test classifier pipelines succeed on binary data
Expand Down
4 changes: 2 additions & 2 deletions q2_sample_classifier/tests/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def test_regress_samples_ncv_accuracy(self):
self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
n_estimators=2, n_jobs=1, missing_samples='ignore')
pdt.assert_series_equal(y_pred, self.exp_pred)
pdt.assert_frame_equal(importances, self.exp_imp)
pdt.assert_frame_equal(importances, self.exp_imp, atol=1e-12)

# test that fit_* methods output consistent importance scores
def test_fit_regressor(self):
Expand All @@ -344,7 +344,7 @@ def test_fit_regressor(self):
exp_imp = pd.read_csv(
self.get_data_path('importance_cv.tsv'), sep='\t', header=0,
index_col=0)
pdt.assert_frame_equal(importances, exp_imp)
pdt.assert_frame_equal(importances, exp_imp, atol=1e-12)

# just make sure this method runs. Uses the same internal function as
# fit_regressor, so importance score consistency is covered by the above
Expand Down
29 changes: 20 additions & 9 deletions q2_sample_classifier/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,15 +250,26 @@ def _rfecv_feature_selection(feature_data, targets, estimator,

def _extract_rfe_scores(rfecv):
grid_scores_ = rfecv.cv_results_['mean_test_score']
n_features = len(rfecv.ranking_)
# If using fractional step, step = integer of fraction * n_features
if rfecv.step < 1:
rfecv.step = int(rfecv.step * n_features)
# Need to manually calculate x-axis, grid_scores_ is a 1-d array
x = [n_features - (n * rfecv.step)
for n in range(len(grid_scores_)-1, -1, -1)]
if x[0] < 1:
x[0] = 1

# sklearn >= 1.5 provides the x-axis directly
# https://scikit-learn.org/stable/whats_new/v1.5.html#sklearn-feature-selection
if 'n_features' in rfecv.cv_results_:
x = rfecv.cv_results_['n_features']

else:
n_features = len(rfecv.ranking_)
# If using fractional step, step = integer of fraction * n_features
step = rfecv.step
if step < 1:
# prevent case where step = 0
step = max(1, int(step * n_features))

# Need to manually calculate x-axis, grid_scores_ is a 1-d array
x = [n_features - (n * step)
for n in range(len(grid_scores_)-1, -1, -1)]
if x[0] < 1:
x[0] = 1

return pd.Series(grid_scores_, index=x, name='Accuracy')


Expand Down
Loading