# Install "numpy scipy matplotlib ipython pandas sympy" libraries via pip
# NOTE: the "!" line below is an IPython/Jupyter shell escape, not plain
# Python -- this file is a notebook export and will not run as a .py script.
# `import sys` is not used by this cell.
import sys
!pip install --user numpy scipy pandas sympy
Requirement already satisfied: numpy in /usr/local/lib/python2.7/dist-packages
Requirement already satisfied: scipy in /usr/local/lib/python2.7/dist-packages
Requirement already satisfied: pandas in /usr/local/lib/python2.7/dist-packages
Requirement already satisfied: sympy in /usr/lib/python2.7/dist-packages
Requirement already satisfied: pytz>=2011k in /usr/lib/python2.7/dist-packages (from pandas)
Requirement already satisfied: python-dateutil in /usr/lib/python2.7/dist-packages (from pandas)
Requirement already satisfied: six in /usr/local/lib/python2.7/dist-packages (from python-dateutil->pandas)
# Install scikit-learn via pip
# "!" is an IPython shell escape (notebook export); `import sys` is unused.
import sys
!pip install -U scikit-learn
Requirement already up-to-date: scikit-learn in /usr/local/lib/python2.7/dist-packages
# Install themis-ml via pip
# If this does not work, then you can install this library manually: try following steps from your own terminal
# cd <to this directory>
# sudo python themis-ml/setup.py build
# sudo python themis-ml/setup.py install
# "!" is an IPython shell escape (notebook export); `import sys` is unused.
import sys
!pip install --user themis-ml
Requirement already satisfied: themis-ml in /usr/local/lib/python2.7/dist-packages/themis_ml-0.0.4-py2.7.egg
Requirement already satisfied: scikit-learn>=0.19.1 in /usr/local/lib/python2.7/dist-packages (from themis-ml)
Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python2.7/dist-packages (from themis-ml)
Requirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python2.7/dist-packages (from themis-ml)
Requirement already satisfied: pandas>=0.22.0 in /usr/local/lib/python2.7/dist-packages (from themis-ml)
Requirement already satisfied: pathlib2 in /usr/local/lib/python2.7/dist-packages (from themis-ml)
Requirement already satisfied: pytz>=2011k in /usr/lib/python2.7/dist-packages (from pandas>=0.22.0->themis-ml)
Requirement already satisfied: python-dateutil in /usr/lib/python2.7/dist-packages (from pandas>=0.22.0->themis-ml)
Requirement already satisfied: six in /usr/local/lib/python2.7/dist-packages (from pathlib2->themis-ml)
Requirement already satisfied: scandir; python_version < "3.5" in /usr/local/lib/python2.7/dist-packages (from pathlib2->themis-ml)
#################################################
# Importing all libraries
#################################################
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from themis_ml.preprocessing.relabelling import Relabeller
from themis_ml.meta_estimators import FairnessAwareMetaEstimator
from themis_ml.linear_model.counterfactually_fair_models import LinearACFClassifier
from themis_ml.postprocessing.reject_option_classification import SingleROClassifier
from themis_ml import datasets
from themis_ml.datasets.german_credit_data_map import preprocess_german_credit_data
import util as u
# Load the German Credit dataset shipped with themis-ml and preview a few
# columns of interest.
# NOTE(review): the positional `True` flag's meaning is taken on faith here --
# confirm against the themis_ml.datasets.german_credit signature.
preview_columns = ["credit_risk", "purpose", "age_in_years", "foreign_worker"]
german_credit = datasets.german_credit(True)
german_credit[preview_columns].head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
credit_risk | purpose | age_in_years | foreign_worker | |
---|---|---|---|---|
0 | 1 | radio/television | 67 | 1 |
1 | 0 | radio/television | 22 | 1 |
2 | 1 | education | 49 | 1 |
3 | 1 | furniture/equipment | 45 | 1 |
4 | 0 | car_(new) | 53 | 1 |
# Preprocess the raw data (one-hot encoding etc.) and derive two binary
# protected-attribute columns.
#
# `female`: the unique values of the raw `personal_status` column are
#   'personal_status_and_sex_female_divorced/separated/married'
#   'personal_status_and_sex_male_divorced/separated'
#   'personal_status_and_sex_male_married/widowed'
#   'personal_status_and_sex_male_single'
# so the single "female_*" indicator column marks whether someone is female.
female_col = "personal_status_and_sex_female_divorced/separated/married"
# `age_below_25`: hypothesize that people aged 25 or younger might be
# considered to have bad credit risk moreso than other groups.
german_credit_preprocessed = (
    preprocess_german_credit_data(german_credit)
    .assign(female=lambda df: df[female_col])
    .assign(age_below_25=lambda df: df["age_in_years"] <= 25)
)
# Target distribution: 700 good-risk (1) vs 300 bad-risk (0), per the output
# below. Use print() instead of the Python-2-only `print` statement -- same
# output on Python 2, and the file stays parseable under Python 3.
print('credit_risk')
credit_risk = german_credit_preprocessed.credit_risk
credit_risk.value_counts()
credit_risk
1 700
0 300
Name: credit_risk, dtype: int64
# Protected attribute distribution: female (1) vs not (0) -- 310 vs 690 per
# the output below. print() form is valid in both Python 2 and Python 3.
print('female')
is_female = german_credit_preprocessed.female
is_female.value_counts()
female
0 690
1 310
Name: female, dtype: int64
# Protected attribute distribution: foreign worker (1) vs not (0) -- 963 vs 37
# per the output below; a very imbalanced group, which triggers the
# empty-slice RuntimeWarnings seen later. print() works on Python 2 and 3.
print('foreign')
is_foreign = german_credit_preprocessed.foreign_worker
is_foreign.value_counts()
foreign
1 963
0 37
Name: foreign_worker, dtype: int64
# Protected attribute distribution: age <= 25 (True) vs older (False) --
# 190 vs 810 per the output below. print() works on Python 2 and 3.
print('age_below_25')
age_below_25 = german_credit_preprocessed.age_below_25
age_below_25.value_counts()
age_below_25
False 810
True 190
Name: age_below_25, dtype: int64
# Feature set 1: every model-input column, grouped by origin for readability.
# Concatenation order reproduces the original flat list exactly. Note that
# we're excluding the `female` and `age_below_25` columns created above.
_base_features = [
    'duration_in_month',
    'credit_amount',
    'installment_rate_in_percentage_of_disposable_income',
    'present_residence_since',
    'age_in_years',
    'number_of_existing_credits_at_this_bank',
    'number_of_people_being_liable_to_provide_maintenance_for',
    'status_of_existing_checking_account',
    'savings_account/bonds',
    'present_employment_since',
    'job',
    'telephone',
    'foreign_worker',
]
_credit_history_features = [
    'credit_history_all_credits_at_this_bank_paid_back_duly',
    'credit_history_critical_account/other_credits_existing_not_at_this_bank',
    'credit_history_delay_in_paying_off_in_the_past',
    'credit_history_existing_credits_paid_back_duly_till_now',
    'credit_history_no_credits_taken/all_credits_paid_back_duly',
]
_purpose_features = [
    'purpose_business',
    'purpose_car_(new)',
    'purpose_car_(used)',
    'purpose_domestic_appliances',
    'purpose_education',
    'purpose_furniture/equipment',
    'purpose_others',
    'purpose_radio/television',
    'purpose_repairs',
    'purpose_retraining',
]
_personal_status_features = [
    'personal_status_and_sex_female_divorced/separated/married',
    'personal_status_and_sex_male_divorced/separated',
    'personal_status_and_sex_male_married/widowed',
    'personal_status_and_sex_male_single',
]
_other_debtor_features = [
    'other_debtors/guarantors_co-applicant',
    'other_debtors/guarantors_guarantor',
    'other_debtors/guarantors_none',
]
_property_features = [
    'property_building_society_savings_agreement/life_insurance',
    'property_car_or_other',
    'property_real_estate',
    'property_unknown/no_property',
]
_installment_plan_features = [
    'other_installment_plans_bank',
    'other_installment_plans_none',
    'other_installment_plans_stores',
]
# NOTE(review): 'housing_for free' contains a literal space -- presumably it
# mirrors the raw column name produced by preprocessing; confirm upstream.
_housing_features = [
    'housing_for free',
    'housing_own',
    'housing_rent',
]
feature_set_1 = (
    _base_features
    + _credit_history_features
    + _purpose_features
    + _personal_status_features
    + _other_debtor_features
    + _property_features
    + _installment_plan_features
    + _housing_features
)
#################################################
# Case 1: Baseline
#################################################
# Design matrix, target vector, and one protected-attribute vector per
# protected-class context.
X = german_credit_preprocessed[feature_set_1].values
y = german_credit_preprocessed["credit_risk"].values
s_female, s_foreign, s_age_below_25 = (
    german_credit_preprocessed[col].values
    for col in ("female", "foreign_worker", "age_below_25"))
# "get_estemators" [sic] -- the helper is spelled this way in util.
estimators = u.get_estemators()
# One cross-validated experiment per protected attribute.
experiment_baseline_female = u.cross_validation_experiment(estimators, X, y, s_female, "female")
experiment_baseline_foreign = u.cross_validation_experiment(estimators, X, y, s_foreign, "foreign_worker")
experiment_baseline_age_below_25 = u.cross_validation_experiment(estimators, X, y, s_age_below_25, "age_below_25")
Training model LogisticRegression with s=female
-----------------------------------------------
Training model DecisionTree with s=female
-----------------------------------------
Training model RandomForest with s=female
-----------------------------------------
Training model LogisticRegression with s=foreign_worker
-------------------------------------------------------
/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.py:135: RuntimeWarning: Degrees of freedom <= 0 for slice
keepdims=keepdims)
/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.py:105: RuntimeWarning: invalid value encountered in true_divide
arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.py:127: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)
/usr/local/lib/python2.7/dist-packages/themis_ml-0.0.4-py2.7.egg/themis_ml/metrics.py:40: RuntimeWarning: Mean of empty slice.
mean_diff = y[s == 0].mean() - y[s == 1].mean()
/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)
Training model DecisionTree with s=foreign_worker
-------------------------------------------------
Training model RandomForest with s=foreign_worker
-------------------------------------------------
Training model LogisticRegression with s=age_below_25
-----------------------------------------------------
Training model DecisionTree with s=age_below_25
-----------------------------------------------
Training model RandomForest with s=age_below_25
-----------------------------------------------
# Aggregate the three baseline runs; element 1 is the summary table printed
# below. print() instead of the Python-2-only `print` statement -- identical
# output on Python 2, valid syntax on Python 3.
experiment_baseline = u.generate_summary(experiment_baseline_female, experiment_baseline_foreign, experiment_baseline_age_below_25)
print(experiment_baseline[1])
auc mean_diff
protected_class estimator fold_type
age_below_25 DecisionTree test 0.705224 0.156542
LogisticRegression test 0.672467 0.323023
RandomForest test 0.787657 0.267809
female DecisionTree test 0.688390 0.068661
LogisticRegression test 0.672467 0.095351
RandomForest test 0.783857 0.105938
foreign_worker DecisionTree test 0.697010 0.157835
LogisticRegression test 0.672467 0.285688
RandomForest test 0.790048 0.206841
# Plot the tidy per-fold baseline results (element 0 of the summary pair).
u.plot_experiment_results(experiment_baseline[0])
<seaborn.axisgrid.FacetGrid at 0x7f844b3836d0>
#################################################
# Naive Fairness-aware Approach: Remove Protected Class
#################################################
# Drop the columns that directly encode each protected class and rerun the
# same cross-validation experiments on the reduced feature sets.
_sex_columns = frozenset([
    'personal_status_and_sex_female_divorced/separated/married',
    'personal_status_and_sex_male_divorced/separated',
    'personal_status_and_sex_male_married/widowed',
    'personal_status_and_sex_male_single',
])
feature_set_no_sex = [f for f in feature_set_1 if f not in _sex_columns]
feature_set_no_foreign = [f for f in feature_set_1 if f != "foreign_worker"]
feature_set_no_age = [f for f in feature_set_1 if f != "age_in_years"]
# Training matrices with the protected columns removed.
X_no_sex = german_credit_preprocessed[feature_set_no_sex].values
X_no_foreign = german_credit_preprocessed[feature_set_no_foreign].values
X_no_age = german_credit_preprocessed[feature_set_no_age].values
experiment_naive_female = u.cross_validation_experiment(estimators, X_no_sex, y, s_female, "female")
experiment_naive_foreign = u.cross_validation_experiment(estimators, X_no_foreign, y, s_foreign, "foreign_worker")
experiment_naive_age_below_25 = u.cross_validation_experiment(estimators, X_no_age, y, s_age_below_25, "age_below_25")
Training model LogisticRegression with s=female
-----------------------------------------------
Training model DecisionTree with s=female
-----------------------------------------
Training model RandomForest with s=female
-----------------------------------------
Training model LogisticRegression with s=foreign_worker
-------------------------------------------------------
Training model DecisionTree with s=foreign_worker
-------------------------------------------------
Training model RandomForest with s=foreign_worker
-------------------------------------------------
Training model LogisticRegression with s=age_below_25
-----------------------------------------------------
Training model DecisionTree with s=age_below_25
-----------------------------------------------
Training model RandomForest with s=age_below_25
-----------------------------------------------
# Aggregate the three naive-approach runs; element 1 is the summary table.
# print() works under both Python 2 and Python 3 (the bare statement does not).
experiment_naive = u.generate_summary(experiment_naive_female, experiment_naive_foreign, experiment_naive_age_below_25)
print(experiment_naive[1])
auc mean_diff
protected_class estimator fold_type
age_below_25 DecisionTree test 0.694524 0.132812
LogisticRegression test 0.683248 0.112415
RandomForest test 0.788552 0.181722
female DecisionTree test 0.694643 0.038633
LogisticRegression test 0.671381 0.075551
RandomForest test 0.788248 0.049310
foreign_worker DecisionTree test 0.698362 0.126122
LogisticRegression test 0.672362 0.285686
RandomForest test 0.785648 0.192885
# Plot the tidy per-fold naive-approach results (element 0 of the summary pair).
u.plot_experiment_results(experiment_naive[0])
<seaborn.axisgrid.FacetGrid at 0x7f8435ddb790>
#################################################
# Fairness-aware Method: Relabelling
#################################################
# The Relabeller produces a new target vector for each protected-class
# context; FairnessAwareMetaEstimator is a convenience wrapper that composes
# the relabelling step with each baseline estimator defined earlier.
relabeller = Relabeller()
relabelling_estimators = []
for clf_name, clf in estimators:
    relabelling_estimators.append(
        (clf_name, FairnessAwareMetaEstimator(clf, relabeller=relabeller)))
experiment_relabel_female = u.cross_validation_experiment(relabelling_estimators, X_no_sex, y, s_female, "female")
experiment_relabel_foreign = u.cross_validation_experiment(relabelling_estimators, X_no_foreign, y, s_foreign, "foreign_worker")
experiment_relabel_age_below_25 = u.cross_validation_experiment(relabelling_estimators, X_no_age, y, s_age_below_25, "age_below_25")
Training model LogisticRegression with s=female
-----------------------------------------------
Training model DecisionTree with s=female
-----------------------------------------
Training model RandomForest with s=female
-----------------------------------------
Training model LogisticRegression with s=foreign_worker
-------------------------------------------------------
Training model DecisionTree with s=foreign_worker
-------------------------------------------------
Training model RandomForest with s=foreign_worker
-------------------------------------------------
Training model LogisticRegression with s=age_below_25
-----------------------------------------------------
Training model DecisionTree with s=age_below_25
-----------------------------------------------
Training model RandomForest with s=age_below_25
-----------------------------------------------
# Aggregate the three relabelling runs; element 1 is the summary table.
# print() works under both Python 2 and Python 3 (the bare statement does not).
experiment_relabel = u.generate_summary(experiment_relabel_female, experiment_relabel_foreign, experiment_relabel_age_below_25)
print(experiment_relabel[1])
auc mean_diff
protected_class estimator fold_type
age_below_25 DecisionTree test 0.690505 0.076403
LogisticRegression test 0.678381 0.077014
RandomForest test 0.777543 0.107310
female DecisionTree test 0.693833 0.041679
LogisticRegression test 0.669095 0.058545
RandomForest test 0.779790 0.051668
foreign_worker DecisionTree test 0.693219 0.126543
LogisticRegression test 0.671562 0.283398
RandomForest test 0.783686 0.173500
# Plot the tidy per-fold relabelling results (element 0 of the summary pair).
u.plot_experiment_results(experiment_relabel[0])
<seaborn.axisgrid.FacetGrid at 0x7f8435717b10>
#################################################
# Fairness-aware Method: Additive Counterfactually Fair Model
#################################################
# NOTE(review): the three regressors below are defined but never referenced
# anywhere else in this file -- the ACF estimators are built from the
# *classifiers* in `estimators`. Presumably they were intended to be passed
# into LinearACFClassifier (e.g. as residual/target estimators); confirm the
# intent and either wire them in or delete them.
LINEAR_REG = LinearRegression()
DECISION_TREE_REG = DecisionTreeRegressor(max_depth=10, min_samples_leaf=10)
RANDOM_FOREST_REG = RandomForestRegressor(
    n_estimators=50, max_depth=10, min_samples_leaf=10)
# use the estimators defined above to define the linear additive
# counterfactually fair models
linear_acf_estimators = [
    (name, LinearACFClassifier(
        target_estimator=e,
        binary_residual_type="absolute"))
    for name, e in estimators]
experiment_acf_female = u.cross_validation_experiment(linear_acf_estimators, X_no_sex, y, s_female, "female")
experiment_acf_foreign = u.cross_validation_experiment(linear_acf_estimators, X_no_foreign, y, s_foreign, "foreign_worker")
experiment_acf_age_below_25 = u.cross_validation_experiment(linear_acf_estimators, X_no_age, y, s_age_below_25, "age_below_25")
Training model LogisticRegression with s=female
-----------------------------------------------
Training model DecisionTree with s=female
-----------------------------------------
Training model RandomForest with s=female
-----------------------------------------
Training model LogisticRegression with s=foreign_worker
-------------------------------------------------------
Training model DecisionTree with s=foreign_worker
-------------------------------------------------
Training model RandomForest with s=foreign_worker
-------------------------------------------------
Training model LogisticRegression with s=age_below_25
-----------------------------------------------------
Training model DecisionTree with s=age_below_25
-----------------------------------------------
Training model RandomForest with s=age_below_25
-----------------------------------------------
# Aggregate the three ACF runs; element 1 is the summary table.
# print() works under both Python 2 and Python 3 (the bare statement does not).
experiment_acf = u.generate_summary(experiment_acf_female, experiment_acf_foreign, experiment_acf_age_below_25)
print(experiment_acf[1])
auc mean_diff
protected_class estimator fold_type
age_below_25 DecisionTree test 0.695171 0.257077
LogisticRegression test 0.668181 -0.033848
RandomForest test 0.779952 0.404667
female DecisionTree test 0.702181 0.115006
LogisticRegression test 0.670000 0.028742
RandomForest test 0.783267 0.147862
foreign_worker DecisionTree test 0.703086 0.320369
LogisticRegression test 0.667190 0.117529
RandomForest test 0.782257 0.359509
# Plot the tidy per-fold ACF results (element 0 of the summary pair).
u.plot_experiment_results(experiment_acf[0])
<seaborn.axisgrid.FacetGrid at 0x7f84351dd550>
#################################################
# Reject-option Classification
#################################################
# Wrap each baseline classifier in a single reject-option classifier.
# (The original comment here was copy-pasted from the ACF section.)
single_roc_clf_estimators = []
for clf_name, clf in estimators:
    single_roc_clf_estimators.append((clf_name, SingleROClassifier(estimator=clf)))
experiment_single_roc_female = u.cross_validation_experiment(single_roc_clf_estimators, X_no_sex, y, s_female, "female")
experiment_single_roc_foreign = u.cross_validation_experiment(single_roc_clf_estimators, X_no_foreign, y, s_foreign, "foreign_worker")
experiment_single_roc_age_below_25 = u.cross_validation_experiment(single_roc_clf_estimators, X_no_age, y, s_age_below_25, "age_below_25")
Training model LogisticRegression with s=female
-----------------------------------------------
Training model DecisionTree with s=female
-----------------------------------------
Training model RandomForest with s=female
-----------------------------------------
Training model LogisticRegression with s=foreign_worker
-------------------------------------------------------
Training model DecisionTree with s=foreign_worker
-------------------------------------------------
Training model RandomForest with s=foreign_worker
-------------------------------------------------
Training model LogisticRegression with s=age_below_25
-----------------------------------------------------
Training model DecisionTree with s=age_below_25
-----------------------------------------------
Training model RandomForest with s=age_below_25
-----------------------------------------------
# Aggregate the three reject-option runs; element 1 is the summary table.
# print() works under both Python 2 and Python 3 (the bare statement does not).
experiment_single_roc = u.generate_summary(experiment_single_roc_female, experiment_single_roc_foreign, experiment_single_roc_age_below_25)
print(experiment_single_roc[1])
auc mean_diff
protected_class estimator fold_type
age_below_25 DecisionTree test 0.688719 0.103479
LogisticRegression test 0.532400 -0.040909
RandomForest test 0.764029 0.103746
female DecisionTree test 0.699671 0.009273
LogisticRegression test 0.619590 -0.037961
RandomForest test 0.767514 0.024666
foreign_worker DecisionTree test 0.688700 0.104876
LogisticRegression test 0.619629 0.082053
RandomForest test 0.762457 0.127818
# Plot the tidy per-fold reject-option results (element 0 of the summary pair).
u.plot_experiment_results(experiment_single_roc[0])
<seaborn.axisgrid.FacetGrid at 0x7f8435706390>
#################################################
# Comparison
#################################################
# Combine the tidy results from all five experiment families, then draw one
# comparison plot per model family.
compare_experiments = u.comparison(
    experiment_baseline[0], experiment_naive[0], experiment_relabel[0],
    experiment_acf[0], experiment_single_roc[0])
for model_name in ("LogisticRegression", "DecisionTree", "RandomForest"):
    u.compare_experiment_results_multiple_model(
        compare_experiments.query("estimator == '%s'" % model_name))