# -*- coding: utf-8 -*-
"""dimensionality_reduction_using_feature_selection.py

Dimensionality reduction using feature selection: variance thresholding,
dropping highly correlated features, univariate statistical tests
(chi-squared and ANOVA F-values), and recursive feature elimination with
cross-validation.

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pE7KFZcxTLcnAuXUXcKSfsfskiMMPVQd
"""
# Load libraries
import warnings

import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.datasets import load_iris, make_regression
from sklearn.feature_selection import (RFECV, SelectKBest, SelectPercentile,
                                       VarianceThreshold, chi2, f_classif)
from sklearn.preprocessing import StandardScaler
# import data
iris = datasets.load_iris()
# Create features and target
features_i = iris.data
target_i = iris.target
# Create the variance thresholder
thresholder = VarianceThreshold(threshold=.4)
# Create the high-variance feature matrix
f_high_variance = thresholder.fit_transform(features_i)
# View the high-variance feature matrix
print(f_high_variance[0:3])
# View the variance of each feature
print(thresholder.fit(features_i).variances_)
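# With threshold=.4, only the second iris feature (sepal width, variance
# ~0.19) falls below the cutoff, so three of the four columns survive.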
# Standardize the feature matrix
scaler = StandardScaler()
f_std = scaler.fit_transform(features_i)
# Calculate the variance of each standardized feature
selection = VarianceThreshold()
print(selection.fit(f_std).variances_)
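# Note: standardization gives every feature unit variance, so the printout
# above is an array of 1.0s and no variance threshold can tell the
# standardized features apart -- threshold on the raw features instead.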
# Create a binary feature matrix with:
# Feature 0: 80% class 0
# Feature 1: 80% class 1
# Feature 2: 60% class 0, 40% class 1
features_i = [[0, 1, 0],
              [0, 1, 1],
              [0, 1, 0],
              [0, 1, 1],
              [1, 0, 0]]
# Threshold by variance
thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))
print(thresholding.fit_transform(features_i))
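# For a Bernoulli (0/1) feature the variance is p * (1 - p), where p is the
# proportion of ones. The threshold above is .65 * (1 - .65) = 0.2275, so
# Features 0 and 1 (p = 0.2 and 0.8, variance 0.16 each) are dropped and
# only Feature 2 (p = 0.4, variance 0.24) survives. A minimal check:
p0 = np.mean(np.array(features_i)[:, 0])
print("Feature 0 variance:", p0 * (1 - p0))  # 0.2 * 0.8 = 0.16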
# Create a feature matrix with two highly correlated features
features_m = np.array([[1, 1, 1],
                       [2, 2, 0],
                       [3, 3, 1],
                       [4, 4, 0],
                       [5, 5, 1],
                       [6, 6, 0],
                       [7, 7, 1],
                       [8, 7, 0],
                       [9, 7, 1]])
# Convert the feature matrix into a DataFrame
dataframe = pd.DataFrame(features_m)
# Create the absolute correlation matrix
corr_m = dataframe.corr().abs()
# Select the upper triangle of the correlation matrix
upper1 = corr_m.where(np.triu(np.ones(corr_m.shape),
                              k=1).astype(bool))
# Find the indices of columns with a correlation greater than 0.85
to_drop = [col for col in upper1.columns if any(upper1[col] > 0.85)]
# Drop the flagged features
print(dataframe.drop(columns=to_drop).head(3))
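# Here column 1 is almost perfectly correlated with column 0, so it is the
# one flagged; a quick look at what was selected for removal:
print("Columns to drop:", to_drop)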
# Load data
iris_i = load_iris()
features_v = iris_i.data
target = iris_i.target
# Convert the features to integers so they can be treated as categorical counts
features_v = features_v.astype(int)
# Select the two features with the highest chi-squared statistics
chi2_s = SelectKBest(chi2, k=2)
f_kbest = chi2_s.fit_transform(features_v, target)
# Show results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])
# Select the two features with the highest F-values
f_selector = SelectKBest(f_classif, k=2)
f_kbest = f_selector.fit_transform(features_v, target)
# Display results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])
# Select the top 65% of features by F-value
f_selector = SelectPercentile(f_classif, percentile=65)
f_kbest = f_selector.fit_transform(features_v, target)
# Display results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])
# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy",
                        message="^internal gelsd")
# Generate a features matrix and target vector for regression
features_f, target_t = make_regression(n_samples=10000,
                                       n_features=100,
                                       n_informative=2,
                                       random_state=1)
# Create a linear regression estimator
ols = linear_model.LinearRegression()
# Recursive feature elimination with cross-validation
rfecv = RFECV(estimator=ols, step=2, scoring="neg_mean_squared_error")
rfecv.fit(features_f, target_t)
print("Reduced feature matrix shape:", rfecv.transform(features_f).shape)
# Number of best features
print("Optimal number of features:", rfecv.n_features_)
# Which features are selected?
print("Selected feature mask:", rfecv.support_)
# We can even see how the features are ranked
print("Feature rankings:", rfecv.ranking_)
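# In ranking_, a value of 1 marks a selected feature; larger values record
# the elimination round in which a feature was dropped. Since make_regression
# used n_informative=2, expect only a handful of features to be kept.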
help(SelectKBest)