-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathecon 484 project raw code.py
116 lines (85 loc) · 3.59 KB
/
econ 484 project raw code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 27 14:59:38 2019
@author: jordan79
"""
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
from sklearn import tree
from IPython.display import Image
from sklearn.tree import export_graphviz
import copy
#%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn import metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
# --- Data preparation and random-forest fit ---------------------------------
# Load the Stata extract, discard the DAYSWAIT_CHRON column, and keep only
# the rows that have no missing values in any remaining column.
df = pd.read_stata("Econ_484_data_final.dta")
df = df.drop("DAYSWAIT_CHRON", axis=1)
df = df.dropna(axis=0, how='any')

# Replace TX_DATE with separate integer year and month columns.
tx_dates = pd.DatetimeIndex(df["TX_DATE"])
df['year'] = tx_dates.year
df['month'] = tx_dates.month
df = df.drop("TX_DATE", axis=1)

# Columns handed to pd.get_dummies below.
# NOTE(review): 'DON_TY' and 'GENDER_DON' appear twice in this list, so they
# are selected and encoded twice and end up as repeated feature columns. The
# list is deliberately left unchanged here: removing the repeats changes the
# number and position of columns in X, and later code in this file addresses
# X columns by position. De-duplicate together with that code.
# NOTE(review): pd.get_dummies without `columns=` only encodes object /
# string / category columns, so the integer 'year' and 'month' columns pass
# through unencoded - confirm that is intended.
dummy_columns = ['gender', 'abo', 'EXH_PERIT_ACCESS', 'EXH_VASC_ACCESS', 'PREV_TX', 'PREV_KI_TX',
                 'MALIG_TRR', 'txkid', 'ABO_DON', 'DON_TY', 'GENDER_DON', 'DON_TY', 'GENDER_DON',
                 'HOME_STATE_DON', 'ABO_MAT', 'GRF_STAT_KI', 'DWFG_KI', 'PREV_TX_ANY', 'PX_STAT',
                 'SHARE_TY', 'AGE_GROUP', 'malig', 'LT_ONE_WEEK_DON', 'RECOV_OUT_US', 'year', 'month',
                 'PERM_STATE']
# Not referenced anywhere else in this file; kept so the name stays defined.
funky_columns = ['TX_PROCEDUR_TY_KI']

# Split the frame into the untouched columns and the encoded ones, then glue
# them back together row-by-row on the shared index.
df_non_dummy = df.drop(dummy_columns, axis=1)
df_dummified = pd.get_dummies(df[dummy_columns])
df = df_non_dummy.merge(df_dummified, left_index=True, right_index=True)

# Target is DAYSWAIT_CHRON_KI; every other column is a feature.
y_variable = 'DAYSWAIT_CHRON_KI'
X_variables = list(df.columns[df.columns != y_variable])
X = df[X_variables].to_numpy()
y = df[y_variable].to_numpy()

# No random_state is passed to the split or the forest, so the partition and
# the fitted trees differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
forest = RandomForestRegressor(n_estimators=5)
model = forest.fit(X_train, y_train)

# score() returns R^2 on the held-out split. The original discarded the
# value, so a plain script run never showed it; keep it and report it.
test_r2 = model.score(X_test, y_test)
print("R^2 on test set: {:.3f}".format(test_r2))
def plot_feature_importances_(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model
        Fitted estimator exposing a ``feature_importances_`` array.
    feature_names : sequence of str, optional
        One label per importance value, in the same order. When omitted, the
        column labels of the module-level ``df[X_variables]`` are used, which
        is the selection the feature matrix ``X`` is built from.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.
    """
    importances = np.asarray(model.feature_importances_)
    n_features = importances.shape[0]

    if feature_names is None:
        # df.columns still contains the target column, so using it directly
        # gives one label too many and misaligns labels with bars. Use the
        # feature-only selection instead.
        feature_names = df[X_variables].columns
    feature_names = list(feature_names)

    if len(feature_names) != n_features:
        raise ValueError(
            "got {} feature names for {} feature importances".format(
                len(feature_names), n_features
            )
        )

    positions = np.arange(n_features)
    plt.barh(positions, importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    # Leave half a bar of padding below the first and above the last bar.
    plt.ylim(-1, n_features)
plot_feature_importances_(model)

# --- Per-state what-if predictions ------------------------------------------
# Take one observation, keep its leading feature values, and swap in each
# one-hot state indicator in turn to see how the forest's prediction moves.
state_names = ['AK', 'AL', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
               'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
               'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NA', 'NC', 'ND', 'NE', 'NH', 'NJ',
               'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
               'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY', 'ZZ']
wl_times = pd.DataFrame()
wl_times['state'] = state_names

# Derive the sizes instead of hard-coding 58 and 172: the trailing
# len(state_names) columns of X are replaced, everything before them is kept.
# NOTE(review): assumes those trailing columns are the state indicator
# columns, in the same order as state_names - TODO confirm against the
# column order of df[X_variables].
n_states = len(state_names)
n_profile_features = X.shape[1] - n_states
if n_profile_features < 0:
    raise ValueError(
        "X has {} columns, fewer than the {} state indicators".format(
            X.shape[1], n_states
        )
    )

profile_row = 44332  # observation used as the fixed profile
profile = X[profile_row, :n_profile_features].copy()

# One scenario per state: the same profile followed by a one-hot state vector.
scenarios = np.hstack([
    np.tile(profile, (n_states, 1)),
    np.eye(n_states, dtype=int),
])

# Predict all scenarios in a single call rather than twice per state.
state_predictions = np.ravel(model.predict(scenarios))
for single_prediction in state_predictions.reshape(-1, 1):
    # Printed as a one-element array, one line per state.
    print(single_prediction)

wl_times['10'] = state_predictions
stats = wl_times.describe()
# --- Support-vector classifier ----------------------------------------------
# Fit an SVC with default settings on the training split, print a
# classification report for the held-out split, then the training accuracy.
# NOTE(review): SVC is a classifier and is fitted on the same y_train the
# RandomForestRegressor above uses as a regression target - confirm that
# treating those values as class labels is intended.
svc_model = SVC().fit(X_train, y_train)
predictions_4 = svc_model.predict(X_test)
svc_report = classification_report(y_test, predictions_4)
print(svc_report)
train_accuracy = svc_model.score(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
# --- Inspect a single observation -------------------------------------------
# Copy row 234567 of X as a 1 x n_features array, then wrap it in a DataFrame
# labelled with the feature column names so the values can be read by name.
# The bare expressions below only display something in an interactive session.
feature_columns = df[X_variables].columns
num_21 = X[234567:234568, :].copy()
num_21.shape
num_21 = pd.DataFrame(data=num_21, columns=feature_columns)
num_21
# Number 1 is from OK
# Number 2 is from NC