This repository has been archived by the owner on Dec 13, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript_dataPrep.py
84 lines (70 loc) · 2.82 KB
/
script_dataPrep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Prepare data script for longitudinal predictions.
import os
import sys

import numpy as np
import pandas as pd
# Settings
# Non-zero: the list of subjects to predict is taken from the leaderboard
# file (TADPOLE_LB1_LB2.csv, LB2 == 1). Zero: it is taken from the D2 flag of
# the main table.
leaderboard = 0

# Input directory: everything is resolved relative to this script's folder,
# which also becomes the working directory.
str_exp = os.path.dirname(os.path.realpath(__file__))
os.chdir(str_exp)

# Output directory. It has to exist before the first CSV is written below;
# previously it was only created much later in the script, so a fresh
# checkout failed on the first to_csv call.
intermediate_dir = os.path.join(str_exp, 'IntermediateData')
if not os.path.isdir(intermediate_dir):
    os.makedirs(intermediate_dir)

# Input file
tadpoleD1D2File = os.path.join(str_exp, 'Data', 'TADPOLE_D1_D2.csv')
Dtadpole = pd.read_csv(tadpoleD1D2File)

# Unique RIDs of subjects having at least one row with a DXCHANGE code in
# the closed range 4..6 (rows with a missing DXCHANGE never match).
# NOTE(review): going by the output file name these codes presumably mark a
# change of diagnosis - confirm against the TADPOLE data dictionary.
dxchange = Dtadpole['DXCHANGE']
idx_progress = (dxchange >= 4) & (dxchange <= 6)
SubC = pd.Series(np.unique(Dtadpole.loc[idx_progress, 'RID']))
# header=False is passed explicitly: older pandas wrote a Series without a
# header row by default, newer releases add one. Pinning it keeps the file a
# plain list of RIDs on every pandas version.
SubC.to_csv(os.path.join(intermediate_dir, 'SubjectsWithChange.csv'),
            index=False, header=False)
# Recode diagnosis
# Collapse DXCHANGE codes 4..9 onto three classes, applied in the listed
# order: (old code, new code). The classes are 1 = CN, 2 = MCI, 3 = AD.
# Codes outside 4..9 and missing values are left as they are.
dx_recode = (
    (4, 2),  # -> MCI
    (5, 3),  # -> AD
    (6, 3),  # -> AD
    (7, 1),  # -> CN
    (8, 2),  # -> MCI
    (9, 1),  # -> CN
)
for old_code, new_code in dx_recode:
    Dtadpole.loc[Dtadpole['DXCHANGE'] == old_code, 'DXCHANGE'] = new_code

# From here on the column is known as 'Diagnosis'.
Dtadpole = Dtadpole.rename(columns={'DXCHANGE': 'Diagnosis'})
# Column names of the full table; the positional slices used for dropping
# below index into this list, so it must be taken before any column removal.
h = list(Dtadpole)

# Shift AGE by the visit offset in years.
# NOTE(review): presumably AGE is the age at baseline and Month_bl the number
# of months since baseline, making this the age at each visit - confirm.
Dtadpole['AGE'] += Dtadpole['Month_bl'] / 12.

# Keep the D2 flag aside before columns are dropped; it is used at the end of
# the script to pick the subjects to predict.
D2 = Dtadpole['D2'].copy()

# Drop unused columns, selected by their position in the original header.
dropped_columns = (
    h[1:8] + [h[9]] + h[14:17] + h[45:47] + h[53:73] + h[74:486]
    + h[832:838] + h[1172:1174] + h[1657:1667] + h[1895:1902] + h[1905:]
)
# axis is passed by keyword: the positional form drop(labels, 1) is rejected
# by pandas >= 2.0.
Dtadpole = Dtadpole.drop(dropped_columns, axis=1)
h = list(Dtadpole)

print('Forcing Numeric Values')
# The first five remaining columns are left as read; every later column that
# is not already float64 is converted, with unparseable entries becoming NaN.
for i, column in enumerate(h[5:], 5):
    # Progress indicator on a single line, e.g. "[5] [6] [7] ".
    sys.stdout.write('[%d] ' % i)
    if Dtadpole[column].dtype != 'float64':
        Dtadpole[column] = pd.to_numeric(Dtadpole[column], errors='coerce')
sys.stdout.write('\n')
print('Sort the dataframe based on age for each subject')
# Order the rows by subject (RID ascending) and, within each subject, by AGE
# ascending. A single multi-key sort replaces the former per-subject loop
# that re-concatenated the growing result on every iteration (quadratic in
# the number of rows) and started from an empty, object-typed frame.
# Rows without a RID are excluded, as they never matched any subject in the
# per-subject selection either.
has_rid = Dtadpole['RID'].notnull()
Dtadpole_sorted = Dtadpole[has_rid].sort_values(by=['RID', 'AGE'])

# Make sure the output directory exists before writing into it.
intermediate_dir = os.path.join(str_exp, 'IntermediateData')
if not os.path.isdir(intermediate_dir):
    os.makedirs(intermediate_dir)
Dtadpole_sorted.to_csv(os.path.join(intermediate_dir, 'LongTADPOLE.csv'), index=False)
# Write the list of unique RIDs for which predictions are to be made.
# header=False is passed explicitly to Series.to_csv: older pandas wrote no
# header row for a Series by default, newer releases add one. Pinning it
# keeps the output a plain list of RIDs on every pandas version.
intermediate_dir = os.path.join(str_exp, 'IntermediateData')
if leaderboard:
    tadpoleLB1LB2File = os.path.join(str_exp, 'Data', 'TADPOLE_LB1_LB2.csv')
    LB_Table = pd.read_csv(tadpoleLB1LB2File)
    # Rows flagged in LB1 and/or LB2.
    LB = LB_Table['LB1'] + LB_Table['LB2']
    idx_lb = LB.values >= 1
    # NOTE(review): this filter runs after LongTADPOLE.csv has been written
    # and Dtadpole is not read again below, so it has no effect on any
    # output file. It also assumes LB_Table has exactly one row per row of
    # Dtadpole, in the same order - confirm whether it was meant to be
    # applied before the sorted table is saved.
    Dtadpole = Dtadpole[idx_lb]
    # Leaderboard: subjects with at least one row flagged LB2 == 1.
    idx_lb2 = LB_Table['LB2'] == 1
    LB2_RID = LB_Table.loc[idx_lb2, 'RID']
    SLB2 = pd.Series(np.unique(LB2_RID.values))
    SLB2.to_csv(os.path.join(intermediate_dir, 'ToPredict.csv'),
                index=False, header=False)
else:
    # Submission: subjects with at least one row flagged D2 == 1. D2 was
    # copied from the table earlier in the script and shares its row index.
    idx_d2 = D2 == 1
    Dtadpole_RID = Dtadpole.loc[idx_d2, 'RID']
    SD2 = pd.Series(np.unique(Dtadpole_RID.values))
    SD2.to_csv(os.path.join(intermediate_dir, 'ToPredict_D2.csv'),
               index=False, header=False)