This repository has been archived by the owner on Dec 13, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript_dataPrep_D3.py
99 lines (82 loc) · 3.17 KB
/
script_dataPrep_D3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Prepare data script for cross-sectional predictions (D3).
import os
import pandas as pd
import numpy as np
# Input directory: the folder that contains this script. It also becomes the
# working directory for the rest of the run.
script_path = os.path.realpath(__file__)
str_exp = os.path.dirname(script_path)
os.chdir(str_exp)

# Input file D3, read from the 'Data' folder next to the script.
tadpoleD3File = os.path.join(str_exp, 'Data', 'TADPOLE_D3.csv')
Dtadpole = pd.read_csv(tadpoleD3File)
# Make D3 prediction file for testing
# Recode the textual diagnosis labels in 'DX' to numeric codes:
# 1 = NL, 2 = MCI, 3 = Dementia. Transition labels ("A to B") receive the
# code of B. Values that match none of the labels are left as they are.
DX_LABEL_CODES = [
    ('MCI', 2),
    ('NL to MCI', 2),
    ('Dementia to MCI', 2),
    ('Dementia', 3),
    ('MCI to Dementia', 3),
    ('NL to Dementia', 3),
    ('NL', 1),
    ('MCI to NL', 1),
    ('Dementia to NL', 1),
]
for dx_label, dx_code in DX_LABEL_CODES:
    Dtadpole.loc[Dtadpole['DX'] == dx_label, 'DX'] = dx_code

# Column names used by the rest of the script.
Dtadpole = Dtadpole.rename(columns={'DX': 'Diagnosis', 'ICV': 'ICV_bl'})
# Drop unused columns. They are selected by position in the header list, so
# this assumes the column layout of TADPOLE_D3.csv is unchanged -- TODO confirm
# against the data release in use.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
h = list(Dtadpole)
Dtadpole = Dtadpole.drop([h[1]] + h[7:11] + h[20:37], axis=1)
h = list(Dtadpole)

# Coerce every column from position 5 onwards to numeric; entries that cannot
# be parsed become NaN (errors='coerce'). The first five columns are left as
# read. print() is called with a single argument so the statement is valid on
# both Python 2 and Python 3.
print('Forcing Numeric Values')
for i, col in enumerate(h[5:], 5):
    print('[%d]' % i)
    if Dtadpole[col].dtype != 'float64':
        Dtadpole[col] = pd.to_numeric(Dtadpole[col], errors='coerce')
# Order the D3 rows by RID and write them to
# IntermediateData/LongTADPOLE_D3.csv, creating the folder if it is missing.
Dtadpole = Dtadpole.sort_values(by=['RID'])
intermediate_dir = os.path.join(str_exp, 'IntermediateData')
if not os.path.exists(intermediate_dir):
    os.mkdir(intermediate_dir)
Dtadpole.to_csv(os.path.join(intermediate_dir, 'LongTADPOLE_D3.csv'), index=False)
# Input file D1 (combined D1/D2 spreadsheet)
tadpoleD1D2File = os.path.join(str_exp, 'Data', 'TADPOLE_D1_D2.csv')
Dtadpole = pd.read_csv(tadpoleD1D2File)

# Make D1 prediction file for training (only D1 subjects that are not in D3)
# Collapse DXCHANGE codes 4-9 onto the three codes 1/2/3, applied in the
# listed order. All other values (including 1, 2 and 3) are left untouched.
DXCHANGE_RECODE = [(4, 2), (5, 3), (6, 3), (7, 1), (8, 2), (9, 1)]
for old_code, new_code in DXCHANGE_RECODE:
    Dtadpole.loc[Dtadpole['DXCHANGE'] == old_code, 'DXCHANGE'] = new_code

Dtadpole = Dtadpole.rename(columns={'DXCHANGE': 'Diagnosis'})
# Header list taken before any column is removed; the positional slices in the
# drop below index into this list.
h = list(Dtadpole)

# Add the time since baseline (Month_bl, converted to years) to AGE --
# presumably turning age at baseline into age at visit; verify against the
# TADPOLE data dictionary.
Dtadpole['AGE'] += Dtadpole['Month_bl'] / 12.

# Keep only the rows whose D2 flag is 0.
idx_notd2 = Dtadpole['D2'] == 0
Dtadpole = Dtadpole[idx_notd2]

# Drop unused columns by position. This assumes the column layout of
# TADPOLE_D1_D2.csv is unchanged -- TODO confirm against the data release.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
Dtadpole = Dtadpole.drop(
    h[1:8] + [h[9]] + h[14:23] + h[25:47] + h[53:73] + h[74:486] + h[832:],
    axis=1)
h = list(Dtadpole)

# Coerce every column from position 5 onwards to numeric; entries that cannot
# be parsed become NaN (errors='coerce'). print() is called with a single
# argument so the statement is valid on both Python 2 and Python 3.
print('Forcing Numeric Values')
for i, col in enumerate(h[5:], 5):
    print('[%d]' % i)
    if Dtadpole[col].dtype != 'float64':
        Dtadpole[col] = pd.to_numeric(Dtadpole[col], errors='coerce')
# Order the D1 rows by subject (RID, ascending) and, within each subject, by
# AGE (ascending), then write IntermediateData/LongTADPOLE_D1.csv.
#
# One multi-column sort replaces the former per-subject loop, which
# re-concatenated the growing result on every iteration (quadratic in the
# number of subjects) and started from an empty frame with no column dtypes.
# Rows with a missing AGE are placed last within their subject, and the
# original column dtypes are preserved.
print('Sort the dataframe based on age for each subject')
Dtadpole_sorted = Dtadpole.sort_values(['RID', 'AGE'])
Dtadpole_sorted.to_csv(os.path.join(str_exp, 'IntermediateData', 'LongTADPOLE_D1.csv'), index=False)