-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_eng.py
78 lines (51 loc) · 3.02 KB
/
feature_eng.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# creates the feature name with the mz and rt
def feature_name_creation(xcms_file_path):
table = pd.read_csv(xcms_file_path, index_col=[0])
# no need for decimal on m/z (low resolution) and only one decimal for rt
table.mz = table.mz.round(0).astype(int)
table.rt = table.rt.round(1)
# creating the feature name: mz_rt
features = table["mz"].astype(str) + "_" + table["rt"].astype(str)
table.insert(0, 'features', features) # first column
# drop as we don't know how many columns the table will have. Drop the known ones.
# There should only be the 'features' column and the samples
table_clean = table.drop(['isotopes', 'adduct','pcgroup'], axis=1) #'npeaks','NEG_GROUP', 'POS_GROUP',
return table_clean
# rounds the mz and rt columns along with its min and max
def rounder(dataframe):
table = dataframe
table.mz = table.mz.round(0).astype(int)
table.mzmin = table.mzmin.round(0).astype(int)
table.mzmax = table.mzmax.round(0).astype(int)
table.rt = table.rt.div(60).round(1)
table.rtmin = table.rtmin.div(60).round(1)
table.rtmax = table.rtmax.div(60).round(1)
return table
def feature_correspondance (ref_data, target_data):
# column to be populated
target_data['features'] = np.nan
# sorts the values in order to use the feature name of features with higher npeaks.
# sometimes, the preprocessing generated 'duplicates' of almost the same feature. We ignore those later on
target_data = target_data.sort_values('npeaks', ascending=False,ignore_index=True)
ref_data = ref_data.sort_values('npeaks', ascending=False,ignore_index=True)
for i in range(len(target_data)):
for j in range(len(ref_data)):
# check if mz is in right range.
if ((target_data.loc[i,'mz'] <= ref_data.loc[j,'mzmax'])
& (target_data.loc[i,'mz'] >= ref_data.loc[j,'mzmin'])):
# if it is, then proceed. Any rt value from the target data must be in the range of
# rtmax and rtmin of the reference data:
if (
((target_data.loc[i,'rt'] <= ref_data.loc[j,'rtmax'])
& (target_data.loc[i,'rt'] >= ref_data.loc[j,'rtmin'])) or
((target_data.loc[i,'rtmin'] <= ref_data.loc[j,'rtmax'])
& (target_data.loc[i,'rtmin'] >= ref_data.loc[j,'rtmin'])) or
((target_data.loc[i,'rtmax'] <= ref_data.loc[j,'rtmax'])
& (target_data.loc[i,'rtmax'] >= ref_data.loc[j,'rtmin']))
):
# if the condition is met, then the FIRST feature name is extracted and given to the target data.
# The one with higher npeak will be used, due to sorting at the beggining
target_data.loc[i,'features'] = ref_data.loc[j,'features']
# this brakes the second if statement. There could be multiple matches on rt
break
return target_data