-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_import.py
380 lines (312 loc) · 17.4 KB
/
data_import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
# Python classes and functions for importing pyControl data files and representing
# pyControl sessions and experiments. Dependencies: Python 3.5+, Numpy, Pandas.
import os
import pickle
import pandas as pd
import numpy as np
from datetime import datetime, date
from collections import namedtuple
# A single framework event or state entry: 'time' in ms, 'name' of the event/state.
Event = namedtuple('Event', ['time','name'])

#----------------------------------------------------------------------------------
# Session class
#----------------------------------------------------------------------------------

class Session():
    '''Import data from a pyControl file and represent it as an object with attributes:
      - file_name
      - experiment_name
      - task_name
      - subject_ID
          If argument int_subject_IDs is True, subject_ID is stored as an integer,
          otherwise subject_ID is stored as a string.
      - datetime
          The date and time that the session started stored as a datetime object.
      - datetime_string
          The date and time that the session started stored as a string of format 'YYYY-MM-DD HH:MM:SS'
      - events
          A list of all framework events and state entries in the order they occurred.
          Each entry is a namedtuple with fields 'time' & 'name', such that you can get the
          name and time of event/state entry x with x.name and x.time respectively.
      - times
          A dictionary with keys that are the names of the framework events and states and
          corresponding values which are Numpy arrays of all the times (in milliseconds since the
          start of the framework run) at which each event/state entry occurred.
      - print_lines
          A list of all the lines output by print statements during the framework run, each line starts
          with the time in milliseconds at which it was printed.
    '''

    def __init__(self, file_path, int_subject_IDs=True):
        '''Parse the pyControl data file at file_path.

        Arguments:
        file_path       : Path of the data file to import.
        int_subject_IDs : If True the digits of the subject ID are joined and stored
                          as an integer, otherwise the ID string is stored unchanged.

        Raises ValueError if a required info ('I'), state ('S') or event ('E') line is
        missing, or if int_subject_IDs is True and the subject ID contains no digits.
        '''

        # Load lines from file, dropping blank lines and surrounding whitespace.
        with open(file_path, 'r') as f:
            print('Importing data file: '+os.path.split(file_path)[1])
            all_lines = [line.strip() for line in f if line.strip()]

        def first_line(prefix):
            # Content (after the 2 character line prefix) of the first line of the given type.
            for line in all_lines:
                if line[0] == prefix:
                    return line[2:]
            raise ValueError("Data file contains no '{}' line.".format(prefix))

        # Extract and store session information.
        self.file_name = os.path.split(file_path)[1]
        info_lines = [line[2:] for line in all_lines if line[0]=='I']

        def info_value(label):
            # Value of the first info line whose text contains label.
            for line in info_lines:
                if label in line:
                    # maxsplit=1 keeps values that themselves contain ' : ' intact.
                    return line.split(' : ', 1)[1]
            raise ValueError("Data file contains no '{}' info line.".format(label))

        self.experiment_name = info_value('Experiment name')
        self.task_name       = info_value('Task name')
        subject_ID_string    = info_value('Subject ID')
        datetime_string      = info_value('Start date')

        if int_subject_IDs: # Convert subject ID string to integer.
            digits = ''.join(c for c in subject_ID_string if c.isdigit())
            if not digits:
                raise ValueError("Subject ID '{}' contains no digits, cannot convert to integer."
                                 .format(subject_ID_string))
            self.subject_ID = int(digits)
        else:
            self.subject_ID = subject_ID_string

        self.datetime = datetime.strptime(datetime_string, '%Y/%m/%d %H:%M:%S')
        self.datetime_string = self.datetime.strftime('%Y-%m-%d %H:%M:%S')

        # Extract and store session data.
        # NOTE(security): eval executes whatever expression the file contains on these
        # lines - only import data files from trusted sources.
        state_IDs = eval(first_line('S'))
        event_IDs = eval(first_line('E'))
        ID2name = {v: k for k, v in {**state_IDs, **event_IDs}.items()}

        data_lines = [line[2:].split(' ') for line in all_lines if line[0]=='D']
        self.events = [Event(int(dl[0]), ID2name[int(dl[1])]) for dl in data_lines]

        # Group event times by name in a single pass over the events. Every known name
        # gets an entry, names that never occurred map to an empty array.
        times_by_name = {name: [] for name in ID2name.values()}
        for ev in self.events:
            times_by_name[ev.name].append(ev.time)
        self.times = {name: np.array(times) for name, times in times_by_name.items()}

        self.print_lines = [line[2:] for line in all_lines if line[0]=='P']
#----------------------------------------------------------------------------------
# Experiment class
#----------------------------------------------------------------------------------
class Experiment():
    '''All pyControl sessions found in one data folder.

    Attributes set on construction:
      - folder_name          : Name of the data folder (last component of folder_path).
      - path                 : folder_path as passed in.
      - sessions             : List of Session objects sorted by start time then subject ID.
      - subject_IDs          : List of the distinct subject IDs found in the sessions.
      - n_subjects           : Number of distinct subjects.
      - sessions_per_subject : Dict mapping subject ID to that subject's number of sessions.
    Each session is additionally given a 'number' attribute, its 1-based position
    among the sessions of the same subject.
    '''

    def __init__(self, folder_path, int_subject_IDs=True):
        '''
        Import all sessions from specified folder to create experiment object.  Only sessions in the
        specified folder (not in subfolders) will be imported.
        Arguments:
        folder_path: Path of data folder.
        int_subject_IDs:  If True subject IDs are converted to integers, e.g. m012 is converted to 12.
        '''

        # normpath removes any trailing path separator, which would otherwise make
        # the folder name an empty string.
        self.folder_name = os.path.basename(os.path.normpath(folder_path))
        self.path = folder_path

        # Import sessions.

        self.sessions = []
        try: # Load sessions from saved sessions.pkl file.
            # NOTE(security): unpickling can execute arbitrary code - only use
            # sessions.pkl files that were written by save() on trusted data.
            with open(os.path.join(self.path, 'sessions.pkl'),'rb') as sessions_file:
                self.sessions = pickle.load(sessions_file)
            print('Saved sessions loaded from: sessions.pkl')
        except IOError:
            pass # No readable sessions.pkl, import everything from the data files.

        # Only data files that are not already represented by a loaded session are imported.
        old_files = {session.file_name for session in self.sessions}
        files = os.listdir(self.path)
        new_files = [f for f in files if f.endswith('.txt') and f not in old_files]

        if len(new_files) > 0:
            print('Loading new data files..')
            for file_name in new_files:
                # Best effort: a file that cannot be imported is reported and skipped.
                try:
                    self.sessions.append(Session(os.path.join(self.path, file_name), int_subject_IDs))
                except Exception as error_message:
                    print('Unable to import file: ' + file_name)
                    print(error_message)

        # Assign session numbers.

        self.subject_IDs = list(set([s.subject_ID for s in self.sessions]))
        self.n_subjects = len(self.subject_IDs)

        # Sorting all sessions chronologically means each subject's sessions are
        # returned by get_sessions in chronological order.
        self.sessions.sort(key = lambda s:s.datetime_string + str(s.subject_ID))

        self.sessions_per_subject = {}
        for subject_ID in self.subject_IDs:
            subject_sessions = self.get_sessions(subject_ID)
            for i, session in enumerate(subject_sessions):
                session.number = i+1
            self.sessions_per_subject[subject_ID] = subject_sessions[-1].number

    def save(self):
        '''Save all sessions as .pkl file. Speeds up subsequent instantiation of
        experiment as sessions do not need to be reimported from data files.'''
        with open(os.path.join(self.path, 'sessions.pkl'),'wb') as sessions_file:
            pickle.dump(self.sessions, sessions_file)

    def get_sessions(self, subject_IDs='all', when='all'):
        '''Return list of sessions which match specified subject ID and time.
        Arguments:
        subject_IDs: Set to 'all' to select sessions from all subjects, or provide a single
                     subject ID or a list of subject IDs.
        when       : Determines session number or dates to select, see example usage below:
                     when = 'all'      # All sessions
                     when = 1          # Sessions numbered 1
                     when = [3,5,8]    # Session numbered 3,5 & 8
                     when = [...,10]   # Sessions numbered <= 10
                     when = [5,...]    # Sessions numbered >= 5
                     when = [5,...,10] # Sessions numbered 5 <= n <= 10
                     when = '2017-07-07' # Select sessions from date '2017-07-07'
                     when = ['2017-07-07','2017-07-08'] # Select specified list of dates
                     when = [...,'2017-07-07'] # Select session with date <= '2017-07-07'
                     when = ['2017-07-01',...,'2017-07-07'] # Select session with '2017-07-01' <= date <= '2017-07-07'.
        Dates may be given as 'YYYY-MM-DD' strings, datetime.date or datetime.datetime objects.
        '''
        if subject_IDs == 'all':
            subject_IDs = self.subject_IDs
        if not isinstance(subject_IDs, list):
            subject_IDs = [subject_IDs]

        if when == 'all': # Select all sessions.
            when_func = lambda session: True

        else:
            if type(when) is not list:
                when = [when]

            if ... in when: # Select a range..

                if len(when) == 3: # Start and end points defined.
                    assert type(when[0]) == type(when[2]), 'Start and end of time range must be same type.'
                    if type(when[0]) == int: # .. range of session numbers.
                        when_func = lambda session: when[0] <= session.number <= when[2]
                    else: # .. range of dates.
                        when_func = lambda session: _toDate(when[0]) <= session.datetime.date() <= _toDate(when[2])

                elif when.index(...) == 0: # End point only defined.
                    if type(when[1]) == int: # .. range of session numbers.
                        when_func = lambda session: session.number <= when[1]
                    else: # .. range of dates.
                        when_func = lambda session: session.datetime.date() <= _toDate(when[1])

                else: # Start point only defined.
                    if type(when[0]) == int: # .. range of session numbers.
                        when_func = lambda session: when[0] <= session.number
                    else: # .. range of dates.
                        when_func = lambda session: _toDate(when[0]) <= session.datetime.date()

            else: # Select specified..
                assert all([type(when[0]) == type(w) for w in when]), "All elements of 'when' must be same type."
                if type(when[0]) == int: # .. session numbers.
                    when_func = lambda session: session.number in when
                else: # .. dates.
                    dates = [_toDate(d) for d in when]
                    when_func = lambda session: session.datetime.date() in dates

        # The subject test comes first so when_func only runs on the selected subjects' sessions.
        valid_sessions = [s for s in self.sessions if s.subject_ID in subject_IDs and when_func(s)]

        return valid_sessions
def _toDate(d): # Convert input to datetime.date object.
if type(d) is str:
try:
return datetime.strptime(d, '%Y-%m-%d').date()
except ValueError:
raise ValueError('Unable to convert string to date, format must be YYYY-MM-DD.')
elif type(d) is datetime:
return d.date()
elif type(d) is date:
return d
else:
raise ValueError('Unable to convert input to date.')
#----------------------------------------------------------------------------------
# Session Dataframe
#----------------------------------------------------------------------------------
def session_dataframe(file_path, paired_events=None, pair_end_suffix=None):
    '''Generate a pandas dataframe from a pyControl data file containing the
    sessions data.  The data frame has columns:
    type : Whether the row contains session 'info', a 'state' entry,
           'event' or 'print' line.
    name : The name of the state, event or session information in the row.
    time : The time the row occurred in ms since the session start.
    duration : The duration in ms of states and paired events (see below).
    value : The contents of 'info' and 'print' rows.

    Optionally events can be specified as coming in pairs corresponding to the
    start and end of an action, e.g. entering and exiting a nosepoke. When a
    start-event end-event pair occurs in the data, only the start_event generates
    a row in the dataframe, with the end event used to compute the duration.
    End events that do not follow a matching start event are kept as rows.

    Parameters
    ----------
    file_path : path to pyControl data file.
    paired_events : Optional dict specifying paired events e.g.
                    {'poke_1_in':'poke_1_out', 'poke_2_in':'poke_2_out'}.
                    The dict is not modified.
    pair_end_suffix : Optional string specifying a suffix used to indicate the
                      end event of paired events that share a common stem e.g.
                      the pair {'poke_1_in':'poke_1_out'} would be found
                      automatically using pair_end_suffix='_out'

    Returns
    -------
    df : session dataframe
    '''
    # Work on a private copy: pairs found via pair_end_suffix are added below and
    # must not leak into the caller's dict or into later calls.
    paired_events = dict(paired_events) if paired_events else {}

    # Load data from file.
    with open(file_path, 'r') as f:
        print('Importing data file: '+os.path.split(file_path)[1])
        all_lines = [line.strip() for line in f if line.strip()]

    # Make dataframe.
    # NOTE(security): eval executes whatever expression the file contains on these
    # lines - only import data files from trusted sources.
    state_IDs = eval(next(line for line in all_lines if line[0]=='S')[2:])
    event_IDs = eval(next(line for line in all_lines if line[0]=='E')[2:])
    ID2name = {v: k for k, v in {**state_IDs, **event_IDs}.items()}
    state_ID_values = set(state_IDs.values())

    line_dicts = []
    for line in all_lines:
        if line[0] == 'I': # Info line.
            # maxsplit=1 so that a value containing ' : ' is kept whole.
            name, value = line[2:].split(' : ', 1)
            line_dicts.append({'type'  : 'info',
                               'name'  : name,
                               'value' : value})
        elif line[0] == 'D': # Data line.
            timestamp, ID = [int(i) for i in line.split(' ')[1:]]
            line_dicts.append({'type' : 'state' if ID in state_ID_values else 'event',
                               'name' : ID2name[ID],
                               'time' : timestamp})
        elif line[0] == 'P': # Print line.
            # partition copes with a print line that has a timestamp but no text.
            time_string, _, text = line[2:].partition(' ')
            line_dicts.append({'type'  : 'print',
                               'time'  : int(time_string),
                               'value' : text})

    df = pd.DataFrame(line_dicts)

    # Add state durations: the time from each state entry to the next state entry.
    # The last state has no following entry so its duration is NaN.
    is_state = df['type'] == 'state'
    df.loc[is_state,'duration'] = -df.loc[is_state,'time'].diff(-1)

    # Find paired events with specified pair end suffix.
    if pair_end_suffix:
        end_events = [ev for ev in event_IDs if ev.endswith(pair_end_suffix)]
        for end_event in end_events:
            stem = end_event[:-len(pair_end_suffix)]
            try:
                # The start event is the first other event name beginning with the stem.
                start_event = next(ev for ev in event_IDs if ev.startswith(stem) and ev != end_event)
            except StopIteration:
                continue # No matching start event found.
            paired_events[start_event] = end_event

    # Compute paired event durations and remove end events.
    if paired_events:
        end2start = {v:k for k,v in paired_events.items()}
        start_times = {se:None for se in paired_events}
        start_inds  = {se:None for se in paired_events}
        end_inds = []
        # Names and times are not modified by the loop, only 'duration' is written.
        for i, name, time in zip(df.index, df['name'], df['time']):
            if name in paired_events: # Pair start event.
                start_times[name] = time
                start_inds[name] = i
            elif name in end2start: # Pair end event.
                start_event = end2start[name]
                if start_times[start_event] is not None:
                    df.loc[start_inds[start_event],'duration'] = time - start_times[start_event]
                    start_times[start_event] = None # Each start event is paired at most once.
                    end_inds.append(i)
        df = df.drop(index=end_inds)

    # Reset index and set column order.
    df = df.reset_index(drop=True)
    df = df.reindex(columns=['type','name','time','duration','value'])
    return df
#----------------------------------------------------------------------------------
# Experiment dataframe
#----------------------------------------------------------------------------------
def experiment_dataframe(folder_path, paired_events=None, pair_end_suffix=None):
    '''Generate a pandas dataframe from a pyControl experiment comprising
    many session data files in a folder.  The experiment dataframe has the
    same columns as the session dataframe ('type', 'name', 'time', 'duration',
    'value'), with additional columns specifying the subject_ID, start date and
    time etc generated from the info lines in the pyControl data file.  Each row
    of the dataframe corresponds to a single state entry, event or print line
    from a single session.

    As with the session_dataframe function, events can optionally be specified
    as coming in pairs corresponding to the start and end of an action, e.g.
    entering and exiting a nosepoke. When a start-event end-event pair occurs
    in the data, only the start_event generates a row in the dataframe, with
    the end event used to compute the duration.

    Data files (names ending '.txt') are processed in sorted filename order and
    the session dataframes are concatenated without resetting the index. If the
    folder contains no data files an empty dataframe with the session dataframe
    columns is returned.

    Parameters
    ----------
    folder_path : path to experiment data folder.
    paired_events : Optional dict specifying paired events e.g.
                    {'poke_1_in':'poke_1_out', 'poke_2_in':'poke_2_out'}.
                    The dict is not modified.
    pair_end_suffix : Optional string specifying a suffix used to indicate the
                      end event of paired events that share a common stem e.g.
                      the pair {'poke_1_in':'poke_1_out'} would be found
                      automatically using pair_end_suffix='_out'

    Returns
    -------
    df : experiment dataframe
    '''
    # Sorted so that the row order does not depend on the order os.listdir returns.
    session_filenames = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))
    session_dataframes = []
    for session_filename in session_filenames:
        # Make session dataframe. Each file gets its own copy of the pairs dict so
        # pairs discovered for one file cannot carry over to the next.
        session_df = session_dataframe(os.path.join(folder_path,session_filename),
            paired_events=dict(paired_events or {}), pair_end_suffix=pair_end_suffix)
        # Convert info rows to columns.
        info_rows = session_df[session_df['type']=='info']
        # Copy so the column assignments below act on an independent dataframe
        # rather than on a slice of the original.
        session_df = session_df[session_df['type']!='info'].copy()
        for name,value in zip(info_rows['name'], info_rows['value']):
            session_df[name] = value
        session_dataframes.append(session_df)
    if not session_dataframes:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=['type','name','time','duration','value'])
    experiment_df = pd.concat(session_dataframes, axis=0)
    return experiment_df
#----------------------------------------------------------------------------------
# Load analog data
#----------------------------------------------------------------------------------
def load_analog_data(file_path):
    '''Read a pyControl analog data file into a two column Numpy array.

    The file is read as a flat sequence of little-endian integers which is
    then arranged as rows of (timestamp in ms, data value).'''
    with open(file_path, 'rb') as analog_file:
        flat_values = np.fromfile(analog_file, dtype='<i')
    samples = flat_values.reshape(-1,2)
    return samples