-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
192 lines (175 loc) · 9.52 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import sys, os, shutil
import pandas as pd
import random
import csv
import itertools
import sox
import numpy as np
import pympi
from rpy2 import robjects as r
from datetime import datetime, date, time
def choose_template(template):
    """
    Return the ELAN template files matching the requested template kind.

    Args:
        template: name of the template, one of 'basic', 'native' or 'non-native'
    Returns:
        A tuple (etf_path, pfsx_path), or None when the name matches none of
        the three known templates.
    """
    known_templates = (
        ('basic',
         ('etf_templates/ACLEW-basic-template_all-tiers.etf',
          'etf_templates/ACLEW-basic-template_all-tiers.pfsx')),
        ('native',
         ('etf_templates/ACLEW-LAAC-native.etf',
          'etf_templates/ACLEW-LAAC-native.pfsx')),
        ('non-native',
         ('etf_templates/ACLEW-LAAC-non-native.etf',
          'etf_templates/ACLEW-LAAC-non-native.pfsx')),
    )
    for kind, paths in known_templates:
        if template == kind:
            return paths
    return None
def choose_onsets_random(l, n, t, start=10, end=0):
    """
    Choose onset-offset couples at random positions in the recording.

    Args:
        l: length of the recording in seconds
        n: one-element sequence holding the number of random segments to draw
        t: one-element sequence holding the segment length in minutes
        start: first minute of the recording that may be used as an onset
        end: number of minutes to keep free before the end of the recording
    Returns:
        A list of (onset, offset) tuples in milliseconds, in the order they
        were drawn. Segments overlapping an already kept one are dropped, so
        the list may hold fewer than the requested number of segments.
    """
    print("choosing random onsets")
    # n and t arrive from argparse wrapped in a one-element sequence, e.g. (t,)
    segment_count = int(n[0])
    segment_minutes = int(t[0])
    total_minutes = int(l / 60)  # transform seconds to minutes
    candidate_starts = list(range(
        start, min(total_minutes - segment_minutes, total_minutes - end)))
    if segment_count > len(candidate_starts):
        # random.sample raises ValueError when asked for more items than the
        # population holds; draw as many as the recording allows instead.
        print("only", len(candidate_starts), "candidate onsets available, requested", segment_count)
        segment_count = len(candidate_starts)
    drawn_starts = random.sample(candidate_starts, segment_count)
    kept_minutes = []
    for onset_min in drawn_starts:
        offset_min = onset_min + segment_minutes
        # drop a segment whose onset or offset falls strictly inside a kept one
        overlaps = any(
            (low < onset_min < high) or (low < offset_min < high)
            for low, high in kept_minutes
        )
        if not overlaps:
            kept_minutes.append((onset_min, offset_min))
    # convert minutes to milliseconds for the eaf file
    return [(onset * 60000, offset * 60000) for onset, offset in kept_minutes]
def choose_onsets_periodic(l, skip, t, start=34, end=0):
    """
    Choose onset-offset couples separated by a fixed gap (skip).

    Args:
        l: length of the recording in seconds
        skip: one-element sequence holding the gap, in minutes, left between
            the end of one segment and the start of the next
        t: one-element sequence holding the segment length in minutes
        start: minute of the recording at which the first segment begins
        end: number of minutes to keep free before the end of the recording
    Returns:
        A list of (onset, offset) tuples in milliseconds at periodic intervals
    """
    print("choosing periodic onsets")
    total_minutes = int(l / 60)  # transform seconds to minutes
    segment_minutes = int(t[0])
    upper_bound = min(total_minutes - segment_minutes, total_minutes - end)
    step = int(skip[0]) + segment_minutes
    periodic_windows = []
    for onset_min in np.arange(start, upper_bound, step):
        offset_min = onset_min + segment_minutes
        # convert minutes to milliseconds for the eaf file
        periodic_windows.append((onset_min * 60000, offset_min * 60000))
    return periodic_windows
def compile_files(file, output_dir):
    """
    Run the R extraction on a file and measure the delay before the second chunk.

    Sources "rlena_extract.R", calls its "rlena_extraction" function with
    (file, output_dir), then reads "Time_info.csv" and the first other file
    listed in output_dir (both presumably written by the R function; verify
    against rlena_extract.R).

    Args:
        file: path handed to the R function "rlena_extraction"
        output_dir: directory handed to the R function and read back here
    Returns:
        A datetime.timedelta: the second time value of the chunk file minus
        the start clock time found in "Time_info.csv".
    Raises:
        FileNotFoundError: when output_dir holds no file besides "Time_info.csv"
    """
    r.r.source("rlena_extract.R")  # access to r file
    r.r["rlena_extraction"](file, output_dir)  # access to function
    file_start = pd.read_csv(os.path.join(output_dir, "Time_info.csv"),
                             delimiter=',', names=['startClockTime'], skiprows=1)
    chunk_table = None
    for filename in os.listdir(output_dir):
        if filename != "Time_info.csv":
            chunk_table = pd.read_csv(os.path.join(output_dir, filename),
                                      delimiter=',', names=['Group.1', 'x'], skiprows=1)
            break
    if chunk_table is None:
        # without this check the lookup below would fail with a NameError
        raise FileNotFoundError(
            "no extracted csv file besides Time_info.csv found in " + str(output_dir))
    start_time = datetime.strptime(file_start['startClockTime'].iloc[0], "%Y-%m-%d %H:%M:%S")
    # take the second value of the chunk file
    actual_start = datetime.strptime(chunk_table['Group.1'].iloc[1], "%Y-%m-%d %H:%M")
    return actual_start - start_time
def get_time_adjustements(file, its_types, output_dir):
    """
    Build, for each requested its information type, 5 minute chunks with
    their scores, sorted from the highest to the lowest score.

    For every type the file "<type>.csv" is read from output_dir. Its first
    row is skipped because that chunk is not a full 5 minutes. The first kept
    chunk starts at the delay returned by compile_files, and each following
    chunk starts 300 seconds later.

    Args:
        file: path forwarded to compile_files
        its_types: list of demanded its information types: CTC, AWC, CVC
        output_dir: directory holding the per-type csv files
    Returns:
        A dictionary mapping each its type to a list of
        ((onset_ms, offset_ms), score) tuples sorted by decreasing score.
    """
    dict_time_lapses = {}
    startdiff = compile_files(file, output_dir)
    # total_seconds() keeps the days component and the sign of the delay,
    # which timedelta.seconds silently drops
    first_onset = int(startdiff.total_seconds())
    for its_type in its_types:
        df = pd.read_csv(os.path.join(output_dir, its_type + '.csv'),
                         delimiter=',', names=['Group.1', 'x'], skiprows=1)
        scores = df['x'].iloc[1:].tolist()  # skip first chunk because not 5min
        # 5 min time stamps in milliseconds (for the eaf) with their score
        chunks = [
            (((first_onset + 300 * k) * 1000, (first_onset + 300 * (k + 1)) * 1000), score)
            for k, score in enumerate(scores)
        ]
        chunks.sort(key=lambda chunk: chunk[1], reverse=True)  # sort by the score
        dict_time_lapses[its_type] = chunks
    return dict_time_lapses
def create_eaf(etf_path, id, output_dir, timestamps_list, eaf_type, contxt_on, contxt_off, template, its_timestamps_dict):
    """
    Create an eaf file from a template and fill it with code and context segments.

    Three tiers named "code_", "context_" and "code_num_" followed by eaf_type
    are added and receive one annotation per entry of timestamps_list. When
    its_timestamps_dict is given, the same kind of tiers is added for each of
    its keys, plus the "notes" and "remember-me" tiers. The result is written
    to "<output_dir>/<id>.eaf".

    Args:
        etf_path: path of the template opened with pympi.Elan.Eaf
        id: identifier used as the name of the written eaf file
        output_dir: directory in which the eaf file is written
        timestamps_list: list of (onset, offset) couples in milliseconds
        eaf_type: suffix used for the tier names and the code numbers
        contxt_on: context added before each onset, in minutes
        contxt_off: context added after each offset, in minutes
        template: not used by this function
        its_timestamps_dict: None, or a dictionary mapping an its type to a
            list of ((onset, offset), score) tuples in milliseconds
    Returns:
        The pympi Eaf object that was written to disk.
    """
    print("ACLEW ID: ", id)
    eaf = pympi.Elan.Eaf(etf_path)
    ling_type = "transcription"
    eaf.add_tier("code_"+eaf_type, ling=ling_type)
    eaf.add_tier("context_"+eaf_type, ling=ling_type)
    eaf.add_tier("code_num_"+eaf_type, ling=ling_type)
    for i, ts in enumerate(timestamps_list):
        print("Creating eaf code segment # ", i+1)
        print("enumerate makes: ", i, ts)
        whole_region_onset = ts[0]
        whole_region_offset = ts[1]
        # float() guards against a float / integer mismatch of the inputs
        context_onset = int(float(whole_region_onset) - float(contxt_on)*60000)
        context_offset = int(float(whole_region_offset) + float(contxt_off)*60000)
        if context_onset < 0:
            # the context cannot start before the recording; keep an int
            # like every other time value given to the eaf
            context_onset = 0
        codeNumVal = eaf_type + str(i+1)
        eaf.add_annotation("code_"+eaf_type, whole_region_onset, whole_region_offset)
        eaf.add_annotation("code_num_"+eaf_type, whole_region_onset, whole_region_offset, value=codeNumVal)
        eaf.add_annotation("context_"+eaf_type, context_onset, context_offset)
    if its_timestamps_dict is not None:  # if there are its files to add
        for k, v in its_timestamps_dict.items():  # its types timestamps dictionary
            eaf.add_tier("code_"+k, ling=ling_type)
            eaf.add_tier("context_"+k, ling=ling_type)
            eaf.add_tier("code_num_its"+k, ling=ling_type)
            # these two tiers are (re-)added for every its type
            eaf.add_tier("notes", ling=ling_type)
            eaf.add_tier("remember-me", ling=ling_type)
            for i, ((on, off), score) in enumerate(v):
                print("Creating eaf code segment # ", i+1)
                context_beg = int(float(on) - float(contxt_on)*60000)
                context_end = int(float(off) + float(contxt_off)*60000)
                if context_beg < 0:
                    # clamp with an assignment; a comparison here would leave
                    # the negative onset untouched
                    context_beg = 0
                codeNumVal = k + str(i+1)
                eaf.add_annotation("code_"+k, int(on), int(off))
                eaf.add_annotation("code_num_its"+k, int(on), int(off), value=codeNumVal)
                eaf.add_annotation("context_"+k, context_beg, context_end)
    eaf.to_file(os.path.join(output_dir, "{}.eaf".format(id)))
    for tier_name in eaf.get_tier_names():
        print(tier_name, ":", eaf.get_annotation_data_for_tier(tier_name))
    return eaf
def create_output_csv(id, timestamps_list, file_name, context_onset, context_offset):
    """
    Write a csv file listing the segments of a created eaf.

    Args:
        id: identifier written in the 'id' column of every row
        timestamps_list: list of (onset, offset) couples
        file_name: path of the csv file to write
        context_onset: amount subtracted from each onset to get 'context_onset'
            (NOTE(review): must be in the same unit as the timestamps - confirm
            against the caller)
        context_offset: amount added to each offset to get 'context_offset'
    Returns:
        None. One row per segment is written, with 'clip_num' starting at 1.
    """
    columns = ['id', 'clip_num', 'onset', 'offset', 'context_onset', 'context_offset']
    # rows are collected first: DataFrame.append no longer exists in pandas 2
    rows = [
        {'id': id,
         'clip_num': clip_num,
         'onset': ts[0],
         'offset': ts[1],
         'context_onset': int(float(ts[0]) - float(context_onset)),
         'context_offset': int(float(ts[1]) + float(context_offset))}
        for clip_num, ts in enumerate(timestamps_list, start=1)
    ]
    selected = pd.DataFrame(rows, columns=columns)
    selected.to_csv(file_name, index=False)
def create_output_csv_its(id, timestamps_list, file_name, context_onset, context_offset):
    """
    Write a csv file listing the its segments of a created eaf with their score.

    Args:
        id: identifier written in the 'id' column of every row
        timestamps_list: list of ((onset, offset), score) tuples
        file_name: path of the csv file to write
        context_onset: amount subtracted from each onset to get 'context_onset'
            (NOTE(review): must be in the same unit as the timestamps - confirm
            against the caller)
        context_offset: amount added to each offset to get 'context_offset'
    Returns:
        None. One row per segment is written, with 'clip_num' starting at 1
        and 'score' as the last column.
    """
    columns = ['id', 'clip_num', 'onset', 'offset', 'context_onset', 'context_offset', 'score']
    # rows are collected first: DataFrame.append no longer exists in pandas 2
    rows = [
        {'id': id,
         'clip_num': clip_num,
         'onset': onset,
         'offset': offset,
         'context_onset': int(float(onset) - float(context_onset)),
         'context_offset': int(float(offset) + float(context_offset)),
         'score': score}
        for clip_num, ((onset, offset), score) in enumerate(timestamps_list, start=1)
    ]
    # naming the columns keeps the header, 'score' included, for an empty list
    selected = pd.DataFrame(rows, columns=columns)
    selected.to_csv(file_name, index=False)