forked from bmajoros/python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataFrame.py
executable file
·229 lines (196 loc) · 6.78 KB
/
DataFrame.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#=========================================================================
# This is OPEN SOURCE SOFTWARE governed by the GNU General Public
# License (GPL) version 3, as described at www.opensource.org.
# 2018 William H. Majoros ([email protected])
#=========================================================================
from __future__ import (absolute_import, division, print_function,
unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
import sys
from DataFrameRow import DataFrameRow
from Rex import Rex
# Module-level Rex instance used by DataFrame's quote-stripping code
# (removeQuotes / unquoteHeader): rex.find(pattern,text) is called and,
# when it returns a truthy value, the first capture group is read back
# as rex[1].
rex=Rex()
#=========================================================================
# Attributes:
# header
# matrix : array of rows, each of which is a DataFrameRow
# rowHash : dictionary mapping row names to row indices
# colHash : dictionary mapping column names to column indices
# Methods:
# df=DataFrame()
# df.save(filename)
# rowNames=df.getRowNames()
# colNames=df.getColumnNames()
# df.addRow(DataFrameRow)
# n=df.nrow()
# n=df.ncol()
# row=df[index]
# rows=df.getRows()
# elem=df[i][j]
# df.toInt()
# df.toFloat()
# df.colToFloat(colIndex)
# header=df.getHeader()
# df.removeQuotes()
# df.hashRowNames()
# df.hashColNames()
# row=df.getRowI(i)
# col=df.getColI(i)
# row=df.getRow(rowName) # call hashRowNames() first!
# col=df.getColumn(columnName) # call hashColNames() first!
# bool=df.rowExists(rowName) # call hashRowNames() first!
# bool=df.columnExists(colName) # call hashColNames() first!
# index=df.getColumnIndex(colName) # call hashColNames() first!
# newDataFrame=df.subsetColumns(colIndices)
# newDataFrame=df.subsetRows(rowIndices)
# idx=df.addColumn(colName,defaultValue) # returns index of new column
# df.print(handle)
# array=df.toDataArray()
# df.appendDF(otherDF) # does NOT do a deep copy!
# Class methods:
# df=DataFrame.readTable(filename,header=False,rowNames=False)
#=========================================================================
class DataFrame:
    """Simple in-memory table: a header plus a list of DataFrameRow objects.

    Attributes:
        header  : list of column names (empty when no header is known)
        matrix  : list of rows, each of which is a DataFrameRow
        rowHash : dict mapping row labels to row indices; None until
                  hashRowNames() has been called
        colHash : dict mapping column names to column indices; None until
                  hashColNames() has been called

    Once a hash has been built, addRow(), appendDF(), addColumn() and
    unquoteHeader() keep it up to date.  Code that edits header/matrix
    directly must call hashRowNames()/hashColNames() again.
    """

    # A double-quoted run of non-whitespace characters; group 1 is the text
    # between the quotes (whitespace right after the opening quote is not
    # captured).  Raw literal so that \s and \S are not treated as (invalid)
    # string escape sequences.
    _QUOTED = r'"\s*(\S+)"'

    def __init__(self):
        """Create an empty data frame with no header, rows or hashes."""
        self.header = []
        self.matrix = []
        self.rowHash = None
        self.colHash = None

    def save(self, filename):
        """Write the frame to `filename` in the format produced by print()."""
        with open(filename, "wt") as OUT:
            self.print(OUT)

    def appendDF(self, other):
        """Append the rows of `other` to this frame.

        The row objects are shared with `other` (no deep copy), and the
        header of `other` is ignored.
        """
        firstNew = len(self.matrix)
        self.matrix.extend(other.matrix)
        if self.rowHash is not None:
            # Keep the label index valid for the rows just added.
            for i in range(firstNew, len(self.matrix)):
                self.rowHash[self.matrix[i].label] = i

    def addRow(self, row):
        """Append a single DataFrameRow to the end of the frame."""
        self.matrix.append(row)
        if self.rowHash is not None:
            self.rowHash[row.label] = len(self.matrix) - 1

    def getRows(self):
        """Return the internal list of rows (not a copy)."""
        return self.matrix

    def removeQuotes(self):
        """Strip double quotes from quoted row values and header names."""
        for row in self.matrix:
            self._unquote(row.getRaw())
        self.unquoteHeader()

    def unquoteHeader(self):
        """Strip double quotes from quoted column names in the header."""
        self._unquote(self.header)
        if self.colHash is not None:
            # Column names may have changed, so rebuild the name index.
            self.hashColNames()

    @staticmethod
    def _unquote(fields):
        """Replace, in place, each quoted entry of `fields` by its content."""
        for i, field in enumerate(fields):
            if rex.find(DataFrame._QUOTED, field):
                fields[i] = rex[1]

    def toDataArray(self):
        """Return a list holding the `values` attribute of every row."""
        return [row.values for row in self.matrix]

    def print(self, handle):
        """Write the tab-joined header line, then every row, to `handle`."""
        # Inside a method the name `print` resolves to the print function,
        # not to this method.
        print("\t".join(self.header), file=handle)
        for row in self.matrix:
            row.print(handle)

    def addColumn(self, name, defaultValue):
        """Add a column called `name`, filling every row with `defaultValue`.

        Returns the position of the new name in the header.
        """
        colIndex = len(self.header)
        self.header.append(name)
        for row in self.matrix:
            row.append(defaultValue)
        if self.colHash is not None:
            self.colHash[name] = colIndex
        return colIndex

    def subsetColumns(self, colIndices):
        """Return a new DataFrame holding only the columns in `colIndices`.

        Row labels are carried over; rows are new DataFrameRow objects.
        """
        # Materialize once: the indices are traversed for the header and
        # again for every row, which would exhaust a one-shot iterator.
        colIndices = list(colIndices)
        newDF = DataFrame()
        newDF.header = [self.header[i] for i in colIndices]
        for row in self.matrix:
            newRow = DataFrameRow()
            newRow.rename(row.getLabel())
            for j in colIndices:
                newRow.values.append(row[j])
            newDF.matrix.append(newRow)
        return newDF

    def subsetRows(self, rowIndices):
        """Return a new DataFrame holding clones of the rows in `rowIndices`."""
        newDF = DataFrame()
        # Copy the header so that later changes to either frame (e.g.
        # addColumn) do not leak into the other one.
        newDF.header = list(self.header)
        for i in rowIndices:
            newDF.addRow(self[i].clone())
        return newDF

    def rowExists(self, rowName):
        """Return True if a row labelled `rowName` is in the row hash.

        Raises Exception if hashRowNames() has not been called.
        """
        if self.rowHash is None:
            raise Exception("call hashRowNames() first")
        return rowName in self.rowHash

    def getColumnIndex(self, colName):
        """Return the index of column `colName`, or None if it is unknown.

        Raises Exception if hashColNames() has not been called.
        """
        if self.colHash is None:
            raise Exception("call hashColNames() first")
        return self.colHash.get(colName)

    def columnExists(self, colName):
        """Return True if a column named `colName` is in the column hash.

        Raises Exception if hashColNames() has not been called.
        """
        if self.colHash is None:
            raise Exception("call hashColNames() first")
        return colName in self.colHash

    def getRowNames(self):
        """Return a new list with the label of every row, in row order."""
        return [row.label for row in self.matrix]

    def getColumnNames(self):
        """Return the header list (the internal list, not a copy)."""
        return self.header

    def getRowI(self, rowIndex):
        """Return the row at position `rowIndex`."""
        return self.matrix[rowIndex]

    def getColI(self, colIndex):
        """Return column `colIndex` as a new DataFrameRow of its values."""
        column = DataFrameRow()
        for row in self.matrix:
            column.values.append(row[colIndex])
        return column

    def getRow(self, rowName):
        """Return the row labelled `rowName`.

        Raises Exception if hashRowNames() has not been called or if no
        such row is in the row hash.
        """
        if self.rowHash is None:
            raise Exception("call hashRowNames() first")
        rowIndex = self.rowHash.get(rowName, None)
        if rowIndex is None:
            raise Exception("row not found: " + str(rowName))
        return self.matrix[rowIndex]

    def getColumn(self, colName):
        """Return column `colName` as a new DataFrameRow labelled `colName`.

        Raises Exception if hashColNames() has not been called or if no
        such column is in the column hash.
        """
        if self.colHash is None:
            raise Exception("call hashColNames() first")
        colIndex = self.colHash.get(colName, None)
        if colIndex is None:
            raise Exception("column not found: " + str(colName))
        column = self.getColI(colIndex)
        column.label = colName
        return column

    def hashRowNames(self):
        """(Re)build rowHash; for duplicate labels the last row wins."""
        self.rowHash = {row.label: i for i, row in enumerate(self.matrix)}

    def hashColNames(self):
        """(Re)build colHash from the header."""
        # Iterate over the header itself rather than range(ncol()): for a
        # frame without a header, ncol() reports the row width and indexing
        # the empty header would raise IndexError.
        self.colHash = {name: i for i, name in enumerate(self.header)}

    def getHeader(self):
        """Return the header list (the internal list, not a copy)."""
        return self.header

    def nrow(self):
        """Return the number of rows."""
        return len(self.matrix)

    def ncol(self):
        """Return the number of columns.

        The header length is used when a header is present; otherwise the
        length of the first row, or 0 for a completely empty frame.
        """
        if len(self.header) != 0:
            return len(self.header)
        if len(self.matrix) == 0:
            return 0
        return self.matrix[0].length()

    def __getitem__(self, i):
        """Return row `i`, so that df[i][j] addresses a single element."""
        return self.matrix[i]

    def toInt(self):
        """Call toInt() on every row."""
        for row in self.matrix:
            row.toInt()

    def colToFloat(self, colIndex):
        """Convert the value in column `colIndex` of every row to float."""
        for row in self.matrix:
            row[colIndex] = float(row[colIndex])

    def toFloat(self):
        """Call toFloat() on every row."""
        for row in self.matrix:
            row.toFloat()

    @classmethod
    def readTable(cls, filename, header=False, rowNames=False):
        """Read a whitespace-delimited text table from `filename`.

        Parameters:
            filename : path of the file to read
            header   : if true, the first line supplies the column names
            rowNames : if true, the first field of each data line becomes
                       the row label instead of a value
        Returns a new frame (an instance of `cls`) whose values are the
        unconverted string fields.  Lines with no fields are skipped.
        """
        df = cls()
        with open(filename, "rt") as IN:
            # split() without an argument splits on any run of whitespace,
            # not only on tabs.
            if header:
                df.header = IN.readline().rstrip().split()
            for line in IN:
                fields = line.rstrip().split()
                if not fields:
                    continue
                label = ""
                if rowNames:
                    label = fields[0]
                    fields = fields[1:]
                row = DataFrameRow()
                row.label = label
                row.values = fields
                df.matrix.append(row)
        # If the header has more names than the first row has values (e.g.
        # the header also names the row-label column), drop the first name
        # so that the remaining names line up with the values.
        if len(df.matrix) > 0 and df.matrix[0].length() < len(df.header):
            df.header = df.header[1:]
        return df