-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbin_investors.py
174 lines (134 loc) · 7.17 KB
/
bin_investors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import pandas as pd
from rich_dataframe import prettify
import pprint
import time
def writeToCSV(df: pd.DataFrame, filename: str):
    '''
    saves df to disk as "<filename>.csv"
    the dataframe index is not written out
    '''
    csvPath = '{}.csv'.format(filename)  # caller passes the name without the extension
    df.to_csv(csvPath, index=False)
def createOrgDict(orgDF: pd.DataFrame) -> dict:
    '''
    builds the organizations lookup from orgDF

    key   = organization uuid (the 'uuid' column)
    value = 8-tuple:
            (genderMale, genderFemale, genderNonBinary, genderNotProvided,
             male fraction, female fraction, non-binary fraction, not-provided fraction)
    the fractions are the '*Percent' columns divided by 100
    if a uuid appears on several rows, the last row wins
    '''
    percentColumns = ('genderMalePercent', 'genderFemalePercent',
                      'genderNonBinaryPercent', 'genderNotProvidedPercent')
    countColumns = ('genderMale', 'genderFemale',
                    'genderNonBinary', 'genderNotProvided')

    lookup = {}
    for _, record in orgDF.iterrows():
        key = record['uuid']
        # percent -> fraction
        fractions = tuple(record[column] / 100.0 for column in percentColumns)
        counts = tuple(record[column] for column in countColumns)
        lookup[key] = counts + fractions
    return lookup
def postProcessInvestorsFunc(investmentsDF: pd.DataFrame, organizations: dict, moneyMode: bool, filename: str):
    '''
    purpose:
    aggregate, per investor, the founder genders of the organizations they invested in

    parameters:
    investmentsDF -- one row per investment; reads the columns 'investorUuid',
                     'investorIdentifier', 'organizationUuid' and 'moneyInvested'
    organizations -- org uuid -> 8-tuple of (4 gender counts, 4 gender fractions),
                     indexed 0-3 for counts and 4-7 for fractions
    moneyMode     -- True: only investments with moneyInvested > 0 are used and the
                     fraction columns are shares of the money ('*FractionByMoney')
                     False: every investment is used and the fraction columns are
                     shares of the founder counts ('fraction*Founders')
    filename      -- not used by this function; kept so existing callers keep working

    returns:
    a dataframe with one row per investor that has at least one founder counted
    (investors whose gender counts sum to 0 are skipped)

    side effect:
    prints how many investment rows were dropped because no money was labelled
    (rows are only dropped when moneyMode is True, so the count is 0 otherwise)
    '''
    investors, investorMetadata, noMoneyInvested = _groupInvestmentsByInvestor(investmentsDF, moneyMode)

    # collect plain dict rows and build the dataframe once; concatenating a
    # one-row frame per investor copies the whole frame on every iteration
    rows = []
    for investorUuid, investments in investors.items():
        row = _summarizeInvestor(investorUuid, investorMetadata[investorUuid],
                                 investments, organizations, moneyMode)
        if row is not None:
            rows.append(row)
    investorDF = pd.DataFrame(rows)

    print(f'Number of instances where no money was labelled: {noMoneyInvested=}')
    return investorDF


def _groupInvestmentsByInvestor(investmentsDF: pd.DataFrame, moneyMode: bool) -> tuple:
    '''
    first pass over the investments table

    returns (investors, investorMetadata, noMoneyInvested):
    investors        -- investor uuid -> list of (organizationUuid, moneyInvested)
    investorMetadata -- investor uuid -> investor name (first name seen wins)
    noMoneyInvested  -- number of rows skipped because moneyInvested was not > 0
                        while moneyMode was True
    '''
    investors = dict()
    investorMetadata = dict()
    noMoneyInvested = 0
    for _, row in investmentsDF.iterrows():
        rawMoney = row['moneyInvested']
        # missing or 'N/A' amounts are treated as 0
        moneyInvested = 0 if (pd.isna(rawMoney) or rawMoney == 'N/A') else rawMoney
        if moneyInvested > 0 or not moneyMode:
            investorUuid = row['investorUuid']
            investorMetadata.setdefault(investorUuid, row['investorIdentifier'])
            investors.setdefault(investorUuid, []).append((row['organizationUuid'], moneyInvested))
        else:  # count of times when no money was found
            noMoneyInvested += 1
    return investors, investorMetadata, noMoneyInvested


def _summarizeInvestor(investorUuid, investorName, investments: list, organizations: dict, moneyMode: bool):
    '''
    second pass: turn one investor's (organizationUuid, moneyInvested) pairs into an output row

    returns the row as a dict, or None when the gender counts sum to 0
    (no founder was found for this investor)
    '''
    # slots are ordered: male, female, non-binary, not provided
    genderCounts = [0, 0, 0, 0]
    moneyByGender = [0, 0, 0, 0]
    for orgUuid, moneyInvested in investments:
        if orgUuid not in organizations:  # investment in an org we have no founder data for
            continue
        org = organizations[orgUuid]
        for slot, (count, fraction) in enumerate(zip(org[:4], org[4:8])):
            genderCounts[slot] += count                      # by count
            moneyByGender[slot] += moneyInvested * fraction  # by money invested

    totalFounders = genderCounts[0] + genderCounts[1] + genderCounts[2] + genderCounts[3]
    if totalFounders == 0:
        return None

    # add all together for total investment for this one investor
    moneyInvestedByInvestor = moneyByGender[0] + moneyByGender[1] + moneyByGender[2] + moneyByGender[3]

    if moneyMode:
        fractionKeys = ('maleFractionByMoney', 'femaleFractionByMoney',
                        'nonbinaryFractionByMoney', 'notprovidedFractionByMoney')
        denominator = moneyInvestedByInvestor
        numerators = moneyByGender
    else:  # just represents activity even if no money is counted
        fractionKeys = ('fractionMaleFounders', 'fractionFemaleFounders',
                        'fractionNonbinaryFounders', 'fractionNotProvidedFounders')
        denominator = totalFounders
        numerators = genderCounts

    if denominator > 0:  # guards against ZeroDivisionError
        fractions = [value / denominator for value in numerators]
    else:
        # same fallback as before: the undivided money-weighted sums are reported
        fractions = moneyByGender

    row = {'investorUuid': investorUuid,
           'investorName': investorName,
           'moneyInvested': moneyInvestedByInvestor,
           'genderMale': genderCounts[0],
           'genderFemale': genderCounts[1],
           'genderNonBinary': genderCounts[2],
           'genderNotProvided': genderCounts[3]}
    row.update(zip(fractionKeys, fractions))
    return row
if __name__ == '__main__':
    # script entry point: load the inputs, aggregate per investor by founder
    # counts (moneyMode off) and write the result to a csv
    # NOTE(review): pd.read_pickle can execute arbitrary code while loading;
    # presumably this pickle is generated locally -- confirm it is a trusted file
    investmentRecords = pd.read_pickle('datasets/investments.csv.pickle')
    orgLookup = createOrgDict(pd.read_csv('datasets/organizations.csv'))
    perInvestor = postProcessInvestorsFunc(
        investmentRecords,
        orgLookup,
        moneyMode=False,
        filename='investors_per_founder',
    )
    writeToCSV(perInvestor, 'binned_output/investors_fraction')