-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbin_states.py
95 lines (64 loc) · 3.28 KB
/
bin_states.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pandas as pd
from rich_dataframe import prettify
import pprint
import time
def writeToCSV(df: pd.DataFrame, filename: str):
    """Save ``df`` as a CSV file named ``<filename>.csv``.

    The ``.csv`` suffix is appended here, so callers pass the path without an
    extension. The DataFrame index is written as the first column.
    """
    destination = f'{filename}.csv'
    df.to_csv(destination, index=True)
def getPopulationDict(populationDF: pd.DataFrame) -> dict:
    """Map each value of the 'state' column to its 'estimate' value.

    When a state appears on several rows, the estimate from its first row is
    kept and later rows are ignored. Estimates are stored exactly as they
    appear in the frame (no parsing or conversion is done here).
    """
    firstEstimates = {}
    for _, record in populationDF.iterrows():
        # setdefault only stores when the state has not been seen yet
        firstEstimates.setdefault(record['state'], record['estimate'])
    return firstEstimates
def scalesByRow(df: pd.DataFrame)->None:
    """Rescale every row of ``df`` in place so that its values sum to 1.

    Missing values (NaN) are treated as 0. A row whose values sum to 0 cannot
    be rescaled and is set to all zeros, so the frame holds no NaN afterwards.

    Args:
        df: DataFrame to normalize; expected to hold numeric values. It is
            modified in place.

    Returns:
        None.
    """
    filled = df.fillna(0)  # replace nan with 0
    rowSums = filled.sum(axis=1)
    # Mask zero sums so those rows divide to NaN (never inf); the trailing
    # fillna then turns them into 0, matching the "all zeros" rule above.
    scaled = filled.div(rowSums.where(rowSums != 0), axis=0).fillna(0)
    # Assign whole columns back so the caller's frame is updated in place and
    # integer columns can become float without per-cell dtype conflicts.
    for column in df.columns:
        df[column] = scaled[column]
def processStatesFunc(orgDF: pd.DataFrame, populationDF: pd.DataFrame) ->list:
    """Total the gender columns of organizations per state, per 100,000 people.

    Args:
        orgDF: Organization rows. Reads the columns 'state', 'genderMale',
            'genderFemale', 'genderNonBinary', 'genderNotProvided' and
            'genderDiversity'. Rows whose state has no population entry are
            skipped.
        populationDF: Population rows, handed to getPopulationDict (which
            reads 'state' and 'estimate'). An estimate may be a string with
            thousands separators such as '1,234,567', or an already-numeric
            value.

    Returns:
        A two-element list ``[statesDF, statesScaledDF]``. ``statesDF`` is
        indexed by state with the columns 'male', 'female', 'non-binary',
        'unspecified' and 'diversity', each holding the state's total divided
        by (population / 100000). ``statesScaledDF`` keeps only 'male' and
        'diversity' and has been passed through scalesByRow.

    Raises:
        ValueError: If an estimate cannot be converted to an integer.
        ZeroDivisionError: If a state's estimate is 0.
    """
    sourceColumns = ('genderMale', 'genderFemale', 'genderNonBinary',
                     'genderNotProvided', 'genderDiversity')
    outputColumns = ['male', 'female', 'non-binary', 'unspecified', 'diversity']

    population = getPopulationDict(populationDF)  # state -> raw estimate
    # every state has 0 for each gender column to begin with
    statesDict = {state: [0] * len(sourceColumns) for state in population}

    for _, row in orgDF.iterrows():
        totals = statesDict.get(row['state'])
        if totals is None:  # state without a population estimate
            continue
        for position, column in enumerate(sourceColumns):
            totals[position] += row[column]

    perCapita = {}
    for state, totals in statesDict.items():
        estimate = population[state]
        if isinstance(estimate, str):
            # strip thousands separators, e.g. '1,234,567'
            estimate = estimate.replace(',', '')
        hundredThousands = int(estimate) / 100000  # per 100,000
        perCapita[state] = [total / hundredThousands for total in totals]

    # One row per state, one named column per gender category.
    statesDF = pd.DataFrame.from_dict(perCapita, orient='index', columns=outputColumns)
    statesScaledDF = statesDF.drop(columns=['female', 'non-binary', 'unspecified'])
    scalesByRow(statesScaledDF)  # modifies dataframe
    return [statesDF, statesScaledDF]
if __name__ == '__main__':
    # Load the population estimates and the organization records.
    populationDF = pd.read_csv('datasets/cb_state_estimates.csv')
    orgDF = pd.read_csv('datasets/organizations.csv')

    # Build both per-state tables, then write each one to its own CSV file.
    statesDF, statesScaledDF = processStatesFunc(orgDF, populationDF)
    outputs = (
        (statesDF, 'binned_output/state_count'),
        (statesScaledDF, 'binned_output/state_fraction'),
    )
    for frame, destination in outputs:
        writeToCSV(frame, destination)