Skip to content

Commit

Permalink
start with Kruskal-Wallis test
Browse files Browse the repository at this point in the history
  • Loading branch information
Nate-Wessel committed May 27, 2024
1 parent eaa25d1 commit 9687423
Showing 1 changed file with 46 additions and 18 deletions.
64 changes: 46 additions & 18 deletions analysis/test-difference-between-distributions.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,55 @@
# test whether travel times are drawn from the same distribution

import requests
import requests, scipy

backend = 'http://localhost:8072/aggregate-travel-times'
sig_level = 0.05

# get data for the same corridor and month, one year apart
tt1 = requests.get(f'{backend}/30345882/30357505/16/19/2022-04-01/2022-05-01/false/12345').json()
tt2 = requests.get(f'{backend}/30345882/30357505/16/19/2023-04-01/2023-05-01/false/12345').json()
# major complete-street rebuild took place between tt2 and tt3
tt3 = requests.get(f'{backend}/30345882/30357505/16/19/2024-04-01/2024-05-01/false/12345').json()
backend = 'http://localhost:8072'

tt1_obs = [ tt['seconds'] for tt in tt1['results']['observations'] ]
tt2_obs = [ tt['seconds'] for tt in tt2['results']['observations'] ]
tt3_obs = [ tt['seconds'] for tt in tt3['results']['observations'] ]
# define the range of years to query data for (2018 through 2024 inclusive;
# the upper bound of range() is exclusive)
years = list(range(2018, 2025))

import scipy.stats as stats
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ranksums.html
# aka
# get data for the same corridor and month, for each year in the range
def getObs(responseData):
    """Return the 'seconds' value of every observation in a response.

    responseData is the decoded JSON body of a backend response. It must
    hold a list of observation dicts under
    responseData['results']['observations'], each with a 'seconds' key;
    a KeyError is raised if any of those keys is missing.
    """
    return [tt['seconds'] for tt in responseData['results']['observations']]

# One list of observed 'seconds' values per year, in the same order as `years`.
# Each request covers April 1 to May 1 of the given year. The inner generator
# is lazy, so requests are still issued one at a time, in year order, and each
# response is decoded before the next request is made.
data = [
    getObs(response.json())
    for response in (
        requests.get(f'{backend}/aggregate-travel-times/30345882/30357505/16/19/{year}-04-01/{year}-05-01/false/12345')
        for year in years
    )
]

# now test whether any of these sets of observations differ significantly
# from the others
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html
# Kruskal-Wallis H-test across all years at once; each element of `data` is
# passed as a separate sample.
(stat, pvalue) = scipy.stats.kruskal(*data)

# Decide significance exactly once so the printed message and the decision to
# stop can never disagree. A P-value equal to sig_level, or a NaN P-value,
# counts as "not significant" in both places.
significant = pvalue < sig_level

print(
    f'One or more of the distributions is different with a P-value of {pvalue}'
    if significant else
    'We fail to reject the hypothesis that observations are drawn from the same distribution'
)

if not significant:
    # if this test doesn't reject the null hypothesis
    # there's no need to go further
    raise SystemExit

# now let's find out which years saw significant differences from their
# preceding year. We'll use a Wilcoxon rank-sum test, which is closely
# related to the Mann-Whitney U test
# https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
statistic1, pvalue1 = stats.ranksums(tt1_obs, tt2_obs,'two-sided')
statistic2, pvalue2 = stats.ranksums(tt2_obs, tt3_obs,'two-sided')
# available in scipy as
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ranksums.html

sig_level = 0.05
# Compare each year with the year that follows it. zip() stops at the shortest
# input (data[1:]), so the last year is only ever used as the second sample.
for t1_year, data_t1, data_t2 in zip(years, data, data[1:]):
    # NOTE: the scipy documentation states that ranksums does not handle ties
    # between the two samples and points to scipy.stats.mannwhitneyu for
    # tie handling.
    stat, pvalue = scipy.stats.ranksums(data_t1, data_t2, alternative='two-sided')
    print(
        f'Travel time distributions differ between {t1_year} and {t1_year+1} with a P-value of {pvalue}'
        if pvalue < sig_level else
        f'Travel times between {t1_year} and {t1_year+1} are not significantly different'
    )

print(f'P = {pvalue1}', 'not different' if pvalue1 > sig_level else 'different')
print(f'P = {pvalue2}', 'not different' if pvalue2 > sig_level else 'different')
# This is an interesting period. COVID happened, and then for this particular
# corridor, after things settled down for a year between 2022-2023, a major
# road diet / complete-street rebuild was installed, which greatly changed
# motor vehicle travel times

0 comments on commit 9687423

Please sign in to comment.