From 56755a4a6c94310850b7a1898f5175281e2f877d Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Wed, 26 Feb 2025 22:21:10 +0000
Subject: [PATCH 1/2] partially implement 1h rolling window approach

still need to define bins also by their starts
---
 backend/app/get_travel_time.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 73fd06f..e741ef0 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -73,7 +73,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
         SELECT
             link_dir,
             dt::text,
-            tx,
+            EXTRACT('epoch' FROM tx)::bigint AS tx,
             mean::real AS speed_kmph
         FROM here.ta
         WHERE
@@ -229,21 +229,30 @@ def make_bins(links_df, link_speeds_df):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
     # start with empty list of bins, defined by their ends
+    # TODO: define bin starts as well
     bin_ends = list()
     minimum_length = 0.8 * links_df['length'].sum()
     links_per_5mbin = {}
     # iterate over 5-min time bins with data, in chronological order
     for tx in sorted(list(link_speeds_df.tx.unique())):
-        # add links one bin at a time
+        # get the data for this 5min bin
         bin5m = link_speeds_df[link_speeds_df['tx']==tx]
-        # get the distinct links in this 5-minute bin
+        # add the distinct links from this 5-minute bin
         links_per_5mbin[tx] = bin5m.link_dir.unique()
+        # in case data is very sparse, drop observations more than one hour
+        # prior to the current 5min bin - this window is moving!
+        keys_to_drop = []
+        for tx_key in links_per_5mbin.keys():
+            if tx - tx_key >= 3600: # seconds, i.e. 1 hour
+                keys_to_drop.append(tx_key)
+        for tx_key in keys_to_drop:
+            del links_per_5mbin[tx_key]
         # get all the links in all the 5m bins so far
         links = set( link for linklist in links_per_5mbin.values() for link in linklist )
         # measure the length of links in that set
         length_so_far = links_df.loc[list(links),'length'].sum()
-        # compare against length threshold
+        # compare against length threshold; if met, end the bin
         if length_so_far >= minimum_length:
             bin_ends.append(tx)
             links_per_5mbin = {}
         else:

From c5f84110bf99e554d563ca58baaf9e6a5bdc073e Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Thu, 27 Feb 2025 16:34:38 +0000
Subject: [PATCH 2/2] join by bin end AND start times

---
 backend/app/get_travel_time.py | 44 +++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index e741ef0..8321dfa 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -11,6 +11,10 @@ import json
 
 from app.getGitHash import getGitHash
 
+# this sets the copy-on-write option globally and removes a warning message
+# https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html#copy-on-write
+pandas.options.mode.copy_on_write = True
+
 # the way we currently do it
 def mean_daily_mean(obs):
     # group the observations by date
@@ -143,16 +147,28 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
             'query_params': query_params
         }
     }
-    # rolling join of bins to data
-    link_speeds_df = pandas.merge_asof(
+    # pandas has no native range join, so we start by CROSS joining the bins
+    # to the data, though this is very inefficient
+    link_speeds_df = pandas.merge(
         link_speeds_df,
-        pandas.DataFrame({'tx': bins,'bin':bins}),
-        on='tx',
-        direction='forward'
-    ).set_index('link_dir')
+        pandas.DataFrame({
+            'bin': [ bin['id'] for bin in bins ],
+            'bin_first_tx': [ bin['first'] for bin in bins ],
+            'bin_last_tx': [ bin['last'] for bin in bins ]
+        }),
+        how='cross'
+    )
+    # now filter out the records that fall outside their bins
+    link_speeds_df = link_speeds_df.query(
+        'tx >= bin_first_tx & tx <= bin_last_tx'
+    )
     # drop column used only for binning
-    link_speeds_df.drop('tx',axis='columns',inplace=True)
-
+    link_speeds_df.drop(
+        ['tx', 'bin_first_tx', 'bin_last_tx'],
+        axis='columns',
+        inplace=True
+    )
+    link_speeds_df = link_speeds_df.set_index('link_dir')
     # join previously queried link lengths
     link_speeds_df = link_speeds_df.join(links_df)
     # calculate link travel times from speed and length (in seconds)
@@ -230,10 +246,11 @@ def make_bins(links_df, link_speeds_df):
     of links, by length, have observations."""
     # start with empty list of bins, defined by their ends
     # TODO: define bin starts as well
-    bin_ends = list()
+    bins = list()
     minimum_length = 0.8 * links_df['length'].sum()
     links_per_5mbin = {}
+    bin_counter = 1
     # iterate over 5-min time bins with data, in chronological order
     for tx in sorted(list(link_speeds_df.tx.unique())):
         # get the data for this 5min bin
         bin5m = link_speeds_df[link_speeds_df['tx']==tx]
@@ -254,8 +271,13 @@ def make_bins(links_df, link_speeds_df):
         length_so_far = links_df.loc[list(links),'length'].sum()
         # compare against length threshold; if met, end the bin
         if length_so_far >= minimum_length:
-            bin_ends.append(tx)
+            bins.append({
+                'id': bin_counter,
+                'first': min(links_per_5mbin.keys()),
+                'last': tx # equivalently, max(links_per_5mbin.keys())
+            })
+            bin_counter += 1
             links_per_5mbin = {}
         else:
             pass
-    return bin_ends
+    return bins
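Reviewer note: a toy sanity check of the rolling-window binning above, assuming the patched module is importable under the path used elsewhere in the file (app.get_travel_time); the lengths and timestamps are invented:

    import pandas
    from app.get_travel_time import make_bins

    # two links of equal length, indexed by link_dir
    links_df = pandas.DataFrame({'length': [100.0, 100.0]}, index=['A', 'B'])

    # link A is observed at 0s and 300s; link B only at 600s, so the 80%
    # length threshold (160 of 200) is first met at tx=600, closing the bin
    link_speeds_df = pandas.DataFrame({
        'link_dir': ['A', 'A', 'B'],
        'tx': [0, 300, 600],
    })

    print(make_bins(links_df, link_speeds_df))
    # expected: [{'id': 1, 'first': 0, 'last': 600}]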
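Likewise, a minimal standalone sketch of the cross-join-then-filter range join used in get_travel_time() above, with invented observations and bins:

    import pandas

    obs = pandas.DataFrame({
        'link_dir': ['A', 'B', 'A'],
        'tx': [0, 300, 3900],
        'speed_kmph': [40.0, 35.0, 50.0],
    })
    bins = [
        {'id': 1, 'first': 0, 'last': 3600},
        {'id': 2, 'first': 3900, 'last': 7200},
    ]

    # pair every observation with every bin, then keep only in-range pairs
    joined = pandas.merge(
        obs,
        pandas.DataFrame({
            'bin': [b['id'] for b in bins],
            'bin_first_tx': [b['first'] for b in bins],
            'bin_last_tx': [b['last'] for b in bins],
        }),
        how='cross',
    ).query('tx >= bin_first_tx & tx <= bin_last_tx')

    print(joined[['link_dir', 'tx', 'bin']])
    # tx=0 and tx=300 land in bin 1; tx=3900 lands in bin 2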
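Since the bins can never overlap (links_per_5mbin is reset whenever a bin closes, so each bin's 'first' is later than the previous bin's 'last'), one possible way to avoid materializing the rows-times-bins cross product would be an IntervalIndex lookup. An untested sketch, reusing the toy data above:

    # map each tx to the position of its containing bin, or -1 if none;
    # bin ids are assigned 1..n in order, so position + 1 == bin id
    intervals = pandas.IntervalIndex.from_arrays(
        [b['first'] for b in bins],
        [b['last'] for b in bins],
        closed='both',
    )
    obs['bin'] = intervals.get_indexer(obs['tx']) + 1
    obs = obs[obs['bin'] > 0]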