From 01f3f4ddc7dad39256ea3892bd91494b5b2e847f Mon Sep 17 00:00:00 2001 From: Nate Wessel Date: Tue, 28 Jan 2025 09:31:15 -0500 Subject: [PATCH 01/10] remove duplicated lines thanks to Gabe for spotting this! --- backend/app/get_travel_time.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py index 070030b..8b2fdb5 100644 --- a/backend/app/get_travel_time.py +++ b/backend/app/get_travel_time.py @@ -104,13 +104,6 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_ total_corridor_length = links_df['length'].sum() - links_df = pandas.DataFrame({ - 'link_dir': [l['link_dir'] for l in links], - 'length': [l['length_m'] for l in links] - }).set_index('link_dir') - - total_corridor_length = links_df['length'].sum() - query_params = { "link_dir_list": [link['link_dir'] for link in links], "node_start": start_node, From c056b36b73f9b63baf0f868dfbb7fea6cb05ab0d Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 18:54:11 +0000 Subject: [PATCH 02/10] add median with d3-array --- frontend/package-lock.json | 22 ++++++++++++++++++++++ frontend/package.json | 1 + frontend/src/travelTimeQuery.js | 8 +++++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index aacab40..af6247c 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -12,6 +12,7 @@ "@emotion/react": "^11.10.6", "@emotion/styled": "^11.10.6", "@mui/material": "5.x", + "d3-array": "^3.2.4", "express": "^4.18.2", "maplibre-gl": "4.7.x", "p-queue": "^8.0.1", @@ -3731,6 +3732,18 @@ "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==" }, + "node_modules/d3-array": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-3.2.4.tgz", + "integrity": "sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==", + "license": "ISC", + "dependencies": { + "internmap": "1 - 2" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/data-view-buffer": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz", @@ -5499,6 +5512,15 @@ "node": ">= 0.4" } }, + "node_modules/internmap": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/internmap/-/internmap-2.0.3.tgz", + "integrity": "sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, "node_modules/interpret": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/interpret/-/interpret-2.2.0.tgz", diff --git a/frontend/package.json b/frontend/package.json index 7ec08d3..ea8f708 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -44,6 +44,7 @@ "@emotion/react": "^11.10.6", "@emotion/styled": "^11.10.6", "@mui/material": "5.x", + "d3-array": "^3.2.4", "express": "^4.18.2", "maplibre-gl": "4.7.x", "p-queue": "^8.0.1", diff --git a/frontend/src/travelTimeQuery.js b/frontend/src/travelTimeQuery.js index 9048ba2..4e30341 100644 --- a/frontend/src/travelTimeQuery.js +++ b/frontend/src/travelTimeQuery.js @@ -1,4 +1,5 @@ import { domain } from './domain.js' +import { quantile } from 'd3-array' export class TravelTimeQuery { #corridor @@ -80,15 +81,20 @@ export class TravelTimeQuery { record.set('hoursInRange', this.hoursInRange) 
record.set('mean_travel_time_minutes', this.#results?.travel_time?.minutes) record.set('mean_travel_time_seconds', this.#results?.travel_time?.seconds) - // minimum and maximum travel time observations + // other stats record.set( 'max_travel_time_seconds', Math.max(...this.#results.observations.map(o => o.seconds)) ) + record.set( + 'median_travel_time_seconds', + quantile([...this.#results.observations.map(o => o.seconds)], 0.5) + ) record.set( 'min_travel_time_seconds', Math.min(...this.#results.observations.map(o => o.seconds)) ) + // turning these off in the frontend until they're ready for production //record.set('moe_lower_p95', this.#results?.confidence?.intervals?.['p=0.95']?.lower?.seconds) //record.set('moe_upper_p95', this.#results?.confidence?.intervals?.['p=0.95']?.upper?.seconds) From bfc22befcf2e8419077a0ebccd4aca509e576e44 Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 18:57:10 +0000 Subject: [PATCH 03/10] 85th percentile --- frontend/src/travelTimeQuery.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frontend/src/travelTimeQuery.js b/frontend/src/travelTimeQuery.js index 4e30341..c6aa800 100644 --- a/frontend/src/travelTimeQuery.js +++ b/frontend/src/travelTimeQuery.js @@ -90,6 +90,10 @@ export class TravelTimeQuery { 'median_travel_time_seconds', quantile([...this.#results.observations.map(o => o.seconds)], 0.5) ) + record.set( + 'p85_travel_time_seconds', + quantile([...this.#results.observations.map(o => o.seconds)], 0.85) + ) record.set( 'min_travel_time_seconds', Math.min(...this.#results.observations.map(o => o.seconds)) From bea7c538e7c2053545961bbf55306d46ce0bc00d Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 19:04:55 +0000 Subject: [PATCH 04/10] add corridor length in meters --- frontend/src/travelTimeQuery.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/frontend/src/travelTimeQuery.js b/frontend/src/travelTimeQuery.js index c6aa800..778b999 100644 --- a/frontend/src/travelTimeQuery.js +++ b/frontend/src/travelTimeQuery.js @@ -98,7 +98,11 @@ export class TravelTimeQuery { 'min_travel_time_seconds', Math.min(...this.#results.observations.map(o => o.seconds)) ) - + record.set( + 'corridor_length_meters', + this.#corridor.links.reduce((a,l)=>a+l.length_m, 0) + ) + // turning these off in the frontend until they're ready for production //record.set('moe_lower_p95', this.#results?.confidence?.intervals?.['p=0.95']?.lower?.seconds) //record.set('moe_upper_p95', this.#results?.confidence?.intervals?.['p=0.95']?.upper?.seconds) From 6ca007ea268ea29e3e47d18e5aa7dd3c5ab0ca92 Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 19:09:25 +0000 Subject: [PATCH 05/10] count pseudo observations --- frontend/src/travelTimeQuery.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frontend/src/travelTimeQuery.js b/frontend/src/travelTimeQuery.js index 778b999..d47eb10 100644 --- a/frontend/src/travelTimeQuery.js +++ b/frontend/src/travelTimeQuery.js @@ -82,6 +82,10 @@ export class TravelTimeQuery { record.set('mean_travel_time_minutes', this.#results?.travel_time?.minutes) record.set('mean_travel_time_seconds', this.#results?.travel_time?.seconds) // other stats + record.set( + 'num_pseudo_obs', + this.#results.observations.length + ) record.set( 'max_travel_time_seconds', Math.max(...this.#results.observations.map(o => o.seconds)) From a759988f272895fe88e7e2986a7449c4b7508d27 Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 19:36:20 +0000 Subject: [PATCH 06/10] be explicit 
about order

---
 backend/app/get_travel_time.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 8b2fdb5..2c6211c 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -125,9 +125,8 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
     )
     connection.close()
 
-
     # create custom binning
-    bins = make_bins(links_df, link_speeds_df)
+    bins = make_bins(links_df, link_speeds_df, end_time)
 
     # handle the case where there are no observations; return early.
     if link_speeds_df.empty or len(bins) == 0:
@@ -226,7 +225,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
         }
     },cacheURI)
 
-def make_bins(links_df, link_speeds_df):
+def make_bins(links_df, link_speeds_df, end_time):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
     # start with an empty set of links
@@ -234,7 +233,8 @@ def make_bins(links_df, link_speeds_df):
     bin_ends = list()
     total_length = links_df['length'].sum()
     minimum_length = 0.8 * total_length
-    for tx in link_speeds_df.tx.unique():
+    # iterate over time bins with data, in order of occurrence
+    for tx in link_speeds_df.tx.unique().sort_values():
         # add links one bin at a time
         five_min_bin = link_speeds_df[link_speeds_df['tx']==tx]
         links.update(five_min_bin.link_dir.unique())

From 1dc38323d82f6e9c5c9c55ecc4153d22abc05dbf Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Wed, 26 Feb 2025 21:20:11 +0000
Subject: [PATCH 07/10] remove unused arg, do sorting properly

---
 backend/app/get_travel_time.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 2c6211c..81ad79e 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -126,7 +126,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
     connection.close()
 
     # create custom binning
-    bins = make_bins(links_df, link_speeds_df, end_time)
+    bins = make_bins(links_df, link_speeds_df)
 
     # handle the case where there are no observations; return early.
if link_speeds_df.empty or len(bins) == 0:
@@ -225,7 +225,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
         }
     },cacheURI)
 
-def make_bins(links_df, link_speeds_df, end_time):
+def make_bins(links_df, link_speeds_df):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
     # start with an empty set of links
@@ -234,7 +234,7 @@ def make_bins(links_df, link_speeds_df):
     bin_ends = list()
     total_length = links_df['length'].sum()
     minimum_length = 0.8 * total_length
     # iterate over time bins with data, in order of occurrence
-    for tx in link_speeds_df.tx.unique().sort_values():
+    for tx in sorted(list(link_speeds_df.tx.unique())):
         # add links one bin at a time
         five_min_bin = link_speeds_df[link_speeds_df['tx']==tx]
         links.update(five_min_bin.link_dir.unique())

From 3d9f0516ea730e9d61efdeb55e9d5f71c26ee560 Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Wed, 26 Feb 2025 21:36:56 +0000
Subject: [PATCH 08/10] refactor to track links per 5min bin

---
 backend/app/get_travel_time.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 81ad79e..73fd06f 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -228,22 +228,25 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
 def make_bins(links_df, link_speeds_df):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
-    # start with an empty set of links
-    links = set()
+    # start with empty list of bins, defined by their ends
     bin_ends = list()
-    total_length = links_df['length'].sum()
-    minimum_length = 0.8 * total_length
-    # iterate over time bins with data, in order of occurrence
+    minimum_length = 0.8 * links_df['length'].sum()
+
+    links_per_5mbin = {}
+    # iterate over 5-min time bins with data, in chronological order
     for tx in sorted(list(link_speeds_df.tx.unique())):
         # add links one bin at a time
-        five_min_bin = link_speeds_df[link_speeds_df['tx']==tx]
-        links.update(five_min_bin.link_dir.unique())
-        # measure the length of links in the set
+        bin5m = link_speeds_df[link_speeds_df['tx']==tx]
+        # get the distinct links in this 5-minute bin
+        links_per_5mbin[tx] = bin5m.link_dir.unique()
+        # get all the links in all the 5m bins so far
+        links = set( link for linklist in links_per_5mbin.values() for link in linklist )
+        # measure the length of links in that set
         length_so_far = links_df.loc[list(links),'length'].sum()
-        # define length threshold
+        # compare against length threshold
         if length_so_far >= minimum_length:
             bin_ends.append(tx)
-            links = set() # reset
+            links_per_5mbin = {}
         else:
             pass
     return bin_ends

From 56755a4a6c94310850b7a1898f5175281e2f877d Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Wed, 26 Feb 2025 22:21:10 +0000
Subject: [PATCH 09/10] partially implement 1h rolling window approach

still need to define bins also by their starts

---
 backend/app/get_travel_time.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 73fd06f..e741ef0 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -73,7 +73,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
             SELECT
                 link_dir,
                 dt::text,
-                tx,
+                EXTRACT('epoch' FROM tx)::bigint AS tx,
                 mean::real AS speed_kmph
             FROM here.ta
             WHERE
@@ -229,21 +229,30 @@ def make_bins(links_df, link_speeds_df):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
     # start with empty list of bins, defined by their ends
+    # TODO: define bin starts as well
     bin_ends = list()
     minimum_length = 0.8 * links_df['length'].sum()
 
     links_per_5mbin = {}
     # iterate over 5-min time bins with data, in chronological order
     for tx in sorted(list(link_speeds_df.tx.unique())):
-        # add links one bin at a time
+        # get the data for this 5min bin
         bin5m = link_speeds_df[link_speeds_df['tx']==tx]
-        # get the distinct links in this 5-minute bin
+        # add the distinct links from this 5-minute bin
         links_per_5mbin[tx] = bin5m.link_dir.unique()
+        # in case data is very sparse, drop observations more than one hour
+        # prior to the current 5min bin - this window is moving!
+        keys_to_drop = []
+        for tx_key in links_per_5mbin.keys():
+            if tx - tx_key >= 3600: # seconds, i.e. 1 hour
+                keys_to_drop.append(tx_key)
+        for tx_key in keys_to_drop:
+            del links_per_5mbin[tx_key]
         # get all the links in all the 5m bins so far
         links = set( link for linklist in links_per_5mbin.values() for link in linklist )
         # measure the length of links in that set
         length_so_far = links_df.loc[list(links),'length'].sum()
-        # compare against length threshold
+        # compare against length threshold; if met, end the bin
         if length_so_far >= minimum_length:
             bin_ends.append(tx)
             links_per_5mbin = {}

From c5f84110bf99e554d563ca58baaf9e6a5bdc073e Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Thu, 27 Feb 2025 16:34:38 +0000
Subject: [PATCH 10/10] join by bin end AND start times

---
 backend/app/get_travel_time.py | 44 +++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index e741ef0..8321dfa 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -11,6 +11,10 @@ import json
 
 from app.getGitHash import getGitHash
 
+# this configures a setting (globally?)
and removes a warning message
+# https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html#copy-on-write
+pandas.options.mode.copy_on_write = True
+
 # the way we currently do it
 def mean_daily_mean(obs):
     # group the observations by date
@@ -143,16 +147,28 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
             'query_params': query_params
         }
     }
-    # rolling join of bins to data
-    link_speeds_df = pandas.merge_asof(
+    # Pandas has no direct inequality (range) join, so we start by CROSS
+    # joining the bins to the data, though this is memory-inefficient
+    link_speeds_df = pandas.merge(
         link_speeds_df,
-        pandas.DataFrame({'tx': bins,'bin':bins}),
-        on='tx',
-        direction='forward'
-    ).set_index('link_dir')
+        pandas.DataFrame({
+            'bin': [ bin['id'] for bin in bins ],
+            'bin_first_tx': [ bin['first'] for bin in bins ],
+            'bin_last_tx': [ bin['last'] for bin in bins ]
+        }),
+        how='cross'
+    )
+    # Now we filter out the records that don't align with their bins
+    link_speeds_df = link_speeds_df.query(
+        'tx >= bin_first_tx & tx <= bin_last_tx'
+    )
     # drop column used only for binning
-    link_speeds_df.drop('tx',axis='columns',inplace=True)
-
+    link_speeds_df.drop(
+        ['tx', 'bin_first_tx', 'bin_last_tx'],
+        axis='columns',
+        inplace=True
+    )
+    link_speeds_df = link_speeds_df.set_index('link_dir')
     # join previously queried link lengths
     link_speeds_df = link_speeds_df.join(links_df)
     # calculate link travel times from speed and length (in seconds)
@@ -230,10 +246,11 @@ def make_bins(links_df, link_speeds_df):
     of links, by length, have observations."""
     # start with empty list of bins, defined by their ends
     # TODO: define bin starts as well
-    bin_ends = list()
+    bins = list()
     minimum_length = 0.8 * links_df['length'].sum()
 
     links_per_5mbin = {}
+    bin_counter = 1
     # iterate over 5-min time bins with data, in chronological order
     for tx in sorted(list(link_speeds_df.tx.unique())):
         # get the data for this 5min bin
@@ -254,8 +271,13 @@ def make_bins(links_df, link_speeds_df):
         length_so_far = links_df.loc[list(links),'length'].sum()
         # compare against length threshold; if met, end the bin
         if length_so_far >= minimum_length:
-            bin_ends.append(tx)
+            bins.append({
+                'id': bin_counter,
+                'first': min(links_per_5mbin.keys()),
+                'last': tx # equivalently, max(links_per_5mbin.keys())
+            })
+            bin_counter += 1
             links_per_5mbin = {}
         else:
             pass
-    return bin_ends
+    return bins
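
A quick way to sanity-check the final make_bins() behaviour after applying the
series. This is only an illustrative sketch, not part of any patch: it assumes
the patched module is importable as app.get_travel_time (as its other imports
suggest), and the link_dir values, lengths, and timestamps below are invented.

    import pandas
    from app.get_travel_time import make_bins  # assumes patches 01-10 applied

    # a synthetic 1000m corridor of three links, indexed by link_dir
    links_df = pandas.DataFrame({
        'link_dir': ['101F', '102F', '103F'],
        'length': [100.0, 200.0, 700.0]
    }).set_index('link_dir')

    # speed observations in three 5-minute bins; tx is epoch seconds,
    # matching the EXTRACT('epoch' FROM tx)::bigint cast from PATCH 09
    link_speeds_df = pandas.DataFrame({
        'link_dir': ['101F', '102F', '103F', '101F', '103F'],
        'tx': [0, 0, 300, 600, 600],
        'speed_kmph': [40.0, 35.0, 50.0, 45.0, 55.0]
    })

    # the threshold is 80% of 1000m = 800m of links with observations:
    # tx=0 covers only 300m, so no bin closes yet; tx=300 brings coverage
    # to 1000m, closing bin 1; tx=600 alone covers 800m, closing bin 2
    print(make_bins(links_df, link_speeds_df))
    # [{'id': 1, 'first': 0, 'last': 300}, {'id': 2, 'first': 600, 'last': 600}]

Note that each bin's 'first' and 'last' are inclusive 5-minute bin start times,
which is exactly what the cross join in PATCH 10 filters on
('tx >= bin_first_tx & tx <= bin_last_tx').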