From 01f3f4ddc7dad39256ea3892bd91494b5b2e847f Mon Sep 17 00:00:00 2001 From: Nate Wessel Date: Tue, 28 Jan 2025 09:31:15 -0500 Subject: [PATCH 01/10] remove duplicated lines thanks to Gabe for spotting this! --- backend/app/get_travel_time.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py index 070030b..8b2fdb5 100644 --- a/backend/app/get_travel_time.py +++ b/backend/app/get_travel_time.py @@ -104,13 +104,6 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_ total_corridor_length = links_df['length'].sum() - links_df = pandas.DataFrame({ - 'link_dir': [l['link_dir'] for l in links], - 'length': [l['length_m'] for l in links] - }).set_index('link_dir') - - total_corridor_length = links_df['length'].sum() - query_params = { "link_dir_list": [link['link_dir'] for link in links], "node_start": start_node, From c056b36b73f9b63baf0f868dfbb7fea6cb05ab0d Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 18:54:11 +0000 Subject: [PATCH 02/10] add median with d3-array --- frontend/package-lock.json | 22 ++++++++++++++++++++++ frontend/package.json | 1 + frontend/src/travelTimeQuery.js | 8 +++++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index aacab40..af6247c 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -12,6 +12,7 @@ "@emotion/react": "^11.10.6", "@emotion/styled": "^11.10.6", "@mui/material": "5.x", + "d3-array": "^3.2.4", "express": "^4.18.2", "maplibre-gl": "4.7.x", "p-queue": "^8.0.1", @@ -3731,6 +3732,18 @@ "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==" }, + "node_modules/d3-array": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-3.2.4.tgz", + "integrity": "sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==", + "license": "ISC", + "dependencies": { + "internmap": "1 - 2" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/data-view-buffer": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz", @@ -5499,6 +5512,15 @@ "node": ">= 0.4" } }, + "node_modules/internmap": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/internmap/-/internmap-2.0.3.tgz", + "integrity": "sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, "node_modules/interpret": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/interpret/-/interpret-2.2.0.tgz", diff --git a/frontend/package.json b/frontend/package.json index 7ec08d3..ea8f708 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -44,6 +44,7 @@ "@emotion/react": "^11.10.6", "@emotion/styled": "^11.10.6", "@mui/material": "5.x", + "d3-array": "^3.2.4", "express": "^4.18.2", "maplibre-gl": "4.7.x", "p-queue": "^8.0.1", diff --git a/frontend/src/travelTimeQuery.js b/frontend/src/travelTimeQuery.js index 9048ba2..4e30341 100644 --- a/frontend/src/travelTimeQuery.js +++ b/frontend/src/travelTimeQuery.js @@ -1,4 +1,5 @@ import { domain } from './domain.js' +import { quantile } from 'd3-array' export class TravelTimeQuery { #corridor @@ -80,15 +81,20 @@ export class TravelTimeQuery { record.set('hoursInRange', this.hoursInRange) 
record.set('mean_travel_time_minutes', this.#results?.travel_time?.minutes) record.set('mean_travel_time_seconds', this.#results?.travel_time?.seconds) - // minimum and maximum travel time observations + // other stats record.set( 'max_travel_time_seconds', Math.max(...this.#results.observations.map(o => o.seconds)) ) + record.set( + 'median_travel_time_seconds', + quantile([...this.#results.observations.map(o => o.seconds)], 0.5) + ) record.set( 'min_travel_time_seconds', Math.min(...this.#results.observations.map(o => o.seconds)) ) + // turning these off in the frontend until they're ready for production //record.set('moe_lower_p95', this.#results?.confidence?.intervals?.['p=0.95']?.lower?.seconds) //record.set('moe_upper_p95', this.#results?.confidence?.intervals?.['p=0.95']?.upper?.seconds) From bfc22befcf2e8419077a0ebccd4aca509e576e44 Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 18:57:10 +0000 Subject: [PATCH 03/10] 85th percentile --- frontend/src/travelTimeQuery.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frontend/src/travelTimeQuery.js b/frontend/src/travelTimeQuery.js index 4e30341..c6aa800 100644 --- a/frontend/src/travelTimeQuery.js +++ b/frontend/src/travelTimeQuery.js @@ -90,6 +90,10 @@ export class TravelTimeQuery { 'median_travel_time_seconds', quantile([...this.#results.observations.map(o => o.seconds)], 0.5) ) + record.set( + 'p85_travel_time_seconds', + quantile([...this.#results.observations.map(o => o.seconds)], 0.85) + ) record.set( 'min_travel_time_seconds', Math.min(...this.#results.observations.map(o => o.seconds)) From bea7c538e7c2053545961bbf55306d46ce0bc00d Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 19:04:55 +0000 Subject: [PATCH 04/10] add corridor length in meters --- frontend/src/travelTimeQuery.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/frontend/src/travelTimeQuery.js b/frontend/src/travelTimeQuery.js index c6aa800..778b999 100644 --- a/frontend/src/travelTimeQuery.js +++ b/frontend/src/travelTimeQuery.js @@ -98,7 +98,11 @@ export class TravelTimeQuery { 'min_travel_time_seconds', Math.min(...this.#results.observations.map(o => o.seconds)) ) - + record.set( + 'corridor_length_meters', + this.#corridor.links.reduce((a,l)=>a+l.length_m, 0) + ) + // turning these off in the frontend until they're ready for production //record.set('moe_lower_p95', this.#results?.confidence?.intervals?.['p=0.95']?.lower?.seconds) //record.set('moe_upper_p95', this.#results?.confidence?.intervals?.['p=0.95']?.upper?.seconds) From 6ca007ea268ea29e3e47d18e5aa7dd3c5ab0ca92 Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 19:09:25 +0000 Subject: [PATCH 05/10] count pseudo observations --- frontend/src/travelTimeQuery.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frontend/src/travelTimeQuery.js b/frontend/src/travelTimeQuery.js index 778b999..d47eb10 100644 --- a/frontend/src/travelTimeQuery.js +++ b/frontend/src/travelTimeQuery.js @@ -82,6 +82,10 @@ export class TravelTimeQuery { record.set('mean_travel_time_minutes', this.#results?.travel_time?.minutes) record.set('mean_travel_time_seconds', this.#results?.travel_time?.seconds) // other stats + record.set( + 'num_pseudo_obs', + this.#results.observations.length + ) record.set( 'max_travel_time_seconds', Math.max(...this.#results.observations.map(o => o.seconds)) From a759988f272895fe88e7e2986a7449c4b7508d27 Mon Sep 17 00:00:00 2001 From: Nate-Wessel Date: Wed, 26 Feb 2025 19:36:20 +0000 Subject: [PATCH 06/10] be explicit 
about order

---
 backend/app/get_travel_time.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 8b2fdb5..2c6211c 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -125,9 +125,8 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
     )
     connection.close()
 
-
     # create custom binning
-    bins = make_bins(links_df, link_speeds_df)
+    bins = make_bins(links_df, link_speeds_df, end_time)
 
     # handle the case where there are no observations; return early.
     if link_speeds_df.empty or len(bins) == 0:
@@ -226,7 +225,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
         }
     },cacheURI)
 
-def make_bins(links_df, link_speeds_df):
+def make_bins(links_df, link_speeds_df, end_time):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
     # start with an empty set of links
@@ -234,7 +233,8 @@ def make_bins(links_df, link_speeds_df):
     bin_ends = list()
     total_length = links_df['length'].sum()
     minimum_length = 0.8 * total_length
-    for tx in link_speeds_df.tx.unique():
+    # iterate over time bins with data, in order of occurrence
+    for tx in link_speeds_df.tx.unique().sort_values():
         # add links one bin at a time
         five_min_bin = link_speeds_df[link_speeds_df['tx']==tx]
         links.update(five_min_bin.link_dir.unique())

From 1dc38323d82f6e9c5c9c55ecc4153d22abc05dbf Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Wed, 26 Feb 2025 21:20:11 +0000
Subject: [PATCH 07/10] remove unused arg, do sorting properly

---
 backend/app/get_travel_time.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 2c6211c..81ad79e 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -126,7 +126,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
     connection.close()
 
     # create custom binning
-    bins = make_bins(links_df, link_speeds_df, end_time)
+    bins = make_bins(links_df, link_speeds_df)
 
     # handle the case where there are no observations; return early.
if link_speeds_df.empty or len(bins) == 0:
@@ -225,7 +225,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
         }
     },cacheURI)
 
-def make_bins(links_df, link_speeds_df, end_time):
+def make_bins(links_df, link_speeds_df):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
     # start with an empty set of links
@@ -234,7 +234,7 @@ def make_bins(links_df, link_speeds_df):
     bin_ends = list()
     total_length = links_df['length'].sum()
     minimum_length = 0.8 * total_length
     # iterate over time bins with data, in order of occurrence
-    for tx in link_speeds_df.tx.unique().sort_values():
+    for tx in sorted(list(link_speeds_df.tx.unique())):
         # add links one bin at a time
         five_min_bin = link_speeds_df[link_speeds_df['tx']==tx]
         links.update(five_min_bin.link_dir.unique())

From 3d9f0516ea730e9d61efdeb55e9d5f71c26ee560 Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Wed, 26 Feb 2025 21:36:56 +0000
Subject: [PATCH 08/10] refactor to track links per 5min bin

---
 backend/app/get_travel_time.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 81ad79e..73fd06f 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -228,22 +228,25 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
 def make_bins(links_df, link_speeds_df):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
-    # start with an empty set of links
-    links = set()
+    # start with empty list of bins, defined by their ends
     bin_ends = list()
-    total_length = links_df['length'].sum()
-    minimum_length = 0.8 * total_length
-    # iterate over time bins with data, in order of occurrence
+    minimum_length = 0.8 * links_df['length'].sum()
+
+    links_per_5mbin = {}
+    # iterate over 5-min time bins with data, in chronological order
     for tx in sorted(list(link_speeds_df.tx.unique())):
         # add links one bin at a time
-        five_min_bin = link_speeds_df[link_speeds_df['tx']==tx]
-        links.update(five_min_bin.link_dir.unique())
-        # measure the length of links in the set
+        bin5m = link_speeds_df[link_speeds_df['tx']==tx]
+        # get the distinct links in this 5-minute bin
+        links_per_5mbin[tx] = bin5m.link_dir.unique()
+        # get all the links in all the 5m bins so far
+        links = set( link for linklist in links_per_5mbin.values() for link in linklist )
+        # measure the length of links in that set
         length_so_far = links_df.loc[list(links),'length'].sum()
-        # define length threshold
+        # compare against length threshold
         if length_so_far >= minimum_length:
             bin_ends.append(tx)
-            links = set() # reset
+            links_per_5mbin = {}
         else:
             pass
     return bin_ends

From 56755a4a6c94310850b7a1898f5175281e2f877d Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Wed, 26 Feb 2025 22:21:10 +0000
Subject: [PATCH 09/10] partially implement 1h rolling window approach

still need to define bins also by their starts

---
 backend/app/get_travel_time.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index 73fd06f..e741ef0 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -73,7 +73,7 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
             SELECT
                 link_dir,
                 dt::text,
-                tx,
+                EXTRACT('epoch' FROM tx)::bigint AS tx,
                 mean::real AS speed_kmph
             FROM here.ta
             WHERE
@@ -229,21 +229,30 @@ def make_bins(links_df, link_speeds_df):
     """Create the smallest temporal bins possible while ensuring at least 80%
     of links, by length, have observations."""
     # start with empty list of bins, defined by their ends
+    # TODO: define bin starts as well
     bin_ends = list()
     minimum_length = 0.8 * links_df['length'].sum()
 
     links_per_5mbin = {}
     # iterate over 5-min time bins with data, in chronological order
     for tx in sorted(list(link_speeds_df.tx.unique())):
-        # add links one bin at a time
+        # get the data for this 5min bin
         bin5m = link_speeds_df[link_speeds_df['tx']==tx]
-        # get the distinct links in this 5-minute bin
+        # add the distinct links from this 5-minute bin
         links_per_5mbin[tx] = bin5m.link_dir.unique()
+        # in case data is very sparse, drop observations more than one hour
+        # prior to the current 5min bin - this window is moving!
+        keys_to_drop = []
+        for tx_key in links_per_5mbin.keys():
+            if tx - tx_key >= 3600: # seconds, i.e. 1 hour
+                keys_to_drop.append(tx_key)
+        for tx_key in keys_to_drop:
+            del links_per_5mbin[tx_key]
         # get all the links in all the 5m bins so far
         links = set( link for linklist in links_per_5mbin.values() for link in linklist )
         # measure the length of links in that set
         length_so_far = links_df.loc[list(links),'length'].sum()
-        # compare against length threshold
+        # compare against length threshold; if met, end the bin
         if length_so_far >= minimum_length:
             bin_ends.append(tx)
             links_per_5mbin = {}

From c5f84110bf99e554d563ca58baaf9e6a5bdc073e Mon Sep 17 00:00:00 2001
From: Nate-Wessel
Date: Thu, 27 Feb 2025 16:34:38 +0000
Subject: [PATCH 10/10] join by bin end AND start times

---
 backend/app/get_travel_time.py | 44 +++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/backend/app/get_travel_time.py b/backend/app/get_travel_time.py
index e741ef0..8321dfa 100644
--- a/backend/app/get_travel_time.py
+++ b/backend/app/get_travel_time.py
@@ -11,6 +11,10 @@ import json
 
 from app.getGitHash import getGitHash
 
+# this configures a setting (globally?)
and removes a warning message
+# https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html#copy-on-write
+pandas.options.mode.copy_on_write = True
+
 # the way we currently do it
 def mean_daily_mean(obs):
     # group the observations by date
@@ -143,16 +147,28 @@ def get_travel_time(start_node, end_node, start_time, end_time, start_date, end_
             'query_params': query_params
         }
     }
-    # rolling join of bins to data
-    link_speeds_df = pandas.merge_asof(
+    # Pandas has no direct inequality (range) join, so we start by CROSS
+    # joining the bins to the data, though this is memory-inefficient
+    link_speeds_df = pandas.merge(
         link_speeds_df,
-        pandas.DataFrame({'tx': bins,'bin':bins}),
-        on='tx',
-        direction='forward'
-    ).set_index('link_dir')
+        pandas.DataFrame({
+            'bin': [ bin['id'] for bin in bins ],
+            'bin_first_tx': [ bin['first'] for bin in bins ],
+            'bin_last_tx': [ bin['last'] for bin in bins ]
+        }),
+        how='cross'
+    )
+    # Now we filter out the records that don't align with their bins
+    link_speeds_df = link_speeds_df.query(
+        'tx >= bin_first_tx & tx <= bin_last_tx'
+    )
     # drop column used only for binning
-    link_speeds_df.drop('tx',axis='columns',inplace=True)
-
+    link_speeds_df.drop(
+        ['tx', 'bin_first_tx', 'bin_last_tx'],
+        axis='columns',
+        inplace=True
+    )
+    link_speeds_df = link_speeds_df.set_index('link_dir')
     # join previously queried link lengths
     link_speeds_df = link_speeds_df.join(links_df)
     # calculate link travel times from speed and length (in seconds)
@@ -230,10 +246,11 @@ def make_bins(links_df, link_speeds_df):
     of links, by length, have observations."""
     # start with empty list of bins, defined by their ends
     # TODO: define bin starts as well
-    bin_ends = list()
+    bins = list()
     minimum_length = 0.8 * links_df['length'].sum()
 
     links_per_5mbin = {}
+    bin_counter = 1
     # iterate over 5-min time bins with data, in chronological order
     for tx in sorted(list(link_speeds_df.tx.unique())):
         # get the data for this 5min bin
@@ -254,8 +271,13 @@ def make_bins(links_df, link_speeds_df):
         length_so_far = links_df.loc[list(links),'length'].sum()
         # compare against length threshold; if met, end the bin
         if length_so_far >= minimum_length:
-            bin_ends.append(tx)
+            bins.append({
+                'id': bin_counter,
+                'first': min(links_per_5mbin.keys()),
+                'last': tx # equivalently, max(links_per_5mbin.keys())
+            })
+            bin_counter += 1
             links_per_5mbin = {}
         else:
             pass
-    return bin_ends
+    return bins
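
A quick way to sanity-check the final make_bins() behaviour after applying the
series. This is only an illustrative sketch, not part of any patch: it assumes
the patched module is importable as app.get_travel_time (as its other imports
suggest), and the link_dir values, lengths, and timestamps below are invented.

    import pandas
    from app.get_travel_time import make_bins  # assumes patches 01-10 applied

    # a synthetic 1000m corridor of three links, indexed by link_dir
    links_df = pandas.DataFrame({
        'link_dir': ['101F', '102F', '103F'],
        'length': [100.0, 200.0, 700.0]
    }).set_index('link_dir')

    # speed observations in three 5-minute bins; tx is epoch seconds,
    # matching the EXTRACT('epoch' FROM tx)::bigint cast from PATCH 09
    link_speeds_df = pandas.DataFrame({
        'link_dir': ['101F', '102F', '103F', '101F', '103F'],
        'tx': [0, 0, 300, 600, 600],
        'speed_kmph': [40.0, 35.0, 50.0, 45.0, 55.0]
    })

    # the threshold is 80% of 1000m = 800m of links with observations:
    # tx=0 covers only 300m, so no bin closes yet; tx=300 brings coverage
    # to 1000m, closing bin 1; tx=600 alone covers 800m, closing bin 2
    print(make_bins(links_df, link_speeds_df))
    # [{'id': 1, 'first': 0, 'last': 300}, {'id': 2, 'first': 600, 'last': 600}]

Note that each bin's 'first' and 'last' are inclusive 5-minute bin start times,
which is exactly what the cross join in PATCH 10 filters on
('tx >= bin_first_tx & tx <= bin_last_tx').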