Skip to content

Commit 9571290

Browse files
authored
Add notification for PyTorch Operator Microbenchmark (#7447)
## Overview Add notification for the PyTorch operator microbenchmark. Add `median` as an option for choosing the median value as the baseline. Data visualization example: https://hud.pytorch.org/benchmark/regression/report/acfae3e8-7680-403b-a234-79e5c194f4c0 ## Bug Fix 1. [Notification] return skip when the timeseries from the API response is an empty list, not the class object 2. [API] fix emptyTimeSeriesResponse so its format matches that of the data response ## Next step Add a search filter for the regression report
1 parent 775426e commit 9571290

File tree

4 files changed

+80
-15
lines changed

4 files changed

+80
-15
lines changed

aws/lambda/benchmark_regression_summary_report/common/config.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,57 @@
1111
)
1212

1313

14+
PYTORCH_OPERATOR_MICROBENCH_CONFIG = BenchmarkConfig(
15+
name="Pytorch Operator Microbench Regression",
16+
id="pytorch_operator_microbenchmark",
17+
source=BenchmarkApiSource(
18+
api_query_url="https://hud.pytorch.org/api/benchmark/get_time_series",
19+
type="benchmark_time_series_api",
20+
api_endpoint_params_template="""
21+
{
22+
"name": "pytorch_operator_microbenchmark",
23+
"query_params": {
24+
"mode": "",
25+
"branches": ["main"],
26+
"repo": "pytorch/pytorch",
27+
"device": "",
28+
"benchmarkName": "PyTorch operator microbenchmark",
29+
"startTime": "{{ startTime }}",
30+
"stopTime": "{{ stopTime }}"
31+
},
32+
"response_formats":["time_series"]
33+
}
34+
""",
35+
),
36+
hud_info={
37+
"url": "https://hud.pytorch.org/benchmark/v3/dashboard/pytorch_operator_microbenchmark",
38+
},
39+
# set baseline from past 4-8 days, and compare with the latest 4 days
40+
policy=Policy(
41+
frequency=Frequency(value=1, unit="days"),
42+
range=RangeConfig(
43+
baseline=DayRangeWindow(value=4),
44+
comparison=DayRangeWindow(value=4),
45+
),
46+
metrics={
47+
"latency": RegressionPolicy(
48+
name="latency",
49+
condition="greater_equal",
50+
threshold=0.85,
51+
baseline_aggregation="median",
52+
),
53+
},
54+
notification_config={
55+
"type": "github",
56+
"repo": "pytorch/test-infra",
57+
"issue": "7445",
58+
},
59+
),
60+
report_config=ReportConfig(
61+
report_level="insufficient_data",
62+
),
63+
)
64+
1465
# Compiler benchmark regression config
1566
# todo(elainewy): eventually each team should configure
1667
# their own benchmark regression config, currently place
@@ -94,6 +145,7 @@
94145
BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook(
95146
configs={
96147
"compiler_regression": COMPILER_BENCHMARK_CONFIG,
148+
"pytorch_operator_microbenchmark": PYTORCH_OPERATOR_MICROBENCH_CONFIG,
97149
}
98150
)
99151

aws/lambda/benchmark_regression_summary_report/common/regression_utils.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import datetime as dt
22
import logging
3+
import math
4+
import statistics
35
from typing import Any, Counter, Dict, List, Literal, Optional, TypedDict
46

57
from common.benchmark_time_series_api_model import (
@@ -78,9 +80,11 @@ def get_regression_status(regression_summary: BenchmarkRegressionSummary) -> str
7880
status = (
7981
"regression"
8082
if regression_summary.get("regression_count", 0) > 0
81-
else "suspicious"
82-
if regression_summary.get("suspicious_count", 0) > 0
83-
else "no_regression"
83+
else (
84+
"suspicious"
85+
if regression_summary.get("suspicious_count", 0) > 0
86+
else "no_regression"
87+
)
8488
)
8589
return status
8690

@@ -274,10 +278,13 @@ def _get_baseline(
274278
calculate the baseline value based on the mode
275279
mode: mean, p90, max, min, target, p50, p95
276280
"""
277-
items = [d for d in data["values"] if field in d]
281+
items = [
282+
d
283+
for d in data["values"]
284+
if field in d and d[field] is not None and not math.isnan(float(d[field]))
285+
]
278286
if not items:
279287
return None
280-
281288
if mode == "max":
282289
baseline_obj = max(items, key=lambda d: float(d[field]))
283290
elif mode == "min":
@@ -286,10 +293,12 @@ def _get_baseline(
286293
baseline_obj = items[-1]
287294
elif mode == "earliest":
288295
baseline_obj = items[0]
296+
elif mode == "median":
297+
median_val = statistics.median([float(d[field]) for d in items])
298+
baseline_obj = min(items, key=lambda d: abs(float(d[field]) - median_val))
289299
else:
290300
logger.warning("Unknown mode: %s", mode)
291301
return None
292-
293302
result: BaselineResult = {
294303
"group_info": data["group_info"],
295304
"value": float(baseline_obj[field]),

aws/lambda/benchmark_regression_summary_report/lambda_function.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,13 +137,14 @@ def process(
137137
)
138138

139139
target, ls, le = self.get_target(config, self.end_time)
140-
if not target:
140+
if not target.time_series:
141141
self.log_info(
142142
f"no target data found for time range [{ls},{le}] with frequency {report_freq.get_text()}..."
143143
)
144144
return
145145
baseline, bs, be = self.get_baseline(config, self.end_time)
146-
if not baseline:
146+
147+
if not baseline.time_series:
147148
self.log_info(
148149
f"no baseline data found for time range [{bs},{be}] with frequency {report_freq.get_text()}..."
149150
)
@@ -255,7 +256,7 @@ def _fetch_from_benchmark_ts_api(
255256
}
256257
)
257258
url = source.api_query_url
258-
259+
self.log_info(f"query peek: {query}")
259260
self.log_info(f"trying to call {url}")
260261
t0 = time.perf_counter()
261262
try:

torchci/lib/benchmark/api_helper/backend/common/utils.ts

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -217,12 +217,14 @@ export function toTimeSeriesResponse(
217217
export function emptyTimeSeriesResponse() {
218218
return {
219219
total_rows: 0,
220-
time_series: [],
221-
table: [],
222220
time_range: {
223221
start: new Date().toISOString(),
224222
end: new Date().toISOString(),
225223
},
224+
data: {
225+
time_series: [],
226+
table: [],
227+
},
226228
};
227229
}
228230

@@ -265,6 +267,7 @@ export function to_time_series_data(
265267
diffs.push({
266268
key: `${key}___${sub_key}`,
267269
data: item.data,
270+
length: item.data.length,
268271
});
269272
}
270273
return item.data[0];
@@ -286,11 +289,11 @@ export function to_time_series_data(
286289
console.log(
287290
`we detected multiple datapoints for the same group keys ${
288291
diffs.length
289-
}, peak first on \n ${JSON.stringify(
290-
diffs[0].key
291-
)}, \n Data1: ${JSON.stringify(
292+
}, peak first on \n ${JSON.stringify(diffs[0].key)},\n duplicates ${
293+
diffs[0].length
294+
} \n Data1: ${JSON.stringify(
292295
diffs[0].data[0]
293-
)}, Data:2 ${JSON.stringify(diffs[0].data[1])}`
296+
)},\n Data:2 ${JSON.stringify(diffs[0].data[1])}`
294297
);
295298
}
296299
return result;

0 commit comments

Comments
 (0)