Add Helion Regression Config (#7473)

yangw-dev · web-flow · commit 39523dd72147 · 2025-11-14T16:24:37.000-08:00
# Overview Add a regression report for Helion. # Frequency The summary report is generated daily. # What is considered a regression for Helion 1. We pick the baseline value used to judge new data: the median of the speedup data from the 4th to the 8th day back. 2. If more than 2 consecutive new data points of Helion speedup are 15% lower than the baseline value, it is considered a regression. ## Demo Regression Report https://torchci-git-addhelionregressionreport-fbopensource.vercel.app/benchmark/regression/report/ada0e5ba-874b-47ff-b76c-b281ac08d179 <img width="741" height="851" alt="image" src="https://github.com/user-attachments/assets/abcde75b-f34d-49d1-9c23-0544ac38ba37" /> ## Notification Currently we do not trigger Workplace chat notifications; this is experimental. But we do: 1. have a GitHub issue that will be used to accept regression reports as links, which can be used for notification when it is ready 2. have the Helion dashboard provide access to the regression report list, and signal a regression if any is found #7472
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -11,6 +11,58 @@
 )
 
 
+PYTORCH_HELION_CONFIG = BenchmarkConfig(
+    name="Helion Benchmark Regression",
+    id="pytorch_helion",
+    source=BenchmarkApiSource(
+        api_query_url="https://hud.pytorch.org/api/benchmark/get_time_series",
+        type="benchmark_time_series_api",
+        api_endpoint_params_template="""
+                {
+                  "name": "pytorch_helion",
+                  "query_params": {
+                    "mode": "",
+                    "branches": ["main"],
+                    "repo": "pytorch/helion",
+                    "device": "",
+                    "arch":"",
+                    "benchmarkName": "Helion Benchmark",
+                    "startTime": "{{ startTime }}",
+                    "stopTime": "{{ stopTime }}"
+                    },
+                    "response_formats":["time_series"]
+                }
+                """,
+    ),
+    hud_info={
+        "url": "https://hud.pytorch.org/benchmark/v3/dashboard/pytorch_helion",
+    },
+    # set the baseline from the past 4-8 days, and compare it with the latest 4 days
+    policy=Policy(
+        frequency=Frequency(value=1, unit="days"),
+        range=RangeConfig(
+            baseline=DayRangeWindow(value=4),
+            comparison=DayRangeWindow(value=4),
+        ),
+        metrics={
+            "helion_speedup": RegressionPolicy(
+                name="helion_speedup",
+                condition="greater_equal",
+                threshold=0.85,
+                baseline_aggregation="median",
+            ),
+        },
+        notification_config={
+            "type": "github",
+            "repo": "pytorch/test-infra",
+            "issue": "7472",
+        },
+    ),
+    report_config=ReportConfig(
+        report_level="insufficient_data",
+    ),
+)
+
 PYTORCH_OPERATOR_MICROBENCH_CONFIG = BenchmarkConfig(
     name="Pytorch Operator Microbench Regression",
     id="pytorch_operator_microbenchmark",
@@ -146,6 +198,7 @@
     configs={
         "compiler_regression": COMPILER_BENCHMARK_CONFIG,
         "pytorch_operator_microbenchmark": PYTORCH_OPERATOR_MICROBENCH_CONFIG,
+        "pytorch_helion": PYTORCH_HELION_CONFIG,
     }
 )
 
diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
@@ -77,16 +77,18 @@ class BenchmarkRegressionReport(TypedDict):
 
 
 def get_regression_status(regression_summary: BenchmarkRegressionSummary) -> str:
-    status = (
-        "regression"
-        if regression_summary.get("regression_count", 0) > 0
-        else (
-            "suspicious"
-            if regression_summary.get("suspicious_count", 0) > 0
-            else "no_regression"
-        )
-    )
-    return status
+    if regression_summary.get("regression_count", 0) > 0:
+        return "regression"
+    if regression_summary.get("suspicious_count", 0) > 0:
+        return "suspicious"
+    insufficient_data = regression_summary.get("insufficient_data_count", 0)
+    if insufficient_data > 0:
+        # `or 1` guards against a missing, None or 0 total_count (ZeroDivisionError)
+        total = regression_summary.get("total_count") or 1
+        # report insufficient_data only when it is at least 90% of total_count
+        if insufficient_data / total >= 0.9:
+            return "insufficient_data"
+    return "no_regression"
 
 
 class BenchmarkRegressionReportGenerator:
@@ -251,7 +253,19 @@ def _to_data_map(
             for d in sorted(
                 ts_group.data, key=lambda d: isoparse(d["granularity_bucket"])
             ):
+                # skip if field is not in data, or field is None
                 if field not in d:
+                    logger.warning(
+                        "[_to_data_map] field %s not found or value is undefined", field
+                    )
+                    continue
+                if d[field] is None or math.isnan(float(d[field])):
+                    logger.warning(
+                        "[_to_data_map] Skip %s with value %s with group key [%s]",
+                        field,
+                        d[field],
+                        group_keys,
+                    )
                     continue
 
                 p: BenchmarkRegressionPoint = {
diff --git a/torchci/components/benchmark_v3/configs/configurations.tsx b/torchci/components/benchmark_v3/configs/configurations.tsx
@@ -107,7 +107,7 @@ export const BENCHMARK_ID_MAPPING: Record<string, BenchmarkIdMappingItem> = {
  * @returns
  */
 export function getBenchmarkIdFromReportId(reportId: string): string {
-  return REPORT_ID_TO_BENCHMARK_ID_MAPPING[reportId] ?? "";
+  return REPORT_ID_TO_BENCHMARK_ID_MAPPING[reportId] ?? reportId;
 }
 
 export function getBenchmarkIdMappingItem(
diff --git a/torchci/components/benchmark_v3/pages/BenchmarkListPage.tsx b/torchci/components/benchmark_v3/pages/BenchmarkListPage.tsx
@@ -12,7 +12,8 @@ export function getBenchmarkMainRouteById(id: string): string | undefined {
       }
     }
   }
-  return undefined;
+  // by default, form the v3 route to dashboard page
+  return `/benchmark/v3/dashboard/${id}`;
 }
 
 export function benchmarkCategoryCardToNavGroup(

Original file line number	Diff line number	Diff line change
`@@ -107,7 +107,7 @@ export const BENCHMARK_ID_MAPPING: Record<string, BenchmarkIdMappingItem> = {`
`107`	`107`	`* @returns`
`108`	`108`	`*/`
`109`	`109`	`export function getBenchmarkIdFromReportId(reportId: string): string {`
`110`		`- return REPORT_ID_TO_BENCHMARK_ID_MAPPING[reportId] ?? "";`
	`110`	`+ return REPORT_ID_TO_BENCHMARK_ID_MAPPING[reportId] ?? reportId;`
`111`	`111`	`}`
`112`	`112`
`113`	`113`	`export function getBenchmarkIdMappingItem(`
Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,8 @@ export function getBenchmarkMainRouteById(id: string): string \| undefined {`
`12`	`12`	`}`
`13`	`13`	`}`
`14`	`14`	`}`
`15`		`- return undefined;`
	`15`	`+ // by default, form the v3 route to dashboard page`
	`16`	+ return `/benchmark/v3/dashboard/${id}`;
`16`	`17`	`}`
`17`	`18`
`18`	`19`	`export function benchmarkCategoryCardToNavGroup(`