Skip to content

Commit 843f88e

Browse files
authored
[vLLM] Add job groups filtering (#7384)
This PR adds the ability to filter by the "AMD" and "Torch Nightly" job groups. It also includes some minor fixes: (1) better visibility in light mode; (2) toggleable legends, so unwanted data points can be filtered out; (3) useful links to Codecov and Buildkite at the top of the dashboard.
1 parent ee53c82 commit 843f88e

File tree

20 files changed

+513
-130
lines changed

20 files changed

+513
-130
lines changed

torchci/clickhouse_queries/vllm/ci_reliability/params.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44
"repo": "String",
55
"pipelineName": "String",
66
"startTime": "DateTime64(3)",
7-
"stopTime": "DateTime64(3)"
7+
"stopTime": "DateTime64(3)",
8+
"jobGroups": "Array(String)"
89
},
910
"tests": [
1011
{
1112
"granularity": "day",
1213
"repo": "https://github.com/vllm-project/vllm.git",
1314
"pipelineName": "CI",
1415
"startTime": "2025-09-26T00:00:00.000",
15-
"stopTime": "2025-10-03T00:00:00.000"
16+
"stopTime": "2025-10-03T00:00:00.000",
17+
"jobGroups": ["amd", "torch_nightly", "main"]
1618
}
1719
]
1820
}

torchci/clickhouse_queries/vllm/ci_reliability/query.sql

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
-- Daily breakdown of build states (passed, failed, canceled)
44
-- Accounts for soft failures: builds with only soft failures count as successful
55
-- Only tracks main branch to exclude work-in-progress PR noise
6+
-- Supports filtering by job groups: AMD, Torch Nightly, or Main
67

78
WITH build_jobs AS (
89
SELECT
@@ -29,6 +30,30 @@ WITH build_jobs AS (
2930
AND tupleElement(build, 'started_at') IS NOT NULL
3031
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
3132
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
33+
-- Job group filtering: AMD, Torch Nightly, or Main
34+
AND (
35+
(
36+
has({jobGroups: Array(String)}, 'amd')
37+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD')
38+
> 0
39+
)
40+
OR (
41+
has({jobGroups: Array(String)}, 'torch_nightly')
42+
AND positionCaseInsensitive(
43+
tupleElement(job, 'name'), 'Torch Nightly'
44+
)
45+
> 0
46+
)
47+
OR (
48+
has({jobGroups: Array(String)}, 'main')
49+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD')
50+
= 0
51+
AND positionCaseInsensitive(
52+
tupleElement(job, 'name'), 'Torch Nightly'
53+
)
54+
= 0
55+
)
56+
)
3257
),
3358

3459
builds AS (

torchci/clickhouse_queries/vllm/job_reliability/params.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44
"pipelineName": "String",
55
"startTime": "DateTime64(3)",
66
"stopTime": "DateTime64(3)",
7-
"minRuns": "UInt32"
7+
"minRuns": "UInt32",
8+
"jobGroups": "Array(String)"
89
},
910
"tests": [
1011
{
1112
"repo": "https://github.com/vllm-project/vllm.git",
1213
"pipelineName": "CI",
1314
"startTime": "2025-09-26T00:00:00.000",
1415
"stopTime": "2025-10-03T00:00:00.000",
15-
"minRuns": 3
16+
"minRuns": 3,
17+
"jobGroups": ["amd", "torch_nightly", "main"]
1618
}
1719
]
1820
}

torchci/clickhouse_queries/vllm/job_reliability/query.sql

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
-- Computes success rate for each individual job in the CI pipeline
33
-- Shows which jobs are most/least reliable
44
-- Only tracks main branch to exclude work-in-progress PR noise
5+
-- Supports filtering by job groups: AMD, Torch Nightly, or Main
56

67
WITH jobs AS (
78
SELECT
@@ -20,6 +21,30 @@ WITH jobs AS (
2021
AND tupleElement(job, 'finished_at') IS NOT NULL
2122
AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3) }
2223
AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3) }
24+
-- Job group filtering: AMD, Torch Nightly, or Main
25+
AND (
26+
(
27+
has({jobGroups: Array(String)}, 'amd')
28+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD')
29+
> 0
30+
)
31+
OR (
32+
has({jobGroups: Array(String)}, 'torch_nightly')
33+
AND positionCaseInsensitive(
34+
tupleElement(job, 'name'), 'Torch Nightly'
35+
)
36+
> 0
37+
)
38+
OR (
39+
has({jobGroups: Array(String)}, 'main')
40+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD')
41+
= 0
42+
AND positionCaseInsensitive(
43+
tupleElement(job, 'name'), 'Torch Nightly'
44+
)
45+
= 0
46+
)
47+
)
2348
),
2449

2550
job_stats AS (

torchci/clickhouse_queries/vllm/job_retry_stats/params.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44
"pipelineName": "String",
55
"startTime": "DateTime64(3)",
66
"stopTime": "DateTime64(3)",
7-
"minRuns": "UInt32"
7+
"minRuns": "UInt32",
8+
"jobGroups": "Array(String)"
89
},
910
"tests": [
1011
{
1112
"repo": "https://github.com/vllm-project/vllm.git",
1213
"pipelineName": "CI",
1314
"startTime": "2025-09-22T00:00:00.000",
1415
"stopTime": "2025-09-29T00:00:00.000",
15-
"minRuns": 5
16+
"minRuns": 5,
17+
"jobGroups": ["amd", "torch_nightly", "main"]
1618
}
1719
]
1820
}

torchci/clickhouse_queries/vllm/job_retry_stats/query.sql

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
-- vLLM job retry statistics
22
-- Shows which jobs are retried most often
3+
-- Supports filtering by job groups: AMD, Torch Nightly, or Main
34

45
SELECT
56
tupleElement(job, 'name') AS job_name,
@@ -18,6 +19,28 @@ WHERE
1819
AND tupleElement(build, 'started_at') IS NOT null
1920
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3)}
2021
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3)}
22+
-- Job group filtering: AMD, Torch Nightly, or Main
23+
AND (
24+
(
25+
has({jobGroups: Array(String)}, 'amd')
26+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD') > 0
27+
)
28+
OR (
29+
has({jobGroups: Array(String)}, 'torch_nightly')
30+
AND positionCaseInsensitive(
31+
tupleElement(job, 'name'), 'Torch Nightly'
32+
)
33+
> 0
34+
)
35+
OR (
36+
has({jobGroups: Array(String)}, 'main')
37+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD') = 0
38+
AND positionCaseInsensitive(
39+
tupleElement(job, 'name'), 'Torch Nightly'
40+
)
41+
= 0
42+
)
43+
)
2144
GROUP BY job_name
2245
HAVING total_runs >= {minRuns: UInt32}
2346
ORDER BY retry_rate DESC, retried_count DESC

torchci/clickhouse_queries/vllm/rebuild_rate/params.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44
"pipelineName": "String",
55
"startTime": "DateTime64(3)",
66
"stopTime": "DateTime64(3)",
7-
"granularity": "String"
7+
"granularity": "String",
8+
"jobGroups": "Array(String)"
89
},
910
"tests": [
1011
{
1112
"repo": "https://github.com/vllm-project/vllm.git",
1213
"pipelineName": "CI",
1314
"startTime": "2025-09-22T00:00:00.000",
1415
"stopTime": "2025-09-29T00:00:00.000",
15-
"granularity": "day"
16+
"granularity": "day",
17+
"jobGroups": ["amd", "torch_nightly", "main"]
1618
}
1719
]
1820
}

torchci/clickhouse_queries/vllm/rebuild_rate/query.sql

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
-- vLLM job retry rate metrics
22
-- Tracks how often jobs are retried (indicates flaky tests or infrastructure issues)
3+
-- Supports filtering by job groups: AMD, Torch Nightly, or Main
34

45
WITH jobs AS (
56
SELECT
@@ -21,6 +22,30 @@ WITH jobs AS (
2122
AND tupleElement(build, 'started_at') IS NOT NULL
2223
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3)}
2324
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3)}
25+
-- Job group filtering: AMD, Torch Nightly, or Main
26+
AND (
27+
(
28+
has({jobGroups: Array(String)}, 'amd')
29+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD')
30+
> 0
31+
)
32+
OR (
33+
has({jobGroups: Array(String)}, 'torch_nightly')
34+
AND positionCaseInsensitive(
35+
tupleElement(job, 'name'), 'Torch Nightly'
36+
)
37+
> 0
38+
)
39+
OR (
40+
has({jobGroups: Array(String)}, 'main')
41+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD')
42+
= 0
43+
AND positionCaseInsensitive(
44+
tupleElement(job, 'name'), 'Torch Nightly'
45+
)
46+
= 0
47+
)
48+
)
2449
),
2550

2651
daily_stats AS (

torchci/clickhouse_queries/vllm/trunk_health/params.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44
"repo": "String",
55
"pipelineName": "String",
66
"startTime": "DateTime64(3)",
7-
"stopTime": "DateTime64(3)"
7+
"stopTime": "DateTime64(3)",
8+
"jobGroups": "Array(String)"
89
},
910
"tests": [
1011
{
1112
"granularity": "day",
1213
"repo": "https://github.com/vllm-project/vllm.git",
1314
"pipelineName": "CI",
1415
"startTime": "2025-09-22T00:00:00.000",
15-
"stopTime": "2025-09-29T00:00:00.000"
16+
"stopTime": "2025-09-29T00:00:00.000",
17+
"jobGroups": ["amd", "torch_nightly", "main"]
1618
}
1719
]
1820
}
Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,62 @@
11
-- vLLM trunk health history
22
-- Returns individual main branch builds with timestamps for hourly visualization
3+
-- Supports filtering by job groups: AMD, Torch Nightly, or Main
4+
-- Build success is computed based on filtered jobs only
5+
6+
WITH build_jobs AS (
7+
SELECT
8+
toUInt32(tupleElement(build, 'number')) AS build_number,
9+
tupleElement(build, 'started_at') AS build_started_at,
10+
tupleElement(build, 'state') AS build_state,
11+
tupleElement(job, 'state') AS job_state,
12+
tupleElement(job, 'soft_failed') AS soft_failed
13+
FROM vllm.vllm_buildkite_jobs
14+
WHERE
15+
tupleElement(pipeline, 'repository') = {repo: String }
16+
AND tupleElement(pipeline, 'name') = {pipelineName: String }
17+
AND tupleElement(build, 'branch') = 'main'
18+
AND tupleElement(build, 'started_at') IS NOT NULL
19+
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
20+
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
21+
-- Job group filtering: AMD, Torch Nightly, or Main
22+
AND (
23+
(
24+
has({jobGroups: Array(String)}, 'amd')
25+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD')
26+
> 0
27+
)
28+
OR (
29+
has({jobGroups: Array(String)}, 'torch_nightly')
30+
AND positionCaseInsensitive(
31+
tupleElement(job, 'name'), 'Torch Nightly'
32+
)
33+
> 0
34+
)
35+
OR (
36+
has({jobGroups: Array(String)}, 'main')
37+
AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD')
38+
= 0
39+
AND positionCaseInsensitive(
40+
tupleElement(job, 'name'), 'Torch Nightly'
41+
)
42+
= 0
43+
)
44+
)
45+
)
346

447
SELECT
5-
tupleElement(build, 'number') AS build_number,
6-
tupleElement(build, 'started_at') AS build_started_at,
7-
tupleElement(build, 'state') AS build_state,
48+
build_number,
49+
any(build_started_at) AS build_started_at,
50+
any(build_state) AS build_state,
51+
countIf(lowerUTF8(job_state) = 'failed' AND soft_failed = FALSE)
52+
AS hard_failure_count,
53+
-- Build is green if it has no hard failures among filtered jobs and is not canceled
854
if(
9-
lowerUTF8(tupleElement(build, 'state')) IN (
10-
'passed', 'finished', 'success'
11-
),
55+
lowerUTF8(build_state) NOT IN ('canceled', 'cancelled')
56+
AND hard_failure_count = 0,
1257
1,
1358
0
1459
) AS is_green
15-
FROM vllm.vllm_buildkite_builds
16-
WHERE
17-
tupleElement(pipeline, 'repository') = {repo: String }
18-
AND tupleElement(pipeline, 'name') = {pipelineName: String }
19-
AND tupleElement(build, 'branch') = 'main'
20-
AND tupleElement(build, 'started_at') IS NOT NULL
21-
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
22-
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
60+
FROM build_jobs
61+
GROUP BY build_number
2362
ORDER BY build_started_at ASC

0 commit comments

Comments
 (0)