|
| 1 | +/* Windowed per-build table (UTC), incl. PR & main builds, with queue totals, cost, and is_main_branch. |
| 2 | + WAIT: only attempts with started_at IS NOT NULL contribute wait (runnable → started). |
| 3 | + RUN: clip to [w_start, w_end]; 1-day zombie guard for open 'running' attempts. |
| 4 | + COST: 1.3232 * gpu_1_queue_run_hours + 4.602 * gpu_4_queue_run_hours (fixed). |
| 5 | +*/ |
| 6 | + |
WITH
    -- Reporting window, parsed from the query parameters at millisecond precision.
    -- The bounds are described as UTC by the file header.
    parseDateTime64BestEffort({startTime:String}, 3) AS w_start,      -- lower bound, inclusive
    parseDateTime64BestEffort({stopTime:String}, 3)  AS w_end,        -- upper bound, exclusive

    -- Query-time clock, widened to DateTime64(3) so it lines up with the window bounds.
    toDateTime64(now(), 3) AS now64,

    -- One day before the window end; used by the zombie guard for open 'running' attempts.
    subtractDays(w_end, 1) AS zombie_cutoff,

    -- Sentinel that stands in for a missing finished_at.
    toDateTime64('2100-01-01 00:00:00', 3) AS FAR_FUTURE,

    -- The only queues that are reported and costed.
    ['gpu_1_queue', 'gpu_4_queue'] AS QUEUES
| 14 | + |
/* 1) One row per build whose created_at falls in [w_start, w_end), plus branch / PR context.
      Every build-level attribute is read from the row that carries the greatest
      job.created_at for that build (argMax).
      NOTE(review): the window test runs in HAVING, i.e. after every build in the table has
      been aggregated. If build.created_at is constant across the rows of one build, an
      equivalent WHERE prefilter would shrink the scan — confirm against the table before
      changing it. */
, builds_window AS (
    SELECT
        tupleElement(build, 'id') AS build_id,

        -- identity and links
        argMax(tupleElement(build, 'number'),  tupleElement(job, 'created_at')) AS build_number,
        argMax(tupleElement(build, 'web_url'), tupleElement(job, 'created_at')) AS build_url,
        concat(
            argMax(tupleElement(build, 'web_url'), tupleElement(job, 'created_at')),
            '/steps/table'
        ) AS steps_table_url,
        argMax(tupleElement(build, 'commit'),  tupleElement(job, 'created_at')) AS commit_sha,

        -- start / finish: prefer the build-level value, otherwise fall back to the
        -- earliest job start / latest job finish seen for the build
        coalesce(
            argMax(tupleElement(build, 'started_at'), tupleElement(job, 'created_at')),
            min(tupleElement(job, 'started_at'))
        ) AS robust_start,
        coalesce(
            argMax(tupleElement(build, 'finished_at'), tupleElement(job, 'created_at')),
            max(tupleElement(job, 'finished_at'))
        ) AS robust_finish,

        countDistinct(tupleElement(job, 'id')) AS steps_count,
        argMax(tupleElement(build, 'state'), tupleElement(job, 'created_at')) AS latest_build_state,

        -- "owner/repo" slug, tried in order: pipeline repository URL, pull-request
        -- repository URL, then any "x/y" fragment of the pull-request repository
        coalesce(
            nullIf(
                extract(
                    argMax(tupleElement(pipeline, 'repository'), tupleElement(job, 'created_at')),
                    'github\\.com[:/]+([^/]+/[^/.]+)'
                ),
                ''
            ),
            nullIf(
                extract(
                    argMax(tupleElement(build, 'pull_request').repository, tupleElement(job, 'created_at')),
                    'github\\.com[:/]+([^/]+/[^/.]+)'
                ),
                ''
            ),
            nullIf(
                extract(
                    argMax(tupleElement(build, 'pull_request').repository, tupleElement(job, 'created_at')),
                    '([^/]+/[^/.]+)'
                ),
                ''
            )
        ) AS repo_slug,

        -- PR number: the pull_request id when it parses as an integer, otherwise the
        -- digits following "pull/" in the branch name
        coalesce(
            toInt64OrNull(
                argMax(tupleElement(build, 'pull_request').id, tupleElement(job, 'created_at'))
            ),
            toInt64OrNull(
                extract(
                    argMax(tupleElement(build, 'branch'), tupleElement(job, 'created_at')),
                    'pull/([0-9]+)'
                )
            )
        ) AS pr_number,

        argMax(tupleElement(build, 'created_at'), tupleElement(job, 'created_at')) AS build_created_at_utc,
        argMax(tupleElement(build, 'branch'),     tupleElement(job, 'created_at')) AS branch_name
    FROM vllm.vllm_buildkite_jobs
    GROUP BY tupleElement(build, 'id')
    HAVING w_start <= build_created_at_utc
       AND build_created_at_utc < w_end
)
| 51 | + |
/* 2) Job rows of type 'script' / 'command' that belong to the builds selected in
      builds_window, reduced to the timestamps and queue needed downstream.
      NOTE(review): the time predicate keeps a row when ANY of its three tests holds, so it
      is a loose prefilter rather than a strict window-overlap test; the exact clipping to
      the window happens in runs_scored / waits_scored. */
, base_agent AS (
    SELECT
        tupleElement(build, 'id')       AS build_id,
        tupleElement(job, 'id')         AS job_id,
        tupleElement(job, 'created_at') AS created_at,
        tupleElement(job, 'state')      AS state,
        tupleElement(job, 'runnable_at') AS runnable_at,
        tupleElement(job, 'started_at')  AS started_at,
        tupleElement(job, 'finished_at') AS finished_at,
        -- first agent rule of the form "queue=<name>", with the "queue=" prefix removed
        replaceOne(
            arrayFirst(
                rule -> startsWith(rule, 'queue='),
                tupleElement(job, 'agent_query_rules')
            ),
            'queue=',
            ''
        ) AS queue_key
    FROM vllm.vllm_buildkite_jobs
    INNER JOIN builds_window AS b
        ON tupleElement(build, 'id') = b.build_id
    WHERE tupleElement(job, 'type') IN ('script', 'command')
      AND (
            tupleElement(job, 'runnable_at') < w_end
         OR tupleElement(job, 'started_at') < w_end
            -- a missing finished_at is replaced by FAR_FUTURE, so it always passes
         OR ifNull(tupleElement(job, 'finished_at'), FAR_FUTURE) >= w_start
      )
)
| 73 | + |
/* 3) One row per (build_id, job_id). The job's rows are folded into two de-duplicated
      arrays of attempt tuples, each tuple led by the queue key. */
, jobs_by_build AS (
    SELECT
        build_id,
        job_id,
        max(created_at)           AS last_event_at,
        argMax(state, created_at) AS latest_state,

        -- RUN attempts as (queue, started_at, finished_at); tuples without a start are dropped
        arrayDistinct(
            arrayFilter(
                r -> isNotNull(tupleElement(r, 2)),
                groupArray(tuple(queue_key, started_at, finished_at))
            )
        ) AS run_triplets,

        -- WAIT attempts as (queue, runnable_at, started_at); only attempts that became
        -- runnable AND actually started are kept
        arrayDistinct(
            arrayFilter(
                w -> isNotNull(tupleElement(w, 2)) AND isNotNull(tupleElement(w, 3)),
                groupArray(tuple(queue_key, runnable_at, started_at))
            )
        ) AS wait_triplets
    FROM base_agent
    GROUP BY
        build_id,
        job_id
)
| 94 | + |
/* 4) RUN attempts → one row per attempt on a tracked queue, clipped to the window.
      s_clip = max(started_at, w_start)
      e_clip = min(effective_finish, w_end), where effective_finish is:
        - finished_at, when present;
        - zombie guard: for an open attempt whose job is still 'running' and whose
          last_event_at is older than zombie_cutoff, last_event_at + 1 minute;
        - otherwise min(w_end, now64): an open attempt cannot have run past the moment
          the query executes, so a window that ends in the future must not credit run
          time (and therefore cost) that has not happened yet.
      Rows where e_clip <= s_clip are discarded later, in run_by_build. */
, runs_scored AS (
    SELECT
        build_id,
        tupleElement(rt, 1) AS queue_key,
        greatest(tupleElement(rt, 2), w_start) AS s_clip,
        least(
            ifNull(
                tupleElement(rt, 3),
                if(latest_state = 'running' AND last_event_at < zombie_cutoff,
                   least(last_event_at + INTERVAL 1 MINUTE, w_end),
                   least(w_end, now64))
            ),
            w_end
        ) AS e_clip
    FROM jobs_by_build
    ARRAY JOIN run_triplets AS rt
    WHERE tupleElement(rt, 1) IN QUEUES
)
/* Run seconds per build × queue; only attempts with a positive clipped span are summed. */
, run_by_build AS (
    SELECT
        build_id,
        queue_key,
        sumIf(
            dateDiff('second', s_clip, e_clip),
            s_clip < e_clip
        ) AS total_run_s
    FROM runs_scored
    GROUP BY
        build_id,
        queue_key
)
| 121 | + |
/* 5) WAIT attempts (runnable → started) → one row per attempt on a tracked queue,
      with both ends clipped to the window. */
, waits_scored AS (
    SELECT
        build_id,
        wt.1                     AS queue_key,
        greatest(wt.2, w_start)  AS ra_clip,   -- runnable_at, not earlier than the window start
        least(wt.3, w_end)       AS st_clip    -- started_at, not later than the window end
    FROM jobs_by_build
    ARRAY JOIN wait_triplets AS wt
    WHERE wt.1 IN QUEUES
)
/* Wait seconds per build × queue; only attempts with a positive clipped span are summed. */
, wait_by_build AS (
    SELECT
        build_id,
        queue_key,
        sumIf(
            dateDiff('second', ra_clip, st_clip),
            ra_clip < st_clip
        ) AS total_wait_s
    FROM waits_scored
    GROUP BY
        build_id,
        queue_key
)
| 140 | + |
/* 6) Pivot the per-build × queue second totals into one row per build, with one wait
      column and one run column per tracked queue, expressed in hours (2 decimals). */
, totals_by_build AS (
    SELECT
        build_id,
        round(ifNull(sumIf(total_wait_s, queue_key='gpu_1_queue'), 0) / 3600.0, 2) AS gpu_1_queue_wait_hours,
        round(ifNull(sumIf(total_run_s,  queue_key='gpu_1_queue'), 0) / 3600.0, 2) AS gpu_1_queue_run_hours,
        round(ifNull(sumIf(total_wait_s, queue_key='gpu_4_queue'), 0) / 3600.0, 2) AS gpu_4_queue_wait_hours,
        round(ifNull(sumIf(total_run_s,  queue_key='gpu_4_queue'), 0) / 3600.0, 2) AS gpu_4_queue_run_hours
    FROM (
        -- Both branches expose (build_id, queue_key, total_run_s, total_wait_s) in the
        -- same column positions; the measure a branch does not own is filled with 0.
        SELECT
            build_id,
            queue_key,
            total_run_s,
            toInt64(0) AS total_wait_s
        FROM run_by_build

        UNION ALL

        SELECT
            build_id,
            queue_key,
            toInt64(0) AS total_run_s,
            total_wait_s
        FROM wait_by_build
    )
    GROUP BY build_id
)
| 156 | + |
/* 7) Final table (UTC) — one row per build from builds_window, PR and main builds alike,
      ordered by build creation time.
      Every column taken from builds_window is qualified with b. because build_id exists
      in both join inputs; qualifying removes any reliance on how an unqualified name is
      resolved. Output column names are unchanged.
      The queue totals come from a LEFT JOIN: a build with no attempts on a tracked queue
      has no row in totals_by_build. Whether the missing side then reads as NULL or as the
      type default depends on the join_use_nulls setting; ifNull makes both cases 0. */
SELECT
    /* PR URL (NULL when the build has no PR number or no repo slug) */
    if(b.pr_number IS NULL OR b.repo_slug IS NULL,
       NULL,
       concat('https://github.com/', b.repo_slug, '/pull/', toString(b.pr_number))
    ) AS pr_url,

    b.build_number    AS build_number,
    b.build_id        AS build_id,
    b.build_url       AS build_url,
    b.steps_table_url AS steps_table_url,
    b.commit_sha      AS commit_sha,

    b.robust_start  AS build_started_at,
    b.robust_finish AS build_finished_at,

    /* duration (hours) = finish − start; NULL when either end is unknown */
    if(b.robust_start IS NULL OR b.robust_finish IS NULL,
       NULL,
       round(dateDiff('second', b.robust_start, b.robust_finish) / 3600.0, 2)
    ) AS duration_hours,

    b.steps_count        AS steps_count,
    b.latest_build_state AS latest_build_state,

    ifNull(t.gpu_1_queue_wait_hours, 0) AS gpu_1_queue_wait_hours,
    ifNull(t.gpu_1_queue_run_hours, 0)  AS gpu_1_queue_run_hours,
    ifNull(t.gpu_4_queue_wait_hours, 0) AS gpu_4_queue_wait_hours,
    ifNull(t.gpu_4_queue_run_hours, 0)  AS gpu_4_queue_run_hours,

    /* Fixed-rate cost: 1.3232 per gpu_1_queue run hour + 4.602 per gpu_4_queue run hour,
       computed from the already-rounded hour columns */
    round(
        1.3232 * ifNull(t.gpu_1_queue_run_hours, 0) +
        4.602  * ifNull(t.gpu_4_queue_run_hours, 0),
        2
    ) AS cost,

    /* 1 when the build branch is literally 'main', otherwise 0 */
    toUInt8(b.branch_name = 'main') AS is_main_branch

FROM builds_window AS b
LEFT JOIN totals_by_build AS t
    ON t.build_id = b.build_id
ORDER BY
    b.build_created_at_utc ASC,
    b.build_id ASC;   -- tie-breaker so builds created at the same instant sort stably
0 commit comments