From 94718f951288f59cbae801223dfc53a7112d500a Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 7 Oct 2024 22:17:53 +0200 Subject: [PATCH] Stable `all.requests` (#5) * pages and legacy lighthouse * fix * first test * pages insert * date var * js rewrite * dataset * Initial commit * init * core_web_vitals * clean graph, tested * publish core_web_vitals.technologies * Dev (#1) * workspace init * pages and legacy lighthouse * fix * first test * pages insert * date var * js rewrite * dataset * core_web_vitals * clean graph, tested * publish core_web_vitals.technologies * technologies partitioning * past month date for cwv * 8pm * package-lock.json * ignore full-refresh * readme * updated tags and example assert * dependency assertions * current month commented * assert fix * all tables publish * incremental tables * node script * enable legacy * missing package name * table configs * all.requests and all.parsed_css * dev sampling vars * sampling instead of rank * readme upd * dev hints * dev sampling for tech report * tech report workflow * removed sampling * dates flexibility * fix * formatting * other legacy tables * docs and dependencies * comment * Update definitions/output/pages.js Co-authored-by: Barry Pollard * Update definitions/output/technologies.js Co-authored-by: Barry Pollard * Update package.json Co-authored-by: Barry Pollard * Update workflow_settings.yaml Co-authored-by: Barry Pollard * format * not dependent on all.pages * migrated to function trigger * cloud function * readme update * deployed function * readme updates * readme update * init stable copies * requests ready * adjusted requests pipeline * use release configs in prod * readme update * tags update * dev sampling * prune summary * sorted * false when target exists * dev sampling * newline * trigger cleanup * formatting * forEach iteration * create table with operate * new test tables script * tested * merge * JSON columns * job per 
client * native object pruning * Update definitions/output/all/reprocess_requests.js Co-authored-by: Barry Pollard --------- Co-authored-by: Barry Pollard --- README.md | 6 +- definitions/extra/test_env.js | 44 ++++--- definitions/output/all/reprocess_requests.js | 120 ++++++++++++++++++ .../output/core_web_vitals/technologies.js | 28 ++-- definitions/sources/declares.js | 8 +- src/dataform.js | 12 -- 6 files changed, 169 insertions(+), 49 deletions(-) create mode 100644 definitions/output/all/reprocess_requests.js diff --git a/README.md b/README.md index 17cb0ca2..b6f77fce 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # HTTP Archive BigQuery pipeline with Dataform -## Tables +This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery. + +## Pipelines + +The pipelines run in the Dataform service on Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. The code in the `main` branch is used on each triggered pipeline run. 
### Crawl tables in `all` dataset diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js index 57f56bcd..e1fdc296 100644 --- a/definitions/extra/test_env.js +++ b/definitions/extra/test_env.js @@ -1,26 +1,28 @@ -const two_months_ago = constants.fn_past_month(constants.fn_past_month(constants.current_month)); +const date = constants.fn_past_month(constants.current_month); -operate("test_env", { - hasOutput: true, - disabled: true // MUST NOT be commented in main branch -}).queries(ctx => ` -CREATE OR REPLACE TABLE ${ctx.ref("all", "pages")} AS -SELECT * -FROM httparchive.all.pages ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; +const resources_list = [ + //{datasetId: "all", tableId: "pages"}, + {datasetId: "all", tableId: "requests"}, + //{datasetId: "all", tableId: "parsed_css"}, + //{datasetId: "core_web_vitals", tableId: "technologies"}, +]; -CREATE OR REPLACE TABLE ${ctx.ref("all", "requests")} AS -SELECT * -FROM httparchive.all.requests ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; +resources_list.forEach(resource => { + operate(`test_table ${resource.datasetId}_${resource.tableId}`, { + disabled: !constants.is_dev_env // enabled when workflow variable env_name = "dev" + }).tags([ + "test_tables" + ]).queries(ctx => ` +CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev; -CREATE OR REPLACE TABLE ${ctx.ref("all", "parsed_css")} AS -SELECT * -FROM httparchive.all.parsed_css ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; +DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId}; + +CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId} +LIKE httparchive.${resource.datasetId}.${resource.tableId}; -CREATE OR REPLACE TABLE ${ctx.ref("core_web_vitals", "technologies")} AS +INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId} SELECT * -FROM httparchive.core_web_vitals.technologies -WHERE date = '${two_months_ago}' -`) +FROM httparchive.${resource.datasetId}.${resource.tableId} 
${constants.dev_TABLESAMPLE} +WHERE date = '${date}' + `); +}) \ No newline at end of file diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js new file mode 100644 index 00000000..5b3bc1f3 --- /dev/null +++ b/definitions/output/all/reprocess_requests.js @@ -0,0 +1,120 @@ +operate(`all_requests_stable_pre`).tags( + ["all_requests_stable"] +).queries(` +CREATE SCHEMA IF NOT EXISTS all_dev; + +DROP TABLE IF EXISTS \`all_dev.requests_stable\`; + +CREATE TABLE \`all_dev.requests_stable\` +( + date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"), + client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"), + page STRING NOT NULL OPTIONS(description="The URL of the page being tested"), + is_root_page BOOL OPTIONS(description="Whether the page is the root of the origin."), + root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested"), + rank INT64 OPTIONS(description="Site popularity rank, from CrUX"), + url STRING NOT NULL OPTIONS(description="The URL of the request"), + is_main_document BOOL NOT NULL OPTIONS(description="Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects"), + type STRING OPTIONS(description="Simplified description of the type of resource (script, html, css, text, other, etc)"), + index INT64 OPTIONS(description="The sequential 0-based index of the request"), + payload JSON OPTIONS(description="JSON-encoded WebPageTest result data for this request"), + summary JSON OPTIONS(description="JSON-encoded summarization of request data"), + request_headers ARRAY<STRUCT<name STRING, value STRING>> OPTIONS(description="Request headers"), + response_headers ARRAY<STRUCT<name STRING, value STRING>> OPTIONS(description="Response headers"), + response_body STRING OPTIONS(description="Text-based response body") +) +PARTITION BY date +CLUSTER BY client, is_root_page, type, rank +OPTIONS( + require_partition_filter=true +); 
+`); + +const iterations = []; +const clients = constants.clients; + +for ( + let month = constants.current_month; + month >= '2024-09-01'; // 2022-07-01 + month = constants.fn_past_month(month)) { + clients.forEach((client) => { + iterations.push({ + month: month, + client: client + }) + }) +} + +iterations.forEach((iteration, i) => { + operate(`all_requests_stable ${iteration.month} ${iteration.client}`).tags( + ["all_requests_stable"] + ).dependencies([ + i===0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i-1].month} ${iterations[i-1].client}` + ]).queries(ctx => ` +INSERT INTO \`all_dev.requests_stable\` +SELECT + requests.date, + requests.client, + requests.page, + requests.is_root_page, + requests.root_page, + crux.rank, + requests.url, + requests.is_main_document, + requests.type, + requests.index, + JSON_REMOVE( + SAFE.PARSE_JSON(requests.payload, wide_number_mode => 'round'), + '$._headers' + ) AS payload, + JSON_REMOVE( + SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'), + '$.firstHtml', + '$.firstReq', + '$.req_accept_encoding', + '$.req_accept_language', + '$.req_accept', + '$.req_if_modified_since', + '$.req_if_none_match', + '$.req_referer', + '$.req_user_agent', + '$.reqOtherHeaders', + '$.requestid', + '$.resp_age', + '$.resp_cache_control', + '$.resp_content_length', + '$.resp_content_type', + '$.resp_date', + '$.resp_etag', + '$.resp_last_modified', + '$.resp_server', + '$.resp_vary', + '$.respOtherHeaders', + '$.startedDateTime', + '$.url', + '$.urlShort' + ) as summary, + requests.request_headers, + requests.response_headers, + requests.response_body +FROM ( + SELECT * + FROM \`all.requests\` ${constants.dev_TABLESAMPLE} + WHERE date = '${iteration.month}' + AND client = '${iteration.client}') AS requests +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")} + WHERE yyyymm = 
${constants.fn_past_month(iteration.month).substring(0, 7).replace('-', '')} +) AS crux +ON requests.root_page = crux.page; + `) +}); diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index 111a586f..49c578f2 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -24,7 +24,7 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F ); CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING) -RETURNS STRUCT +RETURNS STRUCT LANGUAGE js AS ''' try { const $ = JSON.parse(categories); @@ -74,7 +74,7 @@ crux AS ( END AS rank, CONCAT(origin, '/') AS root_page_url, IF(device = 'desktop', 'desktop', 'mobile') AS client, - + # CWV IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid, IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid, @@ -82,15 +82,15 @@ crux AS ( IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls, IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp, IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp, - + (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND IS_GOOD(small_cls, medium_cls, large_cls) AND IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024, - + (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND IS_GOOD(small_cls, medium_cls, large_cls) AND IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023, - + # WV IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp, IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp, @@ -114,7 +114,7 @@ technologies AS ( ${ctx.resolve("all", "pages")}, UNNEST(technologies) AS technology WHERE - date = '${past_month}' AND + date = '${past_month}' ${constants.dev_rank5000_filter} AND technology.technology IS NOT NULL AND technology.technology != '' UNION ALL @@ -125,7 +125,7 @@ UNION ALL FROM ${ctx.resolve("all", "pages")} WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank5000_filter} ), categories AS ( @@ 
-137,7 +137,7 @@ categories AS ( UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank5000_filter} GROUP BY app UNION ALL @@ -149,7 +149,7 @@ UNION ALL UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${past_month}' AND + date = '${past_month}' ${constants.dev_rank5000_filter} AND client = 'mobile' ), @@ -165,7 +165,7 @@ summary_stats AS ( FROM ${ctx.resolve("all", "pages")} WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank5000_filter} ), lab_data AS ( @@ -206,7 +206,7 @@ SELECT app, client, COUNT(0) AS origins, - + # CrUX data COUNTIF(good_fid) AS origins_with_good_fid, COUNTIF(good_cls) AS origins_with_good_cls, @@ -227,19 +227,19 @@ SELECT SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv, SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024, SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023, - + # Lighthouse data APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS median_lighthouse_score_accessibility, APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS median_lighthouse_score_best_practices, APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS median_lighthouse_score_performance, APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS median_lighthouse_score_pwa, APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS median_lighthouse_score_seo, - + # Page weight stats APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS median_bytes_total, APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS median_bytes_js, APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS median_bytes_image - + FROM lab_data JOIN diff --git a/definitions/sources/declares.js b/definitions/sources/declares.js index ca29d1e1..87eb47ad 100644 --- a/definitions/sources/declares.js +++ 
b/definitions/sources/declares.js @@ -16,7 +16,7 @@ for (const table of crux_tables) { }); assert(`${table}_not_empty`).query(ctx => ` -SELECT +SELECT 'No data for the specified date' AS error_message FROM ${ctx.ref("chrome-ux-report", "materialized", table)} WHERE yyyymm = ${past_month} @@ -24,3 +24,9 @@ GROUP BY yyyymm HAVING COUNT(1) = 0 `); } + +declare({ + database: "chrome-ux-report", + schema: "experimental", + name: "global", +}); diff --git a/src/dataform.js b/src/dataform.js index 5b8159b1..19a02f7d 100644 --- a/src/dataform.js +++ b/src/dataform.js @@ -14,18 +14,6 @@ async function get_compilation_results(repoURI) { compilationResult: { releaseConfig: `${repoURI}/releaseConfigs/production` } - }, dev_request = { - parent: repoURI, - compilationResult: { - gitCommitish: 'dev' - }, - codeCompilationConfig: { - schemaSuffix: 'dev', - tablePrefix: 'dev', - vars: { - current_month: '2024-08-01', - }, - } }; console.log(`Creating Dataform compilation result: ${JSON.stringify(request, null, 2)}`);