Commit: Stable all.requests (#5)
* pages and legacy lighthouse

* fix

* first test

* pages insert

* date var

* js rewrite

* dataset

* Initial commit

* init

* core_web_vitals

* clean graph, tested

* publish core_web_vitals.technologies

* Dev (#1)

* workspace init

* pages and legacy lighthouse

* fix

* first test

* pages insert

* date var

* js rewrite

* dataset

* core_web_vitals

* clean graph, tested

* publish core_web_vitals.technologies

* technologies partitioning

* past month date for cwv

* 8pm

* package-lock.json

* ignore full-refresh

* readme

* updated tags and example assert

* dependency assertions

* current month commented

* assert fix

* all tables publish

* incremental tables

* node script

* enable legacy

* missing package name

* table configs

* all.requests and all.parsed_css

* dev sampling vars

* sampling instead of rank

* readme upd

* dev hints

* dev sampling for tech report

* tech report workflow

* removed sampling

* dates flexibility

* fix

* formatting

* other legacy tables

* docs and dependencies

* comment

* Update definitions/output/pages.js

Co-authored-by: Barry Pollard <[email protected]>

* Update definitions/output/technologies.js

Co-authored-by: Barry Pollard <[email protected]>

* Update package.json

Co-authored-by: Barry Pollard <[email protected]>

* Update workflow_settings.yaml

Co-authored-by: Barry Pollard <[email protected]>

* format

* not dependent on all.pages

* migrated to function trigger

* cloud function

* readme update

* deployed function

* readme updates

* readme update

* init stable copies

* requests ready

* adjusted requests pipeline

* use release configs in prod

* readme update

* tags update

* dev sampling

* prune summary

* sorted

* false when target exists

* dev sampling

* newline

* trigger cleanup

* formatting

* forEach iteration

* create table with operate

* new test tables script

* tested

* merge

* JSON columns

* job per client

* native object pruning

* Update definitions/output/all/reprocess_requests.js

Co-authored-by: Barry Pollard <[email protected]>

---------

Co-authored-by: Barry Pollard <[email protected]>
max-ostapenko and tunetheweb authored Oct 7, 2024
1 parent 6640ffe commit 94718f9
Showing 6 changed files with 169 additions and 49 deletions.
6 changes: 5 additions & 1 deletion README.md
@@ -1,6 +1,10 @@
 # HTTP Archive BigQuery pipeline with Dataform
 
-## Tables
+This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery.
+
+## Pipelines
+
+The pipelines are run in the Dataform service in Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. The code in the `main` branch is used on each triggered pipeline run.
 
 ### Crawl tables in `all` dataset
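
The "migrated to function trigger" and "cloud function" commits above refer to this kick-off mechanism. As a rough sketch only (not the deployed function): a Cloud Function can compile the production release config and start a workflow invocation through the Dataform API, mirroring the request shape in src/dataform.js below. The repository path and function name here are placeholders.

// Hypothetical trigger sketch; repository path and function name are
// placeholders, not the project's real configuration.
const { DataformClient } = require('@google-cloud/dataform');

const client = new DataformClient();
const repoURI = 'projects/YOUR_PROJECT/locations/us-central1/repositories/crawl-data';

async function triggerProductionRun () {
  // Compile the code pinned by the production release config
  // (see "use release configs in prod" in the commit message above).
  const [compilation] = await client.createCompilationResult({
    parent: repoURI,
    compilationResult: {
      releaseConfig: `${repoURI}/releaseConfigs/production`
    }
  });

  // Execute the compiled graph.
  const [invocation] = await client.createWorkflowInvocation({
    parent: repoURI,
    workflowInvocation: {
      compilationResult: compilation.name
    }
  });
  console.log(`Started workflow invocation: ${invocation.name}`);
}

module.exports = { triggerProductionRun };
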
44 changes: 23 additions & 21 deletions definitions/extra/test_env.js
@@ -1,26 +1,28 @@
-const two_months_ago = constants.fn_past_month(constants.fn_past_month(constants.current_month));
+const date = constants.fn_past_month(constants.current_month);
 
-operate("test_env", {
-  hasOutput: true,
-  disabled: true // MUST NOT be commented in main branch
-}).queries(ctx => `
-CREATE OR REPLACE TABLE ${ctx.ref("all", "pages")} AS
-SELECT *
-FROM httparchive.all.pages ${constants.dev_TABLESAMPLE}
-WHERE date = '${two_months_ago}';
+var resources_list = [
+  //{datasetId: "all", tableId: "pages"},
+  {datasetId: "all", tableId: "requests"},
+  //{datasetId: "all", tableId: "parsed_css"},
+  //{datasetId: "core_web_vitals", tableId: "technologies"},
+];
 
-CREATE OR REPLACE TABLE ${ctx.ref("all", "requests")} AS
-SELECT *
-FROM httparchive.all.requests ${constants.dev_TABLESAMPLE}
-WHERE date = '${two_months_ago}';
+resources_list.forEach(resource => {
+  operate(`test_table ${resource.datasetId}_${resource.tableId}`, {
+    disabled: !constants.is_dev_env // enabled when workflow variable env_name = "dev"
+  }).tags([
+    "test_tables"
+  ]).queries(ctx => `
+CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev;
 
-CREATE OR REPLACE TABLE ${ctx.ref("all", "parsed_css")} AS
-SELECT *
-FROM httparchive.all.parsed_css ${constants.dev_TABLESAMPLE}
-WHERE date = '${two_months_ago}';
+DROP TABLE ${resource.datasetId}_dev.dev_${resource.tableId};
+CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId}
+LIKE httparchive.${resource.datasetId}.${resource.tableId};
 
-CREATE OR REPLACE TABLE ${ctx.ref("core_web_vitals", "technologies")} AS
+INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId}
 SELECT *
-FROM httparchive.core_web_vitals.technologies
-WHERE date = '${two_months_ago}'
-`)
+FROM httparchive.${resource.datasetId}.${resource.tableId} ${constants.dev_TABLESAMPLE}
+WHERE date = '${date}'
+`);
+})
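
These `test_table` operations stay disabled unless the workflow variable `env_name` is set to `dev`, so a dev run materializes sampled copies such as `all_dev.dev_requests` from the production tables without touching them.
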
120 changes: 120 additions & 0 deletions definitions/output/all/reprocess_requests.js
@@ -0,0 +1,120 @@
operate(`all_requests_stable_pre`).tags(
["all_requests_stable"]
).queries(`
CREATE SCHEMA IF NOT EXISTS all_dev;
DROP TABLE IF EXISTS \`all_dev.requests_stable\`;
CREATE TABLE \`all_dev.requests_stable\`
(
date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"),
client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"),
page STRING NOT NULL OPTIONS(description="The URL of the page being tested"),
is_root_page BOOL OPTIONS(description="Whether the page is the root of the origin."),
root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested"),
rank INT64 OPTIONS(description="Site popularity rank, from CrUX"),
url STRING NOT NULL OPTIONS(description="The URL of the request"),
is_main_document BOOL NOT NULL OPTIONS(description="Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects"),
type STRING OPTIONS(description="Simplified description of the type of resource (script, html, css, text, other, etc)"),
index INT64 OPTIONS(description="The sequential 0-based index of the request"),
payload JSON OPTIONS(description="JSON-encoded WebPageTest result data for this request"),
summary JSON OPTIONS(description="JSON-encoded summarization of request data"),
request_headers ARRAY<STRUCT<
name STRING OPTIONS(description="Request header name"),
value STRING OPTIONS(description="Request header value")
>> OPTIONS(description="Request headers"),
response_headers ARRAY<STRUCT<
name STRING OPTIONS(description="Response header name"),
value STRING OPTIONS(description="Response header value")
>> OPTIONS(description="Response headers"),
response_body STRING OPTIONS(description="Text-based response body")
)
PARTITION BY date
CLUSTER BY client, is_root_page, type, rank
OPTIONS(
require_partition_filter=true
);
`);

const iterations = [];
const clients = constants.clients;

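// Build the work list: one entry per (month, client) pair, newest
// crawl month first, down to the cut-off date in the loop condition.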
for (
let month = constants.current_month;
month >= '2024-09-01'; // 2022-07-01
month = constants.fn_past_month(month)) {
clients.forEach((client) => {
iterations.push({
month: month,
client: client
})
})
}

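// Each operation depends on its predecessor, so the monthly INSERTs run
// strictly one at a time; the first waits for the table creation above.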
iterations.forEach((iteration, i) => {
operate(`all_requests_stable ${iteration.month} ${iteration.client}`).tags(
["all_requests_stable"]
).dependencies([
i===0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i-1].month} ${iterations[i-1].client}`
]).queries(ctx => `
INSERT INTO \`all_dev.requests_stable\`
SELECT
requests.date,
requests.client,
requests.page,
requests.is_root_page,
requests.root_page,
crux.rank,
requests.url,
requests.is_main_document,
requests.type,
requests.index,
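    -- Parse the legacy JSON text columns, pruning fields that duplicate
    -- dedicated columns in this table (headers, URL, date, etc.).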
JSON_REMOVE(
SAFE.PARSE_JSON(payload, wide_number_mode => 'round'),
'$._headers'
) AS payload,
JSON_REMOVE(
SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'),
'$.firstHtml',
'$.firstReq',
'$.req_accept_encoding',
'$.req_accept_language',
'$.req_accept',
'$.req_if_modified_since',
'$.req_if_none_match',
'$.req_referer',
'$.req_user_agent',
'$.reqOtherHeaders',
'$.requestid',
'$.resp_age',
'$.resp_cache_control',
'$.resp_content_length',
'$.resp_content_type',
'$.resp_date',
'$.resp_etag',
'$.resp_last_modified',
'$.resp_server',
'$.resp_vary',
'$.respOtherHeaders',
'$.startedDateTime',
'$.url',
'$.urlShort'
) as summary,
requests.request_headers,
requests.response_headers,
requests.response_body
FROM (
SELECT *
FROM \`all.requests\` ${constants.dev_TABLESAMPLE}
WHERE date = '${iteration.month}'
AND client = '${iteration.client}') AS requests
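  -- Site rank is taken from the previous month's CrUX release,
  -- joined on the root page origin.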
LEFT JOIN (
SELECT DISTINCT
CONCAT(origin, '/') AS page,
experimental.popularity.rank AS rank
FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")}
WHERE yyyymm = ${constants.fn_past_month(iteration.month).substring(0, 7).replace('-', '')}
) AS crux
ON requests.root_page = crux.page;
`)
});
28 changes: 14 additions & 14 deletions definitions/output/core_web_vitals/technologies.js
@@ -24,7 +24,7 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64)
 );
 
 CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING)
-RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
+RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
 LANGUAGE js AS '''
 try {
   const $ = JSON.parse(categories);
@@ -74,23 +74,23 @@ crux AS (
     END AS rank,
     CONCAT(origin, '/') AS root_page_url,
     IF(device = 'desktop', 'desktop', 'mobile') AS client,
 
     # CWV
     IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
     IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
     IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
     IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
     IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
     IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,
 
     (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
     IS_GOOD(small_cls, medium_cls, large_cls) AND
     IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024,
 
     (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND
     IS_GOOD(small_cls, medium_cls, large_cls) AND
     IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023,
 
     # WV
     IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
     IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
@@ -114,7 +114,7 @@ technologies AS (
     ${ctx.resolve("all", "pages")},
     UNNEST(technologies) AS technology
   WHERE
-    date = '${past_month}' AND
+    date = '${past_month}' ${constants.dev_rank5000_filter} AND
     technology.technology IS NOT NULL AND
     technology.technology != ''
 UNION ALL
@@ -125,7 +125,7 @@ UNION ALL
   FROM
     ${ctx.resolve("all", "pages")}
   WHERE
-    date = '${past_month}'
+    date = '${past_month}' ${constants.dev_rank5000_filter}
 ),
 
 categories AS (
@@ -137,7 +137,7 @@ categories AS (
     UNNEST(technologies) AS technology,
     UNNEST(technology.categories) AS category
   WHERE
-    date = '${past_month}'
+    date = '${past_month}' ${constants.dev_rank5000_filter}
   GROUP BY
     app
 UNION ALL
@@ -149,7 +149,7 @@ UNION ALL
     UNNEST(technologies) AS technology,
     UNNEST(technology.categories) AS category
   WHERE
-    date = '${past_month}' AND
+    date = '${past_month}' ${constants.dev_rank5000_filter} AND
     client = 'mobile'
 ),
@@ -165,7 +165,7 @@ summary_stats AS (
   FROM
     ${ctx.resolve("all", "pages")}
   WHERE
-    date = '${past_month}'
+    date = '${past_month}' ${constants.dev_rank5000_filter}
 ),
 
 lab_data AS (
@@ -206,7 +206,7 @@ SELECT
   app,
   client,
   COUNT(0) AS origins,
 
   # CrUX data
   COUNTIF(good_fid) AS origins_with_good_fid,
   COUNTIF(good_cls) AS origins_with_good_cls,
@@ -227,19 +227,19 @@ SELECT
   SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv,
   SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024,
   SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023,
 
   # Lighthouse data
   APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS median_lighthouse_score_accessibility,
   APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS median_lighthouse_score_best_practices,
   APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS median_lighthouse_score_performance,
   APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS median_lighthouse_score_pwa,
   APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS median_lighthouse_score_seo,
 
   # Page weight stats
   APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS median_bytes_total,
   APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS median_bytes_js,
   APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS median_bytes_image
 FROM
   lab_data
 JOIN
8 changes: 7 additions & 1 deletion definitions/sources/declares.js
@@ -16,11 +16,17 @@ for (const table of crux_tables) {
   });
 
   assert(`${table}_not_empty`).query(ctx => `
-    SELECT
+    SELECT
       'No data for the specified date' AS error_message
     FROM ${ctx.ref("chrome-ux-report", "materialized", table)}
     WHERE yyyymm = ${past_month}
     GROUP BY yyyymm
     HAVING COUNT(1) = 0
   `);
 }
+
+declare({
+  database: "chrome-ux-report",
+  schema: "experimental",
+  name: "global",
+});
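
The new `declare` registers `chrome-ux-report.experimental.global` as a Dataform source, which is what lets `reprocess_requests.js` above reference it via `ctx.resolve("chrome-ux-report", "experimental", "global")` instead of a hard-coded table path.
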
12 changes: 0 additions & 12 deletions src/dataform.js
@@ -14,18 +14,6 @@ async function get_compilation_results(repoURI) {
     compilationResult: {
       releaseConfig: `${repoURI}/releaseConfigs/production`
     }
-  }, dev_request = {
-    parent: repoURI,
-    compilationResult: {
-      gitCommitish: 'dev'
-    },
-    codeCompilationConfig: {
-      schemaSuffix: 'dev',
-      tablePrefix: 'dev',
-      vars: {
-        current_month: '2024-08-01',
-      },
-    }
   };
 
   console.log(`Creating Dataform compilation result: ${JSON.stringify(request, null, 2)}`);
