Commit: Stable all.requests (#5)
* pages and legacy lighthouse

* fix

* first test

* pages insert

* date var

* js rewrite

* dataset

* Initial commit

* init

* core_web_vitals

* clean graph, tested

* publish core_web_vitals.technologies

* Dev (#1)

* workspace init

* pages and legacy lighthouse

* fix

* first test

* pages insert

* date var

* js rewrite

* dataset

* core_web_vitals

* clean graph, tested

* publish core_web_vitals.technologies

* technologies partitioning

* past month date for cwv

* 8pm

* package-lock.json

* ignore full-refresh

* readme

* updated tags and example assert

* dependency assertions

* current month commented

* assert fix

* all tables publish

* incremental tables

* node script

* enable legacy

* missing package name

* table configs

* all.requests and all.parsed_css

* dev sampling vars

* sampling instead of rank

* readme upd

* dev hints

* dev sampling for tech report

* tech report workflow

* removed sampling

* dates flexibility

* fix

* formatting

* other legacy tables

* docs and dependencies

* comment

* Update definitions/output/pages.js

Co-authored-by: Barry Pollard <[email protected]>

* Update definitions/output/technologies.js

Co-authored-by: Barry Pollard <[email protected]>

* Update package.json

Co-authored-by: Barry Pollard <[email protected]>

* Update workflow_settings.yaml

Co-authored-by: Barry Pollard <[email protected]>

* format

* not dependent on all.pages

* migrated to function trigger

* cloud function

* readme update

* deployed function

* readme updates

* readme update

* init stable copies

* requests ready

* adjusted requests pipeline

* use release configs in prod

* readme update

* tags update

* dev sampling

* prune summary

* sorted

* false when target exists

* dev sampling

* newline

* trigger cleanup

* formatting

* forEach iteration

* create table with operate

* new test tables script

* tested

* merge

* JSON columns

* job per client

* native object pruning

* Update definitions/output/all/reprocess_requests.js

Co-authored-by: Barry Pollard <[email protected]>

---------

Co-authored-by: Barry Pollard <[email protected]>
max-ostapenko and tunetheweb authored Oct 7, 2024
1 parent 6640ffe commit 94718f9
Showing 6 changed files with 169 additions and 49 deletions.
6 changes: 5 additions & 1 deletion README.md
@@ -1,6 +1,10 @@
 # HTTP Archive BigQuery pipeline with Dataform
 
-## Tables
+This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery.
+
+## Pipelines
+
+The pipelines are run in the Dataform service in Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. The code in the `main` branch is used on each triggered pipeline run.
 
 ### Crawl tables in `all` dataset
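
The "migrated to function trigger" and "cloud function" commits above refer to this kick-off mechanism. As a rough sketch only (not the deployed function): a Cloud Function can compile the production release config and start a workflow invocation through the Dataform API, mirroring the request shape in src/dataform.js below. The repository path and function name here are placeholders.

// Hypothetical trigger sketch; repository path and function name are
// placeholders, not the project's real configuration.
const { DataformClient } = require('@google-cloud/dataform');

const client = new DataformClient();
const repoURI = 'projects/YOUR_PROJECT/locations/us-central1/repositories/crawl-data';

async function triggerProductionRun () {
  // Compile the code pinned by the production release config
  // (see "use release configs in prod" in the commit message above).
  const [compilation] = await client.createCompilationResult({
    parent: repoURI,
    compilationResult: {
      releaseConfig: `${repoURI}/releaseConfigs/production`
    }
  });

  // Execute the compiled graph.
  const [invocation] = await client.createWorkflowInvocation({
    parent: repoURI,
    workflowInvocation: {
      compilationResult: compilation.name
    }
  });
  console.log(`Started workflow invocation: ${invocation.name}`);
}

module.exports = { triggerProductionRun };
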
44 changes: 23 additions & 21 deletions definitions/extra/test_env.js
@@ -1,26 +1,28 @@
-const two_months_ago = constants.fn_past_month(constants.fn_past_month(constants.current_month));
+const date = constants.fn_past_month(constants.current_month);
 
-operate("test_env", {
-  hasOutput: true,
-  disabled: true // MUST NOT be commented in main branch
-}).queries(ctx => `
-CREATE OR REPLACE TABLE ${ctx.ref("all", "pages")} AS
-SELECT *
-FROM httparchive.all.pages ${constants.dev_TABLESAMPLE}
-WHERE date = '${two_months_ago}';
+var resources_list = [
+  //{datasetId: "all", tableId: "pages"},
+  {datasetId: "all", tableId: "requests"},
+  //{datasetId: "all", tableId: "parsed_css"},
+  //{datasetId: "core_web_vitals", tableId: "technologies"},
+];
 
-CREATE OR REPLACE TABLE ${ctx.ref("all", "requests")} AS
-SELECT *
-FROM httparchive.all.requests ${constants.dev_TABLESAMPLE}
-WHERE date = '${two_months_ago}';
+resources_list.forEach(resource => {
+  operate(`test_table ${resource.datasetId}_${resource.tableId}`, {
+    disabled: !constants.is_dev_env // enabled when workflow variable env_name = "dev"
+  }).tags([
+    "test_tables"
+  ]).queries(ctx => `
+CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev;
 
-CREATE OR REPLACE TABLE ${ctx.ref("all", "parsed_css")} AS
-SELECT *
-FROM httparchive.all.parsed_css ${constants.dev_TABLESAMPLE}
-WHERE date = '${two_months_ago}';
+DROP TABLE ${resource.datasetId}_dev.dev_${resource.tableId};
+CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId}
+LIKE httparchive.${resource.datasetId}.${resource.tableId};
 
-CREATE OR REPLACE TABLE ${ctx.ref("core_web_vitals", "technologies")} AS
+INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId}
 SELECT *
-FROM httparchive.core_web_vitals.technologies
-WHERE date = '${two_months_ago}'
-`)
+FROM httparchive.${resource.datasetId}.${resource.tableId} ${constants.dev_TABLESAMPLE}
+WHERE date = '${date}'
+`);
+})
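
These `test_table` operations stay disabled unless the workflow variable `env_name` is set to `dev`, so a dev run materializes sampled copies such as `all_dev.dev_requests` from the production tables without touching them.
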
120 changes: 120 additions & 0 deletions definitions/output/all/reprocess_requests.js
@@ -0,0 +1,120 @@
operate(`all_requests_stable_pre`).tags(
["all_requests_stable"]
).queries(`
CREATE SCHEMA IF NOT EXISTS all_dev;
DROP TABLE IF EXISTS \`all_dev.requests_stable\`;
CREATE TABLE \`all_dev.requests_stable\`
(
date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"),
client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"),
page STRING NOT NULL OPTIONS(description="The URL of the page being tested"),
is_root_page BOOL OPTIONS(description="Whether the page is the root of the origin."),
root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested"),
rank INT64 OPTIONS(description="Site popularity rank, from CrUX"),
url STRING NOT NULL OPTIONS(description="The URL of the request"),
is_main_document BOOL NOT NULL OPTIONS(description="Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects"),
type STRING OPTIONS(description="Simplified description of the type of resource (script, html, css, text, other, etc)"),
index INT64 OPTIONS(description="The sequential 0-based index of the request"),
payload JSON OPTIONS(description="JSON-encoded WebPageTest result data for this request"),
summary JSON OPTIONS(description="JSON-encoded summarization of request data"),
request_headers ARRAY<STRUCT<
name STRING OPTIONS(description="Request header name"),
value STRING OPTIONS(description="Request header value")
>> OPTIONS(description="Request headers"),
response_headers ARRAY<STRUCT<
name STRING OPTIONS(description="Response header name"),
value STRING OPTIONS(description="Response header value")
>> OPTIONS(description="Response headers"),
response_body STRING OPTIONS(description="Text-based response body")
)
PARTITION BY date
CLUSTER BY client, is_root_page, type, rank
OPTIONS(
require_partition_filter=true
);
`);

const iterations = [];
const clients = constants.clients;

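// Build the work list: one entry per (month, client) pair, newest
// crawl month first, down to the cut-off date in the loop condition.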
for (
let month = constants.current_month;
month >= '2024-09-01'; // 2022-07-01
month = constants.fn_past_month(month)) {
clients.forEach((client) => {
iterations.push({
month: month,
client: client
})
})
}

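// Each operation depends on its predecessor, so the monthly INSERTs run
// strictly one at a time; the first waits for the table creation above.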
iterations.forEach((iteration, i) => {
operate(`all_requests_stable ${iteration.month} ${iteration.client}`).tags(
["all_requests_stable"]
).dependencies([
i===0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i-1].month} ${iterations[i-1].client}`
]).queries(ctx => `
INSERT INTO \`all_dev.requests_stable\`
SELECT
requests.date,
requests.client,
requests.page,
requests.is_root_page,
requests.root_page,
crux.rank,
requests.url,
requests.is_main_document,
requests.type,
requests.index,
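    -- Parse the legacy JSON text columns, pruning fields that duplicate
    -- dedicated columns in this table (headers, URL, date, etc.).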
JSON_REMOVE(
SAFE.PARSE_JSON(payload, wide_number_mode => 'round'),
'$._headers'
) AS payload,
JSON_REMOVE(
SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'),
'$.firstHtml',
'$.firstReq',
'$.req_accept_encoding',
'$.req_accept_language',
'$.req_accept',
'$.req_if_modified_since',
'$.req_if_none_match',
'$.req_referer',
'$.req_user_agent',
'$.reqOtherHeaders',
'$.requestid',
'$.resp_age',
'$.resp_cache_control',
'$.resp_content_length',
'$.resp_content_type',
'$.resp_date',
'$.resp_etag',
'$.resp_last_modified',
'$.resp_server',
'$.resp_vary',
'$.respOtherHeaders',
'$.startedDateTime',
'$.url',
'$.urlShort'
) as summary,
requests.request_headers,
requests.response_headers,
requests.response_body
FROM (
SELECT *
FROM \`all.requests\` ${constants.dev_TABLESAMPLE}
WHERE date = '${iteration.month}'
AND client = '${iteration.client}') AS requests
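  -- Site rank is taken from the previous month's CrUX release,
  -- joined on the root page origin.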
LEFT JOIN (
SELECT DISTINCT
CONCAT(origin, '/') AS page,
experimental.popularity.rank AS rank
FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")}
WHERE yyyymm = ${constants.fn_past_month(iteration.month).substring(0, 7).replace('-', '')}
) AS crux
ON requests.root_page = crux.page;
`)
});
28 changes: 14 additions & 14 deletions definitions/output/core_web_vitals/technologies.js
@@ -24,7 +24,7 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64)
 );
 
 CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING)
-RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
+RETURNS STRUCT<accessibility NUMERIC, best_practices NUMERIC, performance NUMERIC, pwa NUMERIC, seo NUMERIC>
 LANGUAGE js AS '''
 try {
   const $ = JSON.parse(categories);
@@ -74,23 +74,23 @@ crux AS (
     END AS rank,
     CONCAT(origin, '/') AS root_page_url,
     IF(device = 'desktop', 'desktop', 'mobile') AS client,
 
     # CWV
     IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
     IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
     IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
     IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
     IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
     IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,
 
     (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
     IS_GOOD(small_cls, medium_cls, large_cls) AND
     IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024,
 
     (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND
     IS_GOOD(small_cls, medium_cls, large_cls) AND
     IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023,
 
     # WV
     IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
     IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
@@ -114,7 +114,7 @@ technologies AS (
     ${ctx.resolve("all", "pages")},
     UNNEST(technologies) AS technology
   WHERE
-    date = '${past_month}' AND
+    date = '${past_month}' ${constants.dev_rank5000_filter} AND
     technology.technology IS NOT NULL AND
     technology.technology != ''
 UNION ALL
@@ -125,7 +125,7 @@ UNION ALL
   FROM
     ${ctx.resolve("all", "pages")}
   WHERE
-    date = '${past_month}'
+    date = '${past_month}' ${constants.dev_rank5000_filter}
 ),
 
 categories AS (
@@ -137,7 +137,7 @@ categories AS (
     UNNEST(technologies) AS technology,
     UNNEST(technology.categories) AS category
   WHERE
-    date = '${past_month}'
+    date = '${past_month}' ${constants.dev_rank5000_filter}
   GROUP BY
     app
 UNION ALL
@@ -149,7 +149,7 @@ UNION ALL
     UNNEST(technologies) AS technology,
     UNNEST(technology.categories) AS category
   WHERE
-    date = '${past_month}' AND
+    date = '${past_month}' ${constants.dev_rank5000_filter} AND
     client = 'mobile'
 ),
@@ -165,7 +165,7 @@ summary_stats AS (
   FROM
     ${ctx.resolve("all", "pages")}
   WHERE
-    date = '${past_month}'
+    date = '${past_month}' ${constants.dev_rank5000_filter}
 ),
 
 lab_data AS (
@@ -206,7 +206,7 @@ SELECT
   app,
   client,
   COUNT(0) AS origins,
 
   # CrUX data
   COUNTIF(good_fid) AS origins_with_good_fid,
   COUNTIF(good_cls) AS origins_with_good_cls,
@@ -227,19 +227,19 @@ SELECT
   SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv,
   SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024,
   SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023,
 
   # Lighthouse data
   APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS median_lighthouse_score_accessibility,
   APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS median_lighthouse_score_best_practices,
   APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS median_lighthouse_score_performance,
   APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS median_lighthouse_score_pwa,
   APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS median_lighthouse_score_seo,
 
   # Page weight stats
   APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS median_bytes_total,
   APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS median_bytes_js,
   APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS median_bytes_image
 FROM
   lab_data
 JOIN
8 changes: 7 additions & 1 deletion definitions/sources/declares.js
@@ -16,11 +16,17 @@ for (const table of crux_tables) {
   });
 
   assert(`${table}_not_empty`).query(ctx => `
-    SELECT
+    SELECT
       'No data for the specified date' AS error_message
     FROM ${ctx.ref("chrome-ux-report", "materialized", table)}
     WHERE yyyymm = ${past_month}
     GROUP BY yyyymm
     HAVING COUNT(1) = 0
   `);
 }
+
+declare({
+  database: "chrome-ux-report",
+  schema: "experimental",
+  name: "global",
+});
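
The new `declare` registers `chrome-ux-report.experimental.global` as a Dataform source, which is what lets `reprocess_requests.js` above reference it via `ctx.resolve("chrome-ux-report", "experimental", "global")` instead of a hard-coded table path.
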
12 changes: 0 additions & 12 deletions src/dataform.js
@@ -14,18 +14,6 @@ async function get_compilation_results(repoURI) {
     compilationResult: {
       releaseConfig: `${repoURI}/releaseConfigs/production`
     }
-  }, dev_request = {
-    parent: repoURI,
-    compilationResult: {
-      gitCommitish: 'dev'
-    },
-    codeCompilationConfig: {
-      schemaSuffix: 'dev',
-      tablePrefix: 'dev',
-      vars: {
-        current_month: '2024-08-01',
-      },
-    }
   };
 
   console.log(`Creating Dataform compilation result: ${JSON.stringify(request, null, 2)}`);
