From 38d9f01bb44b82c32862b813d89d39945aa4f3c6 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 1 Nov 2024 10:13:25 +0100 Subject: [PATCH] Backfills & deprecating legacy tables (#10) * deprecated * backfill draft * cleanup * null placeholders * sql fix * fix month range * literal table names * backfill tested * dates reset * requests_summary * requests backfill for mid month * remove legacy pipelines * checked against new schema * adjusted to a new schema * backfill_pages * legacy removed * remove legacy datasets * metrics sorted * parse features * lint * jscpd off * update js variable names * other cm format * pages completed * summary_pages completed * without other headers * fix * fix * fix * actual reprocessing queries * fix * requests complete * fix casts * wptid from summary * Update definitions/output/all/backfill_requests.js * summary update * only valid other headers * move tables * fix json parsing * fix summary metrics * crawl pipeline updated * update dependents * response_bodies adjustment * lint --- .github/workflows/linter.yaml | 1 + README.md | 13 +- definitions/extra/test_env.js | 3 +- definitions/output/all/pages.js | 2 +- definitions/output/all/parsed_css.js | 21 +- definitions/output/all/reprocess_pages.js | 274 ---------- definitions/output/all/reprocess_requests.js | 141 ------ definitions/output/all/requests.js | 2 +- definitions/output/blink_features/features.js | 33 +- definitions/output/blink_features/usage.js | 2 +- .../output/core_web_vitals/technologies.js | 55 +- definitions/output/crawl/backfill_pages.js | 336 +++++++++++++ definitions/output/crawl/backfill_requests.js | 311 ++++++++++++ .../output/crawl/backfill_summary_pages.js | 204 ++++++++ .../output/crawl/backfill_summary_requests.js | 242 +++++++++ definitions/output/crawl/pages.js | 468 ++++++++++++++++++ definitions/output/crawl/parsed_css.js | 32 ++ definitions/output/crawl/reprocess_pages.js | 223 +++++++++ 
.../output/crawl/reprocess_requests.js | 99 ++++ definitions/output/crawl/requests.js | 162 ++++++ definitions/output/sample_data/pages_10k.js | 4 +- .../output/sample_data/parsed_css_10k.js | 4 +- .../output/sample_data/requests_10k.js | 10 +- definitions/sources/chrome-ux-report.js | 4 +- package.json | 6 +- src/index.js | 2 +- 26 files changed, 2130 insertions(+), 524 deletions(-) delete mode 100644 definitions/output/all/reprocess_pages.js delete mode 100644 definitions/output/all/reprocess_requests.js create mode 100644 definitions/output/crawl/backfill_pages.js create mode 100644 definitions/output/crawl/backfill_requests.js create mode 100644 definitions/output/crawl/backfill_summary_pages.js create mode 100644 definitions/output/crawl/backfill_summary_requests.js create mode 100644 definitions/output/crawl/pages.js create mode 100644 definitions/output/crawl/parsed_css.js create mode 100644 definitions/output/crawl/reprocess_pages.js create mode 100644 definitions/output/crawl/reprocess_requests.js create mode 100644 definitions/output/crawl/requests.js diff --git a/.github/workflows/linter.yaml b/.github/workflows/linter.yaml index b7ea4d5..60c9064 100644 --- a/.github/workflows/linter.yaml +++ b/.github/workflows/linter.yaml @@ -30,5 +30,6 @@ jobs: env: DEFAULT_BRANCH: main GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VALIDATE_JSCPD: false VALIDATE_JAVASCRIPT_PRETTIER: false VALIDATE_MARKDOWN_PRETTIER: false diff --git a/README.md b/README.md index 04cc1f9..abfb0cc 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,11 @@ The pipelines are run in Dataform service in Google Cloud Platform (GCP) and are ### Crawl results -Tag: `crawl_results_all` +Tag: `crawl_complete` -- httparchive.all.pages -- httparchive.all.parsed_css -- httparchive.all.requests +- httparchive.crawl.pages +- httparchive.crawl.parsed_css +- httparchive.crawl.requests ### Core Web Vitals Technology Report @@ -39,6 +39,9 @@ Consumers: Tag: `crawl_results_legacy` +- httparchive.all.pages +- 
httparchive.all.parsed_css +- httparchive.all.requests - httparchive.lighthouse.YYYY_MM_DD_client - httparchive.pages.YYYY_MM_DD_client - httparchive.requests.YYYY_MM_DD_client @@ -51,7 +54,7 @@ Tag: `crawl_results_legacy` 1. [crawl-complete](https://console.cloud.google.com/cloudpubsub/subscription/detail/dataformTrigger?authuser=7&project=httparchive) PubSub subscription - Tags: ["crawl_results_all", "blink_features_report", "crawl_results_legacy"] + Tags: ["crawl_complete", "blink_features_report", "crawl_results_legacy"] 2. [bq-poller-cwv-tech-report](https://console.cloud.google.com/cloudscheduler/jobs/edit/us-east4/bq-poller-cwv-tech-report?authuser=7&project=httparchive) Scheduler diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js index f804bfe..c9c4b03 100644 --- a/definitions/extra/test_env.js +++ b/definitions/extra/test_env.js @@ -1,4 +1,5 @@ const date = constants.currentMonth +operate('test') // List of resources to be copied to the test environment. Comment out the ones you don't need. 
const resourcesList = [ @@ -15,7 +16,7 @@ const resourcesList = [ resourcesList.forEach(resource => { operate( `test_table ${resource.datasetId}_dev_dev_${resource.tableId}` - ).queries(` + ).dependencies(['test']).queries(` CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev; DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId}; diff --git a/definitions/output/all/pages.js b/definitions/output/all/pages.js index 308c958..e5b3490 100644 --- a/definitions/output/all/pages.js +++ b/definitions/output/all/pages.js @@ -7,7 +7,7 @@ publish('pages', { clusterBy: ['client', 'is_root_page', 'rank'], requirePartitionFilter: true }, - tags: ['crawl_results_all'] + tags: ['crawl_results_legacy'] }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE date = '${constants.currentMonth}'; diff --git a/definitions/output/all/parsed_css.js b/definitions/output/all/parsed_css.js index 78ffcb4..fcf9f48 100644 --- a/definitions/output/all/parsed_css.js +++ b/definitions/output/all/parsed_css.js @@ -7,21 +7,10 @@ publish('parsed_css', { clusterBy: ['client', 'is_root_page', 'rank', 'page'], requirePartitionFilter: true }, - tags: ['crawl_results_all'] + tags: ['crawl_results_legacy'] }).preOps(ctx => ` -DELETE FROM ${ctx.self()} -WHERE date = '${constants.currentMonth}'; -`).query(ctx => ` -SELECT * -FROM ${ctx.ref('crawl_staging', 'parsed_css')} -WHERE date = '${constants.currentMonth}' - AND client = 'desktop' - ${constants.devRankFilter} -`).postOps(ctx => ` -INSERT INTO ${ctx.self()} -SELECT * -FROM ${ctx.ref('crawl_staging', 'parsed_css')} -WHERE date = '${constants.currentMonth}' - AND client = 'mobile' - ${constants.devRankFilter}; +DROP SNAPSHOT TABLE IF EXISTS ${ctx.self()}; + +CREATE SNAPSHOT TABLE ${ctx.self()} +CLONE ${ctx.ref('crawl', 'parsed_css')}; `) diff --git a/definitions/output/all/reprocess_pages.js b/definitions/output/all/reprocess_pages.js deleted file mode 100644 index 22f65f8..0000000 --- a/definitions/output/all/reprocess_pages.js +++ 
/dev/null @@ -1,274 +0,0 @@ -operate('all_pages_stable_pre').tags( - ['all_pages_stable'] -).queries(` -CREATE SCHEMA IF NOT EXISTS all_dev; - --- DROP TABLE IF EXISTS \`all_dev.pages_stable\`; - -CREATE TABLE IF NOT EXISTS \`all_dev.pages_stable\` -( - date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), - client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), - page STRING NOT NULL OPTIONS(description='The URL of the page being tested'), - is_root_page BOOL NOT NULL OPTIONS(description='Whether the page is the root of the origin'), - root_page STRING NOT NULL OPTIONS(description='The URL of the root page being tested, the origin followed by /'), - rank INT64 OPTIONS(description='Site popularity rank, from CrUX'), - wptid STRING OPTIONS(description='ID of the WebPageTest results'), - payload JSON OPTIONS(description='JSON-encoded WebPageTest results for the page'), - summary JSON OPTIONS(description='JSON-encoded summarization of the page-level data'), - custom_metrics STRUCT< - a11y JSON, - cms JSON, - cookies JSON, - css_variables JSON, - ecommerce JSON, - element_count JSON, - javascript JSON, - markup JSON, - media JSON, - origin_trials JSON, - performance JSON, - privacy JSON, - responsive_images JSON, - robots_txt JSON, - security JSON, - structured_data JSON, - third_parties JSON, - well_known JSON, - wpt_bodies JSON, - other JSON - > OPTIONS(description='Custom metrics from WebPageTest'), - lighthouse JSON OPTIONS(description='JSON-encoded Lighthouse report'), - features ARRAY> OPTIONS(description='Blink features detected at runtime (see https://chromestatus.com/features)'), - technologies ARRAY OPTIONS(description='List of categories to which this technology belongs'), - info ARRAY OPTIONS(description='Additional metadata about the detected technology, ie version number') - >> OPTIONS(description='Technologies detected at runtime (see https://www.wappalyzer.com/)'), - metadata JSON 
OPTIONS(description='Additional metadata about the test') -) -PARTITION BY date -CLUSTER BY client, is_root_page, rank, page -OPTIONS( - require_partition_filter=true -); -`) - -const iterations = [] -const clients = constants.clients - -// From 2022-07-01 till today -for ( - let month = constants.currentMonth; month >= '2024-09-01'; month = constants.fnPastMonth(month)) { - clients.forEach((client) => { - iterations.push({ - month, - client - }) - }) -} - -iterations.forEach((iteration, i) => { - operate(`all_pages_stable_update ${iteration.month} ${iteration.client}`).tags([ - 'all_pages_stable' - ]).dependencies([ - i === 0 ? 'all_pages_stable_pre' : `all_pages_stable_update ${iterations[i - 1].month} ${iterations[i - 1].client}` - ]).queries(ctx => ` -DELETE FROM \`all_dev.pages_stable\` -WHERE date = '${iteration.month}' AND - client = '${iteration.client}'; - -INSERT INTO \`all_dev.pages_stable\` -SELECT - date, - client, - page, - is_root_page, - root_page, - rank, - wptid, - JSON_REMOVE( - SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), - '$._metadata', - '$._detected', - '$._detected_apps', - '$._detected_technologies', - '$._detected_raw', - '$._custom', - '$._00_reset', - '$._a11y', - '$._ads', - '$._almanac', - '$._aurora', - '$._avg_dom_depth', - '$._cms', - '$._Colordepth', - '$._cookies', - '$._crawl_links', - '$._css-variables', - '$._css', - '$._doctype', - '$._document_height', - '$._document_width', - '$._Dpi', - '$._ecommerce', - '$._element_count', - '$._event-names', - '$._fugu-apis', - '$._generated-content', - '$._has_shadow_root', - '$._Images', - '$._img-loading-attr', - '$._initiators', - '$._inline_style_bytes', - '$._javascript', - '$._lib-detector-version', - '$._localstorage_size', - '$._markup', - '$._media', - '$._meta_viewport', - '$._num_iframes', - '$._num_scripts_async', - '$._num_scripts_sync', - '$._num_scripts', - '$._observers', - '$._origin-trials', - '$._parsed_css', - '$._performance', - '$._privacy-sandbox', - 
'$._privacy', - '$._pwa', - '$._quirks_mode', - '$._Resolution', - '$._responsive_images', - '$._robots_meta', - '$._robots_txt', - '$._sass', - '$._security', - '$._sessionstorage_size', - '$._structured-data', - '$._third-parties', - '$._usertiming', - '$._valid-head', - '$._well-known', - '$._wpt_bodies', - '$._blinkFeatureFirstUsed', - '$._CrUX' - ) AS payload, - JSON_SET( - JSON_REMOVE( - SAFE.PARSE_JSON(summary, wide_number_mode => 'round'), - '$._adult_site', - '$.archive', - '$.avg_dom_depth', - '$.crawlid', - '$.createDate', - '$.doctype', - '$.document_height', - '$.document_width', - '$.label', - '$.localstorage_size', - '$.meta_viewport', - '$.metadata', - '$.num_iframes', - '$.num_scripts_async', - '$.num_scripts_sync', - '$.num_scripts', - '$.pageid', - '$.PageSpeed', - '$.rank', - '$.sessionstorage_size', - '$.startedDateTime', - '$.url', - '$.urlhash', - '$.urlShort', - '$.usertiming', - '$.wptid', - '$.wptrun' - ), - '$.crux', - JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), '$._CrUX') - ) AS summary, - STRUCT< - a11y JSON, - cms JSON, - cookies JSON, - css_variables JSON, - ecommerce JSON, - element_count JSON, - javascript JSON, - markup JSON, - media JSON, - origin_trials JSON, - performance JSON, - privacy JSON, - responsive_images JSON, - robots_txt JSON, - security JSON, - structured_data JSON, - third_parties JSON, - well_known JSON, - wpt_bodies JSON, - other JSON - >( - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.a11y'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.cms'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.cookies'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.css-variables'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.ecommerce'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.element_count'), - 
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.javascript'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.markup'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.media'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.origin-trials'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.performance'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.privacy'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.responsive_images'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.robots_txt'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.security'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.structured-data'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.third-parties'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.well-known'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.wpt_bodies'), - JSON_REMOVE( - SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), - '$.a11y', - '$.cms', - '$.cookies', - '$.css-variables', - '$.ecommerce', - '$.element_count', - '$.javascript', - '$.markup', - '$.media', - '$.origin-trials', - '$.performance', - '$.privacy', - '$.responsive_images', - '$.robots_txt', - '$.security', - '$.structured-data', - '$.third-parties', - '$.well-known', - '$.wpt_bodies' - ) - ) AS custom_metrics, - SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, - features, - technologies, - JSON_REMOVE( - SAFE.PARSE_JSON(metadata, wide_number_mode => 'round'), - '$.page_id', - '$.parent_page_id', - '$.root_page_id' - ) AS metadata -FROM \`all.pages\` -WHERE - date = '${iteration.month}' AND - client = '${iteration.client}' 
${constants.devRankFilter}; - `) -}) diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js deleted file mode 100644 index 5c77204..0000000 --- a/definitions/output/all/reprocess_requests.js +++ /dev/null @@ -1,141 +0,0 @@ -operate('all_requests_stable_pre').tags( - ['all_requests_stable'] -).queries(` -CREATE SCHEMA IF NOT EXISTS all_dev; - --- DROP TABLE IF EXISTS \`all_dev.requests_stable\`; - -CREATE TABLE IF NOT EXISTS \`all_dev.requests_stable\` -( - date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), - client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), - page STRING NOT NULL OPTIONS(description='The URL of the page being tested'), - is_root_page BOOL OPTIONS(description='Whether the page is the root of the origin.'), - root_page STRING NOT NULL OPTIONS(description='The URL of the root page being tested'), - rank INT64 OPTIONS(description='Site popularity rank, from CrUX'), - url STRING NOT NULL OPTIONS(description='The URL of the request'), - is_main_document BOOL NOT NULL OPTIONS(description='Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects'), - type STRING OPTIONS(description='Simplified description of the type of resource (script, html, css, text, other, etc)'), - index INT64 OPTIONS(description='The sequential 0-based index of the request'), - payload JSON OPTIONS(description='JSON-encoded WebPageTest result data for this request'), - summary JSON OPTIONS(description='JSON-encoded summarization of request data'), - request_headers ARRAY> OPTIONS(description='Request headers'), - response_headers ARRAY> OPTIONS(description='Response headers'), - response_body STRING OPTIONS(description='Text-based response body') -) -PARTITION BY date -CLUSTER BY client, is_root_page, type, rank -OPTIONS( - require_partition_filter=true -); -`) - -const iterations = [] -const 
types = ['= "script"', '= "image"', 'NOT IN ("script", "image")'] - -// From 2022-07-01 till today -for ( - let month = constants.currentMonth; month >= '2024-09-01'; month = constants.fnPastMonth(month)) { - constants.clients.forEach((client) => { - constants.booleans.forEach((isRootPage) => { - types.forEach((type) => { - iterations.push({ - month, - client, - isRootPage, - type - }) - }) - }) - }) -} - -iterations.forEach((iteration, i) => { - operate(`all_requests_stable ${iteration.month} ${iteration.client} ${iteration.isRootPage} ${i}`).tags( - ['all_requests_stable'] - ).dependencies([ - i === 0 ? 'all_requests_stable_pre' : `all_requests_stable ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage} ${i - 1}` - ]).queries(ctx => ` -DELETE FROM \`all_dev.requests_stable\` -WHERE date = '${iteration.month}' - AND client = '${iteration.client}' - AND is_root_page = ${iteration.isRootPage} - AND type ${iteration.type}; - -CREATE TEMP FUNCTION PRUNE_HEADERS( - jsonObject JSON -) RETURNS JSON -LANGUAGE js AS ''' -try { - for (const [key, value] of Object.entries(jsonObject)) { - if(key.startsWith('req_') || key.startsWith('resp_')) { - delete jsonObject[key]; - } - } - return jsonObject; -} catch (e) { - return null; -} -'''; - -INSERT INTO \`all_dev.requests_stable\` -SELECT - requests.date, - requests.client, - requests.page, - requests.is_root_page, - requests.root_page, - crux.rank, - requests.url, - requests.is_main_document, - requests.type, - requests.index, - JSON_REMOVE( - SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), - '$._headers', - '$.request.headers', - '$.response.headers' - ) AS payload, - PRUNE_HEADERS( - JSON_REMOVE( - SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'), - '$.crawlid', - '$.firstHtml', - '$.firstReq', - '$.pageid', - '$.reqOtherHeaders', - '$.requestid', - '$.respOtherHeaders', - '$.startedDateTime', - '$.type', - '$.url', - '$.urlShort' - ) - ) as summary, - 
requests.request_headers, - requests.response_headers, - requests.response_body -FROM ( - SELECT * - FROM \`all.requests\` ${constants.devTABLESAMPLE} - WHERE date = '${iteration.month}' - AND client = '${iteration.client}' - AND is_root_page = ${iteration.isRootPage} - AND type ${iteration.type} -) AS requests -LEFT JOIN ( - SELECT DISTINCT - CONCAT(origin, '/') AS page, - experimental.popularity.rank AS rank - FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} - WHERE yyyymm = ${constants.fnPastMonth(iteration.month).substring(0, 7).replace('-', '')} -) AS crux -ON requests.root_page = crux.page; - `) -}) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index f91eada..9e50b0e 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -7,7 +7,7 @@ publish('requests', { clusterBy: ['client', 'is_root_page', 'is_main_document', 'type'], requirePartitionFilter: true }, - tags: ['crawl_results_all'] + tags: ['crawl_results_legacy'] }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE date = '${constants.currentMonth}'; diff --git a/definitions/output/blink_features/features.js b/definitions/output/blink_features/features.js index 260f2e4..f3b6d4b 100644 --- a/definitions/output/blink_features/features.js +++ b/definitions/output/blink_features/features.js @@ -10,37 +10,6 @@ publish('features', { }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE yyyymmdd = DATE '${constants.currentMonth}'; - -CREATE TEMP FUNCTION features(payload STRING) -RETURNS ARRAY> LANGUAGE js AS -''' -function getFeatureNames(featureMap, featureType) { - try { - return Object.entries(featureMap).map(([key, value]) => { - // After Feb 2020 keys are feature IDs. - if (value.name) { - return {'name': value.name, 'type': featureType, 'id': key}; - } - // Prior to Feb 2020 keys fell back to IDs if the name was unknown. 
- if (idPattern.test(key)) { - return {'name': '', 'type': featureType, 'id': key.match(idPattern)[1]}; - } - // Prior to Feb 2020 keys were names by default. - return {'name': key, 'type': featureType, 'id': ''}; - }); - } catch (e) { - return []; - } -} - -var $ = JSON.parse(payload); -if (!$._blinkFeatureFirstUsed) return []; - -var idPattern = new RegExp('^Feature_(\\\\d+)$'); -return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default') - .concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css')) - .concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); -'''; `).query(ctx => ` SELECT date AS yyyymmdd, @@ -58,7 +27,7 @@ FROM ( payload, rank, feature - FROM ${ctx.ref('all', 'pages')}, + FROM ${ctx.ref('crawl', 'pages')}, UNNEST(features) AS feature WHERE date = '${constants.currentMonth}' AND diff --git a/definitions/output/blink_features/usage.js b/definitions/output/blink_features/usage.js index 131bb14..0eb0878 100644 --- a/definitions/output/blink_features/usage.js +++ b/definitions/output/blink_features/usage.js @@ -42,7 +42,7 @@ JOIN ( date, client, COUNT(DISTINCT page) AS total_urls - FROM ${ctx.ref('all', 'pages')} + FROM ${ctx.ref('crawl', 'pages')} WHERE date = '${constants.currentMonth}' AND is_root_page = TRUE diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index 9502afd..7ec0e20 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -22,23 +22,6 @@ CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( good + needs_improvement + poor > 0 ); - -CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING) -RETURNS STRUCT -LANGUAGE js AS ''' -try { - const $ = JSON.parse(categories); - return { - accessibility: 
$.accessibility?.score, - best_practices: $['best-practices']?.score, - performance: $.performance?.score, - pwa: $.pwa?.score, - seo: $.seo?.score - }; -} catch (e) { - return {}; -} -'''; `).query(ctx => ` WITH geo_summary AS ( SELECT @@ -111,7 +94,7 @@ technologies AS ( client, page AS url FROM - ${ctx.resolve('all', 'pages')}, + ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology WHERE date = '${pastMonth}' @@ -124,7 +107,7 @@ UNION ALL client, page AS url FROM - ${ctx.resolve('all', 'pages')} + ${ctx.ref('crawl', 'pages')} WHERE date = '${pastMonth}' ${constants.devRankFilter} @@ -135,7 +118,7 @@ categories AS ( technology.technology AS app, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category FROM - ${ctx.resolve('all', 'pages')}, + ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE @@ -148,7 +131,7 @@ UNION ALL 'ALL' AS app, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category FROM - ${ctx.resolve('all', 'pages')}, + ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE @@ -162,12 +145,16 @@ summary_stats AS ( client, page AS url, root_page AS root_page_url, - CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) AS bytesTotal, - CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64) AS bytesJS, - CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64) AS bytesImg, - GET_LIGHTHOUSE_CATEGORY_SCORES(JSON_QUERY(lighthouse, '$.categories')) AS lighthouse_category + SAFE.INT64(summary.bytesTotal) AS bytesTotal, + SAFE.INT64(summary.bytesJS) AS bytesJS, + SAFE.INT64(summary.bytesImg) AS bytesImg, + SAFE.FLOAT64(lighthouse_category.accessibility.score) AS accessibility, + SAFE.FLOAT64(lighthouse_category['best-practices'].score) AS best_practices, + SAFE.FLOAT64(lighthouse_category.performance.score) AS performance, + SAFE.FLOAT64(lighthouse_category.pwa.score) AS 
pwa, + SAFE.FLOAT64(lighthouse_category.seo.score) AS seo FROM - ${ctx.resolve('all', 'pages')} + ${ctx.ref('crawl', 'pages')} WHERE date = '${pastMonth}' ${constants.devRankFilter} @@ -179,14 +166,14 @@ lab_data AS ( root_page_url, app, ANY_VALUE(category) AS category, - CAST(AVG(bytesTotal) AS INT64) AS bytesTotal, - CAST(AVG(bytesJS) AS INT64) AS bytesJS, - CAST(AVG(bytesImg) AS INT64) AS bytesImg, - CAST(AVG(lighthouse_category.accessibility) AS NUMERIC) AS accessibility, - CAST(AVG(lighthouse_category.best_practices) AS NUMERIC) AS best_practices, - CAST(AVG(lighthouse_category.performance) AS NUMERIC) AS performance, - CAST(AVG(lighthouse_category.pwa) AS NUMERIC) AS pwa, - CAST(AVG(lighthouse_category.seo) AS NUMERIC) AS seo + AVG(bytesTotal) AS bytesTotal, + AVG(bytesJS) AS bytesJS, + AVG(bytesImg) AS bytesImg, + AVG(accessibility) AS accessibility, + AVG(best_practices) AS best_practices, + AVG(performance) AS performance, + AVG(pwa) AS pwa, + AVG(seo) AS seo FROM summary_stats JOIN diff --git a/definitions/output/crawl/backfill_pages.js b/definitions/output/crawl/backfill_pages.js new file mode 100644 index 0000000..05d75e4 --- /dev/null +++ b/definitions/output/crawl/backfill_pages.js @@ -0,0 +1,336 @@ +const iterations = [] +const clients = constants.clients + +operate('backfill') + +let midMonth +for ( + let date = '2016-01-01'; + date >= '2016-01-01'; + date = constants.fnPastMonth(date) +) { + clients.forEach((client) => { + iterations.push({ + date, + client + }) + }) + + if (date <= '2018-12-01') { + midMonth = new Date(date) + midMonth.setDate(15) + + clients.forEach((client) => { + iterations.push({ + date: midMonth.toISOString().substring(0, 10), + client + }) + }) + } +} + +iterations.forEach((iteration, i) => { + operate(`backfill_pages ${iteration.date} ${iteration.client}`).tags([ + 'backfill_pages' + ]).dependencies([ + i === 0 ? 
'backfill' : `backfill_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` + ]).queries(ctx => ` +DELETE FROM crawl.pages +WHERE date = '${iteration.date}' + AND client = '${iteration.client}'; + +CREATE TEMPORARY FUNCTION getOtherCustomMetrics( + payload JSON, + keys ARRAY +) RETURNS JSON +LANGUAGE js AS """ +try { + let otherMetrics = {}; + let value = null; + keys.forEach(function (key) { + try { + value = JSON.parse(payload[key]) + } catch (e) { + value = payload[key] + } + otherMetrics[key.substr(1)] = value + }); + return otherMetrics; +} catch (e) { + return null; +} +"""; + +CREATE TEMP FUNCTION getFeatures(blinkFeatureFirstUsed JSON) +RETURNS ARRAY> +LANGUAGE js AS +''' + function getFeatureNames(featureMap, featureType) { + try { + return Object.entries(featureMap).map(([key, value]) => { + // After Feb 2020 keys are feature IDs. + if (value.name) { + return {'feature': value.name, 'type': featureType, 'id': key}; + } + // Prior to Feb 2020 keys fell back to IDs if the name was unknown. + if (idPattern.test(key)) { + return {'feature': '', 'type': featureType, 'id': key.match(idPattern)[1]}; + } + // Prior to Feb 2020 keys were names by default. 
+ return {'feature': key, 'type': featureType, 'id': ''}; + }); + } catch (e) { + return []; + } + } + + if (!blinkFeatureFirstUsed) return []; + + var idPattern = new RegExp('^Feature_(\\\\d+)$'); + return getFeatureNames(blinkFeatureFirstUsed.Features, 'default') + .concat(getFeatureNames(blinkFeatureFirstUsed.CSSFeatures, 'css')) + .concat(getFeatureNames(blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); +'''; + +INSERT INTO crawl.pages +SELECT + DATE('${iteration.date}') AS date, + '${iteration.client}' AS client, + pages.url AS page, + TRUE AS is_root_page, + pages.url AS root_page, + COALESCE( + crux.rank, + CASE + WHEN summary_pages.rank <= 1000 THEN 1000 + WHEN summary_pages.rank <= 5000 THEN 5000 + ELSE NULL + END + ) AS rank, + summary_pages.wptid, + JSON_REMOVE( + payload, + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + '$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', + '$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + '$._document_width', + '$._Dpi', + '$._ecommerce', + '$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + 
'$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + TO_JSON( STRUCT( + SpeedIndex, + TTFB, + _connections, + bytesAudio, + bytesCSS, + bytesFlash, + bytesFont, + bytesGif, + bytesHtml, + bytesHtmlDoc, + bytesImg, + bytesJpg, + bytesJS, + bytesJson, + bytesOther, + bytesPng, + bytesSvg, + bytesText, + bytesTotal, + bytesVideo, + bytesWebp, + bytesXml, + cdn, + payload._CrUX, + fullyLoaded, + gzipSavings, + gzipTotal, + maxDomainReqs, + maxage0, + maxage1, + maxage30, + maxage365, + maxageMore, + maxageNull, + numCompressed, + numDomElements, + numDomains, + numErrors, + numGlibs, + numHttps, + numRedirects, + onContentLoaded, + onLoad, + renderStart, + reqAudio, + reqCSS, + reqFlash, + reqFont, + reqGif, + reqHtml, + reqImg, + reqJpg, + reqJS, + reqJson, + reqOther, + reqPng, + reqSvg, + reqText, + reqTotal, + reqVideo, + reqWebp, + reqXml, + visualComplete + )) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._a11y"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._cms"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._cookies"), wide_number_mode => 'round'), + payload["_css-variables"], + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._ecommerce"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._element_count"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._javascript"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._markup"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._media"), 
wide_number_mode => 'round'), + payload["_origin-trials"], + payload._performance, + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._privacy"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._responsive_images"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._robots_txt"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._security"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._structured-data"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._third-parties"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._well-known"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._wpt_bodies"), wide_number_mode => 'round'), + getOtherCustomMetrics( + payload, + ["_Colordepth", "_Dpi", "_Images", "_Resolution", "_almanac", "_avg_dom_depth", "_css", "_doctype", "_document_height", "_document_width", "_event-names", "_fugu-apis", "_has_shadow_root", "_img-loading-attr", "_initiators", "_inline_style_bytes", "_lib-detector-version", "_localstorage_size", "_meta_viewport", "_num_iframes", "_num_scripts", "_num_scripts_async", "_num_scripts_sync", "_pwa", "_quirks_mode", "_sass", "_sessionstorage_size", "_usertiming"] + ) + ) AS custom_metrics, + NULL AS lighthouse, + getFeatures(payload._blinkFeatureFirstUsed) AS features, + tech.technologies AS technologies, + pages.payload._metadata AS metadata +FROM ( + SELECT + * EXCEPT(payload), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload + FROM \`pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client}\` ${constants.devTABLESAMPLE} +) AS pages + +LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS summary_pages ${constants.devTABLESAMPLE} +ON pages.url = summary_pages.url + +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM 
${ctx.resolve('chrome-ux-report', 'experimental', 'global')} + WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} +) AS crux +ON pages.url = crux.page + +LEFT JOIN ( + SELECT + page, + ARRAY_AGG(technology) AS technologies + FROM( + SELECT + url AS page, + STRUCT< + technology STRING, + categories ARRAY, + info ARRAY + >( + app, + ARRAY_AGG(category), + ARRAY_AGG(info) + ) AS technology + FROM technologies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} + GROUP BY page, app + ) + GROUP BY page +) AS tech +ON pages.url = tech.page; + `) +}) diff --git a/definitions/output/crawl/backfill_requests.js b/definitions/output/crawl/backfill_requests.js new file mode 100644 index 0000000..1be37ea --- /dev/null +++ b/definitions/output/crawl/backfill_requests.js @@ -0,0 +1,311 @@ +const iterations = [] +const clients = constants.clients + +let midMonth +for ( + let date = '2016-01-01'; + date >= '2016-01-01'; + date = constants.fnPastMonth(date) +) { + clients.forEach((client) => { + iterations.push({ + date, + client + }) + }) + + if (date <= '2018-12-01') { + midMonth = new Date(date) + midMonth.setUTCDate(15) // UTC setter: the date-only string parses as UTC midnight and toISOString() reads UTC, so a local-time setDate() could shift the day + + clients.forEach((client) => { + iterations.push({ + date: midMonth.toISOString().substring(0, 10), + client + }) + }) + } +} + +function getResponseBodiesColumnName (date) { + return date >= '2024-02-01' ? 'response_body' : 'body' +} + +iterations.forEach((iteration, i) => { + operate(`backfill_requests ${iteration.date} ${iteration.client}`).tags([ + 'backfill_requests' + ]).dependencies([ + i === 0 ?
'backfill' : `backfill_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` + ]).queries(ctx => ` +DELETE FROM crawl.requests +WHERE date = '${iteration.date}' + AND client = '${iteration.client}'; + +CREATE TEMP FUNCTION getExtFromURL(url STRING) +RETURNS STRING +LANGUAGE js AS """ +try { + let ret_ext = url; + + // Remove query parameters + const i_q = ret_ext.indexOf("?"); + if (i_q > -1) { + ret_ext = ret_ext.substring(0, i_q); + } + + // Get the last segment of the path after the last "/" + ret_ext = ret_ext.substring(ret_ext.lastIndexOf("/") + 1); + + // Find the position of the last dot + const i_dot = ret_ext.lastIndexOf("."); + + if (i_dot === -1) { + // No dot means no extension + ret_ext = ""; + } else { + // Extract the extension + ret_ext = ret_ext.substring(i_dot + 1); + + // Weed out overly long extensions + if (ret_ext.length > 5) { + ret_ext = ""; + } + } + + return ret_ext.toLowerCase(); +} catch (e) { + return ""; // Return an empty string in case of any errors +} +"""; + +CREATE TEMP FUNCTION prettyType(mimeTyp STRING, ext STRING) +RETURNS STRING +LANGUAGE js AS """ +try { + mimeTyp = mimeTyp.toLowerCase(); + + // Order by most unique first. + // Do NOT do html because "text/html" is often misused for other types. We catch it below. 
+ const types = ["font", "css", "image", "script", "video", "audio", "xml"]; + for (const typ of types) { + if (mimeTyp.includes(typ)) { + return typ; + } + } + + // Special cases found manually + if (ext === "js") { + return "script"; + } else if (mimeTyp.includes("json") || ext === "json") { + return "json"; + } else if (["eot", "ttf", "woff", "woff2", "otf"].includes(ext)) { + return "font"; + } else if (["png", "gif", "jpg", "jpeg", "webp", "ico", "svg", "avif", "jxl", "heic", "heif"].includes(ext)) { + return "image"; + } else if (ext === "css") { + return "css"; + } else if (ext === "xml") { + return "xml"; + } else if ( + ["flash", "webm", "mp4", "flv"].some((typ) => mimeTyp.includes(typ)) || + ["mp4", "webm", "ts", "m4v", "m4s", "mov", "ogv", "swf", "f4v", "flv"].includes(ext) + ) { + return "video"; + } else if (mimeTyp.includes("wasm") || ext === "wasm") { + return "wasm"; + } else if (mimeTyp.includes("html") || ["html", "htm"].includes(ext)) { + return "html"; // Catch "text/html" mime type + } else if (mimeTyp.includes("text")) { + return "text"; // Put "text" LAST because it's often misused, so ext should take precedence + } else { + return "other"; + } +} catch (e) { + return "other"; // Return "other" if there's any error +} +"""; + +CREATE TEMP FUNCTION getFormat(prettyTyp STRING, mimeTyp STRING, ext STRING) +RETURNS STRING +LANGUAGE js AS """ +try { + if (prettyTyp === "image") { + // Order by most popular first. + const imageTypes = ["jpg", "png", "gif", "webp", "svg", "ico", "avif", "jxl", "heic", "heif"]; + for (const typ of imageTypes) { + if (mimeTyp.includes(typ) || typ === ext) { + return typ; + } + } + if (mimeTyp.includes("jpeg")) { + return "jpg"; + } + } + + if (prettyTyp === "video") { + // Order by most popular first. 
+ const videoTypes = ["flash", "swf", "mp4", "flv", "f4v"]; + for (const typ of videoTypes) { + if (mimeTyp.includes(typ) || typ === ext) { + return typ; + } + } + } + + return ""; +} catch (e) { + return ""; +} +"""; + +CREATE TEMP FUNCTION parseHeaders(headers JSON) +RETURNS ARRAY> +LANGUAGE js AS """ + try { + return headers.map(header => { + return { name: header.name.toLowerCase(), value: header.value }; + }); + } catch (e) { + return []; + } +"""; + +CREATE TEMP FUNCTION getCookieLen(headers JSON, cookieName STRING) +RETURNS INT64 +LANGUAGE js AS """ + try { + // Header names are case-insensitive, so compare both sides in lower case + const wantedName = cookieName.toLowerCase() + const cookies = headers.filter(header => header.name.toLowerCase() === wantedName) + if (cookies.length === 0) { + return 0 // No header with that name on this request/response + } + // Sum the value lengths of every matching header (e.g. several set-cookie lines) + return cookies.reduce((acc, cookie) => acc + cookie.value.length, 0) + } catch (e) { + return 0; // Return 0 in case of any errors + } +"""; + +CREATE TEMP FUNCTION getExpAge(startedDateTime STRING, responseHeaders JSON) +RETURNS INT64 +LANGUAGE js AS """ + try { + const cacheControlRegExp = /max-age=(\\\\d+)/ + + // Get the Cache-Control header value + const cacheControl = responseHeaders.find(header => header.name.toLowerCase() === 'cache-control')?.value + + // Handle no-cache scenarios + if (cacheControl && (cacheControl.includes('must-revalidate') || cacheControl.includes('no-cache') || cacheControl.includes('no-store'))) { + return 0 + } else if (cacheControl && cacheControlRegExp.test(cacheControl)) { // Handle max-age directive in Cache-Control header + const maxAgeValue = parseInt(cacheControlRegExp.exec(cacheControl)[1]) + return Math.min(2 ** 63 - 1, maxAgeValue) + } else if ( // Handle Expires header in the response + responseHeaders.find(header => header.name.toLowerCase() === 'expires') + ) { + const respDate = responseHeaders.find(header => header.name.toLowerCase() === 'date')?.value + const startDate = new Date(respDate)?.getTime() || Date.parse(startedDateTime) + + const expDate =
responseHeaders.find(header => header.name.toLowerCase() === 'expires')?.value + const endDate = new Date(expDate)?.getTime() || 0 + + return Math.max((endDate - startDate) / 1000, 0) + } + + return 0 + } catch (e) { + return 0 // Return 0 in case of any errors + } +"""; + +INSERT INTO crawl.requests +SELECT + DATE('${iteration.date}') AS date, + '${iteration.client}' AS client, + requests.page AS page, + TRUE AS is_root_page, + requests.page AS root_page, + COALESCE( + crux.rank, + CASE + WHEN summary_pages.rank <= 1000 THEN 1000 + WHEN summary_pages.rank <= 5000 THEN 5000 + ELSE NULL + END + ) AS rank, + requests.url AS url, + IF( + STRING(payload._request_type) = "Document" AND + MIN(index) OVER (PARTITION BY requests.page) = index, + TRUE, + FALSE + ) AS is_main_document, + type, + index, + payload, + TO_JSON( STRUCT( + payload.time, + payload._method AS method, + response.url AS redirectUrl, + IFNULL(STRING(payload._protocol), STRING(request.httpVersion)) AS reqHttpVersion, + request.headersSize AS reqHeadersSize, + request.bodySize AS reqBodySize, + getCookieLen(request.headers, 'cookie') AS reqCookieLen, + response.status, + response.httpVersion AS respHttpVersion, + response.headersSize AS respHeadersSize, + response.bodySize AS respBodySize, + response.content.size AS respSize, + getCookieLen(response.headers, 'set-cookie') AS respCookieLen, + getExpAge(STRING(payload.startedDateTime), response.headers) AS expAge, + response.content.mimeType, + payload._cdn_provider, + payload._gzip_save, + ext, + getFormat(type, STRING(response.content.mimeType), ext) AS format + )) AS summary, + parseHeaders(request.headers) AS request_headers, + parseHeaders(response.headers) AS response_headers, + IF(requests.type = 'image', NULL, response_bodies.response_body) AS response_body +FROM ( + FROM \`requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client}\` ${constants.devTABLESAMPLE} + |> SET payload = SAFE.PARSE_JSON(payload, wide_number_mode => 
'round') + |> EXTEND getExtFromURL(url) AS ext + |> EXTEND prettyType(STRING(payload.response.content.mimeType), ext) AS type + |> EXTEND INT64(payload._index) AS index + |> EXTEND payload.request AS request + |> EXTEND payload.response AS response + |> SET payload = JSON_REMOVE( + payload, + '$._headers', + '$.request.headers', + '$.response.headers' + ) +) AS requests + +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} + WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} +) AS crux +ON requests.page = crux.page + +LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS summary_pages ${constants.devTABLESAMPLE} +ON requests.page = summary_pages.url + +LEFT JOIN ( + SELECT + page, + url, + ANY_VALUE(${getResponseBodiesColumnName(iteration.date)}) AS response_body + FROM response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} + GROUP BY page, url +) AS response_bodies ${constants.devTABLESAMPLE} +ON requests.page = response_bodies.page + AND requests.url = response_bodies.url; + `) +}) diff --git a/definitions/output/crawl/backfill_summary_pages.js b/definitions/output/crawl/backfill_summary_pages.js new file mode 100644 index 0000000..56c8aa0 --- /dev/null +++ b/definitions/output/crawl/backfill_summary_pages.js @@ -0,0 +1,204 @@ +const iterations = [] +const clients = constants.clients + +let midMonth +for ( + let date = '2015-12-01'; + date >= '2015-12-01'; + date = constants.fnPastMonth(date) +) { + clients.forEach((client) => { + iterations.push({ + date, + client + }) + }) + + midMonth = new Date(date) + midMonth.setUTCDate(15) // UTC setter: the date-only string parses as UTC midnight and toISOString() reads UTC, so a local-time setDate() could shift the day + + clients.forEach((client) => { + iterations.push({ + date: midMonth.toISOString().substring(0, 10), + client + }) + }) +} + +function summaryObject (date) { + let list = '' + if (date >= '2010-11-15') { + list += ` +
fullyLoaded, + bytesCSS, + bytesFlash, + bytesFont, + bytesGif, + bytesHtml, + bytesHtmlDoc, + bytesImg, + bytesJpg, + bytesJS, + bytesJson, + bytesOther, + bytesPng, + bytesTotal, + cdn, + gzipSavings, + gzipTotal, + maxage0, + maxage1, + maxage30, + maxage365, + maxageMore, + maxageNull, + maxDomainReqs, + numCompressed, + numDomains, + numDomElements, + numErrors, + numGlibs, + numHttps, + numRedirects, + onContentLoaded, + onLoad, + renderStart, + reqCSS, + reqFlash, + reqFont, + reqGif, + reqHtml, + reqImg, + reqJpg, + reqJS, + reqJson, + reqOther, + reqPng, + reqTotal, + SpeedIndex, + TTFB, + visualComplete` + } + if (date >= '2014-05-15') { + list += `, + _connections` + } + if (date >= '2015-05-01') { + list += `, + bytesAudio, + bytesSvg, + bytesText, + bytesVideo, + bytesWebp, + bytesXml, + reqAudio, + reqSvg, + reqText, + reqVideo, + reqWebp, + reqXml` + } + return list +} + +function customMetrics (date) { + let list = '' + if (date >= '2014-06-01' && date !== '2014-05-15') { + list += ` + avg_dom_depth, + doctype, + document_height, + document_width, + localstorage_size, + meta_viewport, + num_iframes, + num_scripts, + sessionstorage_size` + } + if (date >= '2015-11-01') { + list += `, + num_scripts_async, + num_scripts_sync` + } + return list +} + +iterations.forEach((iteration, i) => { + operate(`backfill_summary_pages ${iteration.date} ${iteration.client}`).tags([ + 'pages_backfill' + ]).dependencies([ + i === 0 ? 
'backfill' : `backfill_summary_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` + ]).queries(ctx => ` +DELETE FROM crawl.pages +WHERE date = '${iteration.date}' + AND client = '${iteration.client}'; + +INSERT INTO crawl.pages +SELECT + DATE('${iteration.date}') AS date, + '${iteration.client}' AS client, + pages.url AS page, + TRUE AS is_root_page, + pages.url AS root_page, + CASE + WHEN rank<=1000 THEN 1000 + WHEN rank<=5000 THEN 5000 + ELSE NULL + END AS rank, + wptid, + NULL AS payload, + TO_JSON( STRUCT( + ${summaryObject(iteration.date)} + )) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + TO_JSON( STRUCT( + ${customMetrics(iteration.date)} + )) + ) AS custom_metrics, + NULL AS lighthouse, + NULL AS features, + NULL AS technologies, + NULL AS metadata +FROM summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.devTABLESAMPLE}; + `) +}) diff --git a/definitions/output/crawl/backfill_summary_requests.js b/definitions/output/crawl/backfill_summary_requests.js new file mode 100644 index 0000000..ca84286 --- /dev/null +++ b/definitions/output/crawl/backfill_summary_requests.js @@ -0,0 +1,242 @@ +const iterations = [] +const clients = constants.clients + +let midMonth +for ( + let date = '2015-12-01'; + date >= '2015-12-01'; + date = constants.fnPastMonth(date) +) { + clients.forEach((client) => { + iterations.push({ + date, + client + }) + }) + + midMonth = new Date(date) + midMonth.setUTCDate(15) // UTC setter: the date-only string parses as UTC midnight and toISOString() reads UTC, so a local-time setDate() could shift the day + +
clients.forEach((client) => { + iterations.push({ + date: midMonth.toISOString().substring(0, 10), + client + }) + }) +} + +function summaryObject (date) { + let list = '' + if (date >= '2010-11-15') { + list += ` + expAge, + method, + mimeType, + redirectUrl, + reqBodySize, + reqCookieLen, + reqHeadersSize, + respBodySize, + respCookieLen, + respHeadersSize, + respHttpVersion, + respSize, + status, + time` + } + if (date >= '2014-05-15') { + list += `, + _cdn_provider` + } + if (date >= '2014-05-01') { + list += `, + _gzip_save` + } + if (date >= '2015-05-01') { + list += `, + format` + } + return list +} + +iterations.forEach((iteration, i) => { + operate(`backfill_summary_requests ${iteration.date} ${iteration.client}`).tags([ + 'requests_backfill' + ]).dependencies([ + i === 0 ? 'backfill' : `backfill_summary_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` + ]).queries(ctx => ` +DELETE FROM crawl.requests +WHERE date = '${iteration.date}' AND client = '${iteration.client}'; + +CREATE TEMP FUNCTION get_ext_from_url(url STRING) +RETURNS STRING +LANGUAGE js +AS """ + try { + let ret_ext = url; + + // Remove query parameters + const i_q = ret_ext.indexOf("?"); + if (i_q > -1) { + ret_ext = ret_ext.substring(0, i_q); + } + + // Get the last segment of the path after the last "/" + ret_ext = ret_ext.substring(ret_ext.lastIndexOf("/") + 1); + + // Find the position of the last dot + const i_dot = ret_ext.lastIndexOf("."); + + if (i_dot === -1) { + // No dot means no extension + ret_ext = ""; + } else { + // Extract the extension + ret_ext = ret_ext.substring(i_dot + 1); + + // Weed out overly long extensions + if (ret_ext.length > 5) { + ret_ext = ""; + } + } + + return ret_ext.toLowerCase(); + } catch (e) { + return ""; // Return an empty string in case of any errors + } +"""; + +CREATE TEMP FUNCTION get_type(mime_typ STRING, ext STRING) +RETURNS STRING +LANGUAGE js +AS """ + try { + mime_typ = mime_typ.toLowerCase(); + + // Order by most unique types 
first + const uniqueTypes = ["font", "css", "image", "script", "video", "audio", "xml"]; + for (let typ of uniqueTypes) { + if (mime_typ.includes(typ)) { + return typ; + } + } + + // Special cases + if (mime_typ.includes("json") || ["js", "json"].includes(ext)) { + return "script"; + } else if (["eot", "ttf", "woff", "woff2", "otf"].includes(ext)) { + return "font"; + } else if ( + ["png", "gif", "jpg", "jpeg", "webp", "ico", "svg", "avif", "jxl", "heic", "heif"].includes(ext) + ) { + return "image"; + } else if (ext === "css") { + return "css"; + } else if (ext === "xml") { + return "xml"; + } else if ( + ["mp4", "webm", "ts", "m4v", "m4s", "mov", "ogv", "swf", "f4v", "flv"].includes(ext) || + ["flash", "webm", "mp4", "flv"].some(typ => mime_typ.includes(typ)) + ) { + return "video"; + } else if (mime_typ.includes("wasm") || ext === "wasm") { + return "wasm"; + } else if (mime_typ.includes("html") || ["html", "htm"].includes(ext)) { + return "html"; + } else if (mime_typ.includes("text")) { + // Put "text" last because it is often misused, so extension should take precedence. 
+ return "text"; + } else { + return "other"; + } + } catch (e) { + return "other"; // Return "other" if there's any error + } +"""; + +CREATE TEMP FUNCTION parse_headers(headers STRING) +RETURNS ARRAY> +LANGUAGE js +AS """ + try { + const parsedHeaders = headers.split(', ').map(header => { + const [name, value] = header.split(' = ') + if (name && value) { + return { name: name.trim(), value: value.trim() } + } + }) + return parsedHeaders.filter(Boolean) // Drop entries that did not parse into a name/value pair; filter(Object) kept them because Object(undefined) is a truthy {} + } catch (e) { + return [] // NULL or malformed input yields no headers instead of leaking an Error object into the array result + } +"""; + +INSERT INTO crawl.requests +SELECT + DATE('${iteration.date}') AS date, + '${iteration.client}' AS client, + pages.url AS page, + TRUE AS is_root_page, + pages.url AS root_page, + pages.rank AS rank, + requests.url AS url, + requests.firstHTML AS is_main_document, + get_type(requests.mimeType, requests.ext_from_url) AS type, + IF(requests.firstReq, 1, NULL) AS index, + NULL AS payload, + TO_JSON( STRUCT( + ext_from_url AS ext, + ${summaryObject(iteration.date)} + )) AS summary, + ARRAY_CONCAT( + ARRAY>[ + ('Accept', requests.req_accept), + ("Accept-Charset", requests.req_accept_charset), + ("Accept-Encoding", requests.req_accept_encoding), + ("Accept-Language", requests.req_accept_language), + ("Connection", requests.req_connection), + ("Host", requests.req_host), + ("If-Modified-Since", requests.req_if_modified_since), + ("If-None-Match", requests.req_if_none_match), + ("Referer", requests.req_referer), + ("User-Agent", requests.req_user_agent) + ], + parse_headers(requests.reqOtherHeaders) + ) AS request_headers, + ARRAY_CONCAT( + ARRAY>[ + ("Accept-Ranges", requests.resp_accept_ranges), + ("Age", requests.resp_age), + ("Cache-Control", requests.resp_cache_control), + ("Connection", requests.resp_connection), + ("Content-Encoding", requests.resp_content_encoding), + ("Content-Language", requests.resp_content_language), + ("Content-Length", requests.resp_content_length), + ("Content-Location", requests.resp_content_location), + ("Content-Type", requests.resp_content_type), +
("Date", requests.resp_date), + ("ETag", requests.resp_etag), + ("Expires", requests.resp_expires), + ("Keep-Alive", requests.resp_keep_alive), + ("Last-Modified", requests.resp_last_modified), + ("Location", requests.resp_location), + ("Pragma", requests.resp_pragma), + ("Server", requests.resp_server), + ("Transfer-Encoding", requests.resp_transfer_encoding), + ("Vary", requests.resp_vary), + ("Via", requests.resp_via), + ("X-Powered-By", requests.resp_x_powered_by) + ], + parse_headers(requests.respOtherHeaders) + ) AS response_headers, + NULL AS response_body +FROM ( + SELECT + *, + get_ext_from_url(url) AS ext_from_url + FROM summary_requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} +) AS requests +LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.devTABLESAMPLE} +ON requests.pageid = pages.pageid; + `) +}) diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js new file mode 100644 index 0000000..85cd6bd --- /dev/null +++ b/definitions/output/crawl/pages.js @@ -0,0 +1,468 @@ +publish('pages', { + type: 'incremental', + protected: true, + schema: 'crawl', + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'is_root_page', 'rank', 'page'], + requirePartitionFilter: true + }, + tags: ['crawl_complete'] +}).preOps(ctx => ` +CREATE SCHEMA IF NOT EXISTS crawl; + +CREATE TABLE IF NOT EXISTS ${ctx.self()} +( + date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), + client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), + page STRING NOT NULL OPTIONS(description='The URL of the page being tested'), + is_root_page BOOL NOT NULL OPTIONS(description='Whether the page is the root of the origin'), + root_page STRING NOT NULL OPTIONS(description='The URL of the root page being tested, the origin followed by /'), + rank INT64 OPTIONS(description='Site 
popularity rank, from CrUX'), + wptid STRING OPTIONS(description='ID of the WebPageTest results'), + payload JSON OPTIONS(description='JSON-encoded WebPageTest results for the page'), + summary JSON OPTIONS(description='JSON-encoded summarization of the page-level data'), + custom_metrics STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + > OPTIONS(description='Custom metrics from WebPageTest'), + lighthouse JSON OPTIONS(description='JSON-encoded Lighthouse report'), + features ARRAY> OPTIONS(description='Blink features detected at runtime (see https://chromestatus.com/features)'), + technologies ARRAY OPTIONS(description='List of categories to which this technology belongs'), + info ARRAY OPTIONS(description='Additional metadata about the detected technology, ie version number') + >> OPTIONS(description='Technologies detected at runtime (see https://www.wappalyzer.com/)'), + metadata JSON OPTIONS(description='Additional metadata about the test') +) +PARTITION BY date +CLUSTER BY client, is_root_page, rank, page +OPTIONS( + require_partition_filter=true +); + +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' AND + client = 'desktop'; +`).query(ctx => ` +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + wptid, + JSON_REMOVE( + payload, + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + '$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', + '$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + 
'$._document_width', + '$._Dpi', + '$._ecommerce', + '$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + '$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + JSON_SET( + JSON_REMOVE( + summary, + '$._adult_site', + '$.archive', + '$.avg_dom_depth', + '$.crawlid', + '$.createDate', + '$.doctype', + '$.document_height', + '$.document_width', + '$.label', + '$.localstorage_size', + '$.meta_viewport', + '$.metadata', + '$.num_iframes', + '$.num_scripts_async', + '$.num_scripts_sync', + '$.num_scripts', + '$.pageid', + '$.PageSpeed', + '$.rank', + '$.sessionstorage_size', + '$.startedDateTime', + '$.url', + '$.urlhash', + '$.urlShort', + '$.usertiming', + '$.wptid', + '$.wptrun' + ), + '$.crux', + payload._CrUX + ) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + custom_metrics.a11y, + custom_metrics.cms, + custom_metrics.cookies, + 
custom_metrics["css-variables"], + custom_metrics.ecommerce, + custom_metrics.element_count, + custom_metrics.javascript, + custom_metrics.markup, + custom_metrics.media, + custom_metrics["origin-trials"], + custom_metrics.performance, + custom_metrics.privacy, + custom_metrics.responsive_images, + custom_metrics.robots_txt, + custom_metrics.security, + custom_metrics["structured-data"], + custom_metrics["third-parties"], + custom_metrics["well-known"], + custom_metrics.wpt_bodies, + JSON_REMOVE( + custom_metrics, + '$.a11y', + '$.cms', + '$.cookies', + '$.css-variables', + '$.ecommerce', + '$.element_count', + '$.javascript', + '$.markup', + '$.media', + '$.origin-trials', + '$.performance', + '$.privacy', + '$.responsive_images', + '$.robots_txt', + '$.security', + '$.structured-data', + '$.third-parties', + '$.well-known', + '$.wpt_bodies' + ) + ) AS custom_metrics, + lighthouse, + features, + technologies, + JSON_REMOVE( + metadata, + '$.page_id', + '$.parent_page_id', + '$.root_page_id' + ) AS metadata +FROM ( + SELECT + * EXCEPT (custom_metrics, lighthouse, metadata, payload, summary), + SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round') AS custom_metrics, + SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata, + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM ${ctx.ref('crawl_staging', 'pages')} + WHERE date = '${constants.currentMonth}' AND + client = 'desktop' + ${constants.devRankFilter} +) +`).postOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' AND + client = 'mobile'; + +INSERT INTO ${ctx.self()} +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + wptid, + JSON_REMOVE( + payload, + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + 
'$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', + '$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + '$._document_width', + '$._Dpi', + '$._ecommerce', + '$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + '$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + JSON_SET( + JSON_REMOVE( + summary, + '$._adult_site', + '$.archive', + '$.avg_dom_depth', + '$.crawlid', + '$.createDate', + '$.doctype', + '$.document_height', + '$.document_width', + '$.label', + '$.localstorage_size', + '$.meta_viewport', + '$.metadata', + '$.num_iframes', + '$.num_scripts_async', + '$.num_scripts_sync', + '$.num_scripts', + '$.pageid', + '$.PageSpeed', + '$.rank', + '$.sessionstorage_size', + '$.startedDateTime', + '$.url', + '$.urlhash', + '$.urlShort', + '$.usertiming', + '$.wptid', + '$.wptrun' + ), + '$.crux', + payload._CrUX + ) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + 
robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + custom_metrics.a11y, + custom_metrics.cms, + custom_metrics.cookies, + custom_metrics["css-variables"], + custom_metrics.ecommerce, + custom_metrics.element_count, + custom_metrics.javascript, + custom_metrics.markup, + custom_metrics.media, + custom_metrics["origin-trials"], + custom_metrics.performance, + custom_metrics.privacy, + custom_metrics.responsive_images, + custom_metrics.robots_txt, + custom_metrics.security, + custom_metrics["structured-data"], + custom_metrics["third-parties"], + custom_metrics["well-known"], + custom_metrics.wpt_bodies, + JSON_REMOVE( + custom_metrics, + '$.a11y', + '$.cms', + '$.cookies', + '$.css-variables', + '$.ecommerce', + '$.element_count', + '$.javascript', + '$.markup', + '$.media', + '$.origin-trials', + '$.performance', + '$.privacy', + '$.responsive_images', + '$.robots_txt', + '$.security', + '$.structured-data', + '$.third-parties', + '$.well-known', + '$.wpt_bodies' + ) + ) AS custom_metrics, + lighthouse, + features, + technologies, + JSON_REMOVE( + metadata, + '$.page_id', + '$.parent_page_id', + '$.root_page_id' + ) AS metadata +FROM ( + SELECT + * EXCEPT (custom_metrics, lighthouse, metadata, payload, summary), + SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round') AS custom_metrics, + SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata, + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM ${ctx.ref('crawl_staging', 'pages')} + WHERE date = '${constants.currentMonth}' AND + client = 'mobile' + ${constants.devRankFilter} +) +`) diff --git a/definitions/output/crawl/parsed_css.js b/definitions/output/crawl/parsed_css.js new file mode 100644 index 0000000..529bbe0 --- /dev/null +++ 
b/definitions/output/crawl/parsed_css.js @@ -0,0 +1,32 @@ +publish('parsed_css', { + type: 'incremental', + protected: true, + schema: 'crawl', + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'is_root_page', 'rank', 'page'], + requirePartitionFilter: true + }, + tags: ['crawl_complete'] +}).preOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' + AND client = 'desktop'; +`).query(ctx => ` +SELECT * +FROM ${ctx.ref('crawl_staging', 'parsed_css')} +WHERE date = '${constants.currentMonth}' + AND client = 'desktop' + ${constants.devRankFilter} +`).postOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' + AND client = 'mobile'; + +INSERT INTO ${ctx.self()} +SELECT * +FROM ${ctx.ref('crawl_staging', 'parsed_css')} +WHERE date = '${constants.currentMonth}' + AND client = 'mobile' + ${constants.devRankFilter}; +`) diff --git a/definitions/output/crawl/reprocess_pages.js b/definitions/output/crawl/reprocess_pages.js new file mode 100644 index 0000000..91d8f14 --- /dev/null +++ b/definitions/output/crawl/reprocess_pages.js @@ -0,0 +1,223 @@ +operate('reprocess') + +const iterations = [] +const clients = constants.clients + +for ( + let month = '2022-03-01'; month >= '2022-03-01'; month = constants.fnPastMonth(month)) { + clients.forEach((client) => { + iterations.push({ + month, + client + }) + }) +} + +iterations.forEach((iteration, i) => { + operate(`reprocess_pages ${iteration.month} ${iteration.client}`).tags([ + 'reprocess_pages' + ]).dependencies([ + i === 0 ? 
'reprocess' : `reprocess_pages ${iterations[i - 1].month} ${iterations[i - 1].client}` + ]).queries(ctx => ` +DELETE FROM crawl.pages +WHERE date = '${iteration.month}' AND + client = '${iteration.client}'; + +INSERT INTO crawl.pages +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + wptid, + JSON_REMOVE( + payload, + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + '$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', + '$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + '$._document_width', + '$._Dpi', + '$._ecommerce', + '$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + '$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + JSON_SET( + JSON_REMOVE( + summary, + '$._adult_site', + '$.archive', + '$.avg_dom_depth', + '$.crawlid', + '$.createDate', + '$.doctype', + '$.document_height', + '$.document_width', + '$.label', + '$.localstorage_size', + '$.meta_viewport', + '$.metadata', + '$.num_iframes', + '$.num_scripts_async', + '$.num_scripts_sync', + '$.num_scripts', + '$.pageid', + 
'$.PageSpeed', + '$.rank', + '$.sessionstorage_size', + '$.startedDateTime', + '$.url', + '$.urlhash', + '$.urlShort', + '$.usertiming', + '$.wptid', + '$.wptrun' + ), + '$.crux', + payload._CrUX + ) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + custom_metrics.a11y, + custom_metrics.cms, + custom_metrics.cookies, + custom_metrics["css-variables"], + custom_metrics.ecommerce, + custom_metrics.element_count, + custom_metrics.javascript, + custom_metrics.markup, + custom_metrics.media, + custom_metrics["origin-trials"], + custom_metrics.performance, + custom_metrics.privacy, + custom_metrics.responsive_images, + custom_metrics.robots_txt, + custom_metrics.security, + custom_metrics["structured-data"], + custom_metrics["third-parties"], + custom_metrics["well-known"], + custom_metrics.wpt_bodies, + JSON_REMOVE( + custom_metrics, + '$.a11y', + '$.cms', + '$.cookies', + '$.css-variables', + '$.ecommerce', + '$.element_count', + '$.javascript', + '$.markup', + '$.media', + '$.origin-trials', + '$.performance', + '$.privacy', + '$.responsive_images', + '$.robots_txt', + '$.security', + '$.structured-data', + '$.third-parties', + '$.well-known', + '$.wpt_bodies' + ) + ) AS custom_metrics, + lighthouse, + features, + technologies, + JSON_REMOVE( + metadata, + '$.page_id', + '$.parent_page_id', + '$.root_page_id' + ) AS metadata +FROM ( + SELECT + * EXCEPT (custom_metrics, lighthouse, metadata, payload, summary), + SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round') AS custom_metrics, + SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS 
metadata, + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM \`all.pages\` + WHERE date = '${iteration.month}' AND + client = '${iteration.client}' ${constants.devRankFilter} +); + `) +}) diff --git a/definitions/output/crawl/reprocess_requests.js b/definitions/output/crawl/reprocess_requests.js new file mode 100644 index 0000000..6e20494 --- /dev/null +++ b/definitions/output/crawl/reprocess_requests.js @@ -0,0 +1,99 @@ +const iterations = [] + +for ( + let month = '2022-03-01'; month >= '2022-03-01'; month = constants.fnPastMonth(month)) { + constants.clients.forEach((client) => { + constants.booleans.forEach((isRootPage) => { + iterations.push({ + month, + client, + isRootPage + }) + }) + }) +} + +iterations.forEach((iteration, i) => { + operate(`reprocess_requests ${iteration.month} ${iteration.client} ${iteration.isRootPage}`).tags( + ['reprocess_requests'] + ).dependencies([ + i === 0 ? 'reprocess' : `reprocess_requests ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` + ]).queries(ctx => ` +DELETE FROM crawl.requests +WHERE date = '${iteration.month}' + AND client = '${iteration.client}' + AND is_root_page = ${iteration.isRootPage}; + +CREATE TEMP FUNCTION pruneHeaders( + jsonObject JSON +) RETURNS JSON +LANGUAGE js AS ''' +try { + for (const [key, value] of Object.entries(jsonObject)) { + if(key.startsWith('req_') || key.startsWith('resp_')) { + delete jsonObject[key]; + } + } + return jsonObject; +} catch (e) { + return jsonObject; +} +'''; + +INSERT INTO crawl.requests +SELECT + date, + client, + requests.page, + is_root_page, + root_page, + crux.rank, + url, + is_main_document, + type, + index, + JSON_REMOVE( + payload, + '$._headers', + '$.request.headers', + '$.response.headers' + ) AS payload, + pruneHeaders( + JSON_REMOVE( + summary, + '$.crawlid', + '$.firstHtml', + '$.firstReq', + '$.pageid', + '$.reqOtherHeaders', + 
'$.requestid', + '$.respOtherHeaders', + '$.startedDateTime', + '$.type', + '$.url', + '$.urlShort' + ) + ) as summary, + request_headers, + response_headers, + response_body +FROM ( + SELECT + * EXCEPT (payload, summary), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM \`all.requests\` ${constants.devTABLESAMPLE} + WHERE date = '${iteration.month}' + AND client = '${iteration.client}' + AND is_root_page = ${iteration.isRootPage} +) AS requests +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} + WHERE yyyymm = ${constants.fnPastMonth(iteration.month).substring(0, 7).replace('-', '')} +) AS crux +ON requests.root_page = crux.page; + `) +}) diff --git a/definitions/output/crawl/requests.js b/definitions/output/crawl/requests.js new file mode 100644 index 0000000..5ca4922 --- /dev/null +++ b/definitions/output/crawl/requests.js @@ -0,0 +1,178 @@ +publish('requests', { + type: 'incremental', + protected: true, + schema: 'crawl', + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'is_root_page', 'type', 'rank'], + requirePartitionFilter: true + }, + tags: ['crawl_complete'] +}).preOps(ctx => ` +CREATE SCHEMA IF NOT EXISTS crawl; + +CREATE TABLE IF NOT EXISTS ${ctx.self()} +( + date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), + client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), + page STRING NOT NULL OPTIONS(description='The URL of the page being tested'), + is_root_page BOOL OPTIONS(description='Whether the page is the root of the origin.'), + root_page STRING NOT NULL OPTIONS(description='The URL of the root page being tested'), + rank INT64 OPTIONS(description='Site popularity rank, from CrUX'), + url STRING NOT NULL OPTIONS(description='The URL of the request'), + 
is_main_document BOOL NOT NULL OPTIONS(description='Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects'), + type STRING OPTIONS(description='Simplified description of the type of resource (script, html, css, text, other, etc)'), + index INT64 OPTIONS(description='The sequential 0-based index of the request'), + payload JSON OPTIONS(description='JSON-encoded WebPageTest result data for this request'), + summary JSON OPTIONS(description='JSON-encoded summarization of request data'), + request_headers ARRAY<STRUCT<name STRING, value STRING>> OPTIONS(description='Request headers'), + response_headers ARRAY<STRUCT<name STRING, value STRING>> OPTIONS(description='Response headers'), + response_body STRING OPTIONS(description='Text-based response body') +) +PARTITION BY date +CLUSTER BY client, is_root_page, type, rank +OPTIONS( + require_partition_filter=true +); + +CREATE TEMP FUNCTION pruneHeaders( + jsonObject JSON +) RETURNS JSON +LANGUAGE js AS ''' +try { + for (const [key, value] of Object.entries(jsonObject)) { + if(key.startsWith('req_') || key.startsWith('resp_')) { + delete jsonObject[key] + } + } + return jsonObject +} catch (e) { + return jsonObject +} +'''; + +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' AND + client = 'desktop'; +`).query(ctx => ` +SELECT + date, + client, + requests.page, + is_root_page, + root_page, + crux.rank, + url, + is_main_document, + type, + index, + JSON_REMOVE( + payload, + '$._headers', + '$.request.headers', + '$.response.headers' + ) AS payload, + pruneHeaders( + JSON_REMOVE( + summary, + '$.crawlid', + '$.firstHtml', + '$.firstReq', + '$.pageid', + '$.reqOtherHeaders', + '$.requestid', + '$.respOtherHeaders', + '$.startedDateTime', + '$.type', + '$.url', + '$.urlShort' + ) + ) as summary, + request_headers, + response_headers, + response_body +FROM ( + SELECT + * EXCEPT (payload, summary), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode 
=> 'round') AS summary + FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} + WHERE date = '${constants.currentMonth}' + AND client = 'desktop' +) AS requests +-- Site popularity rank comes from CrUX, matched on the root page origin (same join as reprocess_requests.js) +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} + WHERE yyyymm = ${constants.fnPastMonth(constants.currentMonth).substring(0, 7).replace('-', '')} +) AS crux +ON requests.root_page = crux.page +`).postOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' AND + client = 'mobile'; + +INSERT INTO ${ctx.self()} +SELECT + date, + client, + requests.page, + is_root_page, + root_page, + crux.rank, + url, + is_main_document, + type, + index, + JSON_REMOVE( + payload, + '$._headers', + '$.request.headers', + '$.response.headers' + ) AS payload, + pruneHeaders( + JSON_REMOVE( + summary, + '$.crawlid', + '$.firstHtml', + '$.firstReq', + '$.pageid', + '$.reqOtherHeaders', + '$.requestid', + '$.respOtherHeaders', + '$.startedDateTime', + '$.type', + '$.url', + '$.urlShort' + ) + ) as summary, + request_headers, + response_headers, + response_body +FROM ( + SELECT + * EXCEPT (payload, summary), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} + WHERE date = '${constants.currentMonth}' + AND client = 'mobile' +) AS requests +-- Site popularity rank comes from CrUX, matched on the root page origin (same join as reprocess_requests.js) +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} + WHERE yyyymm = ${constants.fnPastMonth(constants.currentMonth).substring(0, 7).replace('-', '')} +) AS crux +ON requests.root_page = crux.page +`) diff --git a/definitions/output/sample_data/pages_10k.js b/definitions/output/sample_data/pages_10k.js index 875baa6..eb6e4c2 100644 --- a/definitions/output/sample_data/pages_10k.js +++ b/definitions/output/sample_data/pages_10k.js @@ -5,12 +5,12 @@ publish('pages_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'rank'] }, - tags: ['crawl_results_all'] + tags: ['crawl_complete'] }).preOps(ctx => ` DROP TABLE IF EXISTS ${ctx.self()}; `).query(ctx => ` SELECT * -FROM ${ctx.ref('all', 'pages')} +FROM ${ctx.ref('crawl', 'pages')} WHERE date = '${constants.currentMonth}' AND rank <= 10000 `) diff --git a/definitions/output/sample_data/parsed_css_10k.js b/definitions/output/sample_data/parsed_css_10k.js index fd08f07..b22feba 100644 --- a/definitions/output/sample_data/parsed_css_10k.js 
+++ b/definitions/output/sample_data/parsed_css_10k.js @@ -5,12 +5,12 @@ publish('parsed_css_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'rank', 'page'] }, - tags: ['crawl_results_all'] + tags: ['crawl_complete'] }).preOps(ctx => ` DROP TABLE IF EXISTS ${ctx.self()}; `).query(ctx => ` SELECT * -FROM ${ctx.ref('all', 'parsed_css')} +FROM ${ctx.ref('crawl', 'parsed_css')} WHERE date = '${constants.currentMonth}' AND rank <= 10000 `) diff --git a/definitions/output/sample_data/requests_10k.js b/definitions/output/sample_data/requests_10k.js index dc09cc1..08e8e14 100644 --- a/definitions/output/sample_data/requests_10k.js +++ b/definitions/output/sample_data/requests_10k.js @@ -5,16 +5,12 @@ publish('requests_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'is_main_document', 'type'] }, - tags: ['crawl_results_all'] + tags: ['crawl_complete'] }).preOps(ctx => ` DROP TABLE IF EXISTS ${ctx.self()}; `).query(ctx => ` SELECT * -FROM ${ctx.ref('all', 'requests')} +FROM ${ctx.ref('crawl', 'requests')} WHERE date = '${constants.currentMonth}' AND - -- rank <= 10000 -- TODO: use rank filtering when https://github.com/HTTPArchive/dataform/pull/5 is complete - page IN ( - SELECT page - FROM ${ctx.ref('sample_data', 'pages_10k')} - ) + rank <= 10000 `) diff --git a/definitions/sources/chrome-ux-report.js b/definitions/sources/chrome-ux-report.js index afe7abd..ff98f44 100644 --- a/definitions/sources/chrome-ux-report.js +++ b/definitions/sources/chrome-ux-report.js @@ -14,7 +14,7 @@ FROM ${ctx.ref(database, 'materialized', 'country_summary')} |> WHERE yyyymm = ${pastMonthYYYYMM} |> AGGREGATE COUNT(DISTINCT country_code) AS cnt_countries |> WHERE cnt_countries != 238 -|> SELECT 'Table data doesn't match 238 countries' AS error_message; +|> SELECT "Table data doesn't match 238 countries" AS error_message `) declare({ @@ -28,7 +28,7 @@ FROM ${ctx.ref(database, 'materialized', 'device_summary')} |> WHERE date = ''${pastMonth}'' |> AGGREGATE 
COUNT(DISTINCT device) AS cnt_devices, COUNT(DISTINCT rank) AS cnt_ranks |> WHERE cnt_devices != 3 OR cnt_ranks != 10 -|> SELECT 'Table data doesn't match 3 unique devices and 10 ranks' AS error_message; +|> SELECT "Table data doesn't match 3 unique devices and 10 ranks" AS error_message `) declare({ diff --git a/package.json b/package.json index fa18d74..58fccdb 100644 --- a/package.json +++ b/package.json @@ -4,11 +4,9 @@ "@dataform/core": "3.0.7" }, "scripts": { - "start": "dataform run", - "compile": "dataform compile", - "test": "dataform test", "format": "npx standard --fix; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint . --fix", - "lint": "npx standard; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint ." + "lint": "npx standard; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint .; dataform compile", + "test": "dataform test" }, "standard": { "globals": [ diff --git a/src/index.js b/src/index.js index 5222aa9..c7115f5 100644 --- a/src/index.js +++ b/src/index.js @@ -38,7 +38,7 @@ FROM crux, report; actionArgs: { repoName: 'crawl-data', tags: [ - 'crawl_results_all', + 'crawl_complete', 'blink_features_report', 'crawl_results_legacy' ]