diff --git a/definitions/output/blink_features/features.js b/definitions/output/blink_features/features.js deleted file mode 100644 index d609fbd..0000000 --- a/definitions/output/blink_features/features.js +++ /dev/null @@ -1,37 +0,0 @@ -publish('features', { - schema: 'blink_features', - type: 'incremental', - protected: true, - bigquery: { - partitionBy: 'yyyymmdd', - clusterBy: ['client', 'rank'] - }, - tags: ['crawl_complete'] -}).preOps(ctx => ` -DELETE FROM ${ctx.self()} -WHERE yyyymmdd = DATE '${constants.currentMonth}'; -`).query(ctx => ` -SELECT - date AS yyyymmdd, - client, - url, - feature.feature AS feature, - feature.type, - feature.id, - rank -FROM ( - SELECT - date, - client, - page AS url, - payload, - rank, - feature - FROM ${ctx.ref('crawl', 'pages')}, - UNNEST(features) AS feature - WHERE - date = '${constants.currentMonth}' AND - is_root_page = TRUE - ${constants.devRankFilter} -) -`) diff --git a/definitions/output/blink_features/usage.js b/definitions/output/blink_features/usage.js index 2e483ab..472896c 100644 --- a/definitions/output/blink_features/usage.js +++ b/definitions/output/blink_features/usage.js @@ -2,14 +2,36 @@ publish('usage', { schema: 'blink_features', type: 'incremental', protected: true, + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'rank', 'feature'], + requirePartitionFilter: true + }, tags: ['crawl_complete'] }).preOps(ctx => ` DELETE FROM ${ctx.self()} -WHERE yyyymmdd = REPLACE('${constants.currentMonth}', '-', ''); +WHERE date = '${constants.currentMonth}'; `).query(ctx => ` +WITH pages AS ( SELECT - REPLACE(CAST(date AS STRING), '-', '') AS yyyymmdd, + date, client, + rank, + page, + features +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${constants.currentMonth}' AND + is_root_page = TRUE + ${constants.devRankFilter} +), ranks AS ( + SELECT DISTINCT rank FROM pages +) + +SELECT + date, + client, + rank, id, feature, type, @@ -19,20 +41,22 @@ SELECT sample_urls FROM ( SELECT - yyyymmdd AS date, + date, client, - id, - feature, - type, - COUNT(DISTINCT url) AS num_urls, - ARRAY_AGG(url ORDER BY rank, url LIMIT 100) AS sample_urls - FROM ${ctx.ref('blink_features', 'features')} - WHERE - yyyymmdd = '${constants.currentMonth}' - ${constants.devRankFilter} + ranks.rank, + feature.id, + feature.feature, + feature.type, + COUNT(DISTINCT page) AS num_urls, + ARRAY_AGG(page ORDER BY pages.rank, page LIMIT 100) AS sample_urls + FROM pages + CROSS JOIN UNNEST(features) AS feature + FULL OUTER JOIN ranks + ON pages.rank <= ranks.rank GROUP BY - yyyymmdd, + date, client, + ranks.rank, id, feature, type @@ -41,15 +65,15 @@ JOIN ( SELECT date, client, + ranks.rank, COUNT(DISTINCT page) AS total_urls - FROM ${ctx.ref('crawl', 'pages')} - WHERE - date = '${constants.currentMonth}' AND - is_root_page = TRUE - ${constants.devRankFilter} + FROM pages + FULL OUTER JOIN ranks + ON pages.rank <= ranks.rank GROUP BY date, - client + client, + ranks.rank ) -USING (date, client) +USING (date, client, rank) `)