From 79afa4d6dd4b0056897b2c7fde5b67f91f2639f9 Mon Sep 17 00:00:00 2001 From: Tristen Harr Date: Thu, 16 Jan 2025 00:00:57 -0600 Subject: [PATCH] add aggregates, fix LIKE, add BigDecimal scalar --- CHANGELOG.md | 30 ++ connector-definition/connector-metadata.yaml | 4 +- generate-config.ts | 4 +- package-lock.json | 4 +- package.json | 2 +- src/constants.ts | 517 ++++++++++++++++++- src/handlers/query.ts | 169 +++++- 7 files changed, 688 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8736b50..9d13840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,36 @@ # DuckDB Connector Changelog This changelog documents changes between release tags. +## [0.2.0] - 2025-01-15 +Implement Aggregates + +* Basic Aggregates + * star_count + * single_column + * single_column distinct + +* Numeric Aggregates: + * sum + * avg + * max + * min + * stddev + * stddev_samp + * stddev_pop + * variance + * var_samp + * var_pop + +* String Aggregates: + * group_concat + * group_concat_distinct + * group_concat_include_nulls + +* Previously, LIKE would automatically wrap the string with %% to do a full-text search, now we allow users to use % on their own so LIKE has full functionality. + +* Add BigDecimal scalar type for arbitrary precision decimals so aggregation functions like SUM don't overflow + + ## [0.1.8] - 2025-01-15 * Enable Aggregates diff --git a/connector-definition/connector-metadata.yaml b/connector-definition/connector-metadata.yaml index f9baf95..262d6f4 100644 --- a/connector-definition/connector-metadata.yaml +++ b/connector-definition/connector-metadata.yaml @@ -1,13 +1,13 @@ packagingDefinition: type: PrebuiltDockerImage - dockerImage: ghcr.io/hasura/ndc-duckdb:v0.1.8 + dockerImage: ghcr.io/hasura/ndc-duckdb:v0.2.0 supportedEnvironmentVariables: - name: DUCKDB_URL description: The url for the DuckDB database commands: update: type: Dockerized - dockerImage: ghcr.io/hasura/ndc-duckdb:v0.1.8 + dockerImage: ghcr.io/hasura/ndc-duckdb:v0.2.0 commandArgs: - update dockerComposeWatch: diff --git a/generate-config.ts b/generate-config.ts index 024904e..0c075cc 100644 --- a/generate-config.ts +++ b/generate-config.ts @@ -62,8 +62,8 @@ const determineType = (t: string): string => { case "VARCHAR": return "String"; default: - if (t.startsWith("DECIMAL")){ - return "Float"; + if (t.startsWith("DECIMAL") || t.startsWith("NUMERIC")){ + return "BigDecimal"; } throw new NotSupported("Unsupported type", {}); } diff --git a/package-lock.json b/package-lock.json index 392fe8f..9d36c25 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "duckdb-sdk", - "version": "0.1.8", + "version": "0.2.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "duckdb-sdk", - "version": "0.1.8", + "version": "0.2.0", "dependencies": { "@hasura/ndc-sdk-typescript": "^6.0.0", "duckdb": "^1.0.0", diff --git a/package.json b/package.json index 6c61fa1..d0d1171 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "duckdb-sdk", - "version": "0.1.8", + "version": "0.2.0", "description": "", "main": "index.js", "scripts": { diff --git a/src/constants.ts b/src/constants.ts index 0cd2d60..3d88ccc 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -22,12 +22,66 @@ export const SCALAR_TYPES: { [key: string]: ScalarType } = { type: "biginteger" }, aggregate_functions: { - // sum: { - // result_type: { - // type: "named", - // name: "Int" - // } - // } + _sum: { + result_type: { + type: "named", + name: "BigInt" + } + }, + _avg: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _max: { + result_type: { + type: "named", + name: "BigInt" + } + }, + _min: { + result_type: { + type: "named", + name: "BigInt" + } + }, + _stddev: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _variance: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + } }, comparison_operators: { _eq: { @@ -74,7 +128,68 @@ export const SCALAR_TYPES: { [key: string]: ScalarType } = { representation: { type: "biginteger" }, - aggregate_functions: {}, + aggregate_functions: { + _sum: { + result_type: { + type: "named", + name: "UBigInt" + } + }, + _avg: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _max: { + result_type: { + type: "named", + name: "UBigInt" + } + }, + _min: { + result_type: { + type: "named", + name: "UBigInt" + } + }, + _stddev: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _variance: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + } + }, comparison_operators: { _eq: { type: "equal" }, _gt: { type: "custom", argument_type: { type: "named", name: "UBigInt" }}, @@ -88,7 +203,68 @@ export const SCALAR_TYPES: { [key: string]: ScalarType } = { representation: { type: "biginteger" }, - aggregate_functions: {}, + aggregate_functions: { + _sum: { + result_type: { + type: "named", + name: "HugeInt" + } + }, + _avg: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _max: { + result_type: { + type: "named", + name: "HugeInt" + } + }, + _min: { + result_type: { + type: "named", + name: "HugeInt" + } + }, + _stddev: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _variance: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + } + }, comparison_operators: { _eq: { type: "equal" }, _gt: { type: "custom", argument_type: { type: "named", name: "HugeInt" }}, @@ -102,7 +278,68 @@ export const SCALAR_TYPES: { [key: string]: ScalarType } = { representation: { type: "biginteger" }, - aggregate_functions: {}, + aggregate_functions: { + _sum: { + result_type: { + type: "named", + name: "UHugeInt" + } + }, + _avg: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _max: { + result_type: { + type: "named", + name: "UHugeInt" + } + }, + _min: { + result_type: { + type: "named", + name: "UHugeInt" + } + }, + _stddev: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _variance: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + } + }, comparison_operators: { _eq: { type: "equal" }, _gt: { type: "custom", argument_type: { type: "named", name: "UHugeInt" }}, @@ -209,12 +446,66 @@ export const SCALAR_TYPES: { [key: string]: ScalarType } = { type: "int64" }, aggregate_functions: { - // sum: { - // result_type: { - // type: "named", - // name: "Int" - // } - // } + _sum: { + result_type: { + type: "named", + name: "BigInt" + } + }, + _avg: { + result_type: { + type: "named", + name: "Float" + } + }, + _max: { + result_type: { + type: "named", + name: "Int" + } + }, + _min: { + result_type: { + type: "named", + name: "Int" + } + }, + _stddev: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _variance: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + } }, comparison_operators: { _eq: { @@ -262,12 +553,66 @@ export const SCALAR_TYPES: { [key: string]: ScalarType } = { type: "float64" }, aggregate_functions: { - // sum: { - // result_type: { - // type: "named", - // name: "Float" - // } - // } + _sum: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _avg: { + result_type: { + type: "named", + name: "Float" + } + }, + _max: { + result_type: { + type: "named", + name: "Float" + } + }, + _min: { + result_type: { + type: "named", + name: "Float" + } + }, + _stddev: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _variance: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + } }, comparison_operators: { _eq: { @@ -314,7 +659,26 @@ export const SCALAR_TYPES: { [key: string]: ScalarType } = { representation: { type: "string" }, - aggregate_functions: {}, + aggregate_functions: { + _group_concat: { + result_type: { + type: "named", + name: "String" + } + }, + _group_concat_distinct: { + result_type: { + type: "named", + name: "String" + } + }, + _group_concat_include_nulls: { + result_type: { + type: "named", + name: "String" + } + }, + }, comparison_operators: { _eq: { type: "equal" @@ -380,7 +744,114 @@ export const SCALAR_TYPES: { [key: string]: ScalarType } = { type: "equal" }, }, - } + }, + BigDecimal: { + representation: { + type: "bigdecimal" + }, + aggregate_functions: { + _sum: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _avg: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _max: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _min: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _stddev_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _variance: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_samp: { + result_type: { + type: "named", + name: "BigDecimal" + } + }, + _var_pop: { + result_type: { + type: "named", + name: "BigDecimal" + } + } + }, + comparison_operators: { + _eq: { + type: "equal", + }, + _gt: { + type: "custom", + argument_type: { + type: "named", + name: "BigDecimal", + }, + }, + _lt: { + type: "custom", + argument_type: { + type: "named", + name: "BigDecimal", + }, + }, + _gte: { + type: "custom", + argument_type: { + type: "named", + name: "BigDecimal", + }, + }, + _lte: { + type: "custom", + argument_type: { + type: "named", + name: "BigDecimal", + }, + }, + _neq: { + type: "custom", + argument_type: { + type: "named", + name: "BigDecimal", + }, + }, + }, + }, }; export const DUCKDB_CONFIG = { 'custom_user_agent': 'hasura' diff --git a/src/handlers/query.ts b/src/handlers/query.ts index 225f0c5..ca79f73 100644 --- a/src/handlers/query.ts +++ b/src/handlers/query.ts @@ -1,5 +1,4 @@ -//TODO: Aggregates https://github.com/hasura/ndc-duckduckapi/commit/29cacaddb811cbf0610a98a61d05a0d29da27554 - +// TODO: Use duckdb-async? import { QueryRequest, QueryResponse, @@ -241,7 +240,6 @@ function build_where( sql = `${lhs} = ${rhs}`; break; case "_like": - args[args.length - 1] = `%${args[args.length - 1]}%`; sql = `${lhs} LIKE ?`; break; case "_glob": @@ -355,11 +353,6 @@ function build_query( let order_by_sql = ``; let collect_rows = []; let where_conditions = ["WHERE 1"]; - if (query.aggregates) { - run_agg = true; - agg_sql = "... todo"; - throw new Forbidden("Aggregates not implemented yet!", {}); - } if (query.fields) { run_sql = true; for (let [field_name, field_value] of Object.entries(query.fields)) { @@ -502,6 +495,125 @@ ${offset_sql} // console.log(format(formatSQLWithArgs(sql, args), { language: "sqlite" })); } + if (query.aggregates) { + run_agg = true; + let agg_columns: string[] = []; + let agg_where_conditions = ["WHERE 1"]; + + if (path.length > 1 && relationship_key !== null) { + let relationship = query_request.collection_relationships[relationship_key]; + let parent_alias = path.slice(0, -1).join("_"); + agg_where_conditions.push( + ...Object.entries(relationship.column_mapping).map(([from, to]) => { + return `${escape_double(parent_alias)}.${escape_double(from)} = ${escape_double(collection_alias)}.${escape_double(to)}`; + }) + ); + } + + if (query.predicate) { + agg_where_conditions.push( + `(${build_where( + query.predicate, + query_request.collection_relationships, + agg_args, + variables, + collection_alias, + config.config?.collection_aliases || {}, + config, + query_request + )})` + ); + } + + for (const [agg_name, agg_value] of Object.entries(query.aggregates)) { + if (agg_value.type === "star_count") { + agg_columns.push(`COUNT(*) as ${escape_double(agg_name)}`); + } else if (agg_value.type === "column_count") { + const column = `subq.${escape_double(agg_value.column)}`; + const column_expr = agg_value.distinct + ? `COUNT(DISTINCT ${column})` + : `COUNT(${column})`; + agg_columns.push(`${column_expr} as ${escape_double(agg_name)}`); + } else if (agg_value.type === "single_column") { + const column = `subq.${escape_double(agg_value.column)}`; + switch (agg_value.function) { + case "_sum": + agg_columns.push(`SUM(${column}) as ${escape_double(agg_name)}`); + break; + case "_avg": + agg_columns.push(`AVG(${column}) as ${escape_double(agg_name)}`); + break; + case "_max": + agg_columns.push(`MAX(${column}) as ${escape_double(agg_name)}`); + break; + case "_min": + agg_columns.push(`MIN(${column}) as ${escape_double(agg_name)}`); + break; + case "_stddev": + case "_stddev_samp": + const stdevSampleFormula = `SQRT( + (COUNT(*) * SUM(POWER(CAST(${column} AS REAL), 2)) - POWER(SUM(CAST(${column} AS REAL)), 2)) + / (COUNT(*) * (COUNT(*) - 1)) + )`; + agg_columns.push(`${stdevSampleFormula} as ${escape_double(agg_name)}`); + break; + case "_stddev_pop": + const stdevPopFormula = `SQRT( + (COUNT(*) * SUM(POWER(CAST(${column} AS REAL), 2)) - POWER(SUM(CAST(${column} AS REAL)), 2)) + / (COUNT(*) * COUNT(*)) + )`; + agg_columns.push(`${stdevPopFormula} as ${escape_double(agg_name)}`); + break; + case "_variance": + case "_var_samp": + const varianceSampleFormula = `( + (COUNT(*) * SUM(POWER(CAST(${column} AS REAL), 2)) - POWER(SUM(CAST(${column} AS REAL)), 2)) + / (COUNT(*) * (COUNT(*) - 1)) + )`; + agg_columns.push(`${varianceSampleFormula} as ${escape_double(agg_name)}`); + break; + case "_var_pop": + const variancePopFormula = `( + (COUNT(*) * SUM(POWER(CAST(${column} AS REAL), 2)) - POWER(SUM(CAST(${column} AS REAL)), 2)) + / (COUNT(*) * COUNT(*)) + )`; + agg_columns.push(`${variancePopFormula} as ${escape_double(agg_name)}`); + break; + case "_group_concat": + agg_columns.push(`GROUP_CONCAT(${column}) as ${escape_double(agg_name)}`); + break; + case "_group_concat_distinct": + agg_columns.push(`GROUP_CONCAT(DISTINCT ${column}) as ${escape_double(agg_name)}`); + break; + case "_group_concat_include_nulls": + agg_columns.push(`GROUP_CONCAT(COALESCE(${column}, 'NULL')) as ${escape_double(agg_name)}`); + break; + default: + throw new Forbidden(`Unsupported aggregate function: ${agg_value.function}`, {}); + } + } + } + + agg_sql = wrap_data(` + SELECT JSON_OBJECT( + ${agg_columns + .map((col) => { + const parts = col.split(" as "); + return `${escape_single(parts[1].replace('"', '').replace('"', ''))}, ${parts[0]}`; + }) + .join(",")} + ) as data + FROM ( + SELECT * + FROM ${from_sql} + ${agg_where_conditions.join(" AND ")} + ${order_by_sql} + ${limit_sql} + ${offset_sql} + ) subq + `); + } + return { runSql: run_sql, runAgg: run_agg, @@ -579,12 +691,44 @@ async function perform_query( state: State, query_plans: SQLQuery[] ): Promise { - const con = state.client.connect(); const response: RowSet[] = []; for (let query_plan of query_plans) { - const res = await do_all(con, query_plan); - const row_set = JSON.parse(res[0]["data"] as string) as RowSet; - response.push(row_set); + try { + const connection = state.client.connect(); + let row_set: RowSet = {}; // Start with empty object + + if (query_plan.runAgg) { + const aggRes = await do_all(connection, { + runSql: true, + runAgg: false, + sql: query_plan.aggSql, + args: query_plan.aggArgs, + aggSql: "", + aggArgs: [], + }); + const parsedAggData = JSON.parse(aggRes[0]["data"]); + row_set.aggregates = parsedAggData; + } + + if (query_plan.runSql) { + const res = await do_all(connection, { + runSql: true, + runAgg: false, + sql: query_plan.sql, + args: query_plan.args, + aggSql: "", + aggArgs: [], + }); + const regular_results = JSON.parse(res[0]["data"]); + row_set.rows = regular_results.rows; + } + + response.push(row_set); + connection.close(); + } catch (err) { + console.error("Error performing query: " + err); + throw err; + } } return response; } @@ -594,6 +738,7 @@ export async function do_query( state: State, query: QueryRequest ): Promise { + // console.log(JSON.stringify(query, null, 4)); let query_plans = await plan_queries(configuration, query); return await perform_query(state, query_plans); }