diff --git a/datafusion/expr/src/aggregate_function.rs b/datafusion/expr/src/aggregate_function.rs index 6a4354d20470..dd27d9504bee 100644 --- a/datafusion/expr/src/aggregate_function.rs +++ b/datafusion/expr/src/aggregate_function.rs @@ -77,30 +77,34 @@ impl FromStr for AggregateFunction { type Err = DataFusionError; fn from_str(name: &str) -> Result { Ok(match name { - "min" => AggregateFunction::Min, - "max" => AggregateFunction::Max, - "count" => AggregateFunction::Count, + // general "avg" => AggregateFunction::Avg, + "count" => AggregateFunction::Count, + "max" => AggregateFunction::Max, "mean" => AggregateFunction::Avg, - "sum" => AggregateFunction::Sum, "median" => AggregateFunction::Median, - "approx_distinct" => AggregateFunction::ApproxDistinct, + "min" => AggregateFunction::Min, + "sum" => AggregateFunction::Sum, "array_agg" => AggregateFunction::ArrayAgg, - "var" => AggregateFunction::Variance, - "var_samp" => AggregateFunction::Variance, - "var_pop" => AggregateFunction::VariancePop, - "stddev" => AggregateFunction::Stddev, - "stddev_samp" => AggregateFunction::Stddev, - "stddev_pop" => AggregateFunction::StddevPop, + // statistical + "corr" => AggregateFunction::Correlation, "covar" => AggregateFunction::Covariance, - "covar_samp" => AggregateFunction::Covariance, "covar_pop" => AggregateFunction::CovariancePop, - "corr" => AggregateFunction::Correlation, + "covar_samp" => AggregateFunction::Covariance, + "stddev" => AggregateFunction::Stddev, + "stddev_pop" => AggregateFunction::StddevPop, + "stddev_samp" => AggregateFunction::Stddev, + "var" => AggregateFunction::Variance, + "var_pop" => AggregateFunction::VariancePop, + "var_samp" => AggregateFunction::Variance, + // approximate + "approx_distinct" => AggregateFunction::ApproxDistinct, + "approx_median" => AggregateFunction::ApproxMedian, "approx_percentile_cont" => AggregateFunction::ApproxPercentileCont, "approx_percentile_cont_with_weight" => { AggregateFunction::ApproxPercentileContWithWeight 
} - "approx_median" => AggregateFunction::ApproxMedian, + // other "grouping" => AggregateFunction::Grouping, _ => { return Err(DataFusionError::Plan(format!( diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 72033de8344e..95bd811dd453 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -313,12 +313,13 @@ impl FromStr for BuiltinScalarFunction { // math functions "abs" => BuiltinScalarFunction::Abs, "acos" => BuiltinScalarFunction::Acos, - "asin" => BuiltinScalarFunction::Asin, - "atan" => BuiltinScalarFunction::Atan, "acosh" => BuiltinScalarFunction::Acosh, + "asin" => BuiltinScalarFunction::Asin, "asinh" => BuiltinScalarFunction::Asinh, + "atan" => BuiltinScalarFunction::Atan, "atanh" => BuiltinScalarFunction::Atanh, "atan2" => BuiltinScalarFunction::Atan2, + "cbrt" => BuiltinScalarFunction::Cbrt, "ceil" => BuiltinScalarFunction::Ceil, "cos" => BuiltinScalarFunction::Cos, "cosh" => BuiltinScalarFunction::Cosh, @@ -330,21 +331,19 @@ impl FromStr for BuiltinScalarFunction { "log2" => BuiltinScalarFunction::Log2, "pi" => BuiltinScalarFunction::Pi, "power" | "pow" => BuiltinScalarFunction::Power, + "random" => BuiltinScalarFunction::Random, "round" => BuiltinScalarFunction::Round, "signum" => BuiltinScalarFunction::Signum, "sin" => BuiltinScalarFunction::Sin, "sinh" => BuiltinScalarFunction::Sinh, "sqrt" => BuiltinScalarFunction::Sqrt, - "cbrt" => BuiltinScalarFunction::Cbrt, "tan" => BuiltinScalarFunction::Tan, "tanh" => BuiltinScalarFunction::Tanh, "trunc" => BuiltinScalarFunction::Trunc, // conditional functions "coalesce" => BuiltinScalarFunction::Coalesce, - - // array functions - "make_array" => BuiltinScalarFunction::MakeArray, + "nullif" => BuiltinScalarFunction::NullIf, // string functions "ascii" => BuiltinScalarFunction::Ascii, @@ -355,51 +354,61 @@ impl FromStr for BuiltinScalarFunction { "concat" => BuiltinScalarFunction::Concat, "concat_ws" => 
BuiltinScalarFunction::ConcatWithSeparator, "chr" => BuiltinScalarFunction::Chr, - "current_date" => BuiltinScalarFunction::CurrentDate, - "current_time" => BuiltinScalarFunction::CurrentTime, - "date_part" | "datepart" => BuiltinScalarFunction::DatePart, - "date_trunc" | "datetrunc" => BuiltinScalarFunction::DateTrunc, - "date_bin" => BuiltinScalarFunction::DateBin, "initcap" => BuiltinScalarFunction::InitCap, "left" => BuiltinScalarFunction::Left, "length" => BuiltinScalarFunction::CharacterLength, "lower" => BuiltinScalarFunction::Lower, "lpad" => BuiltinScalarFunction::Lpad, "ltrim" => BuiltinScalarFunction::Ltrim, - "md5" => BuiltinScalarFunction::MD5, - "nullif" => BuiltinScalarFunction::NullIf, "octet_length" => BuiltinScalarFunction::OctetLength, - "random" => BuiltinScalarFunction::Random, - "regexp_replace" => BuiltinScalarFunction::RegexpReplace, "repeat" => BuiltinScalarFunction::Repeat, "replace" => BuiltinScalarFunction::Replace, "reverse" => BuiltinScalarFunction::Reverse, "right" => BuiltinScalarFunction::Right, "rpad" => BuiltinScalarFunction::Rpad, "rtrim" => BuiltinScalarFunction::Rtrim, - "sha224" => BuiltinScalarFunction::SHA224, - "sha256" => BuiltinScalarFunction::SHA256, - "sha384" => BuiltinScalarFunction::SHA384, - "sha512" => BuiltinScalarFunction::SHA512, - "digest" => BuiltinScalarFunction::Digest, "split_part" => BuiltinScalarFunction::SplitPart, "starts_with" => BuiltinScalarFunction::StartsWith, "strpos" => BuiltinScalarFunction::Strpos, "substr" => BuiltinScalarFunction::Substr, "to_hex" => BuiltinScalarFunction::ToHex, - "to_timestamp" => BuiltinScalarFunction::ToTimestamp, - "to_timestamp_millis" => BuiltinScalarFunction::ToTimestampMillis, - "to_timestamp_micros" => BuiltinScalarFunction::ToTimestampMicros, - "to_timestamp_seconds" => BuiltinScalarFunction::ToTimestampSeconds, - "now" => BuiltinScalarFunction::Now, "translate" => BuiltinScalarFunction::Translate, "trim" => BuiltinScalarFunction::Trim, "upper" => 
BuiltinScalarFunction::Upper, "uuid" => BuiltinScalarFunction::Uuid, + + // regex functions "regexp_match" => BuiltinScalarFunction::RegexpMatch, - "struct" => BuiltinScalarFunction::Struct, + "regexp_replace" => BuiltinScalarFunction::RegexpReplace, + + // time/date functions + "now" => BuiltinScalarFunction::Now, + "current_date" => BuiltinScalarFunction::CurrentDate, + "current_time" => BuiltinScalarFunction::CurrentTime, + "date_bin" => BuiltinScalarFunction::DateBin, + "date_trunc" | "datetrunc" => BuiltinScalarFunction::DateTrunc, + "date_part" | "datepart" => BuiltinScalarFunction::DatePart, + "to_timestamp" => BuiltinScalarFunction::ToTimestamp, + "to_timestamp_millis" => BuiltinScalarFunction::ToTimestampMillis, + "to_timestamp_micros" => BuiltinScalarFunction::ToTimestampMicros, + "to_timestamp_seconds" => BuiltinScalarFunction::ToTimestampSeconds, "from_unixtime" => BuiltinScalarFunction::FromUnixtime, + + // hashing functions + "digest" => BuiltinScalarFunction::Digest, + "md5" => BuiltinScalarFunction::MD5, + "sha224" => BuiltinScalarFunction::SHA224, + "sha256" => BuiltinScalarFunction::SHA256, + "sha384" => BuiltinScalarFunction::SHA384, + "sha512" => BuiltinScalarFunction::SHA512, + + // other functions + "struct" => BuiltinScalarFunction::Struct, "arrow_typeof" => BuiltinScalarFunction::ArrowTypeof, + + // array functions + "make_array" => BuiltinScalarFunction::MakeArray, + _ => { return Err(DataFusionError::Plan(format!( "There is no built-in function named {name}" diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index fff547f25785..497cf78aadeb 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -27,6 +27,7 @@ Aggregate functions operate on a set of values to compute a single result. 
- [count](#count) - [max](#max) - [mean](#mean) +- [median](#median) - [min](#min) - [sum](#sum) - [array_agg](#array_agg) @@ -82,6 +83,19 @@ max(expression) _Alias of [avg](#avg)._ +### `median` + +Returns the median value in the specified column. + +``` +median(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. + Can be a constant, column, or function, and any combination of arithmetic operators. + ### `min` Returns the minimum value in the specified column. @@ -110,7 +124,16 @@ sum(expression) ### `array_agg` - +Returns an array created from the expression elements. + +``` +array_agg(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. + Can be a constant, column, or function, and any combination of arithmetic operators. ## Statistical diff --git a/docs/source/user-guide/sql/data_types.md b/docs/source/user-guide/sql/data_types.md index 2753b014dc36..063976dc3d04 100644 --- a/docs/source/user-guide/sql/data_types.md +++ b/docs/source/user-guide/sql/data_types.md @@ -60,20 +60,20 @@ For example, to cast the output of `now()` to a `Timestamp` with second precisio ## Numeric Types -| SQL DataType | Arrow DataType | Notes | -| ------------------------------------ | :---------------------------- | ----------------------------------------------------------------------------------------------------------- | -| `TINYINT` | `Int8` | | -| `SMALLINT` | `Int16` | | -| `INT` or `INTEGER` | `Int32` | | -| `BIGINT` | `Int64` | | -| `TINYINT UNSIGNED` | `UInt8` | | -| `SMALLINT UNSIGNED` | `UInt16` | | -| `INT UNSIGNED` or `INTEGER UNSIGNED` | `UInt32` | | -| `BIGINT UNSIGNED` | `UInt64` | | -| `FLOAT` | `Float32` | | -| `REAL` | `Float32` | | -| `DOUBLE` | `Float64` | | -| `DECIMAL(precision,scale)` | `Decimal128(precision,scale)` | Decimal support is currently experimental ([#3523](https://github.com/apache/arrow-datafusion/issues/3523)) | +| SQL DataType | Arrow DataType | Notes | +| 
------------------------------------ | :----------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `TINYINT` | `Int8` | | +| `SMALLINT` | `Int16` | | +| `INT` or `INTEGER` | `Int32` | | +| `BIGINT` | `Int64` | | +| `TINYINT UNSIGNED` | `UInt8` | | +| `SMALLINT UNSIGNED` | `UInt16` | | +| `INT UNSIGNED` or `INTEGER UNSIGNED` | `UInt32` | | +| `BIGINT UNSIGNED` | `UInt64` | | +| `FLOAT` | `Float32` | | +| `REAL` | `Float32` | | +| `DOUBLE` | `Float64` | | +| `DECIMAL(precision, scale)` | `Decimal128(precision, scale)` | Decimal support is currently experimental ([#3523](https://github.com/apache/arrow-datafusion/issues/3523)) | ## Date/Time Types diff --git a/docs/source/user-guide/sql/ddl.md b/docs/source/user-guide/sql/ddl.md index 8de29b4e50ff..45d7d81a0aec 100644 --- a/docs/source/user-guide/sql/ddl.md +++ b/docs/source/user-guide/sql/ddl.md @@ -19,6 +19,32 @@ # DDL +## CREATE DATABASE + +Create catalog with specified name. + +
+CREATE DATABASE [ IF NOT EXISTS ] catalog
+
+ +```sql +-- create catalog cat +CREATE DATABASE cat; +``` + +## CREATE SCHEMA + +Create schema under specified catalog, or the default DataFusion catalog if not specified. + +
+CREATE SCHEMA [ IF NOT EXISTS ] [ catalog. ] schema_name
+
+ +```sql +-- create schema emu under catalog cat +CREATE SCHEMA cat.emu; +``` + ## CREATE EXTERNAL TABLE Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary @@ -67,7 +93,7 @@ When creating an output from a data source that is already ordered by an express the data using the `WITH ORDER` clause. This applies even if the expression used for sorting is complex, allowing for greater flexibility. -Here's an example of how to use `WITH ORDER` query +Here's an example of how to use the `WITH ORDER` clause. ```sql CREATE EXTERNAL TABLE test ( @@ -91,14 +117,14 @@ WITH ORDER (c2 ASC, c5 + c8 DESC NULL FIRST) LOCATION '/path/to/aggregate_test_100.csv'; ``` -where `WITH ORDER` clause specifies the sort order: +Where the `WITH ORDER` clause specifies the sort order: ```sql WITH ORDER (sort_expression1 [ASC | DESC] [NULLS { FIRST | LAST }] [, sort_expression2 [ASC | DESC] [NULLS { FIRST | LAST }] ...]) ``` -### Cautions When Using the WITH ORDER Clause +### Cautions when using the WITH ORDER clause - It's important to understand that using the `WITH ORDER` clause in the `CREATE EXTERNAL TABLE` statement only specifies the order in which the data should be read from the external file. If the data in the file is not already sorted according to the specified order, then the results may not be correct. @@ -153,7 +179,7 @@ DROP TABLE IF EXISTS nonexistent_table; View is a virtual table based on the result of a SQL query. It can be created from an existing table or values list.
-CREATE VIEW view_name AS statement;
+CREATE [ OR REPLACE ] VIEW view_name AS statement;
 
```sql diff --git a/docs/source/user-guide/sql/explain.md b/docs/source/user-guide/sql/explain.md index ae0795f9ab4b..ca4169d5773c 100644 --- a/docs/source/user-guide/sql/explain.md +++ b/docs/source/user-guide/sql/explain.md @@ -28,7 +28,7 @@ EXPLAIN [ANALYZE] [VERBOSE] statement ## EXPLAIN Shows the execution plan of a statement. -If you need more details output, try to use `EXPLAIN VERBOSE`. +If you need more detailed output, use `EXPLAIN VERBOSE`. ```sql EXPLAIN SELECT SUM(x) FROM table GROUP BY b; @@ -52,7 +52,7 @@ EXPLAIN SELECT SUM(x) FROM table GROUP BY b; ## EXPLAIN ANALYZE Shows the execution plan and metrics of a statement. -If you need more information output, try to use `EXPLAIN ANALYZE VERBOSE`. +If you need more detailed output, use `EXPLAIN ANALYZE VERBOSE`. ```sql EXPLAIN ANALYZE SELECT SUM(x) FROM table GROUP BY b; diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 0414ffdd3157..25002e572d55 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -36,10 +36,12 @@ - [exp](#exp) - [floor](#floor) - [ln](#ln) +- [log](#log) - [log10](#log10) - [log2](#log2) - [pi](#pi) - [power](#power) +- [pow](#pow) - [random](#random) - [round](#round) - [signum](#signum) @@ -247,6 +249,23 @@ ln(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +### `log` + +Returns the base-x logarithm of a number. +A base can be specified; if it is omitted, the base-10 logarithm of the number is returned. + +``` +log(base, numeric_expression) +log(numeric_expression) +``` + +#### Arguments + +- **base**: Base numeric expression to operate on. + Can be a constant, column, or function, and any combination of arithmetic operators. +- **numeric_expression**: Numeric expression to operate on.
+ Can be a constant, column, or function, and any combination of arithmetic operators. + ### `log10` Returns the base-10 logarithm of a number. @@ -262,7 +281,7 @@ log10(numeric_expression) ### `log2` -Returns the base-2 logarithm or a number. +Returns the base-2 logarithm of a number. ``` log2(numeric_expression) @@ -283,7 +302,7 @@ pi() ### `power` -Returns a base number raised to the power of an exponent. +Returns a base expression raised to the power of an exponent. ``` power(base, exponent) @@ -291,14 +310,22 @@ power(base, exponent) #### Arguments -- **power**: Base numeric expression to operate on. +- **base**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. - **exponent**: Exponent numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +#### Aliases + +- pow + +### `pow` + +_Alias of [power](#power)._ + ### `random` -Returns a random float value between 0 and 1. +Returns a random float value in the range [0, 1). The random seed is unique to each row. ``` @@ -480,6 +507,7 @@ nullif(expression1, expression2) - [translate](#translate) - [trim](#trim) - [upper](#upper) +- [uuid](#uuid) ### `ascii` @@ -673,7 +701,7 @@ lower(str) ### `lpad` -Pads the left side a string with another string to a specified string length. +Pads the left side of a string with another string to a specified string length. ``` lpad(str, n[, padding_str]) @@ -794,7 +822,7 @@ right(str, n) ### `rpad` -right side a string with another string to a specified string length. +Pads the right side of a string with another string to a specified string length. ``` rpad(str, n[, padding_str]) @@ -832,7 +860,7 @@ rtrim(str) ### `split_part` -Splits a string based on a specified delimiter and returns the substring a the +Splits a string based on a specified delimiter and returns the substring in the specified position. 
``` @@ -957,6 +985,14 @@ upper(str) [initcap](#initcap), [lower](#lower) +### `uuid` + +Returns UUID v4 string value which is unique per row. + +``` +uuid() +``` + ## Regular Expression Functions Apache DataFusion uses the POSIX regular expression syntax and @@ -1004,9 +1040,13 @@ regexp_replace(str, regexp, replacement, flags) ## Time and Date Functions - [now](#now) +- [current_date](#current_date) +- [current_time](#current_time) - [date_bin](#date_bin) - [date_trunc](#date_trunc) +- [datetrunc](#datetrunc) - [date_part](#date_part) +- [datepart](#datepart) - [extract](#extract) - [to_timestamp](#to_timestamp) - [to_timestamp_millis](#to_timestamp_millis) @@ -1025,6 +1065,28 @@ no matter when in the query plan the function executes. now() ``` +### `current_date` + +Returns the current UTC date. + +The `current_date()` return value is determined at query time and will return the same date, +no matter when in the query plan the function executes. + +``` +current_date() +``` + +### `current_time` + +Returns the current UTC time. + +The `current_time()` return value is determined at query time and will return the same time, +no matter when in the query plan the function executes. + +``` +current_time() +``` + ### `date_bin` Calculates time intervals and returns the start of the interval nearest to the specified timestamp. @@ -1084,6 +1146,14 @@ date_trunc(precision, expression) - **expression**: Time expression to operate on. Can be a constant, column, or function. +#### Aliases + +- datetrunc + +### `datetrunc` + +_Alias of [date_trunc](#date_trunc)._ + ### `date_part` Returns the specified part of the date as an integer. @@ -1113,6 +1183,14 @@ date_part(part, expression) - **expression**: Time expression to operate on. Can be a constant, column, or function. +#### Aliases + +- datepart + +### `datepart` + +_Alias of [date_part](#date_part)._ + ### `extract` Returns a sub-field from a time value as an integer. 
@@ -1224,12 +1302,37 @@ from_unixtime(expression) ## Hashing Functions +- [digest](#digest) - [md5](#md5) - [sha224](#sha224) - [sha256](#sha256) - [sha384](#sha384) - [sha512](#sha512) +### `digest` + +Computes the binary hash of an expression using the specified algorithm. + +``` +digest(expression, algorithm) +``` + +#### Arguments + +- **expression**: String expression to operate on. + Can be a constant, column, or function, and any combination of string operators. +- **algorithm**: String expression specifying algorithm to use. + Must be one of: + + - md5 + - sha224 + - sha256 + - sha384 + - sha512 + - blake2s + - blake2b + - blake3 + ### `md5` Computes an MD5 128-bit checksum for a string expression. @@ -1297,17 +1400,17 @@ sha512(expression) ## Other Functions -- [array](#array) +- [make_array](#make_array) - [arrow_cast](#arrow_cast) - [arrow_typeof](#arrow_typeof) - [struct](#struct) -### `array` +### `make_array` Returns an Arrow array using the specified input expressions. ``` -array(expression1[, ..., expression_n]) +make_array(expression1[, ..., expression_n]) ``` #### Arguments diff --git a/docs/source/user-guide/sql/select.md b/docs/source/user-guide/sql/select.md index 3eea252d7080..68be88d7cff3 100644 --- a/docs/source/user-guide/sql/select.md +++ b/docs/source/user-guide/sql/select.md @@ -83,7 +83,7 @@ SELECT a FROM table WHERE a > 10 ## JOIN clause -DataFusion supports `INNER JOIN`, `LEFT OUTER JOIN`, `RIGHT OUTER JOIN`, `FULL OUTER JOIN`, and `CROSS JOIN`. +DataFusion supports `INNER JOIN`, `LEFT OUTER JOIN`, `RIGHT OUTER JOIN`, `FULL OUTER JOIN`, `NATURAL JOIN` and `CROSS JOIN`. The following examples are based on this table: @@ -153,6 +153,20 @@ either side of the join where there is not a match. +----------+----------+----------+----------+ ``` +### NATURAL JOIN + +A natural join defines an inner join based on common column names found between the input tables. When no common +column names are found, it behaves like a cross join. 
+ +```sql +❯ select * from x natural join x y; ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +``` + ### CROSS JOIN A cross join produces a cartesian product that matches every row in the left side of the join with every row in the