-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Spark: Support singular form of years, months, days, and hours functions #12117
base: main
Are you sure you want to change the base?
Changes from all commits
87c408b
ccf99c7
d1e6c4b
7f6ed48
9ed424e
2ea1c19
519815a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,17 +31,29 @@ | |
* A Spark function implementation for the Iceberg day transform. | ||
* | ||
* <p>Example usage: {@code SELECT system.days('source_col')}. | ||
* | ||
* <p>Alternate form: {@code SELECT system.day('source_col')}. | ||
*/ | ||
public class DaysFunction extends UnaryUnboundFunction { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ebyhr if I understand you correctly, you also suggest introducing a constructor that takes a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't want the function to be constructed with an arbitrary name. We really just want the name to be either "day" or "days" in this case. So we can have a constructor with a boolean. |
||
|
||
private boolean singular; | ||
|
||
public DaysFunction() { | ||
this(false); | ||
} | ||
|
||
DaysFunction(boolean singular) { | ||
this.singular = singular; | ||
} | ||
|
||
@Override | ||
protected BoundFunction doBind(DataType valueType) { | ||
if (valueType instanceof DateType) { | ||
return new DateToDaysFunction(); | ||
return new DateToDaysFunction(singular); | ||
} else if (valueType instanceof TimestampType) { | ||
return new TimestampToDaysFunction(); | ||
return new TimestampToDaysFunction(singular); | ||
} else if (valueType instanceof TimestampNTZType) { | ||
return new TimestampNtzToDaysFunction(); | ||
return new TimestampNtzToDaysFunction(singular); | ||
} else { | ||
throw new UnsupportedOperationException( | ||
"Expected value to be date or timestamp: " + valueType.catalogString()); | ||
|
@@ -57,13 +69,19 @@ public String description() { | |
|
||
@Override | ||
public String name() { | ||
return "days"; | ||
return singular ? "day" : "days"; | ||
} | ||
|
||
private abstract static class BaseToDaysFunction extends BaseScalarFunction<Integer> { | ||
private boolean singular; | ||
|
||
protected BaseToDaysFunction(boolean singular) { | ||
this.singular = singular; | ||
} | ||
|
||
@Override | ||
public String name() { | ||
return "days"; | ||
return singular ? "day" : "days"; | ||
} | ||
|
||
@Override | ||
|
@@ -74,6 +92,14 @@ public DataType resultType() { | |
|
||
// Spark and Iceberg internal representations of dates match so no transformation is required | ||
public static class DateToDaysFunction extends BaseToDaysFunction { | ||
public DateToDaysFunction() { | ||
this(false); | ||
} | ||
|
||
DateToDaysFunction(boolean singular) { | ||
super(singular); | ||
} | ||
|
||
// magic method used in codegen | ||
public static int invoke(int days) { | ||
return days; | ||
|
@@ -86,7 +112,7 @@ public DataType[] inputTypes() { | |
|
||
@Override | ||
public String canonicalName() { | ||
return "iceberg.days(date)"; | ||
return "iceberg." + name() + "(date)"; | ||
} | ||
|
||
@Override | ||
|
@@ -97,6 +123,14 @@ public Integer produceResult(InternalRow input) { | |
} | ||
|
||
public static class TimestampToDaysFunction extends BaseToDaysFunction { | ||
public TimestampToDaysFunction() { | ||
this(false); | ||
} | ||
|
||
TimestampToDaysFunction(boolean singular) { | ||
super(singular); | ||
} | ||
|
||
// magic method used in codegen | ||
public static int invoke(long micros) { | ||
return DateTimeUtil.microsToDays(micros); | ||
|
@@ -109,7 +143,7 @@ public DataType[] inputTypes() { | |
|
||
@Override | ||
public String canonicalName() { | ||
return "iceberg.days(timestamp)"; | ||
return "iceberg." + name() + "(timestamp)"; | ||
} | ||
|
||
@Override | ||
|
@@ -120,6 +154,14 @@ public Integer produceResult(InternalRow input) { | |
} | ||
|
||
public static class TimestampNtzToDaysFunction extends BaseToDaysFunction { | ||
public TimestampNtzToDaysFunction() { | ||
this(false); | ||
} | ||
|
||
TimestampNtzToDaysFunction(boolean singular) { | ||
super(singular); | ||
} | ||
|
||
// magic method used in codegen | ||
public static int invoke(long micros) { | ||
return DateTimeUtil.microsToDays(micros); | ||
|
@@ -132,7 +174,7 @@ public DataType[] inputTypes() { | |
|
||
@Override | ||
public String canonicalName() { | ||
return "iceberg.days(timestamp_ntz)"; | ||
return "iceberg." + name() + "(timestamp_ntz)"; | ||
} | ||
|
||
@Override | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wasn't aware of the hardcoded references to "years", "months", "days" and "hours" in
SparkV2Filters
.The changes here in
SparkV2Filters
are only necessary if we want to make the singular forms the standard.Right now, I have left the plural forms as standard.
@rdblue should we make the singular form of the functions the standard?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hey @wypoon thanks for working on this. Initially, we went for the plural form because the singular form is already a function in Spark (e.g. day), do you know if these interfere? We should also make sure that we convert this in the tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Fokko thanks for looking at this, and the explanation for why the plural form was used.
There is no interference between the Iceberg functions and the Spark built-in functions, because the Iceberg functions use the "system" namespace. I added tests for the built-in functions to demonstrate this.
There is one thing that gave me pause, which is the comment in BaseCatalog::isFunctionNamespace:
For this reason, I added some variants to
TestStoragePartitionJoins
to use the singular forms of the transforms. Those tests pass too.