|
5 | 5 |
|
6 | 6 | package org.opensearch.sql.expression.function.udf.binning; |
7 | 7 |
|
| 8 | +import java.time.Instant; |
| 9 | +import java.time.ZoneOffset; |
| 10 | +import java.time.ZonedDateTime; |
| 11 | +import java.time.format.DateTimeFormatter; |
8 | 12 | import java.util.List; |
9 | 13 | import org.apache.calcite.adapter.enumerable.NotNullImplementor; |
10 | 14 | import org.apache.calcite.adapter.enumerable.NullPolicy; |
|
24 | 28 | import org.opensearch.sql.expression.function.UDFOperandMetadata; |
25 | 29 |
|
26 | 30 | /** |
27 | | - * WIDTH_BUCKET(field_value, num_bins, data_range, max_value) - Histogram bucketing function. |
| 31 | + * WIDTH_BUCKET(field_value, num_bins, min_value, max_value) - Histogram bucketing function. |
28 | 32 | * |
29 | 33 | * <p>This function creates equal-width bins for histogram operations. It uses a mathematical O(1) |
30 | 34 | * algorithm to determine optimal bin widths based on powers of 10. |
31 | 35 | * |
32 | 36 | * <p>Parameters: |
33 | 37 | * |
34 | 38 | * <ul> |
35 | | - * <li>field_value - The numeric value to bin |
| 39 | + * <li>field_value - The numeric or timestamp value to bin |
36 | 40 | * <li>num_bins - Number of bins to create |
37 | | - * <li>data_range - Range of the data (MAX - MIN) |
| 41 | + * <li>min_value - Minimum value in the dataset |
38 | 42 | * <li>max_value - Maximum value in the dataset |
39 | 43 | * </ul> |
40 | 44 | * |
41 | | - * <p>Implements the same binning logic as BinCalculatorFunction for 'bins' type. |
| 45 | + * <p>Supports both numeric and timestamp fields. For timestamps, uses auto_date_histogram interval |
| 46 | + * selection. |
42 | 47 | */ |
43 | 48 | public class WidthBucketFunction extends ImplementorUDF { |
44 | 49 |
|
@@ -76,35 +81,76 @@ public Expression implement( |
76 | 81 | RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) { |
77 | 82 | Expression fieldValue = translatedOperands.get(0); |
78 | 83 | Expression numBins = translatedOperands.get(1); |
79 | | - Expression dataRange = translatedOperands.get(2); |
| 84 | + Expression minValue = translatedOperands.get(2); |
80 | 85 | Expression maxValue = translatedOperands.get(3); |
81 | 86 |
|
| 87 | + // Pass the field type information to help detect timestamps |
| 88 | + RelDataType fieldType = call.getOperands().get(0).getType(); |
| 89 | + boolean isTimestampField = dateRelatedType(fieldType); |
| 90 | + Expression isTimestamp = Expressions.constant(isTimestampField); |
| 91 | + |
| 92 | + // For timestamp fields, keep as-is (don't convert to Number) |
| 93 | + // For numeric fields, convert to Number |
| 94 | + Expression fieldValueExpr = |
| 95 | + isTimestampField ? fieldValue : Expressions.convert_(fieldValue, Number.class); |
| 96 | + Expression minValueExpr = |
| 97 | + isTimestampField ? minValue : Expressions.convert_(minValue, Number.class); |
| 98 | + Expression maxValueExpr = |
| 99 | + isTimestampField ? maxValue : Expressions.convert_(maxValue, Number.class); |
| 100 | + |
82 | 101 | return Expressions.call( |
83 | 102 | WidthBucketImplementor.class, |
84 | 103 | "calculateWidthBucket", |
85 | | - Expressions.convert_(fieldValue, Number.class), |
| 104 | + fieldValueExpr, |
86 | 105 | Expressions.convert_(numBins, Number.class), |
87 | | - Expressions.convert_(dataRange, Number.class), |
88 | | - Expressions.convert_(maxValue, Number.class)); |
| 106 | + minValueExpr, |
| 107 | + maxValueExpr, |
| 108 | + isTimestamp); |
89 | 109 | } |
90 | 110 |
|
91 | 111 | /** Width bucket calculation using nice number algorithm. */ |
92 | 112 | public static String calculateWidthBucket( |
93 | | - Number fieldValue, Number numBinsParam, Number dataRange, Number maxValue) { |
94 | | - if (fieldValue == null || numBinsParam == null || dataRange == null || maxValue == null) { |
| 113 | + Object fieldValue, |
| 114 | + Number numBinsParam, |
| 115 | + Object minValue, |
| 116 | + Object maxValue, |
| 117 | + boolean isTimestamp) { |
| 118 | + if (fieldValue == null || numBinsParam == null || minValue == null || maxValue == null) { |
95 | 119 | return null; |
96 | 120 | } |
97 | 121 |
|
98 | | - double value = fieldValue.doubleValue(); |
99 | 122 | int numBins = numBinsParam.intValue(); |
100 | | - |
101 | 123 | if (numBins < BinConstants.MIN_BINS || numBins > BinConstants.MAX_BINS) { |
102 | 124 | return null; |
103 | 125 | } |
104 | 126 |
|
105 | | - double range = dataRange.doubleValue(); |
106 | | - double max = maxValue.doubleValue(); |
| 127 | + // Handle timestamp fields differently |
| 128 | + if (isTimestamp) { |
| 129 | + // Convert all timestamp values to milliseconds |
| 130 | + long fieldMillis = convertTimestampToMillis(fieldValue); |
| 131 | + long minMillis = convertTimestampToMillis(minValue); |
| 132 | + long maxMillis = convertTimestampToMillis(maxValue); |
| 133 | + |
| 134 | + // Calculate range |
| 135 | + long rangeMillis = maxMillis - minMillis; |
| 136 | + if (rangeMillis <= 0) { |
| 137 | + return null; |
| 138 | + } |
| 139 | + |
| 140 | + return calculateTimestampBucket(fieldMillis, numBins, rangeMillis, minMillis); |
| 141 | + } |
| 142 | + |
| 143 | + // Numeric field handling (existing logic) |
| 144 | + Number numericValue = (Number) fieldValue; |
| 145 | + Number numericMin = (Number) minValue; |
| 146 | + Number numericMax = (Number) maxValue; |
| 147 | + |
| 148 | + double value = numericValue.doubleValue(); |
| 149 | + double min = numericMin.doubleValue(); |
| 150 | + double max = numericMax.doubleValue(); |
107 | 151 |
|
| 152 | + // Calculate range |
| 153 | + double range = max - min; |
108 | 154 | if (range <= 0) { |
109 | 155 | return null; |
110 | 156 | } |
@@ -190,5 +236,92 @@ private static int getAppropriateDecimalPlaces(double span) { |
190 | 236 | return 4; |
191 | 237 | } |
192 | 238 | } |
| 239 | + |
| 240 | + /** |
| 241 | + * Convert timestamp value to milliseconds. Handles both numeric (Long) milliseconds and String |
| 242 | + * formatted timestamps. |
| 243 | + */ |
| 244 | + private static long convertTimestampToMillis(Object timestamp) { |
| 245 | + if (timestamp instanceof Number) { |
| 246 | + return ((Number) timestamp).longValue(); |
| 247 | + } else if (timestamp instanceof String) { |
| 248 | + // Parse timestamp string "yyyy-MM-dd HH:mm:ss" to milliseconds |
| 249 | + // Use LocalDateTime to parse without timezone, then convert to UTC |
| 250 | + String timestampStr = (String) timestamp; |
| 251 | + java.time.LocalDateTime localDateTime = |
| 252 | + java.time.LocalDateTime.parse( |
| 253 | + timestampStr, DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); |
| 254 | + // Assume the timestamp is in UTC and convert to epoch millis |
| 255 | + return localDateTime.atZone(ZoneOffset.UTC).toInstant().toEpochMilli(); |
| 256 | + } else { |
| 257 | + throw new IllegalArgumentException("Unsupported timestamp type: " + timestamp.getClass()); |
| 258 | + } |
| 259 | + } |
| 260 | + |
| 261 | + /** |
| 262 | + * Calculate timestamp bucket using auto_date_histogram interval selection. Timestamps are in |
| 263 | + * milliseconds since epoch. Bins are aligned to the minimum timestamp, not to calendar |
| 264 | + * boundaries. |
| 265 | + */ |
| 266 | + private static String calculateTimestampBucket( |
| 267 | + long timestampMillis, int numBins, long rangeMillis, long minMillis) { |
| 268 | + // Calculate target width in milliseconds |
| 269 | + long targetWidthMillis = rangeMillis / numBins; |
| 270 | + |
| 271 | + // Select appropriate time interval (same as OpenSearch auto_date_histogram) |
| 272 | + long intervalMillis = selectTimeInterval(targetWidthMillis); |
| 273 | + |
| 274 | + // Floor timestamp to the interval boundary aligned with minMillis |
| 275 | + // This ensures bins start at the data's minimum value, like OpenSearch auto_date_histogram |
| 276 | + long offsetFromMin = timestampMillis - minMillis; |
| 277 | + long intervalsSinceMin = offsetFromMin / intervalMillis; |
| 278 | + long binStartMillis = minMillis + (intervalsSinceMin * intervalMillis); |
| 279 | + |
| 280 | + // Format as ISO 8601 timestamp string |
| 281 | + return formatTimestamp(binStartMillis); |
| 282 | + } |
| 283 | + |
| 284 | + /** |
| 285 | + * Select the appropriate time interval based on target width. Uses the same intervals as |
| 286 | + * OpenSearch auto_date_histogram: 1s, 5s, 10s, 30s, 1m, 5m, 10m, 30m, 1h, 3h, 12h, 1d, 7d, 1M, |
| 287 | + * 1y |
| 288 | + */ |
| 289 | + private static long selectTimeInterval(long targetWidthMillis) { |
| 290 | + // Define nice time intervals in milliseconds |
| 291 | + long[] intervals = { |
| 292 | + 1000L, // 1 second |
| 293 | + 5000L, // 5 seconds |
| 294 | + 10000L, // 10 seconds |
| 295 | + 30000L, // 30 seconds |
| 296 | + 60000L, // 1 minute |
| 297 | + 300000L, // 5 minutes |
| 298 | + 600000L, // 10 minutes |
| 299 | + 1800000L, // 30 minutes |
| 300 | + 3600000L, // 1 hour |
| 301 | + 10800000L, // 3 hours |
| 302 | + 43200000L, // 12 hours |
| 303 | + 86400000L, // 1 day |
| 304 | + 604800000L, // 7 days |
| 305 | + 2592000000L, // 30 days (approximate month) |
| 306 | + 31536000000L // 365 days (approximate year) |
| 307 | + }; |
| 308 | + |
| 309 | + // Find the smallest interval that is >= target width |
| 310 | + for (long interval : intervals) { |
| 311 | + if (interval >= targetWidthMillis) { |
| 312 | + return interval; |
| 313 | + } |
| 314 | + } |
| 315 | + |
| 316 | + // If target is larger than all intervals, use the largest |
| 317 | + return intervals[intervals.length - 1]; |
| 318 | + } |
| 319 | + |
| 320 | + /** Format timestamp in milliseconds as ISO 8601 string. Format: "yyyy-MM-dd HH:mm:ss" */ |
| 321 | + private static String formatTimestamp(long timestampMillis) { |
| 322 | + Instant instant = Instant.ofEpochMilli(timestampMillis); |
| 323 | + ZonedDateTime zdt = instant.atZone(ZoneOffset.UTC); |
| 324 | + return zdt.format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); |
| 325 | + } |
193 | 326 | } |
194 | 327 | } |
0 commit comments