Skip to content

Commit 0c1ec27

Browse files
authored
[BugFix] Fix unexpected shift of extraction for rex with nested capture groups in named groups (#4641)
1 parent 448c42a commit 0c1ec27

File tree

7 files changed

+274
-63
lines changed

7 files changed

+274
-63
lines changed

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -282,10 +282,13 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
282282
"Rex pattern must contain at least one named capture group");
283283
}
284284

285+
// TODO: Once JDK 20+ is supported, consider using Pattern.namedGroups() API for more efficient
286+
// named group handling instead of manual parsing in RegexCommonUtils
287+
285288
List<RexNode> newFields = new ArrayList<>();
286289
List<String> newFieldNames = new ArrayList<>();
287290

288-
for (int i = 0; i < namedGroups.size(); i++) {
291+
for (String groupName : namedGroups) {
289292
RexNode extractCall;
290293
if (node.getMaxMatch().isPresent() && node.getMaxMatch().get() > 1) {
291294
extractCall =
@@ -294,7 +297,7 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
294297
BuiltinFunctionName.REX_EXTRACT_MULTI,
295298
fieldRex,
296299
context.rexBuilder.makeLiteral(patternStr),
297-
context.relBuilder.literal(i + 1),
300+
context.rexBuilder.makeLiteral(groupName),
298301
context.relBuilder.literal(node.getMaxMatch().get()));
299302
} else {
300303
extractCall =
@@ -303,10 +306,10 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
303306
BuiltinFunctionName.REX_EXTRACT,
304307
fieldRex,
305308
context.rexBuilder.makeLiteral(patternStr),
306-
context.relBuilder.literal(i + 1));
309+
context.rexBuilder.makeLiteral(groupName));
307310
}
308311
newFields.add(extractCall);
309-
newFieldNames.add(namedGroups.get(i));
312+
newFieldNames.add(groupName);
310313
}
311314

312315
if (node.getOffsetField().isPresent()) {

core/src/main/java/org/opensearch/sql/expression/function/udf/RexExtractFunction.java

Lines changed: 74 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@
1515
import org.apache.calcite.linq4j.tree.Expression;
1616
import org.apache.calcite.linq4j.tree.Expressions;
1717
import org.apache.calcite.rex.RexCall;
18+
import org.apache.calcite.sql.type.CompositeOperandTypeChecker;
1819
import org.apache.calcite.sql.type.ReturnTypes;
1920
import org.apache.calcite.sql.type.SqlReturnTypeInference;
2021
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
2122
import org.opensearch.sql.expression.function.ImplementorUDF;
2223
import org.opensearch.sql.expression.function.UDFOperandMetadata;
24+
import org.opensearch.sql.expression.parse.RegexCommonUtils;
2325

2426
/** Custom REX_EXTRACT function for extracting regex named capture groups. */
2527
public final class RexExtractFunction extends ImplementorUDF {
@@ -35,7 +37,12 @@ public SqlReturnTypeInference getReturnTypeInference() {
3537

3638
@Override
3739
public UDFOperandMetadata getOperandMetadata() {
38-
return PPLOperandTypes.STRING_STRING_INTEGER;
40+
// Support both (field, pattern, groupIndex) and (field, pattern, groupName)
41+
return UDFOperandMetadata.wrap(
42+
(CompositeOperandTypeChecker)
43+
PPLOperandTypes.STRING_STRING_INTEGER
44+
.getInnerTypeChecker()
45+
.or(PPLOperandTypes.STRING_STRING_STRING.getInnerTypeChecker()));
3946
}
4047

4148
private static class RexExtractImplementor implements NotNullImplementor {
@@ -45,19 +52,80 @@ public Expression implement(
4552
RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
4653
Expression field = translatedOperands.get(0);
4754
Expression pattern = translatedOperands.get(1);
48-
Expression groupIndex = translatedOperands.get(2);
55+
Expression groupIndexOrName = translatedOperands.get(2);
4956

50-
return Expressions.call(RexExtractFunction.class, "extractGroup", field, pattern, groupIndex);
57+
return Expressions.call(
58+
RexExtractFunction.class, "extractGroup", field, pattern, groupIndexOrName);
5159
}
5260
}
5361

62+
/**
63+
* Extract a regex group by index (1-based).
64+
*
65+
* @param text The input text to extract from
66+
* @param pattern The regex pattern
67+
* @param groupIndex The 1-based group index to extract
68+
* @return The extracted value or null if not found or invalid
69+
*/
5470
public static String extractGroup(String text, String pattern, int groupIndex) {
71+
if (text == null || pattern == null) {
72+
return null;
73+
}
74+
75+
return executeExtraction(
76+
text,
77+
pattern,
78+
matcher -> {
79+
if (groupIndex > 0 && groupIndex <= matcher.groupCount()) {
80+
return matcher.group(groupIndex);
81+
}
82+
return null;
83+
});
84+
}
85+
86+
/**
87+
* Extract a named capture group from text using the provided pattern. This method avoids the
88+
* index shifting issue that occurs with nested unnamed groups.
89+
*
90+
* @param text The input text to extract from
91+
* @param pattern The regex pattern with named capture groups
92+
* @param groupName The name of the capture group to extract
93+
* @return The extracted value or null if not found
94+
*/
95+
public static String extractGroup(String text, String pattern, String groupName) {
96+
if (text == null || pattern == null || groupName == null) {
97+
return null;
98+
}
99+
100+
return executeExtraction(
101+
text,
102+
pattern,
103+
matcher -> {
104+
try {
105+
return matcher.group(groupName);
106+
} catch (IllegalArgumentException e) {
107+
// Group name doesn't exist in the pattern
108+
return null;
109+
}
110+
});
111+
}
112+
113+
/**
114+
* Common extraction logic to avoid code duplication.
115+
*
116+
* @param text The input text
117+
* @param pattern The regex pattern
118+
* @param extractor Function to extract the value from the matcher
119+
* @return The extracted value or null
120+
*/
121+
private static String executeExtraction(
122+
String text, String pattern, java.util.function.Function<Matcher, String> extractor) {
55123
try {
56-
Pattern compiledPattern = Pattern.compile(pattern);
124+
Pattern compiledPattern = RegexCommonUtils.getCompiledPattern(pattern);
57125
Matcher matcher = compiledPattern.matcher(text);
58126

59-
if (matcher.find() && groupIndex > 0 && groupIndex <= matcher.groupCount()) {
60-
return matcher.group(groupIndex);
127+
if (matcher.find()) {
128+
return extractor.apply(matcher);
61129
}
62130
return null;
63131
} catch (PatternSyntaxException e) {

core/src/main/java/org/opensearch/sql/expression/function/udf/RexExtractMultiFunction.java

Lines changed: 95 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,15 @@
1616
import org.apache.calcite.linq4j.tree.Expression;
1717
import org.apache.calcite.linq4j.tree.Expressions;
1818
import org.apache.calcite.rex.RexCall;
19+
import org.apache.calcite.sql.type.CompositeOperandTypeChecker;
20+
import org.apache.calcite.sql.type.OperandTypes;
1921
import org.apache.calcite.sql.type.SqlReturnTypeInference;
22+
import org.apache.calcite.sql.type.SqlTypeFamily;
2023
import org.apache.calcite.sql.type.SqlTypeName;
2124
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
2225
import org.opensearch.sql.expression.function.ImplementorUDF;
2326
import org.opensearch.sql.expression.function.UDFOperandMetadata;
27+
import org.opensearch.sql.expression.parse.RegexCommonUtils;
2428

2529
/** Custom REX_EXTRACT_MULTI function for extracting multiple regex matches. */
2630
public final class RexExtractMultiFunction extends ImplementorUDF {
@@ -40,7 +44,17 @@ public SqlReturnTypeInference getReturnTypeInference() {
4044

4145
@Override
4246
public UDFOperandMetadata getOperandMetadata() {
43-
return PPLOperandTypes.STRING_STRING_INTEGER_INTEGER;
47+
// Support both (field, pattern, groupIndex, maxMatch) and (field, pattern, groupName, maxMatch)
48+
return UDFOperandMetadata.wrap(
49+
(CompositeOperandTypeChecker)
50+
PPLOperandTypes.STRING_STRING_INTEGER_INTEGER
51+
.getInnerTypeChecker()
52+
.or(
53+
OperandTypes.family(
54+
SqlTypeFamily.CHARACTER,
55+
SqlTypeFamily.CHARACTER,
56+
SqlTypeFamily.CHARACTER,
57+
SqlTypeFamily.INTEGER)));
4458
}
4559

4660
private static class RexExtractMultiImplementor implements NotNullImplementor {
@@ -50,35 +64,105 @@ public Expression implement(
5064
RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
5165
Expression field = translatedOperands.get(0);
5266
Expression pattern = translatedOperands.get(1);
53-
Expression groupIndex = translatedOperands.get(2);
67+
Expression groupIndexOrName = translatedOperands.get(2);
5468
Expression maxMatch = translatedOperands.get(3);
5569

5670
return Expressions.call(
5771
RexExtractMultiFunction.class,
5872
"extractMultipleGroups",
5973
field,
6074
pattern,
61-
groupIndex,
75+
groupIndexOrName,
6276
maxMatch);
6377
}
6478
}
6579

80+
/**
81+
* Extract multiple regex groups by index (1-based).
82+
*
83+
* @param text The input text to extract from
84+
* @param pattern The regex pattern
85+
* @param groupIndex The 1-based group index to extract
86+
* @param maxMatch Maximum number of matches to return (0 = unlimited)
87+
* @return List of extracted values or null if no matches found
88+
*/
6689
public static List<String> extractMultipleGroups(
6790
String text, String pattern, int groupIndex, int maxMatch) {
68-
// Query planner already validates null inputs via NullPolicy.ARG0
91+
if (text == null || pattern == null) {
92+
return null;
93+
}
94+
95+
return executeMultipleExtractions(
96+
text,
97+
pattern,
98+
maxMatch,
99+
matcher -> {
100+
if (groupIndex > 0 && groupIndex <= matcher.groupCount()) {
101+
return matcher.group(groupIndex);
102+
}
103+
return null;
104+
});
105+
}
106+
107+
/**
108+
* Extract multiple occurrences of a named capture group from text. This method avoids the index
109+
* shifting issue that occurs with nested unnamed groups.
110+
*
111+
* @param text The input text to extract from
112+
* @param pattern The regex pattern with named capture groups
113+
* @param groupName The name of the capture group to extract
114+
* @param maxMatch Maximum number of matches to return (0 = unlimited)
115+
* @return List of extracted values or null if no matches found
116+
*/
117+
public static List<String> extractMultipleGroups(
118+
String text, String pattern, String groupName, int maxMatch) {
119+
if (text == null || pattern == null || groupName == null) {
120+
return null;
121+
}
122+
123+
return executeMultipleExtractions(
124+
text,
125+
pattern,
126+
maxMatch,
127+
matcher -> {
128+
try {
129+
return matcher.group(groupName);
130+
} catch (IllegalArgumentException e) {
131+
// Group name doesn't exist in the pattern, stop processing
132+
return null;
133+
}
134+
});
135+
}
136+
137+
/**
138+
* Common extraction logic for multiple matches to avoid code duplication.
139+
*
140+
* @param text The input text
141+
* @param pattern The regex pattern
142+
* @param maxMatch Maximum matches (0 = unlimited)
143+
* @param extractor Function to extract the value from the matcher
144+
* @return List of extracted values or null if no matches found
145+
*/
146+
private static List<String> executeMultipleExtractions(
147+
String text,
148+
String pattern,
149+
int maxMatch,
150+
java.util.function.Function<Matcher, String> extractor) {
69151
try {
70-
Pattern compiledPattern = Pattern.compile(pattern);
152+
Pattern compiledPattern = RegexCommonUtils.getCompiledPattern(pattern);
71153
Matcher matcher = compiledPattern.matcher(text);
72154
List<String> matches = new ArrayList<>();
73155

74156
int matchCount = 0;
75157
while (matcher.find() && (maxMatch == 0 || matchCount < maxMatch)) {
76-
if (groupIndex > 0 && groupIndex <= matcher.groupCount()) {
77-
String match = matcher.group(groupIndex);
78-
if (match != null) {
79-
matches.add(match);
80-
matchCount++;
81-
}
158+
String match = extractor.apply(matcher);
159+
if (match != null) {
160+
matches.add(match);
161+
matchCount++;
162+
} else {
163+
// If extractor returns null, it might indicate an error (like invalid group name)
164+
// Stop processing to avoid infinite loop
165+
break;
82166
}
83167
}
84168

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteRexCommandIT.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,4 +306,53 @@ public void testRexMaxMatchConfigurableLimit() throws IOException {
306306
new ClusterSetting(PERSISTENT, Settings.Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), null));
307307
}
308308
}
309+
310+
@Test
311+
public void testRexNestedCaptureGroupsBugFix() throws IOException {
312+
JSONObject resultWithNested =
313+
executeQuery(
314+
String.format(
315+
"source=%s | rex field=email"
316+
+ " \\\"(?<user>[^@]+)@(?<domain>(pyrami|gmail|yahoo))\\\\\\\\.(?<tld>(com|org|net))\\\""
317+
+ " | fields user, domain, tld | head 1",
318+
TEST_INDEX_ACCOUNT));
319+
320+
assertEquals(1, resultWithNested.getJSONArray("datarows").length());
321+
assertEquals(
322+
"amberduke",
323+
resultWithNested
324+
.getJSONArray("datarows")
325+
.getJSONArray(0)
326+
.get(0)); // user should be "amberduke"
327+
assertEquals(
328+
"pyrami",
329+
resultWithNested
330+
.getJSONArray("datarows")
331+
.getJSONArray(0)
332+
.get(1)); // domain should be "pyrami", NOT "amberduke"
333+
assertEquals(
334+
"com",
335+
resultWithNested
336+
.getJSONArray("datarows")
337+
.getJSONArray(0)
338+
.get(2)); // tld should be "com", NOT "pyrami"
339+
340+
// More complex nested alternation
341+
JSONObject complexNested =
342+
executeQuery(
343+
String.format(
344+
"source=%s | rex field=firstname"
345+
+ " \\\"(?<initial>(A|B|C|D|E))[a-z]*(?<suffix>(ley|nne|ber|ton|son))\\\" |"
346+
+ " fields initial, suffix | head 1",
347+
TEST_INDEX_ACCOUNT));
348+
349+
if (!complexNested.getJSONArray("datarows").isEmpty()) {
350+
String initial = complexNested.getJSONArray("datarows").getJSONArray(0).getString(0);
351+
String suffix = complexNested.getJSONArray("datarows").getJSONArray(0).getString(1);
352+
353+
assertTrue("Initial should be a single letter A-E", initial.matches("[A-E]"));
354+
assertTrue(
355+
"Suffix should match alternation pattern", suffix.matches("(ley|nne|ber|ton|son)"));
356+
}
357+
}
309358
}

integ-test/src/test/resources/expectedOutput/calcite/explain_rex.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ calcite:
33
LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
44
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], initial=[$17])
55
LogicalSort(fetch=[5])
6-
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 1)])
6+
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 'initial')])
77
CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
88
physical: |
9-
EnumerableCalc(expr#0..10=[{inputs}], expr#11=['(?<initial>^[A-Z])'], expr#12=[1], expr#13=[REX_EXTRACT($t10, $t11, $t12)], proj#0..10=[{exprs}], $f11=[$t13])
9+
EnumerableCalc(expr#0..10=[{inputs}], expr#11=['(?<initial>^[A-Z])'], expr#12=['initial'], expr#13=[REX_EXTRACT($t10, $t11, $t12)], proj#0..10=[{exprs}], $f11=[$t13])
1010
CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], LIMIT->5, LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":5,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=5, pageSize=null, startFrom=0)])

integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_rex.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@ calcite:
33
LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
44
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], initial=[$17])
55
LogicalSort(fetch=[5])
6-
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 1)])
6+
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 'initial')])
77
CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
88
physical: |
99
EnumerableLimit(fetch=[10000])
10-
EnumerableCalc(expr#0..16=[{inputs}], expr#17=['(?<initial>^[A-Z])'], expr#18=[1], expr#19=[REX_EXTRACT($t10, $t17, $t18)], proj#0..10=[{exprs}], initial=[$t19])
10+
EnumerableCalc(expr#0..16=[{inputs}], expr#17=['(?<initial>^[A-Z])'], expr#18=['initial'], expr#19=[REX_EXTRACT($t10, $t17, $t18)], proj#0..10=[{exprs}], initial=[$t19])
1111
EnumerableLimit(fetch=[5])
1212
CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])

0 commit comments

Comments
 (0)