diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 573a51de2a7..09ad5d4009a 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -2854,9 +2854,39 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { for (ReplacePair pair : node.getReplacePairs()) { RexNode patternNode = rexVisitor.analyze(pair.getPattern(), context); RexNode replacementNode = rexVisitor.analyze(pair.getReplacement(), context); - fieldRef = - context.relBuilder.call( - SqlStdOperatorTable.REPLACE, fieldRef, patternNode, replacementNode); + + String patternStr = pair.getPattern().getValue().toString(); + String replacementStr = pair.getReplacement().getValue().toString(); + + if (patternStr.contains("*")) { + WildcardUtils.validateWildcardSymmetry(patternStr, replacementStr); + + String regexPattern = WildcardUtils.convertWildcardPatternToRegex(patternStr); + String regexReplacement = + WildcardUtils.convertWildcardReplacementToRegex(replacementStr); + + RexNode regexPatternNode = + context.rexBuilder.makeLiteral( + regexPattern, + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true); + RexNode regexReplacementNode = + context.rexBuilder.makeLiteral( + regexReplacement, + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true); + + fieldRef = + context.rexBuilder.makeCall( + org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE_3, + fieldRef, + regexPatternNode, + regexReplacementNode); + } else { + fieldRef = + context.relBuilder.call( + SqlStdOperatorTable.REPLACE, fieldRef, patternNode, replacementNode); + } } projectList.add(fieldRef); diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java index 09552e97109..8558a5292b7 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java @@ -5,6 +5,7 @@ package org.opensearch.sql.calcite.utils; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; @@ -92,4 +93,141 @@ private static boolean matchesCompiledPattern(String[] parts, String fieldName) public static boolean containsWildcard(String str) { return str != null && str.contains(WILDCARD); } + + /** + * Converts a wildcard pattern to a regex pattern. + * + *

Example: "*ada" → "^(.*?)ada$" + * + * @param wildcardPattern wildcard pattern with '*' and escape sequences (\*, \\) + * @return regex pattern with capture groups + */ + public static String convertWildcardPatternToRegex(String wildcardPattern) { + String[] parts = splitWildcards(wildcardPattern); + StringBuilder regexBuilder = new StringBuilder("^"); + + for (int i = 0; i < parts.length; i++) { + regexBuilder.append(java.util.regex.Pattern.quote(parts[i])); + if (i < parts.length - 1) { + regexBuilder.append("(.*?)"); // Non-greedy capture group for wildcard + } + } + regexBuilder.append("$"); + + return regexBuilder.toString(); + } + + /** + * Converts a wildcard replacement string to a regex replacement string. + * + *

Example: "*_*" → "$1_$2" + * + * @param wildcardReplacement replacement string with '*' and escape sequences (\*, \\) + * @return regex replacement string with capture group references + */ + public static String convertWildcardReplacementToRegex(String wildcardReplacement) { + if (!wildcardReplacement.contains("*")) { + return wildcardReplacement; // No wildcards = literal replacement + } + + StringBuilder result = new StringBuilder(); + int captureIndex = 1; // Regex capture groups start at $1 + boolean escaped = false; + + for (char c : wildcardReplacement.toCharArray()) { + if (escaped) { + // Handle escape sequences: \* or \\ + result.append(c); + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + // Replace wildcard with $1, $2, etc. + result.append('$').append(captureIndex++); + } else { + result.append(c); + } + } + + return result.toString(); + } + + /** + * Splits a wildcard pattern into parts separated by unescaped wildcards. + * + *

Example: "a*b*c" → ["a", "b", "c"] + * + * @param pattern wildcard pattern with escape sequences + * @return array of pattern parts + */ + private static String[] splitWildcards(String pattern) { + List parts = new ArrayList<>(); + StringBuilder current = new StringBuilder(); + boolean escaped = false; + + for (char c : pattern.toCharArray()) { + if (escaped) { + current.append(c); + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + parts.add(current.toString()); + current = new StringBuilder(); + } else { + current.append(c); + } + } + + if (escaped) { + throw new IllegalArgumentException( + "Invalid escape sequence: pattern ends with unescaped backslash"); + } + + parts.add(current.toString()); + return parts.toArray(new String[0]); + } + + /** + * Counts the number of unescaped wildcards in a string. + * + * @param str string to count wildcards in + * @return number of unescaped wildcards + */ + private static int countWildcards(String str) { + int count = 0; + boolean escaped = false; + for (char c : str.toCharArray()) { + if (escaped) { + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + count++; + } + } + return count; + } + + /** + * Validates that wildcard count is symmetric between pattern and replacement. + * + *

Replacement must have either the same number of wildcards as the pattern, or zero wildcards. + * + * @param pattern wildcard pattern + * @param replacement wildcard replacement + * @throws IllegalArgumentException if wildcard counts are mismatched + */ + public static void validateWildcardSymmetry(String pattern, String replacement) { + int patternWildcards = countWildcards(pattern); + int replacementWildcards = countWildcards(replacement); + + if (replacementWildcards != 0 && replacementWildcards != patternWildcards) { + throw new IllegalArgumentException( + String.format( + "Error in 'replace' command: Wildcard count mismatch - pattern has %d wildcard(s), " + + "replacement has %d. Replacement must have same number of wildcards or none.", + patternWildcards, replacementWildcards)); + } + } } diff --git a/core/src/test/java/org/opensearch/sql/calcite/utils/WildcardUtilsTest.java b/core/src/test/java/org/opensearch/sql/calcite/utils/WildcardUtilsTest.java index 53cc1d5163c..2e41de018a5 100644 --- a/core/src/test/java/org/opensearch/sql/calcite/utils/WildcardUtilsTest.java +++ b/core/src/test/java/org/opensearch/sql/calcite/utils/WildcardUtilsTest.java @@ -5,6 +5,11 @@ package org.opensearch.sql.calcite.utils; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + import com.google.common.collect.ImmutableList; import java.util.List; import org.junit.jupiter.api.BeforeEach; @@ -74,6 +79,32 @@ void testMatchesWildcardPattern() { testPattern("*a*e", "city", false); } + @Test + void testMatchesWildcardPatternEdgeCases() { + // Test null handling + assertFalse(WildcardUtils.matchesWildcardPattern(null, "field")); + assertFalse(WildcardUtils.matchesWildcardPattern("pattern", null)); + assertFalse(WildcardUtils.matchesWildcardPattern(null, null)); + + // Test empty strings + assertTrue(WildcardUtils.matchesWildcardPattern("", "")); + assertFalse(WildcardUtils.matchesWildcardPattern("", "field")); + assertFalse(WildcardUtils.matchesWildcardPattern("field", "")); + + // Test single wildcard + assertTrue(WildcardUtils.matchesWildcardPattern("*", "anything")); + assertTrue(WildcardUtils.matchesWildcardPattern("*", "")); + + // Test multiple consecutive wildcards + assertTrue(WildcardUtils.matchesWildcardPattern("**", "field")); + assertTrue(WildcardUtils.matchesWildcardPattern("a**b", "ab")); + assertTrue(WildcardUtils.matchesWildcardPattern("a**b", "axxxb")); + + // Test wildcards at start and end + assertTrue(WildcardUtils.matchesWildcardPattern("*field*", "myfield123")); + assertTrue(WildcardUtils.matchesWildcardPattern("*field*", "field")); + } + @Test void testExpandWildcardPattern() { // Test exact match @@ -97,6 +128,20 @@ void testExpandWildcardPattern() { testExpansion("XYZ*", ImmutableList.of()); } + @Test + void testExpandWildcardPatternEdgeCases() { + // Test null handling + assertEquals(List.of(), WildcardUtils.expandWildcardPattern(null, availableFields)); + assertEquals(List.of(), WildcardUtils.expandWildcardPattern("pattern", null)); + assertEquals(List.of(), WildcardUtils.expandWildcardPattern(null, null)); + + // Test empty list + assertEquals(List.of(), WildcardUtils.expandWildcardPattern("*", List.of())); + + // Test single wildcard matches all + assertEquals(availableFields, WildcardUtils.expandWildcardPattern("*", availableFields)); + } + @Test void testContainsWildcard() { // Test with wildcard @@ -108,4 +153,142 @@ void testContainsWildcard() { testContainsWildcard("field", false); testContainsWildcard("", false); } + + @Test + void testContainsWildcardEdgeCases() { + // Test null + assertFalse(WildcardUtils.containsWildcard(null)); + + // Test multiple wildcards + assertTrue(WildcardUtils.containsWildcard("**")); + assertTrue(WildcardUtils.containsWildcard("a*b*c")); + } + + @Test + void testConvertWildcardPatternToRegex() { + // Basic patterns + assertEquals("^\\Qada\\E$", WildcardUtils.convertWildcardPatternToRegex("ada")); + assertEquals("^\\Q\\E(.*?)\\Qada\\E$", WildcardUtils.convertWildcardPatternToRegex("*ada")); + assertEquals("^\\Qada\\E(.*?)\\Q\\E$", WildcardUtils.convertWildcardPatternToRegex("ada*")); + assertEquals( + "^\\Q\\E(.*?)\\Qada\\E(.*?)\\Q\\E$", WildcardUtils.convertWildcardPatternToRegex("*ada*")); + + // Multiple wildcards + assertEquals( + "^\\Qa\\E(.*?)\\Qb\\E(.*?)\\Qc\\E$", WildcardUtils.convertWildcardPatternToRegex("a*b*c")); + + // Pattern with special regex characters + assertEquals( + "^\\Qa.b\\E(.*?)\\Qc+d\\E$", WildcardUtils.convertWildcardPatternToRegex("a.b*c+d")); + + // Single wildcard + assertEquals("^\\Q\\E(.*?)\\Q\\E$", WildcardUtils.convertWildcardPatternToRegex("*")); + + // Empty pattern + assertEquals("^\\Q\\E$", WildcardUtils.convertWildcardPatternToRegex("")); + + // Invalid pattern with trailing backslash should throw + IllegalArgumentException ex = + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.convertWildcardPatternToRegex("pattern\\")); + assertTrue(ex.getMessage().contains("Invalid escape sequence")); + } + + @Test + void testConvertWildcardReplacementToRegex() { + // No wildcards - literal replacement + assertEquals("ada", WildcardUtils.convertWildcardReplacementToRegex("ada")); + assertEquals("test_value", WildcardUtils.convertWildcardReplacementToRegex("test_value")); + + // Single wildcard + assertEquals("$1", WildcardUtils.convertWildcardReplacementToRegex("*")); + + // Wildcards with text + assertEquals("$1_$2", WildcardUtils.convertWildcardReplacementToRegex("*_*")); + assertEquals("prefix_$1", WildcardUtils.convertWildcardReplacementToRegex("prefix_*")); + assertEquals("$1_suffix", WildcardUtils.convertWildcardReplacementToRegex("*_suffix")); + + // Multiple wildcards + assertEquals("$1_$2_$3", WildcardUtils.convertWildcardReplacementToRegex("*_*_*")); + + // Empty string + assertEquals("", WildcardUtils.convertWildcardReplacementToRegex("")); + } + + @Test + void testConvertWildcardReplacementToRegexWithEscapes() { + // Escaped wildcard should be treated as literal + assertEquals("*", WildcardUtils.convertWildcardReplacementToRegex("\\*")); // \* -> * + assertEquals("$1_*", WildcardUtils.convertWildcardReplacementToRegex("*_\\*")); + assertEquals("*_$1", WildcardUtils.convertWildcardReplacementToRegex("\\*_*")); + + // Escaped backslash when there's no wildcard - returned unchanged + assertEquals("\\\\", WildcardUtils.convertWildcardReplacementToRegex("\\\\")); + + // Mixed escaped and unescaped wildcards + assertEquals("$1_*_$2", WildcardUtils.convertWildcardReplacementToRegex("*_\\*_*")); + assertEquals("$1\\$2", WildcardUtils.convertWildcardReplacementToRegex("*\\\\*")); // \\ -> \ + } + + @Test + void testValidateWildcardSymmetry() { + // Valid: same number of wildcards + WildcardUtils.validateWildcardSymmetry("*", "*"); + WildcardUtils.validateWildcardSymmetry("*ada*", "*_*"); + WildcardUtils.validateWildcardSymmetry("a*b*c", "x*y*z"); + + // Valid: replacement has no wildcards (literal replacement) + WildcardUtils.validateWildcardSymmetry("*", "literal"); + WildcardUtils.validateWildcardSymmetry("*ada*", "replacement"); + WildcardUtils.validateWildcardSymmetry("a*b*c", "xyz"); + + // Valid: pattern has no wildcards + WildcardUtils.validateWildcardSymmetry("ada", "replacement"); + } + + @Test + void testValidateWildcardSymmetryFailure() { + // Invalid: mismatched wildcard counts + IllegalArgumentException ex1 = + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("*", "**")); + assertTrue(ex1.getMessage().contains("Wildcard count mismatch")); + assertTrue(ex1.getMessage().contains("pattern has 1 wildcard(s)")); + assertTrue(ex1.getMessage().contains("replacement has 2")); + + IllegalArgumentException ex2 = + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("*a*b*", "*_*")); + assertTrue(ex2.getMessage().contains("pattern has 3 wildcard(s)")); + assertTrue(ex2.getMessage().contains("replacement has 2")); + + IllegalArgumentException ex3 = + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("ada", "*")); + assertTrue(ex3.getMessage().contains("pattern has 0 wildcard(s)")); + assertTrue(ex3.getMessage().contains("replacement has 1")); + } + + @Test + void testValidateWildcardSymmetryWithEscapes() { + // Escaped wildcards should not count + WildcardUtils.validateWildcardSymmetry("\\*", "literal"); // 0 wildcards in pattern + WildcardUtils.validateWildcardSymmetry("*\\*", "*"); // 1 wildcard in both + + // Pattern with 2 wildcards, replacement with 1 wildcard (middle one in \\**\\*) + WildcardUtils.validateWildcardSymmetry("*", "\\**\\*"); // 1 wildcard in both + + // Should fail when unescaped counts don't match + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("*a*", "*\\*")); // 2 vs 1 + + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("*a*", "\\**\\*")); // 2 vs 1 + } } diff --git a/docs/user/ppl/cmd/replace.rst b/docs/user/ppl/cmd/replace.rst index bcb0d57e677..0098124344d 100644 --- a/docs/user/ppl/cmd/replace.rst +++ b/docs/user/ppl/cmd/replace.rst @@ -11,7 +11,7 @@ replace Description ============ -Using ``replace`` command to replace text in one or more fields in the search result. +Using ``replace`` command to replace text in one or more fields. Supports literal string replacement and wildcard patterns using ``*``. Note: This command is only available when Calcite engine is enabled. @@ -21,13 +21,6 @@ Syntax replace '' WITH '' [, '' WITH '']... IN [, ]... -Parameters -========== -* **pattern**: mandatory. The text pattern you want to replace. Currently supports only plain text literals (no wildcards or regular expressions). -* **replacement**: mandatory. The text you want to replace with. -* **field-name**: mandatory. One or more field names where the replacement should occur. - - Examples ======== @@ -120,8 +113,158 @@ PPL query:: +-----------------+-------+--------+-----+--------+ +Example 6: Wildcard suffix match +--------------------------------- + +Replace values that end with a specific pattern. The wildcard ``*`` matches any prefix. + +PPL query:: + + os> source=accounts | replace "*IL" WITH "Illinois" IN state | fields state; + fetched rows / total rows = 4/4 + +----------+ + | state | + |----------| + | Illinois | + | TN | + | VA | + | MD | + +----------+ + + +Example 7: Wildcard prefix match +--------------------------------- + +Replace values that start with a specific pattern. The wildcard ``*`` matches any suffix. + +PPL query:: + + os> source=accounts | replace "IL*" WITH "Illinois" IN state | fields state; + fetched rows / total rows = 4/4 + +----------+ + | state | + |----------| + | Illinois | + | TN | + | VA | + | MD | + +----------+ + + +Example 8: Wildcard capture and substitution +--------------------------------------------- + +Use wildcards in both pattern and replacement to capture and reuse matched portions. The number of wildcards must match in pattern and replacement. + +PPL query:: + + os> source=accounts | replace "* Lane" WITH "Lane *" IN address | fields address; + fetched rows / total rows = 4/4 + +----------------------+ + | address | + |----------------------| + | Lane 880 Holmes | + | 671 Bristol Street | + | 789 Madison Street | + | 467 Hutchinson Court | + +----------------------+ + + +Example 9: Multiple wildcards for pattern transformation +--------------------------------------------------------- + +Use multiple wildcards to transform patterns. Each wildcard in the replacement substitutes the corresponding captured value. + +PPL query:: + + os> source=accounts | replace "* *" WITH "*_*" IN address | fields address; + fetched rows / total rows = 4/4 + +----------------------+ + | address | + |----------------------| + | 880_Holmes Lane | + | 671_Bristol Street | + | 789_Madison Street | + | 467_Hutchinson Court | + +----------------------+ + + +Example 10: Wildcard with zero wildcards in replacement +-------------------------------------------------------- + +When replacement has zero wildcards, all matching values are replaced with the literal replacement string. + +PPL query:: + + os> source=accounts | replace "*IL*" WITH "Illinois" IN state | fields state; + fetched rows / total rows = 4/4 + +----------+ + | state | + |----------| + | Illinois | + | TN | + | VA | + | MD | + +----------+ + + +Example 11: Matching literal asterisks +--------------------------------------- + +Use ``\*`` to match literal asterisk characters (``\*`` = literal asterisk, ``\\`` = literal backslash). + +PPL query:: + + os> source=accounts | eval note = 'price: *sale*' | replace 'price: \*sale\*' WITH 'DISCOUNTED' IN note | fields note; + fetched rows / total rows = 4/4 + +------------+ + | note | + |------------| + | DISCOUNTED | + | DISCOUNTED | + | DISCOUNTED | + | DISCOUNTED | + +------------+ + +Example 12: Wildcard with no replacement wildcards +---------------------------------------------------- + +Use wildcards in pattern but none in replacement to create a fixed output. + +PPL query:: + + os> source=accounts | eval test = 'prefix-value-suffix' | replace 'prefix-*-suffix' WITH 'MATCHED' IN test | fields test; + fetched rows / total rows = 4/4 + +---------+ + | test | + |---------| + | MATCHED | + | MATCHED | + | MATCHED | + | MATCHED | + +---------+ + +Example 13: Escaped asterisks with wildcards +--------------------------------------------- + +Combine escaped asterisks (literal) with wildcards for complex patterns. + +PPL query:: + + os> source=accounts | eval label = 'file123.txt' | replace 'file*.*' WITH '\**.*' IN label | fields label; + fetched rows / total rows = 4/4 + +----------+ + | label | + |----------| + | *123.txt | + | *123.txt | + | *123.txt | + | *123.txt | + +----------+ + + Limitations =========== -* Only supports plain text literals for pattern matching. Wildcards and regular expressions are not supported. -* Pattern and replacement values must be string literals. -* The replace command modifies the specified fields in-place. \ No newline at end of file +* Wildcards: ``*`` matches zero or more characters (case-sensitive) +* Replacement wildcards must match pattern wildcard count, or be zero +* Escape sequences: ``\*`` (literal asterisk), ``\\`` (literal backslash) \ No newline at end of file diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index 77f3a45cc07..6da047e0c20 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -1335,6 +1335,17 @@ public void testReplaceCommandExplain() throws IOException { TEST_INDEX_ACCOUNT))); } + @Test + public void testReplaceCommandWildcardExplain() throws IOException { + String expected = loadExpectedPlan("explain_replace_wildcard.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | replace '*L' WITH 'STATE_IL' IN state | fields state", + TEST_INDEX_ACCOUNT))); + } + @Test public void testExplainRareCommandUseNull() throws IOException { String expected = loadExpectedPlan("explain_rare_usenull_false.yaml"); diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java index 9d6304c363b..44cc4a3aaf0 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java @@ -288,4 +288,118 @@ public void testMultiplePairsSequentialApplication() throws IOException { rows("John", "Ontario Province"), rows("Jane", "Quebec")); } + + @Test + public void testWildcardReplace_suffixMatch() throws IOException { + // Pattern "*ada" should match "Canada" and replace with "CA" + JSONObject result = + executeQuery( + String.format( + "source = %s | replace '*ada' WITH 'CA' IN country | fields name, country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + verifyDataRows( + result, rows("Jake", "USA"), rows("Hello", "USA"), rows("John", "CA"), rows("Jane", "CA")); + } + + @Test + public void testWildcardReplace_prefixMatch() throws IOException { + // Pattern "US*" should match "USA" and replace with "United States" + JSONObject result = + executeQuery( + String.format( + "source = %s | replace 'US*' WITH 'United States' IN country | fields name," + + " country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + verifyDataRows( + result, + rows("Jake", "United States"), + rows("Hello", "United States"), + rows("John", "Canada"), + rows("Jane", "Canada")); + } + + @Test + public void testWildcardReplace_multipleWildcards() throws IOException { + // Pattern "* *" with replacement "*_*" should replace spaces with underscores + JSONObject result = + executeQuery( + String.format( + "source = %s | replace '* *' WITH '*_*' IN state | fields name, state", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("state", "string")); + + verifyDataRows( + result, + rows("Jake", "California"), + rows("Hello", "New_York"), + rows("John", "Ontario"), + rows("Jane", "Quebec")); + } + + @Test + public void testWildcardReplace_symmetryMismatch_shouldFail() { + // Pattern has 2 wildcards, replacement has 1 - should fail + Throwable e = + assertThrowsWithReplace( + IllegalArgumentException.class, + () -> + executeQuery( + String.format( + "source = %s | replace '* *' WITH '*' IN state", + TEST_INDEX_STATE_COUNTRY))); + verifyErrorMessageContains(e, "Wildcard count mismatch"); + } + + @Test + public void testEscapeSequence_literalAsterisk() throws IOException { + // Test matching literal asterisks in data using \* escape sequence + JSONObject result = + executeQuery( + String.format( + "source = %s | eval note = 'price: *sale*' | replace 'price: \\\\*sale\\\\*' WITH" + + " 'DISCOUNTED' IN note | fields note | head 1", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("note", "string")); + // Pattern "price: \*sale\*" matches literal asterisks, result should be "DISCOUNTED" + verifyDataRows(result, rows("DISCOUNTED")); + } + + @Test + public void testEscapeSequence_mixedEscapeAndWildcard() throws IOException { + // Test combining escaped asterisks (literal) with wildcards (pattern matching) + JSONObject result = + executeQuery( + String.format( + "source = %s | eval label = 'file123.txt' | replace 'file*.*' WITH" + + " '\\\\**.*' IN label | fields label | head 1", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("label", "string")); + // Pattern "file*.*" captures "123" and "txt" + // Replacement "\**.*" has escaped * (literal), then 2 wildcards, producing "*123.txt" + verifyDataRows(result, rows("*123.txt")); + } + + @Test + public void testEscapeSequence_noMatchLiteral() throws IOException { + // Test that escaped asterisk doesn't match as wildcard + JSONObject result = + executeQuery( + String.format( + "source = %s | eval test = 'fooXbar' | replace 'foo\\\\*bar' WITH 'matched' IN test" + + " | fields test | head 1", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("test", "string")); + // Pattern "foo\*bar" matches literal "foo*bar", not "fooXbar", so original value returned + verifyDataRows(result, rows("fooXbar")); + } } diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_replace_wildcard.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_replace_wildcard.yaml new file mode 100644 index 00000000000..0407849a472 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_replace_wildcard.yaml @@ -0,0 +1,8 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(state=[REGEXP_REPLACE($7, '^\Q\E(.*?)\QL\E$':VARCHAR, 'STATE_IL':VARCHAR)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0=[{inputs}], expr#1=['^\Q\E(.*?)\QL\E$':VARCHAR], expr#2=['STATE_IL':VARCHAR], expr#3=[REGEXP_REPLACE($t0, $t1, $t2)], $f0=[$t3]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[state], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","_source":{"includes":["state"],"excludes":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_replace_wildcard.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_replace_wildcard.yaml new file mode 100644 index 00000000000..194f680adf2 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_replace_wildcard.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(state=[REGEXP_REPLACE($7, '^\Q\E(.*?)\QL\E$':VARCHAR, 'STATE_IL':VARCHAR)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=['^\Q\E(.*?)\QL\E$':VARCHAR], expr#18=['STATE_IL':VARCHAR], expr#19=[REGEXP_REPLACE($t7, $t17, $t18)], state=[$t19]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java index abde8b3a5bb..5f6f2beb76d 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java @@ -325,4 +325,76 @@ public void testReplaceWithMultiplePairsTrailingCommaShouldFail() { String ppl = "source=EMP | replace \"CLERK\" WITH \"EMPLOYEE\", IN JOB"; getRelNode(ppl); } + + @Test + public void testWildcardReplace_prefixWildcard() { + // Replace suffix wildcard - e.g., "*MAN" matches "SALESMAN" → "SELLER" + // Wildcard pattern is converted to regex at planning time + String ppl = "source=EMP | replace \"*MAN\" WITH \"SELLER\" IN JOB"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REGEXP_REPLACE($2," + + " '^\\Q\\E(.*?)\\QMAN\\E$':VARCHAR, 'SELLER':VARCHAR)], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + + verifyLogical(root, expectedLogical); + } + + @Test + public void testWildcardReplace_multipleWildcards() { + // Replace with multiple wildcards for capture and substitution + // Wildcard pattern "*_*" is converted to regex replacement "$1_$2" + String ppl = "source=EMP | replace \"* - *\" WITH \"*_*\" IN JOB"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REGEXP_REPLACE($2, '^\\Q\\E(.*?)\\Q -" + + " \\E(.*?)\\Q\\E$':VARCHAR, '$1_$2':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + + verifyLogical(root, expectedLogical); + } + + @Test(expected = IllegalArgumentException.class) + public void testWildcardReplace_symmetryMismatch_shouldFail() { + // Pattern has 2 wildcards, replacement has 1 - should throw error + String ppl = "source=EMP | replace \"* - *\" WITH \"*\" IN JOB"; + getRelNode(ppl); + } + + @Test + public void testWildcardReplace_symmetryValid_zeroInReplacement() { + // Pattern has 2 wildcards, replacement has 0 - should work + // Literal replacement "FIXED" has no wildcards, which is valid + String ppl = "source=EMP | replace \"* - *\" WITH \"FIXED\" IN JOB"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REGEXP_REPLACE($2, '^\\Q\\E(.*?)\\Q -" + + " \\E(.*?)\\Q\\E$':VARCHAR, 'FIXED':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + + verifyLogical(root, expectedLogical); + } + + @Test + public void testWildcardAndLiteralReplace_mixedPairs() { + // Multiple pairs: one with wildcard (converted to REGEXP_REPLACE), one literal (REPLACE) + String ppl = + "source=EMP | replace \"*CLERK\" WITH \"EMPLOYEE\", \"MANAGER\" WITH \"SUPERVISOR\" IN JOB"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REPLACE(REGEXP_REPLACE($2," + + " '^\\Q\\E(.*?)\\QCLERK\\E$':VARCHAR, 'EMPLOYEE':VARCHAR), 'MANAGER':VARCHAR," + + " 'SUPERVISOR':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6]," + + " DEPTNO=[$7])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + + verifyLogical(root, expectedLogical); + } }