From 31302d9243a088e4f7b8a477ca10b265ccff7bad Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 29 Oct 2025 13:58:39 -0700 Subject: [PATCH 01/12] Support wildcard for replace command Signed-off-by: Kai Huang --- .../sql/calcite/CalciteRelNodeVisitor.java | 46 ++- .../calcite/utils/WildcardReplaceUtils.java | 173 ++++++++++ .../function/BuiltinFunctionName.java | 3 +- .../function/PPLBuiltinOperators.java | 5 + .../expression/function/PPLFuncImpTable.java | 2 + .../udf/WildcardReplaceFunctionImpl.java | 65 ++++ .../remote/CalciteReplaceCommandIT.java | 209 ++++++++++++ .../ppl/calcite/CalcitePPLReplaceTest.java | 67 ++++ .../ppl/calcite/WildcardReplaceUtilsTest.java | 319 ++++++++++++++++++ 9 files changed, 885 insertions(+), 4 deletions(-) create mode 100644 core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java create mode 100644 core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java create mode 100644 ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 573a51de2a7..24db64f084e 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -2854,9 +2854,30 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { for (ReplacePair pair : node.getReplacePairs()) { RexNode patternNode = rexVisitor.analyze(pair.getPattern(), context); RexNode replacementNode = rexVisitor.analyze(pair.getReplacement(), context); - fieldRef = - context.relBuilder.call( - SqlStdOperatorTable.REPLACE, fieldRef, patternNode, replacementNode); + + // Extract pattern and replacement string values + String patternStr = + ((org.opensearch.sql.ast.expression.Literal) pair.getPattern()).getValue().toString(); + String 
replacementStr = + ((org.opensearch.sql.ast.expression.Literal) pair.getReplacement()) + .getValue() + .toString(); + + // Check if pattern contains wildcards + if (patternStr.contains("*")) { + // Validate wildcard symmetry + org.opensearch.sql.calcite.utils.WildcardReplaceUtils.validateWildcardSymmetry( + patternStr, replacementStr); + + // For wildcard patterns, use custom wildcard replacement logic + fieldRef = + buildWildcardReplaceExpression(fieldRef, patternNode, replacementNode, context); + } else { + // For literal patterns, use Calcite's standard REPLACE function + fieldRef = + context.relBuilder.call( + SqlStdOperatorTable.REPLACE, fieldRef, patternNode, replacementNode); + } } projectList.add(fieldRef); @@ -2870,6 +2891,25 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { return context.relBuilder.peek(); } + /** + * Build a RexNode for wildcard-based replacement. + * + * @param fieldRex The field to apply replacement on + * @param patternNode The pattern RexNode + * @param replacementNode The replacement RexNode + * @param context The Calcite plan context + * @return RexNode representing the wildcard replacement operation + */ + private RexNode buildWildcardReplaceExpression( + RexNode fieldRex, RexNode patternNode, RexNode replacementNode, CalcitePlanContext context) { + // Use the registered WILDCARD_REPLACE operator from PPLBuiltinOperators + return context.rexBuilder.makeCall( + org.opensearch.sql.expression.function.PPLBuiltinOperators.WILDCARD_REPLACE, + fieldRex, + patternNode, + replacementNode); + } + private void buildParseRelNode(Parse node, CalcitePlanContext context) { RexNode sourceField = rexVisitor.analyze(node.getSourceField(), context); ParseMethod parseMethod = node.getParseMethod(); diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java new file mode 100644 index 00000000000..1896080ff3a --- 
/dev/null +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java @@ -0,0 +1,173 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.utils; + +import java.util.ArrayList; +import java.util.List; + +/** + * Utility for wildcard-based string replacement in PPL replace command. + * + *

 <p>Supports SPL-style wildcard matching where '*' matches zero or more characters. Captured + * wildcard portions can be reused in the replacement string. + *

 <p>Examples: + * + *

+ * + * @see SPL + * replace command + */ +public class WildcardReplaceUtils { + + /** + * Perform wildcard-based replacement. + * + * @param input The input string + * @param pattern The pattern (may contain wildcards) + * @param replacement The replacement (may contain wildcards) + * @return The replaced string, or original if no match + */ + public static String replaceWithWildcard(String input, String pattern, String replacement) { + if (input == null) { + return null; + } + + // Fast path: no wildcards = literal replacement + if (!pattern.contains("*")) { + return input.replace(pattern, replacement); + } + + // Match and capture wildcard portions + List captures = matchAndCapture(input, pattern); + if (captures == null) { + // No match - return original + return input; + } + + // Substitute wildcards in replacement with captured values + return substituteWildcards(replacement, captures); + } + + /** + * Match pattern against input and capture wildcard portions. + * + * @param input The input string + * @param pattern The pattern with wildcards + * @return List of captured strings (one per wildcard), or null if no match + */ + public static List matchAndCapture(String input, String pattern) { + List captures = new ArrayList<>(); + String[] parts = pattern.split("\\*", -1); // -1 keeps trailing empty strings + + int inputIndex = 0; + + for (int i = 0; i < parts.length; i++) { + String part = parts[i]; + + if (i == 0) { + // First part: must match at start + if (!input.startsWith(part)) { + return null; // No match + } + inputIndex = part.length(); + } else if (i == parts.length - 1) { + // Last part: must match at end + if (!input.endsWith(part)) { + return null; // No match + } + // Capture everything between previous position and where this part starts + int endIndex = input.length() - part.length(); + if (endIndex < inputIndex) { + return null; // Parts overlap - no valid match + } + captures.add(input.substring(inputIndex, endIndex)); + } else { + // Middle 
part: find next occurrence + int nextIndex = input.indexOf(part, inputIndex); + if (nextIndex == -1) { + return null; // No match + } + // Capture from current position to where this part starts + captures.add(input.substring(inputIndex, nextIndex)); + inputIndex = nextIndex + part.length(); + } + } + + return captures; + } + + /** + * Substitute wildcards in replacement string with captured values. + * + * @param replacement The replacement string (may contain wildcards) + * @param captures The captured values from pattern matching + * @return The substituted string + */ + public static String substituteWildcards(String replacement, List captures) { + if (!replacement.contains("*")) { + // No wildcards in replacement - return as-is + return replacement; + } + + StringBuilder result = new StringBuilder(); + int captureIndex = 0; + + for (char c : replacement.toCharArray()) { + if (c == '*') { + if (captureIndex < captures.size()) { + result.append(captures.get(captureIndex)); + captureIndex++; + } + // If more wildcards than captures, skip them (shouldn't happen with validation) + } else { + result.append(c); + } + } + + return result.toString(); + } + + /** + * Count the number of wildcards in a string. + * + * @param str The string to count wildcards in + * @return The number of wildcard characters ('*') + */ + public static int countWildcards(String str) { + int count = 0; + for (char c : str.toCharArray()) { + if (c == '*') { + count++; + } + } + return count; + } + + /** + * Validate wildcard symmetry between pattern and replacement. 
+ * + * @param pattern The pattern string + * @param replacement The replacement string + * @throws IllegalArgumentException if wildcard counts don't match (and replacement has wildcards) + */ + public static void validateWildcardSymmetry(String pattern, String replacement) { + int patternWildcards = countWildcards(pattern); + int replacementWildcards = countWildcards(replacement); + + if (replacementWildcards != 0 && replacementWildcards != patternWildcards) { + throw new IllegalArgumentException( + String.format( + "Error in 'replace' command: Wildcard count mismatch - pattern has %d wildcard(s), " + + "replacement has %d. Replacement must have same number of wildcards or none.", + patternWildcards, replacementWildcards)); + } + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index ced98022ca9..62470a35069 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -333,7 +333,8 @@ public enum BuiltinFunctionName { INTERNAL_REGEXP_REPLACE_3(FunctionName.of("regexp_replace_3"), true), INTERNAL_REGEXP_REPLACE_PG_4(FunctionName.of("regexp_replace_pg_4"), true), INTERNAL_REGEXP_REPLACE_5(FunctionName.of("regexp_replace_5"), true), - INTERNAL_TRANSLATE3(FunctionName.of("translate3"), true); + INTERNAL_TRANSLATE3(FunctionName.of("translate3"), true), + INTERNAL_WILDCARD_REPLACE(FunctionName.of("wildcard_replace"), true); private final FunctionName name; private boolean isInternal; diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java index 68eb0ed5cca..93fe81135af 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java +++ 
b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java @@ -428,6 +428,11 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable { new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI"); public static final SqlOperator REX_OFFSET = new RexOffsetFunction().toUDF("REX_OFFSET"); + // Wildcard replace function for replace command + public static final SqlOperator WILDCARD_REPLACE = + new org.opensearch.sql.expression.function.udf.WildcardReplaceFunctionImpl() + .toUDF("WILDCARD_REPLACE"); + // Aggregation functions public static final SqlAggFunction AVG_NULLABLE = new NullableSqlAvgAggFunction(SqlKind.AVG); public static final SqlAggFunction STDDEV_POP_NULLABLE = diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index c85a429a81d..90c45deb804 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -87,6 +87,7 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_5; import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_PG_4; import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_TRANSLATE3; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_WILDCARD_REPLACE; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_BLANK; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_EMPTY; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_NOT_NULL; @@ -837,6 +838,7 @@ void populate() { registerOperator(INTERNAL_REGEXP_REPLACE_PG_4, SqlLibraryOperators.REGEXP_REPLACE_PG_4); registerOperator(INTERNAL_REGEXP_REPLACE_5, SqlLibraryOperators.REGEXP_REPLACE_5); 
registerOperator(INTERNAL_TRANSLATE3, SqlLibraryOperators.TRANSLATE3); + registerOperator(INTERNAL_WILDCARD_REPLACE, PPLBuiltinOperators.WILDCARD_REPLACE); // Register eval functions for PPL max() and min() calls registerOperator(MAX, PPLBuiltinOperators.MAX); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java b/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java new file mode 100644 index 00000000000..99fdaf21fc6 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java @@ -0,0 +1,65 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import java.util.List; +import org.apache.calcite.adapter.enumerable.NotNullImplementor; +import org.apache.calcite.adapter.enumerable.NullPolicy; +import org.apache.calcite.adapter.enumerable.RexImpTable; +import org.apache.calcite.adapter.enumerable.RexToLixTranslator; +import org.apache.calcite.linq4j.tree.Expression; +import org.apache.calcite.linq4j.tree.Types; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.schema.impl.ScalarFunctionImpl; +import org.apache.calcite.sql.type.OperandTypes; +import org.apache.calcite.sql.type.ReturnTypes; +import org.apache.calcite.sql.type.SqlReturnTypeInference; +import org.apache.calcite.sql.type.SqlTypeFamily; +import org.opensearch.sql.calcite.utils.WildcardReplaceUtils; +import org.opensearch.sql.expression.function.ImplementorUDF; +import org.opensearch.sql.expression.function.UDFOperandMetadata; + +/** + * UDF for wildcard-based string replacement in PPL replace command. + * + *

This function wraps WildcardReplaceUtils.replaceWithWildcard to provide SPL-compatible + * wildcard matching where '*' matches zero or more characters. + */ +public class WildcardReplaceFunctionImpl extends ImplementorUDF { + + public WildcardReplaceFunctionImpl() { + super(new WildcardReplaceImplementor(), NullPolicy.ANY); + } + + @Override + public SqlReturnTypeInference getReturnTypeInference() { + return ReturnTypes.VARCHAR_2000; + } + + @Override + public UDFOperandMetadata getOperandMetadata() { + return UDFOperandMetadata.wrap( + OperandTypes.family(SqlTypeFamily.STRING, SqlTypeFamily.STRING, SqlTypeFamily.STRING)); + } + + /** Implementor for wildcard replace function. */ + public static class WildcardReplaceImplementor implements NotNullImplementor { + @Override + public Expression implement( + RexToLixTranslator translator, RexCall call, List translatedOperands) { + ScalarFunctionImpl function = + (ScalarFunctionImpl) + ScalarFunctionImpl.create( + Types.lookupMethod( + WildcardReplaceUtils.class, + "replaceWithWildcard", + String.class, + String.class, + String.class)); + return function.getImplementor().implement(translator, call, RexImpTable.NullAs.NULL); + } + } +} diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java index 9d6304c363b..1ca43f7f307 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java @@ -288,4 +288,213 @@ public void testMultiplePairsSequentialApplication() throws IOException { rows("John", "Ontario Province"), rows("Jane", "Quebec")); } + + // ========== Wildcard Integration Tests ========== + + @Test + public void testWildcardReplace_suffixMatch() throws IOException { + // Pattern "*ada" should match "Canada" and replace with "CA" + JSONObject result = + 
executeQuery( + String.format( + "source = %s | replace '*ada' WITH 'CA' IN country | fields name, country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + verifyDataRows( + result, rows("Jake", "USA"), rows("Hello", "USA"), rows("John", "CA"), rows("Jane", "CA")); + } + + @Test + public void testWildcardReplace_prefixMatch() throws IOException { + // Pattern "US*" should match "USA" and replace with "United States" + JSONObject result = + executeQuery( + String.format( + "source = %s | replace 'US*' WITH 'United States' IN country | fields name," + + " country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + verifyDataRows( + result, + rows("Jake", "United States"), + rows("Hello", "United States"), + rows("John", "Canada"), + rows("Jane", "Canada")); + } + + @Test + public void testWildcardReplace_multipleWildcards() throws IOException { + // Pattern "* *" with replacement "*_*" should replace spaces with underscores + JSONObject result = + executeQuery( + String.format( + "source = %s | replace '* *' WITH '*_*' IN state | fields name, state", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("state", "string")); + + verifyDataRows( + result, + rows("Jake", "California"), + rows("Hello", "New_York"), + rows("John", "Ontario"), + rows("Jane", "Quebec")); + } + + @Test + public void testWildcardReplace_symmetryMismatch_shouldFail() { + // Pattern has 2 wildcards, replacement has 1 - should fail + Throwable e = + assertThrowsWithReplace( + IllegalArgumentException.class, + () -> + executeQuery( + String.format( + "source = %s | replace '* *' WITH '*' IN state", + TEST_INDEX_STATE_COUNTRY))); + verifyErrorMessageContains(e, "Wildcard count mismatch"); + } + + @Test + public void testWildcardReplace_multipleFields() throws IOException { + // Test wildcard replacement across multiple fields + // 
Pattern "*A" should match "USA" in country + JSONObject result = + executeQuery( + String.format( + "source = %s | replace '*A' WITH 'United States' IN country, name | fields name," + + " country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + verifyDataRows( + result, + rows("Jake", "United States"), + rows("Hello", "United States"), + rows("John", "Canada"), + rows("Jane", "Canada")); + } + + @Test + public void testWildcardReplace_internalField() throws IOException { + // Test wildcard replacement on internal fields + // Replace pattern in _index field + JSONObject result = + executeQuery( + String.format( + "source = %s | replace '*country' WITH 'test_index' IN _index | fields name," + + " _index", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("_index", "string")); + + // All rows should have _index replaced since it matches "*country" + verifyDataRows( + result, + rows("Jake", "test_index"), + rows("Hello", "test_index"), + rows("John", "test_index"), + rows("Jane", "test_index")); + } + + @Test + public void testWildcardReplace_multiplePairsWithWildcards() throws IOException { + // Test multiple wildcard pattern pairs in a single command + JSONObject result = + executeQuery( + String.format( + "source = %s | replace '*A' WITH 'United States', '*ada' WITH 'CA' IN country |" + + " fields name, country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + // First pair: "*A" matches "USA" → "United States" + // Second pair: "*ada" matches "Canada" → "CA" + verifyDataRows( + result, + rows("Jake", "United States"), + rows("Hello", "United States"), + rows("John", "CA"), + rows("Jane", "CA")); + } + + @Test + public void testWildcardReplace_withSort() throws IOException { + // Test wildcard replacement followed by sort command + JSONObject result = + executeQuery( + String.format( + "source = 
%s | replace '*A' WITH 'United States' IN country | fields name," + + " country | sort country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + // Results should be sorted by country after wildcard replacement + verifyDataRows( + result, + rows("John", "Canada"), + rows("Jane", "Canada"), + rows("Jake", "United States"), + rows("Hello", "United States")); + } + + @Test + public void testWildcardReplace_withWhereClause() throws IOException { + // Test wildcard replacement with where clause filtering + JSONObject result = + executeQuery( + String.format( + "source = %s | where country = 'USA' | replace 'US*' WITH 'United States' IN" + + " country | fields name, country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + // Only rows where country = 'USA' should be processed + verifyDataRows(result, rows("Jake", "United States"), rows("Hello", "United States")); + } + + @Test + public void testWildcardReplace_nullValues() throws IOException { + // Test wildcard replacement behavior with null field values + // Use a query that might have null values in results + JSONObject result = + executeQuery( + String.format( + "source = %s | replace '*' WITH 'N/A' IN country | fields name, country | head 2", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("name", "string"), schema("country", "string")); + + // Wildcard pattern "*" matches everything, so all non-null values are replaced with "N/A" + verifyDataRows(result, rows("Jake", "N/A"), rows("Hello", "N/A")); + } + + @Test + public void testWildcardReplace_emptyStringIntegration() throws IOException { + // Integration test for empty string replacement with wildcards + // Replace the entire country value with empty string + JSONObject result = + executeQuery( + String.format( + "source = %s | replace '*A' WITH '' IN country | fields name, country", + TEST_INDEX_STATE_COUNTRY)); + + 
verifySchema(result, schema("name", "string"), schema("country", "string")); + + // "*A" matches "USA" → empty string, "Canada" stays unchanged + verifyDataRows( + result, + rows("Jake", ""), + rows("Hello", ""), + rows("John", "Canada"), + rows("Jane", "Canada")); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java index abde8b3a5bb..3f5a9dd8060 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java @@ -325,4 +325,71 @@ public void testReplaceWithMultiplePairsTrailingCommaShouldFail() { String ppl = "source=EMP | replace \"CLERK\" WITH \"EMPLOYEE\", IN JOB"; getRelNode(ppl); } + + // ========== Wildcard Tests ========== + + @Test + public void testWildcardReplace_prefixWildcard() { + // Replace suffix wildcard - e.g., "*MAN" matches "SALESMAN" → "SELLER" + String ppl = "source=EMP | replace \"*MAN\" WITH \"SELLER\" IN JOB"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[WILDCARD_REPLACE($2, '*MAN':VARCHAR," + + " 'SELLER':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + + verifyLogical(root, expectedLogical); + } + + @Test + public void testWildcardReplace_multipleWildcards() { + // Replace with multiple wildcards for capture and substitution + String ppl = "source=EMP | replace \"* - *\" WITH \"*_*\" IN JOB"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[WILDCARD_REPLACE($2, '* - *':VARCHAR," + + " '*_*':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + + verifyLogical(root, expectedLogical); + } + + @Test(expected = IllegalArgumentException.class) + public 
void testWildcardReplace_symmetryMismatch_shouldFail() { + // Pattern has 2 wildcards, replacement has 1 - should throw error + String ppl = "source=EMP | replace \"* - *\" WITH \"*\" IN JOB"; + getRelNode(ppl); + } + + @Test + public void testWildcardReplace_symmetryValid_zeroInReplacement() { + // Pattern has 2 wildcards, replacement has 0 - should work + String ppl = "source=EMP | replace \"* - *\" WITH \"FIXED\" IN JOB"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[WILDCARD_REPLACE($2, '* - *':VARCHAR," + + " 'FIXED':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + + verifyLogical(root, expectedLogical); + } + + @Test + public void testWildcardAndLiteralReplace_mixedPairs() { + // Multiple pairs: one with wildcard, one literal + String ppl = + "source=EMP | replace \"*CLERK\" WITH \"EMPLOYEE\", \"MANAGER\" WITH \"SUPERVISOR\" IN JOB"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REPLACE(WILDCARD_REPLACE($2," + + " '*CLERK':VARCHAR, 'EMPLOYEE':VARCHAR), 'MANAGER':VARCHAR, 'SUPERVISOR':VARCHAR)]," + + " MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + + verifyLogical(root, expectedLogical); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java new file mode 100644 index 00000000000..035d8f711e7 --- /dev/null +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java @@ -0,0 +1,319 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.calcite; + +import static org.junit.Assert.*; + +import java.util.Arrays; +import java.util.List; +import org.junit.Test; +import 
org.opensearch.sql.calcite.utils.WildcardReplaceUtils; + +/** Unit tests for {@link WildcardReplaceUtils}. */ +public class WildcardReplaceUtilsTest { + + // ========== Basic Wildcard Matching Tests ========== + + @Test + public void testWildcardMatch_prefixWildcard() { + assertEquals( + "localhost", + WildcardReplaceUtils.replaceWithWildcard("server.localhost", "*localhost", "localhost")); + } + + @Test + public void testWildcardMatch_suffixWildcard() { + assertEquals( + "server", WildcardReplaceUtils.replaceWithWildcard("server.local", "server*", "server")); + } + + @Test + public void testWildcardMatch_infixWildcard() { + assertEquals( + "replaced", WildcardReplaceUtils.replaceWithWildcard("fooXYZbar", "*XYZ*", "replaced")); + } + + @Test + public void testWildcardMatch_multipleWildcards() { + assertEquals("foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar", "* - *", "*_*")); + } + + @Test + public void testWildcardMatch_noMatch() { + String input = "server.example.com"; + assertEquals(input, WildcardReplaceUtils.replaceWithWildcard(input, "*localhost", "localhost")); + } + + @Test + public void testWildcardMatch_onlyWildcard() { + assertEquals("replaced", WildcardReplaceUtils.replaceWithWildcard("anything", "*", "replaced")); + } + + // ========== Wildcard Capture and Substitution Tests ========== + + @Test + public void testWildcardCapture_single() { + assertEquals( + "localhost server", + WildcardReplaceUtils.replaceWithWildcard("server localhost", "* localhost", "localhost *")); + } + + @Test + public void testWildcardCapture_multiple() { + // Pattern "* - *" captures ["foo", "bar"], replacement "*_*" gives "foo_bar" + assertEquals("foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar", "* - *", "*_*")); + } + + @Test + public void testWildcardCapture_reorder() { + assertEquals( + "localhost server", + WildcardReplaceUtils.replaceWithWildcard("server localhost", "* localhost", "localhost *")); + } + + @Test + public void 
testWildcardSubstitute_noWildcards() { + assertEquals("fixed", WildcardReplaceUtils.replaceWithWildcard("foo bar", "* bar", "fixed")); + } + + @Test + public void testWildcardSubstitute_moreCapturesThanWildcards() { + // Pattern: "* - * - *" captures 3 values + // Replacement: "*_*" uses only 2 + assertEquals( + "foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar - baz", "* - * - *", "*_*")); + } + + // ========== Edge Cases ========== + + @Test + public void testWildcard_emptyCapture() { + // Wildcard matches empty string + assertEquals( + "fixed", WildcardReplaceUtils.replaceWithWildcard("localhost", "*localhost", "fixed")); + } + + @Test + public void testWildcard_emptyCaptureWithSubstitution() { + // Empty capture should be substituted as empty string + assertEquals( + "localhost ", + WildcardReplaceUtils.replaceWithWildcard("localhost", "*localhost", "localhost *")); + } + + @Test + public void testWildcard_overlappingParts() { + // No valid match - parts overlap + assertNull(WildcardReplaceUtils.matchAndCapture("foo", "foo*foo")); + } + + @Test + public void testWildcard_consecutiveWildcards() { + // "**" treated as two separate wildcards + // Pattern "**" splits to ["", "", ""], so first wildcard captures empty, second captures all + List captures = WildcardReplaceUtils.matchAndCapture("foobar", "**"); + assertNotNull(captures); + assertEquals(2, captures.size()); + // First wildcard captures empty (greedy matching finds "" immediately) + // Second wildcard captures the rest + assertEquals("", captures.get(0)); + assertEquals("foobar", captures.get(1)); + } + + @Test + public void testWildcard_emptyString() { + // Pattern "*" matches empty string (wildcard matches zero or more chars) + assertEquals("replacement", WildcardReplaceUtils.replaceWithWildcard("", "*", "replacement")); + } + + @Test + public void testWildcard_nullInput() { + assertNull(WildcardReplaceUtils.replaceWithWildcard(null, "*", "replacement")); + } + + @Test + public void 
testWildcard_singleWildcardMatchesAll() { + // Pattern "*" contains a wildcard, so it matches the entire input + String input = "foo * bar"; + assertEquals("replaced", WildcardReplaceUtils.replaceWithWildcard(input, "*", "replaced")); + } + + // ========== Literal Replacement (No Wildcards) ========== + + @Test + public void testLiteral_noWildcards() { + assertEquals("Illinois", WildcardReplaceUtils.replaceWithWildcard("IL", "IL", "Illinois")); + } + + @Test + public void testLiteral_multipleOccurrences() { + assertEquals( + "Illinois Illinois", WildcardReplaceUtils.replaceWithWildcard("IL IL", "IL", "Illinois")); + } + + @Test + public void testLiteral_noMatch() { + String input = "California"; + assertEquals(input, WildcardReplaceUtils.replaceWithWildcard(input, "IL", "Illinois")); + } + + // ========== Validation Tests ========== + + @Test + public void testValidation_symmetryValid_sameCount() { + // Should not throw exception + WildcardReplaceUtils.validateWildcardSymmetry("* - *", "*_*"); + } + + @Test + public void testValidation_symmetryValid_zeroInReplacement() { + // Should not throw exception + WildcardReplaceUtils.validateWildcardSymmetry("* - *", "fixed"); + } + + @Test + public void testValidation_symmetryInvalid_mismatch() { + try { + WildcardReplaceUtils.validateWildcardSymmetry("* - *", "*"); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException exception) { + assertTrue(exception.getMessage().contains("Wildcard count mismatch")); + assertTrue(exception.getMessage().contains("pattern has 2 wildcard(s)")); + assertTrue(exception.getMessage().contains("replacement has 1")); + } + } + + @Test + public void testValidation_symmetryValid_noWildcardsInPattern() { + // Should not throw exception + WildcardReplaceUtils.validateWildcardSymmetry("foo", "bar"); + } + + // ========== Count Wildcards Tests ========== + + @Test + public void testCountWildcards_none() { + assertEquals(0, WildcardReplaceUtils.countWildcards("no wildcards 
here")); + } + + @Test + public void testCountWildcards_single() { + assertEquals(1, WildcardReplaceUtils.countWildcards("*wildcard")); + } + + @Test + public void testCountWildcards_multiple() { + assertEquals(3, WildcardReplaceUtils.countWildcards("* - * - *")); + } + + @Test + public void testCountWildcards_consecutive() { + assertEquals(2, WildcardReplaceUtils.countWildcards("**")); + } + + // ========== Match and Capture Internal Tests ========== + + @Test + public void testMatchAndCapture_prefixWildcard() { + List captures = WildcardReplaceUtils.matchAndCapture("server.localhost", "*localhost"); + assertNotNull(captures); + assertEquals(1, captures.size()); + assertEquals("server.", captures.get(0)); + } + + @Test + public void testMatchAndCapture_suffixWildcard() { + List captures = WildcardReplaceUtils.matchAndCapture("server.local", "server*"); + assertNotNull(captures); + assertEquals(1, captures.size()); + assertEquals(".local", captures.get(0)); + } + + @Test + public void testMatchAndCapture_middlePart() { + List captures = WildcardReplaceUtils.matchAndCapture("foo - bar", "* - *"); + assertNotNull(captures); + assertEquals(2, captures.size()); + assertEquals("foo", captures.get(0)); + assertEquals("bar", captures.get(1)); + } + + @Test + public void testMatchAndCapture_noMatch_wrongPrefix() { + assertNull(WildcardReplaceUtils.matchAndCapture("server.localhost", "client*")); + } + + @Test + public void testMatchAndCapture_noMatch_wrongSuffix() { + assertNull(WildcardReplaceUtils.matchAndCapture("server.localhost", "*example")); + } + + @Test + public void testMatchAndCapture_noMatch_missingMiddle() { + assertNull(WildcardReplaceUtils.matchAndCapture("foo bar", "* - *")); + } + + // ========== Substitute Wildcards Internal Tests ========== + + @Test + public void testSubstituteWildcards_single() { + assertEquals( + "prefix_foo", WildcardReplaceUtils.substituteWildcards("prefix_*", Arrays.asList("foo"))); + } + + @Test + public void 
testSubstituteWildcards_multiple() { + assertEquals( + "foo_bar", WildcardReplaceUtils.substituteWildcards("*_*", Arrays.asList("foo", "bar"))); + } + + @Test + public void testSubstituteWildcards_noWildcardsInReplacement() { + assertEquals( + "fixed", WildcardReplaceUtils.substituteWildcards("fixed", Arrays.asList("foo", "bar"))); + } + + @Test + public void testSubstituteWildcards_moreWildcardsThanCaptures() { + // Should use available captures, skip extra wildcards + assertEquals("foo_", WildcardReplaceUtils.substituteWildcards("*_*", Arrays.asList("foo"))); + } + + // ========== SPL Examples from Documentation ========== + + @Test + public void testSPLExample1_replaceSuffix() { + // SPL: replace *localhost WITH localhost IN host + // Input: "server.localhost" → Output: "localhost" + assertEquals( + "localhost", + WildcardReplaceUtils.replaceWithWildcard("server.localhost", "*localhost", "localhost")); + } + + @Test + public void testSPLExample2_reorderWithCapture() { + // SPL: replace "* localhost" WITH "localhost *" IN host + // Input: "server localhost" → Output: "localhost server" + assertEquals( + "localhost server", + WildcardReplaceUtils.replaceWithWildcard("server localhost", "* localhost", "localhost *")); + } + + @Test + public void testSPLExample3_multipleWildcards() { + // SPL: replace "* - *" WITH "*_*" IN field + // Input: "foo - bar" → Output: "foo_bar" + assertEquals("foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar", "* - *", "*_*")); + } + + @Test + public void testSPLExample4_infixReplacement() { + // SPL: replace *XYZ* WITH *ALL* IN _time + // Input: "fooXYZbar" → Output: "fooALLbar" + assertEquals( + "fooALLbar", WildcardReplaceUtils.replaceWithWildcard("fooXYZbar", "*XYZ*", "*ALL*")); + } +} From ab74440a8ba73a02eb783355dd9884c7fff99562 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 29 Oct 2025 16:30:33 -0700 Subject: [PATCH 02/12] fixes Signed-off-by: Kai Huang --- .../sql/calcite/CalciteRelNodeVisitor.java | 28 +--- 
.../calcite/utils/WildcardReplaceUtils.java | 26 +--- .../udf/WildcardReplaceFunctionImpl.java | 7 +- docs/user/ppl/cmd/replace.rst | 142 +++++++++++++++++- .../remote/CalciteReplaceCommandIT.java | 2 - .../ppl/calcite/CalcitePPLReplaceTest.java | 2 - .../ppl/calcite/WildcardReplaceUtilsTest.java | 56 +------ 7 files changed, 156 insertions(+), 107 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 24db64f084e..19772ee1f1d 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -147,6 +147,7 @@ import org.opensearch.sql.calcite.utils.JoinAndLookupUtils; import org.opensearch.sql.calcite.utils.PlanUtils; import org.opensearch.sql.calcite.utils.UserDefinedFunctionUtils; +import org.opensearch.sql.calcite.utils.WildcardReplaceUtils; import org.opensearch.sql.calcite.utils.WildcardUtils; import org.opensearch.sql.common.patterns.PatternUtils; import org.opensearch.sql.common.utils.StringUtils; @@ -2855,25 +2856,15 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { RexNode patternNode = rexVisitor.analyze(pair.getPattern(), context); RexNode replacementNode = rexVisitor.analyze(pair.getReplacement(), context); - // Extract pattern and replacement string values - String patternStr = - ((org.opensearch.sql.ast.expression.Literal) pair.getPattern()).getValue().toString(); - String replacementStr = - ((org.opensearch.sql.ast.expression.Literal) pair.getReplacement()) - .getValue() - .toString(); + String patternStr = pair.getPattern().getValue().toString(); + String replacementStr = pair.getReplacement().getValue().toString(); - // Check if pattern contains wildcards if (patternStr.contains("*")) { - // Validate wildcard symmetry - org.opensearch.sql.calcite.utils.WildcardReplaceUtils.validateWildcardSymmetry( - 
patternStr, replacementStr); + WildcardReplaceUtils.validateWildcardSymmetry(patternStr, replacementStr); - // For wildcard patterns, use custom wildcard replacement logic fieldRef = buildWildcardReplaceExpression(fieldRef, patternNode, replacementNode, context); } else { - // For literal patterns, use Calcite's standard REPLACE function fieldRef = context.relBuilder.call( SqlStdOperatorTable.REPLACE, fieldRef, patternNode, replacementNode); @@ -2891,18 +2882,9 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { return context.relBuilder.peek(); } - /** - * Build a RexNode for wildcard-based replacement. - * - * @param fieldRex The field to apply replacement on - * @param patternNode The pattern RexNode - * @param replacementNode The replacement RexNode - * @param context The Calcite plan context - * @return RexNode representing the wildcard replacement operation - */ + /** Build a RexNode for wildcard-based replacement. */ private RexNode buildWildcardReplaceExpression( RexNode fieldRex, RexNode patternNode, RexNode replacementNode, CalcitePlanContext context) { - // Use the registered WILDCARD_REPLACE operator from PPLBuiltinOperators return context.rexBuilder.makeCall( org.opensearch.sql.expression.function.PPLBuiltinOperators.WILDCARD_REPLACE, fieldRex, diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java index 1896080ff3a..5e9fa444baa 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java @@ -11,8 +11,8 @@ /** * Utility for wildcard-based string replacement in PPL replace command. * - *

Supports SPL-style wildcard matching where '*' matches zero or more characters. Captured - * wildcard portions can be reused in the replacement string. + *

Supports wildcard matching where '*' matches zero or more characters. Captured wildcard + * portions can be reused in the replacement string. * *

Examples: * @@ -21,9 +21,6 @@ *

  • "* localhost" with replacement "localhost *" reorders to "localhost server" *
  • "* - *" matches "foo - bar" and captures ["foo", " bar"] * - * - * @see SPL - * replace command */ public class WildcardReplaceUtils { @@ -40,19 +37,15 @@ public static String replaceWithWildcard(String input, String pattern, String re return null; } - // Fast path: no wildcards = literal replacement if (!pattern.contains("*")) { return input.replace(pattern, replacement); } - // Match and capture wildcard portions List captures = matchAndCapture(input, pattern); if (captures == null) { - // No match - return original return input; } - // Substitute wildcards in replacement with captured values return substituteWildcards(replacement, captures); } @@ -73,29 +66,24 @@ public static List matchAndCapture(String input, String pattern) { String part = parts[i]; if (i == 0) { - // First part: must match at start if (!input.startsWith(part)) { - return null; // No match + return null; } inputIndex = part.length(); } else if (i == parts.length - 1) { - // Last part: must match at end if (!input.endsWith(part)) { - return null; // No match + return null; } - // Capture everything between previous position and where this part starts int endIndex = input.length() - part.length(); if (endIndex < inputIndex) { - return null; // Parts overlap - no valid match + return null; // Parts overlap } captures.add(input.substring(inputIndex, endIndex)); } else { - // Middle part: find next occurrence int nextIndex = input.indexOf(part, inputIndex); if (nextIndex == -1) { - return null; // No match + return null; } - // Capture from current position to where this part starts captures.add(input.substring(inputIndex, nextIndex)); inputIndex = nextIndex + part.length(); } @@ -113,7 +101,6 @@ public static List matchAndCapture(String input, String pattern) { */ public static String substituteWildcards(String replacement, List captures) { if (!replacement.contains("*")) { - // No wildcards in replacement - return as-is return replacement; } @@ -126,7 +113,6 @@ public static String 
substituteWildcards(String replacement, List captur result.append(captures.get(captureIndex)); captureIndex++; } - // If more wildcards than captures, skip them (shouldn't happen with validation) } else { result.append(c); } diff --git a/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java b/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java index 99fdaf21fc6..23cb06c66af 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java @@ -22,12 +22,7 @@ import org.opensearch.sql.expression.function.ImplementorUDF; import org.opensearch.sql.expression.function.UDFOperandMetadata; -/** - * UDF for wildcard-based string replacement in PPL replace command. - * - *

    This function wraps WildcardReplaceUtils.replaceWithWildcard to provide SPL-compatible - * wildcard matching where '*' matches zero or more characters. - */ +/** UDF for wildcard-based string replacement in PPL replace command. */ public class WildcardReplaceFunctionImpl extends ImplementorUDF { public WildcardReplaceFunctionImpl() { diff --git a/docs/user/ppl/cmd/replace.rst b/docs/user/ppl/cmd/replace.rst index bcb0d57e677..8a32d40ce1c 100644 --- a/docs/user/ppl/cmd/replace.rst +++ b/docs/user/ppl/cmd/replace.rst @@ -11,7 +11,7 @@ replace Description ============ -Using ``replace`` command to replace text in one or more fields in the search result. +Using ``replace`` command to replace text in one or more fields in the search result. Supports both literal string replacement and wildcard pattern matching. Note: This command is only available when Calcite engine is enabled. @@ -23,8 +23,17 @@ replace '' WITH '' [, '' WITH '']... Parameters ========== -* **pattern**: mandatory. The text pattern you want to replace. Currently supports only plain text literals (no wildcards or regular expressions). -* **replacement**: mandatory. The text you want to replace with. +* **pattern**: mandatory. The text pattern you want to replace. Supports: + + - Plain text literals for exact matching + - Wildcard patterns using ``*`` (asterisk) to match zero or more characters + +* **replacement**: mandatory. The text you want to replace with. When using wildcards: + + - Can contain ``*`` to substitute captured wildcard portions + - Must have the same number of wildcards as the pattern, or zero wildcards + - Wildcards in replacement are substituted with values captured from the pattern match + * **field-name**: mandatory. One or more field names where the replacement should occur. 
@@ -120,8 +129,131 @@ PPL query:: +-----------------+-------+--------+-----+--------+ +Wildcard Pattern Matching +========================== + +The replace command supports wildcard patterns using ``*`` (asterisk) to match zero or more characters. This provides flexible pattern matching for text transformation. + +Example 6: Wildcard suffix match +--------------------------------- + +Replace values that end with a specific pattern. The wildcard ``*`` matches any prefix. + +PPL query:: + + os> source=accounts | replace "*IL" WITH "Illinois" IN state | fields state; + fetched rows / total rows = 4/4 + +----------+ + | state | + |----------| + | Illinois | + | TN | + | VA | + | MD | + +----------+ + + +Example 7: Wildcard prefix match +--------------------------------- + +Replace values that start with a specific pattern. The wildcard ``*`` matches any suffix. + +PPL query:: + + os> source=accounts | replace "IL*" WITH "Illinois" IN state | fields state; + fetched rows / total rows = 4/4 + +----------+ + | state | + |----------| + | Illinois | + | TN | + | VA | + | MD | + +----------+ + + +Example 8: Wildcard capture and substitution +--------------------------------------------- + +Use wildcards in both pattern and replacement to capture and reuse matched portions. The number of wildcards must match in pattern and replacement. + +PPL query:: + + os> source=accounts | replace "* Lane" WITH "Lane *" IN address | fields address; + fetched rows / total rows = 4/4 + +----------------------+ + | address | + |----------------------| + | Lane 880 Holmes | + | 671 Bristol Street | + | 789 Madison Street | + | 467 Hutchinson Court | + +----------------------+ + + +Example 9: Multiple wildcards for pattern transformation +--------------------------------------------------------- + +Use multiple wildcards to transform patterns. Each wildcard in the replacement substitutes the corresponding captured value. 
+ +PPL query:: + + os> source=accounts | replace "* *" WITH "*_*" IN address | fields address; + fetched rows / total rows = 4/4 + +----------------------+ + | address | + |----------------------| + | 880_Holmes Lane | + | 671_Bristol Street | + | 789_Madison Street | + | 467_Hutchinson Court | + +----------------------+ + + +Example 10: Wildcard with zero wildcards in replacement +-------------------------------------------------------- + +When replacement has zero wildcards, all matching values are replaced with the literal replacement string. + +PPL query:: + + os> source=accounts | replace "*IL*" WITH "Illinois" IN state | fields state; + fetched rows / total rows = 4/4 + +----------+ + | state | + |----------| + | Illinois | + | TN | + | VA | + | MD | + +----------+ + + +Wildcard Rules +============== + +When using wildcards in the replace command: + +* **Wildcard character**: Use ``*`` to match zero or more characters +* **Symmetry requirement**: The replacement must have the same number of wildcards as the pattern, OR zero wildcards +* **Substitution order**: Wildcards in replacement are substituted left-to-right with values captured from pattern +* **No match behavior**: If pattern doesn't match, the original value is returned unchanged +* **Case sensitivity**: Wildcard matching is case-sensitive + +**Valid wildcard pairs:** + +* Pattern: ``"*ada"`` (1 wildcard), Replacement: ``"CA"`` (0 wildcards) ✓ +* Pattern: ``"* localhost"`` (1 wildcard), Replacement: ``"localhost *"`` (1 wildcard) ✓ +* Pattern: ``"* - *"`` (2 wildcards), Replacement: ``"*_*"`` (2 wildcards) ✓ + +**Invalid wildcard pair:** + +* Pattern: ``"* - *"`` (2 wildcards), Replacement: ``"*"`` (1 wildcard) ✗ (mismatch error) + + Limitations =========== -* Only supports plain text literals for pattern matching. Wildcards and regular expressions are not supported. * Pattern and replacement values must be string literals. -* The replace command modifies the specified fields in-place. 
\ No newline at end of file +* The replace command modifies the specified fields in-place. +* Wildcard matching is case-sensitive. +* Regular expressions are not supported (only simple wildcard patterns with ``*``). \ No newline at end of file diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java index 1ca43f7f307..7b8134c84a5 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java @@ -289,8 +289,6 @@ public void testMultiplePairsSequentialApplication() throws IOException { rows("Jane", "Quebec")); } - // ========== Wildcard Integration Tests ========== - @Test public void testWildcardReplace_suffixMatch() throws IOException { // Pattern "*ada" should match "Canada" and replace with "CA" diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java index 3f5a9dd8060..2e3c93ee26f 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java @@ -326,8 +326,6 @@ public void testReplaceWithMultiplePairsTrailingCommaShouldFail() { getRelNode(ppl); } - // ========== Wildcard Tests ========== - @Test public void testWildcardReplace_prefixWildcard() { // Replace suffix wildcard - e.g., "*MAN" matches "SALESMAN" → "SELLER" diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java index 035d8f711e7..082aebddec9 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java +++ 
b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java @@ -12,11 +12,8 @@ import org.junit.Test; import org.opensearch.sql.calcite.utils.WildcardReplaceUtils; -/** Unit tests for {@link WildcardReplaceUtils}. */ public class WildcardReplaceUtilsTest { - // ========== Basic Wildcard Matching Tests ========== - @Test public void testWildcardMatch_prefixWildcard() { assertEquals( @@ -52,8 +49,6 @@ public void testWildcardMatch_onlyWildcard() { assertEquals("replaced", WildcardReplaceUtils.replaceWithWildcard("anything", "*", "replaced")); } - // ========== Wildcard Capture and Substitution Tests ========== - @Test public void testWildcardCapture_single() { assertEquals( @@ -63,7 +58,6 @@ public void testWildcardCapture_single() { @Test public void testWildcardCapture_multiple() { - // Pattern "* - *" captures ["foo", "bar"], replacement "*_*" gives "foo_bar" assertEquals("foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar", "* - *", "*_*")); } @@ -81,24 +75,18 @@ public void testWildcardSubstitute_noWildcards() { @Test public void testWildcardSubstitute_moreCapturesThanWildcards() { - // Pattern: "* - * - *" captures 3 values - // Replacement: "*_*" uses only 2 assertEquals( "foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar - baz", "* - * - *", "*_*")); } - // ========== Edge Cases ========== - @Test public void testWildcard_emptyCapture() { - // Wildcard matches empty string assertEquals( "fixed", WildcardReplaceUtils.replaceWithWildcard("localhost", "*localhost", "fixed")); } @Test public void testWildcard_emptyCaptureWithSubstitution() { - // Empty capture should be substituted as empty string assertEquals( "localhost ", WildcardReplaceUtils.replaceWithWildcard("localhost", "*localhost", "localhost *")); @@ -106,26 +94,21 @@ public void testWildcard_emptyCaptureWithSubstitution() { @Test public void testWildcard_overlappingParts() { - // No valid match - parts overlap 
assertNull(WildcardReplaceUtils.matchAndCapture("foo", "foo*foo")); } @Test public void testWildcard_consecutiveWildcards() { - // "**" treated as two separate wildcards - // Pattern "**" splits to ["", "", ""], so first wildcard captures empty, second captures all + // "**" splits to ["", "", ""], first captures empty, second captures rest List captures = WildcardReplaceUtils.matchAndCapture("foobar", "**"); assertNotNull(captures); assertEquals(2, captures.size()); - // First wildcard captures empty (greedy matching finds "" immediately) - // Second wildcard captures the rest assertEquals("", captures.get(0)); assertEquals("foobar", captures.get(1)); } @Test public void testWildcard_emptyString() { - // Pattern "*" matches empty string (wildcard matches zero or more chars) assertEquals("replacement", WildcardReplaceUtils.replaceWithWildcard("", "*", "replacement")); } @@ -136,13 +119,10 @@ public void testWildcard_nullInput() { @Test public void testWildcard_singleWildcardMatchesAll() { - // Pattern "*" contains a wildcard, so it matches the entire input - String input = "foo * bar"; - assertEquals("replaced", WildcardReplaceUtils.replaceWithWildcard(input, "*", "replaced")); + assertEquals( + "replaced", WildcardReplaceUtils.replaceWithWildcard("foo * bar", "*", "replaced")); } - // ========== Literal Replacement (No Wildcards) ========== - @Test public void testLiteral_noWildcards() { assertEquals("Illinois", WildcardReplaceUtils.replaceWithWildcard("IL", "IL", "Illinois")); @@ -160,17 +140,13 @@ public void testLiteral_noMatch() { assertEquals(input, WildcardReplaceUtils.replaceWithWildcard(input, "IL", "Illinois")); } - // ========== Validation Tests ========== - @Test public void testValidation_symmetryValid_sameCount() { - // Should not throw exception WildcardReplaceUtils.validateWildcardSymmetry("* - *", "*_*"); } @Test public void testValidation_symmetryValid_zeroInReplacement() { - // Should not throw exception 
WildcardReplaceUtils.validateWildcardSymmetry("* - *", "fixed"); } @@ -188,12 +164,9 @@ public void testValidation_symmetryInvalid_mismatch() { @Test public void testValidation_symmetryValid_noWildcardsInPattern() { - // Should not throw exception WildcardReplaceUtils.validateWildcardSymmetry("foo", "bar"); } - // ========== Count Wildcards Tests ========== - @Test public void testCountWildcards_none() { assertEquals(0, WildcardReplaceUtils.countWildcards("no wildcards here")); @@ -214,8 +187,6 @@ public void testCountWildcards_consecutive() { assertEquals(2, WildcardReplaceUtils.countWildcards("**")); } - // ========== Match and Capture Internal Tests ========== - @Test public void testMatchAndCapture_prefixWildcard() { List captures = WildcardReplaceUtils.matchAndCapture("server.localhost", "*localhost"); @@ -256,8 +227,6 @@ public void testMatchAndCapture_noMatch_missingMiddle() { assertNull(WildcardReplaceUtils.matchAndCapture("foo bar", "* - *")); } - // ========== Substitute Wildcards Internal Tests ========== - @Test public void testSubstituteWildcards_single() { assertEquals( @@ -278,41 +247,30 @@ public void testSubstituteWildcards_noWildcardsInReplacement() { @Test public void testSubstituteWildcards_moreWildcardsThanCaptures() { - // Should use available captures, skip extra wildcards assertEquals("foo_", WildcardReplaceUtils.substituteWildcards("*_*", Arrays.asList("foo"))); } - // ========== SPL Examples from Documentation ========== - @Test - public void testSPLExample1_replaceSuffix() { - // SPL: replace *localhost WITH localhost IN host - // Input: "server.localhost" → Output: "localhost" + public void testWildcardExample1_replaceSuffix() { assertEquals( "localhost", WildcardReplaceUtils.replaceWithWildcard("server.localhost", "*localhost", "localhost")); } @Test - public void testSPLExample2_reorderWithCapture() { - // SPL: replace "* localhost" WITH "localhost *" IN host - // Input: "server localhost" → Output: "localhost server" + public void 
testWildcardExample2_reorderWithCapture() { assertEquals( "localhost server", WildcardReplaceUtils.replaceWithWildcard("server localhost", "* localhost", "localhost *")); } @Test - public void testSPLExample3_multipleWildcards() { - // SPL: replace "* - *" WITH "*_*" IN field - // Input: "foo - bar" → Output: "foo_bar" + public void testWildcardExample3_multipleWildcards() { assertEquals("foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar", "* - *", "*_*")); } @Test - public void testSPLExample4_infixReplacement() { - // SPL: replace *XYZ* WITH *ALL* IN _time - // Input: "fooXYZbar" → Output: "fooALLbar" + public void testWildcardExample4_infixReplacement() { assertEquals( "fooALLbar", WildcardReplaceUtils.replaceWithWildcard("fooXYZbar", "*XYZ*", "*ALL*")); } From 355751fc128327c1816b6820f1e8a0d92f92e5f2 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 29 Oct 2025 16:32:14 -0700 Subject: [PATCH 03/12] fix Signed-off-by: Kai Huang --- .../calcite/utils/WildcardReplaceUtils.java | 55 ++----------------- 1 file changed, 6 insertions(+), 49 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java index 5e9fa444baa..0dcf26cd5ba 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java @@ -8,30 +8,10 @@ import java.util.ArrayList; import java.util.List; -/** - * Utility for wildcard-based string replacement in PPL replace command. - * - *

    <p>Supports wildcard matching where '*' matches zero or more characters. Captured wildcard - * portions can be reused in the replacement string. - * - *

    <p>Examples: - * - *

    <ul> - *
    <li>"*localhost" matches "server.localhost" and captures "server." - *
    <li>"* localhost" with replacement "localhost *" reorders to "localhost server" - *
    <li>"* - *" matches "foo - bar" and captures ["foo", " bar"] - * </ul>
    - */ +/** Utility for wildcard-based string replacement in PPL replace command. */ public class WildcardReplaceUtils { - /** - * Perform wildcard-based replacement. - * - * @param input The input string - * @param pattern The pattern (may contain wildcards) - * @param replacement The replacement (may contain wildcards) - * @return The replaced string, or original if no match - */ + /** Perform wildcard-based replacement. */ public static String replaceWithWildcard(String input, String pattern, String replacement) { if (input == null) { return null; @@ -49,13 +29,7 @@ public static String replaceWithWildcard(String input, String pattern, String re return substituteWildcards(replacement, captures); } - /** - * Match pattern against input and capture wildcard portions. - * - * @param input The input string - * @param pattern The pattern with wildcards - * @return List of captured strings (one per wildcard), or null if no match - */ + /** Match pattern against input and capture wildcard portions. */ public static List matchAndCapture(String input, String pattern) { List captures = new ArrayList<>(); String[] parts = pattern.split("\\*", -1); // -1 keeps trailing empty strings @@ -92,13 +66,7 @@ public static List matchAndCapture(String input, String pattern) { return captures; } - /** - * Substitute wildcards in replacement string with captured values. - * - * @param replacement The replacement string (may contain wildcards) - * @param captures The captured values from pattern matching - * @return The substituted string - */ + /** Substitute wildcards in replacement string with captured values. */ public static String substituteWildcards(String replacement, List captures) { if (!replacement.contains("*")) { return replacement; @@ -121,12 +89,7 @@ public static String substituteWildcards(String replacement, List captur return result.toString(); } - /** - * Count the number of wildcards in a string. 
- * - * @param str The string to count wildcards in - * @return The number of wildcard characters ('*') - */ + /** Count the number of wildcards in a string. */ public static int countWildcards(String str) { int count = 0; for (char c : str.toCharArray()) { @@ -137,13 +100,7 @@ public static int countWildcards(String str) { return count; } - /** - * Validate wildcard symmetry between pattern and replacement. - * - * @param pattern The pattern string - * @param replacement The replacement string - * @throws IllegalArgumentException if wildcard counts don't match (and replacement has wildcards) - */ + /** Validate wildcard symmetry between pattern and replacement. */ public static void validateWildcardSymmetry(String pattern, String replacement) { int patternWildcards = countWildcards(pattern); int replacementWildcards = countWildcards(replacement); From b080c8ae417410537cb692e25d5161e5b9b89fe7 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 29 Oct 2025 16:42:56 -0700 Subject: [PATCH 04/12] add limitation doc Signed-off-by: Kai Huang --- .../sql/calcite/utils/WildcardReplaceUtils.java | 11 ++++++++++- docs/user/ppl/cmd/replace.rst | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java index 0dcf26cd5ba..1d36d340a36 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java @@ -8,7 +8,16 @@ import java.util.ArrayList; import java.util.List; -/** Utility for wildcard-based string replacement in PPL replace command. */ +/** + * Utility for wildcard-based string replacement in PPL replace command. + * + *

    <p>Supports wildcard patterns using '*' to match zero or more characters. Wildcards in the + * replacement string are substituted with values captured from the pattern match. + * + *

    <p>Limitation: Literal asterisk characters cannot be matched or replaced when using wildcard + * patterns. To replace literal asterisks in data, use non-wildcard (literal) replacement mode by + * not including '*' in the pattern string. + */ public class WildcardReplaceUtils { /** Perform wildcard-based replacement. */ diff --git a/docs/user/ppl/cmd/replace.rst b/docs/user/ppl/cmd/replace.rst index 8a32d40ce1c..54b2006f180 100644 --- a/docs/user/ppl/cmd/replace.rst +++ b/docs/user/ppl/cmd/replace.rst @@ -256,4 +256,5 @@ Limitations * Pattern and replacement values must be string literals. * The replace command modifies the specified fields in-place. * Wildcard matching is case-sensitive. -* Regular expressions are not supported (only simple wildcard patterns with ``*``). \ No newline at end of file +* Regular expressions are not supported (only simple wildcard patterns with ``*``). +* Literal asterisk characters (``*``) cannot be matched or replaced when using wildcard patterns. To replace literal asterisks in your data, use non-wildcard patterns (do not include ``*`` in the pattern string).
\ No newline at end of file From 39768835cebad2fd274bdb732c68a297714a9d87 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 29 Oct 2025 16:51:34 -0700 Subject: [PATCH 05/12] refactor to use regex matching Signed-off-by: Kai Huang --- .../calcite/utils/WildcardReplaceUtils.java | 46 ++++++++----------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java index 1d36d340a36..02b9cca505e 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java @@ -7,6 +7,8 @@ import java.util.ArrayList; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Utility for wildcard-based string replacement in PPL replace command. @@ -40,38 +42,28 @@ public static String replaceWithWildcard(String input, String pattern, String re /** Match pattern against input and capture wildcard portions. 
*/ public static List matchAndCapture(String input, String pattern) { - List captures = new ArrayList<>(); - String[] parts = pattern.split("\\*", -1); // -1 keeps trailing empty strings - - int inputIndex = 0; + String[] parts = pattern.split("\\*", -1); + StringBuilder regexBuilder = new StringBuilder("^"); for (int i = 0; i < parts.length; i++) { - String part = parts[i]; - - if (i == 0) { - if (!input.startsWith(part)) { - return null; - } - inputIndex = part.length(); - } else if (i == parts.length - 1) { - if (!input.endsWith(part)) { - return null; - } - int endIndex = input.length() - part.length(); - if (endIndex < inputIndex) { - return null; // Parts overlap - } - captures.add(input.substring(inputIndex, endIndex)); - } else { - int nextIndex = input.indexOf(part, inputIndex); - if (nextIndex == -1) { - return null; - } - captures.add(input.substring(inputIndex, nextIndex)); - inputIndex = nextIndex + part.length(); + regexBuilder.append(Pattern.quote(parts[i])); + if (i < parts.length - 1) { + regexBuilder.append("(.*?)"); } } + regexBuilder.append("$"); + + Pattern compiledPattern = Pattern.compile(regexBuilder.toString()); + Matcher matcher = compiledPattern.matcher(input); + if (!matcher.matches()) { + return null; + } + + List captures = new ArrayList<>(); + for (int i = 1; i <= matcher.groupCount(); i++) { + captures.add(matcher.group(i)); + } return captures; } From e4b28aaa19ba9cdbe01f71c31ab83c5fe66b3afb Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 29 Oct 2025 17:03:59 -0700 Subject: [PATCH 06/12] Add Cache for perf Signed-off-by: Kai Huang --- .../calcite/utils/WildcardReplaceUtils.java | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java index 02b9cca505e..f4747a0368d 100644 --- 
a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java @@ -6,7 +6,10 @@ package org.opensearch.sql.calcite.utils; import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -22,6 +25,17 @@ */ public class WildcardReplaceUtils { + private static final int PATTERN_CACHE_SIZE = 100; + + private static final Map PATTERN_CACHE = + Collections.synchronizedMap( + new LinkedHashMap<>(16, 0.75f, true) { + @Override + protected boolean removeEldestEntry(Map.Entry eldest) { + return size() > PATTERN_CACHE_SIZE; + } + }); + /** Perform wildcard-based replacement. */ public static String replaceWithWildcard(String input, String pattern, String replacement) { if (input == null) { @@ -42,20 +56,10 @@ public static String replaceWithWildcard(String input, String pattern, String re /** Match pattern against input and capture wildcard portions. */ public static List matchAndCapture(String input, String pattern) { - String[] parts = pattern.split("\\*", -1); - StringBuilder regexBuilder = new StringBuilder("^"); + Pattern compiledPattern = + PATTERN_CACHE.computeIfAbsent(pattern, WildcardReplaceUtils::compileWildcardPattern); - for (int i = 0; i < parts.length; i++) { - regexBuilder.append(Pattern.quote(parts[i])); - if (i < parts.length - 1) { - regexBuilder.append("(.*?)"); - } - } - regexBuilder.append("$"); - - Pattern compiledPattern = Pattern.compile(regexBuilder.toString()); Matcher matcher = compiledPattern.matcher(input); - if (!matcher.matches()) { return null; } @@ -67,6 +71,22 @@ public static List matchAndCapture(String input, String pattern) { return captures; } + /** Compile a wildcard pattern to a regex Pattern. 
*/ + private static Pattern compileWildcardPattern(String pattern) { + String[] parts = pattern.split("\\*", -1); + StringBuilder regexBuilder = new StringBuilder("^"); + + for (int i = 0; i < parts.length; i++) { + regexBuilder.append(Pattern.quote(parts[i])); + if (i < parts.length - 1) { + regexBuilder.append("(.*?)"); + } + } + regexBuilder.append("$"); + + return Pattern.compile(regexBuilder.toString()); + } + /** Substitute wildcards in replacement string with captured values. */ public static String substituteWildcards(String replacement, List captures) { if (!replacement.contains("*")) { From 4724fd2231e7bbf696ff47f53969963b5f5ed2a5 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 30 Oct 2025 13:17:33 -0700 Subject: [PATCH 07/12] handle asterisks Signed-off-by: Kai Huang --- .../calcite/utils/WildcardReplaceUtils.java | 89 ++++++++++++++++--- docs/user/ppl/cmd/replace.rst | 67 +++++++++++++- .../remote/CalciteReplaceCommandIT.java | 46 ++++++++++ .../ppl/calcite/WildcardReplaceUtilsTest.java | 78 ++++++++++++++++ 4 files changed, 268 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java index f4747a0368d..5165528c1fc 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java @@ -19,9 +19,8 @@ *

    <p>Supports wildcard patterns using '*' to match zero or more characters. Wildcards in the * replacement string are substituted with values captured from the pattern match. * - *

    <p>Limitation: Literal asterisk characters cannot be matched or replaced when using wildcard - * patterns. To replace literal asterisks in data, use non-wildcard (literal) replacement mode by - * not including '*' in the pattern string. + *

    Escape sequences: Use '\*' to match literal asterisks and '\\' to match literal backslashes. + * Without escapes, '*' is interpreted as a wildcard pattern. */ public class WildcardReplaceUtils { @@ -42,6 +41,9 @@ public static String replaceWithWildcard(String input, String pattern, String re return null; } + validateEscapeSequences(pattern); + validateEscapeSequences(replacement); + if (!pattern.contains("*")) { return input.replace(pattern, replacement); } @@ -54,6 +56,22 @@ public static String replaceWithWildcard(String input, String pattern, String re return substituteWildcards(replacement, captures); } + /** Validate that string doesn't end with unescaped backslash. */ + private static void validateEscapeSequences(String str) { + boolean escaped = false; + for (char c : str.toCharArray()) { + if (escaped) { + escaped = false; + } else if (c == '\\') { + escaped = true; + } + } + if (escaped) { + throw new IllegalArgumentException( + "Invalid escape sequence: pattern ends with unescaped backslash"); + } + } + /** Match pattern against input and capture wildcard portions. */ public static List matchAndCapture(String input, String pattern) { Pattern compiledPattern = @@ -71,9 +89,46 @@ public static List matchAndCapture(String input, String pattern) { return captures; } + /** + * Split pattern on unescaped wildcards, handling escape sequences. + * + *

    Supports: \* (literal asterisk), \\ (literal backslash) + * + * @param pattern Wildcard pattern with potential escapes + * @return Array of literal parts between wildcards + * @throws IllegalArgumentException if pattern ends with unescaped backslash + */ + private static String[] splitWildcards(String pattern) { + List parts = new ArrayList<>(); + StringBuilder current = new StringBuilder(); + boolean escaped = false; + + for (char c : pattern.toCharArray()) { + if (escaped) { + current.append(c); + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + parts.add(current.toString()); + current = new StringBuilder(); + } else { + current.append(c); + } + } + + if (escaped) { + throw new IllegalArgumentException( + "Invalid escape sequence: pattern ends with unescaped backslash"); + } + + parts.add(current.toString()); + return parts.toArray(new String[0]); + } + /** Compile a wildcard pattern to a regex Pattern. */ private static Pattern compileWildcardPattern(String pattern) { - String[] parts = pattern.split("\\*", -1); + String[] parts = splitWildcards(pattern); StringBuilder regexBuilder = new StringBuilder("^"); for (int i = 0; i < parts.length; i++) { @@ -89,15 +144,17 @@ private static Pattern compileWildcardPattern(String pattern) { /** Substitute wildcards in replacement string with captured values. 
*/ public static String substituteWildcards(String replacement, List captures) { - if (!replacement.contains("*")) { - return replacement; - } - StringBuilder result = new StringBuilder(); int captureIndex = 0; + boolean escaped = false; for (char c : replacement.toCharArray()) { - if (c == '*') { + if (escaped) { + result.append(c); + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { if (captureIndex < captures.size()) { result.append(captures.get(captureIndex)); captureIndex++; @@ -107,14 +164,24 @@ public static String substituteWildcards(String replacement, List captur } } + if (escaped) { + throw new IllegalArgumentException( + "Invalid escape sequence: replacement ends with unescaped backslash"); + } + return result.toString(); } - /** Count the number of wildcards in a string. */ + /** Count the number of unescaped wildcards in a string. */ public static int countWildcards(String str) { int count = 0; + boolean escaped = false; for (char c : str.toCharArray()) { - if (c == '*') { + if (escaped) { + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { count++; } } diff --git a/docs/user/ppl/cmd/replace.rst b/docs/user/ppl/cmd/replace.rst index 54b2006f180..b29f8b10e22 100644 --- a/docs/user/ppl/cmd/replace.rst +++ b/docs/user/ppl/cmd/replace.rst @@ -251,10 +251,75 @@ When using wildcards in the replace command: * Pattern: ``"* - *"`` (2 wildcards), Replacement: ``"*"`` (1 wildcard) ✗ (mismatch error) +Escape Sequences +================ + +To match or replace literal asterisks or backslashes in your data, use escape sequences: + +* ``\*`` - Matches a literal asterisk character +* ``\\`` - Matches a literal backslash character + +Without escapes, asterisks are interpreted as wildcards. + +Example 11: Matching literal asterisks +--------------------------------------- + +Match and replace literal asterisk characters in data. 
+ +PPL query:: + + os> source=accounts | eval note = 'price: *sale*' | replace 'price: \*sale\*' WITH 'DISCOUNTED' IN note | fields note; + fetched rows / total rows = 4/4 + +------------+ + | note | + |------------| + | DISCOUNTED | + | DISCOUNTED | + | DISCOUNTED | + | DISCOUNTED | + +------------+ + +Example 12: Wildcard with no replacement wildcards +---------------------------------------------------- + +Use wildcards in pattern but none in replacement to create a fixed output. + +PPL query:: + + os> source=accounts | eval test = 'prefix-value-suffix' | replace 'prefix-*-suffix' WITH 'MATCHED' IN test | fields test; + fetched rows / total rows = 4/4 + +---------+ + | test | + |---------| + | MATCHED | + | MATCHED | + | MATCHED | + | MATCHED | + +---------+ + +Example 13: Escaped asterisks with wildcards +--------------------------------------------- + +Combine escaped asterisks (literal) with wildcards for complex patterns. + +PPL query:: + + os> source=accounts | eval label = 'file123.txt' | replace 'file*.*' WITH '\**.*' IN label | fields label; + fetched rows / total rows = 4/4 + +----------+ + | label | + |----------| + | *123.txt | + | *123.txt | + | *123.txt | + | *123.txt | + +----------+ + + Limitations =========== * Pattern and replacement values must be string literals. * The replace command modifies the specified fields in-place. * Wildcard matching is case-sensitive. * Regular expressions are not supported (only simple wildcard patterns with ``*``). -* Literal asterisk characters (``*``) cannot be matched or replaced when using wildcard patterns. To replace literal asterisks in your data, use non-wildcard patterns (do not include ``*`` in the pattern string). \ No newline at end of file +* Use backslash escape sequences (``\*``, ``\\``) to match literal asterisks or backslashes. 
\ No newline at end of file diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java index 7b8134c84a5..55aa6162f58 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java @@ -495,4 +495,50 @@ public void testWildcardReplace_emptyStringIntegration() throws IOException { rows("John", "Canada"), rows("Jane", "Canada")); } + + @Test + public void testEscapeSequence_literalAsterisk() throws IOException { + // Test matching literal asterisks in data using \* escape sequence + JSONObject result = + executeQuery( + String.format( + "source = %s | eval note = 'price: *sale*' | replace 'price: \\\\*sale\\\\*' WITH" + + " 'DISCOUNTED' IN note | fields note | head 1", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("note", "string")); + // Pattern "price: \*sale\*" matches literal asterisks, result should be "DISCOUNTED" + verifyDataRows(result, rows("DISCOUNTED")); + } + + @Test + public void testEscapeSequence_mixedEscapeAndWildcard() throws IOException { + // Test combining escaped asterisks (literal) with wildcards (pattern matching) + JSONObject result = + executeQuery( + String.format( + "source = %s | eval label = 'file123.txt' | replace 'file*.*' WITH" + + " '\\\\**.*' IN label | fields label | head 1", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("label", "string")); + // Pattern "file*.*" captures "123" and "txt" + // Replacement "\**.*" has escaped * (literal), then 2 wildcards, producing "*123.txt" + verifyDataRows(result, rows("*123.txt")); + } + + @Test + public void testEscapeSequence_noMatchLiteral() throws IOException { + // Test that escaped asterisk doesn't match as wildcard + JSONObject result = + executeQuery( + String.format( + "source = %s | eval test = 
'fooXbar' | replace 'foo\\\\*bar' WITH 'matched' IN test" + + " | fields test | head 1", + TEST_INDEX_STATE_COUNTRY)); + + verifySchema(result, schema("test", "string")); + // Pattern "foo\*bar" matches literal "foo*bar", not "fooXbar", so original value returned + verifyDataRows(result, rows("fooXbar")); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java index 082aebddec9..f47875a0609 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java @@ -274,4 +274,82 @@ public void testWildcardExample4_infixReplacement() { assertEquals( "fooALLbar", WildcardReplaceUtils.replaceWithWildcard("fooXYZbar", "*XYZ*", "*ALL*")); } + + @Test + public void testEscapedAsterisk_literal() { + assertEquals( + "foo*bar", WildcardReplaceUtils.replaceWithWildcard("foo*bar", "foo\\*bar", "foo\\*bar")); + } + + @Test + public void testEscapedAsterisk_noMatch() { + assertEquals( + "fooXbar", WildcardReplaceUtils.replaceWithWildcard("fooXbar", "foo\\*bar", "replacement")); + } + + @Test + public void testEscapedBackslash_beforeWildcard() { + assertEquals( + "foo\\123", WildcardReplaceUtils.replaceWithWildcard("foo\\abc", "foo\\\\*", "foo\\\\123")); + } + + @Test + public void testEscapedBackslash_literal() { + assertEquals( + "foo\\bar", + WildcardReplaceUtils.replaceWithWildcard("foo\\bar", "foo\\\\bar", "foo\\\\bar")); + } + + @Test + public void testMixedEscapes_asteriskAndBackslash() { + assertEquals( + "price: *special* $100\\ea", + WildcardReplaceUtils.replaceWithWildcard( + "price: *special* $100\\ea", + "price: \\*special\\* $*\\\\*", + "price: \\*special\\* $*\\\\*")); + } + + @Test + public void testEscapedAsterisk_withWildcard_capture() { + assertEquals( + "file*.prefix-123", + WildcardReplaceUtils.replaceWithWildcard("file123.txt", 
"file*.*", "file\\*.prefix-*")); + } + + @Test + public void testTrailingBackslash_shouldFail() { + try { + WildcardReplaceUtils.replaceWithWildcard("foo", "foo\\", "bar"); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException exception) { + assertTrue(exception.getMessage().contains("Invalid escape sequence")); + } + } + + @Test + public void testOnlyEscapedAsterisks_noWildcards() { + assertEquals("***", WildcardReplaceUtils.replaceWithWildcard("***", "\\*\\*\\*", "\\*\\*\\*")); + } + + @Test + public void testDoubleBackslashBeforeAsterisk() { + assertEquals( + "foo\\bar", WildcardReplaceUtils.replaceWithWildcard("foo\\abc", "foo\\\\*", "foo\\\\bar")); + } + + @Test + public void testCountWildcards_withEscapes() { + assertEquals(2, WildcardReplaceUtils.countWildcards("foo\\*bar*baz*")); + } + + @Test + public void testCountWildcards_allEscaped() { + assertEquals(0, WildcardReplaceUtils.countWildcards("\\*\\*\\*")); + } + + @Test + public void testValidation_escapedWildcardsNotCounted() { + WildcardReplaceUtils.validateWildcardSymmetry("foo\\**", "bar*"); + } } From 8ef3f1f152b53d417bc3eb8bb89c17dfc678d81b Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 30 Oct 2025 13:36:28 -0700 Subject: [PATCH 08/12] refactor the implementation Signed-off-by: Kai Huang --- .../sql/calcite/CalciteRelNodeVisitor.java | 153 +++++++- .../calcite/utils/WildcardReplaceUtils.java | 204 ---------- .../function/BuiltinFunctionName.java | 3 +- .../function/PPLBuiltinOperators.java | 5 - .../expression/function/PPLFuncImpTable.java | 2 - .../udf/WildcardReplaceFunctionImpl.java | 60 --- .../ppl/calcite/CalcitePPLReplaceTest.java | 27 +- .../ppl/calcite/WildcardReplaceUtilsTest.java | 355 ------------------ 8 files changed, 160 insertions(+), 649 deletions(-) delete mode 100644 core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java delete mode 100644 
core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java delete mode 100644 ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 19772ee1f1d..53f0a8ba37a 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -147,7 +147,6 @@ import org.opensearch.sql.calcite.utils.JoinAndLookupUtils; import org.opensearch.sql.calcite.utils.PlanUtils; import org.opensearch.sql.calcite.utils.UserDefinedFunctionUtils; -import org.opensearch.sql.calcite.utils.WildcardReplaceUtils; import org.opensearch.sql.calcite.utils.WildcardUtils; import org.opensearch.sql.common.patterns.PatternUtils; import org.opensearch.sql.common.utils.StringUtils; @@ -2860,11 +2859,35 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { String replacementStr = pair.getReplacement().getValue().toString(); if (patternStr.contains("*")) { - WildcardReplaceUtils.validateWildcardSymmetry(patternStr, replacementStr); + // Wildcard pattern: convert to regex at planning time + validateWildcardSymmetry(patternStr, replacementStr); + // Convert wildcard pattern to regex pattern (e.g., "*ada" → "^(.*?)ada$") + String regexPattern = convertWildcardPatternToRegex(patternStr); + // Convert wildcard replacement to regex replacement (e.g., "*_*" → "$1_$2") + String regexReplacement = convertWildcardReplacementToRegex(replacementStr); + + // Create regex pattern and replacement literals + RexNode regexPatternNode = + context.rexBuilder.makeLiteral( + regexPattern, + context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true); + RexNode regexReplacementNode = + context.rexBuilder.makeLiteral( + regexReplacement, + 
context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), + true); + + // Use Calcite's REGEXP_REPLACE operator fieldRef = - buildWildcardReplaceExpression(fieldRef, patternNode, replacementNode, context); + context.rexBuilder.makeCall( + org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE_3, + fieldRef, + regexPatternNode, + regexReplacementNode); } else { + // Literal pattern: use standard REPLACE fieldRef = context.relBuilder.call( SqlStdOperatorTable.REPLACE, fieldRef, patternNode, replacementNode); @@ -2882,16 +2905,124 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { return context.relBuilder.peek(); } - /** Build a RexNode for wildcard-based replacement. */ - private RexNode buildWildcardReplaceExpression( - RexNode fieldRex, RexNode patternNode, RexNode replacementNode, CalcitePlanContext context) { - return context.rexBuilder.makeCall( - org.opensearch.sql.expression.function.PPLBuiltinOperators.WILDCARD_REPLACE, - fieldRex, - patternNode, - replacementNode); + // ============ Wildcard-to-Regex Conversion Utilities ============ + + /** + * Convert a wildcard pattern to a regex pattern string. Example: "*ada" → "^(.*?)ada$" Example: + * "* - *" → "^(.*?) - (.*?)$" Handles escape sequences: \* (literal asterisk), \\ (literal + * backslash) + */ + private static String convertWildcardPatternToRegex(String wildcardPattern) { + String[] parts = splitWildcards(wildcardPattern); + StringBuilder regexBuilder = new StringBuilder("^"); + + for (int i = 0; i < parts.length; i++) { + regexBuilder.append(java.util.regex.Pattern.quote(parts[i])); + if (i < parts.length - 1) { + regexBuilder.append("(.*?)"); // Non-greedy capture group for wildcard + } + } + regexBuilder.append("$"); + + return regexBuilder.toString(); + } + + /** + * Convert a wildcard replacement to a regex replacement string. 
Example: "*_*" → "$1_$2" Example: + * "SELLER" → "SELLER" (no wildcards) Handles escape sequences: \* (literal asterisk), \\ (literal + * backslash) + */ + private static String convertWildcardReplacementToRegex(String wildcardReplacement) { + if (!wildcardReplacement.contains("*")) { + return wildcardReplacement; // No wildcards = literal replacement + } + + StringBuilder result = new StringBuilder(); + int captureIndex = 1; // Regex capture groups start at $1 + boolean escaped = false; + + for (char c : wildcardReplacement.toCharArray()) { + if (escaped) { + // Handle escape sequences: \* or \\ + result.append(c); + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + // Replace wildcard with $1, $2, etc. + result.append('$').append(captureIndex++); + } else { + result.append(c); + } + } + + return result.toString(); + } + + /** + * Split pattern on unescaped wildcards, handling escape sequences. Supports: \* (literal + * asterisk), \\ (literal backslash) + */ + private static String[] splitWildcards(String pattern) { + List parts = new ArrayList<>(); + StringBuilder current = new StringBuilder(); + boolean escaped = false; + + for (char c : pattern.toCharArray()) { + if (escaped) { + current.append(c); + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + parts.add(current.toString()); + current = new StringBuilder(); + } else { + current.append(c); + } + } + + if (escaped) { + throw new IllegalArgumentException( + "Invalid escape sequence: pattern ends with unescaped backslash"); + } + + parts.add(current.toString()); + return parts.toArray(new String[0]); } + /** Count the number of unescaped wildcards in a string. 
*/ + private static int countWildcards(String str) { + int count = 0; + boolean escaped = false; + for (char c : str.toCharArray()) { + if (escaped) { + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + count++; + } + } + return count; + } + + /** Validate wildcard symmetry between pattern and replacement. */ + private static void validateWildcardSymmetry(String pattern, String replacement) { + int patternWildcards = countWildcards(pattern); + int replacementWildcards = countWildcards(replacement); + + if (replacementWildcards != 0 && replacementWildcards != patternWildcards) { + throw new IllegalArgumentException( + String.format( + "Error in 'replace' command: Wildcard count mismatch - pattern has %d wildcard(s), " + + "replacement has %d. Replacement must have same number of wildcards or none.", + patternWildcards, replacementWildcards)); + } + } + + // ============ End Wildcard Utilities ============ + private void buildParseRelNode(Parse node, CalcitePlanContext context) { RexNode sourceField = rexVisitor.analyze(node.getSourceField(), context); ParseMethod parseMethod = node.getParseMethod(); diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java deleted file mode 100644 index 5165528c1fc..00000000000 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.calcite.utils; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * Utility for wildcard-based string replacement in PPL replace command. - * - *

    Supports wildcard patterns using '*' to match zero or more characters. Wildcards in the - * replacement string are substituted with values captured from the pattern match. - * - *

    Escape sequences: Use '\*' to match literal asterisks and '\\' to match literal backslashes. - * Without escapes, '*' is interpreted as a wildcard pattern. - */ -public class WildcardReplaceUtils { - - private static final int PATTERN_CACHE_SIZE = 100; - - private static final Map PATTERN_CACHE = - Collections.synchronizedMap( - new LinkedHashMap<>(16, 0.75f, true) { - @Override - protected boolean removeEldestEntry(Map.Entry eldest) { - return size() > PATTERN_CACHE_SIZE; - } - }); - - /** Perform wildcard-based replacement. */ - public static String replaceWithWildcard(String input, String pattern, String replacement) { - if (input == null) { - return null; - } - - validateEscapeSequences(pattern); - validateEscapeSequences(replacement); - - if (!pattern.contains("*")) { - return input.replace(pattern, replacement); - } - - List captures = matchAndCapture(input, pattern); - if (captures == null) { - return input; - } - - return substituteWildcards(replacement, captures); - } - - /** Validate that string doesn't end with unescaped backslash. */ - private static void validateEscapeSequences(String str) { - boolean escaped = false; - for (char c : str.toCharArray()) { - if (escaped) { - escaped = false; - } else if (c == '\\') { - escaped = true; - } - } - if (escaped) { - throw new IllegalArgumentException( - "Invalid escape sequence: pattern ends with unescaped backslash"); - } - } - - /** Match pattern against input and capture wildcard portions. */ - public static List matchAndCapture(String input, String pattern) { - Pattern compiledPattern = - PATTERN_CACHE.computeIfAbsent(pattern, WildcardReplaceUtils::compileWildcardPattern); - - Matcher matcher = compiledPattern.matcher(input); - if (!matcher.matches()) { - return null; - } - - List captures = new ArrayList<>(); - for (int i = 1; i <= matcher.groupCount(); i++) { - captures.add(matcher.group(i)); - } - return captures; - } - - /** - * Split pattern on unescaped wildcards, handling escape sequences. 
- * - *

    Supports: \* (literal asterisk), \\ (literal backslash) - * - * @param pattern Wildcard pattern with potential escapes - * @return Array of literal parts between wildcards - * @throws IllegalArgumentException if pattern ends with unescaped backslash - */ - private static String[] splitWildcards(String pattern) { - List parts = new ArrayList<>(); - StringBuilder current = new StringBuilder(); - boolean escaped = false; - - for (char c : pattern.toCharArray()) { - if (escaped) { - current.append(c); - escaped = false; - } else if (c == '\\') { - escaped = true; - } else if (c == '*') { - parts.add(current.toString()); - current = new StringBuilder(); - } else { - current.append(c); - } - } - - if (escaped) { - throw new IllegalArgumentException( - "Invalid escape sequence: pattern ends with unescaped backslash"); - } - - parts.add(current.toString()); - return parts.toArray(new String[0]); - } - - /** Compile a wildcard pattern to a regex Pattern. */ - private static Pattern compileWildcardPattern(String pattern) { - String[] parts = splitWildcards(pattern); - StringBuilder regexBuilder = new StringBuilder("^"); - - for (int i = 0; i < parts.length; i++) { - regexBuilder.append(Pattern.quote(parts[i])); - if (i < parts.length - 1) { - regexBuilder.append("(.*?)"); - } - } - regexBuilder.append("$"); - - return Pattern.compile(regexBuilder.toString()); - } - - /** Substitute wildcards in replacement string with captured values. 
*/ - public static String substituteWildcards(String replacement, List captures) { - StringBuilder result = new StringBuilder(); - int captureIndex = 0; - boolean escaped = false; - - for (char c : replacement.toCharArray()) { - if (escaped) { - result.append(c); - escaped = false; - } else if (c == '\\') { - escaped = true; - } else if (c == '*') { - if (captureIndex < captures.size()) { - result.append(captures.get(captureIndex)); - captureIndex++; - } - } else { - result.append(c); - } - } - - if (escaped) { - throw new IllegalArgumentException( - "Invalid escape sequence: replacement ends with unescaped backslash"); - } - - return result.toString(); - } - - /** Count the number of unescaped wildcards in a string. */ - public static int countWildcards(String str) { - int count = 0; - boolean escaped = false; - for (char c : str.toCharArray()) { - if (escaped) { - escaped = false; - } else if (c == '\\') { - escaped = true; - } else if (c == '*') { - count++; - } - } - return count; - } - - /** Validate wildcard symmetry between pattern and replacement. */ - public static void validateWildcardSymmetry(String pattern, String replacement) { - int patternWildcards = countWildcards(pattern); - int replacementWildcards = countWildcards(replacement); - - if (replacementWildcards != 0 && replacementWildcards != patternWildcards) { - throw new IllegalArgumentException( - String.format( - "Error in 'replace' command: Wildcard count mismatch - pattern has %d wildcard(s), " - + "replacement has %d. 
Replacement must have same number of wildcards or none.", - patternWildcards, replacementWildcards)); - } - } -} diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index 62470a35069..ced98022ca9 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -333,8 +333,7 @@ public enum BuiltinFunctionName { INTERNAL_REGEXP_REPLACE_3(FunctionName.of("regexp_replace_3"), true), INTERNAL_REGEXP_REPLACE_PG_4(FunctionName.of("regexp_replace_pg_4"), true), INTERNAL_REGEXP_REPLACE_5(FunctionName.of("regexp_replace_5"), true), - INTERNAL_TRANSLATE3(FunctionName.of("translate3"), true), - INTERNAL_WILDCARD_REPLACE(FunctionName.of("wildcard_replace"), true); + INTERNAL_TRANSLATE3(FunctionName.of("translate3"), true); private final FunctionName name; private boolean isInternal; diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java index 93fe81135af..68eb0ed5cca 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java @@ -428,11 +428,6 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable { new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI"); public static final SqlOperator REX_OFFSET = new RexOffsetFunction().toUDF("REX_OFFSET"); - // Wildcard replace function for replace command - public static final SqlOperator WILDCARD_REPLACE = - new org.opensearch.sql.expression.function.udf.WildcardReplaceFunctionImpl() - .toUDF("WILDCARD_REPLACE"); - // Aggregation functions public static final SqlAggFunction AVG_NULLABLE = new 
NullableSqlAvgAggFunction(SqlKind.AVG); public static final SqlAggFunction STDDEV_POP_NULLABLE = diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index 90c45deb804..c85a429a81d 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -87,7 +87,6 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_5; import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_PG_4; import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_TRANSLATE3; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.INTERNAL_WILDCARD_REPLACE; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_BLANK; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_EMPTY; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_NOT_NULL; @@ -838,7 +837,6 @@ void populate() { registerOperator(INTERNAL_REGEXP_REPLACE_PG_4, SqlLibraryOperators.REGEXP_REPLACE_PG_4); registerOperator(INTERNAL_REGEXP_REPLACE_5, SqlLibraryOperators.REGEXP_REPLACE_5); registerOperator(INTERNAL_TRANSLATE3, SqlLibraryOperators.TRANSLATE3); - registerOperator(INTERNAL_WILDCARD_REPLACE, PPLBuiltinOperators.WILDCARD_REPLACE); // Register eval functions for PPL max() and min() calls registerOperator(MAX, PPLBuiltinOperators.MAX); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java b/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java deleted file mode 100644 index 23cb06c66af..00000000000 --- a/core/src/main/java/org/opensearch/sql/expression/function/udf/WildcardReplaceFunctionImpl.java +++ /dev/null @@ 
-1,60 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.expression.function.udf; - -import java.util.List; -import org.apache.calcite.adapter.enumerable.NotNullImplementor; -import org.apache.calcite.adapter.enumerable.NullPolicy; -import org.apache.calcite.adapter.enumerable.RexImpTable; -import org.apache.calcite.adapter.enumerable.RexToLixTranslator; -import org.apache.calcite.linq4j.tree.Expression; -import org.apache.calcite.linq4j.tree.Types; -import org.apache.calcite.rex.RexCall; -import org.apache.calcite.schema.impl.ScalarFunctionImpl; -import org.apache.calcite.sql.type.OperandTypes; -import org.apache.calcite.sql.type.ReturnTypes; -import org.apache.calcite.sql.type.SqlReturnTypeInference; -import org.apache.calcite.sql.type.SqlTypeFamily; -import org.opensearch.sql.calcite.utils.WildcardReplaceUtils; -import org.opensearch.sql.expression.function.ImplementorUDF; -import org.opensearch.sql.expression.function.UDFOperandMetadata; - -/** UDF for wildcard-based string replacement in PPL replace command. */ -public class WildcardReplaceFunctionImpl extends ImplementorUDF { - - public WildcardReplaceFunctionImpl() { - super(new WildcardReplaceImplementor(), NullPolicy.ANY); - } - - @Override - public SqlReturnTypeInference getReturnTypeInference() { - return ReturnTypes.VARCHAR_2000; - } - - @Override - public UDFOperandMetadata getOperandMetadata() { - return UDFOperandMetadata.wrap( - OperandTypes.family(SqlTypeFamily.STRING, SqlTypeFamily.STRING, SqlTypeFamily.STRING)); - } - - /** Implementor for wildcard replace function. 
*/ - public static class WildcardReplaceImplementor implements NotNullImplementor { - @Override - public Expression implement( - RexToLixTranslator translator, RexCall call, List translatedOperands) { - ScalarFunctionImpl function = - (ScalarFunctionImpl) - ScalarFunctionImpl.create( - Types.lookupMethod( - WildcardReplaceUtils.class, - "replaceWithWildcard", - String.class, - String.class, - String.class)); - return function.getImplementor().implement(translator, call, RexImpTable.NullAs.NULL); - } - } -} diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java index 2e3c93ee26f..5f6f2beb76d 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLReplaceTest.java @@ -329,12 +329,14 @@ public void testReplaceWithMultiplePairsTrailingCommaShouldFail() { @Test public void testWildcardReplace_prefixWildcard() { // Replace suffix wildcard - e.g., "*MAN" matches "SALESMAN" → "SELLER" + // Wildcard pattern is converted to regex at planning time String ppl = "source=EMP | replace \"*MAN\" WITH \"SELLER\" IN JOB"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[WILDCARD_REPLACE($2, '*MAN':VARCHAR," - + " 'SELLER':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REGEXP_REPLACE($2," + + " '^\\Q\\E(.*?)\\QMAN\\E$':VARCHAR, 'SELLER':VARCHAR)], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); @@ -343,12 +345,14 @@ public void testWildcardReplace_prefixWildcard() { @Test public void testWildcardReplace_multipleWildcards() { // Replace with multiple wildcards for capture and substitution + // Wildcard pattern "*_*" is converted to regex replacement 
"$1_$2" String ppl = "source=EMP | replace \"* - *\" WITH \"*_*\" IN JOB"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[WILDCARD_REPLACE($2, '* - *':VARCHAR," - + " '*_*':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REGEXP_REPLACE($2, '^\\Q\\E(.*?)\\Q -" + + " \\E(.*?)\\Q\\E$':VARCHAR, '$1_$2':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); @@ -364,12 +368,14 @@ public void testWildcardReplace_symmetryMismatch_shouldFail() { @Test public void testWildcardReplace_symmetryValid_zeroInReplacement() { // Pattern has 2 wildcards, replacement has 0 - should work + // Literal replacement "FIXED" has no wildcards, which is valid String ppl = "source=EMP | replace \"* - *\" WITH \"FIXED\" IN JOB"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[WILDCARD_REPLACE($2, '* - *':VARCHAR," - + " 'FIXED':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REGEXP_REPLACE($2, '^\\Q\\E(.*?)\\Q -" + + " \\E(.*?)\\Q\\E$':VARCHAR, 'FIXED':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); @@ -377,15 +383,16 @@ public void testWildcardReplace_symmetryValid_zeroInReplacement() { @Test public void testWildcardAndLiteralReplace_mixedPairs() { - // Multiple pairs: one with wildcard, one literal + // Multiple pairs: one with wildcard (converted to REGEXP_REPLACE), one literal (REPLACE) String ppl = "source=EMP | replace \"*CLERK\" WITH \"EMPLOYEE\", \"MANAGER\" WITH \"SUPERVISOR\" IN JOB"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(EMPNO=[$0], ENAME=[$1], 
JOB=[REPLACE(WILDCARD_REPLACE($2," - + " '*CLERK':VARCHAR, 'EMPLOYEE':VARCHAR), 'MANAGER':VARCHAR, 'SUPERVISOR':VARCHAR)]," - + " MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7])\n" + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[REPLACE(REGEXP_REPLACE($2," + + " '^\\Q\\E(.*?)\\QCLERK\\E$':VARCHAR, 'EMPLOYEE':VARCHAR), 'MANAGER':VARCHAR," + + " 'SUPERVISOR':VARCHAR)], MGR=[$3], HIREDATE=[$4], SAL=[$5], COMM=[$6]," + + " DEPTNO=[$7])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java deleted file mode 100644 index f47875a0609..00000000000 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.ppl.calcite; - -import static org.junit.Assert.*; - -import java.util.Arrays; -import java.util.List; -import org.junit.Test; -import org.opensearch.sql.calcite.utils.WildcardReplaceUtils; - -public class WildcardReplaceUtilsTest { - - @Test - public void testWildcardMatch_prefixWildcard() { - assertEquals( - "localhost", - WildcardReplaceUtils.replaceWithWildcard("server.localhost", "*localhost", "localhost")); - } - - @Test - public void testWildcardMatch_suffixWildcard() { - assertEquals( - "server", WildcardReplaceUtils.replaceWithWildcard("server.local", "server*", "server")); - } - - @Test - public void testWildcardMatch_infixWildcard() { - assertEquals( - "replaced", WildcardReplaceUtils.replaceWithWildcard("fooXYZbar", "*XYZ*", "replaced")); - } - - @Test - public void testWildcardMatch_multipleWildcards() { - assertEquals("foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar", "* - *", "*_*")); - } - - @Test - public void testWildcardMatch_noMatch() { - String input = 
"server.example.com"; - assertEquals(input, WildcardReplaceUtils.replaceWithWildcard(input, "*localhost", "localhost")); - } - - @Test - public void testWildcardMatch_onlyWildcard() { - assertEquals("replaced", WildcardReplaceUtils.replaceWithWildcard("anything", "*", "replaced")); - } - - @Test - public void testWildcardCapture_single() { - assertEquals( - "localhost server", - WildcardReplaceUtils.replaceWithWildcard("server localhost", "* localhost", "localhost *")); - } - - @Test - public void testWildcardCapture_multiple() { - assertEquals("foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar", "* - *", "*_*")); - } - - @Test - public void testWildcardCapture_reorder() { - assertEquals( - "localhost server", - WildcardReplaceUtils.replaceWithWildcard("server localhost", "* localhost", "localhost *")); - } - - @Test - public void testWildcardSubstitute_noWildcards() { - assertEquals("fixed", WildcardReplaceUtils.replaceWithWildcard("foo bar", "* bar", "fixed")); - } - - @Test - public void testWildcardSubstitute_moreCapturesThanWildcards() { - assertEquals( - "foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar - baz", "* - * - *", "*_*")); - } - - @Test - public void testWildcard_emptyCapture() { - assertEquals( - "fixed", WildcardReplaceUtils.replaceWithWildcard("localhost", "*localhost", "fixed")); - } - - @Test - public void testWildcard_emptyCaptureWithSubstitution() { - assertEquals( - "localhost ", - WildcardReplaceUtils.replaceWithWildcard("localhost", "*localhost", "localhost *")); - } - - @Test - public void testWildcard_overlappingParts() { - assertNull(WildcardReplaceUtils.matchAndCapture("foo", "foo*foo")); - } - - @Test - public void testWildcard_consecutiveWildcards() { - // "**" splits to ["", "", ""], first captures empty, second captures rest - List captures = WildcardReplaceUtils.matchAndCapture("foobar", "**"); - assertNotNull(captures); - assertEquals(2, captures.size()); - assertEquals("", captures.get(0)); - 
assertEquals("foobar", captures.get(1)); - } - - @Test - public void testWildcard_emptyString() { - assertEquals("replacement", WildcardReplaceUtils.replaceWithWildcard("", "*", "replacement")); - } - - @Test - public void testWildcard_nullInput() { - assertNull(WildcardReplaceUtils.replaceWithWildcard(null, "*", "replacement")); - } - - @Test - public void testWildcard_singleWildcardMatchesAll() { - assertEquals( - "replaced", WildcardReplaceUtils.replaceWithWildcard("foo * bar", "*", "replaced")); - } - - @Test - public void testLiteral_noWildcards() { - assertEquals("Illinois", WildcardReplaceUtils.replaceWithWildcard("IL", "IL", "Illinois")); - } - - @Test - public void testLiteral_multipleOccurrences() { - assertEquals( - "Illinois Illinois", WildcardReplaceUtils.replaceWithWildcard("IL IL", "IL", "Illinois")); - } - - @Test - public void testLiteral_noMatch() { - String input = "California"; - assertEquals(input, WildcardReplaceUtils.replaceWithWildcard(input, "IL", "Illinois")); - } - - @Test - public void testValidation_symmetryValid_sameCount() { - WildcardReplaceUtils.validateWildcardSymmetry("* - *", "*_*"); - } - - @Test - public void testValidation_symmetryValid_zeroInReplacement() { - WildcardReplaceUtils.validateWildcardSymmetry("* - *", "fixed"); - } - - @Test - public void testValidation_symmetryInvalid_mismatch() { - try { - WildcardReplaceUtils.validateWildcardSymmetry("* - *", "*"); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException exception) { - assertTrue(exception.getMessage().contains("Wildcard count mismatch")); - assertTrue(exception.getMessage().contains("pattern has 2 wildcard(s)")); - assertTrue(exception.getMessage().contains("replacement has 1")); - } - } - - @Test - public void testValidation_symmetryValid_noWildcardsInPattern() { - WildcardReplaceUtils.validateWildcardSymmetry("foo", "bar"); - } - - @Test - public void testCountWildcards_none() { - assertEquals(0, 
WildcardReplaceUtils.countWildcards("no wildcards here")); - } - - @Test - public void testCountWildcards_single() { - assertEquals(1, WildcardReplaceUtils.countWildcards("*wildcard")); - } - - @Test - public void testCountWildcards_multiple() { - assertEquals(3, WildcardReplaceUtils.countWildcards("* - * - *")); - } - - @Test - public void testCountWildcards_consecutive() { - assertEquals(2, WildcardReplaceUtils.countWildcards("**")); - } - - @Test - public void testMatchAndCapture_prefixWildcard() { - List captures = WildcardReplaceUtils.matchAndCapture("server.localhost", "*localhost"); - assertNotNull(captures); - assertEquals(1, captures.size()); - assertEquals("server.", captures.get(0)); - } - - @Test - public void testMatchAndCapture_suffixWildcard() { - List captures = WildcardReplaceUtils.matchAndCapture("server.local", "server*"); - assertNotNull(captures); - assertEquals(1, captures.size()); - assertEquals(".local", captures.get(0)); - } - - @Test - public void testMatchAndCapture_middlePart() { - List captures = WildcardReplaceUtils.matchAndCapture("foo - bar", "* - *"); - assertNotNull(captures); - assertEquals(2, captures.size()); - assertEquals("foo", captures.get(0)); - assertEquals("bar", captures.get(1)); - } - - @Test - public void testMatchAndCapture_noMatch_wrongPrefix() { - assertNull(WildcardReplaceUtils.matchAndCapture("server.localhost", "client*")); - } - - @Test - public void testMatchAndCapture_noMatch_wrongSuffix() { - assertNull(WildcardReplaceUtils.matchAndCapture("server.localhost", "*example")); - } - - @Test - public void testMatchAndCapture_noMatch_missingMiddle() { - assertNull(WildcardReplaceUtils.matchAndCapture("foo bar", "* - *")); - } - - @Test - public void testSubstituteWildcards_single() { - assertEquals( - "prefix_foo", WildcardReplaceUtils.substituteWildcards("prefix_*", Arrays.asList("foo"))); - } - - @Test - public void testSubstituteWildcards_multiple() { - assertEquals( - "foo_bar", 
WildcardReplaceUtils.substituteWildcards("*_*", Arrays.asList("foo", "bar"))); - } - - @Test - public void testSubstituteWildcards_noWildcardsInReplacement() { - assertEquals( - "fixed", WildcardReplaceUtils.substituteWildcards("fixed", Arrays.asList("foo", "bar"))); - } - - @Test - public void testSubstituteWildcards_moreWildcardsThanCaptures() { - assertEquals("foo_", WildcardReplaceUtils.substituteWildcards("*_*", Arrays.asList("foo"))); - } - - @Test - public void testWildcardExample1_replaceSuffix() { - assertEquals( - "localhost", - WildcardReplaceUtils.replaceWithWildcard("server.localhost", "*localhost", "localhost")); - } - - @Test - public void testWildcardExample2_reorderWithCapture() { - assertEquals( - "localhost server", - WildcardReplaceUtils.replaceWithWildcard("server localhost", "* localhost", "localhost *")); - } - - @Test - public void testWildcardExample3_multipleWildcards() { - assertEquals("foo_bar", WildcardReplaceUtils.replaceWithWildcard("foo - bar", "* - *", "*_*")); - } - - @Test - public void testWildcardExample4_infixReplacement() { - assertEquals( - "fooALLbar", WildcardReplaceUtils.replaceWithWildcard("fooXYZbar", "*XYZ*", "*ALL*")); - } - - @Test - public void testEscapedAsterisk_literal() { - assertEquals( - "foo*bar", WildcardReplaceUtils.replaceWithWildcard("foo*bar", "foo\\*bar", "foo\\*bar")); - } - - @Test - public void testEscapedAsterisk_noMatch() { - assertEquals( - "fooXbar", WildcardReplaceUtils.replaceWithWildcard("fooXbar", "foo\\*bar", "replacement")); - } - - @Test - public void testEscapedBackslash_beforeWildcard() { - assertEquals( - "foo\\123", WildcardReplaceUtils.replaceWithWildcard("foo\\abc", "foo\\\\*", "foo\\\\123")); - } - - @Test - public void testEscapedBackslash_literal() { - assertEquals( - "foo\\bar", - WildcardReplaceUtils.replaceWithWildcard("foo\\bar", "foo\\\\bar", "foo\\\\bar")); - } - - @Test - public void testMixedEscapes_asteriskAndBackslash() { - assertEquals( - "price: *special* $100\\ea", - 
WildcardReplaceUtils.replaceWithWildcard( - "price: *special* $100\\ea", - "price: \\*special\\* $*\\\\*", - "price: \\*special\\* $*\\\\*")); - } - - @Test - public void testEscapedAsterisk_withWildcard_capture() { - assertEquals( - "file*.prefix-123", - WildcardReplaceUtils.replaceWithWildcard("file123.txt", "file*.*", "file\\*.prefix-*")); - } - - @Test - public void testTrailingBackslash_shouldFail() { - try { - WildcardReplaceUtils.replaceWithWildcard("foo", "foo\\", "bar"); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException exception) { - assertTrue(exception.getMessage().contains("Invalid escape sequence")); - } - } - - @Test - public void testOnlyEscapedAsterisks_noWildcards() { - assertEquals("***", WildcardReplaceUtils.replaceWithWildcard("***", "\\*\\*\\*", "\\*\\*\\*")); - } - - @Test - public void testDoubleBackslashBeforeAsterisk() { - assertEquals( - "foo\\bar", WildcardReplaceUtils.replaceWithWildcard("foo\\abc", "foo\\\\*", "foo\\\\bar")); - } - - @Test - public void testCountWildcards_withEscapes() { - assertEquals(2, WildcardReplaceUtils.countWildcards("foo\\*bar*baz*")); - } - - @Test - public void testCountWildcards_allEscaped() { - assertEquals(0, WildcardReplaceUtils.countWildcards("\\*\\*\\*")); - } - - @Test - public void testValidation_escapedWildcardsNotCounted() { - WildcardReplaceUtils.validateWildcardSymmetry("foo\\**", "bar*"); - } -} From ffbe1d154b97814e14bad2bddc3ac18821abe485 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 30 Oct 2025 13:53:45 -0700 Subject: [PATCH 09/12] add ExplainIT Signed-off-by: Kai Huang # Conflicts: # integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java # Conflicts: # integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java --- .../sql/calcite/remote/CalciteExplainIT.java | 11 +++++++++++ .../calcite/explain_replace_wildcard.yaml | 8 ++++++++ .../calcite_no_pushdown/explain_replace_wildcard.yaml | 9 +++++++++ 3 files changed, 
28 insertions(+) create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_replace_wildcard.yaml create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_replace_wildcard.yaml diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index 77f3a45cc07..6da047e0c20 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -1335,6 +1335,17 @@ public void testReplaceCommandExplain() throws IOException { TEST_INDEX_ACCOUNT))); } + @Test + public void testReplaceCommandWildcardExplain() throws IOException { + String expected = loadExpectedPlan("explain_replace_wildcard.yaml"); + assertYamlEqualsIgnoreId( + expected, + explainQueryYaml( + String.format( + "source=%s | replace '*L' WITH 'STATE_IL' IN state | fields state", + TEST_INDEX_ACCOUNT))); + } + @Test public void testExplainRareCommandUseNull() throws IOException { String expected = loadExpectedPlan("explain_rare_usenull_false.yaml"); diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_replace_wildcard.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_replace_wildcard.yaml new file mode 100644 index 00000000000..0407849a472 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_replace_wildcard.yaml @@ -0,0 +1,8 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(state=[REGEXP_REPLACE($7, '^\Q\E(.*?)\QL\E$':VARCHAR, 'STATE_IL':VARCHAR)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableCalc(expr#0=[{inputs}], expr#1=['^\Q\E(.*?)\QL\E$':VARCHAR], expr#2=['STATE_IL':VARCHAR], expr#3=[REGEXP_REPLACE($t0, $t1, $t2)], $f0=[$t3]) + 
CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[state], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","_source":{"includes":["state"],"excludes":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_replace_wildcard.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_replace_wildcard.yaml new file mode 100644 index 00000000000..194f680adf2 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_replace_wildcard.yaml @@ -0,0 +1,9 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(state=[REGEXP_REPLACE($7, '^\Q\E(.*?)\QL\E$':VARCHAR, 'STATE_IL':VARCHAR)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=['^\Q\E(.*?)\QL\E$':VARCHAR], expr#18=['STATE_IL':VARCHAR], expr#19=[REGEXP_REPLACE($t7, $t17, $t18)], state=[$t19]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) \ No newline at end of file From 0be5611e3c0acb9604e543ddf73c3fe27f7a054e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 30 Oct 2025 15:29:07 -0700 Subject: [PATCH 10/12] refactoring and trimming Signed-off-by: Kai Huang --- .../sql/calcite/CalciteRelNodeVisitor.java | 131 +---------------- .../sql/calcite/utils/WildcardUtils.java | 138 +++++++++++++++++ docs/user/ppl/cmd/replace.rst | 66 +-------- .../remote/CalciteReplaceCommandIT.java | 139 ------------------ 4 files changed, 148 insertions(+), 326 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 
53f0a8ba37a..09ad5d4009a 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -2859,15 +2859,12 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { String replacementStr = pair.getReplacement().getValue().toString(); if (patternStr.contains("*")) { - // Wildcard pattern: convert to regex at planning time - validateWildcardSymmetry(patternStr, replacementStr); + WildcardUtils.validateWildcardSymmetry(patternStr, replacementStr); - // Convert wildcard pattern to regex pattern (e.g., "*ada" → "^(.*?)ada$") - String regexPattern = convertWildcardPatternToRegex(patternStr); - // Convert wildcard replacement to regex replacement (e.g., "*_*" → "$1_$2") - String regexReplacement = convertWildcardReplacementToRegex(replacementStr); + String regexPattern = WildcardUtils.convertWildcardPatternToRegex(patternStr); + String regexReplacement = + WildcardUtils.convertWildcardReplacementToRegex(replacementStr); - // Create regex pattern and replacement literals RexNode regexPatternNode = context.rexBuilder.makeLiteral( regexPattern, @@ -2879,7 +2876,6 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { context.rexBuilder.getTypeFactory().createSqlType(SqlTypeName.VARCHAR), true); - // Use Calcite's REGEXP_REPLACE operator fieldRef = context.rexBuilder.makeCall( org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE_3, @@ -2887,7 +2883,6 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { regexPatternNode, regexReplacementNode); } else { - // Literal pattern: use standard REPLACE fieldRef = context.relBuilder.call( SqlStdOperatorTable.REPLACE, fieldRef, patternNode, replacementNode); @@ -2905,124 +2900,6 @@ public RelNode visitReplace(Replace node, CalcitePlanContext context) { return context.relBuilder.peek(); } - // ============ Wildcard-to-Regex Conversion Utilities ============ - - /** - * 
Convert a wildcard pattern to a regex pattern string. Example: "*ada" → "^(.*?)ada$" Example: - * "* - *" → "^(.*?) - (.*?)$" Handles escape sequences: \* (literal asterisk), \\ (literal - * backslash) - */ - private static String convertWildcardPatternToRegex(String wildcardPattern) { - String[] parts = splitWildcards(wildcardPattern); - StringBuilder regexBuilder = new StringBuilder("^"); - - for (int i = 0; i < parts.length; i++) { - regexBuilder.append(java.util.regex.Pattern.quote(parts[i])); - if (i < parts.length - 1) { - regexBuilder.append("(.*?)"); // Non-greedy capture group for wildcard - } - } - regexBuilder.append("$"); - - return regexBuilder.toString(); - } - - /** - * Convert a wildcard replacement to a regex replacement string. Example: "*_*" → "$1_$2" Example: - * "SELLER" → "SELLER" (no wildcards) Handles escape sequences: \* (literal asterisk), \\ (literal - * backslash) - */ - private static String convertWildcardReplacementToRegex(String wildcardReplacement) { - if (!wildcardReplacement.contains("*")) { - return wildcardReplacement; // No wildcards = literal replacement - } - - StringBuilder result = new StringBuilder(); - int captureIndex = 1; // Regex capture groups start at $1 - boolean escaped = false; - - for (char c : wildcardReplacement.toCharArray()) { - if (escaped) { - // Handle escape sequences: \* or \\ - result.append(c); - escaped = false; - } else if (c == '\\') { - escaped = true; - } else if (c == '*') { - // Replace wildcard with $1, $2, etc. - result.append('$').append(captureIndex++); - } else { - result.append(c); - } - } - - return result.toString(); - } - - /** - * Split pattern on unescaped wildcards, handling escape sequences. 
Supports: \* (literal - * asterisk), \\ (literal backslash) - */ - private static String[] splitWildcards(String pattern) { - List parts = new ArrayList<>(); - StringBuilder current = new StringBuilder(); - boolean escaped = false; - - for (char c : pattern.toCharArray()) { - if (escaped) { - current.append(c); - escaped = false; - } else if (c == '\\') { - escaped = true; - } else if (c == '*') { - parts.add(current.toString()); - current = new StringBuilder(); - } else { - current.append(c); - } - } - - if (escaped) { - throw new IllegalArgumentException( - "Invalid escape sequence: pattern ends with unescaped backslash"); - } - - parts.add(current.toString()); - return parts.toArray(new String[0]); - } - - /** Count the number of unescaped wildcards in a string. */ - private static int countWildcards(String str) { - int count = 0; - boolean escaped = false; - for (char c : str.toCharArray()) { - if (escaped) { - escaped = false; - } else if (c == '\\') { - escaped = true; - } else if (c == '*') { - count++; - } - } - return count; - } - - /** Validate wildcard symmetry between pattern and replacement. */ - private static void validateWildcardSymmetry(String pattern, String replacement) { - int patternWildcards = countWildcards(pattern); - int replacementWildcards = countWildcards(replacement); - - if (replacementWildcards != 0 && replacementWildcards != patternWildcards) { - throw new IllegalArgumentException( - String.format( - "Error in 'replace' command: Wildcard count mismatch - pattern has %d wildcard(s), " - + "replacement has %d. 
Replacement must have same number of wildcards or none.", - patternWildcards, replacementWildcards)); - } - } - - // ============ End Wildcard Utilities ============ - private void buildParseRelNode(Parse node, CalcitePlanContext context) { RexNode sourceField = rexVisitor.analyze(node.getSourceField(), context); ParseMethod parseMethod = node.getParseMethod(); diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java index 09552e97109..a54875e95f6 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java @@ -5,6 +5,7 @@ package org.opensearch.sql.calcite.utils; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; @@ -92,4 +93,141 @@ private static boolean matchesCompiledPattern(String[] parts, String fieldName) public static boolean containsWildcard(String str) { return str != null && str.contains(WILDCARD); } + + /** + * Converts a wildcard pattern to a regex pattern. + * + *

    Example: "*ada" → "^(.*?)ada$" + * + * @param wildcardPattern wildcard pattern with '*' and escape sequences (\*, \\) + * @return regex pattern with capture groups + */ + public static String convertWildcardPatternToRegex(String wildcardPattern) { + String[] parts = splitWildcards(wildcardPattern); + StringBuilder regexBuilder = new StringBuilder("^"); + + for (int i = 0; i < parts.length; i++) { + regexBuilder.append(java.util.regex.Pattern.quote(parts[i])); + if (i < parts.length - 1) { + regexBuilder.append("(.*?)"); // Non-greedy capture group for wildcard + } + } + regexBuilder.append("$"); + + return regexBuilder.toString(); + } + + /** + * Converts a wildcard replacement string to a regex replacement string. + * + *

    Example: "*_*" → "$1_$2" + * + * @param wildcardReplacement replacement string with '*' and escape sequences (\*, \\) + * @return regex replacement string with capture group references + */ + public static String convertWildcardReplacementToRegex(String wildcardReplacement) { + if (!wildcardReplacement.contains("*")) { + return wildcardReplacement; // No wildcards = literal replacement + } + + StringBuilder result = new StringBuilder(); + int captureIndex = 1; // Regex capture groups start at $1 + boolean escaped = false; + + for (char c : wildcardReplacement.toCharArray()) { + if (escaped) { + // Handle escape sequences: \* or \\ + result.append(c); + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + // Replace wildcard with $1, $2, etc. + result.append('$').append(captureIndex++); + } else { + result.append(c); + } + } + + return result.toString(); + } + + /** + * Splits a wildcard pattern into parts separated by unescaped wildcards. + * + *

    Example: "a*b*c" → ["a", "b", "c"] + * + * @param pattern wildcard pattern with escape sequences + * @return array of pattern parts + */ + public static String[] splitWildcards(String pattern) { + List parts = new ArrayList<>(); + StringBuilder current = new StringBuilder(); + boolean escaped = false; + + for (char c : pattern.toCharArray()) { + if (escaped) { + current.append(c); + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + parts.add(current.toString()); + current = new StringBuilder(); + } else { + current.append(c); + } + } + + if (escaped) { + throw new IllegalArgumentException( + "Invalid escape sequence: pattern ends with unescaped backslash"); + } + + parts.add(current.toString()); + return parts.toArray(new String[0]); + } + + /** + * Counts the number of unescaped wildcards in a string. + * + * @param str string to count wildcards in + * @return number of unescaped wildcards + */ + public static int countWildcards(String str) { + int count = 0; + boolean escaped = false; + for (char c : str.toCharArray()) { + if (escaped) { + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == '*') { + count++; + } + } + return count; + } + + /** + * Validates that wildcard count is symmetric between pattern and replacement. + * + *

    Replacement must have either the same number of wildcards as the pattern, or zero wildcards. + * + * @param pattern wildcard pattern + * @param replacement wildcard replacement + * @throws IllegalArgumentException if wildcard counts are mismatched + */ + public static void validateWildcardSymmetry(String pattern, String replacement) { + int patternWildcards = countWildcards(pattern); + int replacementWildcards = countWildcards(replacement); + + if (replacementWildcards != 0 && replacementWildcards != patternWildcards) { + throw new IllegalArgumentException( + String.format( + "Error in 'replace' command: Wildcard count mismatch - pattern has %d wildcard(s), " + + "replacement has %d. Replacement must have same number of wildcards or none.", + patternWildcards, replacementWildcards)); + } + } } diff --git a/docs/user/ppl/cmd/replace.rst b/docs/user/ppl/cmd/replace.rst index b29f8b10e22..5ee78fcfb2c 100644 --- a/docs/user/ppl/cmd/replace.rst +++ b/docs/user/ppl/cmd/replace.rst @@ -11,7 +11,7 @@ replace Description ============ -Using ``replace`` command to replace text in one or more fields in the search result. Supports both literal string replacement and wildcard pattern matching. +Using ``replace`` command to replace text in one or more fields. Supports literal string replacement and wildcard patterns using ``*``. Note: This command is only available when Calcite engine is enabled. @@ -21,22 +21,6 @@ Syntax replace '' WITH '' [, '' WITH '']... IN [, ]... -Parameters -========== -* **pattern**: mandatory. The text pattern you want to replace. Supports: - - - Plain text literals for exact matching - - Wildcard patterns using ``*`` (asterisk) to match zero or more characters - -* **replacement**: mandatory. The text you want to replace with. 
When using wildcards: - - - Can contain ``*`` to substitute captured wildcard portions - - Must have the same number of wildcards as the pattern, or zero wildcards - - Wildcards in replacement are substituted with values captured from the pattern match - -* **field-name**: mandatory. One or more field names where the replacement should occur. - - Examples ======== @@ -129,11 +113,6 @@ PPL query:: +-----------------+-------+--------+-----+--------+ -Wildcard Pattern Matching -========================== - -The replace command supports wildcard patterns using ``*`` (asterisk) to match zero or more characters. This provides flexible pattern matching for text transformation. - Example 6: Wildcard suffix match --------------------------------- @@ -229,42 +208,10 @@ PPL query:: +----------+ -Wildcard Rules -============== - -When using wildcards in the replace command: - -* **Wildcard character**: Use ``*`` to match zero or more characters -* **Symmetry requirement**: The replacement must have the same number of wildcards as the pattern, OR zero wildcards -* **Substitution order**: Wildcards in replacement are substituted left-to-right with values captured from pattern -* **No match behavior**: If pattern doesn't match, the original value is returned unchanged -* **Case sensitivity**: Wildcard matching is case-sensitive - -**Valid wildcard pairs:** - -* Pattern: ``"*ada"`` (1 wildcard), Replacement: ``"CA"`` (0 wildcards) ✓ -* Pattern: ``"* localhost"`` (1 wildcard), Replacement: ``"localhost *"`` (1 wildcard) ✓ -* Pattern: ``"* - *"`` (2 wildcards), Replacement: ``"*_*"`` (2 wildcards) ✓ - -**Invalid wildcard pair:** - -* Pattern: ``"* - *"`` (2 wildcards), Replacement: ``"*"`` (1 wildcard) ✗ (mismatch error) - - -Escape Sequences -================ - -To match or replace literal asterisks or backslashes in your data, use escape sequences: - -* ``\*`` - Matches a literal asterisk character -* ``\\`` - Matches a literal backslash character - -Without escapes, asterisks are 
interpreted as wildcards. - Example 11: Matching literal asterisks --------------------------------------- -Match and replace literal asterisk characters in data. +Use ``\*`` to match literal asterisk characters (``\*`` = literal asterisk, ``\\`` = literal backslash). PPL query:: @@ -318,8 +265,7 @@ PPL query:: Limitations =========== -* Pattern and replacement values must be string literals. -* The replace command modifies the specified fields in-place. -* Wildcard matching is case-sensitive. -* Regular expressions are not supported (only simple wildcard patterns with ``*``). -* Use backslash escape sequences (``\*``, ``\\``) to match literal asterisks or backslashes. \ No newline at end of file +* Pattern and replacement must be string literals +* Wildcards: ``*`` matches zero or more characters (case-sensitive) +* Replacement wildcards must match pattern wildcard count, or be zero +* Escape sequences: ``\*`` (literal asterisk), ``\\`` (literal backslash) \ No newline at end of file diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java index 55aa6162f58..44cc4a3aaf0 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java @@ -357,145 +357,6 @@ public void testWildcardReplace_symmetryMismatch_shouldFail() { verifyErrorMessageContains(e, "Wildcard count mismatch"); } - @Test - public void testWildcardReplace_multipleFields() throws IOException { - // Test wildcard replacement across multiple fields - // Pattern "*A" should match "USA" in country - JSONObject result = - executeQuery( - String.format( - "source = %s | replace '*A' WITH 'United States' IN country, name | fields name," - + " country", - TEST_INDEX_STATE_COUNTRY)); - - verifySchema(result, schema("name", "string"), schema("country", 
"string")); - - verifyDataRows( - result, - rows("Jake", "United States"), - rows("Hello", "United States"), - rows("John", "Canada"), - rows("Jane", "Canada")); - } - - @Test - public void testWildcardReplace_internalField() throws IOException { - // Test wildcard replacement on internal fields - // Replace pattern in _index field - JSONObject result = - executeQuery( - String.format( - "source = %s | replace '*country' WITH 'test_index' IN _index | fields name," - + " _index", - TEST_INDEX_STATE_COUNTRY)); - - verifySchema(result, schema("name", "string"), schema("_index", "string")); - - // All rows should have _index replaced since it matches "*country" - verifyDataRows( - result, - rows("Jake", "test_index"), - rows("Hello", "test_index"), - rows("John", "test_index"), - rows("Jane", "test_index")); - } - - @Test - public void testWildcardReplace_multiplePairsWithWildcards() throws IOException { - // Test multiple wildcard pattern pairs in a single command - JSONObject result = - executeQuery( - String.format( - "source = %s | replace '*A' WITH 'United States', '*ada' WITH 'CA' IN country |" - + " fields name, country", - TEST_INDEX_STATE_COUNTRY)); - - verifySchema(result, schema("name", "string"), schema("country", "string")); - - // First pair: "*A" matches "USA" → "United States" - // Second pair: "*ada" matches "Canada" → "CA" - verifyDataRows( - result, - rows("Jake", "United States"), - rows("Hello", "United States"), - rows("John", "CA"), - rows("Jane", "CA")); - } - - @Test - public void testWildcardReplace_withSort() throws IOException { - // Test wildcard replacement followed by sort command - JSONObject result = - executeQuery( - String.format( - "source = %s | replace '*A' WITH 'United States' IN country | fields name," - + " country | sort country", - TEST_INDEX_STATE_COUNTRY)); - - verifySchema(result, schema("name", "string"), schema("country", "string")); - - // Results should be sorted by country after wildcard replacement - verifyDataRows( - 
result, - rows("John", "Canada"), - rows("Jane", "Canada"), - rows("Jake", "United States"), - rows("Hello", "United States")); - } - - @Test - public void testWildcardReplace_withWhereClause() throws IOException { - // Test wildcard replacement with where clause filtering - JSONObject result = - executeQuery( - String.format( - "source = %s | where country = 'USA' | replace 'US*' WITH 'United States' IN" - + " country | fields name, country", - TEST_INDEX_STATE_COUNTRY)); - - verifySchema(result, schema("name", "string"), schema("country", "string")); - - // Only rows where country = 'USA' should be processed - verifyDataRows(result, rows("Jake", "United States"), rows("Hello", "United States")); - } - - @Test - public void testWildcardReplace_nullValues() throws IOException { - // Test wildcard replacement behavior with null field values - // Use a query that might have null values in results - JSONObject result = - executeQuery( - String.format( - "source = %s | replace '*' WITH 'N/A' IN country | fields name, country | head 2", - TEST_INDEX_STATE_COUNTRY)); - - verifySchema(result, schema("name", "string"), schema("country", "string")); - - // Wildcard pattern "*" matches everything, so all non-null values are replaced with "N/A" - verifyDataRows(result, rows("Jake", "N/A"), rows("Hello", "N/A")); - } - - @Test - public void testWildcardReplace_emptyStringIntegration() throws IOException { - // Integration test for empty string replacement with wildcards - // Replace the entire country value with empty string - JSONObject result = - executeQuery( - String.format( - "source = %s | replace '*A' WITH '' IN country | fields name, country", - TEST_INDEX_STATE_COUNTRY)); - - verifySchema(result, schema("name", "string"), schema("country", "string")); - - // "*A" matches "USA" → empty string, "Canada" stays unchanged - verifyDataRows( - result, - rows("Jake", ""), - rows("Hello", ""), - rows("John", "Canada"), - rows("Jane", "Canada")); - } - @Test public void 
testEscapeSequence_literalAsterisk() throws IOException { // Test matching literal asterisks in data using \* escape sequence From e60720e9ecaaf57ac5991e8ba499adf6b34d7ea4 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 30 Oct 2025 15:32:56 -0700 Subject: [PATCH 11/12] doc Signed-off-by: Kai Huang --- docs/user/ppl/cmd/replace.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/user/ppl/cmd/replace.rst b/docs/user/ppl/cmd/replace.rst index 5ee78fcfb2c..0098124344d 100644 --- a/docs/user/ppl/cmd/replace.rst +++ b/docs/user/ppl/cmd/replace.rst @@ -265,7 +265,6 @@ PPL query:: Limitations =========== -* Pattern and replacement must be string literals * Wildcards: ``*`` matches zero or more characters (case-sensitive) * Replacement wildcards must match pattern wildcard count, or be zero * Escape sequences: ``\*`` (literal asterisk), ``\\`` (literal backslash) \ No newline at end of file From 1ddce432bd8759f15d5c8dddb4d5e6790a3c0720 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 4 Nov 2025 09:51:54 -0800 Subject: [PATCH 12/12] add tests Signed-off-by: Kai Huang --- .../sql/calcite/utils/WildcardUtils.java | 4 +- .../sql/calcite/utils/WildcardUtilsTest.java | 183 ++++++++++++++++++ 2 files changed, 185 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java index a54875e95f6..8558a5292b7 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java +++ b/core/src/main/java/org/opensearch/sql/calcite/utils/WildcardUtils.java @@ -160,7 +160,7 @@ public static String convertWildcardReplacementToRegex(String wildcardReplacemen * @param pattern wildcard pattern with escape sequences * @return array of pattern parts */ - public static String[] splitWildcards(String pattern) { + private static String[] splitWildcards(String pattern) { List parts = new ArrayList<>(); StringBuilder current = new StringBuilder(); 
boolean escaped = false; @@ -194,7 +194,7 @@ public static String[] splitWildcards(String pattern) { * @param str string to count wildcards in * @return number of unescaped wildcards */ - public static int countWildcards(String str) { + private static int countWildcards(String str) { int count = 0; boolean escaped = false; for (char c : str.toCharArray()) { diff --git a/core/src/test/java/org/opensearch/sql/calcite/utils/WildcardUtilsTest.java b/core/src/test/java/org/opensearch/sql/calcite/utils/WildcardUtilsTest.java index 53cc1d5163c..2e41de018a5 100644 --- a/core/src/test/java/org/opensearch/sql/calcite/utils/WildcardUtilsTest.java +++ b/core/src/test/java/org/opensearch/sql/calcite/utils/WildcardUtilsTest.java @@ -5,6 +5,11 @@ package org.opensearch.sql.calcite.utils; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + import com.google.common.collect.ImmutableList; import java.util.List; import org.junit.jupiter.api.BeforeEach; @@ -74,6 +79,32 @@ void testMatchesWildcardPattern() { testPattern("*a*e", "city", false); } + @Test + void testMatchesWildcardPatternEdgeCases() { + // Test null handling + assertFalse(WildcardUtils.matchesWildcardPattern(null, "field")); + assertFalse(WildcardUtils.matchesWildcardPattern("pattern", null)); + assertFalse(WildcardUtils.matchesWildcardPattern(null, null)); + + // Test empty strings + assertTrue(WildcardUtils.matchesWildcardPattern("", "")); + assertFalse(WildcardUtils.matchesWildcardPattern("", "field")); + assertFalse(WildcardUtils.matchesWildcardPattern("field", "")); + + // Test single wildcard + assertTrue(WildcardUtils.matchesWildcardPattern("*", "anything")); + assertTrue(WildcardUtils.matchesWildcardPattern("*", "")); + + // Test multiple consecutive wildcards + assertTrue(WildcardUtils.matchesWildcardPattern("**", 
"field")); + assertTrue(WildcardUtils.matchesWildcardPattern("a**b", "ab")); + assertTrue(WildcardUtils.matchesWildcardPattern("a**b", "axxxb")); + + // Test wildcards at start and end + assertTrue(WildcardUtils.matchesWildcardPattern("*field*", "myfield123")); + assertTrue(WildcardUtils.matchesWildcardPattern("*field*", "field")); + } + @Test void testExpandWildcardPattern() { // Test exact match @@ -97,6 +128,20 @@ void testExpandWildcardPattern() { testExpansion("XYZ*", ImmutableList.of()); } + @Test + void testExpandWildcardPatternEdgeCases() { + // Test null handling + assertEquals(List.of(), WildcardUtils.expandWildcardPattern(null, availableFields)); + assertEquals(List.of(), WildcardUtils.expandWildcardPattern("pattern", null)); + assertEquals(List.of(), WildcardUtils.expandWildcardPattern(null, null)); + + // Test empty list + assertEquals(List.of(), WildcardUtils.expandWildcardPattern("*", List.of())); + + // Test single wildcard matches all + assertEquals(availableFields, WildcardUtils.expandWildcardPattern("*", availableFields)); + } + @Test void testContainsWildcard() { // Test with wildcard @@ -108,4 +153,142 @@ void testContainsWildcard() { testContainsWildcard("field", false); testContainsWildcard("", false); } + + @Test + void testContainsWildcardEdgeCases() { + // Test null + assertFalse(WildcardUtils.containsWildcard(null)); + + // Test multiple wildcards + assertTrue(WildcardUtils.containsWildcard("**")); + assertTrue(WildcardUtils.containsWildcard("a*b*c")); + } + + @Test + void testConvertWildcardPatternToRegex() { + // Basic patterns + assertEquals("^\\Qada\\E$", WildcardUtils.convertWildcardPatternToRegex("ada")); + assertEquals("^\\Q\\E(.*?)\\Qada\\E$", WildcardUtils.convertWildcardPatternToRegex("*ada")); + assertEquals("^\\Qada\\E(.*?)\\Q\\E$", WildcardUtils.convertWildcardPatternToRegex("ada*")); + assertEquals( + "^\\Q\\E(.*?)\\Qada\\E(.*?)\\Q\\E$", WildcardUtils.convertWildcardPatternToRegex("*ada*")); + + // Multiple wildcards + 
assertEquals( + "^\\Qa\\E(.*?)\\Qb\\E(.*?)\\Qc\\E$", WildcardUtils.convertWildcardPatternToRegex("a*b*c")); + + // Pattern with special regex characters + assertEquals( + "^\\Qa.b\\E(.*?)\\Qc+d\\E$", WildcardUtils.convertWildcardPatternToRegex("a.b*c+d")); + + // Single wildcard + assertEquals("^\\Q\\E(.*?)\\Q\\E$", WildcardUtils.convertWildcardPatternToRegex("*")); + + // Empty pattern + assertEquals("^\\Q\\E$", WildcardUtils.convertWildcardPatternToRegex("")); + + // Invalid pattern with trailing backslash should throw + IllegalArgumentException ex = + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.convertWildcardPatternToRegex("pattern\\")); + assertTrue(ex.getMessage().contains("Invalid escape sequence")); + } + + @Test + void testConvertWildcardReplacementToRegex() { + // No wildcards - literal replacement + assertEquals("ada", WildcardUtils.convertWildcardReplacementToRegex("ada")); + assertEquals("test_value", WildcardUtils.convertWildcardReplacementToRegex("test_value")); + + // Single wildcard + assertEquals("$1", WildcardUtils.convertWildcardReplacementToRegex("*")); + + // Wildcards with text + assertEquals("$1_$2", WildcardUtils.convertWildcardReplacementToRegex("*_*")); + assertEquals("prefix_$1", WildcardUtils.convertWildcardReplacementToRegex("prefix_*")); + assertEquals("$1_suffix", WildcardUtils.convertWildcardReplacementToRegex("*_suffix")); + + // Multiple wildcards + assertEquals("$1_$2_$3", WildcardUtils.convertWildcardReplacementToRegex("*_*_*")); + + // Empty string + assertEquals("", WildcardUtils.convertWildcardReplacementToRegex("")); + } + + @Test + void testConvertWildcardReplacementToRegexWithEscapes() { + // Escaped wildcard should be treated as literal + assertEquals("*", WildcardUtils.convertWildcardReplacementToRegex("\\*")); // \* -> * + assertEquals("$1_*", WildcardUtils.convertWildcardReplacementToRegex("*_\\*")); + assertEquals("*_$1", WildcardUtils.convertWildcardReplacementToRegex("\\*_*")); + + // 
Escaped backslash when there's no wildcard - returned unchanged + assertEquals("\\\\", WildcardUtils.convertWildcardReplacementToRegex("\\\\")); + + // Mixed escaped and unescaped wildcards + assertEquals("$1_*_$2", WildcardUtils.convertWildcardReplacementToRegex("*_\\*_*")); + assertEquals("$1\\$2", WildcardUtils.convertWildcardReplacementToRegex("*\\\\*")); // \\ -> \ + } + + @Test + void testValidateWildcardSymmetry() { + // Valid: same number of wildcards + WildcardUtils.validateWildcardSymmetry("*", "*"); + WildcardUtils.validateWildcardSymmetry("*ada*", "*_*"); + WildcardUtils.validateWildcardSymmetry("a*b*c", "x*y*z"); + + // Valid: replacement has no wildcards (literal replacement) + WildcardUtils.validateWildcardSymmetry("*", "literal"); + WildcardUtils.validateWildcardSymmetry("*ada*", "replacement"); + WildcardUtils.validateWildcardSymmetry("a*b*c", "xyz"); + + // Valid: pattern has no wildcards + WildcardUtils.validateWildcardSymmetry("ada", "replacement"); + } + + @Test + void testValidateWildcardSymmetryFailure() { + // Invalid: mismatched wildcard counts + IllegalArgumentException ex1 = + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("*", "**")); + assertTrue(ex1.getMessage().contains("Wildcard count mismatch")); + assertTrue(ex1.getMessage().contains("pattern has 1 wildcard(s)")); + assertTrue(ex1.getMessage().contains("replacement has 2")); + + IllegalArgumentException ex2 = + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("*a*b*", "*_*")); + assertTrue(ex2.getMessage().contains("pattern has 3 wildcard(s)")); + assertTrue(ex2.getMessage().contains("replacement has 2")); + + IllegalArgumentException ex3 = + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("ada", "*")); + assertTrue(ex3.getMessage().contains("pattern has 0 wildcard(s)")); + assertTrue(ex3.getMessage().contains("replacement has 1")); + } + + 
@Test + void testValidateWildcardSymmetryWithEscapes() { + // Escaped wildcards should not count + WildcardUtils.validateWildcardSymmetry("\\*", "literal"); // 0 wildcards in pattern + WildcardUtils.validateWildcardSymmetry("*\\*", "*"); // 1 wildcard in both + + // Pattern with 2 wildcards, replacement with 1 wildcard (middle one in \\**\\*) + WildcardUtils.validateWildcardSymmetry("*", "\\**\\*"); // 1 wildcard in both + + // Should fail when unescaped counts don't match + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("*a*", "*\\*")); // 2 vs 1 + + assertThrows( + IllegalArgumentException.class, + () -> WildcardUtils.validateWildcardSymmetry("*a*", "\\**\\*")); // 2 vs 1 + } }