Skip to content

Commit 83ff62f

Browse files
committed
handle asterisks
Signed-off-by: Kai Huang <[email protected]>
1 parent b14a74b commit 83ff62f

File tree

4 files changed

+268
-12
lines changed

4 files changed

+268
-12
lines changed

core/src/main/java/org/opensearch/sql/calcite/utils/WildcardReplaceUtils.java

Lines changed: 78 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,8 @@
1919
* <p>Supports wildcard patterns using '*' to match zero or more characters. Wildcards in the
2020
* replacement string are substituted with values captured from the pattern match.
2121
*
22-
* <p>Limitation: Literal asterisk characters cannot be matched or replaced when using wildcard
23-
* patterns. To replace literal asterisks in data, use non-wildcard (literal) replacement mode by
24-
* not including '*' in the pattern string.
22+
* <p>Escape sequences: Use '\*' to match literal asterisks and '\\' to match literal backslashes.
23+
* An unescaped '*' is interpreted as a wildcard; a trailing unescaped backslash is rejected.
2524
*/
2625
public class WildcardReplaceUtils {
2726

@@ -42,6 +41,9 @@ public static String replaceWithWildcard(String input, String pattern, String re
4241
return null;
4342
}
4443

44+
validateEscapeSequences(pattern);
45+
validateEscapeSequences(replacement);
46+
4547
if (!pattern.contains("*")) {
4648
return input.replace(pattern, replacement);
4749
}
@@ -54,6 +56,22 @@ public static String replaceWithWildcard(String input, String pattern, String re
5456
return substituteWildcards(replacement, captures);
5557
}
5658

59+
/** Validate that the string does not end with an unescaped (dangling) backslash. */
60+
private static void validateEscapeSequences(String str) {
61+
boolean escaped = false;
62+
for (char c : str.toCharArray()) {
63+
if (escaped) {
64+
escaped = false;
65+
} else if (c == '\\') {
66+
escaped = true;
67+
}
68+
}
69+
if (escaped) {
70+
throw new IllegalArgumentException(
71+
"Invalid escape sequence: pattern ends with unescaped backslash");
72+
}
73+
}
74+
5775
/** Match pattern against input and capture wildcard portions. */
5876
public static List<String> matchAndCapture(String input, String pattern) {
5977
Pattern compiledPattern =
@@ -71,9 +89,46 @@ public static List<String> matchAndCapture(String input, String pattern) {
7189
return captures;
7290
}
7391

92+
/**
93+
* Split pattern on unescaped wildcards, handling escape sequences.
94+
*
95+
* <p>Supports: \* (literal asterisk), \\ (literal backslash)
96+
*
97+
* @param pattern Wildcard pattern with potential escapes
98+
* @return Array of literal parts between unescaped wildcards, with escape sequences resolved
99+
* @throws IllegalArgumentException if pattern ends with unescaped backslash
100+
*/
101+
private static String[] splitWildcards(String pattern) {
102+
List<String> parts = new ArrayList<>();
103+
StringBuilder current = new StringBuilder();
104+
boolean escaped = false;
105+
106+
for (char c : pattern.toCharArray()) {
107+
if (escaped) {
108+
current.append(c);
109+
escaped = false;
110+
} else if (c == '\\') {
111+
escaped = true;
112+
} else if (c == '*') {
113+
parts.add(current.toString());
114+
current = new StringBuilder();
115+
} else {
116+
current.append(c);
117+
}
118+
}
119+
120+
if (escaped) {
121+
throw new IllegalArgumentException(
122+
"Invalid escape sequence: pattern ends with unescaped backslash");
123+
}
124+
125+
parts.add(current.toString());
126+
return parts.toArray(new String[0]);
127+
}
128+
74129
/** Compile a wildcard pattern to a regex Pattern. */
75130
private static Pattern compileWildcardPattern(String pattern) {
76-
String[] parts = pattern.split("\\*", -1);
131+
String[] parts = splitWildcards(pattern);
77132
StringBuilder regexBuilder = new StringBuilder("^");
78133

79134
for (int i = 0; i < parts.length; i++) {
@@ -89,15 +144,17 @@ private static Pattern compileWildcardPattern(String pattern) {
89144

90145
/** Substitute wildcards in replacement string with captured values. */
91146
public static String substituteWildcards(String replacement, List<String> captures) {
92-
if (!replacement.contains("*")) {
93-
return replacement;
94-
}
95-
96147
StringBuilder result = new StringBuilder();
97148
int captureIndex = 0;
149+
boolean escaped = false;
98150

99151
for (char c : replacement.toCharArray()) {
100-
if (c == '*') {
152+
if (escaped) {
153+
result.append(c);
154+
escaped = false;
155+
} else if (c == '\\') {
156+
escaped = true;
157+
} else if (c == '*') {
101158
if (captureIndex < captures.size()) {
102159
result.append(captures.get(captureIndex));
103160
captureIndex++;
@@ -107,14 +164,24 @@ public static String substituteWildcards(String replacement, List<String> captur
107164
}
108165
}
109166

167+
if (escaped) {
168+
throw new IllegalArgumentException(
169+
"Invalid escape sequence: replacement ends with unescaped backslash");
170+
}
171+
110172
return result.toString();
111173
}
112174

113-
/** Count the number of wildcards in a string. */
175+
/** Count the number of unescaped wildcards in a string. */
114176
public static int countWildcards(String str) {
115177
int count = 0;
178+
boolean escaped = false;
116179
for (char c : str.toCharArray()) {
117-
if (c == '*') {
180+
if (escaped) {
181+
escaped = false;
182+
} else if (c == '\\') {
183+
escaped = true;
184+
} else if (c == '*') {
118185
count++;
119186
}
120187
}

docs/user/ppl/cmd/replace.rst

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,10 +251,75 @@ When using wildcards in the replace command:
251251
* Pattern: ``"* - *"`` (2 wildcards), Replacement: ``"*"`` (1 wildcard) ✗ (mismatch error)
252252

253253

254+
Escape Sequences
255+
================
256+
257+
To match or replace literal asterisks or backslashes in your data, use escape sequences:
258+
259+
* ``\*`` - Matches a literal asterisk character
260+
* ``\\`` - Matches a literal backslash character
261+
262+
Without escapes, asterisks are interpreted as wildcards.
263+
264+
Example 11: Matching literal asterisks
265+
---------------------------------------
266+
267+
Match and replace literal asterisk characters in data.
268+
269+
PPL query::
270+
271+
os> source=accounts | eval note = 'price: *sale*' | replace 'price: \*sale\*' WITH 'DISCOUNTED' IN note | fields note;
272+
fetched rows / total rows = 4/4
273+
+------------+
274+
| note       |
275+
|------------|
276+
| DISCOUNTED |
277+
| DISCOUNTED |
278+
| DISCOUNTED |
279+
| DISCOUNTED |
280+
+------------+
281+
282+
Example 12: Wildcard with no replacement wildcards
283+
----------------------------------------------------
284+
285+
Use wildcards in the pattern but none in the replacement to produce a fixed output.
286+
287+
PPL query::
288+
289+
os> source=accounts | eval test = 'prefix-value-suffix' | replace 'prefix-*-suffix' WITH 'MATCHED' IN test | fields test;
290+
fetched rows / total rows = 4/4
291+
+---------+
292+
| test    |
293+
|---------|
294+
| MATCHED |
295+
| MATCHED |
296+
| MATCHED |
297+
| MATCHED |
298+
+---------+
299+
300+
Example 13: Escaped asterisks with wildcards
301+
---------------------------------------------
302+
303+
Combine escaped asterisks (literal) with wildcards for complex patterns.
304+
305+
PPL query::
306+
307+
os> source=accounts | eval label = 'file123.txt' | replace 'file*.*' WITH '\**.*' IN label | fields label;
308+
fetched rows / total rows = 4/4
309+
+----------+
310+
| label    |
311+
|----------|
312+
| *123.txt |
313+
| *123.txt |
314+
| *123.txt |
315+
| *123.txt |
316+
+----------+
317+
318+
254319
Limitations
255320
===========
256321
* Pattern and replacement values must be string literals.
257322
* The replace command modifies the specified fields in-place.
258323
* Wildcard matching is case-sensitive.
259324
* Regular expressions are not supported (only simple wildcard patterns with ``*``).
260-
* Literal asterisk characters (``*``) cannot be matched or replaced when using wildcard patterns. To replace literal asterisks in your data, use non-wildcard patterns (do not include ``*`` in the pattern string).
325+
* Literal asterisks and backslashes in a pattern or replacement must be escaped with a backslash (``\*``, ``\\``); an unescaped ``*`` is always treated as a wildcard.

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,4 +495,50 @@ public void testWildcardReplace_emptyStringIntegration() throws IOException {
495495
rows("John", "Canada"),
496496
rows("Jane", "Canada"));
497497
}
498+
499+
@Test
500+
public void testEscapeSequence_literalAsterisk() throws IOException {
501+
// Test matching literal asterisks in data using \* escape sequence
502+
JSONObject result =
503+
executeQuery(
504+
String.format(
505+
"source = %s | eval note = 'price: *sale*' | replace 'price: \\\\*sale\\\\*' WITH"
506+
+ " 'DISCOUNTED' IN note | fields note | head 1",
507+
TEST_INDEX_STATE_COUNTRY));
508+
509+
verifySchema(result, schema("note", "string"));
510+
// Pattern "price: \*sale\*" matches literal asterisks, result should be "DISCOUNTED"
511+
verifyDataRows(result, rows("DISCOUNTED"));
512+
}
513+
514+
@Test
515+
public void testEscapeSequence_mixedEscapeAndWildcard() throws IOException {
516+
// Test combining escaped asterisks (literal) with wildcards (pattern matching)
517+
JSONObject result =
518+
executeQuery(
519+
String.format(
520+
"source = %s | eval label = 'file123.txt' | replace 'file*.*' WITH"
521+
+ " '\\\\**.*' IN label | fields label | head 1",
522+
TEST_INDEX_STATE_COUNTRY));
523+
524+
verifySchema(result, schema("label", "string"));
525+
// Pattern "file*.*" captures "123" and "txt"
526+
// Replacement "\**.*" has escaped * (literal), then 2 wildcards, producing "*123.txt"
527+
verifyDataRows(result, rows("*123.txt"));
528+
}
529+
530+
@Test
531+
public void testEscapeSequence_noMatchLiteral() throws IOException {
532+
// Test that escaped asterisk doesn't match as wildcard
533+
JSONObject result =
534+
executeQuery(
535+
String.format(
536+
"source = %s | eval test = 'fooXbar' | replace 'foo\\\\*bar' WITH 'matched' IN test"
537+
+ " | fields test | head 1",
538+
TEST_INDEX_STATE_COUNTRY));
539+
540+
verifySchema(result, schema("test", "string"));
541+
// Pattern "foo\*bar" matches only the literal "foo*bar", not "fooXbar"; the value is unchanged
542+
verifyDataRows(result, rows("fooXbar"));
543+
}
498544
}

ppl/src/test/java/org/opensearch/sql/ppl/calcite/WildcardReplaceUtilsTest.java

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,4 +274,82 @@ public void testWildcardExample4_infixReplacement() {
274274
assertEquals(
275275
"fooALLbar", WildcardReplaceUtils.replaceWithWildcard("fooXYZbar", "*XYZ*", "*ALL*"));
276276
}
277+
278+
@Test
279+
public void testEscapedAsterisk_literal() {
280+
assertEquals(
281+
"foo*bar", WildcardReplaceUtils.replaceWithWildcard("foo*bar", "foo\\*bar", "foo\\*bar"));
282+
}
283+
284+
@Test
285+
public void testEscapedAsterisk_noMatch() {
286+
assertEquals(
287+
"fooXbar", WildcardReplaceUtils.replaceWithWildcard("fooXbar", "foo\\*bar", "replacement"));
288+
}
289+
290+
@Test
291+
public void testEscapedBackslash_beforeWildcard() {
292+
assertEquals(
293+
"foo\\123", WildcardReplaceUtils.replaceWithWildcard("foo\\abc", "foo\\\\*", "foo\\\\123"));
294+
}
295+
296+
@Test
297+
public void testEscapedBackslash_literal() {
298+
assertEquals(
299+
"foo\\bar",
300+
WildcardReplaceUtils.replaceWithWildcard("foo\\bar", "foo\\\\bar", "foo\\\\bar"));
301+
}
302+
303+
@Test
304+
public void testMixedEscapes_asteriskAndBackslash() {
305+
assertEquals(
306+
"price: *special* $100\\ea",
307+
WildcardReplaceUtils.replaceWithWildcard(
308+
"price: *special* $100\\ea",
309+
"price: \\*special\\* $*\\\\*",
310+
"price: \\*special\\* $*\\\\*"));
311+
}
312+
313+
@Test
314+
public void testEscapedAsterisk_withWildcard_capture() {
315+
assertEquals(
316+
"file*.prefix-123",
317+
WildcardReplaceUtils.replaceWithWildcard("file123.txt", "file*.*", "file\\*.prefix-*"));
318+
}
319+
320+
@Test
321+
public void testTrailingBackslash_shouldFail() {
322+
try {
323+
WildcardReplaceUtils.replaceWithWildcard("foo", "foo\\", "bar");
324+
fail("Expected IllegalArgumentException");
325+
} catch (IllegalArgumentException exception) {
326+
assertTrue(exception.getMessage().contains("Invalid escape sequence"));
327+
}
328+
}
329+
330+
@Test
331+
public void testOnlyEscapedAsterisks_noWildcards() {
332+
assertEquals("***", WildcardReplaceUtils.replaceWithWildcard("***", "\\*\\*\\*", "\\*\\*\\*"));
333+
}
334+
335+
@Test
336+
public void testDoubleBackslashBeforeAsterisk() {
337+
assertEquals(
338+
"foo\\bar", WildcardReplaceUtils.replaceWithWildcard("foo\\abc", "foo\\\\*", "foo\\\\bar"));
339+
}
340+
341+
@Test
342+
public void testCountWildcards_withEscapes() {
343+
assertEquals(2, WildcardReplaceUtils.countWildcards("foo\\*bar*baz*"));
344+
}
345+
346+
@Test
347+
public void testCountWildcards_allEscaped() {
348+
assertEquals(0, WildcardReplaceUtils.countWildcards("\\*\\*\\*"));
349+
}
350+
351+
@Test
352+
public void testValidation_escapedWildcardsNotCounted() {
353+
WildcardReplaceUtils.validateWildcardSymmetry("foo\\**", "bar*");
354+
}
277355
}

0 commit comments

Comments
 (0)