Skip to content

Commit 0b20f03

Browse files
authored
[improvement](tvf load)add the data_{1..200}.csv wildcard in tvf load (#56705)
### What problem does this PR solve? Issue Number: close #xxx Related PR: apache/doris-website#2958 Problem Summary: Add recognition of numeric range wildcards such as `data_{1..200}.csv` in the file paths loaded by TVF. After this PR, `data_{2..4,6}.csv` can be used in a multi-file load to match the files `data_2.csv`, `data_3.csv`, `data_4.csv` and `data_6.csv`. ### Release note None ### Check List (For Author) - Test <!-- At least one of them must be included. --> - [x] Regression test - [x] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason <!-- Add your reason? --> - Behavior changed: - [ ] No. - [ ] Yes. <!-- Explain the behavior change --> - Does this need documentation? - [ ] No. - [x] Yes. <!-- Add document PR link here. eg: apache/doris-website#1214 --> ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into -->
1 parent 324fd59 commit 0b20f03

File tree

10 files changed

+802
-3
lines changed

10 files changed

+802
-3
lines changed

fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
6163

6264
public class S3Util {
6365
private static final Logger LOG = LogManager.getLogger(Util.class);
@@ -300,6 +302,92 @@ public static String getLongestPrefix(String globPattern) {
300302
return globPattern.substring(0, earliestSpecialCharIndex);
301303
}
302304

305+
// Apply some rules to extend the globs parsing behavior
306+
public static String extendGlobs(String pathPattern) {
307+
return extendGlobNumberRange(pathPattern);
308+
}
309+
310+
/**
311+
* Convert range patterns to brace enumeration patterns for glob matching.
312+
* Parts containing negative numbers or non-numeric characters are skipped.
313+
* eg(valid):
314+
* -> "file{1..3}" => "file{1,2,3}"
315+
* -> "file_{1..3,4,5..6}" => "file_{1,2,3,4,5,6}"
316+
* eg(invalid)
317+
* -> "data_{-1..4}.csv" will not load any file
318+
* -> "data_{a..4}.csv" will not load any file
319+
* @param pathPattern Path that may contain {start..end} or mixed {start..end,values} patterns
320+
* @return Path with ranges converted to comma-separated enumeration
321+
*/
322+
public static String extendGlobNumberRange(String pathPattern) {
323+
Pattern bracePattern = Pattern.compile("\\{([^}]+)\\}");
324+
Matcher braceMatcher = bracePattern.matcher(pathPattern);
325+
StringBuffer result = new StringBuffer();
326+
327+
while (braceMatcher.find()) {
328+
String braceContent = braceMatcher.group(1);
329+
String[] parts = braceContent.split(",");
330+
List<Integer> allNumbers = new ArrayList<>();
331+
Pattern rangePattern = Pattern.compile("^(-?\\d+)\\.\\.(-?\\d+)$");
332+
333+
for (String part : parts) {
334+
part = part.trim();
335+
Matcher rangeMatcher = rangePattern.matcher(part);
336+
337+
if (rangeMatcher.matches()) {
338+
int start = Integer.parseInt(rangeMatcher.group(1));
339+
int end = Integer.parseInt(rangeMatcher.group(2));
340+
341+
// Skip this range if either start or end is negative
342+
if (start < 0 || end < 0) {
343+
continue;
344+
}
345+
346+
if (start > end) {
347+
int temp = start;
348+
start = end;
349+
end = temp;
350+
}
351+
for (int i = start; i <= end; i++) {
352+
if (!allNumbers.contains(i)) {
353+
allNumbers.add(i);
354+
}
355+
}
356+
} else if (part.matches("^\\d+$")) {
357+
// This is a single non-negative number like "4"
358+
int num = Integer.parseInt(part);
359+
if (!allNumbers.contains(num)) {
360+
allNumbers.add(num);
361+
}
362+
} else {
363+
// Not a valid number or range (e.g., negative number, or contains non-numeric chars)
364+
// Just skip this part and continue processing other parts
365+
continue;
366+
}
367+
}
368+
369+
// If no valid numbers found after filtering, keep original content
370+
if (allNumbers.isEmpty()) {
371+
braceMatcher.appendReplacement(result, "{" + braceContent + "}");
372+
continue;
373+
}
374+
375+
// Build comma-separated result
376+
StringBuilder sb = new StringBuilder("{");
377+
for (int i = 0; i < allNumbers.size(); i++) {
378+
if (i > 0) {
379+
sb.append(",");
380+
}
381+
sb.append(allNumbers.get(i));
382+
}
383+
sb.append("}");
384+
braceMatcher.appendReplacement(result, sb.toString());
385+
}
386+
braceMatcher.appendTail(result);
387+
388+
return result.toString();
389+
}
390+
303391
// Fast fail validation for S3 endpoint connectivity to avoid retries and long waits
304392
// when network conditions are poor. Validates endpoint format, whitelist, security,
305393
// and tests connection with 10s timeout.

fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ public Status globList(String remotePath, List<RemoteFile> result, boolean fileN
348348
Status st = Status.OK;
349349
try {
350350
S3URI uri = S3URI.create(remotePath, isUsePathStyle, forceParsingByStandardUri);
351-
String globPath = uri.getKey();
351+
String globPath = S3Util.extendGlobs(uri.getKey());
352352
String bucket = uri.getBucket();
353353
if (LOG.isDebugEnabled()) {
354354
LOG.debug("try to glob list for azure, remote path {}, orig {}", globPath, remotePath);

fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,7 @@ private GlobListResult globListInternal(String remotePath, List<RemoteFile> resu
573573
}
574574

575575
String bucket = uri.getBucket();
576-
String globPath = uri.getKey(); // eg: path/to/*.csv
576+
String globPath = S3Util.extendGlobs(uri.getKey());
577577

578578
if (LOG.isDebugEnabled()) {
579579
LOG.debug("globList globPath:{}, remotePath:{}", globPath, remotePath);

fe/fe-core/src/main/java/org/apache/doris/fs/remote/dfs/DFSFileSystem.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.apache.doris.backup.Status;
2222
import org.apache.doris.common.UserException;
2323
import org.apache.doris.common.security.authentication.HadoopAuthenticator;
24+
import org.apache.doris.common.util.S3Util;
2425
import org.apache.doris.common.util.URI;
2526
import org.apache.doris.datasource.property.storage.HdfsCompatibleProperties;
2627
import org.apache.doris.datasource.property.storage.StorageProperties;
@@ -510,7 +511,7 @@ public Status delete(String remotePath) {
510511
public Status globList(String remotePath, List<RemoteFile> result, boolean fileNameOnly) {
511512
try {
512513
URI pathUri = URI.create(remotePath);
513-
Path pathPattern = new Path(pathUri.getLocation());
514+
Path pathPattern = new Path(S3Util.extendGlobs(pathUri.getLocation()));
514515
FileSystem fileSystem = nativeFileSystem(pathPattern);
515516
FileStatus[] files = hdfsProperties.getHadoopAuthenticator().doAs(() -> fileSystem.globStatus(pathPattern));
516517
if (files == null) {
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package org.apache.doris.common.util;
19+
20+
import org.junit.Assert;
21+
import org.junit.Test;
22+
23+
public class S3UtilTest {
24+
25+
@Test
26+
public void testExtendGlobNumberRange_simpleRange() {
27+
// Test simple range expansion {1..3}
28+
String input = "file_{1..3}.csv";
29+
String expected = "file_{1,2,3}.csv";
30+
String result = S3Util.extendGlobNumberRange(input);
31+
Assert.assertEquals(expected, result);
32+
}
33+
34+
@Test
35+
public void testExtendGlobNumberRange_reverseRange() {
36+
// Test reverse range {3..1}, should normalize to {1,2,3}
37+
String input = "file_{3..1}.csv";
38+
String expected = "file_{1,2,3}.csv";
39+
String result = S3Util.extendGlobNumberRange(input);
40+
Assert.assertEquals(expected, result);
41+
}
42+
43+
@Test
44+
public void testExtendGlobNumberRange_singleNumber() {
45+
// Test single number range {2..2}
46+
String input = "file_{2..2}.csv";
47+
String expected = "file_{2}.csv";
48+
String result = S3Util.extendGlobNumberRange(input);
49+
Assert.assertEquals(expected, result);
50+
}
51+
52+
@Test
53+
public void testExtendGlobNumberRange_mixedRangeAndValues() {
54+
// Test mixed range and single values {1..2,3,1..3}
55+
String input = "file_{1..2,3,1..3}.csv";
56+
String expected = "file_{1,2,3}.csv";
57+
String result = S3Util.extendGlobNumberRange(input);
58+
Assert.assertEquals(expected, result);
59+
}
60+
61+
@Test
62+
public void testExtendGlobNumberRange_multipleRanges() {
63+
// Test multiple ranges in one path {1..2}_{1..2}
64+
String input = "file_{1..2}_{1..2}.csv";
65+
String expected = "file_{1,2}_{1,2}.csv";
66+
String result = S3Util.extendGlobNumberRange(input);
67+
Assert.assertEquals(expected, result);
68+
}
69+
70+
@Test
71+
public void testExtendGlobNumberRange_largeRange() {
72+
// Test large range {0..9}
73+
String input = "file_{0..9}.csv";
74+
String expected = "file_{0,1,2,3,4,5,6,7,8,9}.csv";
75+
String result = S3Util.extendGlobNumberRange(input);
76+
Assert.assertEquals(expected, result);
77+
}
78+
79+
@Test
80+
public void testExtendGlobNumberRange_negativeNumbersFiltered() {
81+
// If start or end is negative, the entire range is skipped
82+
String input = "file_{-1..2}.csv";
83+
String expected = "file_{-1..2}.csv";
84+
String result = S3Util.extendGlobNumberRange(input);
85+
Assert.assertEquals(expected, result);
86+
}
87+
88+
@Test
89+
public void testExtendGlobNumberRange_allNegativeRange() {
90+
// Test all negative range {-3..-1}, should keep original
91+
String input = "file_{-3..-1}.csv";
92+
String expected = "file_{-3..-1}.csv";
93+
String result = S3Util.extendGlobNumberRange(input);
94+
Assert.assertEquals(expected, result);
95+
}
96+
97+
@Test
98+
public void testExtendGlobNumberRange_mixedWithNegative() {
99+
// The range -1..2 is skipped, only 1..3 is expanded
100+
String input = "file_{-1..2,1..3}.csv";
101+
String expected = "file_{1,2,3}.csv";
102+
String result = S3Util.extendGlobNumberRange(input);
103+
Assert.assertEquals(expected, result);
104+
}
105+
106+
@Test
107+
public void testExtendGlobNumberRange_invalidCharacters() {
108+
// Test invalid characters {Refrain,1..3}
109+
String input = "file_{Refrain,1..3}.csv";
110+
String expected = "file_{1,2,3}.csv";
111+
String result = S3Util.extendGlobNumberRange(input);
112+
Assert.assertEquals(expected, result);
113+
}
114+
115+
@Test
116+
public void testExtendGlobNumberRange_mixedInvalidAndValid() {
117+
// Range 3..1 is normalized to 1..3, resulting in {1,2,3}
118+
String input = "file_{3..1,2,1..2}.csv";
119+
String expected = "file_{1,2,3}.csv";
120+
String result = S3Util.extendGlobNumberRange(input);
121+
Assert.assertEquals(expected, result);
122+
}
123+
124+
@Test
125+
public void testExtendGlobNumberRange_noRange() {
126+
// Test no range pattern
127+
String input = "file_123.csv";
128+
String expected = "file_123.csv";
129+
String result = S3Util.extendGlobNumberRange(input);
130+
Assert.assertEquals(expected, result);
131+
}
132+
133+
@Test
134+
public void testExtendGlobNumberRange_noNumericRange() {
135+
// Test no numeric range {a..z}
136+
String input = "file_{a..z}.csv";
137+
String expected = "file_{a..z}.csv";
138+
String result = S3Util.extendGlobNumberRange(input);
139+
Assert.assertEquals(expected, result);
140+
}
141+
142+
@Test
143+
public void testExtendGlobNumberRange_emptyBraces() {
144+
// Test empty braces {}
145+
String input = "file_{}.csv";
146+
String expected = "file_{}.csv";
147+
String result = S3Util.extendGlobNumberRange(input);
148+
Assert.assertEquals(expected, result);
149+
}
150+
151+
@Test
152+
public void testExtendGlobNumberRange_singleValue() {
153+
// Test single value in braces {5}
154+
String input = "file_{5}.csv";
155+
String expected = "file_{5}.csv";
156+
String result = S3Util.extendGlobNumberRange(input);
157+
Assert.assertEquals(expected, result);
158+
}
159+
160+
@Test
161+
public void testExtendGlobNumberRange_multipleValues() {
162+
// Test multiple single values {1,2,3}
163+
String input = "file_{1,2,3}.csv";
164+
String expected = "file_{1,2,3}.csv";
165+
String result = S3Util.extendGlobNumberRange(input);
166+
Assert.assertEquals(expected, result);
167+
}
168+
169+
@Test
170+
public void testExtendGlobNumberRange_duplicateRemoval() {
171+
// Test duplicate removal {1..3,2..4}
172+
String input = "file_{1..3,2..4}.csv";
173+
String expected = "file_{1,2,3,4}.csv";
174+
String result = S3Util.extendGlobNumberRange(input);
175+
Assert.assertEquals(expected, result);
176+
}
177+
178+
@Test
179+
public void testExtendGlobNumberRange_largeNumbers() {
180+
// Test large numbers {100..103}
181+
String input = "file_{100..103}.csv";
182+
String expected = "file_{100,101,102,103}.csv";
183+
String result = S3Util.extendGlobNumberRange(input);
184+
Assert.assertEquals(expected, result);
185+
}
186+
187+
@Test
188+
public void testExtendGlobNumberRange_zeroPadding() {
189+
// Test that zero-padding is not preserved (behavior test)
190+
// The function converts to integers, so "01" becomes "1"
191+
String input = "file_{01..03}.csv";
192+
String expected = "file_{1,2,3}.csv";
193+
String result = S3Util.extendGlobNumberRange(input);
194+
Assert.assertEquals(expected, result);
195+
}
196+
197+
@Test
198+
public void testExtendGlobNumberRange_complexPath() {
199+
// Test complex path with multiple patterns
200+
String input = "s3://bucket/data_{0..9}/file_{1..3}.csv";
201+
String expected = "s3://bucket/data_{0,1,2,3,4,5,6,7,8,9}/file_{1,2,3}.csv";
202+
String result = S3Util.extendGlobNumberRange(input);
203+
Assert.assertEquals(expected, result);
204+
}
205+
206+
@Test
207+
public void testExtendGlobNumberRange_noBraces() {
208+
// Test path without any braces
209+
String input = "s3://bucket/data.csv";
210+
String expected = "s3://bucket/data.csv";
211+
String result = S3Util.extendGlobNumberRange(input);
212+
Assert.assertEquals(expected, result);
213+
}
214+
215+
@Test
216+
public void testExtendGlobNumberRange_specialCase() {
217+
// Test special case from PR description {2..4,6}
218+
String input = "data_{2..4,6}.csv";
219+
String expected = "data_{2,3,4,6}.csv";
220+
String result = S3Util.extendGlobNumberRange(input);
221+
Assert.assertEquals(expected, result);
222+
}
223+
224+
@Test
225+
public void testGetLongestPrefix_withGlobPattern() {
226+
// Test getLongestPrefix with glob patterns
227+
String input1 = "s3://bucket/path/to/file_{1..3}.csv";
228+
String expected1 = "s3://bucket/path/to/file_";
229+
String result1 = S3Util.getLongestPrefix(input1);
230+
Assert.assertEquals(expected1, result1);
231+
232+
String input2 = "s3://bucket/path/*/file.csv";
233+
String expected2 = "s3://bucket/path/";
234+
String result2 = S3Util.getLongestPrefix(input2);
235+
Assert.assertEquals(expected2, result2);
236+
237+
String input3 = "s3://bucket/path/file.csv";
238+
String expected3 = "s3://bucket/path/file.csv";
239+
String result3 = S3Util.getLongestPrefix(input3);
240+
Assert.assertEquals(expected3, result3);
241+
}
242+
243+
@Test
244+
public void testExtendGlobs() {
245+
// Test extendGlobs method (which currently just calls extendGlobNumberRange)
246+
String input = "file_{1..3}.csv";
247+
String expected = "file_{1,2,3}.csv";
248+
String result = S3Util.extendGlobs(input);
249+
Assert.assertEquals(expected, result);
250+
}
251+
}
252+
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1,1
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
2,2
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3,3

0 commit comments

Comments
 (0)