Skip to content

Commit

Permalink
Fix exclusion folder check for subdirectories and for Windows
Browse files Browse the repository at this point in the history
There were 2 issues here:

* We are comparing an exclusion pattern like `*/foo/*` with a virtual dir name, which is something like `/foo` or `/bar/foo`. The virtual dir name is missing the trailing `/` when it's a directory, so the pattern never matches.
* On Windows, a dir named `\foo\arbets` does not match the exclusion `*/arbets/*` because of the `/` vs `\` mismatch.

This commit fixes this behavior.

Closes #1974.
  • Loading branch information
dadoonet committed Feb 10, 2025
1 parent 5911d42 commit e0e3e35
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,21 @@ private static boolean isIndexable(String filename, List<String> includes, List<
* @param includes include rules, may be empty not null
* @param excludes exclude rules, may be empty not null
*/
public static boolean isIndexable(boolean directory, String filename, List<String> includes, List<String> excludes) {
public static boolean isIndexable(final boolean directory, final String filename, final List<String> includes, final List<String> excludes) {
logger.debug("directory = [{}], filename = [{}], includes = [{}], excludes = [{}]", directory, filename, includes, excludes);

boolean isIndexable = isIndexable(filename, includes, excludes);
String originalFilename = filename;

// When the current file is a directory, we need to append a / to the filename
if (directory && !filename.endsWith("/")) {
originalFilename += "/";
}

boolean isIndexable = isIndexable(originalFilename, includes, excludes);

// It can happen that we have a dir "foo" which does not match the included name like "*.txt"
// We need to go in it unless it has been explicitly excluded by the user
if (directory && !isExcluded(filename, excludes)) {
if (directory && !isExcluded(originalFilename, excludes)) {
isIndexable = true;
}

Expand Down Expand Up @@ -214,12 +221,16 @@ public static boolean isIncluded(String filename, List<String> includes) {
return isMatching(filename, includes, "inclusion");
}

public static boolean isMatching(String filename, List<String> matches, String type) {
public static boolean isMatching(final String filename, final List<String> matches, final String type) {
logger.debug("checking {} for filename = [{}], matches = [{}]", type, filename, matches);

// We are using a linux style virtual path, meaning that if we have a windows path, we need to convert it
// to a linux path
String virtualPath = filename.replace("\\", "/");

for (String match : matches) {
String regex = match.toLowerCase().replace("?", ".?").replace("*", ".*");
String filenameLowerCase = filename.toLowerCase();
String filenameLowerCase = virtualPath.toLowerCase();
if (filenameLowerCase.matches(regex)) {
logger.trace("✅ [{}] does match {} regex [{}] (was [{}])", filenameLowerCase, type, regex, match);
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,4 +146,20 @@ public void testIsMatching() {
assertThat(isMatching("/filter test/should-not-exclude.docx", Collections.singletonList("*~*"), "exclusion"), is(false));
assertThat(isMatching("/filter test/should-not-exclude.docx.exclude", Collections.singletonList("*.exclude"), "exclusion"), is(true));
}

/**
* Testing directory exclusion with both the Linux and the Windows separator.
* The same single exclusion rule, written with forward slashes, is checked against
* directory names written with forward slashes and with backslashes: a directory named
* "arbets" is expected not to be indexable, at the root or nested under "foo", while
* the unrelated directory "foo" is expected to stay indexable.
* See <a href="https://github.com/dadoonet/fscrawler/issues/1974">#1974</a>
*/
@Test
public void windowsSeparator() {
// We test with the Linux separator
// (excluded dir at the root, excluded dir nested in a parent, then a dir which is not excluded)
assertThat(isIndexable(true, "/arbets", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(false));
assertThat(isIndexable(true, "/foo/arbets", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(false));
assertThat(isIndexable(true, "/foo", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(true));
// We test with the Windows separator
// (same three cases; the exclusion rule itself still uses forward slashes)
assertThat(isIndexable(true, "\\arbets", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(false));
assertThat(isIndexable(true, "\\foo\\arbets", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(false));
assertThat(isIndexable(true, "\\foo", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(true));
}
}

0 comments on commit e0e3e35

Please sign in to comment.