Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix exclusion folder check for subdirectories and for Windows #2012

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/source/admin/fs/local-fs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,15 @@ If you define the following ``fs.excludes`` property in your
Then all files but the ones in ``/folderB/subfolderA``, ``/folderB/subfolderB`` and
``/folderB/subfolderC`` will be indexed.

If you want to exclude a specific folder, you need to append the ``/*`` wildcard to the folder path, like:

.. code:: yaml

name: "test"
fs:
excludes:
- "/folderB/subfolderB/*"

Since the includes and excludes work on the entire *path of the file*, you must consider that when using wildcards. Below are some include and exclude patterns to help convey the idea better.

+--------------------+------------------------------------------------+------------------------------------------------+
Expand Down
6 changes: 6 additions & 0 deletions docs/source/release/2.10.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Version 2.10
============

Breaking changes
----------------

* If you want to exclude a specific folder, you need to use a wildcard character at the end of the folder name.
For example, to exclude the folder ``/tmp/foo``, you need to use ``/tmp/foo/*``. Thanks to dadoonet.

New
---

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,21 @@ private static boolean isIndexable(String filename, List<String> includes, List<
* @param includes include rules, may be empty not null
* @param excludes exclude rules, may be empty not null
*/
public static boolean isIndexable(boolean directory, String filename, List<String> includes, List<String> excludes) {
public static boolean isIndexable(final boolean directory, final String filename, final List<String> includes, final List<String> excludes) {
logger.debug("directory = [{}], filename = [{}], includes = [{}], excludes = [{}]", directory, filename, includes, excludes);

boolean isIndexable = isIndexable(filename, includes, excludes);
String originalFilename = filename;

// When the current file is a directory, we need to append a / to the filename
if (directory && !filename.endsWith("/")) {
originalFilename += "/";
}

boolean isIndexable = isIndexable(originalFilename, includes, excludes);

// It can happen that we have a dir "foo" which does not match the included name like "*.txt"
// We need to go in it unless it has been explicitly excluded by the user
if (directory && !isExcluded(filename, excludes)) {
if (directory && !isExcluded(originalFilename, excludes)) {
isIndexable = true;
}

Expand Down Expand Up @@ -214,12 +221,16 @@ public static boolean isIncluded(String filename, List<String> includes) {
return isMatching(filename, includes, "inclusion");
}

public static boolean isMatching(String filename, List<String> matches, String type) {
public static boolean isMatching(final String filename, final List<String> matches, final String type) {
logger.debug("checking {} for filename = [{}], matches = [{}]", type, filename, matches);

// We are using a linux style virtual path, meaning that if we have a windows path, we need to convert it
// to a linux path
String virtualPath = filename.replace("\\", "/");

for (String match : matches) {
String regex = match.toLowerCase().replace("?", ".?").replace("*", ".*");
String filenameLowerCase = filename.toLowerCase();
String filenameLowerCase = virtualPath.toLowerCase();
if (filenameLowerCase.matches(regex)) {
logger.trace("✅ [{}] does match {} regex [{}] (was [{}])", filenameLowerCase, type, regex, match);
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,21 +102,21 @@ public void directories() {
assertThat(isIndexable(true, "/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true));
assertThat(isIndexable(true, "/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true));
assertThat(isIndexable(true, "/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true));
assertThat(isIndexable(true, "/folderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderA/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderA/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderA/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderB/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderB/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(false));
assertThat(isIndexable(true, "/folderB/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderC/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderC/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderC/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true));
assertThat(isIndexable(true, "/folderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderA/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderA/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderA/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderB/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderB/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(false));
assertThat(isIndexable(true, "/folderB/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderC/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderC/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/folderC/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
assertThat(isIndexable(true, "/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB/*")), is(true));
}

/**
Expand Down Expand Up @@ -146,4 +146,20 @@ public void testIsMatching() {
assertThat(isMatching("/filter test/should-not-exclude.docx", Collections.singletonList("*~*"), "exclusion"), is(false));
assertThat(isMatching("/filter test/should-not-exclude.docx.exclude", Collections.singletonList("*.exclude"), "exclusion"), is(true));
}

/**
* Testing with windows separator.
* <p>
* Checks that a directory exclusion rule written with forward slashes
* ({@code *&#47;arbets&#47;*}) gives the same result whether the directory path
* passed to {@code isIndexable} uses {@code /} or {@code \} as separator.
* See <a href="https://github.com/dadoonet/fscrawler/issues/1974">#1974</a>
*/
@Test
public void windowsSeparator() {
// We test with the Linux separator
// A directory named "arbets" must be rejected, both at the root and nested under another folder
assertThat(isIndexable(true, "/arbets", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(false));
assertThat(isIndexable(true, "/foo/arbets", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(false));
// An unrelated directory must still be crawled
assertThat(isIndexable(true, "/foo", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(true));
// We test with the Windows separator
// Same three cases as above, expecting the same outcome with backslash-separated paths
assertThat(isIndexable(true, "\\arbets", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(false));
assertThat(isIndexable(true, "\\foo\\arbets", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(false));
assertThat(isIndexable(true, "\\foo", new ArrayList<>(), Collections.singletonList("*/arbets/*")), is(true));
}
}
Loading