Skip to content

feat(search): Multishard cutoffs #1924

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
88577ba
core(search): Add EF_RUNTIME parameter
dranikpg Jun 21, 2024
03947c9
fix: fixes
dranikpg Jun 23, 2024
c2a4315
chore(ci): run replication tests on arm (#3168)
kostasrim Jun 18, 2024
c60cd7e
chore(search): improve parser error reporting (#3184)
romange Jun 18, 2024
4701272
fix(reply_builder): remove virtual modifier in SendError(ErrorReply) …
BagritsevichStepan Jun 19, 2024
a42f9f6
fix(cluster): Support `FLUSHALL` while slot migration is in progress …
chakaz Jun 20, 2024
81db231
fix: fix RegisterOnChange methods for journal and db_slice (#3171)
BorysTheDev Jun 20, 2024
5aeb8b2
fix(generic_family): fix RenameGeneric command for non-string data ty…
BagritsevichStepan Jun 20, 2024
f0064d8
fix(server): Rename confusing flag `replica_reconnect_on_master_resta…
chakaz Jun 20, 2024
350be10
fix(unit tests): fix generic family info test (#3187)
adiholden Jun 20, 2024
5e4fdfd
test: improve cluster_fuzzy_migration test (#3197)
BorysTheDev Jun 20, 2024
7bda51a
chore(core): Remove DfImpl inside ScoredMap (#3199)
dranikpg Jun 20, 2024
5e6672e
chore(tiering): Remove IoMgr (#3198)
dranikpg Jun 20, 2024
2a5a2a6
test(cluster): Make sure migration maintains TTL (#3188)
chakaz Jun 20, 2024
21de1c4
feat(cluster): Support `STICK` bit in slot migration (#3200)
chakaz Jun 21, 2024
406eb47
chore: Re-enable previously flaky test (#3196)
chakaz Jun 21, 2024
a86fa21
feat(acl): add support of multiple passwords (#3189)
kostasrim Jun 21, 2024
5322199
chore(tiering): More advanced tiering tests (#3201)
dranikpg Jun 21, 2024
c5f2724
chore: add replica-priority flag (#3204)
kostasrim Jun 21, 2024
80e09ca
feat(search): Multishard cutoffs
dranikpg Sep 24, 2023
8ffeacf
fix: fixes, comments, polishment
dranikpg Oct 29, 2023
c2bffc3
fix: fixes v2
dranikpg Oct 29, 2023
d4ee35d
fix: fixes
dranikpg Oct 30, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 3 additions & 36 deletions .github/actions/regression-tests/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ runs:
# timeout-minutes: 20
steps:
- name: Run PyTests
id: first
id: main
shell: bash
run: |
ls -l ${GITHUB_WORKSPACE}/
Expand All @@ -37,7 +37,7 @@ runs:
export DRAGONFLY_PATH="${GITHUB_WORKSPACE}/${{inputs.build-folder-name}}/${{inputs.dfly-executable}}"
export UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1 # to crash on errors

timeout 20m pytest -m "${{inputs.filter}}" --durations=10 --color=yes --json-report --json-report-file=report.json dragonfly --ignore=dragonfly/replication_test.py --log-cli-level=INFO || code=$?
timeout 40m pytest -m "${{inputs.filter}}" --durations=10 --color=yes --json-report --json-report-file=report.json dragonfly --log-cli-level=INFO || code=$?

# timeout returns 124 if we exceeded the timeout duration
if [[ $code -eq 124 ]]; then
Expand All @@ -50,32 +50,6 @@ runs:
exit 1
fi

- name: Run PyTests replication test
id: second
if: ${{ inputs.run-only-on-ubuntu-latest == 'true' || (inputs.run-only-on-ubuntu-latest == 'false' && matrix.runner == 'ubuntu-latest') }}
shell: bash
run: |
echo "Running PyTests replication test"
cd ${GITHUB_WORKSPACE}/tests
# used by PyTests
export DRAGONFLY_PATH="${GITHUB_WORKSPACE}/${{inputs.build-folder-name}}/${{inputs.dfly-executable}}"


timeout 20m pytest -m "${{inputs.filter}}" --durations=10 --color=yes --json-report \
--json-report-file=rep1_report.json dragonfly/replication_test.py --log-cli-level=INFO \
--df alsologtostderr $1 $2 || code=$?

# timeout returns 124 if we exceeded the timeout duration
if [[ $code -eq 124 ]]; then
echo "TIMEDOUT=1">> "$GITHUB_OUTPUT"
exit 1
fi

# when a test fails in pytest it returns 1 but there are other return codes as well so we just check if the code is non zero
if [[ $code -ne 0 ]]; then
exit 1
fi

- name: Print last log on timeout
if: failure()
shell: bash
Expand Down Expand Up @@ -106,14 +80,7 @@ runs:
}
cd ${GITHUB_WORKSPACE}/tests
failed_tests=""
# The order of the if branches is important and is expected to be the opposite of the order of the pytest runs.
# As the GitHub runner will not run the next step if pytest failed, we start from the last
# report file and, if it exists, we get the failed tests from that pytest run, if there are any.
if [ -f rep2_report.json ]; then
failed_tests=$(get_failed_tests rep2_report.json)
elif [ -f rep1_report.json ]; then
failed_tests=$(get_failed_tests rep1_report.json)
elif [ -f report.json ]; then
if [ -f report.json ]; then
failed_tests=$(get_failed_tests report.json)
fi

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/regression-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ jobs:
with:
dfly-executable: dragonfly
gspace-secret: ${{ secrets.GSPACES_BOT_DF_BUILD }}
run-only-on-ubuntu-latest: false
build-folder-name: build
# This expression serves as a ternary operator, i.e. if the condition holds it returns
# 'not NON_EXISTING_MARK' otherwise not opt_only.
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ jobs:
with:
dfly-executable: dragonfly-x86_64
gspace-secret: ${{ secrets.GSPACES_BOT_DF_BUILD }}
run-only-on-ubuntu-latest: true
build-folder-name: ${{ env.RELEASE_DIR }}
- name: Save artifacts
run: |
Expand Down
1 change: 1 addition & 0 deletions src/core/bloom.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <absl/numeric/bits.h>
#include <xxhash.h>

#include <algorithm>
#include <cmath>

#include "base/logging.h"
Expand Down
11 changes: 9 additions & 2 deletions src/core/search/ast_expr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ AstTagsNode::AstTagsNode(AstExpr&& l, std::string tag) {
}

AstKnnNode::AstKnnNode(uint32_t limit, std::string_view field, OwnedFtVector vec,
std::string_view score_alias)
std::string_view score_alias, std::optional<size_t> ef_runtime)
: filter{nullptr},
limit{limit},
field{field.substr(1)},
vec{std::move(vec)},
score_alias{score_alias} {
score_alias{score_alias},
ef_runtime{ef_runtime} {
}

AstKnnNode::AstKnnNode(AstNode&& filter, AstKnnNode&& self) {
Expand All @@ -72,3 +73,9 @@ AstKnnNode::AstKnnNode(AstNode&& filter, AstKnnNode&& self) {
}

} // namespace dfly::search

namespace std {
// Stream printer for optional<size_t>, needed so that the parser's generic
// `%printer { yyo << $$; } <*>` rule compiles for the opt_ef_runtime nonterminal.
// Prints the contained value when present and "nullopt" otherwise, so that debug traces
// actually show the parsed value instead of silently dropping it.
// NOTE(review): adding overloads to namespace std is formally not permitted by the standard;
// kept here because the matching declaration lives in ast_expr.h.
ostream& operator<<(ostream& os, optional<size_t> o) {
  if (o.has_value())
    return os << *o;
  return os << "nullopt";
}
}  // namespace std
13 changes: 11 additions & 2 deletions src/core/search/ast_expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ struct AstTagsNode {
struct AstKnnNode {
AstKnnNode() = default;
AstKnnNode(uint32_t limit, std::string_view field, OwnedFtVector vec,
std::string_view score_alias);
std::string_view score_alias, std::optional<size_t> ef_runtime);

AstKnnNode(AstNode&& sub, AstKnnNode&& self);

friend std::ostream& operator<<(std::ostream& stream, const AstKnnNode& matrix) {
Expand All @@ -86,6 +87,7 @@ struct AstKnnNode {
std::string field;
OwnedFtVector vec;
std::string score_alias;
// Optional per-query EF_RUNTIME override. Stored as size_t to match the constructor argument
// and the Knn() parameter it is forwarded to, avoiding a lossy round-trip through float.
std::optional<size_t> ef_runtime;
};

struct AstSortNode {
Expand All @@ -108,11 +110,18 @@ struct AstNode : public NodeVariants {
const NodeVariants& Variant() const& {
return *this;
}

// Aggregations: KNN, SORTBY. They reorder result sets and optionally reduce them.
// Returns true when this node currently holds an AstKnnNode.
// NOTE(review): SORTBY is listed above, but only AstKnnNode is tested here - confirm whether
// AstSortNode should be treated as an aggregation as well.
bool IsAggregation() const {
return std::holds_alternative<AstKnnNode>(Variant());
}
};

using AstExpr = AstNode;

} // namespace search
} // namespace dfly

namespace std {} // namespace std
namespace std {
ostream& operator<<(ostream& os, optional<size_t> o);
}
18 changes: 13 additions & 5 deletions src/core/search/indices.cc
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ const float* FlatVectorIndex::Get(DocId doc) const {
}

struct HnswlibAdapter {
// Default setting of hnswlib/hnswalg
constexpr static size_t kDefaultEfRuntime = 10;

HnswlibAdapter(const SchemaField::VectorParams& params)
: space_{MakeSpace(params.dim, params.sim)}, world_{GetSpacePtr(),
params.capacity,
Expand All @@ -214,11 +217,13 @@ struct HnswlibAdapter {
world_.markDelete(id);
}

vector<pair<float, DocId>> Knn(float* target, size_t k) {
vector<pair<float, DocId>> Knn(float* target, size_t k, std::optional<size_t> ef) {
world_.setEf(ef.value_or(kDefaultEfRuntime));
return QueueToVec(world_.searchKnn(target, k));
}

vector<pair<float, DocId>> Knn(float* target, size_t k, const vector<DocId>& allowed) {
vector<pair<float, DocId>> Knn(float* target, size_t k, std::optional<size_t> ef,
const vector<DocId>& allowed) {
struct BinsearchFilter : hnswlib::BaseFilterFunctor {
virtual bool operator()(hnswlib::labeltype id) {
return binary_search(allowed->begin(), allowed->end(), id);
Expand All @@ -229,6 +234,7 @@ struct HnswlibAdapter {
const vector<DocId>* allowed;
};

world_.setEf(ef.value_or(kDefaultEfRuntime));
BinsearchFilter filter{&allowed};
return QueueToVec(world_.searchKnn(target, k, &filter));
}
Expand Down Expand Up @@ -276,12 +282,14 @@ void HnswVectorIndex::Add(DocId id, DocumentAccessor* doc, string_view field) {
adapter_->Add(ptr.get(), id);
}

std::vector<std::pair<float, DocId>> HnswVectorIndex::Knn(float* target, size_t k) const {
return adapter_->Knn(target, k);
std::vector<std::pair<float, DocId>> HnswVectorIndex::Knn(float* target, size_t k,
std::optional<size_t> ef) const {
return adapter_->Knn(target, k, ef);
}
std::vector<std::pair<float, DocId>> HnswVectorIndex::Knn(float* target, size_t k,
std::optional<size_t> ef,
const std::vector<DocId>& allowed) const {
return adapter_->Knn(target, k, allowed);
return adapter_->Knn(target, k, ef, allowed);
}

void HnswVectorIndex::Remove(DocId id, DocumentAccessor* doc, string_view field) {
Expand Down
4 changes: 2 additions & 2 deletions src/core/search/indices.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ struct HnswVectorIndex : public BaseVectorIndex {
void Add(DocId id, DocumentAccessor* doc, std::string_view field) override;
void Remove(DocId id, DocumentAccessor* doc, std::string_view field) override;

std::vector<std::pair<float, DocId>> Knn(float* target, size_t k) const;
std::vector<std::pair<float, DocId>> Knn(float* target, size_t k,
std::vector<std::pair<float, DocId>> Knn(float* target, size_t k, std::optional<size_t> ef) const;
std::vector<std::pair<float, DocId>> Knn(float* target, size_t k, std::optional<size_t> ef,
const std::vector<DocId>& allowed) const;

private:
Expand Down
1 change: 1 addition & 0 deletions src/core/search/lexer.lex
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ term_char [_]|\w
"|" return Parser::make_OR_OP (loc());
"KNN" return Parser::make_KNN (loc());
"AS" return Parser::make_AS (loc());
"EF_RUNTIME" return Parser::make_EF_RUNTIME (loc());

[0-9]{1,9} return make_UINT32(matched_view(), loc());
[+-]?(([0-9]*[.])?[0-9]+|inf) return make_DOUBLE(matched_view(), loc());
Expand Down
37 changes: 22 additions & 15 deletions src/core/search/parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#define yylex driver->scanner()->Lex

using namespace std;

}

%parse-param { QueryDriver *driver }
Expand All @@ -46,18 +47,19 @@ using namespace std;
%define api.token.prefix {TOK_}

%token
LPAREN "("
RPAREN ")"
STAR "*"
ARROW "=>"
COLON ":"
LBRACKET "["
RBRACKET "]"
LCURLBR "{"
RCURLBR "}"
OR_OP "|"
KNN "KNN"
AS "AS"
LPAREN "("
RPAREN ")"
STAR "*"
ARROW "=>"
COLON ":"
LBRACKET "["
RBRACKET "]"
LCURLBR "{"
RCURLBR "}"
OR_OP "|"
KNN "KNN"
AS "AS"
EF_RUNTIME "EF_RUNTIME"
;

%token AND_OP
Expand All @@ -81,6 +83,7 @@ using namespace std;

%nterm <AstKnnNode> knn_query
%nterm <std::string> opt_knn_alias
%nterm <std::optional<size_t>> opt_ef_runtime

%printer { yyo << $$; } <*>;

Expand All @@ -93,13 +96,17 @@ final_query:
{ driver->Set(AstKnnNode(std::move($1), std::move($3))); }

knn_query:
LBRACKET KNN UINT32 FIELD TERM opt_knn_alias RBRACKET
{ $$ = AstKnnNode($3, $4, BytesToFtVector($5), $6); }
LBRACKET KNN UINT32 FIELD TERM opt_knn_alias opt_ef_runtime RBRACKET
{ $$ = AstKnnNode($3, $4, BytesToFtVector($5), $6, $7); }

opt_knn_alias:
AS TERM { $$ = std::move($2); }
| { $$ = std::string{}; }

opt_ef_runtime:
/* empty */ { $$ = std::nullopt; }
| EF_RUNTIME UINT32 { $$ = $2; }

filter:
search_expr { $$ = std::move($1); }
| STAR { $$ = AstStarNode(); }
Expand Down Expand Up @@ -174,5 +181,5 @@ tag_list:
void
dfly::search::Parser::error(const location_type& l, const string& m)
{
cerr << l << ": " << m << '\n';
driver->Error(l, m);
}
4 changes: 4 additions & 0 deletions src/core/search/query_driver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ void QueryDriver::ResetScanner() {
scanner_->SetParams(params_);
}

// Reports a parse error: writes the source location and the message to the error log.
// Nothing is recorded on the driver itself here; the error is only logged.
void QueryDriver::Error(const Parser::location_type& loc, std::string_view msg) {
LOG(ERROR) << "Parse error " << loc << ": " << msg;
}

} // namespace search

} // namespace dfly
2 changes: 2 additions & 0 deletions src/core/search/query_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ class QueryDriver {
return scanner_.get();
}

void Error(const Parser::location_type& loc, std::string_view msg);

public:
Parser::location_type location;

Expand Down
11 changes: 9 additions & 2 deletions src/core/search/search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,9 +353,10 @@ struct BasicSearch {

void SearchKnnHnsw(HnswVectorIndex* vec_index, const AstKnnNode& knn, IndexResult&& sub_results) {
if (indices_->GetAllDocs().size() == sub_results.Size())
knn_distances_ = vec_index->Knn(knn.vec.first.get(), knn.limit);
knn_distances_ = vec_index->Knn(knn.vec.first.get(), knn.limit, knn.ef_runtime);
else
knn_distances_ = vec_index->Knn(knn.vec.first.get(), knn.limit, sub_results.Take());
knn_distances_ =
vec_index->Knn(knn.vec.first.get(), knn.limit, knn.ef_runtime, sub_results.Take());
}

// [KNN limit @field vec]: Compute distance from `vec` to all vectors keep closest `limit`
Expand Down Expand Up @@ -420,6 +421,7 @@ struct BasicSearch {
profile_builder_ ? make_optional(profile_builder_->Take()) : nullopt;

size_t total = result.Size();

return SearchResult{total,
max(total, preagg_total_),
result.Take(limit_),
Expand All @@ -428,6 +430,7 @@ struct BasicSearch {
std::move(error_)};
}

private:
const FieldIndices* indices_;
size_t limit_;

Expand Down Expand Up @@ -622,4 +625,8 @@ void SearchAlgorithm::EnableProfiling() {
profiling_enabled_ = true;
}

// Returns the current value of the profiling flag (set to true by EnableProfiling()).
bool SearchAlgorithm::IsProfilingEnabled() const {
return profiling_enabled_;
}

} // namespace dfly::search
1 change: 1 addition & 0 deletions src/core/search/search.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ class SearchAlgorithm {
std::optional<AggregationInfo> HasAggregation() const;

void EnableProfiling();
bool IsProfilingEnabled() const;

private:
bool profiling_enabled_ = false;
Expand Down
20 changes: 20 additions & 0 deletions src/core/search/search_parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,24 @@ TEST_F(SearchParserTest, KNN) {
NEXT_TOK(TOK_LBRACKET);
}

// Feeds a KNN clause that carries both an AS alias and an EF_RUNTIME option and checks the
// expected token sequence, including the alias text and the numeric EF_RUNTIME value.
TEST_F(SearchParserTest, KNNfull) {
SetInput("*=>[KNN 1 @vector field_vec AS vec_sort EF_RUNTIME 15]");
// Prefix: "*", "=>", "[".
NEXT_TOK(TOK_STAR);
NEXT_TOK(TOK_ARROW);
NEXT_TOK(TOK_LBRACKET);

// Core KNN arguments: limit, field and vector parameter.
NEXT_TOK(TOK_KNN);
NEXT_EQ(TOK_UINT32, uint32_t, 1);
NEXT_TOK(TOK_FIELD);
NEXT_TOK(TOK_TERM);

// Optional score alias.
NEXT_TOK(TOK_AS);
NEXT_EQ(TOK_TERM, string, "vec_sort");

// Optional EF_RUNTIME keyword followed by its numeric value.
NEXT_TOK(TOK_EF_RUNTIME);
NEXT_EQ(TOK_UINT32, uint32_t, 15);

NEXT_TOK(TOK_RBRACKET);
}

} // namespace dfly::search
Loading