From 85aded68932141ecfa5b57ff03735890fe2f7823 Mon Sep 17 00:00:00 2001 From: hemant-endee Date: Wed, 25 Mar 2026 17:18:36 +0530 Subject: [PATCH 01/29] Rebuild index with new config (#136) * Rebuild index with new config * fix 1 * index name in get stattu api correction * docs changes * Rebuild Status Persistence --- docs/rebuild.md | 124 ++++++++++++++++++++ src/core/ndd.hpp | 272 ++++++++++++++++++++++++++++++++++++++++++- src/core/rebuild.hpp | 130 +++++++++++++++++++++ src/main.cpp | 95 +++++++++++++++ 4 files changed, 618 insertions(+), 3 deletions(-) create mode 100644 docs/rebuild.md create mode 100644 src/core/rebuild.hpp diff --git a/docs/rebuild.md b/docs/rebuild.md new file mode 100644 index 0000000000..be6b35640f --- /dev/null +++ b/docs/rebuild.md @@ -0,0 +1,124 @@ +# Index Rebuild + +Rebuild allows you to reconstruct an HNSW index graph with new configuration parameters (M, ef_construction) without re-uploading vector data. All vectors are re-indexed from MDBX storage — only the graph structure is rebuilt. + +## API Endpoints + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/api/v1/index/{name}/rebuild` | Start async rebuild | +| GET | `/api/v1/index/{name}/rebuild/status` | Check rebuild progress | + +--- + +## Start Rebuild + +**POST** `/api/v1/index/{name}/rebuild` + +All parameters are optional. Omitted parameters retain their current values. 
+ +```json +{ + "M": 32, + "ef_con": 256 +} +``` + +**Parameters:** + +| Parameter | Type | Description | +|-----------|------|-------------| +| `M` | int | HNSW graph connectivity (4–512) | +| `ef_con` | int | Construction-time search quality (8–4096) | + +**Response 202:** +```json +{ + "status": "rebuilding", + "previous_config": { "M": 16, "ef_con": 128 }, + "new_config": { "M": 32, "ef_con": 256 }, + "total_vectors": 50000 +} +``` + +**Errors:** + +| Code | Condition | +|------|-----------| +| 400 | No changes specified, invalid parameters, or attempted to change `precision`/`space_type` | +| 404 | Index not found | +| 409 | Rebuild or backup already in progress for this user | + +--- + +## Check Progress + +**GET** `/api/v1/index/{name}/rebuild/status` + +**Status values:** + +| Status | Meaning | +|--------|---------| +| `idle` | No rebuild has run for this index (or querying a different index) | +| `in_progress` | Rebuild is currently running | +| `completed` | Rebuild finished successfully | +| `failed` | Rebuild failed (see `error` field) | + +**In progress:** +```json +{ + "status": "in_progress", + "vectors_processed": 45000, + "total_vectors": 100000, + "percent_complete": 45.0, + "started_at": "2026-03-25T10:30:00Z" +} +``` + +**Completed:** +```json +{ + "status": "completed", + "vectors_processed": 100000, + "total_vectors": 100000, + "percent_complete": 100.0, + "started_at": "2026-03-25T10:30:00Z", + "completed_at": "2026-03-25T10:32:15Z" +} +``` + +**Failed:** +```json +{ + "status": "failed", + "vectors_processed": 45000, + "total_vectors": 100000, + "percent_complete": 45.0, + "started_at": "2026-03-25T10:30:00Z", + "completed_at": "2026-03-25T10:31:05Z", + "error": "Out of memory" +} +``` + +Status is per-index. The `completed`/`failed` state persists until the next rebuild is started for that user. 
+ +--- + +## Restrictions + +The following parameters **cannot** be changed via rebuild (returns 400): +- `precision` (quantization level) +- `space_type` + + +--- + +## Behavior + +- **All vectors are re-indexed** from MDBX storage into a new HNSW graph with the updated configuration. +- **Search continues** during rebuild — queries use the old index until the rebuild completes. +- **Write operations** (insert, delete, update) will block and timeout while the rebuild is running, same as during backup. +- **One rebuild at a time per user** — cannot start a rebuild on any index while another rebuild is in progress for the same user. Also cannot run concurrently with a backup. +- **Periodic checkpoints** — the in-progress graph is saved to a temp file at regular intervals. +- **On completion**, the new graph replaces `default.idx`. All temporary and intermediate files are cleaned up. +- **On server restart** during an incomplete rebuild, the old index loads normally. Temp files are cleaned up automatically. The rebuild must be restarted manually. 
diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 55f6e5bc57..3f89592790 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -197,6 +197,7 @@ struct PersistenceConfig { }; #include "../storage/backup_store.hpp" +#include "rebuild.hpp" class IndexManager { private: @@ -220,8 +221,10 @@ class IndexManager { std::thread autosave_thread_; std::atomic running_{true}; BackupStore backup_store_; - void executeBackupJob(const std::string& index_id, const std::string& backup_name, - std::stop_token st); + Rebuild rebuild_; + void executeBackupJob(const std::string& index_id, const std::string& backup_name); + void executeRebuildJob(const std::string& index_id, const std::string& username, + size_t new_M, size_t new_ef_con); std::unique_ptr createWAL(const std::string& index_id) { const std::string wal_dir = data_dir_ + "/" + index_id; @@ -581,6 +584,7 @@ class IndexManager { backup_store_(data_dir) { std::filesystem::create_directories(data_dir); metadata_manager_ = std::make_unique(data_dir); + rebuild_.cleanupTempFiles(data_dir); // Start the autosave thread autosave_thread_ = std::thread(&IndexManager::autosaveLoop, this); } @@ -1925,9 +1929,59 @@ class IndexManager { return backup_store_.validateBackupName(backup_name); } +<<<<<<< HEAD std::pair uploadBackup(const std::string& backup_name, const std::string& username, const std::string& file_content); +======= + // Metadata access + std::optional getMetadata(const std::string& index_id) { + return metadata_manager_->getMetadata(index_id); + } + + // Index stats (safe to call from routes) + size_t getElementCount(const std::string& index_id) { + auto& entry = getIndexEntry(index_id); + return entry.alg->getElementsCount(); + } + + + // ========== Rebuild operations ========== + + // Orchestration method (defined below after class) + std::pair rebuildIndexAsync(const std::string& index_id, + size_t new_M, + size_t new_ef_con); + + bool hasActiveRebuild(const std::string& username) const { + return 
rebuild_.hasActiveRebuild(username); + } + + nlohmann::json getRebuildProgress(const std::string& username, + const std::string& index_id) const { + auto state = rebuild_.getActiveRebuild(username); + if (state && state->index_id == index_id) { + size_t processed = state->vectors_processed.load(); + size_t total = state->total_vectors.load(); + double percent = total > 0 ? (100.0 * processed / total) : 0.0; + nlohmann::json result = { + {"status", state->status}, + {"vectors_processed", processed}, + {"total_vectors", total}, + {"percent_complete", percent}, + {"started_at", Rebuild::formatTime(state->started_at)} + }; + if (state->status == "completed" || state->status == "failed") { + result["completed_at"] = Rebuild::formatTime(state->completed_at); + } + if (state->status == "failed" && !state->error_message.empty()) { + result["error"] = state->error_message; + } + return result; + } + return {{"status", "idle"}}; + } +>>>>>>> e66b946 (Rebuild index with new config (#136)) }; // ========== IndexManager backup implementations ========== @@ -2221,6 +2275,7 @@ inline std::pair IndexManager::createBackupAsync(const std::s return {true, backup_name}; } + inline std::pair IndexManager::uploadBackup(const std::string& backup_name, const std::string& username, const std::string& file_content) { std::string user_backup_dir = backup_store_.getUserBackupDir(username); std::filesystem::create_directories(user_backup_dir); @@ -2280,4 +2335,215 @@ inline std::pair IndexManager::uploadBackup(const std::string backup_store_.writeBackupJson(username, backup_db); return {true, "Backup uploaded successfully"}; -} \ No newline at end of file +} + +// ========== IndexManager rebuild implementations ========== + +inline std::pair IndexManager::rebuildIndexAsync(const std::string& index_id, + size_t new_M, + size_t new_ef_con) { + // Validate index exists + auto meta = metadata_manager_->getMetadata(index_id); + if (!meta) { + return {false, "Index not found"}; + } + + // Extract 
username for backup check + std::string username; + size_t pos = index_id.find('/'); + if (pos != std::string::npos) { + username = index_id.substr(0, pos); + } else { + return {false, "Invalid index ID format"}; + } + + // Check for active backup or rebuild + if (backup_store_.hasActiveBackup(username)) { + return {false, "Backup already in progress for user: " + username}; + } + if (rebuild_.hasActiveRebuild(username)) { + return {false, "Rebuild already in progress for user: " + username}; + } + + // Load entry to get current element count + auto& entry = getIndexEntry(index_id); + size_t current_count = entry.alg->getElementsCount(); + + // Ensure at least one parameter differs + if (new_M == meta->M && new_ef_con == meta->ef_con) { + return {false, "No configuration changes specified"}; + } + + // Set active rebuild state (per-user, one rebuild at a time) + rebuild_.setActiveRebuild(username, index_id, current_count); + + // Spawn background thread (same pattern as createBackupAsync) + std::thread([this, index_id, username, new_M, new_ef_con]() { + executeRebuildJob(index_id, username, new_M, new_ef_con); + }).detach(); + + LOG_INFO(2050, index_id, "Rebuild started: M=" << new_M + << " ef_con=" << new_ef_con); + + return {true, "Rebuild started"}; +} + +inline void IndexManager::executeRebuildJob(const std::string& index_id, + const std::string& username, + size_t new_M, size_t new_ef_con) { + std::string base_path = data_dir_ + "/" + index_id; + std::string temp_path = rebuild_.getTempPath(base_path); + std::string timestamped_path = rebuild_.getTimestampedPath(base_path); + std::string vector_storage_dir = base_path + "/vectors"; + std::string index_path = vector_storage_dir + "/" + settings::DEFAULT_SUBINDEX + ".idx"; + + try { + auto& entry = getIndexEntry(index_id); + + // Hold operation_mutex for entire rebuild — writes timeout, searches continue + std::lock_guard operation_lock(entry.operation_mutex); + + // Phase 1 — Save current state + 
saveIndexInternal(entry); + + // Read current config from the existing HNSW graph + auto space_type = entry.alg->getSpaceType(); + size_t dim = entry.alg->getDimension(); + auto quant_level = entry.alg->getQuantLevel(); + int32_t checksum = entry.alg->getChecksum(); + size_t max_elements = entry.alg->getMaxElements(); + + // Phase 2 — Build new HNSW (same max_elements as current index) + auto new_alg = std::make_unique>( + max_elements, space_type, dim, new_M, new_ef_con, + settings::RANDOM_SEED, quant_level, checksum); + + // Set vector fetcher BEFORE adding vectors — searchBaseLayer during + // graph construction needs this to compute distances for base-layer-only + // nodes (base layer doesn't store vector data inline) + new_alg->setVectorFetcher([vs = entry.vector_storage](ndd::idInt label, uint8_t* buffer) { + return vs->get_vector(label, buffer); + }); + + new_alg->setVectorFetcherBatch([vs = entry.vector_storage](const ndd::idInt* labels, + uint8_t* buffers, + bool* success, + size_t count) -> size_t { + return vs->get_vectors_batch_into(labels, buffers, success, count); + }); + + // Iterate VectorStore and re-insert all vectors + auto cursor = entry.vector_storage->getCursor(); + const size_t batch_size = settings::RECOVERY_BATCH_SIZE; + size_t total_processed = 0; + size_t batches_since_checkpoint = 0; + constexpr size_t CHECKPOINT_INTERVAL = 5; // Save temp every 5 batches + + while (cursor.hasNext()) { + // Collect batch + std::vector>> batch; + batch.reserve(batch_size); + while (cursor.hasNext() && batch.size() < batch_size) { + auto [label, vec_bytes] = cursor.next(); + if (!vec_bytes.empty()) { + batch.emplace_back(label, std::move(vec_bytes)); + } + } + + if (batch.empty()) { + break; + } + + // Multi-threaded insert (same pattern as addVectors and recoverIndex) + size_t num_threads = std::min(settings::NUM_RECOVERY_THREADS, batch.size()); + std::atomic next{0}; + std::vector threads; + + for (size_t t = 0; t < num_threads; ++t) { + 
threads.emplace_back([&]() { + size_t i; + while ((i = next.fetch_add(1)) < batch.size()) { + const auto& [label, vec_bytes] = batch[i]; + new_alg->addPoint(vec_bytes.data(), label); + } + }); + } + + for (auto& th : threads) { + th.join(); + } + + total_processed += batch.size(); + + // Update progress + auto state = rebuild_.getActiveRebuild(username); + if (state) { + state->vectors_processed.store(total_processed); + } + + // Periodic checkpoint save + batches_since_checkpoint++; + if (batches_since_checkpoint >= CHECKPOINT_INTERVAL) { + new_alg->saveIndex(temp_path); + batches_since_checkpoint = 0; + } + } + + // Phase 3 — Save final + Copy + Swap + + // Save new graph to timestamped file + new_alg->saveIndex(timestamped_path); + + // Copy to canonical name (overwrites old default.idx on disk) + std::filesystem::copy_file(timestamped_path, index_path, + std::filesystem::copy_options::overwrite_existing); + + // Load fresh from disk + swap pointer (reloadIndex pattern) + auto fresh_alg = std::make_unique>(index_path, 0); + + fresh_alg->setVectorFetcher([vs = entry.vector_storage](ndd::idInt label, uint8_t* buffer) { + return vs->get_vector(label, buffer); + }); + + fresh_alg->setVectorFetcherBatch([vs = entry.vector_storage](const ndd::idInt* labels, + uint8_t* buffers, + bool* success, + size_t count) -> size_t { + return vs->get_vectors_batch_into(labels, buffers, success, count); + }); + + entry.alg = std::move(fresh_alg); + + // Delete temp checkpoint and timestamped file + if (std::filesystem::exists(temp_path)) { + std::filesystem::remove(temp_path); + } + if (std::filesystem::exists(timestamped_path)) { + std::filesystem::remove(timestamped_path); + } + + // Update metadata with new config + auto meta = metadata_manager_->getMetadata(index_id); + if (meta) { + meta->M = new_M; + meta->ef_con = new_ef_con; + meta->total_elements = entry.alg->getElementsCount(); + metadata_manager_->storeMetadata(index_id, *meta); + } + + entry.markUpdated(); + 
entry.updated = false; // We just saved the new graph + + LOG_INFO(2051, index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); + rebuild_.completeActiveRebuild(username); + + } catch (const std::exception& e) { + LOG_ERROR(2052, index_id, "Rebuild failed: " << e.what()); + + // Cleanup temp file on error + if (std::filesystem::exists(temp_path)) { + std::filesystem::remove(temp_path); + } + rebuild_.failActiveRebuild(username, e.what()); + } +} diff --git a/src/core/rebuild.hpp b/src/core/rebuild.hpp new file mode 100644 index 0000000000..27777fa571 --- /dev/null +++ b/src/core/rebuild.hpp @@ -0,0 +1,130 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "settings.hpp" +#include "log.hpp" + +struct ActiveRebuild { + std::string index_id; + std::string status{"in_progress"}; // "in_progress", "completed", "failed" + std::string error_message; + std::atomic vectors_processed{0}; + std::atomic total_vectors{0}; + std::chrono::system_clock::time_point started_at; + std::chrono::system_clock::time_point completed_at; +}; + +class Rebuild { +private: + // Keyed by username — one rebuild per user at a time + std::unordered_map> active_rebuilds_; + mutable std::mutex rebuild_state_mutex_; + + static std::string timeToISO8601(std::chrono::system_clock::time_point tp) { + auto time_t_val = std::chrono::system_clock::to_time_t(tp); + std::tm tm_val{}; + gmtime_r(&time_t_val, &tm_val); + std::ostringstream oss; + oss << std::put_time(&tm_val, "%Y-%m-%dT%H:%M:%SZ"); + return oss.str(); + } + +public: + Rebuild() = default; + + // Lifecycle — cleanup temp files from interrupted rebuilds on startup + void cleanupTempFiles(const std::string& data_dir) { + if (!std::filesystem::exists(data_dir)) { + return; + } + try { + std::string temp_filename = std::string(settings::DEFAULT_SUBINDEX) + ".idx.temp"; + for (const auto& entry : std::filesystem::recursive_directory_iterator(data_dir)) { + if 
(entry.is_regular_file() && + entry.path().filename().string() == temp_filename) { + std::filesystem::remove(entry.path()); + } + } + } catch (const std::exception&) { + // Silently ignore cleanup errors on startup + } + } + + // State tracking — per user + + void setActiveRebuild(const std::string& username, const std::string& index_id, + size_t total_vectors) { + std::lock_guard lock(rebuild_state_mutex_); + auto state = std::make_shared(); + state->index_id = index_id; + state->status = "in_progress"; + state->total_vectors.store(total_vectors); + state->vectors_processed.store(0); + state->started_at = std::chrono::system_clock::now(); + active_rebuilds_[username] = state; + } + + void completeActiveRebuild(const std::string& username) { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + it->second->status = "completed"; + it->second->completed_at = std::chrono::system_clock::now(); + } + } + + void failActiveRebuild(const std::string& username, const std::string& error) { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + it->second->status = "failed"; + it->second->error_message = error; + it->second->completed_at = std::chrono::system_clock::now(); + } + } + + bool hasActiveRebuild(const std::string& username) const { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + // Only "in_progress" blocks a new rebuild + return it != active_rebuilds_.end() && it->second->status == "in_progress"; + } + + std::shared_ptr getActiveRebuild(const std::string& username) const { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + return it->second; + } + return nullptr; + } + + // Format state as JSON fields + static std::string formatTime(std::chrono::system_clock::time_point tp) { + return 
timeToISO8601(tp); + } + + // Path helpers + + static std::string getTempPath(const std::string& index_dir) { + return index_dir + "/vectors/" + settings::DEFAULT_SUBINDEX + ".idx.temp"; + } + + static std::string getTimestampedPath(const std::string& index_dir) { + auto ts = std::to_string( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch() + ).count() + ); + return index_dir + "/vectors/" + settings::DEFAULT_SUBINDEX + ".idx." + ts; + } +}; diff --git a/src/main.cpp b/src/main.cpp index 4654a54c20..7821451ae8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -692,6 +692,101 @@ int main(int argc, char** argv) { } }); + // ========== Rebuild operations ========== + + // Start index rebuild + CROW_ROUTE(app, "/api/v1/index//rebuild") + .CROW_MIDDLEWARES(app, AuthMiddleware) + .methods("POST"_method)([&index_manager, &app](const crow::request& req, + const std::string& index_name) { + auto& ctx = app.get_context(req); + std::string index_id = ctx.username + "/" + index_name; + + auto body = crow::json::load(req.body); + if (!body) { + return json_error(400, "Invalid JSON"); + } + + // Reject parameters that cannot be changed via rebuild + if (body.has("precision")) { + return json_error(400, "precision cannot be changed via rebuild"); + } + if (body.has("space_type")) { + return json_error(400, "space_type cannot be changed via rebuild"); + } + + // Get current metadata for defaults + auto meta = index_manager.getMetadata(index_id); + if (!meta) { + return json_error(404, "Index not found"); + } + + // Parse parameters with current values as defaults + size_t new_M = body.has("M") ? (size_t)body["M"].i() : meta->M; + size_t new_ef_con = body.has("ef_con") ? 
(size_t)body["ef_con"].i() : meta->ef_con; + + // Validate M + if (new_M < settings::MIN_M || new_M > settings::MAX_M) { + return json_error(400, + "M must be between " + std::to_string(settings::MIN_M) + + " and " + std::to_string(settings::MAX_M)); + } + + // Validate ef_con + if (new_ef_con < settings::MIN_EF_CONSTRUCT || new_ef_con > settings::MAX_EF_CONSTRUCT) { + return json_error(400, + "ef_con must be between " + std::to_string(settings::MIN_EF_CONSTRUCT) + + " and " + std::to_string(settings::MAX_EF_CONSTRUCT)); + } + + // Get actual vector count for response + size_t actual_element_count = 0; + try { + actual_element_count = index_manager.getElementCount(index_id); + } catch (...) {} + + try { + auto [success, message] = index_manager.rebuildIndexAsync( + index_id, new_M, new_ef_con); + + if (!success) { + int code = (message.find("already in progress") != std::string::npos) ? 409 : 400; + return json_error(code, message); + } + + crow::json::wvalue response; + response["status"] = "rebuilding"; + response["previous_config"]["M"] = meta->M; + response["previous_config"]["ef_con"] = meta->ef_con; + response["new_config"]["M"] = new_M; + response["new_config"]["ef_con"] = new_ef_con; + response["total_vectors"] = actual_element_count; + return crow::response(202, response.dump()); + } catch (const std::exception& e) { + return json_error_500(ctx.username, index_name, req.url, e.what()); + } + }); + + // Get rebuild status + CROW_ROUTE(app, "/api/v1/index//rebuild/status") + .CROW_MIDDLEWARES(app, AuthMiddleware) + .methods("GET"_method)([&index_manager, &app](const crow::request& req, + const std::string& index_name) { + auto& ctx = app.get_context(req); + std::string index_id = ctx.username + "/" + index_name; + + try { + auto progress = index_manager.getRebuildProgress(ctx.username, index_id); + crow::response res; + res.code = 200; + res.set_header("Content-Type", "application/json"); + res.body = progress.dump(); + return res; + } catch (const 
std::exception& e) { + return json_error_500(ctx.username, index_name, req.url, e.what()); + } + }); + // List indexes for current user CROW_ROUTE(app, "/api/v1/index/list") .CROW_MIDDLEWARES(app, AuthMiddleware) From bb2f3d6db53f36f59997c9b0d7f231c67c4dbcce Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Wed, 8 Apr 2026 16:40:22 +0530 Subject: [PATCH 02/29] using jthread with stop token --- src/core/ndd.hpp | 32 ++++++++++++++++++++------------ src/core/rebuild.hpp | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 13 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 3f89592790..046254da7e 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -222,9 +222,9 @@ class IndexManager { std::atomic running_{true}; BackupStore backup_store_; Rebuild rebuild_; - void executeBackupJob(const std::string& index_id, const std::string& backup_name); + void executeBackupJob(const std::string& index_id, const std::string& backup_name, std::stop_token st); void executeRebuildJob(const std::string& index_id, const std::string& username, - size_t new_M, size_t new_ef_con); + size_t new_M, size_t new_ef_con, std::stop_token st); std::unique_ptr createWAL(const std::string& index_id) { const std::string wal_dir = data_dir_ + "/" + index_id; @@ -593,9 +593,10 @@ class IndexManager { // Signal all threads to stop (running_ is checked by autosave and backup threads) running_ = false; - // Join background backup threads before destroying members - // (prevents use-after-free when detached threads outlive IndexManager) + // Join background backup and rebuild threads before destroying members + // (prevents use-after-free when threads outlive IndexManager) backup_store_.joinAllThreads(); + rebuild_.joinAllThreads(); /** * Don't wait for autosave thread to exit. 
@@ -2374,13 +2375,11 @@ inline std::pair IndexManager::rebuildIndexAsync(const std::s return {false, "No configuration changes specified"}; } - // Set active rebuild state (per-user, one rebuild at a time) - rebuild_.setActiveRebuild(username, index_id, current_count); - - // Spawn background thread (same pattern as createBackupAsync) - std::thread([this, index_id, username, new_M, new_ef_con]() { - executeRebuildJob(index_id, username, new_M, new_ef_con); - }).detach(); + // Set active rebuild state and spawn jthread (same pattern as createBackupAsync) + std::jthread t([this, index_id, username, new_M, new_ef_con](std::stop_token st) { + executeRebuildJob(index_id, username, new_M, new_ef_con, st); + }); + rebuild_.setActiveRebuild(username, index_id, current_count, std::move(t)); LOG_INFO(2050, index_id, "Rebuild started: M=" << new_M << " ef_con=" << new_ef_con); @@ -2390,7 +2389,8 @@ inline std::pair IndexManager::rebuildIndexAsync(const std::s inline void IndexManager::executeRebuildJob(const std::string& index_id, const std::string& username, - size_t new_M, size_t new_ef_con) { + size_t new_M, size_t new_ef_con, + std::stop_token st) { std::string base_path = data_dir_ + "/" + index_id; std::string temp_path = rebuild_.getTempPath(base_path); std::string timestamped_path = rebuild_.getTimestampedPath(base_path); @@ -2440,6 +2440,14 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, constexpr size_t CHECKPOINT_INTERVAL = 5; // Save temp every 5 batches while (cursor.hasNext()) { + if (st.stop_requested()) { + if (std::filesystem::exists(temp_path)) { + std::filesystem::remove(temp_path); + } + rebuild_.failActiveRebuild(username, "Rebuild interrupted by server shutdown"); + return; + } + // Collect batch std::vector>> batch; batch.reserve(batch_size); diff --git a/src/core/rebuild.hpp b/src/core/rebuild.hpp index 27777fa571..98d10192f4 100644 --- a/src/core/rebuild.hpp +++ b/src/core/rebuild.hpp @@ -5,10 +5,12 @@ #include #include 
#include +#include #include #include #include #include +#include #include "settings.hpp" #include "log.hpp" @@ -21,6 +23,7 @@ struct ActiveRebuild { std::atomic total_vectors{0}; std::chrono::system_clock::time_point started_at; std::chrono::system_clock::time_point completed_at; + std::jthread thread; // jthread: built-in stop_token + auto-join on destruction }; class Rebuild { @@ -62,7 +65,7 @@ class Rebuild { // State tracking — per user void setActiveRebuild(const std::string& username, const std::string& index_id, - size_t total_vectors) { + size_t total_vectors, std::jthread&& thread) { std::lock_guard lock(rebuild_state_mutex_); auto state = std::make_shared(); state->index_id = index_id; @@ -70,6 +73,7 @@ class Rebuild { state->total_vectors.store(total_vectors); state->vectors_processed.store(0); state->started_at = std::chrono::system_clock::now(); + state->thread = std::move(thread); active_rebuilds_[username] = state; } @@ -77,6 +81,10 @@ class Rebuild { std::lock_guard lock(rebuild_state_mutex_); auto it = active_rebuilds_.find(username); if (it != active_rebuilds_.end()) { + // Called from within the thread — detach so the jthread dtor doesn't join us + if (it->second->thread.joinable()) { + it->second->thread.detach(); + } it->second->status = "completed"; it->second->completed_at = std::chrono::system_clock::now(); } @@ -86,6 +94,10 @@ class Rebuild { std::lock_guard lock(rebuild_state_mutex_); auto it = active_rebuilds_.find(username); if (it != active_rebuilds_.end()) { + // Called from within the thread — detach so the jthread dtor doesn't join us + if (it->second->thread.joinable()) { + it->second->thread.detach(); + } it->second->status = "failed"; it->second->error_message = error; it->second->completed_at = std::chrono::system_clock::now(); @@ -99,6 +111,28 @@ class Rebuild { return it != active_rebuilds_.end() && it->second->status == "in_progress"; } + // Join all in-progress rebuild threads on shutdown. 
Mirrors BackupStore::joinAllThreads: + // move threads out under lock, request_stop + join outside lock to avoid deadlock + // (finishing threads call completeActiveRebuild which also locks rebuild_state_mutex_). + void joinAllThreads() { + std::vector threads_to_join; + { + std::lock_guard lock(rebuild_state_mutex_); + for (auto& [username, state] : active_rebuilds_) { + if (state->thread.joinable()) { + threads_to_join.push_back(std::move(state->thread)); + } + } + active_rebuilds_.clear(); + } + for (auto& t : threads_to_join) { + t.request_stop(); + if (t.joinable()) { + t.join(); + } + } + } + std::shared_ptr getActiveRebuild(const std::string& username) const { std::lock_guard lock(rebuild_state_mutex_); auto it = active_rebuilds_.find(username); From cbaa7445e57b0a77860f32348253857867d61c83 Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Thu, 9 Apr 2026 14:20:31 +0530 Subject: [PATCH 03/29] =?UTF-8?q?Shared=20parallel=20addPoint=20utility=20?= =?UTF-8?q?function=20=E2=80=94=20static=20chunk=20partition(same=20as=20a?= =?UTF-8?q?ddVectors)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/ndd.hpp | 134 ++++++++++++++++++++--------------------------- 1 file changed, 57 insertions(+), 77 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 046254da7e..95043d0b2d 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -226,6 +226,31 @@ class IndexManager { void executeRebuildJob(const std::string& index_id, const std::string& username, size_t new_M, size_t new_ef_con, std::stop_token st); + // Shared parallel addPoint utility — static chunk partition (same as addVectors). 
+ // ProcessFn signature: void(size_t index) + template + static void parallelAddPoints(size_t count, size_t max_threads, ProcessFn&& process) { + if (count == 0) return; + size_t num_threads = std::min(max_threads, count); + const size_t chunk_size = (count + num_threads - 1) / num_threads; // Ceiling division + std::vector threads; + threads.reserve(num_threads); + + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back([&, t]() { + size_t start_idx = t * chunk_size; + size_t end_idx = std::min(start_idx + chunk_size, count); + for (size_t i = start_idx; i < end_idx; ++i) { + process(i); + } + }); + } + + for (auto& th : threads) { + th.join(); + } + } + std::unique_ptr createWAL(const std::string& index_id) { const std::string wal_dir = data_dir_ + "/" + index_id; return std::make_unique(wal_dir, index_id); @@ -1113,47 +1138,15 @@ class IndexManager { logInsertsAndUpdates(entry, numeric_ids); // Add to HNSW index in parallel using pre-quantized data from QuantVectorObject - size_t available_threads = settings::NUM_PARALLEL_INSERTS; - const size_t num_threads = (available_threads < quantized_vectors.size()) - ? available_threads - : quantized_vectors.size(); - std::vector threads; - const size_t chunk_size = - (quantized_vectors.size() + num_threads - 1) / num_threads; // Ceiling division - - threads.reserve(num_threads); - for(size_t t = 0; t < num_threads; t++) { - threads.emplace_back([&, t]() { - // Calculate start and end indices for this thread - size_t start_idx = t * chunk_size; - size_t end_idx = (start_idx + chunk_size < quantized_vectors.size()) - ? (start_idx + chunk_size) - : quantized_vectors.size(); - - // Process assigned chunk of vectors - for(size_t i = start_idx; i < end_idx; i++) { - const auto& quant_vec_obj = quantized_vectors[i]; - - // Use pre-quantized data directly from QuantVectorObject - no conversion - // needed! 
- const uint8_t* vector_data = quant_vec_obj.quant_vector.data(); - - // Add to HNSW index using pre-quantized raw bytes - if(numeric_ids[i].second) { - // If it's a new ID, add it to the index - entry.alg->addPoint(vector_data, numeric_ids[i].first); - } else { - // If it's an update, add it to the index - entry.alg->addPoint(vector_data, numeric_ids[i].first); - } + parallelAddPoints(quantized_vectors.size(), settings::NUM_PARALLEL_INSERTS, + [&](size_t i) { + const uint8_t* vector_data = quantized_vectors[i].quant_vector.data(); + if(numeric_ids[i].second) { + entry.alg->addPoint(vector_data, numeric_ids[i].first); + } else { + entry.alg->addPoint(vector_data, numeric_ids[i].first); } }); - } - - // Wait for all threads to complete - for(auto& thread : threads) { - thread.join(); - } entry.markDirty(); @@ -1942,8 +1935,8 @@ class IndexManager { // Index stats (safe to call from routes) size_t getElementCount(const std::string& index_id) { - auto& entry = getIndexEntry(index_id); - return entry.alg->getElementsCount(); + auto entry = getIndexEntry(index_id); + return entry->alg->getElementsCount(); } @@ -2367,8 +2360,8 @@ inline std::pair IndexManager::rebuildIndexAsync(const std::s } // Load entry to get current element count - auto& entry = getIndexEntry(index_id); - size_t current_count = entry.alg->getElementsCount(); + auto entry = getIndexEntry(index_id); + size_t current_count = entry->alg->getElementsCount(); // Ensure at least one parameter differs if (new_M == meta->M && new_ef_con == meta->ef_con) { @@ -2398,20 +2391,20 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, std::string index_path = vector_storage_dir + "/" + settings::DEFAULT_SUBINDEX + ".idx"; try { - auto& entry = getIndexEntry(index_id); + auto entry = getIndexEntry(index_id); // Hold operation_mutex for entire rebuild — writes timeout, searches continue - std::lock_guard operation_lock(entry.operation_mutex); + std::unique_lock 
operation_lock(entry->operation_mutex); // Phase 1 — Save current state - saveIndexInternal(entry); + saveIndexInternal(*entry); // Read current config from the existing HNSW graph - auto space_type = entry.alg->getSpaceType(); - size_t dim = entry.alg->getDimension(); - auto quant_level = entry.alg->getQuantLevel(); - int32_t checksum = entry.alg->getChecksum(); - size_t max_elements = entry.alg->getMaxElements(); + auto space_type = entry->alg->getSpaceType(); + size_t dim = entry->alg->getDimension(); + auto quant_level = entry->alg->getQuantLevel(); + int32_t checksum = entry->alg->getChecksum(); + size_t max_elements = entry->alg->getMaxElements(); // Phase 2 — Build new HNSW (same max_elements as current index) auto new_alg = std::make_unique>( @@ -2421,11 +2414,11 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, // Set vector fetcher BEFORE adding vectors — searchBaseLayer during // graph construction needs this to compute distances for base-layer-only // nodes (base layer doesn't store vector data inline) - new_alg->setVectorFetcher([vs = entry.vector_storage](ndd::idInt label, uint8_t* buffer) { + new_alg->setVectorFetcher([vs = entry->vector_storage](ndd::idInt label, uint8_t* buffer) { return vs->get_vector(label, buffer); }); - new_alg->setVectorFetcherBatch([vs = entry.vector_storage](const ndd::idInt* labels, + new_alg->setVectorFetcherBatch([vs = entry->vector_storage](const ndd::idInt* labels, uint8_t* buffers, bool* success, size_t count) -> size_t { @@ -2433,7 +2426,7 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, }); // Iterate VectorStore and re-insert all vectors - auto cursor = entry.vector_storage->getCursor(); + auto cursor = entry->vector_storage->getCursor(); const size_t batch_size = settings::RECOVERY_BATCH_SIZE; size_t total_processed = 0; size_t batches_since_checkpoint = 0; @@ -2462,24 +2455,12 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, break; } - 
// Multi-threaded insert (same pattern as addVectors and recoverIndex) - size_t num_threads = std::min(settings::NUM_RECOVERY_THREADS, batch.size()); - std::atomic next{0}; - std::vector threads; - - for (size_t t = 0; t < num_threads; ++t) { - threads.emplace_back([&]() { - size_t i; - while ((i = next.fetch_add(1)) < batch.size()) { - const auto& [label, vec_bytes] = batch[i]; - new_alg->addPoint(vec_bytes.data(), label); - } + // Multi-threaded insert (shared utility with addVectors) + parallelAddPoints(batch.size(), settings::NUM_PARALLEL_INSERTS, + [&](size_t i) { + const auto& [label, vec_bytes] = batch[i]; + new_alg->addPoint(vec_bytes.data(), label); }); - } - - for (auto& th : threads) { - th.join(); - } total_processed += batch.size(); @@ -2509,18 +2490,18 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, // Load fresh from disk + swap pointer (reloadIndex pattern) auto fresh_alg = std::make_unique>(index_path, 0); - fresh_alg->setVectorFetcher([vs = entry.vector_storage](ndd::idInt label, uint8_t* buffer) { + fresh_alg->setVectorFetcher([vs = entry->vector_storage](ndd::idInt label, uint8_t* buffer) { return vs->get_vector(label, buffer); }); - fresh_alg->setVectorFetcherBatch([vs = entry.vector_storage](const ndd::idInt* labels, + fresh_alg->setVectorFetcherBatch([vs = entry->vector_storage](const ndd::idInt* labels, uint8_t* buffers, bool* success, size_t count) -> size_t { return vs->get_vectors_batch_into(labels, buffers, success, count); }); - entry.alg = std::move(fresh_alg); + entry->alg = std::move(fresh_alg); // Delete temp checkpoint and timestamped file if (std::filesystem::exists(temp_path)) { @@ -2535,12 +2516,11 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, if (meta) { meta->M = new_M; meta->ef_con = new_ef_con; - meta->total_elements = entry.alg->getElementsCount(); + meta->total_elements = entry->alg->getElementsCount(); metadata_manager_->storeMetadata(index_id, *meta); } - 
entry.markUpdated(); - entry.updated = false; // We just saved the new graph + entry->is_dirty = false; // We just saved the new graph LOG_INFO(2051, index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); rebuild_.completeActiveRebuild(username); From cb9f73d18c465cbcb4263268f6456b005bfbde4d Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Tue, 21 Apr 2026 15:28:16 +0530 Subject: [PATCH 04/29] comments resolved --- src/core/ndd.hpp | 110 ++++++++++++++++++------------------------- src/core/rebuild.hpp | 43 ++++++++++++++++- src/main.cpp | 19 ++++---- 3 files changed, 96 insertions(+), 76 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 95043d0b2d..b0ad08c9ac 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -251,6 +251,19 @@ class IndexManager { } } + // Wires vector fetchers on an HNSW graph. Must be called before addPoint — searchBaseLayer + // during graph construction needs fetchers to compute distances for base-layer-only nodes. + static void wireVectorFetchers(hnswlib::HierarchicalNSW* alg, + std::shared_ptr vs) { + alg->setVectorFetcher([vs](ndd::idInt label, uint8_t* buffer) { + return vs->get_vector(label, buffer); + }); + alg->setVectorFetcherBatch([vs](const ndd::idInt* labels, uint8_t* buffers, + bool* success, size_t count) -> size_t { + return vs->get_vectors_batch_into(labels, buffers, success, count); + }); + } + std::unique_ptr createWAL(const std::string& index_id) { const std::string wal_dir = data_dir_ + "/" + index_id; return std::make_unique(wal_dir, index_id); @@ -1933,7 +1946,7 @@ class IndexManager { return metadata_manager_->getMetadata(index_id); } - // Index stats (safe to call from routes) + // Reads live count from the in-memory HNSW graph; meta->total_elements can be stale between saves. 
size_t getElementCount(const std::string& index_id) { auto entry = getIndexEntry(index_id); return entry->alg->getElementsCount(); @@ -1943,9 +1956,9 @@ class IndexManager { // ========== Rebuild operations ========== // Orchestration method (defined below after class) - std::pair rebuildIndexAsync(const std::string& index_id, - size_t new_M, - size_t new_ef_con); + RebuildResult rebuildIndexAsync(const std::string& index_id, + size_t new_M, + size_t new_ef_con); bool hasActiveRebuild(const std::string& username) const { return rebuild_.hasActiveRebuild(username); @@ -1953,27 +1966,7 @@ class IndexManager { nlohmann::json getRebuildProgress(const std::string& username, const std::string& index_id) const { - auto state = rebuild_.getActiveRebuild(username); - if (state && state->index_id == index_id) { - size_t processed = state->vectors_processed.load(); - size_t total = state->total_vectors.load(); - double percent = total > 0 ? (100.0 * processed / total) : 0.0; - nlohmann::json result = { - {"status", state->status}, - {"vectors_processed", processed}, - {"total_vectors", total}, - {"percent_complete", percent}, - {"started_at", Rebuild::formatTime(state->started_at)} - }; - if (state->status == "completed" || state->status == "failed") { - result["completed_at"] = Rebuild::formatTime(state->completed_at); - } - if (state->status == "failed" && !state->error_message.empty()) { - result["error"] = state->error_message; - } - return result; - } - return {{"status", "idle"}}; + return rebuild_.getProgress(username, index_id); } >>>>>>> e66b946 (Rebuild index with new config (#136)) }; @@ -2333,13 +2326,13 @@ inline std::pair IndexManager::uploadBackup(const std::string // ========== IndexManager rebuild implementations ========== -inline std::pair IndexManager::rebuildIndexAsync(const std::string& index_id, - size_t new_M, - size_t new_ef_con) { +inline RebuildResult IndexManager::rebuildIndexAsync(const std::string& index_id, + size_t new_M, + size_t new_ef_con) { 
// Validate index exists auto meta = metadata_manager_->getMetadata(index_id); if (!meta) { - return {false, "Index not found"}; + return {false, 404, "Index not found"}; } // Extract username for backup check @@ -2348,15 +2341,15 @@ inline std::pair IndexManager::rebuildIndexAsync(const std::s if (pos != std::string::npos) { username = index_id.substr(0, pos); } else { - return {false, "Invalid index ID format"}; + return {false, 400, "Invalid index ID format"}; } // Check for active backup or rebuild if (backup_store_.hasActiveBackup(username)) { - return {false, "Backup already in progress for user: " + username}; + return {false, 409, "Backup already in progress for user: " + username}; } if (rebuild_.hasActiveRebuild(username)) { - return {false, "Rebuild already in progress for user: " + username}; + return {false, 409, "Rebuild already in progress for user: " + username}; } // Load entry to get current element count @@ -2365,21 +2358,30 @@ inline std::pair IndexManager::rebuildIndexAsync(const std::s // Ensure at least one parameter differs if (new_M == meta->M && new_ef_con == meta->ef_con) { - return {false, "No configuration changes specified"}; + return {false, 400, "No configuration changes specified"}; } - // Set active rebuild state and spawn jthread (same pattern as createBackupAsync) + // Register state FIRST with empty thread — hasActiveRebuild() now returns true immediately, + // blocking any concurrent rebuild requests before the thread is even spawned. 
+ rebuild_.setActiveRebuild(username, index_id, current_count, std::jthread{}); + + // THEN spawn thread std::jthread t([this, index_id, username, new_M, new_ef_con](std::stop_token st) { executeRebuildJob(index_id, username, new_M, new_ef_con, st); }); - rebuild_.setActiveRebuild(username, index_id, current_count, std::move(t)); + + // Move real thread into the already-registered state + rebuild_.attachRebuildThread(username, std::move(t)); LOG_INFO(2050, index_id, "Rebuild started: M=" << new_M << " ef_con=" << new_ef_con); - return {true, "Rebuild started"}; + return {true, 202, "Rebuild started"}; } +// executeRebuildJob lives in IndexManager (not Rebuild) because it needs direct access to +// CacheEntry, parallelAddPoints, and saveIndexInternal. Rebuild is a state-tracker only — +// moving execution here would create a circular dependency with IndexManager. inline void IndexManager::executeRebuildJob(const std::string& index_id, const std::string& username, size_t new_M, size_t new_ef_con, @@ -2411,19 +2413,8 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, max_elements, space_type, dim, new_M, new_ef_con, settings::RANDOM_SEED, quant_level, checksum); - // Set vector fetcher BEFORE adding vectors — searchBaseLayer during - // graph construction needs this to compute distances for base-layer-only - // nodes (base layer doesn't store vector data inline) - new_alg->setVectorFetcher([vs = entry->vector_storage](ndd::idInt label, uint8_t* buffer) { - return vs->get_vector(label, buffer); - }); - - new_alg->setVectorFetcherBatch([vs = entry->vector_storage](const ndd::idInt* labels, - uint8_t* buffers, - bool* success, - size_t count) -> size_t { - return vs->get_vectors_batch_into(labels, buffers, success, count); - }); + // MUST wire fetchers before addPoint — searchBaseLayer needs this for base-layer-only nodes + wireVectorFetchers(new_alg.get(), entry->vector_storage); // Iterate VectorStore and re-insert all vectors auto cursor = 
entry->vector_storage->getCursor(); @@ -2487,23 +2478,16 @@ inline void IndexManager::executeRebuildJob(const std::string& index_id, std::filesystem::copy_file(timestamped_path, index_path, std::filesystem::copy_options::overwrite_existing); - // Load fresh from disk + swap pointer (reloadIndex pattern) + // Cannot call reloadIndex() here — we hold operation_mutex and reloadIndex acquires + // indices_mutex_, while deleteIndex holds indices_mutex_ then acquires operation_mutex. + // Calling reloadIndex here would deadlock with a concurrent delete on the same index. auto fresh_alg = std::make_unique>(index_path, 0); - - fresh_alg->setVectorFetcher([vs = entry->vector_storage](ndd::idInt label, uint8_t* buffer) { - return vs->get_vector(label, buffer); - }); - - fresh_alg->setVectorFetcherBatch([vs = entry->vector_storage](const ndd::idInt* labels, - uint8_t* buffers, - bool* success, - size_t count) -> size_t { - return vs->get_vectors_batch_into(labels, buffers, success, count); - }); - + wireVectorFetchers(fresh_alg.get(), entry->vector_storage); entry->alg = std::move(fresh_alg); - // Delete temp checkpoint and timestamped file + // Both files are deleted here on success. If the server crashes before reaching this + // point, the timestamped file (default.idx.) may be left on disk — it is safe + // to delete manually on next startup as it does not affect index correctness. 
if (std::filesystem::exists(temp_path)) { std::filesystem::remove(temp_path); } diff --git a/src/core/rebuild.hpp b/src/core/rebuild.hpp index 98d10192f4..fdd2591843 100644 --- a/src/core/rebuild.hpp +++ b/src/core/rebuild.hpp @@ -14,6 +14,13 @@ #include "settings.hpp" #include "log.hpp" +#include "json/nlohmann_json.hpp" + +struct RebuildResult { + bool success; + int http_code; + std::string message; +}; struct ActiveRebuild { std::string index_id; @@ -57,8 +64,8 @@ class Rebuild { std::filesystem::remove(entry.path()); } } - } catch (const std::exception&) { - // Silently ignore cleanup errors on startup + } catch (const std::exception& e) { + LOG_WARN(2053, "rebuild", "Failed to cleanup temp files on startup: " << e.what()); } } @@ -133,6 +140,38 @@ class Rebuild { } } + void attachRebuildThread(const std::string& username, std::jthread&& thread) { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + it->second->thread = std::move(thread); + } + } + + nlohmann::json getProgress(const std::string& username, const std::string& index_id) const { + auto state = getActiveRebuild(username); + if (state && state->index_id == index_id) { + size_t processed = state->vectors_processed.load(); + size_t total = state->total_vectors.load(); + double percent = total > 0 ? 
(100.0 * processed / total) : 0.0; + nlohmann::json result = { + {"status", state->status}, + {"vectors_processed", processed}, + {"total_vectors", total}, + {"percent_complete", percent}, + {"started_at", formatTime(state->started_at)} + }; + if (state->status == "completed" || state->status == "failed") { + result["completed_at"] = formatTime(state->completed_at); + } + if (state->status == "failed" && !state->error_message.empty()) { + result["error"] = state->error_message; + } + return result; + } + return {{"status", "idle"}}; + } + std::shared_ptr getActiveRebuild(const std::string& username) const { std::lock_guard lock(rebuild_state_mutex_); auto it = active_rebuilds_.find(username); diff --git a/src/main.cpp b/src/main.cpp index 7821451ae8..69dfb908f8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -720,7 +720,6 @@ int main(int argc, char** argv) { if (!meta) { return json_error(404, "Index not found"); } - // Parse parameters with current values as defaults size_t new_M = body.has("M") ? (size_t)body["M"].i() : meta->M; size_t new_ef_con = body.has("ef_con") ? (size_t)body["ef_con"].i() : meta->ef_con; @@ -739,19 +738,17 @@ int main(int argc, char** argv) { + " and " + std::to_string(settings::MAX_EF_CONSTRUCT)); } - // Get actual vector count for response - size_t actual_element_count = 0; - try { - actual_element_count = index_manager.getElementCount(index_id); - } catch (...) {} + // Use live count — meta->total_elements can be stale if not yet flushed to disk + size_t actual_element_count = index_manager.getElementCount(index_id); + if (actual_element_count == 0) { + return json_error(400, "Cannot rebuild an empty index"); + } try { - auto [success, message] = index_manager.rebuildIndexAsync( - index_id, new_M, new_ef_con); + auto result = index_manager.rebuildIndexAsync(index_id, new_M, new_ef_con); - if (!success) { - int code = (message.find("already in progress") != std::string::npos) ? 
409 : 400; - return json_error(code, message); + if (!result.success) { + return json_error(result.http_code, result.message); } crow::json::wvalue response; From fd2c034f5639ff80e7552fe70c4226efa2b31b66 Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Thu, 23 Apr 2026 14:17:10 +0530 Subject: [PATCH 05/29] rebuild also handles execute_rebuild_job --- src/core/ndd.hpp | 281 +++++++++++++------------------------------ src/core/rebuild.hpp | 132 ++++++++++++++++++++ 2 files changed, 214 insertions(+), 199 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index b0ad08c9ac..17f34ffc00 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -223,46 +223,6 @@ class IndexManager { BackupStore backup_store_; Rebuild rebuild_; void executeBackupJob(const std::string& index_id, const std::string& backup_name, std::stop_token st); - void executeRebuildJob(const std::string& index_id, const std::string& username, - size_t new_M, size_t new_ef_con, std::stop_token st); - - // Shared parallel addPoint utility — static chunk partition (same as addVectors). - // ProcessFn signature: void(size_t index) - template - static void parallelAddPoints(size_t count, size_t max_threads, ProcessFn&& process) { - if (count == 0) return; - size_t num_threads = std::min(max_threads, count); - const size_t chunk_size = (count + num_threads - 1) / num_threads; // Ceiling division - std::vector threads; - threads.reserve(num_threads); - - for (size_t t = 0; t < num_threads; ++t) { - threads.emplace_back([&, t]() { - size_t start_idx = t * chunk_size; - size_t end_idx = std::min(start_idx + chunk_size, count); - for (size_t i = start_idx; i < end_idx; ++i) { - process(i); - } - }); - } - - for (auto& th : threads) { - th.join(); - } - } - - // Wires vector fetchers on an HNSW graph. Must be called before addPoint — searchBaseLayer - // during graph construction needs fetchers to compute distances for base-layer-only nodes. 
- static void wireVectorFetchers(hnswlib::HierarchicalNSW* alg, - std::shared_ptr vs) { - alg->setVectorFetcher([vs](ndd::idInt label, uint8_t* buffer) { - return vs->get_vector(label, buffer); - }); - alg->setVectorFetcherBatch([vs](const ndd::idInt* labels, uint8_t* buffers, - bool* success, size_t count) -> size_t { - return vs->get_vectors_batch_into(labels, buffers, success, count); - }); - } std::unique_ptr createWAL(const std::string& index_id) { const std::string wal_dir = data_dir_ + "/" + index_id; @@ -1936,11 +1896,10 @@ class IndexManager { return backup_store_.validateBackupName(backup_name); } -<<<<<<< HEAD std::pair uploadBackup(const std::string& backup_name, const std::string& username, const std::string& file_content); -======= + // Metadata access std::optional getMetadata(const std::string& index_id) { return metadata_manager_->getMetadata(index_id); @@ -1968,7 +1927,42 @@ class IndexManager { const std::string& index_id) const { return rebuild_.getProgress(username, index_id); } ->>>>>>> e66b946 (Rebuild index with new config (#136)) + + // Shared parallel addPoint utility — static chunk partition (same as addVectors). + // ProcessFn signature: void(size_t index) + template + static void parallelAddPoints(size_t count, size_t max_threads, ProcessFn&& process) { + if (count == 0) return; + size_t num_threads = std::min(max_threads, count); + const size_t chunk_size = (count + num_threads - 1) / num_threads; + std::vector threads; + threads.reserve(num_threads); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back([&, t]() { + size_t start_idx = t * chunk_size; + size_t end_idx = std::min(start_idx + chunk_size, count); + for (size_t i = start_idx; i < end_idx; ++i) { + process(i); + } + }); + } + for (auto& th : threads) { + th.join(); + } + } + + // Wires vector fetchers on an HNSW graph. Must be called before addPoint — searchBaseLayer + // during graph construction needs fetchers to compute distances for base-layer-only nodes. 
+ static void wireVectorFetchers(hnswlib::HierarchicalNSW* alg, + std::shared_ptr vs) { + alg->setVectorFetcher([vs](ndd::idInt label, uint8_t* buffer) { + return vs->get_vector(label, buffer); + }); + alg->setVectorFetcherBatch([vs](const ndd::idInt* labels, uint8_t* buffers, + bool* success, size_t count) -> size_t { + return vs->get_vectors_batch_into(labels, buffers, success, count); + }); + } }; // ========== IndexManager backup implementations ========== @@ -2327,15 +2321,13 @@ inline std::pair IndexManager::uploadBackup(const std::string // ========== IndexManager rebuild implementations ========== inline RebuildResult IndexManager::rebuildIndexAsync(const std::string& index_id, - size_t new_M, - size_t new_ef_con) { - // Validate index exists + size_t new_M, + size_t new_ef_con) { auto meta = metadata_manager_->getMetadata(index_id); if (!meta) { return {false, 404, "Index not found"}; } - // Extract username for backup check std::string username; size_t pos = index_id.find('/'); if (pos != std::string::npos) { @@ -2344,7 +2336,6 @@ inline RebuildResult IndexManager::rebuildIndexAsync(const std::string& index_id return {false, 400, "Invalid index ID format"}; } - // Check for active backup or rebuild if (backup_store_.hasActiveBackup(username)) { return {false, 409, "Backup already in progress for user: " + username}; } @@ -2352,170 +2343,62 @@ inline RebuildResult IndexManager::rebuildIndexAsync(const std::string& index_id return {false, 409, "Rebuild already in progress for user: " + username}; } - // Load entry to get current element count + // Pre-fetch entry now — captured by lambdas so the thread never calls getIndexEntry auto entry = getIndexEntry(index_id); size_t current_count = entry->alg->getElementsCount(); - // Ensure at least one parameter differs if (new_M == meta->M && new_ef_con == meta->ef_con) { return {false, 400, "No configuration changes specified"}; } - // Register state FIRST with empty thread — hasActiveRebuild() now returns true 
immediately, - // blocking any concurrent rebuild requests before the thread is even spawned. + std::string base_path = data_dir_ + "/" + index_id; + std::string vector_storage_dir = base_path + "/vectors"; + + RebuildJobParams params{ + .index_id = index_id, + .username = username, + .new_M = new_M, + .new_ef_con = new_ef_con, + .space_type = entry->alg->getSpaceType(), + .dim = entry->alg->getDimension(), + .quant_level = entry->alg->getQuantLevel(), + .checksum = entry->alg->getChecksum(), + .max_elements = entry->alg->getMaxElements(), + .vector_storage = entry->vector_storage, + .temp_path = Rebuild::getTempPath(base_path), + .timestamped_path = Rebuild::getTimestampedPath(base_path), + .index_path = vector_storage_dir + "/" + settings::DEFAULT_SUBINDEX + ".idx", + .num_parallel_inserts = settings::NUM_PARALLEL_INSERTS, + .operation_mutex = &entry->operation_mutex, + .save_current_index = [this, entry]() { saveIndexInternal(*entry); }, + .swap_alg = [entry](auto fresh) { entry->alg = std::move(fresh); }, + .update_metadata = [this, index_id, entry](size_t nm, size_t nef) { + auto m = metadata_manager_->getMetadata(index_id); + if (m) { + m->M = nm; + m->ef_con = nef; + m->total_elements = entry->alg->getElementsCount(); + metadata_manager_->storeMetadata(index_id, *m); + } + }, + .clear_dirty = [entry]() { entry->is_dirty = false; }, + .wire_fetchers = [](auto* alg, auto vs) { IndexManager::wireVectorFetchers(alg, vs); }, + .parallel_add = [](size_t n, size_t t, std::function fn) { + IndexManager::parallelAddPoints(n, t, std::move(fn)); + }, + }; + + // Register state FIRST with empty thread — hasActiveRebuild() returns true immediately rebuild_.setActiveRebuild(username, index_id, current_count, std::jthread{}); - // THEN spawn thread - std::jthread t([this, index_id, username, new_M, new_ef_con](std::stop_token st) { - executeRebuildJob(index_id, username, new_M, new_ef_con, st); + // Spawn thread — lambda calls rebuild_.executeJob directly (execution lives 
in Rebuild) + std::jthread t([this, params = std::move(params)](std::stop_token st) mutable { + rebuild_.executeJob(params, st); }); // Move real thread into the already-registered state rebuild_.attachRebuildThread(username, std::move(t)); - LOG_INFO(2050, index_id, "Rebuild started: M=" << new_M - << " ef_con=" << new_ef_con); - + LOG_INFO(2050, index_id, "Rebuild started: M=" << new_M << " ef_con=" << new_ef_con); return {true, 202, "Rebuild started"}; } - -// executeRebuildJob lives in IndexManager (not Rebuild) because it needs direct access to -// CacheEntry, parallelAddPoints, and saveIndexInternal. Rebuild is a state-tracker only — -// moving execution here would create a circular dependency with IndexManager. -inline void IndexManager::executeRebuildJob(const std::string& index_id, - const std::string& username, - size_t new_M, size_t new_ef_con, - std::stop_token st) { - std::string base_path = data_dir_ + "/" + index_id; - std::string temp_path = rebuild_.getTempPath(base_path); - std::string timestamped_path = rebuild_.getTimestampedPath(base_path); - std::string vector_storage_dir = base_path + "/vectors"; - std::string index_path = vector_storage_dir + "/" + settings::DEFAULT_SUBINDEX + ".idx"; - - try { - auto entry = getIndexEntry(index_id); - - // Hold operation_mutex for entire rebuild — writes timeout, searches continue - std::unique_lock operation_lock(entry->operation_mutex); - - // Phase 1 — Save current state - saveIndexInternal(*entry); - - // Read current config from the existing HNSW graph - auto space_type = entry->alg->getSpaceType(); - size_t dim = entry->alg->getDimension(); - auto quant_level = entry->alg->getQuantLevel(); - int32_t checksum = entry->alg->getChecksum(); - size_t max_elements = entry->alg->getMaxElements(); - - // Phase 2 — Build new HNSW (same max_elements as current index) - auto new_alg = std::make_unique>( - max_elements, space_type, dim, new_M, new_ef_con, - settings::RANDOM_SEED, quant_level, checksum); - - // 
MUST wire fetchers before addPoint — searchBaseLayer needs this for base-layer-only nodes - wireVectorFetchers(new_alg.get(), entry->vector_storage); - - // Iterate VectorStore and re-insert all vectors - auto cursor = entry->vector_storage->getCursor(); - const size_t batch_size = settings::RECOVERY_BATCH_SIZE; - size_t total_processed = 0; - size_t batches_since_checkpoint = 0; - constexpr size_t CHECKPOINT_INTERVAL = 5; // Save temp every 5 batches - - while (cursor.hasNext()) { - if (st.stop_requested()) { - if (std::filesystem::exists(temp_path)) { - std::filesystem::remove(temp_path); - } - rebuild_.failActiveRebuild(username, "Rebuild interrupted by server shutdown"); - return; - } - - // Collect batch - std::vector>> batch; - batch.reserve(batch_size); - while (cursor.hasNext() && batch.size() < batch_size) { - auto [label, vec_bytes] = cursor.next(); - if (!vec_bytes.empty()) { - batch.emplace_back(label, std::move(vec_bytes)); - } - } - - if (batch.empty()) { - break; - } - - // Multi-threaded insert (shared utility with addVectors) - parallelAddPoints(batch.size(), settings::NUM_PARALLEL_INSERTS, - [&](size_t i) { - const auto& [label, vec_bytes] = batch[i]; - new_alg->addPoint(vec_bytes.data(), label); - }); - - total_processed += batch.size(); - - // Update progress - auto state = rebuild_.getActiveRebuild(username); - if (state) { - state->vectors_processed.store(total_processed); - } - - // Periodic checkpoint save - batches_since_checkpoint++; - if (batches_since_checkpoint >= CHECKPOINT_INTERVAL) { - new_alg->saveIndex(temp_path); - batches_since_checkpoint = 0; - } - } - - // Phase 3 — Save final + Copy + Swap - - // Save new graph to timestamped file - new_alg->saveIndex(timestamped_path); - - // Copy to canonical name (overwrites old default.idx on disk) - std::filesystem::copy_file(timestamped_path, index_path, - std::filesystem::copy_options::overwrite_existing); - - // Cannot call reloadIndex() here — we hold operation_mutex and reloadIndex 
acquires - // indices_mutex_, while deleteIndex holds indices_mutex_ then acquires operation_mutex. - // Calling reloadIndex here would deadlock with a concurrent delete on the same index. - auto fresh_alg = std::make_unique>(index_path, 0); - wireVectorFetchers(fresh_alg.get(), entry->vector_storage); - entry->alg = std::move(fresh_alg); - - // Both files are deleted here on success. If the server crashes before reaching this - // point, the timestamped file (default.idx.) may be left on disk — it is safe - // to delete manually on next startup as it does not affect index correctness. - if (std::filesystem::exists(temp_path)) { - std::filesystem::remove(temp_path); - } - if (std::filesystem::exists(timestamped_path)) { - std::filesystem::remove(timestamped_path); - } - - // Update metadata with new config - auto meta = metadata_manager_->getMetadata(index_id); - if (meta) { - meta->M = new_M; - meta->ef_con = new_ef_con; - meta->total_elements = entry->alg->getElementsCount(); - metadata_manager_->storeMetadata(index_id, *meta); - } - - entry->is_dirty = false; // We just saved the new graph - - LOG_INFO(2051, index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); - rebuild_.completeActiveRebuild(username); - - } catch (const std::exception& e) { - LOG_ERROR(2052, index_id, "Rebuild failed: " << e.what()); - - // Cleanup temp file on error - if (std::filesystem::exists(temp_path)) { - std::filesystem::remove(temp_path); - } - rebuild_.failActiveRebuild(username, e.what()); - } -} diff --git a/src/core/rebuild.hpp b/src/core/rebuild.hpp index fdd2591843..f7727514a7 100644 --- a/src/core/rebuild.hpp +++ b/src/core/rebuild.hpp @@ -5,16 +5,22 @@ #include #include #include +#include #include #include #include +#include #include #include +#include #include #include "settings.hpp" #include "log.hpp" #include "json/nlohmann_json.hpp" +#include "hnsw/hnswlib.h" +#include "vector_storage.hpp" +#include "../quant/common.hpp" struct RebuildResult { bool 
success; @@ -33,6 +39,45 @@ struct ActiveRebuild { std::jthread thread; // jthread: built-in stop_token + auto-join on destruction }; +// Parameters passed to Rebuild::executeJob. IndexManager-specific operations are +// provided as callbacks so rebuild.hpp does not need to include ndd.hpp. +struct RebuildJobParams { + // Identity + std::string index_id; + std::string username; + size_t new_M; + size_t new_ef_con; + + // Current graph config (read from entry->alg by IndexManager before thread spawn) + hnswlib::SpaceType space_type; + size_t dim; + ndd::quant::QuantizationLevel quant_level; + int32_t checksum; + size_t max_elements; + + // Storage for vector iteration + std::shared_ptr vector_storage; + + // File paths + std::string temp_path; + std::string timestamped_path; + std::string index_path; + + // Threading + size_t num_parallel_inserts; + + // Mutex pointer — executeJob acquires this for the whole job duration + std::shared_mutex* operation_mutex; + + // Callbacks for IndexManager-specific actions (avoids circular ndd.hpp include) + std::function save_current_index; + std::function>)> swap_alg; + std::function update_metadata; + std::function clear_dirty; + std::function*, std::shared_ptr)> wire_fetchers; + std::function)> parallel_add; +}; + class Rebuild { private: // Keyed by username — one rebuild per user at a time @@ -200,4 +245,91 @@ class Rebuild { ); return index_dir + "/vectors/" + settings::DEFAULT_SUBINDEX + ".idx." + ts; } + + // Owns all rebuild execution. Called directly from the jthread lambda spawned in + // rebuildIndexAsync. IndexManager-specific operations come in via p callbacks. 
+ void executeJob(const RebuildJobParams& p, std::stop_token st) { + try { + std::unique_lock op_lock(*p.operation_mutex); + + // Phase 1 — save current state before rebuilding + p.save_current_index(); + + // Phase 2 — build new HNSW with updated M/ef_con + auto new_alg = std::make_unique>( + p.max_elements, p.space_type, p.dim, p.new_M, p.new_ef_con, + settings::RANDOM_SEED, p.quant_level, p.checksum); + + // MUST wire fetchers before addPoint — searchBaseLayer needs this for base-layer-only nodes + p.wire_fetchers(new_alg.get(), p.vector_storage); + + auto cursor = p.vector_storage->getCursor(); + const size_t batch_size = settings::RECOVERY_BATCH_SIZE; + size_t total_processed = 0; + size_t batches_since_checkpoint = 0; + constexpr size_t CHECKPOINT_INTERVAL = 5; + + while (cursor.hasNext()) { + if (st.stop_requested()) { + if (std::filesystem::exists(p.temp_path)) + std::filesystem::remove(p.temp_path); + failActiveRebuild(p.username, "Rebuild interrupted by server shutdown"); + return; + } + + std::vector>> batch; + batch.reserve(batch_size); + while (cursor.hasNext() && batch.size() < batch_size) { + auto [label, vec_bytes] = cursor.next(); + if (!vec_bytes.empty()) + batch.emplace_back(label, std::move(vec_bytes)); + } + if (batch.empty()) break; + + p.parallel_add(batch.size(), p.num_parallel_inserts, + [&](size_t i) { + const auto& [label, vec_bytes] = batch[i]; + new_alg->addPoint(vec_bytes.data(), label); + }); + + total_processed += batch.size(); + auto state = getActiveRebuild(p.username); + if (state) state->vectors_processed.store(total_processed); + + if (++batches_since_checkpoint >= CHECKPOINT_INTERVAL) { + new_alg->saveIndex(p.temp_path); + batches_since_checkpoint = 0; + } + } + + // Phase 3 — save final, copy to canonical path, load fresh from disk + new_alg->saveIndex(p.timestamped_path); + std::filesystem::copy_file(p.timestamped_path, p.index_path, + std::filesystem::copy_options::overwrite_existing); + + // Cannot call reloadIndex() here — 
we hold operation_mutex and reloadIndex acquires + // indices_mutex_, while deleteIndex holds indices_mutex_ then acquires operation_mutex. + // Calling reloadIndex here would deadlock with a concurrent delete on the same index. + auto fresh_alg = std::make_unique>(p.index_path, 0); + p.wire_fetchers(fresh_alg.get(), p.vector_storage); + + // Both files are deleted here on success. If the server crashes before reaching this + // point, the timestamped file (default.idx.) may be left on disk — it is safe + // to delete manually on next startup as it does not affect index correctness. + if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); + if (std::filesystem::exists(p.timestamped_path)) std::filesystem::remove(p.timestamped_path); + + p.swap_alg(std::move(fresh_alg)); + p.update_metadata(p.new_M, p.new_ef_con); + p.clear_dirty(); + + LOG_INFO(2051, p.index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); + completeActiveRebuild(p.username); + + } catch (const std::exception& e) { + LOG_ERROR(2052, p.index_id, "Rebuild failed: " << e.what()); + if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); + failActiveRebuild(p.username, e.what()); + } + } }; From 3b32a2e902b87602d3ea8a91a33396869a76f2ac Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Thu, 23 Apr 2026 15:27:29 +0530 Subject: [PATCH 06/29] correction on cleantempfiles and error code --- src/core/ndd.hpp | 30 ++++++++++-------- src/core/rebuild.hpp | 72 ++++++++++++++++++++++---------------------- src/main.cpp | 6 ++-- src/utils/types.hpp | 11 +++++++ 4 files changed, 67 insertions(+), 52 deletions(-) create mode 100644 src/utils/types.hpp diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 17f34ffc00..e3d062dd03 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -1914,10 +1914,14 @@ class IndexManager { // ========== Rebuild operations ========== - // Orchestration method (defined below after class) - RebuildResult 
rebuildIndexAsync(const std::string& index_id, - size_t new_M, - size_t new_ef_con); + // Return codes: + // 0: rebuild started successfully + // 1: index not found + // 2: rebuild or backup already in progress for this user + // 3: no configuration changes specified / invalid parameters + OperationResult rebuildIndexAsync(const std::string& index_id, + size_t new_M, + size_t new_ef_con); bool hasActiveRebuild(const std::string& username) const { return rebuild_.hasActiveRebuild(username); @@ -2320,12 +2324,12 @@ inline std::pair IndexManager::uploadBackup(const std::string // ========== IndexManager rebuild implementations ========== -inline RebuildResult IndexManager::rebuildIndexAsync(const std::string& index_id, - size_t new_M, - size_t new_ef_con) { +inline OperationResult IndexManager::rebuildIndexAsync(const std::string& index_id, + size_t new_M, + size_t new_ef_con) { auto meta = metadata_manager_->getMetadata(index_id); if (!meta) { - return {false, 404, "Index not found"}; + return {1, "Index not found"}; } std::string username; @@ -2333,14 +2337,14 @@ inline RebuildResult IndexManager::rebuildIndexAsync(const std::string& index_id if (pos != std::string::npos) { username = index_id.substr(0, pos); } else { - return {false, 400, "Invalid index ID format"}; + return {3, "Invalid index ID format"}; } if (backup_store_.hasActiveBackup(username)) { - return {false, 409, "Backup already in progress for user: " + username}; + return {2, "Backup already in progress for user: " + username}; } if (rebuild_.hasActiveRebuild(username)) { - return {false, 409, "Rebuild already in progress for user: " + username}; + return {2, "Rebuild already in progress for user: " + username}; } // Pre-fetch entry now — captured by lambdas so the thread never calls getIndexEntry @@ -2348,7 +2352,7 @@ inline RebuildResult IndexManager::rebuildIndexAsync(const std::string& index_id size_t current_count = entry->alg->getElementsCount(); if (new_M == meta->M && new_ef_con == 
meta->ef_con) { - return {false, 400, "No configuration changes specified"}; + return {3, "No configuration changes specified"}; } std::string base_path = data_dir_ + "/" + index_id; @@ -2400,5 +2404,5 @@ inline RebuildResult IndexManager::rebuildIndexAsync(const std::string& index_id rebuild_.attachRebuildThread(username, std::move(t)); LOG_INFO(2050, index_id, "Rebuild started: M=" << new_M << " ef_con=" << new_ef_con); - return {true, 202, "Rebuild started"}; + return {0, "Rebuild started"}; } diff --git a/src/core/rebuild.hpp b/src/core/rebuild.hpp index f7727514a7..25ae997941 100644 --- a/src/core/rebuild.hpp +++ b/src/core/rebuild.hpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -21,19 +20,14 @@ #include "hnsw/hnswlib.h" #include "vector_storage.hpp" #include "../quant/common.hpp" - -struct RebuildResult { - bool success; - int http_code; - std::string message; -}; +#include "utils/types.hpp" struct ActiveRebuild { std::string index_id; std::string status{"in_progress"}; // "in_progress", "completed", "failed" std::string error_message; - std::atomic vectors_processed{0}; - std::atomic total_vectors{0}; + size_t vectors_processed{0}; + size_t total_vectors{0}; std::chrono::system_clock::time_point started_at; std::chrono::system_clock::time_point completed_at; std::jthread thread; // jthread: built-in stop_token + auto-join on destruction @@ -103,9 +97,15 @@ class Rebuild { } try { std::string temp_filename = std::string(settings::DEFAULT_SUBINDEX) + ".idx.temp"; + std::string ts_prefix = std::string(settings::DEFAULT_SUBINDEX) + ".idx."; for (const auto& entry : std::filesystem::recursive_directory_iterator(data_dir)) { - if (entry.is_regular_file() && - entry.path().filename().string() == temp_filename) { + if (!entry.is_regular_file()) continue; + const std::string fname = entry.path().filename().string(); + bool is_temp = (fname == temp_filename); + bool is_ts = fname.size() > ts_prefix.size() + && fname.substr(0, 
ts_prefix.size()) == ts_prefix + && std::all_of(fname.begin() + ts_prefix.size(), fname.end(), ::isdigit); + if (is_temp || is_ts) { std::filesystem::remove(entry.path()); } } @@ -122,8 +122,8 @@ class Rebuild { auto state = std::make_shared(); state->index_id = index_id; state->status = "in_progress"; - state->total_vectors.store(total_vectors); - state->vectors_processed.store(0); + state->total_vectors = total_vectors; + state->vectors_processed = 0; state->started_at = std::chrono::system_clock::now(); state->thread = std::move(thread); active_rebuilds_[username] = state; @@ -193,39 +193,40 @@ class Rebuild { } } + void updateProgress(const std::string& username, size_t processed) { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + it->second->vectors_processed = processed; + } + } + nlohmann::json getProgress(const std::string& username, const std::string& index_id) const { - auto state = getActiveRebuild(username); - if (state && state->index_id == index_id) { - size_t processed = state->vectors_processed.load(); - size_t total = state->total_vectors.load(); + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end() && it->second->index_id == index_id) { + const auto& state = *it->second; + size_t processed = state.vectors_processed; + size_t total = state.total_vectors; double percent = total > 0 ? 
(100.0 * processed / total) : 0.0; nlohmann::json result = { - {"status", state->status}, + {"status", state.status}, {"vectors_processed", processed}, {"total_vectors", total}, {"percent_complete", percent}, - {"started_at", formatTime(state->started_at)} + {"started_at", formatTime(state.started_at)} }; - if (state->status == "completed" || state->status == "failed") { - result["completed_at"] = formatTime(state->completed_at); + if (state.status == "completed" || state.status == "failed") { + result["completed_at"] = formatTime(state.completed_at); } - if (state->status == "failed" && !state->error_message.empty()) { - result["error"] = state->error_message; + if (state.status == "failed" && !state.error_message.empty()) { + result["error"] = state.error_message; } return result; } return {{"status", "idle"}}; } - std::shared_ptr getActiveRebuild(const std::string& username) const { - std::lock_guard lock(rebuild_state_mutex_); - auto it = active_rebuilds_.find(username); - if (it != active_rebuilds_.end()) { - return it->second; - } - return nullptr; - } - // Format state as JSON fields static std::string formatTime(std::chrono::system_clock::time_point tp) { return timeToISO8601(tp); @@ -293,8 +294,7 @@ class Rebuild { }); total_processed += batch.size(); - auto state = getActiveRebuild(p.username); - if (state) state->vectors_processed.store(total_processed); + updateProgress(p.username, total_processed); if (++batches_since_checkpoint >= CHECKPOINT_INTERVAL) { new_alg->saveIndex(p.temp_path); @@ -314,8 +314,8 @@ class Rebuild { p.wire_fetchers(fresh_alg.get(), p.vector_storage); // Both files are deleted here on success. If the server crashes before reaching this - // point, the timestamped file (default.idx.) may be left on disk — it is safe - // to delete manually on next startup as it does not affect index correctness. + // point, the timestamped file (default.idx.) 
will be removed on next startup + // by cleanupTempFiles — it does not affect index correctness. if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); if (std::filesystem::exists(p.timestamped_path)) std::filesystem::remove(p.timestamped_path); diff --git a/src/main.cpp b/src/main.cpp index 69dfb908f8..05cbf21ac8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -747,9 +747,9 @@ int main(int argc, char** argv) { try { auto result = index_manager.rebuildIndexAsync(index_id, new_M, new_ef_con); - if (!result.success) { - return json_error(result.http_code, result.message); - } + if (result.code == 1) return json_error(404, result.message); + if (result.code == 2) return json_error(409, result.message); + if (result.code == 3) return json_error(400, result.message); crow::json::wvalue response; response["status"] = "rebuilding"; diff --git a/src/utils/types.hpp b/src/utils/types.hpp new file mode 100644 index 0000000000..e45d13678f --- /dev/null +++ b/src/utils/types.hpp @@ -0,0 +1,11 @@ +#pragma once +#include + +// Generic operation result returned by async and sync operations. +// Each function documents its return codes in comments above its declaration. +// Code 0 always means success. Non-zero codes are operation-specific. +// Codes can be conglomerated into ENUMs per operation as the codebase matures. 
+struct OperationResult { + unsigned char code; // 0 = success, non-zero = error (operation-specific) + std::string message; +}; From 13fe6ce31d371141904e5cfe5700aeae448ad5af Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Thu, 23 Apr 2026 16:15:14 +0530 Subject: [PATCH 07/29] rebuild status enum and logs code change --- docs/logs.md | 1 + src/core/ndd.hpp | 4 ++-- src/core/rebuild.hpp | 42 ++++++++++++++++++++++++++++-------------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/docs/logs.md b/docs/logs.md index 5ceb9d78b2..b8829df3c7 100644 --- a/docs/logs.md +++ b/docs/logs.md @@ -88,6 +88,7 @@ The same overload shapes apply to `LOG_WARN` and `LOG_ERROR`. - `1500s` metadata logs - `1600s` vector storage logs - `1700s` system sanity checks (CPU compatibility, disk, memory, ulimits) + - `1800s` rebuild subsystem logs - `2000s` index manager logs - `2100s` HNSW load/cache logs diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index e3d062dd03..087e31b215 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -2393,7 +2393,7 @@ inline OperationResult IndexManager::rebuildIndexAsync(const std::string& index_ }; // Register state FIRST with empty thread — hasActiveRebuild() returns true immediately - rebuild_.setActiveRebuild(username, index_id, current_count, std::jthread{}); + rebuild_.setActiveRebuild(username, index_id, current_count); // Spawn thread — lambda calls rebuild_.executeJob directly (execution lives in Rebuild) std::jthread t([this, params = std::move(params)](std::stop_token st) mutable { @@ -2403,6 +2403,6 @@ inline OperationResult IndexManager::rebuildIndexAsync(const std::string& index_ // Move real thread into the already-registered state rebuild_.attachRebuildThread(username, std::move(t)); - LOG_INFO(2050, index_id, "Rebuild started: M=" << new_M << " ef_con=" << new_ef_con); + LOG_INFO(1800, index_id, "Rebuild started: M=" << new_M << " ef_con=" << new_ef_con); return {0, "Rebuild started"}; } diff --git 
a/src/core/rebuild.hpp b/src/core/rebuild.hpp index 25ae997941..97fdd369ae 100644 --- a/src/core/rebuild.hpp +++ b/src/core/rebuild.hpp @@ -22,9 +22,15 @@ #include "../quant/common.hpp" #include "utils/types.hpp" +enum class RebuildStatus : unsigned char { + IN_PROGRESS = 0, + COMPLETED = 1, + FAILED = 2 +}; + struct ActiveRebuild { std::string index_id; - std::string status{"in_progress"}; // "in_progress", "completed", "failed" + RebuildStatus status{RebuildStatus::IN_PROGRESS}; std::string error_message; size_t vectors_processed{0}; size_t total_vectors{0}; @@ -78,6 +84,15 @@ class Rebuild { std::unordered_map> active_rebuilds_; mutable std::mutex rebuild_state_mutex_; + static std::string statusToString(RebuildStatus s) { + switch (s) { + case RebuildStatus::IN_PROGRESS: return "in_progress"; + case RebuildStatus::COMPLETED: return "completed"; + case RebuildStatus::FAILED: return "failed"; + default: return "unknown"; + } + } + static std::string timeToISO8601(std::chrono::system_clock::time_point tp) { auto time_t_val = std::chrono::system_clock::to_time_t(tp); std::tm tm_val{}; @@ -110,22 +125,21 @@ class Rebuild { } } } catch (const std::exception& e) { - LOG_WARN(2053, "rebuild", "Failed to cleanup temp files on startup: " << e.what()); + LOG_WARN(1803, "rebuild", "Failed to cleanup temp files on startup: " << e.what()); } } // State tracking — per user void setActiveRebuild(const std::string& username, const std::string& index_id, - size_t total_vectors, std::jthread&& thread) { + size_t total_vectors) { std::lock_guard lock(rebuild_state_mutex_); auto state = std::make_shared(); state->index_id = index_id; - state->status = "in_progress"; + state->status = RebuildStatus::IN_PROGRESS; state->total_vectors = total_vectors; state->vectors_processed = 0; state->started_at = std::chrono::system_clock::now(); - state->thread = std::move(thread); active_rebuilds_[username] = state; } @@ -137,7 +151,7 @@ class Rebuild { if (it->second->thread.joinable()) { 
it->second->thread.detach(); } - it->second->status = "completed"; + it->second->status = RebuildStatus::COMPLETED; it->second->completed_at = std::chrono::system_clock::now(); } } @@ -150,7 +164,7 @@ class Rebuild { if (it->second->thread.joinable()) { it->second->thread.detach(); } - it->second->status = "failed"; + it->second->status = RebuildStatus::FAILED; it->second->error_message = error; it->second->completed_at = std::chrono::system_clock::now(); } @@ -159,8 +173,8 @@ class Rebuild { bool hasActiveRebuild(const std::string& username) const { std::lock_guard lock(rebuild_state_mutex_); auto it = active_rebuilds_.find(username); - // Only "in_progress" blocks a new rebuild - return it != active_rebuilds_.end() && it->second->status == "in_progress"; + // Only IN_PROGRESS blocks a new rebuild + return it != active_rebuilds_.end() && it->second->status == RebuildStatus::IN_PROGRESS; } // Join all in-progress rebuild threads on shutdown. Mirrors BackupStore::joinAllThreads: @@ -210,16 +224,16 @@ class Rebuild { size_t total = state.total_vectors; double percent = total > 0 ? 
(100.0 * processed / total) : 0.0; nlohmann::json result = { - {"status", state.status}, + {"status", statusToString(state.status)}, {"vectors_processed", processed}, {"total_vectors", total}, {"percent_complete", percent}, {"started_at", formatTime(state.started_at)} }; - if (state.status == "completed" || state.status == "failed") { + if (state.status == RebuildStatus::COMPLETED || state.status == RebuildStatus::FAILED) { result["completed_at"] = formatTime(state.completed_at); } - if (state.status == "failed" && !state.error_message.empty()) { + if (state.status == RebuildStatus::FAILED && !state.error_message.empty()) { result["error"] = state.error_message; } return result; @@ -323,11 +337,11 @@ class Rebuild { p.update_metadata(p.new_M, p.new_ef_con); p.clear_dirty(); - LOG_INFO(2051, p.index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); + LOG_INFO(1801, p.index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); completeActiveRebuild(p.username); } catch (const std::exception& e) { - LOG_ERROR(2052, p.index_id, "Rebuild failed: " << e.what()); + LOG_ERROR(1802, p.index_id, "Rebuild failed: " << e.what()); if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); failActiveRebuild(p.username, e.what()); } From 6f4083e0515d8de577c872afa4b824a3bab0e3e0 Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Thu, 23 Apr 2026 16:34:44 +0530 Subject: [PATCH 08/29] rebuild.cpp --- CMakeLists.txt | 1 + src/core/ndd.hpp | 1 + src/core/rebuild.cpp | 258 +++++++++++++++++++++++++++++++++++++++ src/core/rebuild.hpp | 284 +++---------------------------------------- 4 files changed, 279 insertions(+), 265 deletions(-) create mode 100644 src/core/rebuild.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fa15b0d93..ca5f37e564 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -254,6 +254,7 @@ message(STATUS "Binary name: ${NDD_BINARY_NAME}") set(NDD_CORE_SOURCES src/sparse/inverted_index.cpp 
src/utils/system_sanity/system_sanity.cpp + src/core/rebuild.cpp ) # Build non-main project sources separately so they can be compiled in parallel diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 087e31b215..ccee4d756e 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -198,6 +198,7 @@ struct PersistenceConfig { #include "../storage/backup_store.hpp" #include "rebuild.hpp" +#include "utils/types.hpp" class IndexManager { private: diff --git a/src/core/rebuild.cpp b/src/core/rebuild.cpp new file mode 100644 index 0000000000..96cd9410ea --- /dev/null +++ b/src/core/rebuild.cpp @@ -0,0 +1,258 @@ +#include "rebuild.hpp" + +#include +#include +#include + +#include "settings.hpp" +#include "log.hpp" +#include "utils/types.hpp" + +std::string Rebuild::statusToString(RebuildStatus s) { + switch (s) { + case RebuildStatus::IN_PROGRESS: return "in_progress"; + case RebuildStatus::COMPLETED: return "completed"; + case RebuildStatus::FAILED: return "failed"; + default: return "unknown"; + } +} + +std::string Rebuild::timeToISO8601(std::chrono::system_clock::time_point tp) { + auto time_t_val = std::chrono::system_clock::to_time_t(tp); + std::tm tm_val{}; + gmtime_r(&time_t_val, &tm_val); + std::ostringstream oss; + oss << std::put_time(&tm_val, "%Y-%m-%dT%H:%M:%SZ"); + return oss.str(); +} + +void Rebuild::cleanupTempFiles(const std::string& data_dir) { + if (!std::filesystem::exists(data_dir)) { + return; + } + try { + std::string temp_filename = std::string(settings::DEFAULT_SUBINDEX) + ".idx.temp"; + std::string ts_prefix = std::string(settings::DEFAULT_SUBINDEX) + ".idx."; + for (const auto& entry : std::filesystem::recursive_directory_iterator(data_dir)) { + if (!entry.is_regular_file()) continue; + const std::string fname = entry.path().filename().string(); + bool is_temp = (fname == temp_filename); + bool is_ts = fname.size() > ts_prefix.size() + && fname.substr(0, ts_prefix.size()) == ts_prefix + && std::all_of(fname.begin() + ts_prefix.size(), fname.end(), 
::isdigit); + if (is_temp || is_ts) { + std::filesystem::remove(entry.path()); + } + } + } catch (const std::exception& e) { + LOG_WARN(1803, "rebuild", "Failed to cleanup temp files on startup: " << e.what()); + } +} + +void Rebuild::setActiveRebuild(const std::string& username, const std::string& index_id, + size_t total_vectors) { + std::lock_guard lock(rebuild_state_mutex_); + auto state = std::make_shared(); + state->index_id = index_id; + state->status = RebuildStatus::IN_PROGRESS; + state->total_vectors = total_vectors; + state->vectors_processed = 0; + state->started_at = std::chrono::system_clock::now(); + active_rebuilds_[username] = state; +} + +void Rebuild::completeActiveRebuild(const std::string& username) { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + // Called from within the thread — detach so the jthread dtor doesn't join us + if (it->second->thread.joinable()) { + it->second->thread.detach(); + } + it->second->status = RebuildStatus::COMPLETED; + it->second->completed_at = std::chrono::system_clock::now(); + } +} + +void Rebuild::failActiveRebuild(const std::string& username, const std::string& error) { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + // Called from within the thread — detach so the jthread dtor doesn't join us + if (it->second->thread.joinable()) { + it->second->thread.detach(); + } + it->second->status = RebuildStatus::FAILED; + it->second->error_message = error; + it->second->completed_at = std::chrono::system_clock::now(); + } +} + +bool Rebuild::hasActiveRebuild(const std::string& username) const { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + // Only IN_PROGRESS blocks a new rebuild + return it != active_rebuilds_.end() && it->second->status == RebuildStatus::IN_PROGRESS; +} + +void Rebuild::joinAllThreads() { + 
std::vector threads_to_join; + { + std::lock_guard lock(rebuild_state_mutex_); + for (auto& [username, state] : active_rebuilds_) { + if (state->thread.joinable()) { + threads_to_join.push_back(std::move(state->thread)); + } + } + active_rebuilds_.clear(); + } + for (auto& t : threads_to_join) { + t.request_stop(); + if (t.joinable()) { + t.join(); + } + } +} + +void Rebuild::attachRebuildThread(const std::string& username, std::jthread&& thread) { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + it->second->thread = std::move(thread); + } +} + +void Rebuild::updateProgress(const std::string& username, size_t processed) { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end()) { + it->second->vectors_processed = processed; + } +} + +nlohmann::json Rebuild::getProgress(const std::string& username, const std::string& index_id) const { + std::lock_guard lock(rebuild_state_mutex_); + auto it = active_rebuilds_.find(username); + if (it != active_rebuilds_.end() && it->second->index_id == index_id) { + const auto& state = *it->second; + size_t processed = state.vectors_processed; + size_t total = state.total_vectors; + double percent = total > 0 ? 
(100.0 * processed / total) : 0.0; + nlohmann::json result = { + {"status", statusToString(state.status)}, + {"vectors_processed", processed}, + {"total_vectors", total}, + {"percent_complete", percent}, + {"started_at", formatTime(state.started_at)} + }; + if (state.status == RebuildStatus::COMPLETED || state.status == RebuildStatus::FAILED) { + result["completed_at"] = formatTime(state.completed_at); + } + if (state.status == RebuildStatus::FAILED && !state.error_message.empty()) { + result["error"] = state.error_message; + } + return result; + } + return {{"status", "idle"}}; +} + +std::string Rebuild::formatTime(std::chrono::system_clock::time_point tp) { + return timeToISO8601(tp); +} + +std::string Rebuild::getTempPath(const std::string& index_dir) { + return index_dir + "/vectors/" + settings::DEFAULT_SUBINDEX + ".idx.temp"; +} + +std::string Rebuild::getTimestampedPath(const std::string& index_dir) { + auto ts = std::to_string( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch() + ).count() + ); + return index_dir + "/vectors/" + settings::DEFAULT_SUBINDEX + ".idx." 
+ ts; +} + +void Rebuild::executeJob(const RebuildJobParams& p, std::stop_token st) { + try { + std::unique_lock op_lock(*p.operation_mutex); + + // Phase 1 — save current state before rebuilding + p.save_current_index(); + + // Phase 2 — build new HNSW with updated M/ef_con + auto new_alg = std::make_unique>( + p.max_elements, p.space_type, p.dim, p.new_M, p.new_ef_con, + settings::RANDOM_SEED, p.quant_level, p.checksum); + + // MUST wire fetchers before addPoint — searchBaseLayer needs this for base-layer-only nodes + p.wire_fetchers(new_alg.get(), p.vector_storage); + + auto cursor = p.vector_storage->getCursor(); + const size_t batch_size = settings::RECOVERY_BATCH_SIZE; + size_t total_processed = 0; + size_t batches_since_checkpoint = 0; + constexpr size_t CHECKPOINT_INTERVAL = 5; + + while (cursor.hasNext()) { + if (st.stop_requested()) { + if (std::filesystem::exists(p.temp_path)) + std::filesystem::remove(p.temp_path); + failActiveRebuild(p.username, "Rebuild interrupted by server shutdown"); + return; + } + + std::vector>> batch; + batch.reserve(batch_size); + while (cursor.hasNext() && batch.size() < batch_size) { + auto [label, vec_bytes] = cursor.next(); + if (!vec_bytes.empty()) + batch.emplace_back(label, std::move(vec_bytes)); + } + if (batch.empty()) break; + + p.parallel_add(batch.size(), p.num_parallel_inserts, + [&](size_t i) { + const auto& [label, vec_bytes] = batch[i]; + new_alg->addPoint(vec_bytes.data(), label); + }); + + total_processed += batch.size(); + updateProgress(p.username, total_processed); + + if (++batches_since_checkpoint >= CHECKPOINT_INTERVAL) { + new_alg->saveIndex(p.temp_path); + batches_since_checkpoint = 0; + } + } + + // Phase 3 — save final, copy to canonical path, load fresh from disk + new_alg->saveIndex(p.timestamped_path); + std::filesystem::copy_file(p.timestamped_path, p.index_path, + std::filesystem::copy_options::overwrite_existing); + + // Cannot call reloadIndex() here — we hold operation_mutex and reloadIndex 
acquires + // indices_mutex_, while deleteIndex holds indices_mutex_ then acquires operation_mutex. + // Calling reloadIndex here would deadlock with a concurrent delete on the same index. + auto fresh_alg = std::make_unique>(p.index_path, 0); + p.wire_fetchers(fresh_alg.get(), p.vector_storage); + + // Both files are deleted here on success. If the server crashes before reaching this + // point, the timestamped file (default.idx.) will be removed on next startup + // by cleanupTempFiles — it does not affect index correctness. + if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); + if (std::filesystem::exists(p.timestamped_path)) std::filesystem::remove(p.timestamped_path); + + p.swap_alg(std::move(fresh_alg)); + p.update_metadata(p.new_M, p.new_ef_con); + p.clear_dirty(); + + LOG_INFO(1801, p.index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); + completeActiveRebuild(p.username); + + } catch (const std::exception& e) { + LOG_ERROR(1802, p.index_id, "Rebuild failed: " << e.what()); + if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); + failActiveRebuild(p.username, e.what()); + } +} diff --git a/src/core/rebuild.hpp b/src/core/rebuild.hpp index 97fdd369ae..9b474a1475 100644 --- a/src/core/rebuild.hpp +++ b/src/core/rebuild.hpp @@ -7,20 +7,14 @@ #include #include #include -#include -#include -#include -#include -#include #include +#include +#include -#include "settings.hpp" -#include "log.hpp" #include "json/nlohmann_json.hpp" #include "hnsw/hnswlib.h" #include "vector_storage.hpp" #include "../quant/common.hpp" -#include "utils/types.hpp" enum class RebuildStatus : unsigned char { IN_PROGRESS = 0, @@ -80,270 +74,30 @@ struct RebuildJobParams { class Rebuild { private: - // Keyed by username — one rebuild per user at a time std::unordered_map> active_rebuilds_; mutable std::mutex rebuild_state_mutex_; - static std::string statusToString(RebuildStatus s) { - switch (s) { - case 
RebuildStatus::IN_PROGRESS: return "in_progress"; - case RebuildStatus::COMPLETED: return "completed"; - case RebuildStatus::FAILED: return "failed"; - default: return "unknown"; - } - } - - static std::string timeToISO8601(std::chrono::system_clock::time_point tp) { - auto time_t_val = std::chrono::system_clock::to_time_t(tp); - std::tm tm_val{}; - gmtime_r(&time_t_val, &tm_val); - std::ostringstream oss; - oss << std::put_time(&tm_val, "%Y-%m-%dT%H:%M:%SZ"); - return oss.str(); - } + static std::string statusToString(RebuildStatus s); + static std::string timeToISO8601(std::chrono::system_clock::time_point tp); public: Rebuild() = default; - // Lifecycle — cleanup temp files from interrupted rebuilds on startup - void cleanupTempFiles(const std::string& data_dir) { - if (!std::filesystem::exists(data_dir)) { - return; - } - try { - std::string temp_filename = std::string(settings::DEFAULT_SUBINDEX) + ".idx.temp"; - std::string ts_prefix = std::string(settings::DEFAULT_SUBINDEX) + ".idx."; - for (const auto& entry : std::filesystem::recursive_directory_iterator(data_dir)) { - if (!entry.is_regular_file()) continue; - const std::string fname = entry.path().filename().string(); - bool is_temp = (fname == temp_filename); - bool is_ts = fname.size() > ts_prefix.size() - && fname.substr(0, ts_prefix.size()) == ts_prefix - && std::all_of(fname.begin() + ts_prefix.size(), fname.end(), ::isdigit); - if (is_temp || is_ts) { - std::filesystem::remove(entry.path()); - } - } - } catch (const std::exception& e) { - LOG_WARN(1803, "rebuild", "Failed to cleanup temp files on startup: " << e.what()); - } - } - - // State tracking — per user + void cleanupTempFiles(const std::string& data_dir); void setActiveRebuild(const std::string& username, const std::string& index_id, - size_t total_vectors) { - std::lock_guard lock(rebuild_state_mutex_); - auto state = std::make_shared(); - state->index_id = index_id; - state->status = RebuildStatus::IN_PROGRESS; - state->total_vectors = 
total_vectors; - state->vectors_processed = 0; - state->started_at = std::chrono::system_clock::now(); - active_rebuilds_[username] = state; - } - - void completeActiveRebuild(const std::string& username) { - std::lock_guard lock(rebuild_state_mutex_); - auto it = active_rebuilds_.find(username); - if (it != active_rebuilds_.end()) { - // Called from within the thread — detach so the jthread dtor doesn't join us - if (it->second->thread.joinable()) { - it->second->thread.detach(); - } - it->second->status = RebuildStatus::COMPLETED; - it->second->completed_at = std::chrono::system_clock::now(); - } - } - - void failActiveRebuild(const std::string& username, const std::string& error) { - std::lock_guard lock(rebuild_state_mutex_); - auto it = active_rebuilds_.find(username); - if (it != active_rebuilds_.end()) { - // Called from within the thread — detach so the jthread dtor doesn't join us - if (it->second->thread.joinable()) { - it->second->thread.detach(); - } - it->second->status = RebuildStatus::FAILED; - it->second->error_message = error; - it->second->completed_at = std::chrono::system_clock::now(); - } - } - - bool hasActiveRebuild(const std::string& username) const { - std::lock_guard lock(rebuild_state_mutex_); - auto it = active_rebuilds_.find(username); - // Only IN_PROGRESS blocks a new rebuild - return it != active_rebuilds_.end() && it->second->status == RebuildStatus::IN_PROGRESS; - } - - // Join all in-progress rebuild threads on shutdown. Mirrors BackupStore::joinAllThreads: - // move threads out under lock, request_stop + join outside lock to avoid deadlock - // (finishing threads call completeActiveRebuild which also locks rebuild_state_mutex_). 
- void joinAllThreads() { - std::vector threads_to_join; - { - std::lock_guard lock(rebuild_state_mutex_); - for (auto& [username, state] : active_rebuilds_) { - if (state->thread.joinable()) { - threads_to_join.push_back(std::move(state->thread)); - } - } - active_rebuilds_.clear(); - } - for (auto& t : threads_to_join) { - t.request_stop(); - if (t.joinable()) { - t.join(); - } - } - } - - void attachRebuildThread(const std::string& username, std::jthread&& thread) { - std::lock_guard lock(rebuild_state_mutex_); - auto it = active_rebuilds_.find(username); - if (it != active_rebuilds_.end()) { - it->second->thread = std::move(thread); - } - } - - void updateProgress(const std::string& username, size_t processed) { - std::lock_guard lock(rebuild_state_mutex_); - auto it = active_rebuilds_.find(username); - if (it != active_rebuilds_.end()) { - it->second->vectors_processed = processed; - } - } - - nlohmann::json getProgress(const std::string& username, const std::string& index_id) const { - std::lock_guard lock(rebuild_state_mutex_); - auto it = active_rebuilds_.find(username); - if (it != active_rebuilds_.end() && it->second->index_id == index_id) { - const auto& state = *it->second; - size_t processed = state.vectors_processed; - size_t total = state.total_vectors; - double percent = total > 0 ? 
(100.0 * processed / total) : 0.0; - nlohmann::json result = { - {"status", statusToString(state.status)}, - {"vectors_processed", processed}, - {"total_vectors", total}, - {"percent_complete", percent}, - {"started_at", formatTime(state.started_at)} - }; - if (state.status == RebuildStatus::COMPLETED || state.status == RebuildStatus::FAILED) { - result["completed_at"] = formatTime(state.completed_at); - } - if (state.status == RebuildStatus::FAILED && !state.error_message.empty()) { - result["error"] = state.error_message; - } - return result; - } - return {{"status", "idle"}}; - } - - // Format state as JSON fields - static std::string formatTime(std::chrono::system_clock::time_point tp) { - return timeToISO8601(tp); - } - - // Path helpers - - static std::string getTempPath(const std::string& index_dir) { - return index_dir + "/vectors/" + settings::DEFAULT_SUBINDEX + ".idx.temp"; - } - - static std::string getTimestampedPath(const std::string& index_dir) { - auto ts = std::to_string( - std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch() - ).count() - ); - return index_dir + "/vectors/" + settings::DEFAULT_SUBINDEX + ".idx." + ts; - } - - // Owns all rebuild execution. Called directly from the jthread lambda spawned in - // rebuildIndexAsync. IndexManager-specific operations come in via p callbacks. 
- void executeJob(const RebuildJobParams& p, std::stop_token st) { - try { - std::unique_lock op_lock(*p.operation_mutex); - - // Phase 1 — save current state before rebuilding - p.save_current_index(); - - // Phase 2 — build new HNSW with updated M/ef_con - auto new_alg = std::make_unique>( - p.max_elements, p.space_type, p.dim, p.new_M, p.new_ef_con, - settings::RANDOM_SEED, p.quant_level, p.checksum); - - // MUST wire fetchers before addPoint — searchBaseLayer needs this for base-layer-only nodes - p.wire_fetchers(new_alg.get(), p.vector_storage); - - auto cursor = p.vector_storage->getCursor(); - const size_t batch_size = settings::RECOVERY_BATCH_SIZE; - size_t total_processed = 0; - size_t batches_since_checkpoint = 0; - constexpr size_t CHECKPOINT_INTERVAL = 5; - - while (cursor.hasNext()) { - if (st.stop_requested()) { - if (std::filesystem::exists(p.temp_path)) - std::filesystem::remove(p.temp_path); - failActiveRebuild(p.username, "Rebuild interrupted by server shutdown"); - return; - } - - std::vector>> batch; - batch.reserve(batch_size); - while (cursor.hasNext() && batch.size() < batch_size) { - auto [label, vec_bytes] = cursor.next(); - if (!vec_bytes.empty()) - batch.emplace_back(label, std::move(vec_bytes)); - } - if (batch.empty()) break; - - p.parallel_add(batch.size(), p.num_parallel_inserts, - [&](size_t i) { - const auto& [label, vec_bytes] = batch[i]; - new_alg->addPoint(vec_bytes.data(), label); - }); - - total_processed += batch.size(); - updateProgress(p.username, total_processed); - - if (++batches_since_checkpoint >= CHECKPOINT_INTERVAL) { - new_alg->saveIndex(p.temp_path); - batches_since_checkpoint = 0; - } - } - - // Phase 3 — save final, copy to canonical path, load fresh from disk - new_alg->saveIndex(p.timestamped_path); - std::filesystem::copy_file(p.timestamped_path, p.index_path, - std::filesystem::copy_options::overwrite_existing); - - // Cannot call reloadIndex() here — we hold operation_mutex and reloadIndex acquires - // 
indices_mutex_, while deleteIndex holds indices_mutex_ then acquires operation_mutex. - // Calling reloadIndex here would deadlock with a concurrent delete on the same index. - auto fresh_alg = std::make_unique>(p.index_path, 0); - p.wire_fetchers(fresh_alg.get(), p.vector_storage); - - // Both files are deleted here on success. If the server crashes before reaching this - // point, the timestamped file (default.idx.<timestamp>) will be removed on next startup - // by cleanupTempFiles — it does not affect index correctness. - if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); - if (std::filesystem::exists(p.timestamped_path)) std::filesystem::remove(p.timestamped_path); - - p.swap_alg(std::move(fresh_alg)); - p.update_metadata(p.new_M, p.new_ef_con); - p.clear_dirty(); - - LOG_INFO(1801, p.index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); - completeActiveRebuild(p.username); - - } catch (const std::exception& e) { - LOG_ERROR(1802, p.index_id, "Rebuild failed: " << e.what()); - if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); - failActiveRebuild(p.username, e.what()); - } - } + size_t total_vectors); + void completeActiveRebuild(const std::string& username); + void failActiveRebuild(const std::string& username, const std::string& error); + bool hasActiveRebuild(const std::string& username) const; + void joinAllThreads(); + void attachRebuildThread(const std::string& username, std::jthread&& thread); + void updateProgress(const std::string& username, size_t processed); + nlohmann::json getProgress(const std::string& username, const std::string& index_id) const; + + static std::string formatTime(std::chrono::system_clock::time_point tp); + static std::string getTempPath(const std::string& index_dir); + static std::string getTimestampedPath(const std::string& index_dir); + + void executeJob(const RebuildJobParams& p, std::stop_token st); }; From 2202923697fa5e1fab6315b4dc64ddabfd5748c8 Mon Sep 
17 00:00:00 2001 From: Hemant Sharma Date: Sat, 25 Apr 2026 15:10:56 +0530 Subject: [PATCH 09/29] Using Rebuild as friend class on Indexmanager helps to pass CacheEntry + IndexManager to executeJob instead of unpacking fields --- CMakeLists.txt | 1 + src/core/ndd.hpp | 39 +++++++++------------------------------ src/core/rebuild.cpp | 40 +++++++++++++++++++++++++++------------- src/core/rebuild.hpp | 42 +++++++++--------------------------------- 4 files changed, 46 insertions(+), 76 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca5f37e564..1f8e27dbe0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -289,6 +289,7 @@ target_include_directories(ndd_core PRIVATE ${ASIO_INCLUDE_DIR} ${OPENSSL_INCLUDE_DIR} ${CURL_INCLUDE_DIRS} + ${LIBARCHIVE_INCLUDE_DIR} ) target_include_directories(${NDD_BINARY_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index ccee4d756e..3fc95e6fa6 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -201,6 +201,7 @@ struct PersistenceConfig { #include "utils/types.hpp" class IndexManager { + friend class Rebuild; // executeJob accesses saveIndexInternal + metadata_manager_ private: std::deque indices_list_; std::unordered_map> indices_; @@ -2360,37 +2361,15 @@ inline OperationResult IndexManager::rebuildIndexAsync(const std::string& index_ std::string vector_storage_dir = base_path + "/vectors"; RebuildJobParams params{ - .index_id = index_id, - .username = username, - .new_M = new_M, - .new_ef_con = new_ef_con, - .space_type = entry->alg->getSpaceType(), - .dim = entry->alg->getDimension(), - .quant_level = entry->alg->getQuantLevel(), - .checksum = entry->alg->getChecksum(), - .max_elements = entry->alg->getMaxElements(), - .vector_storage = entry->vector_storage, - .temp_path = Rebuild::getTempPath(base_path), - .timestamped_path = Rebuild::getTimestampedPath(base_path), - .index_path = vector_storage_dir + "/" + settings::DEFAULT_SUBINDEX + ".idx", + .username = username, + 
.new_M = new_M, + .new_ef_con = new_ef_con, + .entry = entry, + .manager = this, + .temp_path = Rebuild::getTempPath(base_path), + .timestamped_path = Rebuild::getTimestampedPath(base_path), + .index_path = vector_storage_dir + "/" + settings::DEFAULT_SUBINDEX + ".idx", .num_parallel_inserts = settings::NUM_PARALLEL_INSERTS, - .operation_mutex = &entry->operation_mutex, - .save_current_index = [this, entry]() { saveIndexInternal(*entry); }, - .swap_alg = [entry](auto fresh) { entry->alg = std::move(fresh); }, - .update_metadata = [this, index_id, entry](size_t nm, size_t nef) { - auto m = metadata_manager_->getMetadata(index_id); - if (m) { - m->M = nm; - m->ef_con = nef; - m->total_elements = entry->alg->getElementsCount(); - metadata_manager_->storeMetadata(index_id, *m); - } - }, - .clear_dirty = [entry]() { entry->is_dirty = false; }, - .wire_fetchers = [](auto* alg, auto vs) { IndexManager::wireVectorFetchers(alg, vs); }, - .parallel_add = [](size_t n, size_t t, std::function fn) { - IndexManager::parallelAddPoints(n, t, std::move(fn)); - }, }; // Register state FIRST with empty thread — hasActiveRebuild() returns true immediately diff --git a/src/core/rebuild.cpp b/src/core/rebuild.cpp index 96cd9410ea..d5e04a2af2 100644 --- a/src/core/rebuild.cpp +++ b/src/core/rebuild.cpp @@ -7,6 +7,7 @@ #include "settings.hpp" #include "log.hpp" #include "utils/types.hpp" +#include "ndd.hpp" // CacheEntry, IndexManager (friend access) std::string Rebuild::statusToString(RebuildStatus s) { switch (s) { @@ -174,21 +175,25 @@ std::string Rebuild::getTimestampedPath(const std::string& index_dir) { } void Rebuild::executeJob(const RebuildJobParams& p, std::stop_token st) { + auto& entry = p.entry; // shared_ptr + auto* manager = p.manager; try { - std::unique_lock op_lock(*p.operation_mutex); + std::unique_lock op_lock(entry->operation_mutex); // Phase 1 — save current state before rebuilding - p.save_current_index(); + manager->saveIndexInternal(*entry); // Phase 2 — build new 
HNSW with updated M/ef_con + auto* old_alg = entry->alg.get(); auto new_alg = std::make_unique>( - p.max_elements, p.space_type, p.dim, p.new_M, p.new_ef_con, - settings::RANDOM_SEED, p.quant_level, p.checksum); + old_alg->getMaxElements(), old_alg->getSpaceType(), old_alg->getDimension(), + p.new_M, p.new_ef_con, + settings::RANDOM_SEED, old_alg->getQuantLevel(), old_alg->getChecksum()); // MUST wire fetchers before addPoint — searchBaseLayer needs this for base-layer-only nodes - p.wire_fetchers(new_alg.get(), p.vector_storage); + IndexManager::wireVectorFetchers(new_alg.get(), entry->vector_storage); - auto cursor = p.vector_storage->getCursor(); + auto cursor = entry->vector_storage->getCursor(); const size_t batch_size = settings::RECOVERY_BATCH_SIZE; size_t total_processed = 0; size_t batches_since_checkpoint = 0; @@ -211,7 +216,7 @@ void Rebuild::executeJob(const RebuildJobParams& p, std::stop_token st) { } if (batch.empty()) break; - p.parallel_add(batch.size(), p.num_parallel_inserts, + IndexManager::parallelAddPoints(batch.size(), p.num_parallel_inserts, [&](size_t i) { const auto& [label, vec_bytes] = batch[i]; new_alg->addPoint(vec_bytes.data(), label); @@ -235,7 +240,7 @@ void Rebuild::executeJob(const RebuildJobParams& p, std::stop_token st) { // indices_mutex_, while deleteIndex holds indices_mutex_ then acquires operation_mutex. // Calling reloadIndex here would deadlock with a concurrent delete on the same index. auto fresh_alg = std::make_unique>(p.index_path, 0); - p.wire_fetchers(fresh_alg.get(), p.vector_storage); + IndexManager::wireVectorFetchers(fresh_alg.get(), entry->vector_storage); // Both files are deleted here on success. If the server crashes before reaching this // point, the timestamped file (default.idx.) 
will be removed on next startup @@ -243,15 +248,24 @@ void Rebuild::executeJob(const RebuildJobParams& p, std::stop_token st) { if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); if (std::filesystem::exists(p.timestamped_path)) std::filesystem::remove(p.timestamped_path); - p.swap_alg(std::move(fresh_alg)); - p.update_metadata(p.new_M, p.new_ef_con); - p.clear_dirty(); + entry->alg = std::move(fresh_alg); - LOG_INFO(1801, p.index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); + // Update metadata (uses friend access to manager->metadata_manager_) + auto m = manager->metadata_manager_->getMetadata(entry->index_id); + if (m) { + m->M = p.new_M; + m->ef_con = p.new_ef_con; + m->total_elements = entry->alg->getElementsCount(); + manager->metadata_manager_->storeMetadata(entry->index_id, *m); + } + + entry->is_dirty = false; + + LOG_INFO(1801, entry->index_id, "Rebuild completed: " << total_processed << " vectors rebuilt"); completeActiveRebuild(p.username); } catch (const std::exception& e) { - LOG_ERROR(1802, p.index_id, "Rebuild failed: " << e.what()); + LOG_ERROR(1802, entry->index_id, "Rebuild failed: " << e.what()); if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); failActiveRebuild(p.username, e.what()); } diff --git a/src/core/rebuild.hpp b/src/core/rebuild.hpp index 9b474a1475..e87a8ce041 100644 --- a/src/core/rebuild.hpp +++ b/src/core/rebuild.hpp @@ -4,17 +4,16 @@ #include #include #include -#include #include #include #include #include -#include #include "json/nlohmann_json.hpp" -#include "hnsw/hnswlib.h" -#include "vector_storage.hpp" -#include "../quant/common.hpp" + +// Forward declarations — full definitions live in ndd.hpp, included by rebuild.cpp. 
+struct CacheEntry; +class IndexManager; enum class RebuildStatus : unsigned char { IN_PROGRESS = 0, @@ -33,43 +32,20 @@ struct ActiveRebuild { std::jthread thread; // jthread: built-in stop_token + auto-join on destruction }; -// Parameters passed to Rebuild::executeJob. IndexManager-specific operations are -// provided as callbacks so rebuild.hpp does not need to include ndd.hpp. +// Parameters passed to Rebuild::executeJob. `entry` and `manager` give executeJob +// direct access to graph config, vector storage, mutexes, save/metadata operations. struct RebuildJobParams { - // Identity - std::string index_id; std::string username; size_t new_M; size_t new_ef_con; - // Current graph config (read from entry->alg by IndexManager before thread spawn) - hnswlib::SpaceType space_type; - size_t dim; - ndd::quant::QuantizationLevel quant_level; - int32_t checksum; - size_t max_elements; - - // Storage for vector iteration - std::shared_ptr vector_storage; + std::shared_ptr entry; // shared_ptr keeps CacheEntry alive for the rebuild duration + IndexManager* manager; // saveIndexInternal, metadata_manager_ (via friend) - // File paths std::string temp_path; std::string timestamped_path; std::string index_path; - - // Threading - size_t num_parallel_inserts; - - // Mutex pointer — executeJob acquires this for the whole job duration - std::shared_mutex* operation_mutex; - - // Callbacks for IndexManager-specific actions (avoids circular ndd.hpp include) - std::function save_current_index; - std::function>)> swap_alg; - std::function update_metadata; - std::function clear_dirty; - std::function*, std::shared_ptr)> wire_fetchers; - std::function)> parallel_add; + size_t num_parallel_inserts; }; class Rebuild { From c2adbac09d716706d48f3fd22e0e42d3cfe20b65 Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Mon, 27 Apr 2026 09:43:50 +0530 Subject: [PATCH 10/29] fix(rebuild): using new_alg directly, stop_request before phase3, and doc review comments --- docs/rebuild.md | 12 
+++++++++++- src/core/ndd.hpp | 2 +- src/core/rebuild.cpp | 39 ++++++++++++++++----------------------- src/core/rebuild.hpp | 1 - 4 files changed, 28 insertions(+), 26 deletions(-) diff --git a/docs/rebuild.md b/docs/rebuild.md index be6b35640f..a78342c344 100644 --- a/docs/rebuild.md +++ b/docs/rebuild.md @@ -121,4 +121,14 @@ The following parameters **cannot** be changed via rebuild (returns 400): - **One rebuild at a time per user** — cannot start a rebuild on any index while another rebuild is in progress for the same user. Also cannot run concurrently with a backup. - **Periodic checkpoints** — the in-progress graph is saved to a temp file at regular intervals. - **On completion**, the new graph replaces `default.idx`. All temporary and intermediate files are cleaned up. -- **On server restart** during an incomplete rebuild, the old index loads normally. Temp files are cleaned up automatically. The rebuild must be restarted manually. +- **On server restart** during an incomplete rebuild, the old index loads normally. Orphaned temp files are removed automatically on startup. The rebuild must be restarted manually. To confirm a rebuild was incomplete, check that M/ef_con in the index info still show the original values. + +--- + +## Capacity and Timing + +**Disk space:** Plan for roughly **2× the index file size** free. A temporary copy of the completed graph is written before being renamed into place. + +**Memory:** Both the old and new graphs are in RAM simultaneously during rebuild. Peak usage is approximately **2× the index graph size** in addition to normal vector storage. + +**Duration:** Roughly 8-10 minutes per million vectors on commodity hardware at default settings. Higher M or ef_con increases build time. The final disk save adds additional time proportional to index size. 
diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 3fc95e6fa6..8d1907efdf 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -2376,7 +2376,7 @@ inline OperationResult IndexManager::rebuildIndexAsync(const std::string& index_ rebuild_.setActiveRebuild(username, index_id, current_count); // Spawn thread — lambda calls rebuild_.executeJob directly (execution lives in Rebuild) - std::jthread t([this, params = std::move(params)](std::stop_token st) mutable { + std::jthread t([this, params = std::move(params)](std::stop_token st) { rebuild_.executeJob(params, st); }); diff --git a/src/core/rebuild.cpp b/src/core/rebuild.cpp index d5e04a2af2..ebdf14d7ac 100644 --- a/src/core/rebuild.cpp +++ b/src/core/rebuild.cpp @@ -14,8 +14,8 @@ std::string Rebuild::statusToString(RebuildStatus s) { case RebuildStatus::IN_PROGRESS: return "in_progress"; case RebuildStatus::COMPLETED: return "completed"; case RebuildStatus::FAILED: return "failed"; - default: return "unknown"; } + __builtin_unreachable(); } std::string Rebuild::timeToISO8601(std::chrono::system_clock::time_point tp) { @@ -45,7 +45,7 @@ void Rebuild::cleanupTempFiles(const std::string& data_dir) { std::filesystem::remove(entry.path()); } } - } catch (const std::exception& e) { + } catch (const std::filesystem::filesystem_error& e) { LOG_WARN(1803, "rebuild", "Failed to cleanup temp files on startup: " << e.what()); } } @@ -144,10 +144,10 @@ nlohmann::json Rebuild::getProgress(const std::string& username, const std::stri {"vectors_processed", processed}, {"total_vectors", total}, {"percent_complete", percent}, - {"started_at", formatTime(state.started_at)} + {"started_at", timeToISO8601(state.started_at)} }; if (state.status == RebuildStatus::COMPLETED || state.status == RebuildStatus::FAILED) { - result["completed_at"] = formatTime(state.completed_at); + result["completed_at"] = timeToISO8601(state.completed_at); } if (state.status == RebuildStatus::FAILED && !state.error_message.empty()) { result["error"] = 
state.error_message; @@ -157,10 +157,6 @@ nlohmann::json Rebuild::getProgress(const std::string& username, const std::stri return {{"status", "idle"}}; } -std::string Rebuild::formatTime(std::chrono::system_clock::time_point tp) { - return timeToISO8601(tp); -} - std::string Rebuild::getTempPath(const std::string& index_dir) { return index_dir + "/vectors/" + settings::DEFAULT_SUBINDEX + ".idx.temp"; } @@ -231,24 +227,21 @@ void Rebuild::executeJob(const RebuildJobParams& p, std::stop_token st) { } } - // Phase 3 — save final, copy to canonical path, load fresh from disk + if (st.stop_requested()) { + if (std::filesystem::exists(p.temp_path)) + std::filesystem::remove(p.temp_path); + failActiveRebuild(p.username, "Rebuild interrupted by server shutdown"); + return; + } + + // Phase 3 — persist to timestamped path, atomically rename to canonical path new_alg->saveIndex(p.timestamped_path); - std::filesystem::copy_file(p.timestamped_path, p.index_path, - std::filesystem::copy_options::overwrite_existing); - - // Cannot call reloadIndex() here — we hold operation_mutex and reloadIndex acquires - // indices_mutex_, while deleteIndex holds indices_mutex_ then acquires operation_mutex. - // Calling reloadIndex here would deadlock with a concurrent delete on the same index. - auto fresh_alg = std::make_unique>(p.index_path, 0); - IndexManager::wireVectorFetchers(fresh_alg.get(), entry->vector_storage); - - // Both files are deleted here on success. If the server crashes before reaching this - // point, the timestamped file (default.idx.) will be removed on next startup - // by cleanupTempFiles — it does not affect index correctness. 
+ std::filesystem::rename(p.timestamped_path, p.index_path); + if (std::filesystem::exists(p.temp_path)) std::filesystem::remove(p.temp_path); - if (std::filesystem::exists(p.timestamped_path)) std::filesystem::remove(p.timestamped_path); - entry->alg = std::move(fresh_alg); + // new_alg is fully built and fetchers are already wired (line 194) — use directly + entry->alg = std::move(new_alg); // Update metadata (uses friend access to manager->metadata_manager_) auto m = manager->metadata_manager_->getMetadata(entry->index_id); diff --git a/src/core/rebuild.hpp b/src/core/rebuild.hpp index e87a8ce041..4bd14a44ae 100644 --- a/src/core/rebuild.hpp +++ b/src/core/rebuild.hpp @@ -71,7 +71,6 @@ class Rebuild { void updateProgress(const std::string& username, size_t processed); nlohmann::json getProgress(const std::string& username, const std::string& index_id) const; - static std::string formatTime(std::chrono::system_clock::time_point tp); static std::string getTempPath(const std::string& index_dir); static std::string getTimestampedPath(const std::string& index_dir); From af444b62a572c80188f2ecca4eb058a07ae642d5 Mon Sep 17 00:00:00 2001 From: Hemant Sharma Date: Mon, 27 Apr 2026 14:07:17 +0530 Subject: [PATCH 11/29] test cases --- src/core/rebuild.cpp | 5 +- src/utils/settings.hpp | 1 + tests/CMakeLists.txt | 34 +++++ tests/README.md | 84 +++++++++-- tests/filter_test.cpp | 2 +- tests/rebuild_test.cpp | 328 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 442 insertions(+), 12 deletions(-) create mode 100644 tests/rebuild_test.cpp diff --git a/src/core/rebuild.cpp b/src/core/rebuild.cpp index ebdf14d7ac..9526d7aaf1 100644 --- a/src/core/rebuild.cpp +++ b/src/core/rebuild.cpp @@ -46,7 +46,10 @@ void Rebuild::cleanupTempFiles(const std::string& data_dir) { } } } catch (const std::filesystem::filesystem_error& e) { - LOG_WARN(1803, "rebuild", "Failed to cleanup temp files on startup: " << e.what()); + if (e.code() != std::errc::no_such_file_or_directory) + 
LOG_WARN(1803, "rebuild", "Error during temp cleanup: " << e.what()); + } catch (const std::exception& e) { + LOG_WARN(1803, "rebuild", "Error during temp cleanup: " << e.what()); } } diff --git a/src/utils/settings.hpp b/src/utils/settings.hpp index 9949e9109e..07210e7bc9 100644 --- a/src/utils/settings.hpp +++ b/src/utils/settings.hpp @@ -5,6 +5,7 @@ #include #include #include +#include constexpr uint64_t KB = (1024ULL); constexpr uint64_t MB = (1024ULL * KB); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0793a2e2f3..ccd5019366 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,6 +3,7 @@ include(FetchContent) FetchContent_Declare( googletest URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip + DOWNLOAD_EXTRACT_TIMESTAMP TRUE ) # For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) @@ -38,3 +39,36 @@ target_compile_definitions(ndd_filter_test PRIVATE MDB_MAXKEYSIZE=512) include(GoogleTest) gtest_discover_tests(ndd_filter_test) + +# --- ndd_rebuild_test --- +add_executable(ndd_rebuild_test rebuild_test.cpp ${LMDB_SOURCES} ${ROARING_SOURCE}) + +set_source_files_properties(${LMDB_SOURCES} PROPERTIES + COMPILE_FLAGS "-DMDBX_BUILD_SHARED_LIBRARY=0 -DMDBX_BUILD_FLAGS=\\\"NDD_EMBEDDED\\\"" +) + +target_include_directories(ndd_rebuild_test PRIVATE + ${CMAKE_SOURCE_DIR}/src + ${CMAKE_SOURCE_DIR}/src/core + ${CMAKE_SOURCE_DIR}/src/utils + ${CMAKE_SOURCE_DIR}/src/server + ${CMAKE_SOURCE_DIR}/src/storage + ${CMAKE_SOURCE_DIR}/src/filter + ${CMAKE_SOURCE_DIR}/src/sparse + ${CMAKE_SOURCE_DIR}/src/hnsw + ${CMAKE_SOURCE_DIR}/src/quant + ${CMAKE_SOURCE_DIR}/third_party + ${CMAKE_SOURCE_DIR}/third_party/json + ${CMAKE_SOURCE_DIR}/third_party/msgpack/include + ${LIBARCHIVE_INCLUDE_DIR} +) + +target_link_libraries(ndd_rebuild_test + PRIVATE + ndd_core + GTest::gtest_main +) + +target_compile_definitions(ndd_rebuild_test PRIVATE MDB_MAXKEYSIZE=512) + 
+gtest_discover_tests(ndd_rebuild_test) diff --git a/tests/README.md b/tests/README.md index a62ef40998..a5a04a2c0a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,19 +1,83 @@ # Tests -This folder contains unit tests for Endee. +Unit tests for Endee. Currently two test suites: filter and rebuild. -## Build & Run +## Build & Run All Tests -From the repository root: + cmake -S . -B build -DENABLE_TESTING=ON -DUSE_NEON=ON # Apple Silicon + cmake -S . -B build -DENABLE_TESTING=ON -DUSE_AVX2=ON # Intel/AMD + cmake --build build + ctest --test-dir build --output-on-failure -1. Configure with tests enabled: - - `cmake -S . -B build -DENABLE_TESTING=ON` -2. Build the test target: - - `cmake --build build --target ndd_filter_test` -3. Run: - - `./build/tests/ndd_filter_test` +## ndd_filter_test + +Tests for the filter subsystem (categorical, numeric, boolean filtering). + +Build and run individually: + + cmake --build build --target ndd_filter_test + ./build/tests/ndd_filter_test + +Test cases: +- BucketTest: bucket serialization and deserialization +- FilterTest/CategoryFilterBasics: string category filter add and query +- FilterTest/BooleanFilterBasics: boolean filter via JSON input +- FilterTest/NumericFilterBasics: integer range queries +- FilterTest/FloatNumericFilter: float range queries +- FilterTest/MixedAndLogic: AND logic across multiple fields +- FilterTest/InOperator: $in operator with multiple values +- FilterTest/DeleteFilter: removal of categorical filters +- FilterTest/NumericDelete: removal of numeric filters + +## ndd_rebuild_test + +Unit and integration tests for the rebuild subsystem. 
+ +Build and run individually: + + cmake --build build --target ndd_rebuild_test + ./build/tests/ndd_rebuild_test + +Test cases: + +State management (Rebuild class in isolation): +- RebuildStateTest/NoRebuild_HasActiveIsFalse +- RebuildStateTest/NoRebuild_GetProgressIsIdle +- RebuildStateTest/SetActive_HasActiveIsTrue +- RebuildStateTest/SetActive_GetProgressShowsInProgress +- RebuildStateTest/UpdateProgress_ReflectedInGetProgress +- RebuildStateTest/PercentComplete_CalculatedCorrectly +- RebuildStateTest/PercentComplete_ZeroTotal_IsZero +- RebuildStateTest/Complete_StatusIsCompleted +- RebuildStateTest/Complete_HasActiveIsFalse +- RebuildStateTest/Complete_CompletedAtPresent +- RebuildStateTest/Fail_StatusIsFailed +- RebuildStateTest/Fail_HasActiveIsFalse +- RebuildStateTest/Fail_ErrorMessagePresent +- RebuildStateTest/Fail_CompletedAtPresent +- RebuildStateTest/TwoUsers_IndependentState +- RebuildStateTest/GetProgress_WrongIndex_ReturnsIdle +- RebuildStateTest/SetActive_OverwritesPreviousCompleted + +Temp file cleanup and path helpers: +- RebuildCleanupTest/CleanupTempFiles_NonExistentDir_NoOp +- RebuildCleanupTest/CleanupTempFiles_RemovesTempFile +- RebuildCleanupTest/CleanupTempFiles_RemovesTimestampedFile +- RebuildCleanupTest/CleanupTempFiles_LeavesCanonicalIndex +- RebuildCleanupTest/CleanupTempFiles_EmptyDir_NoOp +- RebuildPathTest/GetTempPath_Format +- RebuildPathTest/GetTimestampedPath_HasTimestamp + +End-to-end rebuild via IndexManager: +- RebuildIntegrationTest/RebuildAsync_ReturnSuccessCode +- RebuildIntegrationTest/RebuildCompletes_ConfigUpdated +- RebuildIntegrationTest/RebuildCompletes_VectorCountPreserved +- RebuildIntegrationTest/RebuildWhileInProgress_Returns409Code +- RebuildIntegrationTest/RebuildNonExistentIndex_Returns404Code +- RebuildIntegrationTest/RebuildNoChange_Returns400Code ## Notes -- Tests can also be built in a dedicated tests build directory (e.g., `tests/build/`). +- Tests use real file I/O and real MDBX databases — no mocking. 
+- Each test creates its own temp directory and removes it on teardown. - The `tests/build/` directory is ignored by git. diff --git a/tests/filter_test.cpp b/tests/filter_test.cpp index 101be3403e..f75d51ed15 100644 --- a/tests/filter_test.cpp +++ b/tests/filter_test.cpp @@ -37,7 +37,7 @@ class FilterTest : public ::testing::Test { } // Initialize Filter - filter = std::make_unique(db_path); + filter = std::make_unique(db_path, "testuser/testidx"); } void TearDown() override { diff --git a/tests/rebuild_test.cpp b/tests/rebuild_test.cpp new file mode 100644 index 0000000000..e3a8a38f6a --- /dev/null +++ b/tests/rebuild_test.cpp @@ -0,0 +1,328 @@ +#include +#include +#include +#include +#include + +#include "rebuild.hpp" +#include "ndd.hpp" +#include "utils/msgpack_ndd.hpp" +#include "server/auth.hpp" + +namespace fs = std::filesystem; + +// ============================================================ +// Layer 1 — Rebuild state management (no IndexManager needed) +// ============================================================ + +class RebuildStateTest : public ::testing::Test { +protected: + Rebuild rebuild; +}; + +TEST_F(RebuildStateTest, NoRebuild_HasActiveIsFalse) { + EXPECT_FALSE(rebuild.hasActiveRebuild("alice")); +} + +TEST_F(RebuildStateTest, NoRebuild_GetProgressIsIdle) { + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_EQ(p["status"], "idle"); +} + +TEST_F(RebuildStateTest, SetActive_HasActiveIsTrue) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + EXPECT_TRUE(rebuild.hasActiveRebuild("alice")); +} + +TEST_F(RebuildStateTest, SetActive_GetProgressShowsInProgress) { + rebuild.setActiveRebuild("alice", "alice/idx", 200); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_EQ(p["status"], "in_progress"); + EXPECT_EQ(p["total_vectors"], 200); + EXPECT_EQ(p["vectors_processed"], 0); +} + +TEST_F(RebuildStateTest, UpdateProgress_ReflectedInGetProgress) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + 
rebuild.updateProgress("alice", 50); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_EQ(p["vectors_processed"], 50); +} + +TEST_F(RebuildStateTest, PercentComplete_CalculatedCorrectly) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.updateProgress("alice", 50); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_DOUBLE_EQ(p["percent_complete"].get(), 50.0); +} + +TEST_F(RebuildStateTest, PercentComplete_ZeroTotal_IsZero) { + rebuild.setActiveRebuild("alice", "alice/idx", 0); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_DOUBLE_EQ(p["percent_complete"].get(), 0.0); +} + +TEST_F(RebuildStateTest, Complete_StatusIsCompleted) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.completeActiveRebuild("alice"); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_EQ(p["status"], "completed"); +} + +TEST_F(RebuildStateTest, Complete_HasActiveIsFalse) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.completeActiveRebuild("alice"); + EXPECT_FALSE(rebuild.hasActiveRebuild("alice")); +} + +TEST_F(RebuildStateTest, Complete_CompletedAtPresent) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.completeActiveRebuild("alice"); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_TRUE(p.contains("completed_at")); +} + +TEST_F(RebuildStateTest, Fail_StatusIsFailed) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.failActiveRebuild("alice", "disk full"); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_EQ(p["status"], "failed"); +} + +TEST_F(RebuildStateTest, Fail_HasActiveIsFalse) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.failActiveRebuild("alice", "disk full"); + EXPECT_FALSE(rebuild.hasActiveRebuild("alice")); +} + +TEST_F(RebuildStateTest, Fail_ErrorMessagePresent) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.failActiveRebuild("alice", "disk full"); + auto p = 
rebuild.getProgress("alice", "alice/idx"); + EXPECT_EQ(p["error"], "disk full"); +} + +TEST_F(RebuildStateTest, Fail_CompletedAtPresent) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.failActiveRebuild("alice", "oom"); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_TRUE(p.contains("completed_at")); +} + +TEST_F(RebuildStateTest, TwoUsers_IndependentState) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + EXPECT_TRUE(rebuild.hasActiveRebuild("alice")); + EXPECT_FALSE(rebuild.hasActiveRebuild("bob")); + rebuild.setActiveRebuild("bob", "bob/idx", 50); + EXPECT_TRUE(rebuild.hasActiveRebuild("bob")); + rebuild.completeActiveRebuild("alice"); + EXPECT_FALSE(rebuild.hasActiveRebuild("alice")); + EXPECT_TRUE(rebuild.hasActiveRebuild("bob")); +} + +TEST_F(RebuildStateTest, GetProgress_WrongIndex_ReturnsIdle) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + auto p = rebuild.getProgress("alice", "alice/other"); + EXPECT_EQ(p["status"], "idle"); +} + +TEST_F(RebuildStateTest, SetActive_OverwritesPreviousCompleted) { + rebuild.setActiveRebuild("alice", "alice/idx", 100); + rebuild.completeActiveRebuild("alice"); + rebuild.setActiveRebuild("alice", "alice/idx", 200); + auto p = rebuild.getProgress("alice", "alice/idx"); + EXPECT_EQ(p["status"], "in_progress"); + EXPECT_EQ(p["total_vectors"], 200); +} + +// ============================================================ +// Layer 2 — Temp file cleanup and path helpers +// ============================================================ + +class RebuildCleanupTest : public ::testing::Test { +protected: + std::string dir_; + Rebuild rebuild_; + + void SetUp() override { + dir_ = "./test_rebuild_cleanup_" + std::to_string(rand()); + fs::create_directories(dir_ + "/user/idx/vectors"); + } + + void TearDown() override { + if (fs::exists(dir_)) fs::remove_all(dir_); + } + + void touch(const std::string& rel_path) { + std::ofstream f(dir_ + "/" + rel_path); + f << "x"; + } + + bool 
// Should match /data/user/idx/vectors/default.idx.<timestamp> (all-digit suffix)
+ std::string prefix = "/data/user/idx/vectors/default.idx."; + ASSERT_GT(path.size(), prefix.size()); + EXPECT_EQ(path.substr(0, prefix.size()), prefix); + std::string suffix = path.substr(prefix.size()); + EXPECT_FALSE(suffix.empty()); + EXPECT_TRUE(std::all_of(suffix.begin(), suffix.end(), ::isdigit)); +} + +// ============================================================ +// Layer 3 — Integration tests via IndexManager +// ============================================================ + +class RebuildIntegrationTest : public ::testing::Test { +protected: + static constexpr const char* USERNAME = "testuser"; + static constexpr const char* IDX_NAME = "testidx"; + static constexpr const char* INDEX_ID = "testuser/testidx"; + static constexpr size_t DIM = 32; + static constexpr size_t N_VECTORS = 100; + + std::string data_dir_; + std::unique_ptr manager_; + + void SetUp() override { + data_dir_ = "./test_rebuild_integration_" + std::to_string(rand()); + fs::create_directories(data_dir_); + PersistenceConfig pcfg; + pcfg.save_on_shutdown = false; + manager_ = std::make_unique(data_dir_, pcfg); + } + + void TearDown() override { + manager_.reset(); + if (fs::exists(data_dir_)) fs::remove_all(data_dir_); + } + + void createTestIndex(size_t M = 8, size_t ef_con = 64) { + IndexConfig config{ + .dim = DIM, + .max_elements = 1000, + .space_type_str = "cosine", + .M = M, + .ef_construction = ef_con, + .quant_level = ndd::quant::QuantizationLevel::FP32, + .checksum = 0 + }; + manager_->createIndex(INDEX_ID, config, UserType::Admin, 0); + } + + void insertVectors(size_t n = N_VECTORS) { + std::vector vecs; + vecs.reserve(n); + for (size_t i = 0; i < n; ++i) { + ndd::HybridVectorObject v; + v.id = "vec_" + std::to_string(i); + v.vector.resize(DIM); + for (size_t d = 0; d < DIM; ++d) + v.vector[d] = static_cast(rand()) / RAND_MAX; + vecs.push_back(std::move(v)); + } + manager_->addVectors(INDEX_ID, vecs); + } + + // Returns true if rebuild completed successfully within 
timeout_sec. + bool waitForRebuild(int timeout_sec = 10) { + auto deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(timeout_sec); + while (std::chrono::steady_clock::now() < deadline) { + auto progress = manager_->getRebuildProgress(USERNAME, INDEX_ID); + std::string status = progress.value("status", ""); + if (status == "completed") return true; + if (status == "failed") return false; + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + return false; + } +}; + +TEST_F(RebuildIntegrationTest, RebuildAsync_ReturnSuccessCode) { + createTestIndex(); + insertVectors(); + auto result = manager_->rebuildIndexAsync(INDEX_ID, 16, 128); + EXPECT_EQ(result.code, 0); + waitForRebuild(); +} + +TEST_F(RebuildIntegrationTest, RebuildCompletes_ConfigUpdated) { + createTestIndex(8, 64); + insertVectors(); + manager_->rebuildIndexAsync(INDEX_ID, 16, 128); + ASSERT_TRUE(waitForRebuild()); + auto meta = manager_->getMetadata(INDEX_ID); + ASSERT_TRUE(meta.has_value()); + EXPECT_EQ(meta->M, 16u); + EXPECT_EQ(meta->ef_con, 128u); +} + +TEST_F(RebuildIntegrationTest, RebuildCompletes_VectorCountPreserved) { + createTestIndex(); + insertVectors(N_VECTORS); + size_t before = manager_->getElementCount(INDEX_ID); + manager_->rebuildIndexAsync(INDEX_ID, 16, 128); + ASSERT_TRUE(waitForRebuild()); + size_t after = manager_->getElementCount(INDEX_ID); + EXPECT_EQ(before, after); +} + +TEST_F(RebuildIntegrationTest, RebuildWhileInProgress_Returns409Code) { + createTestIndex(); + insertVectors(); + // setActiveRebuild is synchronous — second call sees IN_PROGRESS before thread starts + auto r1 = manager_->rebuildIndexAsync(INDEX_ID, 16, 128); + ASSERT_EQ(r1.code, 0); + auto r2 = manager_->rebuildIndexAsync(INDEX_ID, 32, 256); + EXPECT_EQ(r2.code, 2); + waitForRebuild(); +} + +TEST_F(RebuildIntegrationTest, RebuildNonExistentIndex_Returns404Code) { + auto result = manager_->rebuildIndexAsync("testuser/doesnotexist", 16, 128); + EXPECT_EQ(result.code, 1); +} + 
+TEST_F(RebuildIntegrationTest, RebuildNoChange_Returns400Code) { + createTestIndex(8, 64); + insertVectors(); + auto result = manager_->rebuildIndexAsync(INDEX_ID, 8, 64); + EXPECT_EQ(result.code, 3); +} From 37c72b588833a27dd979edc2cf3a065a22f9e975 Mon Sep 17 00:00:00 2001 From: omnish-endee Date: Thu, 23 Apr 2026 11:01:56 +0530 Subject: [PATCH 12/29] Omnish/sync release note (#223) --- .github/workflows/sync_release_notes.yml | 197 +++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 .github/workflows/sync_release_notes.yml diff --git a/.github/workflows/sync_release_notes.yml b/.github/workflows/sync_release_notes.yml new file mode 100644 index 0000000000..5d66b84327 --- /dev/null +++ b/.github/workflows/sync_release_notes.yml @@ -0,0 +1,197 @@ +name: Release Notes Syncing + +on: + workflow_dispatch: + inputs: + tag_name: + description: 'Release tag to build (e.g. v1.0.0)' + required: true + type: string + + +jobs: + + create-and-build: + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + arch: + - name: avx2 + instance_type: c6i.large + binary_file_name: ndd-avx2 + - name: avx512 + instance_type: c6i.large + binary_file_name: ndd-avx2 + - name: neon + instance_type: c6g.large + binary_file_name: ndd-neon + - name: sve2 + instance_type: c7g.large + binary_file_name: ndd-neon + + steps: + + - name: Checkout PR commit + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.tag_name }} + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.AWS_REGION }} + + - name: Launch Endee Server + id: launch + shell: bash + run: | + ARCH_NAME="${{ matrix.arch.name }}" + INSTANCE_TYPE="${{ matrix.arch.instance_type }}" + + if [[ "$ARCH_NAME" == "avx2" ]] || [[ "$ARCH_NAME" == "avx512" ]]; then + AMI_ID="${{ vars.AMI_ID }}" + else + AMI_ID="${{ 
vars.ARM_AMI_ID }}" + fi + + ENDEE_INSTANCE_ID=$(aws ec2 run-instances \ + --region ${{ vars.AWS_REGION }} \ + --image-id "$AMI_ID" \ + --instance-type "$INSTANCE_TYPE" \ + --key-name ${{ secrets.ENDEE_PEM }} \ + --security-group-ids ${{ secrets.VECTORDBBENCH_SERVER_GROUP_ID }} \ + --subnet-id ${{ secrets.AWS_SUBNET_ID }} \ + --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":30,"VolumeType":"gp3"}}]' \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$ARCH_NAME}]" \ + --query 'Instances[0].InstanceId' \ + --output text) + + echo "InstanceID: $ENDEE_INSTANCE_ID" + echo "instance_id=$ENDEE_INSTANCE_ID" >> $GITHUB_OUTPUT + + aws ec2 wait instance-running \ + --instance-ids $ENDEE_INSTANCE_ID + + IP=$(aws ec2 describe-instances \ + --instance-ids $ENDEE_INSTANCE_ID \ + --query 'Reservations[0].Instances[0].PublicIpAddress' \ + --output text) + + echo "IP: $IP" + echo "ip=$IP" >> $GITHUB_OUTPUT + + - name: Write PEM file + run: | + mkdir -p "$HOME/.ssh" + echo "${{ secrets.ENDEE_SSH_PRIVATE_KEY }}" > "$HOME/.ssh/${{ secrets.ENDEE_PEM }}" + chmod 400 "$HOME/.ssh/${{ secrets.ENDEE_PEM }}" + echo "PEM file created" + + - name: Wait for SSH to be ready + shell: bash + run: | + ENDEE_SSH_READY=false + ENDEE_IP="${{ steps.launch.outputs.ip }}" + ENDEE_PEM_FILE="$HOME/.ssh/${{ secrets.ENDEE_PEM }}" + + for i in {1..20}; do + if ssh -i "$ENDEE_PEM_FILE" -o StrictHostKeyChecking=no -o ConnectTimeout=5 -o BatchMode=yes ubuntu@"$ENDEE_IP" "echo ok" 2>/dev/null; then + echo "SSH ready on ${{ matrix.arch.name }} @ $ENDEE_IP" + ENDEE_SSH_READY=true + break + fi + echo "Attempt $i/20 failed, retrying in 10 seconds..." 
+ sleep 10 + done + + if [ "$ENDEE_SSH_READY" = false ]; then + echo "Failed to SSH to Endee Server" + exit 1 + fi + + - name: Build Endee Binary + run: | + ssh -o StrictHostKeyChecking=no -i "$HOME/.ssh/${{ secrets.ENDEE_PEM }}" ubuntu@${{ steps.launch.outputs.ip }} << 'EOF' + set -euo pipefail + sudo apt-get update -y + sudo apt-get install -y git build-essential + cd ~ + git clone https://github.com/endee-io/endee.git + cd endee + ulimit -n 5000 + chmod +x ./install.sh + ARCH="${{ matrix.arch.name }}" + if [[ "$ARCH" == "avx2" ]] || [[ "$ARCH" == "avx512" ]]; then + ./install.sh --release --avx2 + else + ./install.sh --release --neon + fi + EOF + + - name: Download binary + run: | + # verify path exists first + ssh -o StrictHostKeyChecking=no -i "$HOME/.ssh/${{ secrets.ENDEE_PEM }}" \ + ubuntu@${{ steps.launch.outputs.ip }} \ + "find /home/ubuntu -name '${{ matrix.arch.binary_file_name }}' 2>/dev/null" + + scp -o StrictHostKeyChecking=no -i "$HOME/.ssh/${{ secrets.ENDEE_PEM }}" \ + ubuntu@${{ steps.launch.outputs.ip }}:"/home/ubuntu/endee/build/${{ matrix.arch.binary_file_name }}" \ + ./ndd-${{ matrix.arch.name }} + + - name: Upload binary as artifact + uses: actions/upload-artifact@v4 + with: + name: ndd-${{ matrix.arch.name }} + path: ./ndd-${{ matrix.arch.name }} + + + - name: Terminate instance + if: always() + run: | + aws ec2 terminate-instances \ + --instance-ids ${{ steps.launch.outputs.instance_id }} + + # ← separate job, runs AFTER all 4 builds finish + push-binaries: + runs-on: ubuntu-latest + needs: create-and-build # waits for all 4 matrix jobs to complete + + steps: + - name: Download all binaries + uses: actions/download-artifact@v4 + with: + path: ./binaries # downloads all 4 artifacts here + + - name: Push all binaries to ndd-repo + run: | + git clone https://x-access-token:${{ secrets.PAT }}@github.com/Endee-Pro/ndd-docker.git + cd ndd-docker + + git checkout main + mkdir -p build + + # copy all 4 binaries at once + cp 
../binaries/ndd-avx2/ndd-avx2 ./build/ndd-avx2 + cp ../binaries/ndd-avx512/ndd-avx512 ./build/ndd-avx512 + cp ../binaries/ndd-neon/ndd-neon ./build/ndd-neon + cp ../binaries/ndd-sve2/ndd-sve2 ./build/ndd-sve2 + + # UPDATE TAG IN DOCKERFILE + sed -i 's/LABEL version=".*"/LABEL version="${{ github.event.inputs.tag_name }}"/' ./Dockerfile + + git config user.email "actions@github.com" + git config user.name "GitHub Actions" + + git add . + + if git diff --staged --quiet; then + echo "No changes to commit" + else + git commit -m "Add binaries from release ${{ github.event.inputs.tag_name }}" + git push -u origin omnish/release-note-sync + fi \ No newline at end of file From 012bd44ffd63b7ba155ef3b8f8b5b1b763a408f5 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Thu, 23 Apr 2026 11:23:44 +0530 Subject: [PATCH 13/29] add restore backup asyn with backupOperation --- src/core/ndd.hpp | 73 ++++++++++++++++++++++++++---------- src/main.cpp | 5 ++- src/storage/backup_store.hpp | 32 ++++++++++++++-- 3 files changed, 84 insertions(+), 26 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 8d1907efdf..900e68cdb5 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -1871,7 +1871,7 @@ class IndexManager { std::pair createBackupAsync(const std::string& index_id, const std::string& backup_name); - std::pair restoreBackup(const std::string& backup_name, + std::pair restoreBackupAsync(const std::string& backup_name, const std::string& target_index_name, const std::string& username); @@ -1885,7 +1885,7 @@ class IndexManager { return backup_store_.deleteBackup(backup_name, username); } - std::optional> getActiveBackup(const std::string& username) { + std::optional getActiveBackup(const std::string& username) { return backup_store_.getActiveBackup(username); } @@ -2135,13 +2135,14 @@ inline void IndexManager::executeBackupJob(const std::string& index_id, const st } } -inline std::pair IndexManager::restoreBackup(const std::string& backup_name, +inline void 
IndexManager::restoreBackup(const std::string& backup_name, const std::string& target_index_name, - const std::string& username) { - std::pair result = backup_store_.validateBackupName(backup_name); - if(!result.first) { - return result; - } + const std::string& username, + std::stop_token st) { + // std::pair result = backup_store_.validateBackupName(backup_name); + // if(!result.first) { + // return result; + // } std::string backup_dir_root = backup_store_.getUserBackupDir(username); std::string backup_tar = backup_dir_root + "/" + backup_name + ".tar"; @@ -2151,17 +2152,17 @@ inline std::pair IndexManager::restoreBackup(const std::strin std::string target_index_id = username + "/" + target_index_name; std::string target_dir = data_dir_ + "/" + target_index_id; - if(!std::filesystem::exists(backup_tar)) { - return {false, "Backup not found: " + backup_name}; - } + // if(!std::filesystem::exists(backup_tar)) { + // return {false, "Backup not found: " + backup_name}; + // } - if(metadata_manager_->getMetadata(target_index_id).has_value()) { - return {false, "Target index already exists"}; - } + // if(metadata_manager_->getMetadata(target_index_id).has_value()) { + // return {false, "Target index already exists"}; + // } std::string error_msg; if(!backup_store_.extractBackupTar(backup_tar, backup_extract_dir, error_msg)) { - return {false, "Failed to extract backup archive: " + error_msg}; + throw std::runtime_error("Failed to extract backup archive: " + error_msg); } std::vector folders; @@ -2173,7 +2174,7 @@ inline std::pair IndexManager::restoreBackup(const std::strin if(folders.size() != 1) { std::filesystem::remove_all(backup_extract_dir); - return {false, "Backup extraction failed - directory not found"}; + throw std::runtime_error("Backup extraction failed - directory not found"; } std::string backup_dir = folders[0]; @@ -2182,7 +2183,7 @@ inline std::pair IndexManager::restoreBackup(const std::strin std::ifstream f(backup_dir + "/metadata.json"); 
if(!f.good()) { std::filesystem::remove_all(backup_extract_dir); - return {false, "Backup metadata missing"}; + throw std::runtime_error("Backup metadata missing"); } nlohmann::json meta_json = nlohmann::json::parse(f); @@ -2218,11 +2219,13 @@ inline std::pair IndexManager::restoreBackup(const std::strin loadIndex(target_index_id); } + backup_store_.clearActiveBackup(username); + LOG_INFO(2045, username, target_index_name, "Restored backup from " << backup_tar); - return {true, ""}; } catch(const std::exception& e) { std::filesystem::remove_all(backup_extract_dir); - return {false, "Failed to restore backup: " + std::string(e.what())}; + backup_store_.clearActiveBackup(username); + LOG_ERROR(2058, backup_name, "Restoration of backup failed for " << backup_name << ", index name ", << target_index_name <<": " << e.what()); } } @@ -2255,13 +2258,43 @@ inline std::pair IndexManager::createBackupAsync(const std::s std::jthread t([this, index_id, backup_name](std::stop_token st) { executeBackupJob(index_id, backup_name, st); }); - backup_store_.setActiveBackup(username, index_id, backup_name, std::move(t)); + backup_store_.setActiveBackup(username, index_id, backup_name, BackupOperation::Creation, std::move(t)); LOG_INFO(2046, index_id, "Backup started: " << backup_name); return {true, backup_name}; } +inline std::pair IndexManager::restoreBackupAsync(const std::string& backup_name, + const std::string& target_index_name, + const std::string& username) { + + // Check if any backup is already under creation or restoration + if(backup_store_.hasActiveBackup(username)) { + return {false, "Backup already in progress for user: " + username}; + } + + // Check if the backup exists + nlohmann::json backup_db = backup_store_.readBackupJson(username); + if(!backup_db.contains(backup_name)) { + return {false, "Backup not found: " + backup_name} + } + + // Check if an index with target name already exists + std::string target_index_id = username + "/" + target_index_name; + 
if(metadata_manager_->getMetaData(target_index_id).has_value()) { + return {false, "Target index already exists"}; + } + + std::jthread t([this, backup_name, target_index_name](std::stop_token st) { + restoreBackup(backup_name, target_index_name, username); + }) + + const std::string index_id = username + "/" + target_index_name; + backup_store_.setActiveBackup(username,index_id, backup_name, BackupOperation::Restoration, std::move(t)); + + LOG_INFO(2059, username, "Restoration started for backup: " << backup_name <<", target_index: " << target_index_name); + } inline std::pair IndexManager::uploadBackup(const std::string& backup_name, const std::string& username, const std::string& file_content) { std::string user_backup_dir = backup_store_.getUserBackupDir(username); diff --git a/src/main.cpp b/src/main.cpp index 05cbf21ac8..8cca1b4608 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -659,8 +659,9 @@ int main(int argc, char** argv) { crow::json::wvalue response; if (active) { response["active"] = true; - response["backup_name"] = active->second; - response["index_id"] = active->first; + response["backup_name"] = active->backup_name; + response["index_id"] = active->index_id; + response["operation"] = active->operation; } else { response["active"] = false; } diff --git a/src/storage/backup_store.hpp b/src/storage/backup_store.hpp index 45fea9ec4c..ca33bede2d 100644 --- a/src/storage/backup_store.hpp +++ b/src/storage/backup_store.hpp @@ -19,9 +19,29 @@ #include "settings.hpp" #include "log.hpp" +enum class BackupOperation { + Creation, + Restoration +}; + +inline std::string backupOperationToString(BackupOperation op) { + switch (op) { + case BackupOperation::Creation: return "creation"; + case BackupOperation::Restoration: return "restoration"; + } + return ""; +} + +struct ActiveBackupStatus { + std::string index_id; + std::string backup_name; + std::string operation; +}; + struct ActiveBackup { std::string index_id; std::string backup_name; + BackupOperation 
operation; std::jthread thread; // jthread: built-in stop_token + auto-join on destruction }; @@ -189,9 +209,9 @@ class BackupStore { // Active backup tracking void setActiveBackup(const std::string& username, const std::string& index_id, - const std::string& backup_name, std::jthread&& thread) { + const std::string& backup_name, const BackupOperation& operation ,std::jthread&& thread) { std::lock_guard lock(active_user_backups_mutex_); - active_user_backups_[username] = {index_id, backup_name, std::move(thread)}; + active_user_backups_[username] = {index_id, backup_name, operation, std::move(thread)}; } void clearActiveBackup(const std::string& username) { @@ -292,11 +312,15 @@ class BackupStore { // Active backup query - std::optional> getActiveBackup(const std::string& username) { + std::optional getActiveBackup(const std::string& username) { std::lock_guard lock(active_user_backups_mutex_); auto it = active_user_backups_.find(username); if (it != active_user_backups_.end()) { - return std::make_pair(it->second.index_id, it->second.backup_name); + return ActiveBackupStatus{ + it->second.index_id, + it->second.backup_name, + backupOperationToString(it->second.operation) + }; } return std::nullopt; } From c1584f028b59716ae2ad5aebda848ef8d13f60b2 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Thu, 23 Apr 2026 12:31:58 +0530 Subject: [PATCH 14/29] fix: try catch error handling --- src/core/ndd.hpp | 89 +++++++++++++++++++++++++----------------------- src/main.cpp | 8 +++-- 2 files changed, 53 insertions(+), 44 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 900e68cdb5..85a14de3fe 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -226,6 +226,9 @@ class IndexManager { Rebuild rebuild_; void executeBackupJob(const std::string& index_id, const std::string& backup_name, std::stop_token st); + void restoreBackup(const std::string& backup_name, const std::string& target_index_name, + const std::string& username,std::stop_token st); + 
std::unique_ptr createWAL(const std::string& index_id) { const std::string wal_dir = data_dir_ + "/" + index_id; return std::make_unique(wal_dir, index_id); @@ -2160,26 +2163,26 @@ inline void IndexManager::restoreBackup(const std::string& backup_name, // return {false, "Target index already exists"}; // } - std::string error_msg; - if(!backup_store_.extractBackupTar(backup_tar, backup_extract_dir, error_msg)) { - throw std::runtime_error("Failed to extract backup archive: " + error_msg); - } + try { + std::string error_msg; + if(!backup_store_.extractBackupTar(backup_tar, backup_extract_dir, error_msg)) { + throw std::runtime_error("Failed to extract backup archive: " + error_msg); + } - std::vector folders; - for(const auto& entry : std::filesystem::directory_iterator(backup_extract_dir)) { - if(entry.is_directory()) { - folders.push_back(entry.path().string()); + std::vector folders; + for(const auto& entry : std::filesystem::directory_iterator(backup_extract_dir)) { + if(entry.is_directory()) { + folders.push_back(entry.path().string()); + } } - } - if(folders.size() != 1) { - std::filesystem::remove_all(backup_extract_dir); - throw std::runtime_error("Backup extraction failed - directory not found"; - } + if(folders.size() != 1) { + std::filesystem::remove_all(backup_extract_dir); + throw std::runtime_error("Backup extraction failed - directory not found"); + } - std::string backup_dir = folders[0]; + std::string backup_dir = folders[0]; - try { std::ifstream f(backup_dir + "/metadata.json"); if(!f.good()) { std::filesystem::remove_all(backup_extract_dir); @@ -2225,7 +2228,7 @@ inline void IndexManager::restoreBackup(const std::string& backup_name, } catch(const std::exception& e) { std::filesystem::remove_all(backup_extract_dir); backup_store_.clearActiveBackup(username); - LOG_ERROR(2058, backup_name, "Restoration of backup failed for " << backup_name << ", index name ", << target_index_name <<": " << e.what()); + LOG_ERROR(2058, backup_name, "Restoration of 
backup failed for " << backup_name << ", index name " << target_index_name <<": " << e.what()); } } @@ -2269,32 +2272,34 @@ inline std::pair IndexManager::restoreBackupAsync(const std:: const std::string& target_index_name, const std::string& username) { - // Check if any backup is already under creation or restoration - if(backup_store_.hasActiveBackup(username)) { - return {false, "Backup already in progress for user: " + username}; - } - - // Check if the backup exists - nlohmann::json backup_db = backup_store_.readBackupJson(username); - if(!backup_db.contains(backup_name)) { - return {false, "Backup not found: " + backup_name} - } - - // Check if an index with target name already exists - std::string target_index_id = username + "/" + target_index_name; - if(metadata_manager_->getMetaData(target_index_id).has_value()) { - return {false, "Target index already exists"}; - } - - std::jthread t([this, backup_name, target_index_name](std::stop_token st) { - restoreBackup(backup_name, target_index_name, username); - }) - - const std::string index_id = username + "/" + target_index_name; - backup_store_.setActiveBackup(username,index_id, backup_name, BackupOperation::Restoration, std::move(t)); - - LOG_INFO(2059, username, "Restoration started for backup: " << backup_name <<", target_index: " << target_index_name); - } + // Check if any backup is already under creation or restoration + if(backup_store_.hasActiveBackup(username)) { + return {false, "Backup already in progress for user: " + username}; + } + + // Check if the backup exists + nlohmann::json backup_db = backup_store_.readBackupJson(username); + if(!backup_db.contains(backup_name)) { + return {false, "Backup not found: " + backup_name}; + } + + // Check if an index with target name already exists + std::string target_index_id = username + "/" + target_index_name; + if(metadata_manager_->getMetadata(target_index_id).has_value()) { + return {false, "Target index already exists"}; + } + + std::jthread 
t([this, backup_name, target_index_name, username](std::stop_token st) { + restoreBackup(backup_name, target_index_name, username,st); + }); + + const std::string index_id = username + "/" + target_index_name; + backup_store_.setActiveBackup(username,index_id, backup_name, BackupOperation::Restoration, std::move(t)); + + LOG_INFO(2059, username, "Restoration started for backup: " << backup_name <<", target_index: " << target_index_name); + + return {true, target_index_name}; +} inline std::pair IndexManager::uploadBackup(const std::string& backup_name, const std::string& username, const std::string& file_content) { std::string user_backup_dir = backup_store_.getUserBackupDir(username); diff --git a/src/main.cpp b/src/main.cpp index 8cca1b4608..224295ebd9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -521,12 +521,16 @@ int main(int argc, char** argv) { try { std::pair result = - index_manager.restoreBackup(backup_name, target_index_name, ctx.username); + index_manager.restoreBackupAsync(backup_name, target_index_name, ctx.username); if(!result.first) { LOG_WARN(1023, ctx.username, target_index_name, "Restore-backup request rejected: " << result.second); return json_error(400, result.second); } - return crow::response(201, "Backup restored successfully"); + crow::json::wvalue response; + response["backup_name"] = backup_name; + response["target_index"] = result.second; + response["status"] = "in_progress"; + return crow::response(202, response.dump()); } catch(const std::exception& e) { return json_error_500(ctx.username, target_index_name, req.url, e.what()); } From cefe072d2fd7f92cf6962e2b1a3338f690af7dd0 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Thu, 23 Apr 2026 12:36:25 +0530 Subject: [PATCH 15/29] remove commented code --- src/core/ndd.hpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 85a14de3fe..9c46e36140 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -2142,10 +2142,6 @@ inline void 
IndexManager::restoreBackup(const std::string& backup_name, const std::string& target_index_name, const std::string& username, std::stop_token st) { - // std::pair result = backup_store_.validateBackupName(backup_name); - // if(!result.first) { - // return result; - // } std::string backup_dir_root = backup_store_.getUserBackupDir(username); std::string backup_tar = backup_dir_root + "/" + backup_name + ".tar"; @@ -2155,14 +2151,6 @@ inline void IndexManager::restoreBackup(const std::string& backup_name, std::string target_index_id = username + "/" + target_index_name; std::string target_dir = data_dir_ + "/" + target_index_id; - // if(!std::filesystem::exists(backup_tar)) { - // return {false, "Backup not found: " + backup_name}; - // } - - // if(metadata_manager_->getMetadata(target_index_id).has_value()) { - // return {false, "Target index already exists"}; - // } - try { std::string error_msg; if(!backup_store_.extractBackupTar(backup_tar, backup_extract_dir, error_msg)) { From 075589e02efb5a68608532fa2a2ffb1c5141a7f6 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Thu, 23 Apr 2026 13:05:24 +0530 Subject: [PATCH 16/29] remove ActiveBackupStatus --- src/core/ndd.hpp | 7 +++---- src/main.cpp | 5 ++--- src/storage/backup_store.hpp | 20 ++++---------------- 3 files changed, 9 insertions(+), 23 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 9c46e36140..3ec0c61ef9 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -1888,7 +1888,7 @@ class IndexManager { return backup_store_.deleteBackup(backup_name, username); } - std::optional getActiveBackup(const std::string& username) { + std::optional> getActiveBackup(const std::string& username) { return backup_store_.getActiveBackup(username); } @@ -2249,7 +2249,7 @@ inline std::pair IndexManager::createBackupAsync(const std::s std::jthread t([this, index_id, backup_name](std::stop_token st) { executeBackupJob(index_id, backup_name, st); }); - backup_store_.setActiveBackup(username, index_id, 
backup_name, BackupOperation::Creation, std::move(t)); + backup_store_.setActiveBackup(username, backup_name, BackupOperation::Creation, std::move(t)); LOG_INFO(2046, index_id, "Backup started: " << backup_name); @@ -2281,8 +2281,7 @@ inline std::pair IndexManager::restoreBackupAsync(const std:: restoreBackup(backup_name, target_index_name, username,st); }); - const std::string index_id = username + "/" + target_index_name; - backup_store_.setActiveBackup(username,index_id, backup_name, BackupOperation::Restoration, std::move(t)); + backup_store_.setActiveBackup(username, backup_name, BackupOperation::Restoration, std::move(t)); LOG_INFO(2059, username, "Restoration started for backup: " << backup_name <<", target_index: " << target_index_name); diff --git a/src/main.cpp b/src/main.cpp index 224295ebd9..5109e46e20 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -663,9 +663,8 @@ int main(int argc, char** argv) { crow::json::wvalue response; if (active) { response["active"] = true; - response["backup_name"] = active->backup_name; - response["index_id"] = active->index_id; - response["operation"] = active->operation; + response["backup_name"] = active->first; + response["operation"] = active->second; } else { response["active"] = false; } diff --git a/src/storage/backup_store.hpp b/src/storage/backup_store.hpp index ca33bede2d..506651dbd5 100644 --- a/src/storage/backup_store.hpp +++ b/src/storage/backup_store.hpp @@ -32,14 +32,7 @@ inline std::string backupOperationToString(BackupOperation op) { return ""; } -struct ActiveBackupStatus { - std::string index_id; - std::string backup_name; - std::string operation; -}; - struct ActiveBackup { - std::string index_id; std::string backup_name; BackupOperation operation; std::jthread thread; // jthread: built-in stop_token + auto-join on destruction @@ -208,10 +201,9 @@ class BackupStore { // Active backup tracking - void setActiveBackup(const std::string& username, const std::string& index_id, - const std::string& 
backup_name, const BackupOperation& operation ,std::jthread&& thread) { + void setActiveBackup(const std::string& username, const std::string& backup_name, const BackupOperation& operation ,std::jthread&& thread) { std::lock_guard lock(active_user_backups_mutex_); - active_user_backups_[username] = {index_id, backup_name, operation, std::move(thread)}; + active_user_backups_[username] = {backup_name, operation, std::move(thread)}; } void clearActiveBackup(const std::string& username) { @@ -312,15 +304,11 @@ class BackupStore { // Active backup query - std::optional getActiveBackup(const std::string& username) { + std::optional> getActiveBackup(const std::string& username) { std::lock_guard lock(active_user_backups_mutex_); auto it = active_user_backups_.find(username); if (it != active_user_backups_.end()) { - return ActiveBackupStatus{ - it->second.index_id, - it->second.backup_name, - backupOperationToString(it->second.operation) - }; + return make_pair(it->second.backup_name, backupOperationToString(it->second.operation)); } return std::nullopt; } From b580bedb33ce3f48b299d8865dae55b5209fd0a0 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Thu, 23 Apr 2026 13:07:02 +0530 Subject: [PATCH 17/29] minor change --- src/storage/backup_store.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/storage/backup_store.hpp b/src/storage/backup_store.hpp index 506651dbd5..9b10be5ef7 100644 --- a/src/storage/backup_store.hpp +++ b/src/storage/backup_store.hpp @@ -308,7 +308,7 @@ class BackupStore { std::lock_guard lock(active_user_backups_mutex_); auto it = active_user_backups_.find(username); if (it != active_user_backups_.end()) { - return make_pair(it->second.backup_name, backupOperationToString(it->second.operation)); + return std::make_pair(it->second.backup_name, backupOperationToString(it->second.operation)); } return std::nullopt; } From d55cd9fe237878e1c055fefbe8821d6899236e57 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Thu, 23 Apr 2026 
17:02:11 +0530 Subject: [PATCH 18/29] docs: add async restore backup flow --- docs/backup-system.md | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/docs/backup-system.md b/docs/backup-system.md index 25ad0b4839..c30a8f07ec 100644 --- a/docs/backup-system.md +++ b/docs/backup-system.md @@ -2,7 +2,7 @@ `BackupStore` is a standalone utility class owned by `IndexManager` as a direct member (`BackupStore backup_store_`). It has no dependency on IndexManager — it handles tar operations, backup JSON, file paths, and active backup tracking. `IndexManager` orchestrates the backup flow (save, lock, metadata) and delegates file-level operations to `BackupStore`. All backup API calls go through `IndexManager` — `BackupStore` is not exposed to `main.cpp`. -Backups are stored as `.tar` archives in per-user directories: `{DATA_DIR}/backups/{username}/`. Temp files use a centralized `{DATA_DIR}/backups/.tmp/{username}/` directory. Active backup state is tracked in-memory with mutex protection (`backup_state_mutex_`). +Backups are stored as `.tar` archives in per-user directories: `{DATA_DIR}/backups/{username}/`. Temp files use a centralized `{DATA_DIR}/backups/.tmp/{username}/` directory. Active backup state is tracked in-memory with mutex protection (`active_user_backups_mutex_`). 
## Architecture @@ -10,7 +10,7 @@ Backups are stored as `.tar` archives in per-user directories: `{DATA_DIR}/backu IndexManager (ndd.hpp) ├── BackupStore backup_store_ (direct member) ├── 3 orchestration methods (inline, defined after class): -│ executeBackupJob, createBackupAsync, restoreBackup, uploadBackup +│ executeBackupJob, createBackupAsync, restoreBackupAsync, uploadBackup ├── 5 forwarding methods: │ listBackups, deleteBackup, getActiveBackup, getBackupInfo, validateBackupName └── Handles: saveIndexInternal, getIndexEntry, metadata_manager_, loadIndex @@ -19,9 +19,9 @@ BackupStore (src/storage/backup_store.hpp — standalone, no IndexManager depend ├── Archive: createBackupTar(), extractBackupTar() ├── Helpers: getUserBackupDir(), getUserTempDir(), readBackupJson(), writeBackupJson(), cleanupTempDir() ├── Active backup: setActiveBackup(), clearActiveBackup(), hasActiveBackup(), getActiveBackup() -│ (all protected by backup_state_mutex_) +│ (all protected by active_user_backups_mutex_; tracks both Creation and Restoration operations) ├── Public methods: validateBackupName(), listBackups(), deleteBackup(), getBackupInfo() -└── Owns: data_dir_, active_user_backups_, backup_state_mutex_ (mutable) +└── Owns: data_dir_, active_user_backups_, active_user_backups_mutex_ (mutable) ``` ## API Endpoints @@ -32,7 +32,7 @@ BackupStore (src/storage/backup_store.hpp — standalone, no IndexManager depend | GET | `/api/v1/backups` | List all backup files | | GET | `/api/v1/backups/active` | Check active backup for current user | | GET | `/api/v1/backups/{name}/info` | Get backup metadata (read from .tar) | -| POST | `/api/v1/backups/{name}/restore` | Restore backup to new index | +| POST | `/api/v1/backups/{name}/restore` | Restore backup to new index (async, 202) | | DELETE | `/api/v1/backups/{name}` | Delete a backup file | | GET | `/api/v1/backups/{name}/download` | Download backup (streaming) | | POST | `/api/v1/backups/upload` | Upload a backup file | @@ -49,7 +49,7 @@ 
operation_mutex (mutex, per-index) └── Write operations block until mutex is available ``` -**Simple approach:** No atomic flags or file locks. The backup thread holds `operation_mutex` while saving and creating the tar. Write operations that arrive during backup simply block on the mutex until the backup releases it. One active backup per user is enforced via in-memory map protected by `backup_state_mutex_` for thread-safe access. +**Simple approach:** No atomic flags or file locks. The backup thread holds `operation_mutex` while saving and creating the tar. Write operations that arrive during backup simply block on the mutex until the backup releases it. One active operation per user is enforced via in-memory map protected by `active_user_backups_mutex_` — this covers both backup creation and restore operations, so a user cannot run a backup and a restore concurrently. **Write path during backup:** @@ -92,15 +92,29 @@ addVectors/deleteVectors/updateFilters/deleteByFilter/deleteIndex (blocks if backup holds operation_mutex — resumes after backup completes) ``` -### Restore Backup +### Restore Backup (Async) ``` -POST /backups/{name}/restore -→ validate name → check tar exists → check target index does NOT exist -→ extract tar to backups/.tmp/{username}/ → read metadata.json → copy files to target dir -→ register in MetadataManager → cleanup temp dir → loadIndex() -→ 201 OK +POST /backups/{name}/restore → validate name → check backup exists in backup registry +→ check target index does NOT exist → check active_user_backups_[username] empty (one per user) +→ insert into active_user_backups_ map (BackupOperation::Restoration) +→ spawn jthread → return 202 { backup_name, target_index, status: "in_progress" } +``` + +**Background thread** (`restoreBackup`): + ``` +→ extract tar to backups/.tmp/{username}/{backup_name}/ +→ validate archive structure (expect exactly 1 directory) +→ read metadata.json → copy files to target dir → remove metadata.json from target +→ 
register in MetadataManager +→ [LOCK indices_mutex_] loadIndex() [UNLOCK] +→ cleanup temp dir → erase from active_user_backups_ +``` + +**On failure**: cleanup temp dir → erase from active_user_backups_ → log error (not returned to client). + +**Status polling**: client polls `GET /api/v1/backups/active` to check if restore is still in progress. ### Download (Streaming) @@ -138,11 +152,11 @@ GET /backups/{name}/info | # | Check | Where | |---|-------|-------| -| 1 | **One backup per user** — `active_user_backups_` map rejects if user already has active backup | createBackupAsync | +| 1 | **One operation per user** — `active_user_backups_` map rejects if user already has an active backup or restore | createBackupAsync, restoreBackupAsync | | 2 | **Write blocking** — writes block on `operation_mutex` until backup completes | addVectors, deleteVectors, updateFilters, deleteByFilter, deleteIndex | | 3 | **Name validation** — alphanumeric, underscores, hyphens only; max 200 chars | validateBackupName | | 4 | **Duplicate prevention** — checks if .tar file already exists on disk | createBackupAsync, upload | | 5 | **Disk space** — requires 2x index size available | executeBackupJob | | 6 | **Atomic tar** — writes to `backups/.tmp/{username}/` first, then renames to final location | executeBackupJob | | 7 | **Crash recovery** — on startup: `cleanupTempDir()` deletes entire `backups/.tmp/` directory | BackupStore constructor | -| 8 | **Restore safety** — target must not exist, metadata must be valid, cleanup on failure | restoreBackup | +| 8 | **Restore safety** — target must not exist, metadata must be valid; cleanup (temp dir + active status) on failure in background thread | restoreBackupAsync, restoreBackup | From f2d70e109a829f06bbf0e0314bd769a004052b13 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Thu, 23 Apr 2026 17:03:48 +0530 Subject: [PATCH 19/29] bump web ui version to 1.6.0-alpha.5 --- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/install.sh b/install.sh index 3facd070fb..4a62ea6e53 100755 --- a/install.sh +++ b/install.sh @@ -197,7 +197,7 @@ distro_factory() { # **************************************** add_frontend() { - VERSION="v1.6.1" + VERSION="v1.6.0-alpha.5" log "Pulling frontend version ${VERSION}" mkdir -p $script_dir/frontend cd $script_dir/frontend From 062752304ef84732dac08dbeadd5d5b5e12f974d Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Fri, 24 Apr 2026 14:59:03 +0530 Subject: [PATCH 20/29] refactor: add size check for restore backup --- src/core/ndd.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 3ec0c61ef9..e3c179af06 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -2152,6 +2152,13 @@ inline void IndexManager::restoreBackup(const std::string& backup_name, std::string target_dir = data_dir_ + "/" + target_index_id; try { + size_t backup_size = std::filesystem::file_size(backup_tar); + auto space_info = std::filesystem::space(user_temp_dir); + if(space_info.available < backup_size * 2) { + throw std::runtime_error("Insufficient disk space: need " + + std::to_string(backup_size * 2 / MB) + " MB"); + } + std::string error_msg; if(!backup_store_.extractBackupTar(backup_tar, backup_extract_dir, error_msg)) { throw std::runtime_error("Failed to extract backup archive: " + error_msg); From 1d525a69cdd45071f9e182a0e0da06f609ba747a Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Fri, 24 Apr 2026 17:31:19 +0530 Subject: [PATCH 21/29] refactor: streamline backup thread management in BackupStore --- src/core/ndd.hpp | 11 ++++++++--- src/storage/backup_store.hpp | 11 +++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index e3c179af06..630bf11b22 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -2253,10 +2253,13 @@ inline std::pair IndexManager::createBackupAsync(const std::s return {false, "Backup already exists: " + backup_name}; } + 
backup_store_.setActiveBackup(username, backup_name, BackupOperation::Creation); + std::jthread t([this, index_id, backup_name](std::stop_token st) { executeBackupJob(index_id, backup_name, st); }); - backup_store_.setActiveBackup(username, backup_name, BackupOperation::Creation, std::move(t)); + + backup_store_.attachBackupThread(username, std::move(t)); LOG_INFO(2046, index_id, "Backup started: " << backup_name); @@ -2284,12 +2287,14 @@ inline std::pair IndexManager::restoreBackupAsync(const std:: return {false, "Target index already exists"}; } + backup_store_.setActiveBackup(username, backup_name, BackupOperation::Restoration); + std::jthread t([this, backup_name, target_index_name, username](std::stop_token st) { restoreBackup(backup_name, target_index_name, username,st); }); - - backup_store_.setActiveBackup(username, backup_name, BackupOperation::Restoration, std::move(t)); + backup_store_.attachBackupThread(username, std::move(t)); + LOG_INFO(2059, username, "Restoration started for backup: " << backup_name <<", target_index: " << target_index_name); return {true, target_index_name}; diff --git a/src/storage/backup_store.hpp b/src/storage/backup_store.hpp index 9b10be5ef7..e30bcbedee 100644 --- a/src/storage/backup_store.hpp +++ b/src/storage/backup_store.hpp @@ -200,10 +200,17 @@ class BackupStore { } // Active backup tracking + void setActiveBackup(const std::string& username, const std::string& backup_name, const BackupOperation& operation) { + std::lock_guard lock(active_user_backups_mutex_); + active_user_backups_[username] = ActiveBackup{backup_name, operation, {}}; + } - void setActiveBackup(const std::string& username, const std::string& backup_name, const BackupOperation& operation ,std::jthread&& thread) { + void attachBackupThread(const std::string& username, std::jthread&& thread) { std::lock_guard lock(active_user_backups_mutex_); - active_user_backups_[username] = {backup_name, operation, std::move(thread)}; + auto it = 
active_user_backups_.find(username); + if(it != active_user_backups_.end()) { + it->second.thread = std::move(thread); + } } void clearActiveBackup(const std::string& username) { From 26bb171d2b01305cf7f3b4a4730246ee771afcd2 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Fri, 24 Apr 2026 17:42:07 +0530 Subject: [PATCH 22/29] docs: add attachBackupThread() --- docs/backup-system.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/backup-system.md b/docs/backup-system.md index c30a8f07ec..94f3114f58 100644 --- a/docs/backup-system.md +++ b/docs/backup-system.md @@ -18,8 +18,9 @@ IndexManager (ndd.hpp) BackupStore (src/storage/backup_store.hpp — standalone, no IndexManager dependency) ├── Archive: createBackupTar(), extractBackupTar() ├── Helpers: getUserBackupDir(), getUserTempDir(), readBackupJson(), writeBackupJson(), cleanupTempDir() -├── Active backup: setActiveBackup(), clearActiveBackup(), hasActiveBackup(), getActiveBackup() +├── Active backup: setActiveBackup(), attachBackupThread(), clearActiveBackup(), hasActiveBackup(), getActiveBackup() │ (all protected by active_user_backups_mutex_; tracks both Creation and Restoration operations) +│ setActiveBackup() registers the entry before the thread is spawned; attachBackupThread() moves the jthread in after ├── Public methods: validateBackupName(), listBackups(), deleteBackup(), getBackupInfo() └── Owns: data_dir_, active_user_backups_, active_user_backups_mutex_ (mutable) ``` @@ -69,8 +70,8 @@ If backup holds the mutex, writes block until it completes. 
Normal write-vs-writ ``` POST /index/X/backup → validateBackupName() → check no duplicate .tar on disk → check active_user_backups_[username] empty (one per user) -→ insert into active_user_backups_ map -→ spawn detached thread → return 202 { backup_name } +→ setActiveBackup() — insert entry into active_user_backups_ map (no thread yet) +→ spawn jthread → attachBackupThread() — move jthread into map entry → return 202 { backup_name } ``` **Background thread** (`executeBackupJob`): @@ -97,8 +98,8 @@ addVectors/deleteVectors/updateFilters/deleteByFilter/deleteIndex ``` POST /backups/{name}/restore → validate name → check backup exists in backup registry → check target index does NOT exist → check active_user_backups_[username] empty (one per user) -→ insert into active_user_backups_ map (BackupOperation::Restoration) -→ spawn jthread → return 202 { backup_name, target_index, status: "in_progress" } +→ setActiveBackup() — insert entry into active_user_backups_ map (BackupOperation::Restoration, no thread yet) +→ spawn jthread → attachBackupThread() — move jthread into map entry → return 202 { backup_name, target_index, status: "in_progress" } ``` **Background thread** (`restoreBackup`): From 9749d474c2875a0bbb911a95d94a815dafcbe8c9 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Fri, 24 Apr 2026 17:52:33 +0530 Subject: [PATCH 23/29] refactor: add disk space check before restoring backup --- src/core/ndd.hpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 630bf11b22..92d1cb76ce 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -2152,13 +2152,6 @@ inline void IndexManager::restoreBackup(const std::string& backup_name, std::string target_dir = data_dir_ + "/" + target_index_id; try { - size_t backup_size = std::filesystem::file_size(backup_tar); - auto space_info = std::filesystem::space(user_temp_dir); - if(space_info.available < backup_size * 2) { - throw 
std::runtime_error("Insufficient disk space: need " + - std::to_string(backup_size * 2 / MB) + " MB"); - } - std::string error_msg; if(!backup_store_.extractBackupTar(backup_tar, backup_extract_dir, error_msg)) { throw std::runtime_error("Failed to extract backup archive: " + error_msg); @@ -2287,6 +2280,17 @@ inline std::pair IndexManager::restoreBackupAsync(const std:: return {false, "Target index already exists"}; } + // Check disk space before making the backup active + std::string backup_tar = backup_store_.getUserBackupDir(username) + "/" + backup_name + ".tar"; + std::string user_temp_dir = backup_store_.getUserTempDir(username); + std::filesystem::create_directories(user_temp_dir); + size_t backup_size = std::filesystem::file_size(backup_tar); + auto space_info = std::filesystem::space(user_temp_dir); + if (space_info.available < backup_size * 2) { + return {false, "Insufficient disk space: need " + + std::to_string(backup_size * 2 / MB) + " MB"}; + } + backup_store_.setActiveBackup(username, backup_name, BackupOperation::Restoration); std::jthread t([this, backup_name, target_index_name, username](std::stop_token st) { From b9f38b3308487eeb527647f3deb7fc3595ea1337 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Tue, 28 Apr 2026 16:46:29 +0530 Subject: [PATCH 24/29] refactor: add backup.cpp --- CMakeLists.txt | 1 + src/core/ndd.hpp | 274 ++---------------- src/storage/backup_store.cpp | 547 +++++++++++++++++++++++++++++++++++ src/storage/backup_store.hpp | 320 ++++---------------- 4 files changed, 623 insertions(+), 519 deletions(-) create mode 100644 src/storage/backup_store.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f8e27dbe0..824e5c4684 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,6 +255,7 @@ set(NDD_CORE_SOURCES src/sparse/inverted_index.cpp src/utils/system_sanity/system_sanity.cpp src/core/rebuild.cpp + src/storage/backup_store.cpp ) # Build non-main project sources separately so they can be compiled in parallel diff --git 
a/src/core/ndd.hpp b/src/core/ndd.hpp index 92d1cb76ce..7a1eca6094 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -107,7 +107,7 @@ struct CacheEntry { * * writers: addVectors, saveIndexInternal, saveIndex, deleteVectors, * evictIfNeeded, recoverIndex, deleteVectorsByFilter, updateFilters, - * deleteIndex, executeBackupJob + * deleteIndex * * readers: searchKNN, getVector, getIndexInfo (loaded-index path only) * @@ -202,6 +202,7 @@ struct PersistenceConfig { class IndexManager { friend class Rebuild; // executeJob accesses saveIndexInternal + metadata_manager_ + friend class BackupStore; private: std::deque indices_list_; std::unordered_map> indices_; @@ -224,10 +225,6 @@ class IndexManager { std::atomic running_{true}; BackupStore backup_store_; Rebuild rebuild_; - void executeBackupJob(const std::string& index_id, const std::string& backup_name, std::stop_token st); - - void restoreBackup(const std::string& backup_name, const std::string& target_index_name, - const std::string& username,std::stop_token st); std::unique_ptr createWAL(const std::string& index_id) { const std::string wal_dir = data_dir_ + "/" + index_id; @@ -1974,252 +1971,6 @@ class IndexManager { } }; -// ========== IndexManager backup implementations ========== - -inline void IndexManager::executeBackupJob(const std::string& index_id, const std::string& backup_name, - std::stop_token st) { - std::string username; - size_t upos = index_id.find('/'); - if (upos != std::string::npos) { - username = index_id.substr(0, upos); - } - - try { - std::string index_name; - if (upos != std::string::npos) { - index_name = index_id.substr(upos + 1); - } else { - throw std::runtime_error("Invalid index ID format"); - } - - std::string user_backup_dir = backup_store_.getUserBackupDir(username); - std::filesystem::create_directories(user_backup_dir); - std::string user_temp_dir = backup_store_.getUserTempDir(username); - std::filesystem::create_directories(user_temp_dir); - std::string source_dir = 
data_dir_ + "/" + index_id; - std::string backup_tar_final = user_backup_dir + "/" + backup_name + ".tar"; - std::string backup_tar_temp = user_temp_dir + "/.tmp_" + backup_name + ".tar"; - - if(std::filesystem::exists(backup_tar_final)) { - throw std::runtime_error("Backup already exists: " + backup_name); - } - - size_t index_size = 0; - for(const auto& file : std::filesystem::recursive_directory_iterator(source_dir)) { - if(!std::filesystem::is_directory(file)) { - index_size += std::filesystem::file_size(file); - } - } - - auto space_info = std::filesystem::space(user_backup_dir); - if(space_info.available < index_size * 2) { - throw std::runtime_error("Insufficient disk space: need " + - std::to_string(index_size * 2 / MB) + " MB"); - } - - auto meta = metadata_manager_->getMetadata(index_id); - nlohmann::json metadata_json; - if(meta) { - metadata_json["original_index"] = index_name; - metadata_json["timestamp"] = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); - metadata_json["size_mb"] = index_size / MB; - metadata_json["params"] = {{"M", meta->M}, - {"ef_construction", meta->ef_con}, - {"dim", meta->dimension}, - {"sparse_model", - ndd::sparseScoringModelToString(meta->sparse_model)}, - {"space_type", meta->space_type_str}, - {"quant_level", static_cast(meta->quant_level)}, - {"total_elements", meta->total_elements}, - {"checksum", meta->checksum}}; - LOG_DEBUG("Metadata prepared for backup: " << metadata_json.dump()); - } else { - LOG_ERROR(2041, index_id, "Failed to get metadata for backup"); - throw std::runtime_error("Cannot create backup without index metadata"); - } - - // Check stop_token before expensive operations - if (st.stop_requested()) { - LOG_INFO(2056, index_id, "Backup cancelled before backup work started"); - backup_store_.clearActiveBackup(username); - return; - } - - auto entry_ptr = getIndexEntry(index_id); - auto& entry = *entry_ptr; - std::string metadata_file_in_index = source_dir + "/metadata.json"; - { - 
/** - * NOTE: While making a backup is a reading operation on the index, - * we are picking a writer's lock here because we have disabled reader's - * locks on other instances of read in the system right now. - * - * This is to enable reads while writes are happening on the index. - * Check other instances of shared_lock on operation_mutex. - */ - std::unique_lock operation_lock(entry.operation_mutex); - - // Check again after acquiring lock (shutdown may have been requested while waiting) - if (st.stop_requested()) { - LOG_INFO(2057, index_id, "Backup cancelled"); - backup_store_.clearActiveBackup(username); - return; - } - - saveIndexInternal(entry); - - if(!metadata_json.empty()) { - std::ofstream meta_file(metadata_file_in_index, std::ios::binary); - if(!meta_file) { - throw std::runtime_error("Failed to create metadata file: " + metadata_file_in_index); - } - meta_file << metadata_json.dump(4); - meta_file.flush(); - meta_file.close(); - - if(!std::filesystem::exists(metadata_file_in_index)) { - throw std::runtime_error("Metadata file was not created: " + metadata_file_in_index); - } - LOG_DEBUG("Metadata file created: " << metadata_file_in_index << " (size: " << std::filesystem::file_size(metadata_file_in_index) << " bytes)"); - } - - std::string error_msg; - LOG_DEBUG("Creating tar archive from " << source_dir << " to " << backup_tar_temp); - if(!backup_store_.createBackupTar(source_dir, backup_tar_temp, error_msg, st)) { - if(std::filesystem::exists(metadata_file_in_index)) { - std::filesystem::remove(metadata_file_in_index); - } - throw std::runtime_error("Failed to create tar archive: " + error_msg); - } - - if(!std::filesystem::exists(backup_tar_temp)) { - throw std::runtime_error("Tar archive was not created: " + backup_tar_temp); - } - LOG_DEBUG("Tar archive created successfully: " << backup_tar_temp << " (size: " << std::filesystem::file_size(backup_tar_temp) << " bytes)"); - - if(std::filesystem::exists(metadata_file_in_index)) { - 
std::filesystem::remove(metadata_file_in_index); - } - } - - backup_store_.clearActiveBackup(username); - - LOG_INFO(2042, index_id, "Backup tar created; write operations resumed"); - - std::filesystem::rename(backup_tar_temp, backup_tar_final); - - nlohmann::json backup_db = backup_store_.readBackupJson(username); - backup_db[backup_name] = metadata_json; - backup_store_.writeBackupJson(username, backup_db); - - LOG_INFO(2043, index_id, "Backup completed: " << backup_name << " -> " << backup_tar_final); - - } catch (const std::exception& e) { - std::string user_backup_dir = backup_store_.getUserBackupDir(username); - std::string user_temp_dir = backup_store_.getUserTempDir(username); - std::string source_dir = data_dir_ + "/" + index_id; - std::string backup_tar_final = user_backup_dir + "/" + backup_name + ".tar"; - std::string backup_tar_temp = user_temp_dir + "/.tmp_" + backup_name + ".tar"; - std::string metadata_file_in_index = source_dir + "/metadata.json"; - - if(std::filesystem::exists(backup_tar_temp)) { - std::filesystem::remove(backup_tar_temp); - } - if(std::filesystem::exists(backup_tar_final)) { - std::filesystem::remove(backup_tar_final); - } - if(std::filesystem::exists(metadata_file_in_index)) { - std::filesystem::remove(metadata_file_in_index); - } - - backup_store_.clearActiveBackup(username); - - LOG_ERROR(2044, index_id, "Backup failed for " << backup_name << ": " << e.what()); - } -} - -inline void IndexManager::restoreBackup(const std::string& backup_name, - const std::string& target_index_name, - const std::string& username, - std::stop_token st) { - - std::string backup_dir_root = backup_store_.getUserBackupDir(username); - std::string backup_tar = backup_dir_root + "/" + backup_name + ".tar"; - std::string user_temp_dir = backup_store_.getUserTempDir(username); - std::filesystem::create_directories(user_temp_dir); - std::string backup_extract_dir = user_temp_dir + "/" + backup_name; - std::string target_index_id = username + "/" + 
target_index_name; - std::string target_dir = data_dir_ + "/" + target_index_id; - - try { - std::string error_msg; - if(!backup_store_.extractBackupTar(backup_tar, backup_extract_dir, error_msg)) { - throw std::runtime_error("Failed to extract backup archive: " + error_msg); - } - - std::vector folders; - for(const auto& entry : std::filesystem::directory_iterator(backup_extract_dir)) { - if(entry.is_directory()) { - folders.push_back(entry.path().string()); - } - } - - if(folders.size() != 1) { - std::filesystem::remove_all(backup_extract_dir); - throw std::runtime_error("Backup extraction failed - directory not found"); - } - - std::string backup_dir = folders[0]; - - std::ifstream f(backup_dir + "/metadata.json"); - if(!f.good()) { - std::filesystem::remove_all(backup_extract_dir); - throw std::runtime_error("Backup metadata missing"); - } - nlohmann::json meta_json = nlohmann::json::parse(f); - - std::filesystem::create_directories(target_dir); - std::filesystem::copy(backup_dir, - target_dir, - std::filesystem::copy_options::recursive - | std::filesystem::copy_options::overwrite_existing); - - std::filesystem::remove(target_dir + "/metadata.json"); - - IndexMetadata new_meta; - new_meta.name = target_index_name; - new_meta.dimension = meta_json["params"]["dim"]; - new_meta.M = meta_json["params"]["M"]; - new_meta.ef_con = meta_json["params"]["ef_construction"]; - new_meta.space_type_str = meta_json["params"]["space_type"]; - new_meta.quant_level = static_cast( - meta_json["params"]["quant_level"].get()); - const auto sparse_model = ndd::sparseScoringModelFromString( - meta_json["params"]["sparse_model"].get()); - new_meta.sparse_model = *sparse_model; - new_meta.created_at = std::chrono::system_clock::now(); - new_meta.total_elements = meta_json["params"].value("total_elements", 0ul); - new_meta.checksum = meta_json["params"].value("checksum", -1); - - metadata_manager_->storeMetadata(target_index_id, new_meta); - - 
std::filesystem::remove_all(backup_extract_dir); - - { - std::unique_lock write_lock(indices_mutex_); - loadIndex(target_index_id); - } - - backup_store_.clearActiveBackup(username); - - LOG_INFO(2045, username, target_index_name, "Restored backup from " << backup_tar); - } catch(const std::exception& e) { - std::filesystem::remove_all(backup_extract_dir); - backup_store_.clearActiveBackup(username); - LOG_ERROR(2058, backup_name, "Restoration of backup failed for " << backup_name << ", index name " << target_index_name <<": " << e.what()); - } -} - inline std::pair IndexManager::createBackupAsync(const std::string& index_id, const std::string& backup_name) { std::pair result = backup_store_.validateBackupName(backup_name); @@ -2248,8 +1999,14 @@ inline std::pair IndexManager::createBackupAsync(const std::s backup_store_.setActiveBackup(username, backup_name, BackupOperation::Creation); - std::jthread t([this, index_id, backup_name](std::stop_token st) { - executeBackupJob(index_id, backup_name, st); + CreateBackupParams params{ + .index_id = index_id, + .backup_name = backup_name, + .index_manager = this, + }; + + std::jthread t([this, params = std::move(params)](std::stop_token st) { + backup_store_.createBackup(params, st); }); backup_store_.attachBackupThread(username, std::move(t)); @@ -2293,8 +2050,15 @@ inline std::pair IndexManager::restoreBackupAsync(const std:: backup_store_.setActiveBackup(username, backup_name, BackupOperation::Restoration); - std::jthread t([this, backup_name, target_index_name, username](std::stop_token st) { - restoreBackup(backup_name, target_index_name, username,st); + RestoreBackupParams params{ + .backup_name = backup_name, + .target_index_name = target_index_name, + .username = username, + .index_manager = this, + }; + + std::jthread t([this, params = std::move(params)](std::stop_token st) { + backup_store_.restoreBackup(params,st); }); backup_store_.attachBackupThread(username, std::move(t)); diff --git 
a/src/storage/backup_store.cpp b/src/storage/backup_store.cpp new file mode 100644 index 0000000000..b6cd0a4d05 --- /dev/null +++ b/src/storage/backup_store.cpp @@ -0,0 +1,547 @@ +#include +#include +#include + +#include "backup_store.hpp" +#include "../core/ndd.hpp" +#include "utils/types.hpp" + +// Construction + +BackupStore::BackupStore(const std::string& data_dir) : + data_dir_(data_dir) { + std::filesystem::create_directories(data_dir + "/backups"); + cleanupTempDir(); +} + +// Core backup operations + +void BackupStore::createBackup(const CreateBackupParams& params, std::stop_token st) { + std::string index_id = params.index_id; + std::string backup_name = params.backup_name; + auto* index_manager = params.index_manager; + + std::string username; + size_t upos = index_id.find('/'); + if (upos != std::string::npos) { + username = index_id.substr(0, upos); + } + + try { + std::string index_name; + if (upos != std::string::npos) { + index_name = index_id.substr(upos + 1); + } else { + throw std::runtime_error("Invalid index ID format"); + } + + std::string user_backup_dir = getUserBackupDir(username); + std::filesystem::create_directories(user_backup_dir); + std::string user_temp_dir = getUserTempDir(username); + std::filesystem::create_directories(user_temp_dir); + std::string source_dir = data_dir_ + "/" + index_id; + std::string backup_tar_final = user_backup_dir + "/" + backup_name + ".tar"; + std::string backup_tar_temp = user_temp_dir + "/.tmp_" + backup_name + ".tar"; + + if(std::filesystem::exists(backup_tar_final)) { + throw std::runtime_error("Backup already exists: " + backup_name); + } + + size_t index_size = 0; + for(const auto& file : std::filesystem::recursive_directory_iterator(source_dir)) { + if(!std::filesystem::is_directory(file)) { + index_size += std::filesystem::file_size(file); + } + } + + auto space_info = std::filesystem::space(user_backup_dir); + if(space_info.available < index_size * 2) { + throw std::runtime_error("Insufficient disk 
space: need " + + std::to_string(index_size * 2 / MB) + " MB"); + } + + auto meta = index_manager->metadata_manager_->getMetadata(index_id); + nlohmann::json metadata_json; + if(meta) { + metadata_json["original_index"] = index_name; + metadata_json["timestamp"] = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + metadata_json["size_mb"] = index_size / MB; + metadata_json["params"] = {{"M", meta->M}, + {"ef_construction", meta->ef_con}, + {"dim", meta->dimension}, + {"sparse_model", + ndd::sparseScoringModelToString(meta->sparse_model)}, + {"space_type", meta->space_type_str}, + {"quant_level", static_cast(meta->quant_level)}, + {"total_elements", meta->total_elements}, + {"checksum", meta->checksum}}; + LOG_DEBUG("Metadata prepared for backup: " << metadata_json.dump()); + } else { + LOG_ERROR(2041, index_id, "Failed to get metadata for backup"); + throw std::runtime_error("Cannot create backup without index metadata"); + } + + // Check stop_token before expensive operations + if (st.stop_requested()) { + LOG_INFO(2056, index_id, "Backup cancelled before backup work started"); + clearActiveBackup(username); + return; + } + + auto entry_ptr = index_manager->getIndexEntry(index_id); + auto& entry = *entry_ptr; + std::string metadata_file_in_index = source_dir + "/metadata.json"; + { + /** + * NOTE: While making a backup is a reading operation on the index, + * we are picking a writer's lock here because we have disabled reader's + * locks on other instances of read in the system right now. + * + * This is to enable reads while writes are happening on the index. + * Check other instances of shared_lock on operation_mutex. 
+ */ + std::unique_lock operation_lock(entry.operation_mutex); + + // Check again after acquiring lock (shutdown may have been requested while waiting) + if (st.stop_requested()) { + LOG_INFO(2057, index_id, "Backup cancelled"); + clearActiveBackup(username); + return; + } + + index_manager->saveIndexInternal(entry); + + if(!metadata_json.empty()) { + std::ofstream meta_file(metadata_file_in_index, std::ios::binary); + if(!meta_file) { + throw std::runtime_error("Failed to create metadata file: " + metadata_file_in_index); + } + meta_file << metadata_json.dump(4); + meta_file.flush(); + meta_file.close(); + + if(!std::filesystem::exists(metadata_file_in_index)) { + throw std::runtime_error("Metadata file was not created: " + metadata_file_in_index); + } + LOG_DEBUG("Metadata file created: " << metadata_file_in_index << " (size: " << std::filesystem::file_size(metadata_file_in_index) << " bytes)"); + } + + std::string error_msg; + LOG_DEBUG("Creating tar archive from " << source_dir << " to " << backup_tar_temp); + if(!createBackupTar(source_dir, backup_tar_temp, error_msg, st)) { + if(std::filesystem::exists(metadata_file_in_index)) { + std::filesystem::remove(metadata_file_in_index); + } + throw std::runtime_error("Failed to create tar archive: " + error_msg); + } + + if(!std::filesystem::exists(backup_tar_temp)) { + throw std::runtime_error("Tar archive was not created: " + backup_tar_temp); + } + LOG_DEBUG("Tar archive created successfully: " << backup_tar_temp << " (size: " << std::filesystem::file_size(backup_tar_temp) << " bytes)"); + + if(std::filesystem::exists(metadata_file_in_index)) { + std::filesystem::remove(metadata_file_in_index); + } + } + + clearActiveBackup(username); + + LOG_INFO(2042, index_id, "Backup tar created; write operations resumed"); + + std::filesystem::rename(backup_tar_temp, backup_tar_final); + + nlohmann::json backup_db = readBackupJson(username); + backup_db[backup_name] = metadata_json; + writeBackupJson(username, backup_db); + + 
LOG_INFO(2043, index_id, "Backup completed: " << backup_name << " -> " << backup_tar_final); + + } catch (const std::exception& e) { + std::string user_backup_dir = getUserBackupDir(username); + std::string user_temp_dir = getUserTempDir(username); + std::string source_dir = data_dir_ + "/" + index_id; + std::string backup_tar_final = user_backup_dir + "/" + backup_name + ".tar"; + std::string backup_tar_temp = user_temp_dir + "/.tmp_" + backup_name + ".tar"; + std::string metadata_file_in_index = source_dir + "/metadata.json"; + + if(std::filesystem::exists(backup_tar_temp)) { + std::filesystem::remove(backup_tar_temp); + } + if(std::filesystem::exists(backup_tar_final)) { + std::filesystem::remove(backup_tar_final); + } + if(std::filesystem::exists(metadata_file_in_index)) { + std::filesystem::remove(metadata_file_in_index); + } + + clearActiveBackup(username); + + LOG_ERROR(2044, index_id, "Backup failed for " << backup_name << ": " << e.what()); + } +} + +void BackupStore::restoreBackup(const RestoreBackupParams& params, std::stop_token st) { + std::string username = params.username; + std::string backup_name = params.backup_name; + std::string target_index_name = params.target_index_name; + auto* index_manager = params.index_manager; + + std::string backup_dir_root = getUserBackupDir(username); + std::string backup_tar = backup_dir_root + "/" + backup_name + ".tar"; + std::string user_temp_dir = getUserTempDir(username); + std::filesystem::create_directories(user_temp_dir); + std::string backup_extract_dir = user_temp_dir + "/" + backup_name; + std::string target_index_id = username + "/" + target_index_name; + std::string target_dir = data_dir_ + "/" + target_index_id; + + try { + std::string error_msg; + if(!extractBackupTar(backup_tar, backup_extract_dir, error_msg)) { + throw std::runtime_error("Failed to extract backup archive: " + error_msg); + } + + std::vector folders; + for(const auto& entry : std::filesystem::directory_iterator(backup_extract_dir)) { 
+ if(entry.is_directory()) { + folders.push_back(entry.path().string()); + } + } + + if(folders.size() != 1) { + std::filesystem::remove_all(backup_extract_dir); + throw std::runtime_error("Backup extraction failed - directory not found"); + } + + std::string backup_dir = folders[0]; + + std::ifstream f(backup_dir + "/metadata.json"); + if(!f.good()) { + std::filesystem::remove_all(backup_extract_dir); + throw std::runtime_error("Backup metadata missing"); + } + nlohmann::json meta_json = nlohmann::json::parse(f); + + std::filesystem::create_directories(target_dir); + std::filesystem::copy(backup_dir, + target_dir, + std::filesystem::copy_options::recursive + | std::filesystem::copy_options::overwrite_existing); + + std::filesystem::remove(target_dir + "/metadata.json"); + + IndexMetadata new_meta; + new_meta.name = target_index_name; + new_meta.dimension = meta_json["params"]["dim"]; + new_meta.M = meta_json["params"]["M"]; + new_meta.ef_con = meta_json["params"]["ef_construction"]; + new_meta.space_type_str = meta_json["params"]["space_type"]; + new_meta.quant_level = static_cast( + meta_json["params"]["quant_level"].get()); + const auto sparse_model = ndd::sparseScoringModelFromString( + meta_json["params"]["sparse_model"].get()); + new_meta.sparse_model = *sparse_model; + new_meta.created_at = std::chrono::system_clock::now(); + new_meta.total_elements = meta_json["params"].value("total_elements", 0ul); + new_meta.checksum = meta_json["params"].value("checksum", -1); + + index_manager->metadata_manager_->storeMetadata(target_index_id, new_meta); + + std::filesystem::remove_all(backup_extract_dir); + + { + std::unique_lock write_lock(index_manager->indices_mutex_); + index_manager->loadIndex(target_index_id); + } + + clearActiveBackup(username); + + LOG_INFO(2045, username, target_index_name, "Restored backup from " << backup_tar); + } catch(const std::exception& e) { + std::filesystem::remove_all(backup_extract_dir); + clearActiveBackup(username); + 
LOG_ERROR(2058, + backup_name, + "Restoration of backup failed for " << backup_name << ", index name " + << target_index_name << ": " << e.what()); + } +} + +// Archive operations + +bool BackupStore::createBackupTar(const std::filesystem::path& source_dir, + const std::filesystem::path& archive_path, + std::string& error_msg, + std::stop_token st) { + struct archive* a = archive_write_new(); + archive_write_set_format_pax_restricted(a); + + if(archive_write_open_filename(a, archive_path.string().c_str()) != ARCHIVE_OK) { + error_msg = archive_error_string(a); + archive_write_free(a); + return false; + } + + for(const auto& entry : std::filesystem::recursive_directory_iterator(source_dir)) { + // Check stop_token per-file so shutdown doesn't block on large tar operations + if(st.stop_requested()) { + archive_write_close(a); + archive_write_free(a); + error_msg = "Backup cancelled"; + return false; + } + if(entry.is_regular_file()) { + struct archive_entry* e = archive_entry_new(); + + std::filesystem::path rel_path = + std::filesystem::relative(entry.path(), source_dir.parent_path()); + archive_entry_set_pathname(e, rel_path.string().c_str()); + archive_entry_set_size(e, std::filesystem::file_size(entry.path())); + archive_entry_set_filetype(e, AE_IFREG); + archive_entry_set_perm(e, 0644); + + if(archive_write_header(a, e) != ARCHIVE_OK) { + error_msg = archive_error_string(a); + archive_entry_free(e); + archive_write_close(a); + archive_write_free(a); + return false; + } + + std::ifstream file(entry.path(), std::ios::binary); + char buffer[8192]; + while(file.read(buffer, sizeof(buffer)) || file.gcount() > 0) { + archive_write_data(a, buffer, file.gcount()); + } + file.close(); + archive_entry_free(e); + } + } + + archive_write_close(a); + archive_write_free(a); + return true; +} + +bool BackupStore::extractBackupTar(const std::filesystem::path& archive_path, + const std::filesystem::path& dest_dir, + std::string& error_msg) { + struct archive* a = 
archive_read_new(); + struct archive* ext = archive_write_disk_new(); + struct archive_entry* entry; + + archive_read_support_format_all(a); + archive_read_support_filter_all(a); + archive_write_disk_set_options(ext, ARCHIVE_EXTRACT_TIME | ARCHIVE_EXTRACT_PERM); + archive_write_disk_set_standard_lookup(ext); + + if(archive_read_open_filename(a, archive_path.string().c_str(), 10240) != ARCHIVE_OK) { + error_msg = archive_error_string(a); + archive_read_free(a); + archive_write_free(ext); + return false; + } + + while(archive_read_next_header(a, &entry) == ARCHIVE_OK) { + std::filesystem::path full_path = dest_dir / archive_entry_pathname(entry); + archive_entry_set_pathname(entry, full_path.string().c_str()); + + if(archive_write_header(ext, entry) == ARCHIVE_OK) { + const void* buff; + size_t size; + la_int64_t offset; + + while(archive_read_data_block(a, &buff, &size, &offset) == ARCHIVE_OK) { + archive_write_data_block(ext, buff, size, offset); + } + } + archive_write_finish_entry(ext); + } + + archive_read_close(a); + archive_read_free(a); + archive_write_close(ext); + archive_write_free(ext); + return true; +} + +// Backup listing & info + +nlohmann::json BackupStore::listBackups(const std::string& username) { + nlohmann::json backup_list_json = readBackupJson(username); + return backup_list_json; +} + +nlohmann::json BackupStore::getBackupInfo(const std::string& backup_name, + const std::string& username) { + nlohmann::json backup_db = readBackupJson(username); + if(backup_db.contains(backup_name)) { + return backup_db[backup_name]; + } + return nlohmann::json(); +} + +// Backup name validation + +std::pair BackupStore::validateBackupName(const std::string& backup_name) const { + if(backup_name.empty()) { + return std::make_pair(false, "Backup name cannot be empty"); + } + + if(backup_name.length() > settings::MAX_BACKUP_NAME_LENGTH) { + return std::make_pair(false, + "Backup name too long (max " + + std::to_string(settings::MAX_BACKUP_NAME_LENGTH) + + " 
characters)"); + } + + static const std::regex backup_name_regex("^[a-zA-Z0-9_-]+$"); + if(!std::regex_match(backup_name, backup_name_regex)) { + return std::make_pair(false, + "Invalid backup name: only alphanumeric, underscores, " + "and hyphens allowed"); + } + + return std::make_pair(true, ""); +} + +// Backup deletion + +std::pair BackupStore::deleteBackup(const std::string& backup_name, + const std::string& username) { + std::pair result = validateBackupName(backup_name); + if(!result.first) { + return result; + } + + std::string backup_tar = getUserBackupDir(username) + "/" + backup_name + ".tar"; + + if(std::filesystem::exists(backup_tar)) { + std::filesystem::remove(backup_tar); + + nlohmann::json backup_db = readBackupJson(username); + backup_db.erase(backup_name); + writeBackupJson(username, backup_db); + + LOG_INFO(1303, username, "Deleted backup " << backup_tar); + return {true, ""}; + } else { + return {false, "Backup not found"}; + } +} + +// Active backup tracking + +void BackupStore::setActiveBackup(const std::string& username, + const std::string& backup_name, + const BackupOperation& operation) { + std::lock_guard lock(active_user_backups_mutex_); + active_user_backups_[username] = ActiveBackup{backup_name, operation, {}}; +} + +void BackupStore::attachBackupThread(const std::string& username, std::jthread&& thread) { + std::lock_guard lock(active_user_backups_mutex_); + auto it = active_user_backups_.find(username); + if(it != active_user_backups_.end()) { + it->second.thread = std::move(thread); + } +} + +void BackupStore::clearActiveBackup(const std::string& username) { + std::lock_guard lock(active_user_backups_mutex_); + auto it = active_user_backups_.find(username); + if(it != active_user_backups_.end()) { + // Called from within the thread itself — detach so erase doesn't try to join + if(it->second.thread.joinable()) { + it->second.thread.detach(); + } + active_user_backups_.erase(it); + } +} + +bool BackupStore::hasActiveBackup(const 
std::string& username) const { + std::lock_guard lock(active_user_backups_mutex_); + return active_user_backups_.count(username) > 0; +} + +std::optional> BackupStore::getActiveBackup(const std::string& username) { + std::lock_guard lock(active_user_backups_mutex_); + auto it = active_user_backups_.find(username); + if(it != active_user_backups_.end()) { + return std::make_pair(it->second.backup_name, + backupOperationToString(it->second.operation)); + } + return std::nullopt; +} + +void BackupStore::joinAllThreads() { + std::vector threads_to_join; + { + std::lock_guard lock(active_user_backups_mutex_); + for(auto& [username, backup] : active_user_backups_) { + if(backup.thread.joinable()) { + threads_to_join.push_back(std::move(backup.thread)); + } + } + active_user_backups_.clear(); + } + // request_stop + join outside the lock + for(auto& t : threads_to_join) { + t.request_stop(); // signal stop_token — thread sees it inside createBackupTar + if(t.joinable()) { + t.join(); + } + } +} + +// Path helpers + +std::string BackupStore::getUserBackupDir(const std::string& username) const { + return data_dir_ + "/backups/" + username; +} + +std::string BackupStore::getBackupJsonPath(const std::string& username) const { + return getUserBackupDir(username) + "/backup.json"; +} + +std::string BackupStore::getUserTempDir(const std::string& username) const { + return data_dir_ + "/backups/.tmp/" + username; +} + +// Backup JSON helpers + +void BackupStore::writeBackupJson(const std::string& username, const nlohmann::json& data) { + std::string path = getBackupJsonPath(username); + std::ofstream f(path); + f << data.dump(2); +} + +nlohmann::json BackupStore::readBackupJson(const std::string& username) { + std::string path = getBackupJsonPath(username); + if(!std::filesystem::exists(path)) { + return nlohmann::json::object(); + } + try { + std::ifstream f(path); + return nlohmann::json::parse(f); + } catch(const std::exception& e) { + LOG_WARN(1304, + username, + "Failed to 
parse backup metadata file " << path << ": " << e.what()); + return nlohmann::json::object(); + } +} + +// Temp directory cleanup + +void BackupStore::cleanupTempDir() { + std::string temp_dir = data_dir_ + "/backups/.tmp"; + if(std::filesystem::exists(temp_dir)) { + try { + std::filesystem::remove_all(temp_dir); + LOG_INFO(1301, "Cleaned up backup temp directory"); + } catch(const std::exception& e) { + LOG_ERROR(1302, "Failed to clean up backup temp directory: " << e.what()); + } + } +} diff --git a/src/storage/backup_store.hpp b/src/storage/backup_store.hpp index e30bcbedee..1c75189ceb 100644 --- a/src/storage/backup_store.hpp +++ b/src/storage/backup_store.hpp @@ -2,40 +2,45 @@ #include #include -#include #include #include - #include -#include -#include #include #include #include #include "json/nlohmann_json.hpp" -#include "index_meta.hpp" -#include "settings.hpp" -#include "log.hpp" -enum class BackupOperation { - Creation, - Restoration -}; +class IndexManager; + +enum class BackupOperation { Creation, Restoration }; inline std::string backupOperationToString(BackupOperation op) { - switch (op) { + switch(op) { case BackupOperation::Creation: return "creation"; case BackupOperation::Restoration: return "restoration"; } return ""; } +struct CreateBackupParams { + std::string index_id; + std::string backup_name; + IndexManager* index_manager; +}; + +struct RestoreBackupParams { + std::string backup_name; + std::string target_index_name; + std::string username; + IndexManager* index_manager; +}; + struct ActiveBackup { std::string backup_name; BackupOperation operation; - std::jthread thread; // jthread: built-in stop_token + auto-join on destruction + std::jthread thread; // jthread: built-in stop_token + auto-join on destruction }; class BackupStore { @@ -45,288 +50,75 @@ class BackupStore { mutable std::mutex active_user_backups_mutex_; public: - BackupStore(const std::string& data_dir) - : data_dir_(data_dir) { - std::filesystem::create_directories(data_dir 
+ "/backups"); - cleanupTempDir(); - } + BackupStore(const std::string& data_dir); + + // Core backup operations + + void createBackup(const CreateBackupParams& params, std::stop_token st); - // Archive methods + void restoreBackup(const RestoreBackupParams& params, std::stop_token st); + + // Archive operations bool createBackupTar(const std::filesystem::path& source_dir, const std::filesystem::path& archive_path, std::string& error_msg, - std::stop_token st = {}) { - struct archive* a = archive_write_new(); - archive_write_set_format_pax_restricted(a); - - if(archive_write_open_filename(a, archive_path.string().c_str()) != ARCHIVE_OK) { - error_msg = archive_error_string(a); - archive_write_free(a); - return false; - } - - for(const auto& entry : std::filesystem::recursive_directory_iterator(source_dir)) { - // Check stop_token per-file so shutdown doesn't block on large tar operations - if(st.stop_requested()) { - archive_write_close(a); - archive_write_free(a); - error_msg = "Backup cancelled"; - return false; - } - if(entry.is_regular_file()) { - struct archive_entry* e = archive_entry_new(); - - std::filesystem::path rel_path = - std::filesystem::relative(entry.path(), source_dir.parent_path()); - archive_entry_set_pathname(e, rel_path.string().c_str()); - archive_entry_set_size(e, std::filesystem::file_size(entry.path())); - archive_entry_set_filetype(e, AE_IFREG); - archive_entry_set_perm(e, 0644); - - if(archive_write_header(a, e) != ARCHIVE_OK) { - error_msg = archive_error_string(a); - archive_entry_free(e); - archive_write_close(a); - archive_write_free(a); - return false; - } - - std::ifstream file(entry.path(), std::ios::binary); - char buffer[8192]; - while(file.read(buffer, sizeof(buffer)) || file.gcount() > 0) { - archive_write_data(a, buffer, file.gcount()); - } - file.close(); - archive_entry_free(e); - } - } - - archive_write_close(a); - archive_write_free(a); - return true; - } + std::stop_token st = {}); bool extractBackupTar(const 
std::filesystem::path& archive_path, const std::filesystem::path& dest_dir, - std::string& error_msg) { - struct archive* a = archive_read_new(); - struct archive* ext = archive_write_disk_new(); - struct archive_entry* entry; - - archive_read_support_format_all(a); - archive_read_support_filter_all(a); - archive_write_disk_set_options(ext, ARCHIVE_EXTRACT_TIME | ARCHIVE_EXTRACT_PERM); - archive_write_disk_set_standard_lookup(ext); - - if(archive_read_open_filename(a, archive_path.string().c_str(), 10240) != ARCHIVE_OK) { - error_msg = archive_error_string(a); - archive_read_free(a); - archive_write_free(ext); - return false; - } - - while(archive_read_next_header(a, &entry) == ARCHIVE_OK) { - std::filesystem::path full_path = dest_dir / archive_entry_pathname(entry); - archive_entry_set_pathname(entry, full_path.string().c_str()); - - if(archive_write_header(ext, entry) == ARCHIVE_OK) { - const void* buff; - size_t size; - la_int64_t offset; - - while(archive_read_data_block(a, &buff, &size, &offset) == ARCHIVE_OK) { - archive_write_data_block(ext, buff, size, offset); - } - } - archive_write_finish_entry(ext); - } - - archive_read_close(a); - archive_read_free(a); - archive_write_close(ext); - archive_write_free(ext); - return true; - } + std::string& error_msg); - // Path helpers + // Backup listing & info - std::string getUserBackupDir(const std::string& username) const { - return data_dir_ + "/backups/" + username; - } + nlohmann::json listBackups(const std::string& username); - std::string getBackupJsonPath(const std::string& username) const { - return getUserBackupDir(username) + "/backup.json"; - } + nlohmann::json getBackupInfo(const std::string& backup_name, const std::string& username); - std::string getUserTempDir(const std::string& username) const { - return data_dir_ + "/backups/.tmp/" + username; - } + // Backup name validation - // Backup JSON helpers + std::pair validateBackupName(const std::string& backup_name) const; - nlohmann::json 
readBackupJson(const std::string& username) { - std::string path = getBackupJsonPath(username); - if (!std::filesystem::exists(path)) return nlohmann::json::object(); - try { - std::ifstream f(path); - return nlohmann::json::parse(f); - } catch (const std::exception& e) { - LOG_WARN(1304, - username, - "Failed to parse backup metadata file " << path << ": " << e.what()); - return nlohmann::json::object(); - } - } + // Backup deletion - void writeBackupJson(const std::string& username, const nlohmann::json& data) { - std::string path = getBackupJsonPath(username); - std::ofstream f(path); - f << data.dump(2); - } + std::pair deleteBackup(const std::string& backup_name, + const std::string& username); - // Temp directory cleanup + // Active backup tracking - void cleanupTempDir() { - std::string temp_dir = data_dir_ + "/backups/.tmp"; - if (std::filesystem::exists(temp_dir)) { - try { - std::filesystem::remove_all(temp_dir); - LOG_INFO(1301, "Cleaned up backup temp directory"); - } catch (const std::exception& e) { - LOG_ERROR(1302, "Failed to clean up backup temp directory: " << e.what()); - } - } - } + void setActiveBackup(const std::string& username, + const std::string& backup_name, + const BackupOperation& operation); - // Active backup tracking - void setActiveBackup(const std::string& username, const std::string& backup_name, const BackupOperation& operation) { - std::lock_guard lock(active_user_backups_mutex_); - active_user_backups_[username] = ActiveBackup{backup_name, operation, {}}; - } + void attachBackupThread(const std::string& username, std::jthread&& thread); - void attachBackupThread(const std::string& username, std::jthread&& thread) { - std::lock_guard lock(active_user_backups_mutex_); - auto it = active_user_backups_.find(username); - if(it != active_user_backups_.end()) { - it->second.thread = std::move(thread); - } - } + void clearActiveBackup(const std::string& username); - void clearActiveBackup(const std::string& username) { - 
std::lock_guard lock(active_user_backups_mutex_); - auto it = active_user_backups_.find(username); - if (it != active_user_backups_.end()) { - // Called from within the thread itself — detach so erase doesn't try to join - if (it->second.thread.joinable()) { - it->second.thread.detach(); - } - active_user_backups_.erase(it); - } - } + bool hasActiveBackup(const std::string& username) const; - bool hasActiveBackup(const std::string& username) const { - std::lock_guard lock(active_user_backups_mutex_); - return active_user_backups_.count(username) > 0; - } + std::optional> getActiveBackup(const std::string& username); // Join all background backup threads before destroying IndexManager members. // Moves threads out under lock, then request_stop + join outside lock to avoid - // deadlock (finishing threads call clearActiveBackup which also locks active_user_backups_mutex_). - void joinAllThreads() { - std::vector threads_to_join; - { - std::lock_guard lock(active_user_backups_mutex_); - for (auto& [username, backup] : active_user_backups_) { - if (backup.thread.joinable()) { - threads_to_join.push_back(std::move(backup.thread)); - } - } - active_user_backups_.clear(); - } - // request_stop + join outside the lock - for (auto& t : threads_to_join) { - t.request_stop(); // signal stop_token — thread sees it inside createBackupTar - if (t.joinable()) { - t.join(); - } - } - } + // deadlock (finishing threads call clearActiveBackup which also locks + // active_user_backups_mutex_). 
+ void joinAllThreads(); - // Backup name validation - - std::pair validateBackupName(const std::string& backup_name) const { - if(backup_name.empty()) { - return std::make_pair(false, "Backup name cannot be empty"); - } - - if(backup_name.length() > settings::MAX_BACKUP_NAME_LENGTH) { - return std::make_pair(false, - "Backup name too long (max " - + std::to_string(settings::MAX_BACKUP_NAME_LENGTH) - + " characters)"); - } - - static const std::regex backup_name_regex("^[a-zA-Z0-9_-]+$"); - if(!std::regex_match(backup_name, backup_name_regex)) { - return std::make_pair(false, - "Invalid backup name: only alphanumeric, underscores, " - "and hyphens allowed"); - } - - return std::make_pair(true, ""); - } + // Path helpers - // Backup listing + std::string getUserBackupDir(const std::string& username) const; - nlohmann::json listBackups(const std::string& username) { - nlohmann::json backup_list_json = readBackupJson(username); - return backup_list_json; - } + std::string getBackupJsonPath(const std::string& username) const; - // Backup deletion + std::string getUserTempDir(const std::string& username) const; - std::pair deleteBackup(const std::string& backup_name, - const std::string& username) { - std::pair result = validateBackupName(backup_name); - if(!result.first) { - return result; - } - - std::string backup_tar = getUserBackupDir(username) + "/" + backup_name + ".tar"; - - if(std::filesystem::exists(backup_tar)) { - std::filesystem::remove(backup_tar); - - nlohmann::json backup_db = readBackupJson(username); - backup_db.erase(backup_name); - writeBackupJson(username, backup_db); - - LOG_INFO(1303, username, "Deleted backup " << backup_tar); - return {true, ""}; - } else { - return {false, "Backup not found"}; - } - } + // Backup JSON helpers - // Active backup query + nlohmann::json readBackupJson(const std::string& username); - std::optional> getActiveBackup(const std::string& username) { - std::lock_guard lock(active_user_backups_mutex_); - auto it = 
active_user_backups_.find(username); - if (it != active_user_backups_.end()) { - return std::make_pair(it->second.backup_name, backupOperationToString(it->second.operation)); - } - return std::nullopt; - } + void writeBackupJson(const std::string& username, const nlohmann::json& data); - // Backup info + // Temp directory cleanup - nlohmann::json getBackupInfo(const std::string& backup_name, const std::string& username) { - nlohmann::json backup_db = readBackupJson(username); - if (backup_db.contains(backup_name)) { - return backup_db[backup_name]; - } - return nlohmann::json(); - } + void cleanupTempDir(); }; From b69657e235199f0beed0fbdf16e7d1de9fe6c6f1 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Tue, 28 Apr 2026 16:54:40 +0530 Subject: [PATCH 25/29] refactor: update logs --- src/storage/backup_store.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/storage/backup_store.cpp b/src/storage/backup_store.cpp index b6cd0a4d05..d3c564b0a5 100644 --- a/src/storage/backup_store.cpp +++ b/src/storage/backup_store.cpp @@ -77,13 +77,13 @@ void BackupStore::createBackup(const CreateBackupParams& params, std::stop_token {"checksum", meta->checksum}}; LOG_DEBUG("Metadata prepared for backup: " << metadata_json.dump()); } else { - LOG_ERROR(2041, index_id, "Failed to get metadata for backup"); + LOG_ERROR(1305, index_id, "Failed to get metadata for backup"); throw std::runtime_error("Cannot create backup without index metadata"); } // Check stop_token before expensive operations if (st.stop_requested()) { - LOG_INFO(2056, index_id, "Backup cancelled before backup work started"); + LOG_INFO(1306, index_id, "Backup cancelled before backup work started"); clearActiveBackup(username); return; } @@ -104,7 +104,7 @@ void BackupStore::createBackup(const CreateBackupParams& params, std::stop_token // Check again after acquiring lock (shutdown may have been requested while waiting) if (st.stop_requested()) { - LOG_INFO(2057, index_id, 
"Backup cancelled"); + LOG_INFO(1307, index_id, "Backup cancelled"); clearActiveBackup(username); return; } @@ -147,7 +147,7 @@ void BackupStore::createBackup(const CreateBackupParams& params, std::stop_token clearActiveBackup(username); - LOG_INFO(2042, index_id, "Backup tar created; write operations resumed"); + LOG_INFO(1308, index_id, "Backup tar created; write operations resumed"); std::filesystem::rename(backup_tar_temp, backup_tar_final); @@ -155,7 +155,7 @@ void BackupStore::createBackup(const CreateBackupParams& params, std::stop_token backup_db[backup_name] = metadata_json; writeBackupJson(username, backup_db); - LOG_INFO(2043, index_id, "Backup completed: " << backup_name << " -> " << backup_tar_final); + LOG_INFO(1309, index_id, "Backup completed: " << backup_name << " -> " << backup_tar_final); } catch (const std::exception& e) { std::string user_backup_dir = getUserBackupDir(username); @@ -177,7 +177,7 @@ void BackupStore::createBackup(const CreateBackupParams& params, std::stop_token clearActiveBackup(username); - LOG_ERROR(2044, index_id, "Backup failed for " << backup_name << ": " << e.what()); + LOG_ERROR(1310, index_id, "Backup failed for " << backup_name << ": " << e.what()); } } @@ -256,14 +256,12 @@ void BackupStore::restoreBackup(const RestoreBackupParams& params, std::stop_tok clearActiveBackup(username); - LOG_INFO(2045, username, target_index_name, "Restored backup from " << backup_tar); + LOG_INFO(1311, username, target_index_name, "Restored backup from " << backup_tar); } catch(const std::exception& e) { std::filesystem::remove_all(backup_extract_dir); clearActiveBackup(username); - LOG_ERROR(2058, - backup_name, - "Restoration of backup failed for " << backup_name << ", index name " - << target_index_name << ": " << e.what()); + LOG_ERROR(1312, username, target_index_name, + "Restoration of backup failed for " << backup_name << ": " << e.what()); } } From 868ed7489c014da00d04bb140f69f285ff464e58 Mon Sep 17 00:00:00 2001 From: Pankaj 
Singh Date: Tue, 28 Apr 2026 16:54:56 +0530 Subject: [PATCH 26/29] update backup flow --- docs/backup-system.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/backup-system.md b/docs/backup-system.md index 94f3114f58..baa5cff823 100644 --- a/docs/backup-system.md +++ b/docs/backup-system.md @@ -97,7 +97,8 @@ addVectors/deleteVectors/updateFilters/deleteByFilter/deleteIndex ``` POST /backups/{name}/restore → validate name → check backup exists in backup registry -→ check target index does NOT exist → check active_user_backups_[username] empty (one per user) +→ check target index does NOT exist → check disk space (need 2x tar size) +→ check active_user_backups_[username] empty (one per user) → setActiveBackup() — insert entry into active_user_backups_ map (BackupOperation::Restoration, no thread yet) → spawn jthread → attachBackupThread() — move jthread into map entry → return 202 { backup_name, target_index, status: "in_progress" } ``` @@ -157,7 +158,8 @@ GET /backups/{name}/info | 2 | **Write blocking** — writes block on `operation_mutex` until backup completes | addVectors, deleteVectors, updateFilters, deleteByFilter, deleteIndex | | 3 | **Name validation** — alphanumeric, underscores, hyphens only; max 200 chars | validateBackupName | | 4 | **Duplicate prevention** — checks if .tar file already exists on disk | createBackupAsync, upload | -| 5 | **Disk space** — requires 2x index size available | executeBackupJob | +| 5 | **Disk space (create)** — requires 2x index size available in backup dir | executeBackupJob | +| 5b | **Disk space (restore)** — requires 2x tar file size available in temp dir | restoreBackupAsync | | 6 | **Atomic tar** — writes to `backups/.tmp/{username}/` first, then renames to final location | executeBackupJob | | 7 | **Crash recovery** — on startup: `cleanupTempDir()` deletes entire `backups/.tmp/` directory | BackupStore constructor | | 8 | **Restore safety** — target must not exist, metadata must be valid; 
cleanup (temp dir + active status) on failure in background thread | restoreBackupAsync, restoreBackup | From 2769c642b69817340cbbfba13b5a154b4672353e Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Tue, 28 Apr 2026 17:38:47 +0530 Subject: [PATCH 27/29] refactor: add test cases for backup flow --- tests/CMakeLists.txt | 33 +++ tests/backup_test.cpp | 559 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 592 insertions(+) create mode 100644 tests/backup_test.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ccd5019366..268f3eab1e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -72,3 +72,36 @@ target_link_libraries(ndd_rebuild_test target_compile_definitions(ndd_rebuild_test PRIVATE MDB_MAXKEYSIZE=512) gtest_discover_tests(ndd_rebuild_test) + +# --- ndd_backup_test --- +add_executable(ndd_backup_test backup_test.cpp ${LMDB_SOURCES} ${ROARING_SOURCE}) + +set_source_files_properties(${LMDB_SOURCES} PROPERTIES + COMPILE_FLAGS "-DMDBX_BUILD_SHARED_LIBRARY=0 -DMDBX_BUILD_FLAGS=\\\"NDD_EMBEDDED\\\"" +) + +target_include_directories(ndd_backup_test PRIVATE + ${CMAKE_SOURCE_DIR}/src + ${CMAKE_SOURCE_DIR}/src/core + ${CMAKE_SOURCE_DIR}/src/utils + ${CMAKE_SOURCE_DIR}/src/server + ${CMAKE_SOURCE_DIR}/src/storage + ${CMAKE_SOURCE_DIR}/src/filter + ${CMAKE_SOURCE_DIR}/src/sparse + ${CMAKE_SOURCE_DIR}/src/hnsw + ${CMAKE_SOURCE_DIR}/src/quant + ${CMAKE_SOURCE_DIR}/third_party + ${CMAKE_SOURCE_DIR}/third_party/json + ${CMAKE_SOURCE_DIR}/third_party/msgpack/include + ${LIBARCHIVE_INCLUDE_DIR} +) + +target_link_libraries(ndd_backup_test + PRIVATE + ndd_core + GTest::gtest_main +) + +target_compile_definitions(ndd_backup_test PRIVATE MDB_MAXKEYSIZE=512) + +gtest_discover_tests(ndd_backup_test) diff --git a/tests/backup_test.cpp b/tests/backup_test.cpp new file mode 100644 index 0000000000..0e0cfe9e44 --- /dev/null +++ b/tests/backup_test.cpp @@ -0,0 +1,559 @@ +#include +#include +#include +#include +#include +#include + +#include 
"backup_store.hpp" +#include "ndd.hpp" +#include "utils/msgpack_ndd.hpp" +#include "server/auth.hpp" + +namespace fs = std::filesystem; + +// ============================================================ +// Layer 1 — BackupStore state management (no IndexManager) +// ============================================================ + +class BackupStoreStateTest : public ::testing::Test { +protected: + std::string dir_; + std::unique_ptr store_; + + void SetUp() override { + dir_ = "./test_backup_state_" + std::to_string(rand()); + fs::create_directories(dir_); + store_ = std::make_unique(dir_); + } + + void TearDown() override { + store_.reset(); + if (fs::exists(dir_)) fs::remove_all(dir_); + } +}; + +// --- validateBackupName --- + +TEST_F(BackupStoreStateTest, ValidateName_AlphanumericUnderscore_Passes) { + auto [ok, msg] = store_->validateBackupName("my_backup"); + EXPECT_TRUE(ok); + EXPECT_TRUE(msg.empty()); +} + +TEST_F(BackupStoreStateTest, ValidateName_WithHyphen_Passes) { + auto [ok, msg] = store_->validateBackupName("backup-2024"); + EXPECT_TRUE(ok); +} + +TEST_F(BackupStoreStateTest, ValidateName_Empty_Fails) { + auto [ok, msg] = store_->validateBackupName(""); + EXPECT_FALSE(ok); + EXPECT_FALSE(msg.empty()); +} + +TEST_F(BackupStoreStateTest, ValidateName_TooLong_Fails) { + auto [ok, msg] = store_->validateBackupName(std::string(201, 'a')); + EXPECT_FALSE(ok); + EXPECT_NE(msg.find("too long"), std::string::npos); +} + +TEST_F(BackupStoreStateTest, ValidateName_Slash_Fails) { + auto [ok, msg] = store_->validateBackupName("bad/name"); + EXPECT_FALSE(ok); +} + +TEST_F(BackupStoreStateTest, ValidateName_Space_Fails) { + auto [ok, msg] = store_->validateBackupName("bad name"); + EXPECT_FALSE(ok); +} + +TEST_F(BackupStoreStateTest, ValidateName_Dot_Fails) { + auto [ok, msg] = store_->validateBackupName("backup.tar"); + EXPECT_FALSE(ok); +} + +// --- Active backup tracking --- + +TEST_F(BackupStoreStateTest, NoActive_HasActiveIsFalse) { + 
EXPECT_FALSE(store_->hasActiveBackup("alice")); +} + +TEST_F(BackupStoreStateTest, SetActive_HasActiveIsTrue) { + store_->setActiveBackup("alice", "bk1", BackupOperation::Creation); + EXPECT_TRUE(store_->hasActiveBackup("alice")); +} + +TEST_F(BackupStoreStateTest, SetActive_GetActiveReturnsNameAndOperation) { + store_->setActiveBackup("alice", "bk1", BackupOperation::Creation); + auto active = store_->getActiveBackup("alice"); + ASSERT_TRUE(active.has_value()); + EXPECT_EQ(active->first, "bk1"); + EXPECT_EQ(active->second, "creation"); +} + +TEST_F(BackupStoreStateTest, SetActive_Restoration_OperationString) { + store_->setActiveBackup("alice", "bk1", BackupOperation::Restoration); + auto active = store_->getActiveBackup("alice"); + ASSERT_TRUE(active.has_value()); + EXPECT_EQ(active->second, "restoration"); +} + +TEST_F(BackupStoreStateTest, ClearActive_HasActiveIsFalse) { + store_->setActiveBackup("alice", "bk1", BackupOperation::Creation); + store_->clearActiveBackup("alice"); + EXPECT_FALSE(store_->hasActiveBackup("alice")); +} + +TEST_F(BackupStoreStateTest, ClearActive_GetActiveReturnsNullopt) { + store_->setActiveBackup("alice", "bk1", BackupOperation::Creation); + store_->clearActiveBackup("alice"); + EXPECT_FALSE(store_->getActiveBackup("alice").has_value()); +} + +TEST_F(BackupStoreStateTest, ClearNonExistent_NoOp) { + EXPECT_NO_THROW(store_->clearActiveBackup("nobody")); +} + +TEST_F(BackupStoreStateTest, TwoUsers_IndependentState) { + store_->setActiveBackup("alice", "bk1", BackupOperation::Creation); + EXPECT_TRUE(store_->hasActiveBackup("alice")); + EXPECT_FALSE(store_->hasActiveBackup("bob")); + + store_->setActiveBackup("bob", "bk2", BackupOperation::Restoration); + EXPECT_TRUE(store_->hasActiveBackup("bob")); + + store_->clearActiveBackup("alice"); + EXPECT_FALSE(store_->hasActiveBackup("alice")); + EXPECT_TRUE(store_->hasActiveBackup("bob")); +} + +// --- Backup JSON & listing --- + +TEST_F(BackupStoreStateTest, 
ReadBackupJson_MissingFile_ReturnsEmptyObject) { + auto json = store_->readBackupJson("alice"); + EXPECT_TRUE(json.empty()); +} + +TEST_F(BackupStoreStateTest, WriteAndReadBackupJson_RoundTrip) { + nlohmann::json data; + data["bk1"]["original_index"] = "my_idx"; + data["bk1"]["size_mb"] = 10; + + fs::create_directories(store_->getUserBackupDir("alice")); + store_->writeBackupJson("alice", data); + auto read = store_->readBackupJson("alice"); + + EXPECT_TRUE(read.contains("bk1")); + EXPECT_EQ(read["bk1"]["original_index"], "my_idx"); +} + +TEST_F(BackupStoreStateTest, ListBackups_EmptyWhenNoneExist) { + EXPECT_TRUE(store_->listBackups("alice").empty()); +} + +TEST_F(BackupStoreStateTest, ListBackups_ReturnsAllWrittenEntries) { + nlohmann::json data; + data["bk1"] = {{"original_index", "idx1"}}; + data["bk2"] = {{"original_index", "idx2"}}; + fs::create_directories(store_->getUserBackupDir("alice")); + store_->writeBackupJson("alice", data); + + auto list = store_->listBackups("alice"); + EXPECT_TRUE(list.contains("bk1")); + EXPECT_TRUE(list.contains("bk2")); +} + +TEST_F(BackupStoreStateTest, GetBackupInfo_ExistingEntry) { + nlohmann::json data; + data["bk1"] = {{"original_index", "idx1"}, {"size_mb", 5}}; + fs::create_directories(store_->getUserBackupDir("alice")); + store_->writeBackupJson("alice", data); + + auto info = store_->getBackupInfo("bk1", "alice"); + EXPECT_FALSE(info.is_null()); + EXPECT_EQ(info["original_index"], "idx1"); +} + +TEST_F(BackupStoreStateTest, GetBackupInfo_NonExistent_ReturnsNull) { + EXPECT_TRUE(store_->getBackupInfo("nonexistent", "alice").is_null()); +} + +// --- Backup deletion --- + +TEST_F(BackupStoreStateTest, DeleteBackup_NonExistent_ReturnsFalse) { + auto [ok, msg] = store_->deleteBackup("nonexistent", "alice"); + EXPECT_FALSE(ok); + EXPECT_EQ(msg, "Backup not found"); +} + +TEST_F(BackupStoreStateTest, DeleteBackup_InvalidName_ReturnsFalse) { + auto [ok, msg] = store_->deleteBackup("bad/name", "alice"); + EXPECT_FALSE(ok); +} + 
+TEST_F(BackupStoreStateTest, DeleteBackup_RemovesTarAndJsonEntry) { + std::string backup_dir = store_->getUserBackupDir("alice"); + fs::create_directories(backup_dir); + + std::string tar_path = backup_dir + "/bk1.tar"; + std::ofstream(tar_path) << "fake tar content"; + + nlohmann::json data; + data["bk1"] = {{"original_index", "idx1"}}; + store_->writeBackupJson("alice", data); + + auto [ok, msg] = store_->deleteBackup("bk1", "alice"); + EXPECT_TRUE(ok); + EXPECT_FALSE(fs::exists(tar_path)); + EXPECT_FALSE(store_->listBackups("alice").contains("bk1")); +} + +// ============================================================ +// Layer 2 — Archive (tar) operations +// ============================================================ + +class BackupArchiveTest : public ::testing::Test { +protected: + std::string dir_; + std::unique_ptr store_; + + void SetUp() override { + dir_ = "./test_backup_archive_" + std::to_string(rand()); + fs::create_directories(dir_); + store_ = std::make_unique(dir_); + } + + void TearDown() override { + store_.reset(); + if (fs::exists(dir_)) fs::remove_all(dir_); + } + + std::string makeSourceDir(const std::string& name) { + std::string src = dir_ + "/" + name; + fs::create_directories(src); + std::ofstream(src + "/file_a.bin") << "hello from file_a"; + std::ofstream(src + "/file_b.bin") << "hello from file_b"; + return src; + } +}; + +TEST_F(BackupArchiveTest, CreateBackupTar_ProducesNonEmptyFile) { + std::string src = makeSourceDir("myidx"); + std::string archive = dir_ + "/out.tar"; + std::string err; + bool ok = store_->createBackupTar(src, archive, err); + EXPECT_TRUE(ok) << "error: " << err; + EXPECT_TRUE(fs::exists(archive)); + EXPECT_GT(fs::file_size(archive), 0u); +} + +TEST_F(BackupArchiveTest, ExtractBackupTar_FilesRoundTrip) { + std::string src = makeSourceDir("myidx"); + std::string archive = dir_ + "/out.tar"; + std::string err; + ASSERT_TRUE(store_->createBackupTar(src, archive, err)) << err; + + std::string dest = dir_ + 
"/extracted"; + ASSERT_TRUE(store_->extractBackupTar(archive, dest, err)) << err; + + EXPECT_TRUE(fs::exists(dest + "/myidx/file_a.bin")); + EXPECT_TRUE(fs::exists(dest + "/myidx/file_b.bin")); +} + +TEST_F(BackupArchiveTest, ExtractBackupTar_ContentPreserved) { + std::string src = makeSourceDir("myidx"); + std::string archive = dir_ + "/out.tar"; + std::string err; + ASSERT_TRUE(store_->createBackupTar(src, archive, err)); + + std::string dest = dir_ + "/extracted"; + ASSERT_TRUE(store_->extractBackupTar(archive, dest, err)); + + std::ifstream f(dest + "/myidx/file_a.bin"); + std::string content((std::istreambuf_iterator(f)), {}); + EXPECT_EQ(content, "hello from file_a"); +} + +TEST_F(BackupArchiveTest, ExtractBackupTar_NonExistentArchive_Fails) { + std::string err; + bool ok = store_->extractBackupTar(dir_ + "/no.tar", dir_ + "/dest", err); + EXPECT_FALSE(ok); + EXPECT_FALSE(err.empty()); +} + +TEST_F(BackupArchiveTest, CreateBackupTar_PreCancelledStopToken_ReturnsFalse) { + std::string src = makeSourceDir("myidx"); + for (int i = 0; i < 10; ++i) + std::ofstream(src + "/extra_" + std::to_string(i) + ".bin") << std::string(512, 'x'); + + std::string archive = dir_ + "/out.tar"; + std::string err; + + std::stop_source ss; + ss.request_stop(); + + bool ok = store_->createBackupTar(src, archive, err, ss.get_token()); + EXPECT_FALSE(ok); + EXPECT_EQ(err, "Backup cancelled"); +} + +// ============================================================ +// Layer 3 — Integration tests via IndexManager +// ============================================================ + +class BackupIntegrationTest : public ::testing::Test { +protected: + static constexpr const char* USERNAME = "testuser"; + static constexpr const char* IDX_NAME = "testidx"; + static constexpr const char* INDEX_ID = "testuser/testidx"; + static constexpr const char* BACKUP_NAME = "mybk"; + static constexpr size_t DIM = 32; + static constexpr size_t N_VECTORS = 50; + + std::string data_dir_; + std::unique_ptr 
manager_; + + void SetUp() override { + data_dir_ = "./test_backup_integration_" + std::to_string(rand()); + fs::create_directories(data_dir_); + PersistenceConfig pcfg; + pcfg.save_on_shutdown = false; + manager_ = std::make_unique(data_dir_, pcfg); + } + + void TearDown() override { + manager_.reset(); + if (fs::exists(data_dir_)) fs::remove_all(data_dir_); + } + + void createTestIndex(const std::string& index_id = INDEX_ID) { + IndexConfig config{ + .dim = DIM, + .max_elements = 1000, + .space_type_str = "cosine", + .M = 8, + .ef_construction = 64, + .quant_level = ndd::quant::QuantizationLevel::FP32, + .checksum = 0 + }; + manager_->createIndex(index_id, config, UserType::Admin, 0); + } + + void insertVectors(size_t n = N_VECTORS, const std::string& index_id = INDEX_ID) { + std::vector vecs; + vecs.reserve(n); + for (size_t i = 0; i < n; ++i) { + ndd::HybridVectorObject v; + v.id = "vec_" + std::to_string(i); + v.vector.resize(DIM); + for (size_t d = 0; d < DIM; ++d) + v.vector[d] = static_cast(rand()) / RAND_MAX; + vecs.push_back(std::move(v)); + } + manager_->addVectors(index_id, vecs); + } + + // Waits until the named backup appears in listBackups (signals successful write). + // Used for create-backup completion because clearActiveBackup fires before the + // final rename + writeBackupJson, so polling getActiveBackup is not sufficient. + bool waitForBackupInList(const std::string& backup_name, int timeout_sec = 15) { + auto deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(timeout_sec); + while (std::chrono::steady_clock::now() < deadline) { + if (manager_->listBackups(USERNAME).contains(backup_name)) + return true; + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + return false; + } + + // Waits until no active backup remains for USERNAME. + // Reliable for restore because clearActiveBackup fires after loadIndex. 
+ bool waitForNoActiveBackup(int timeout_sec = 15) { + auto deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(timeout_sec); + while (std::chrono::steady_clock::now() < deadline) { + if (!manager_->getActiveBackup(USERNAME).has_value()) + return true; + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + return false; + } +}; + +TEST_F(BackupIntegrationTest, CreateBackupAsync_ReturnsTrueAndBackupName) { + createTestIndex(); + insertVectors(); + auto [ok, name] = manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + EXPECT_TRUE(ok); + EXPECT_EQ(name, BACKUP_NAME); + waitForBackupInList(BACKUP_NAME); +} + +TEST_F(BackupIntegrationTest, CreateBackup_SetsActiveBackupDuringRun) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + // setActiveBackup is synchronous — active backup must be visible immediately + auto active = manager_->getActiveBackup(USERNAME); + EXPECT_TRUE(active.has_value()); + EXPECT_EQ(active->first, BACKUP_NAME); + EXPECT_EQ(active->second, "creation"); + waitForBackupInList(BACKUP_NAME); +} + +TEST_F(BackupIntegrationTest, CreateBackup_ProducesTarFile) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + std::string tar = data_dir_ + "/backups/" + USERNAME + "/" + BACKUP_NAME + ".tar"; + EXPECT_TRUE(fs::exists(tar)); + EXPECT_GT(fs::file_size(tar), 0u); +} + +TEST_F(BackupIntegrationTest, CreateBackup_AppearsInListBackups) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + EXPECT_TRUE(manager_->listBackups(USERNAME).contains(BACKUP_NAME)); +} + +TEST_F(BackupIntegrationTest, CreateBackup_MetadataHasExpectedFields) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + auto info = 
manager_->getBackupInfo(BACKUP_NAME, USERNAME); + EXPECT_FALSE(info.is_null()); + EXPECT_EQ(info["original_index"], IDX_NAME); + ASSERT_TRUE(info.contains("params")); + EXPECT_EQ(info["params"]["dim"].get<size_t>(), DIM); + EXPECT_TRUE(info.contains("timestamp")); +} + +TEST_F(BackupIntegrationTest, CreateBackup_WhileInProgress_ReturnsFalse) { + createTestIndex(); + insertVectors(); + auto [ok1, _1] = manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(ok1); + + // setActiveBackup is synchronous; second call must be rejected + auto [ok2, msg] = manager_->createBackupAsync(INDEX_ID, "another_bk"); + EXPECT_FALSE(ok2); + EXPECT_NE(msg.find("in progress"), std::string::npos); + waitForBackupInList(BACKUP_NAME); +} + +TEST_F(BackupIntegrationTest, CreateBackup_DuplicateName_ReturnsFalse) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + auto [ok, msg] = manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + EXPECT_FALSE(ok); +} + +TEST_F(BackupIntegrationTest, CreateBackup_InvalidName_ReturnsFalse) { + createTestIndex(); + auto [ok, msg] = manager_->createBackupAsync(INDEX_ID, "bad/name"); + EXPECT_FALSE(ok); +} + +TEST_F(BackupIntegrationTest, DeleteBackup_RemovesTarAndJsonEntry) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + auto [ok, msg] = manager_->deleteBackup(BACKUP_NAME, USERNAME); + EXPECT_TRUE(ok); + + std::string tar = data_dir_ + "/backups/" + USERNAME + "/" + BACKUP_NAME + ".tar"; + EXPECT_FALSE(fs::exists(tar)); + EXPECT_FALSE(manager_->listBackups(USERNAME).contains(BACKUP_NAME)); +} + +TEST_F(BackupIntegrationTest, DeleteBackup_NonExistent_ReturnsFalse) { + auto [ok, msg] = manager_->deleteBackup("no_such_backup", USERNAME); + EXPECT_FALSE(ok); +} + +TEST_F(BackupIntegrationTest, RestoreBackupAsync_ReturnsTrueAndTargetName) { + createTestIndex(); + 
insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + auto [ok, name] = manager_->restoreBackupAsync(BACKUP_NAME, "restored_idx", USERNAME); + EXPECT_TRUE(ok); + EXPECT_EQ(name, "restored_idx"); + waitForNoActiveBackup(); +} + +TEST_F(BackupIntegrationTest, RestoreBackup_CreatesIndexWithCorrectMetadata) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + manager_->restoreBackupAsync(BACKUP_NAME, "restored_idx", USERNAME); + ASSERT_TRUE(waitForNoActiveBackup()); + + auto meta = manager_->getMetadata(USERNAME + std::string("/restored_idx")); + ASSERT_TRUE(meta.has_value()); + EXPECT_EQ(meta->name, "restored_idx"); + EXPECT_EQ(meta->dimension, DIM); + EXPECT_EQ(meta->M, 8u); +} + +TEST_F(BackupIntegrationTest, RestoreBackup_PreservesVectorCount) { + createTestIndex(); + insertVectors(N_VECTORS); + size_t original_count = manager_->getElementCount(INDEX_ID); + + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + manager_->restoreBackupAsync(BACKUP_NAME, "restored_idx", USERNAME); + ASSERT_TRUE(waitForNoActiveBackup()); + + std::string restored_id = USERNAME + std::string("/restored_idx"); + EXPECT_EQ(manager_->getElementCount(restored_id), original_count); +} + +TEST_F(BackupIntegrationTest, RestoreBackup_NonExistentBackup_ReturnsFalse) { + auto [ok, msg] = manager_->restoreBackupAsync("no_such_backup", "some_idx", USERNAME); + EXPECT_FALSE(ok); + EXPECT_NE(msg.find("not found"), std::string::npos); +} + +TEST_F(BackupIntegrationTest, RestoreBackup_TargetIndexAlreadyExists_ReturnsFalse) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + ASSERT_TRUE(waitForBackupInList(BACKUP_NAME)); + + // IDX_NAME index already exists + auto [ok, msg] = manager_->restoreBackupAsync(BACKUP_NAME, IDX_NAME, USERNAME); + 
EXPECT_FALSE(ok); + EXPECT_NE(msg.find("already exists"), std::string::npos); +} + +TEST_F(BackupIntegrationTest, RestoreBackup_WhileCreateInProgress_ReturnsFalse) { + createTestIndex(); + insertVectors(); + manager_->createBackupAsync(INDEX_ID, BACKUP_NAME); + + // setActiveBackup is synchronous — restore must be rejected immediately + auto [ok, msg] = manager_->restoreBackupAsync(BACKUP_NAME, "restored_idx", USERNAME); + EXPECT_FALSE(ok); + EXPECT_NE(msg.find("in progress"), std::string::npos); + waitForNoActiveBackup(); +} From d028cbcdb9a1ac776bc38d6288dcdeb8ba82e36c Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Tue, 28 Apr 2026 17:42:04 +0530 Subject: [PATCH 28/29] update readme.md --- tests/README.md | 64 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/tests/README.md b/tests/README.md index a5a04a2c0a..a37db0d7a8 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,6 +1,6 @@ # Tests -Unit tests for Endee. Currently two test suites: filter and rebuild. +Unit tests for Endee. Currently three test suites: filter, rebuild, and backup. ## Build & Run All Tests @@ -76,6 +76,68 @@ End-to-end rebuild via IndexManager: - RebuildIntegrationTest/RebuildNonExistentIndex_Returns404Code - RebuildIntegrationTest/RebuildNoChange_Returns400Code +## ndd_backup_test + +Unit and integration tests for the backup subsystem (`BackupStore` + `IndexManager` backup methods). 
+ +Build and run individually: + + cmake --build build --target ndd_backup_test + ./build/tests/ndd_backup_test + +Test cases: + +BackupStore state management (no IndexManager): +- BackupStoreStateTest/ValidateName_AlphanumericUnderscore_Passes +- BackupStoreStateTest/ValidateName_WithHyphen_Passes +- BackupStoreStateTest/ValidateName_Empty_Fails +- BackupStoreStateTest/ValidateName_TooLong_Fails +- BackupStoreStateTest/ValidateName_Slash_Fails +- BackupStoreStateTest/ValidateName_Space_Fails +- BackupStoreStateTest/ValidateName_Dot_Fails +- BackupStoreStateTest/NoActive_HasActiveIsFalse +- BackupStoreStateTest/SetActive_HasActiveIsTrue +- BackupStoreStateTest/SetActive_GetActiveReturnsNameAndOperation +- BackupStoreStateTest/SetActive_Restoration_OperationString +- BackupStoreStateTest/ClearActive_HasActiveIsFalse +- BackupStoreStateTest/ClearActive_GetActiveReturnsNullopt +- BackupStoreStateTest/ClearNonExistent_NoOp +- BackupStoreStateTest/TwoUsers_IndependentState +- BackupStoreStateTest/ReadBackupJson_MissingFile_ReturnsEmptyObject +- BackupStoreStateTest/WriteAndReadBackupJson_RoundTrip +- BackupStoreStateTest/ListBackups_EmptyWhenNoneExist +- BackupStoreStateTest/ListBackups_ReturnsAllWrittenEntries +- BackupStoreStateTest/GetBackupInfo_ExistingEntry +- BackupStoreStateTest/GetBackupInfo_NonExistent_ReturnsNull +- BackupStoreStateTest/DeleteBackup_NonExistent_ReturnsFalse +- BackupStoreStateTest/DeleteBackup_InvalidName_ReturnsFalse +- BackupStoreStateTest/DeleteBackup_RemovesTarAndJsonEntry + +Archive (tar) operations: +- BackupArchiveTest/CreateBackupTar_ProducesNonEmptyFile +- BackupArchiveTest/ExtractBackupTar_FilesRoundTrip +- BackupArchiveTest/ExtractBackupTar_ContentPreserved +- BackupArchiveTest/ExtractBackupTar_NonExistentArchive_Fails +- BackupArchiveTest/CreateBackupTar_PreCancelledStopToken_ReturnsFalse + +End-to-end backup and restore via IndexManager: +- BackupIntegrationTest/CreateBackupAsync_ReturnsTrueAndBackupName +- 
BackupIntegrationTest/CreateBackup_SetsActiveBackupDuringRun +- BackupIntegrationTest/CreateBackup_ProducesTarFile +- BackupIntegrationTest/CreateBackup_AppearsInListBackups +- BackupIntegrationTest/CreateBackup_MetadataHasExpectedFields +- BackupIntegrationTest/CreateBackup_WhileInProgress_ReturnsFalse +- BackupIntegrationTest/CreateBackup_DuplicateName_ReturnsFalse +- BackupIntegrationTest/CreateBackup_InvalidName_ReturnsFalse +- BackupIntegrationTest/DeleteBackup_RemovesTarAndJsonEntry +- BackupIntegrationTest/DeleteBackup_NonExistent_ReturnsFalse +- BackupIntegrationTest/RestoreBackupAsync_ReturnsTrueAndTargetName +- BackupIntegrationTest/RestoreBackup_CreatesIndexWithCorrectMetadata +- BackupIntegrationTest/RestoreBackup_PreservesVectorCount +- BackupIntegrationTest/RestoreBackup_NonExistentBackup_ReturnsFalse +- BackupIntegrationTest/RestoreBackup_TargetIndexAlreadyExists_ReturnsFalse +- BackupIntegrationTest/RestoreBackup_WhileCreateInProgress_ReturnsFalse + ## Notes - Tests use real file I/O and real MDBX databases — no mocking. From 5fb98f076729778b54aecc5d79ba8554f56f1296 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Tue, 28 Apr 2026 18:09:48 +0530 Subject: [PATCH 29/29] add archive_static --- tests/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 268f3eab1e..c34cf9a56a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -66,6 +66,7 @@ target_include_directories(ndd_rebuild_test PRIVATE target_link_libraries(ndd_rebuild_test PRIVATE ndd_core + archive_static GTest::gtest_main ) @@ -99,6 +100,7 @@ target_include_directories(ndd_backup_test PRIVATE target_link_libraries(ndd_backup_test PRIVATE ndd_core + archive_static GTest::gtest_main )