diff --git a/Cargo.lock b/Cargo.lock index ce040e1..23399f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -631,6 +631,16 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" +[[package]] +name = "memmap" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "mime" version = "0.3.16" @@ -1052,6 +1062,7 @@ dependencies = [ "getopts", "glob", "md-5", + "memmap", "prettytable-rs", "regex", "reqwest", diff --git a/Cargo.toml b/Cargo.toml index 78bbfe5..f1f0ba7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,3 +24,4 @@ sha-1 = "0.9.6" sha2 = "0.9.5" walkdir = "2" glob = "0.3.0" +memmap = "0.7.0" diff --git a/src/db/functions.rs b/src/db/functions.rs index 8a326b7..4b8fdd7 100644 --- a/src/db/functions.rs +++ b/src/db/functions.rs @@ -121,16 +121,23 @@ pub fn get_categories(c: &PgConnection) -> Vec { /// /// ctime is the most important parameter to detect if /// something has changed on the primary mirror. -pub fn get_directories(c: &PgConnection) -> Vec { +pub fn get_directories(c: &PgConnection, cat_id: i32) -> Vec { + use crate::db::schema::category_directory; use crate::db::schema::directory; - let query = directory::dsl::directory.select(( - directory::dsl::id, - directory::dsl::name, - directory::dsl::files, - directory::dsl::readable, - directory::dsl::ctime, - )); + let subselect = category_directory::dsl::category_directory + .select(category_directory::dsl::directory_id) + .filter(category_directory::dsl::category_id.eq(cat_id)); + + let query = directory::dsl::directory + .select(( + directory::dsl::id, + directory::dsl::name, + directory::dsl::files, + directory::dsl::readable, + directory::dsl::ctime, + )) + .filter(directory::dsl::id.eq_any(subselect)); let debug = diesel::debug_query::(&query); print_step(debug.to_string()); query @@ -139,7 +146,7 @@ pub fn get_directories(c: &PgConnection) -> Vec { } /// This retrieves the list of which directory belongs to given category. -pub fn get_category_directories(c: &PgConnection, cat_id: i32) -> Vec { +pub fn _get_category_directories(c: &PgConnection, cat_id: i32) -> Vec { use crate::db::schema::category_directory; let query = category_directory::dsl::category_directory diff --git a/src/db/schema.rs b/src/db/schema.rs index 956f2d2..6e2bfd2 100644 --- a/src/db/schema.rs +++ b/src/db/schema.rs @@ -71,6 +71,14 @@ table! { } } +table! { + host_category_dir (id) { + id -> Integer, + directory_id -> Integer, + } +} + joinable!(category -> directory (topdir_id)); allow_tables_to_appear_in_same_query!(category, directory); +allow_tables_to_appear_in_same_query!(category_directory, directory); diff --git a/src/main.rs b/src/main.rs index 30406bb..cf3c0d4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -382,37 +382,16 @@ fn check_for_repo(repos: &[db::models::Repository], prefix: String, arch_id: i32 /// Remove non-existing directories from the database /// /// If a directory has been deleted on the file system it will still exist in the database. This -/// function will get the list of directories from the table `category_directory` for the current -/// directory (as specified in `cat_id`) and compare it with the file system. Every directory that -/// does not exist on the file system will be removed from `category_directory`, `directory` and -/// `file_detail`. +/// function goes through the list of database directories and if it does not exist on the file +/// system it will be removed from `category_directory`, `host_category_dir`, `directory`, +/// `repository' and `file_detail`. fn cleanup_database( c: &PgConnection, cds: &HashMap, dirs: &[db::models::Directory], topdir: String, - cat_id: i32, ) -> Result { - debug::STEPS.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - let cgs = db::functions::get_category_directories(c, cat_id); - for d in dirs { - let mut wrong_category = true; - for cg in &cgs { - // If the directory is not part of the table category_directory - // we will not delete it from the database. Another possible - // to cleanup the database would be to compare by name. - // - // cgs only contains directories for the current category id - if cg.directory_id == d.id { - wrong_category = false; - } - } - - if wrong_category { - continue; - } - let mut dir_gone_from_fs = true; for k in cds.keys() { @@ -428,7 +407,7 @@ fn cleanup_database( } if dir_gone_from_fs { - debug::STEPS.fetch_add(3, std::sync::atomic::Ordering::SeqCst); + debug::STEPS.fetch_add(5, std::sync::atomic::Ordering::SeqCst); // Delete from CategoryDirectory (Is it possible to delete multiple entries at once???) // Something like 'DELETE FROM category_directory where directory_id = 10 or directory_id = 20'. let delete_cd = diesel::delete( @@ -439,14 +418,23 @@ fn cleanup_database( debug::print_step(debug_cd.to_string()); delete_cd.execute(c)?; - // Delete from Directory - let delete_dir = diesel::delete( - db::schema::directory::dsl::directory - .filter(db::schema::directory::dsl::id.eq(d.id)), + // Delete from HostCategoryDir + let delete_host_category_dir = diesel::delete( + db::schema::host_category_dir::dsl::host_category_dir + .filter(db::schema::host_category_dir::dsl::directory_id.eq(d.id)), ); - let debug_dir = diesel::debug_query::(&delete_dir); - debug::print_step(debug_dir.to_string()); - delete_dir.execute(c)?; + let debug_host_category_dir = diesel::debug_query::(&delete_host_category_dir); + debug::print_step(debug_host_category_dir.to_string()); + delete_host_category_dir.execute(c)?; + + // Delete from Repository + let delete_repository = diesel::delete( + db::schema::repository::dsl::repository + .filter(db::schema::repository::dsl::directory_id.eq(d.id)), + ); + let debug_repository = diesel::debug_query::(&delete_repository); + debug::print_step(debug_repository.to_string()); + delete_repository.execute(c)?; // And remove if from FileDetail let delete_fd = diesel::delete( @@ -456,6 +444,15 @@ fn cleanup_database( let debug_fd = diesel::debug_query::(&delete_fd); debug::print_step(debug_fd.to_string()); delete_fd.execute(c)?; + + // Delete from Directory + let delete_dir = diesel::delete( + db::schema::directory::dsl::directory + .filter(db::schema::directory::dsl::id.eq(d.id)), + ); + let debug_dir = diesel::debug_query::(&delete_dir); + debug::print_step(debug_dir.to_string()); + delete_dir.execute(c)?; } } Ok(0) @@ -1153,26 +1150,26 @@ fn scan_local_directory( }; if !fullfiletimelist.is_empty() && !skip_fftl { - use std::fs; - debug::print_step(format!( "Local directory ({}) scan using {}", url, fullfiletimelist )); - let data = fs::read_to_string(fullfiletimelist)?; - let pattern = Regex::new(r"([0-9]*)\t(.*)\t([0-9]*)\t(.*)")?; - data.lines() - .filter_map(|line| pattern.captures(line)) - .map(|info| FileInfo { - is_directory: info[2].starts_with('d'), - is_readable: !info[2].contains('-'), - size: info[3].parse().unwrap(), - timestamp: info[1].parse().unwrap(), - name: Some(info[4].to_string()), - }) - .for_each(|x| { - add_entry_to_category_directories(x, cds, excludes, topdir); - }); + let file = std::fs::File::open(fullfiletimelist)?; + let data = unsafe { memmap::MmapOptions::new().map(&file)? }; + for line in data.split(|elem| elem == &b'\n') { + let v: Vec<&str> = std::str::from_utf8(line)?.split('\t').collect(); + if v.len() < 4 { + continue; + } + let info = FileInfo { + is_directory: v[1].starts_with('d'), + is_readable: !v[1].contains('-'), + size: v[2].parse()?, + timestamp: v[0].parse()?, + name: Some(v[3].to_string()), + }; + add_entry_to_category_directories(info, cds, excludes, topdir); + } return Ok(()); } @@ -1481,7 +1478,7 @@ fn main() { handle_unreadable(&mut cds); - let mut d = db::functions::get_directories(&connection); + let mut d = db::functions::get_directories(&connection, category.id); if let Err(e) = sync_category_directories(&connection, topdir.clone(), category.id, &mut d, &mut cds) @@ -1573,7 +1570,7 @@ fn main() { } if params.delete_directories { - if let Err(e) = cleanup_database(&connection, &cds, &d, topdir, category.id) { + if let Err(e) = cleanup_database(&connection, &cds, &d, topdir) { println!("Database cleanup failed {}", e); process::exit(1); } diff --git a/src/scan_primary_mirror_test.rs b/src/scan_primary_mirror_test.rs index a52db5d..5e89ef9 100644 --- a/src/scan_primary_mirror_test.rs +++ b/src/scan_primary_mirror_test.rs @@ -317,7 +317,7 @@ fn sync_category_directories_test() { .is_err()); // this is empty - let mut dirs = db::functions::get_directories(&c); + let mut dirs = db::functions::get_directories(&c, 37); let mut cds: HashMap = HashMap::new(); let mut cd1 = CategoryDirectory { @@ -339,7 +339,7 @@ fn sync_category_directories_test() { assert_eq!(dirs[0].readable, true); assert_eq!(dirs[0].name, "topdir/directory1".to_string()); - dirs = db::functions::get_directories(&c); + dirs = db::functions::get_directories(&c, 37); // test after reading from database assert_eq!(dirs.len(), 1); assert_eq!(dirs[0].ctime, 1000); @@ -352,7 +352,7 @@ fn sync_category_directories_test() { assert!( !sync_category_directories(&c, "topdir/".to_string(), 37, &mut dirs, &mut cds).is_err() ); - dirs = db::functions::get_directories(&c); + dirs = db::functions::get_directories(&c, 37); assert_eq!(dirs.len(), 1); // this should have been updated assert_eq!(dirs[0].ctime, 2000);