Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deduplicate OnDisk Corpus #2827

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ uuid = { version = "1.10.0", features = ["serde", "v4"] }
which = "6.0.3"
windows = "0.59.0"
z3 = "0.12.1"

fs2 = "0.4.3"
tokatoka marked this conversation as resolved.
Show resolved Hide resolved

[workspace.lints.rust]
# Forbid
Expand Down
2 changes: 2 additions & 0 deletions libafl/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ default = [
"regex",
"serdeany_autoreg",
"libafl_bolts/xxh3",
"fs2"
]
document-features = ["dep:document-features"]

Expand Down Expand Up @@ -296,6 +297,7 @@ clap = { workspace = true, optional = true }
num_enum = { workspace = true, optional = true }
libipt = { workspace = true, optional = true }
fastbloom = { workspace = true, optional = true }
fs2 = { workspace = true, optional = true }

[lints]
workspace = true
Expand Down
77 changes: 48 additions & 29 deletions libafl/src/corpus/inmemory_ondisk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
//! For a lower memory footprint, consider using [`crate::corpus::CachedOnDiskCorpus`]
//! which only stores a certain number of [`Testcase`]s and removes additional ones in a FIFO manner.

use alloc::string::String;
use alloc::string::{String, ToString};
use core::cell::RefCell;
use std::{
fs,
Expand All @@ -14,6 +14,7 @@ use std::{
path::{Path, PathBuf},
};

use fs2::FileExt;
#[cfg(feature = "gzip")]
use libafl_bolts::compress::GzipCompressor;
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -87,7 +88,7 @@ where
fn add(&mut self, testcase: Testcase<I>) -> Result<CorpusId, Error> {
let id = self.inner.add(testcase)?;
let testcase = &mut self.get(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase)?;
*testcase.input_mut() = None;
Ok(id)
}
Expand All @@ -97,7 +98,7 @@ where
fn add_disabled(&mut self, testcase: Testcase<I>) -> Result<CorpusId, Error> {
let id = self.inner.add_disabled(testcase)?;
let testcase = &mut self.get_from_all(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase)?;
*testcase.input_mut() = None;
Ok(id)
}
Expand All @@ -108,7 +109,7 @@ where
let entry = self.inner.replace(id, testcase)?;
self.remove_testcase(&entry)?;
let testcase = &mut self.get(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase)?;
*testcase.input_mut() = None;
Ok(entry)
}
Expand Down Expand Up @@ -375,42 +376,49 @@ impl<I> InMemoryOnDiskCorpus<I> {
}
}

fn save_testcase(&self, testcase: &mut Testcase<I>, id: CorpusId) -> Result<(), Error>
fn save_testcase(&self, testcase: &mut Testcase<I>) -> Result<(), Error>
where
I: Input,
{
let file_name_orig = testcase.filename_mut().take().unwrap_or_else(|| {
let file_name = testcase.filename_mut().take().unwrap_or_else(|| {
// TODO walk entry metadata to ask for pieces of filename (e.g. :havoc in AFL)
testcase.input().as_ref().unwrap().generate_name(Some(id))
testcase.input().as_ref().unwrap().generate_name()
});

// New testcase, we need to save it.
let mut file_name = file_name_orig.clone();
let mut ctr = String::new();
if self.locking {
let lockfile_name = format!(".{file_name}");
let lockfile_path = self.dir_path.join(lockfile_name);

let mut ctr = 2;
let file_name = if self.locking {
loop {
let lockfile_name = format!(".{file_name}.lafl_lock");
let lockfile_path = self.dir_path.join(lockfile_name);
let lockfile = try_create_new(&lockfile_path)?
.unwrap_or(OpenOptions::new().write(true).open(&lockfile_path)?);
lockfile.lock_exclusive()?;

if try_create_new(lockfile_path)?.is_some() {
break file_name;
}

file_name = format!("{file_name_orig}-{ctr}");
ctr += 1;
ctr = fs::read_to_string(&lockfile_path)?.trim().to_string();
if ctr.is_empty() {
ctr = String::from("1");
} else {
ctr = (ctr.parse::<u32>()? + 1).to_string();
}
} else {
file_name
};

fs::write(lockfile_path, &ctr)?;
}

if testcase.file_path().is_none() {
*testcase.file_path_mut() = Some(self.dir_path.join(&file_name));
}
*testcase.filename_mut() = Some(file_name);

if self.meta_format.is_some() {
let metafile_name = format!(".{}.metadata", testcase.filename().as_ref().unwrap());
let metafile_name = if self.locking {
format!(
".{}_{}.metadata",
testcase.filename().as_ref().unwrap(),
ctr
)
} else {
format!(".{}.metadata", testcase.filename().as_ref().unwrap())
};
let metafile_path = self.dir_path.join(&metafile_name);
let mut tmpfile_path = metafile_path.clone();
tmpfile_path.set_file_name(format!(".{metafile_name}.tmp",));
Expand Down Expand Up @@ -453,15 +461,26 @@ impl<I> InMemoryOnDiskCorpus<I> {

fn remove_testcase(&self, testcase: &Testcase<I>) -> Result<(), Error> {
if let Some(filename) = testcase.filename() {
if self.locking {
let lockfile_path = self.dir_path.join(format!(".{filename}"));
let lockfile = OpenOptions::new().write(true).open(&lockfile_path)?;

lockfile.lock_exclusive()?;
let ctr = fs::read_to_string(&lockfile_path)?;

if ctr == "1" {
lockfile.unlock()?;
drop(fs::remove_file(lockfile_path));
} else {
fs::write(lockfile_path, (ctr.parse::<u32>()? - 1).to_string())?;
return Ok(());
}
}

fs::remove_file(self.dir_path.join(filename))?;
if self.meta_format.is_some() {
fs::remove_file(self.dir_path.join(format!(".{filename}.metadata")))?;
}
// also try to remove the corresponding `.lafl_lock` file if it still exists
// (even though it shouldn't exist anymore, at this point in time)
drop(fs::remove_file(
self.dir_path.join(format!(".{filename}.lafl_lock")),
));
}
Ok(())
}
Expand Down
2 changes: 1 addition & 1 deletion libafl/src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ where
fn name_detailed(&self) -> Cow<'static, str> {
match self {
Event::NewTestcase { input, .. } => {
Cow::Owned(format!("Testcase {}", input.generate_name(None)))
Cow::Owned(format!("Testcase {}", input.generate_name()))
}
Event::UpdateExecStats { .. } => Cow::Borrowed("Client Heartbeat"),
Event::UpdateUserStats { .. } => Cow::Borrowed("UserStats"),
Expand Down
2 changes: 1 addition & 1 deletion libafl/src/executors/hooks/unix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ pub mod unix_signal_handler {
let mut bsod = Vec::new();
{
let mut writer = std::io::BufWriter::new(&mut bsod);
let _ = writeln!(writer, "input: {:?}", input.generate_name(None));
let _ = writeln!(writer, "input: {:?}", input.generate_name());
let bsod = libafl_bolts::minibsod::generate_minibsod(
&mut writer,
signal,
Expand Down
4 changes: 2 additions & 2 deletions libafl/src/inputs/encoded.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use libafl_bolts::{Error, HasLen};
use regex::Regex;
use serde::{Deserialize, Serialize};

use crate::{corpus::CorpusId, inputs::Input};
use crate::inputs::Input;

/// Trait to encode bytes to an [`EncodedInput`] using the given [`Tokenizer`]
pub trait InputEncoder<T>
Expand Down Expand Up @@ -202,7 +202,7 @@ pub struct EncodedInput {
impl Input for EncodedInput {
/// Generate a name for this input
#[must_use]
fn generate_name(&self, _id: Option<CorpusId>) -> String {
fn generate_name(&self) -> String {
let mut hasher = RandomState::with_seeds(0, 0, 0, 0).build_hasher();
for code in &self.codes {
hasher.write(&code.to_le_bytes());
Expand Down
4 changes: 2 additions & 2 deletions libafl/src/inputs/gramatron.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use ahash::RandomState;
use libafl_bolts::{Error, HasLen};
use serde::{Deserialize, Serialize};

use crate::{corpus::CorpusId, inputs::Input};
use crate::inputs::Input;

/// A terminal for gramatron grammar fuzzing
#[derive(Serialize, Deserialize, Clone, Debug, Default, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -44,7 +44,7 @@ pub struct GramatronInput {
impl Input for GramatronInput {
/// Generate a name for this input
#[must_use]
fn generate_name(&self, _id: Option<CorpusId>) -> String {
fn generate_name(&self) -> String {
let mut hasher = RandomState::with_seeds(0, 0, 0, 0).build_hasher();
for term in &self.terms {
hasher.write(term.symbol.as_bytes());
Expand Down
27 changes: 13 additions & 14 deletions libafl/src/inputs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,23 @@ pub mod nautilus;

use alloc::{
boxed::Box,
string::{String, ToString},
string::String,
vec::{Drain, Splice, Vec},
};
use core::{
clone::Clone,
fmt::Debug,
hash::Hash,
marker::PhantomData,
ops::{DerefMut, RangeBounds},
};
#[cfg(feature = "std")]
use std::{fs::File, hash::Hash, io::Read, path::Path};
use std::{fs::File, io::Read, path::Path};

#[cfg(feature = "std")]
use libafl_bolts::fs::write_file_atomic;
use libafl_bolts::{
generic_hash_std,
ownedref::{OwnedMutSlice, OwnedSlice},
subrange::{SubRangeMutSlice, SubRangeSlice},
Error, HasLen,
Expand All @@ -51,11 +53,9 @@ use libafl_bolts::{
pub use nautilus::*;
use serde::{Deserialize, Serialize};

use crate::corpus::CorpusId;

/// An input for the target
#[cfg(not(feature = "std"))]
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug + Hash {
BAGUVIX456 marked this conversation as resolved.
Show resolved Hide resolved
/// Write this input to the file
fn to_file<P>(&self, _path: P) -> Result<(), Error> {
Err(Error::not_implemented("Not supported in no_std"))
Expand All @@ -67,12 +67,14 @@ pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
}

/// Generate a name for this input
fn generate_name(&self, id: Option<CorpusId>) -> String;
fn generate_name(&self) -> String {
format!("{:016x}", generic_hash_std(self))
}
}

/// An input for the target
#[cfg(feature = "std")]
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug + Hash {
/// Write this input to the file
fn to_file<P>(&self, path: P) -> Result<(), Error>
where
Expand All @@ -93,7 +95,9 @@ pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
}

/// Generate a name for this input. The user is responsible for making each testcase name unique.
fn generate_name(&self, id: Option<CorpusId>) -> String;
fn generate_name(&self) -> String {
format!("{:016x}", generic_hash_std(self))
}
}

/// Convert between two input types with a state
Expand Down Expand Up @@ -127,12 +131,7 @@ impl NopInput {
}
}

impl Input for NopInput {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
"nop-input".to_string()
}
}

impl Input for NopInput {}
impl HasTargetBytes for NopInput {
fn target_bytes(&self) -> OwnedSlice<u8> {
OwnedSlice::from(vec![0])
Expand Down
16 changes: 4 additions & 12 deletions libafl/src/inputs/value.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
//! Newtype pattern style wrapper for [`super::Input`]s

use alloc::{string::String, vec::Vec};
use alloc::vec::Vec;
use core::{fmt::Debug, hash::Hash};

use libafl_bolts::{generic_hash_std, rands::Rand};
use libafl_bolts::rands::Rand;
use serde::{Deserialize, Serialize};
#[cfg(feature = "std")]
use {
Expand All @@ -12,7 +12,7 @@ use {
};

use super::Input;
use crate::{corpus::CorpusId, mutators::numeric::Numeric};
use crate::mutators::numeric::Numeric;

/// Newtype pattern wrapper around an underlying structure to implement inputs
///
Expand Down Expand Up @@ -56,11 +56,7 @@ impl<I: Copy> Copy for ValueInput<I> {}
macro_rules! impl_input_for_value_input {
($($t:ty => $name:ident),+ $(,)?) => {
$(
impl Input for ValueInput<$t> {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}
}
impl Input for ValueInput<$t> {}

/// Input wrapping a <$t>
pub type $name = ValueInput<$t>;
Expand All @@ -86,10 +82,6 @@ impl_input_for_value_input!(

/// manually implemented because files can be written more efficiently
impl Input for ValueInput<Vec<u8>> {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}

/// Write this input to the file
#[cfg(feature = "std")]
fn to_file<P>(&self, path: P) -> Result<(), Error>
Expand Down
5 changes: 1 addition & 4 deletions libafl/src/stages/dump.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,7 @@ where
[
Some(id.0.to_string()),
testcase.filename().clone(),
testcase
.input()
.as_ref()
.map(|t| t.generate_name(Some(*id))),
testcase.input().as_ref().map(|t| t.generate_name()),
]
.iter()
.flatten()
Expand Down
Loading