Deduplicate OnDisk Corpus (#2827)

* testcase name logic

* implement locking

* implement logic for removing testcase

* minor modifications

* minor modifications to remove_testcase()

* extract generate_name() from trait Input (broken)

* Revert "extract generate_name() from trait Input (broken)"

This reverts commit 9e217be2810a8c33970846cdd380f542975cc05b.

* fix ci errors

* remove CorpusId from generate_name() calls

* toml formatting

* write from file instead of fs

* fmt and clippy

* fix windows clippy

* handle renaming of testcase

* fix failing cmplog test

* overwrite lockfile on remove testcase

* format

* bring back corpus id in generate_name

* missed windows executors hook

* fix failing tests

* some more errors

---------

Co-authored-by: Dongjia "toka" Zhang <tokazerkje@outlook.com>
This commit is contained in:
Dhanvith Nayak 2025-02-04 12:11:00 +05:30 committed by GitHub
parent 5bd6a6f754
commit 72986fc129
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 111 additions and 127 deletions

View File

@ -124,7 +124,7 @@ uuid = { version = "1.10.0", features = ["serde", "v4"] }
which = "6.0.3"
windows = "0.59.0"
z3 = "0.12.1"
fs2 = "0.4.3" # Used by OnDisk Corpus for file locking
[workspace.lints.rust]
# Deny

View File

@ -2,10 +2,7 @@
use std::hash::Hash;
use lain::prelude::*;
use libafl::{
corpus::CorpusId,
inputs::{HasTargetBytes, Input},
};
use libafl::inputs::{HasTargetBytes, Input};
use libafl_bolts::{ownedref::OwnedSlice, HasLen};
use serde::{Deserialize, Serialize};
@ -48,15 +45,7 @@ pub enum PacketType {
Reset = 0x2,
}
impl Input for PacketData {
fn generate_name(&self, id: Option<CorpusId>) -> String {
if let Some(id) = id {
format!("id_{}", id.0)
} else {
"id_unknown".into()
}
}
}
impl Input for PacketData {}
impl HasTargetBytes for PacketData {
#[inline]

View File

@ -2,14 +2,13 @@ use core::num::NonZeroUsize;
use std::{borrow::Cow, hash::Hash};
use libafl::{
corpus::CorpusId,
generators::{Generator, RandBytesGenerator},
inputs::{BytesInput, HasTargetBytes, Input},
mutators::{MutationResult, Mutator},
state::HasRand,
Error, SerdeAny,
};
use libafl_bolts::{generic_hash_std, rands::Rand, Named};
use libafl_bolts::{rands::Rand, Named};
use serde::{Deserialize, Serialize};
/// The custom [`Input`] type used in this example, consisting of a byte array part, a byte array that is not always present, and a boolean
@ -28,11 +27,7 @@ pub struct CustomInput {
}
/// Hash-based implementation
impl Input for CustomInput {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}
}
impl Input for CustomInput {}
impl CustomInput {
/// Returns a mutable reference to the byte array

View File

@ -60,6 +60,7 @@ std = [
"libafl_bolts/std",
"typed-builder",
"fastbloom",
"fs2",
]
## Tracks the Feedbacks and the Objectives that were interesting for a Testcase
@ -290,6 +291,8 @@ const_panic = { version = "0.2.9", default-features = false } # similarly, for f
pyo3 = { workspace = true, optional = true }
regex-syntax = { version = "0.8.4", optional = true } # For nautilus
fs2 = { workspace = true, optional = true } # used by OnDisk Corpus for file locking
# optional-dev deps (change when target.'cfg(accessible(::std))'.test-dependencies will be stable)
serial_test = { workspace = true, optional = true, default-features = false, features = [
"logging",

View File

@ -4,16 +4,17 @@
//! For a lower memory footprint, consider using [`crate::corpus::CachedOnDiskCorpus`]
//! which only stores a certain number of [`Testcase`]s and removes additional ones in a FIFO manner.
use alloc::string::String;
use alloc::string::{String, ToString};
use core::cell::{Ref, RefCell, RefMut};
use std::{
fs,
fs::{File, OpenOptions},
io,
io::Write,
io::{Read, Seek, SeekFrom, Write},
path::{Path, PathBuf},
};
use fs2::FileExt;
#[cfg(feature = "gzip")]
use libafl_bolts::compress::GzipCompressor;
use serde::{Deserialize, Serialize};
@ -33,7 +34,11 @@ use crate::{
/// If the create fails for _any_ reason, including, but not limited to, a preexisting existing file of that name,
/// it will instead return the respective [`io::Error`].
fn create_new<P: AsRef<Path>>(path: P) -> Result<File, io::Error> {
OpenOptions::new().write(true).create_new(true).open(path)
OpenOptions::new()
.write(true)
.read(true)
.create_new(true)
.open(path)
}
/// Tries to create the given `path` and returns `None` _only_ if the file already existed.
@ -85,7 +90,7 @@ where
fn add(&mut self, testcase: Testcase<I>) -> Result<CorpusId, Error> {
let id = self.inner.add(testcase)?;
let testcase = &mut self.get(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase, Some(id))?;
*testcase.input_mut() = None;
Ok(id)
}
@ -95,7 +100,7 @@ where
fn add_disabled(&mut self, testcase: Testcase<I>) -> Result<CorpusId, Error> {
let id = self.inner.add_disabled(testcase)?;
let testcase = &mut self.get_from_all(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase, Some(id))?;
*testcase.input_mut() = None;
Ok(id)
}
@ -106,7 +111,7 @@ where
let entry = self.inner.replace(id, testcase)?;
self.remove_testcase(&entry)?;
let testcase = &mut self.get(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase, Some(id))?;
*testcase.input_mut() = None;
Ok(entry)
}
@ -309,12 +314,19 @@ impl<I> InMemoryOnDiskCorpus<I> {
/// Sets the filename for a [`Testcase`].
/// If an error gets returned from the corpus (i.e., file exists), we'll have to retry with a different filename.
/// Renaming testcases will most likely cause duplicate testcases to not be handled correctly
/// if testcases with the same input are not given the same filename.
/// Only rename when you know what you are doing.
#[inline]
pub fn rename_testcase(
&self,
testcase: &mut Testcase<I>,
filename: String,
) -> Result<(), Error> {
id: Option<CorpusId>,
) -> Result<(), Error>
where
I: Input,
{
if testcase.filename().is_some() {
// We are renaming!
@ -327,36 +339,10 @@ impl<I> InMemoryOnDiskCorpus<I> {
return Ok(());
}
if self.locking {
let new_lock_filename = format!(".{new_filename}.lafl_lock");
// Try to create lock file for new testcases
if let Err(err) = create_new(self.dir_path.join(&new_lock_filename)) {
*testcase.filename_mut() = Some(old_filename);
return Err(Error::illegal_state(format!(
"Unable to create lock file {new_lock_filename} for new testcase: {err}"
)));
}
}
let new_file_path = self.dir_path.join(&new_filename);
fs::rename(testcase.file_path().as_ref().unwrap(), &new_file_path)?;
let new_metadata_path = {
if let Some(old_metadata_path) = testcase.metadata_path() {
// We have metadata. Let's rename it.
let new_metadata_path = self.dir_path.join(format!(".{new_filename}.metadata"));
fs::rename(old_metadata_path, &new_metadata_path)?;
Some(new_metadata_path)
} else {
None
}
};
*testcase.metadata_path_mut() = new_metadata_path;
self.remove_testcase(testcase)?;
*testcase.filename_mut() = Some(new_filename);
self.save_testcase(testcase, id)?;
*testcase.file_path_mut() = Some(new_file_path);
Ok(())
@ -367,34 +353,38 @@ impl<I> InMemoryOnDiskCorpus<I> {
}
}
fn save_testcase(&self, testcase: &mut Testcase<I>, id: CorpusId) -> Result<(), Error>
fn save_testcase(&self, testcase: &mut Testcase<I>, id: Option<CorpusId>) -> Result<(), Error>
where
I: Input,
{
let file_name_orig = testcase.filename_mut().take().unwrap_or_else(|| {
let file_name = testcase.filename_mut().take().unwrap_or_else(|| {
// TODO walk entry metadata to ask for pieces of filename (e.g. :havoc in AFL)
testcase.input().as_ref().unwrap().generate_name(Some(id))
testcase.input().as_ref().unwrap().generate_name(id)
});
// New testcase, we need to save it.
let mut file_name = file_name_orig.clone();
let mut ctr = String::new();
if self.locking {
let lockfile_name = format!(".{file_name}");
let lockfile_path = self.dir_path.join(lockfile_name);
let mut ctr = 2;
let file_name = if self.locking {
loop {
let lockfile_name = format!(".{file_name}.lafl_lock");
let lockfile_path = self.dir_path.join(lockfile_name);
let mut lockfile = try_create_new(&lockfile_path)?.unwrap_or(
OpenOptions::new()
.write(true)
.read(true)
.open(&lockfile_path)?,
);
lockfile.lock_exclusive()?;
if try_create_new(lockfile_path)?.is_some() {
break file_name;
}
lockfile.read_to_string(&mut ctr)?;
ctr = if ctr.is_empty() {
String::from("1")
} else {
(ctr.trim().parse::<u32>()? + 1).to_string()
};
file_name = format!("{file_name_orig}-{ctr}");
ctr += 1;
}
} else {
file_name
};
lockfile.seek(SeekFrom::Start(0))?;
lockfile.write_all(ctr.as_bytes())?;
}
if testcase.file_path().is_none() {
*testcase.file_path_mut() = Some(self.dir_path.join(&file_name));
@ -402,7 +392,15 @@ impl<I> InMemoryOnDiskCorpus<I> {
*testcase.filename_mut() = Some(file_name);
if self.meta_format.is_some() {
let metafile_name = format!(".{}.metadata", testcase.filename().as_ref().unwrap());
let metafile_name = if self.locking {
format!(
".{}_{}.metadata",
testcase.filename().as_ref().unwrap(),
ctr
)
} else {
format!(".{}.metadata", testcase.filename().as_ref().unwrap())
};
let metafile_path = self.dir_path.join(&metafile_name);
let mut tmpfile_path = metafile_path.clone();
tmpfile_path.set_file_name(format!(".{metafile_name}.tmp",));
@ -445,15 +443,36 @@ impl<I> InMemoryOnDiskCorpus<I> {
fn remove_testcase(&self, testcase: &Testcase<I>) -> Result<(), Error> {
if let Some(filename) = testcase.filename() {
let mut ctr = String::new();
if self.locking {
let lockfile_path = self.dir_path.join(format!(".{filename}"));
let mut lockfile = OpenOptions::new()
.write(true)
.read(true)
.open(&lockfile_path)?;
lockfile.lock_exclusive()?;
lockfile.read_to_string(&mut ctr)?;
ctr = ctr.trim().to_string();
if ctr == "1" {
FileExt::unlock(&lockfile)?;
drop(fs::remove_file(lockfile_path));
} else {
lockfile.seek(SeekFrom::Start(0))?;
lockfile.write_all(&(ctr.parse::<u32>()? - 1).to_le_bytes())?;
return Ok(());
}
}
fs::remove_file(self.dir_path.join(filename))?;
if self.meta_format.is_some() {
fs::remove_file(self.dir_path.join(format!(".{filename}.metadata")))?;
if self.locking {
fs::remove_file(self.dir_path.join(format!(".{filename}_{ctr}.metadata")))?;
} else {
fs::remove_file(self.dir_path.join(format!(".{filename}.metadata")))?;
}
}
// also try to remove the corresponding `.lafl_lock` file if it still exists
// (even though it shouldn't exist anymore, at this point in time)
drop(fs::remove_file(
self.dir_path.join(format!(".{filename}.lafl_lock")),
));
}
Ok(())
}

View File

@ -264,6 +264,8 @@ impl<I> Testcase<I> {
}
/// Create a new Testcase instance given an input and a `filename`
/// If locking is enabled, make sure that testcases with the same input have the same filename
/// to prevent ending up with duplicate testcases
#[inline]
pub fn with_filename(input: I, filename: String) -> Self {
Self {

View File

@ -28,21 +28,23 @@ pub mod nautilus;
use alloc::{
boxed::Box,
string::{String, ToString},
string::String,
vec::{Drain, Splice, Vec},
};
use core::{
clone::Clone,
fmt::Debug,
hash::Hash,
marker::PhantomData,
ops::{DerefMut, RangeBounds},
};
#[cfg(feature = "std")]
use std::{fs::File, hash::Hash, io::Read, path::Path};
use std::{fs::File, io::Read, path::Path};
#[cfg(feature = "std")]
use libafl_bolts::fs::write_file_atomic;
use libafl_bolts::{
generic_hash_std,
ownedref::{OwnedMutSlice, OwnedSlice},
subrange::{SubRangeMutSlice, SubRangeSlice},
Error, HasLen,
@ -55,7 +57,7 @@ use crate::corpus::CorpusId;
/// An input for the target
#[cfg(not(feature = "std"))]
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug + Hash {
/// Write this input to the file
fn to_file<P>(&self, _path: P) -> Result<(), Error> {
Err(Error::not_implemented("Not supported in no_std"))
@ -67,12 +69,14 @@ pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
}
/// Generate a name for this input
fn generate_name(&self, id: Option<CorpusId>) -> String;
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}
}
/// An input for the target
#[cfg(feature = "std")]
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug + Hash {
/// Write this input to the file
fn to_file<P>(&self, path: P) -> Result<(), Error>
where
@ -93,7 +97,9 @@ pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
}
/// Generate a name for this input, the user is responsible for making each name of testcase unique.
fn generate_name(&self, id: Option<CorpusId>) -> String;
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}
}
/// Convert between two input types with a state
@ -127,12 +133,7 @@ impl NopInput {
}
}
impl Input for NopInput {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
"nop-input".to_string()
}
}
impl Input for NopInput {}
impl HasTargetBytes for NopInput {
fn target_bytes(&self) -> OwnedSlice<u8> {
OwnedSlice::from(vec![0])

View File

@ -16,7 +16,7 @@ use crate::{corpus::CorpusId, inputs::Input};
/// An input composed of multiple parts. Use in situations where subcomponents are not necessarily
/// related, or represent distinct parts of the input.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize, Hash)]
pub struct MultipartInput<I> {
parts: Vec<I>,
names: Vec<String>,

View File

@ -1,7 +1,7 @@
//! Input for the [`Nautilus`](https://github.com/RUB-SysSec/nautilus) grammar fuzzer methods
//!
//!
use alloc::{rc::Rc, string::String, vec::Vec};
use alloc::{rc::Rc, vec::Vec};
use core::cell::RefCell;
use std::hash::{Hash, Hasher};
@ -15,7 +15,6 @@ use crate::{
rule::RuleIdOrCustom,
tree::{Tree, TreeLike},
},
corpus::CorpusId,
generators::nautilus::NautilusContext,
inputs::{BytesInput, Input, InputConverter},
Error,
@ -28,23 +27,7 @@ pub struct NautilusInput {
pub tree: Tree,
}
impl Input for NautilusInput {
/// Generate a name for this input
#[must_use]
fn generate_name(&self, id: Option<CorpusId>) -> String {
/*let mut hasher = AHasher::new_with_keys(0, 0);
for term in &self.terms {
hasher.write(term.symbol.as_bytes());
}
format!("{:016x}", hasher.finish())*/
if let Some(id) = id {
format!("id_{}", id.0)
} else {
"id_unknown".into()
}
}
}
impl Input for NautilusInput {}
/// Rc Ref-cell from Input
impl From<NautilusInput> for Rc<RefCell<NautilusInput>> {

View File

@ -1,9 +1,9 @@
//! Newtype pattern style wrapper for [`super::Input`]s
use alloc::{string::String, vec::Vec};
use alloc::vec::Vec;
use core::{fmt::Debug, hash::Hash};
use libafl_bolts::{generic_hash_std, rands::Rand};
use libafl_bolts::rands::Rand;
use serde::{Deserialize, Serialize};
#[cfg(feature = "std")]
use {
@ -12,7 +12,7 @@ use {
};
use super::Input;
use crate::{corpus::CorpusId, mutators::numeric::Numeric};
use crate::mutators::numeric::Numeric;
/// Newtype pattern wrapper around an underlying structure to implement inputs
///
@ -56,11 +56,7 @@ impl<I: Copy> Copy for ValueInput<I> {}
macro_rules! impl_input_for_value_input {
($($t:ty => $name:ident),+ $(,)?) => {
$(
impl Input for ValueInput<$t> {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}
}
impl Input for ValueInput<$t> {}
/// Input wrapping a <$t>
pub type $name = ValueInput<$t>;
@ -86,10 +82,6 @@ impl_input_for_value_input!(
/// manually implemented because files can be written more efficiently
impl Input for ValueInput<Vec<u8>> {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}
/// Write this input to the file
#[cfg(feature = "std")]
fn to_file<P>(&self, path: P) -> Result<(), Error>

View File

@ -255,7 +255,7 @@ fn parse_path(s: &str) -> PathBuf {
let s = s.trim();
// If first and last character is a quote, let's remove them
let s = if s.starts_with('\"') && s.ends_with('\"'){
let s = if s.starts_with('\"') && s.ends_with('\"') {
&s[1..s.len() - 1]
} else {
s