From fbe8cce1b89cdd7b6330eed33de1ded3bb13a5dc Mon Sep 17 00:00:00 2001 From: Dominik Maier Date: Tue, 28 Feb 2023 15:41:05 +0100 Subject: [PATCH] Real OnDiskCorpus (#1096) * Real OnDiskCorpus * clippy * python * docs * clippy * docs * move to reuse cachedinmem corpus * fmt --- docs/src/message_passing/message_passing.md | 4 +- fuzzers/baby_fuzzer_minimizing/src/main.rs | 4 +- fuzzers/baby_fuzzer_nautilus/src/main.rs | 2 +- .../baby_fuzzer_swap_differential/src/main.rs | 4 +- fuzzers/fuzzbench_fork_qemu/src/fuzzer.rs | 4 +- fuzzers/fuzzbench_forkserver/src/main.rs | 4 +- fuzzers/fuzzbench_qemu/src/fuzzer.rs | 4 +- fuzzers/fuzzbench_text/src/lib.rs | 6 +- fuzzers/libfuzzer_libpng/src/lib.rs | 2 +- libafl/src/corpus/cached.rs | 20 +- libafl/src/corpus/inmemory_ondisk.rs | 315 ++++++++++++++++++ libafl/src/corpus/mod.rs | 19 ++ libafl/src/corpus/ondisk.rs | 157 ++------- 13 files changed, 390 insertions(+), 155 deletions(-) create mode 100644 libafl/src/corpus/inmemory_ondisk.rs diff --git a/docs/src/message_passing/message_passing.md b/docs/src/message_passing/message_passing.md index ddaba930fa..4e9daf6c3f 100644 --- a/docs/src/message_passing/message_passing.md +++ b/docs/src/message_passing/message_passing.md @@ -2,7 +2,7 @@ LibAFL offers a standard mechanism for message passing between processes and machines with a low overhead. We use message passing to inform the other connected clients/fuzzers/nodes about new testcases, metadata, and statistics about the current run. -Depending on individual needs, LibAFL can also write testcase contents to disk, while still using events to notify other fuzzers, using an `OnDiskCorpus`. +Depending on individual needs, LibAFL can also write testcase contents to disk, while still using events to notify other fuzzers, using the `CachedOnDiskCorpus` or similar. In our tests, message passing scales very well to share new testcases and metadata between multiple running fuzzer instances for multi-core fuzzing. Specifically, it scales _a lot_ better than using memory locks on a shared corpus, and _a lot_ better than sharing the testcases via the filesystem, as AFL traditionally does. @@ -12,7 +12,7 @@ The `EventManager` interface is used to send Events over the wire using `Low Lev ## Low Level Message Passing (LLMP) -LibAFL comes with a reasonably lock-free message passing mechanism that scales well across cores and, using its *broker2broker* mechanism, even to connected machines via TCP. +LibAFL comes with a reasonably lock-free message passing mechanism that scales well across cores and, using its _broker2broker_ mechanism, even to connected machines via TCP. Most example fuzzers use this mechanism, and it is the best `EventManager` if you want to fuzz on more than a single core. In the following, we will describe the inner workings of `LLMP`. diff --git a/fuzzers/baby_fuzzer_minimizing/src/main.rs b/fuzzers/baby_fuzzer_minimizing/src/main.rs index c3142ea9e6..a87010ab2d 100644 --- a/fuzzers/baby_fuzzer_minimizing/src/main.rs +++ b/fuzzers/baby_fuzzer_minimizing/src/main.rs @@ -56,7 +56,7 @@ pub fn main() -> Result<(), Error> { // RNG StdRand::with_seed(current_nanos()), // Corpus that will be evolved, we keep it in memory for performance - OnDiskCorpus::new(&corpus_dir).unwrap(), + InMemoryOnDiskCorpus::new(&corpus_dir).unwrap(), // Corpus in which we store solutions (crashes in this example), // on disk so the user can get them after stopping the fuzzer OnDiskCorpus::new(&solution_dir).unwrap(), @@ -108,7 +108,7 @@ pub fn main() -> Result<(), Error> { let mut state = StdState::new( StdRand::with_seed(current_nanos()), - OnDiskCorpus::new(&minimized_dir).unwrap(), + InMemoryOnDiskCorpus::new(&minimized_dir).unwrap(), InMemoryCorpus::new(), &mut (), &mut (), diff --git a/fuzzers/baby_fuzzer_nautilus/src/main.rs b/fuzzers/baby_fuzzer_nautilus/src/main.rs index 54356830d0..62ab42581b 100644 --- a/fuzzers/baby_fuzzer_nautilus/src/main.rs +++ b/fuzzers/baby_fuzzer_nautilus/src/main.rs @@ -4,7 +4,7 @@ use std::ptr::write_volatile; use libafl::{ bolts::{current_nanos, rands::StdRand, tuples::tuple_list}, - corpus::{InMemoryCorpus, OnDiskCorpus}, + corpus::{InMemoryCorpus, InMemoryOnDiskCorpus, OnDiskCorpus}, events::SimpleEventManager, executors::{inprocess::InProcessExecutor, ExitKind}, feedback_or, diff --git a/fuzzers/baby_fuzzer_swap_differential/src/main.rs b/fuzzers/baby_fuzzer_swap_differential/src/main.rs index 07b4413ddb..739691fd46 100644 --- a/fuzzers/baby_fuzzer_swap_differential/src/main.rs +++ b/fuzzers/baby_fuzzer_swap_differential/src/main.rs @@ -11,7 +11,7 @@ use libafl::monitors::tui::TuiMonitor; use libafl::monitors::SimpleMonitor; use libafl::{ bolts::{current_nanos, rands::StdRand, tuples::tuple_list, AsSlice}, - corpus::{Corpus, InMemoryCorpus, OnDiskCorpus}, + corpus::{Corpus, InMemoryCorpus, InMemoryOnDiskCorpus}, events::SimpleEventManager, executors::{inprocess::InProcessExecutor, DiffExecutor, ExitKind}, feedbacks::{CrashFeedback, MaxMapFeedback}, @@ -180,7 +180,7 @@ pub fn main() { InMemoryCorpus::new(), // Corpus in which we store solutions (crashes in this example), // on disk so the user can get them after stopping the fuzzer - OnDiskCorpus::new(PathBuf::from("./crashes")).unwrap(), + InMemoryOnDiskCorpus::new(PathBuf::from("./crashes")).unwrap(), // States of the feedbacks. // The feedbacks can report the data that should persist in the State. &mut feedback, diff --git a/fuzzers/fuzzbench_fork_qemu/src/fuzzer.rs b/fuzzers/fuzzbench_fork_qemu/src/fuzzer.rs index 616aae9b33..a6f10da254 100644 --- a/fuzzers/fuzzbench_fork_qemu/src/fuzzer.rs +++ b/fuzzers/fuzzbench_fork_qemu/src/fuzzer.rs @@ -21,7 +21,7 @@ use libafl::{ tuples::{tuple_list, Merge}, AsMutSlice, AsSlice, }, - corpus::{Corpus, OnDiskCorpus}, + corpus::{Corpus, InMemoryOnDiskCorpus, OnDiskCorpus}, events::SimpleRestartingEventManager, executors::{ExitKind, ShadowExecutor}, feedback_or, @@ -268,7 +268,7 @@ fn fuzz( // RNG StdRand::with_seed(current_nanos()), // Corpus that will be evolved, we keep it in memory for performance - OnDiskCorpus::new(corpus_dir).unwrap(), + InMemoryOnDiskCorpus::new(corpus_dir).unwrap(), // Corpus in which we store solutions (crashes in this example), // on disk so the user can get them after stopping the fuzzer OnDiskCorpus::new(objective_dir).unwrap(), diff --git a/fuzzers/fuzzbench_forkserver/src/main.rs b/fuzzers/fuzzbench_forkserver/src/main.rs index fdaf05173b..4c79f836f2 100644 --- a/fuzzers/fuzzbench_forkserver/src/main.rs +++ b/fuzzers/fuzzbench_forkserver/src/main.rs @@ -16,7 +16,7 @@ use libafl::{ tuples::{tuple_list, Merge}, AsMutSlice, }, - corpus::{Corpus, OnDiskCorpus}, + corpus::{Corpus, InMemoryOnDiskCorpus, OnDiskCorpus}, events::SimpleEventManager, executors::forkserver::{ForkserverExecutor, TimeoutForkserverExecutor}, feedback_or, @@ -272,7 +272,7 @@ fn fuzz( // RNG StdRand::with_seed(current_nanos()), // Corpus that will be evolved, we keep it in memory for performance - OnDiskCorpus::::new(corpus_dir).unwrap(), + InMemoryOnDiskCorpus::::new(corpus_dir).unwrap(), // Corpus in which we store solutions (crashes in this example), // on disk so the user can get them after stopping the fuzzer OnDiskCorpus::new(objective_dir).unwrap(), diff --git a/fuzzers/fuzzbench_qemu/src/fuzzer.rs b/fuzzers/fuzzbench_qemu/src/fuzzer.rs index 622bc4bf6c..71df1762f0 100644 --- a/fuzzers/fuzzbench_qemu/src/fuzzer.rs +++ b/fuzzers/fuzzbench_qemu/src/fuzzer.rs @@ -21,7 +21,7 @@ use libafl::{ tuples::{tuple_list, Merge}, AsSlice, }, - corpus::{Corpus, OnDiskCorpus}, + corpus::{Corpus, InMemoryOnDiskCorpus, OnDiskCorpus}, events::SimpleRestartingEventManager, executors::{ExitKind, ShadowExecutor, TimeoutExecutor}, feedback_or, @@ -280,7 +280,7 @@ fn fuzz( // RNG StdRand::with_seed(current_nanos()), // Corpus that will be evolved, we keep it in memory for performance - OnDiskCorpus::new(corpus_dir).unwrap(), + InMemoryOnDiskCorpus::new(corpus_dir).unwrap(), // Corpus in which we store solutions (crashes in this example), // on disk so the user can get them after stopping the fuzzer OnDiskCorpus::new(objective_dir).unwrap(), diff --git a/fuzzers/fuzzbench_text/src/lib.rs b/fuzzers/fuzzbench_text/src/lib.rs index 0f785ffb79..5fa41f1546 100644 --- a/fuzzers/fuzzbench_text/src/lib.rs +++ b/fuzzers/fuzzbench_text/src/lib.rs @@ -25,7 +25,7 @@ use libafl::{ tuples::{tuple_list, Merge}, AsSlice, }, - corpus::{Corpus, OnDiskCorpus}, + corpus::{Corpus, InMemoryOnDiskCorpus, OnDiskCorpus}, events::SimpleRestartingEventManager, executors::{inprocess::InProcessExecutor, ExitKind, TimeoutExecutor}, feedback_or, @@ -332,7 +332,7 @@ fn fuzz_binary( // RNG StdRand::with_seed(current_nanos()), // Corpus that will be evolved, we keep it in memory for performance - OnDiskCorpus::new(corpus_dir).unwrap(), + InMemoryOnDiskCorpus::new(corpus_dir).unwrap(), // Corpus in which we store solutions (crashes in this example), // on disk so the user can get them after stopping the fuzzer OnDiskCorpus::new(objective_dir).unwrap(), @@ -536,7 +536,7 @@ fn fuzz_text( // RNG StdRand::with_seed(current_nanos()), // Corpus that will be evolved, we keep it in memory for performance - OnDiskCorpus::new(corpus_dir).unwrap(), + InMemoryOnDiskCorpus::new(corpus_dir).unwrap(), // Corpus in which we store solutions (crashes in this example), // on disk so the user can get them after stopping the fuzzer OnDiskCorpus::new(objective_dir).unwrap(), diff --git a/fuzzers/libfuzzer_libpng/src/lib.rs b/fuzzers/libfuzzer_libpng/src/lib.rs index 7e2653c102..587694d833 100644 --- a/fuzzers/libfuzzer_libpng/src/lib.rs +++ b/fuzzers/libfuzzer_libpng/src/lib.rs @@ -112,7 +112,7 @@ fn fuzz(corpus_dirs: &[PathBuf], objective_dir: PathBuf, broker_port: u16) -> Re // RNG StdRand::with_seed(current_nanos()), // Corpus that will be evolved, we keep it in memory for performance - InMemoryCorpus::new(), + OnDiskCorpus::new("corpus_out").unwrap(), // Corpus in which we store solutions (crashes in this example), // on disk so the user can get them after stopping the fuzzer OnDiskCorpus::new(objective_dir).unwrap(), diff --git a/libafl/src/corpus/cached.rs b/libafl/src/corpus/cached.rs index 4f47603b93..4fb611706e 100644 --- a/libafl/src/corpus/cached.rs +++ b/libafl/src/corpus/cached.rs @@ -1,4 +1,4 @@ -//! The cached ondisk corpus stores testcases to disk keeping a part of them in memory. +//! The [`CachedOnDiskCorpus`] stores [`Testcase`]s to disk, keeping a subset of them in memory/cache, evicting in a FIFO manner. use alloc::collections::vec_deque::VecDeque; use core::cell::RefCell; @@ -8,14 +8,16 @@ use serde::{Deserialize, Serialize}; use crate::{ corpus::{ - ondisk::{OnDiskCorpus, OnDiskMetadataFormat}, - Corpus, CorpusId, Testcase, + inmemory_ondisk::InMemoryOnDiskCorpus, ondisk::OnDiskMetadataFormat, Corpus, CorpusId, + Testcase, }, inputs::{Input, UsesInput}, Error, }; -/// A corpus that keeps a maximum number of [`Testcase`]s in memory. The eviction policy is FIFO. +/// A corpus that keeps a maximum number of [`Testcase`]s in memory +/// and load them from disk, when they are being used +/// The eviction policy is FIFO. #[cfg(feature = "std")] #[derive(Default, Serialize, Deserialize, Clone, Debug)] #[serde(bound = "I: serde::de::DeserializeOwned")] @@ -23,7 +25,7 @@ pub struct CachedOnDiskCorpus where I: Input, { - inner: OnDiskCorpus, + inner: InMemoryOnDiskCorpus, cached_indexes: RefCell>, cache_max_len: usize, } @@ -148,7 +150,7 @@ where where P: AsRef, { - Self::_new(OnDiskCorpus::new(dir_path)?, cache_max_len) + Self::_new(InMemoryOnDiskCorpus::new(dir_path)?, cache_max_len) } /// Creates an [`CachedOnDiskCorpus`] that does not store [`Testcase`] metadata to disk. @@ -156,7 +158,7 @@ where where P: AsRef, { - Self::_new(OnDiskCorpus::no_meta(dir_path)?, cache_max_len) + Self::_new(InMemoryOnDiskCorpus::no_meta(dir_path)?, cache_max_len) } /// Creates the [`CachedOnDiskCorpus`] specifying the format in which `Metadata` will be saved to disk. @@ -171,13 +173,13 @@ where P: AsRef, { Self::_new( - OnDiskCorpus::with_meta_format(dir_path, meta_format)?, + InMemoryOnDiskCorpus::with_meta_format(dir_path, meta_format)?, cache_max_len, ) } /// Internal constructor `fn` - fn _new(on_disk_corpus: OnDiskCorpus, cache_max_len: usize) -> Result { + fn _new(on_disk_corpus: InMemoryOnDiskCorpus, cache_max_len: usize) -> Result { if cache_max_len == 0 { return Err(Error::illegal_argument( "The max cache len in CachedOnDiskCorpus cannot be 0", diff --git a/libafl/src/corpus/inmemory_ondisk.rs b/libafl/src/corpus/inmemory_ondisk.rs new file mode 100644 index 0000000000..af69bcaf3e --- /dev/null +++ b/libafl/src/corpus/inmemory_ondisk.rs @@ -0,0 +1,315 @@ +//! The [`InMemoryOnDiskCorpus`] stores [`Testcase`]s to disk. +//! Additionally, _all_ of them are kept in memory. +//! For a lower memory footprint, consider using [`crate::corpus::CachedOnDiskCorpus`] +//! which only stores a certain number of [`Testcase`]s and removes additional ones in a FIFO manner. + +use core::{cell::RefCell, time::Duration}; +#[cfg(feature = "std")] +use std::{fs, fs::File, io::Write}; +use std::{ + fs::OpenOptions, + path::{Path, PathBuf}, +}; + +use serde::{Deserialize, Serialize}; + +use super::ondisk::{OnDiskMetadata, OnDiskMetadataFormat}; +#[cfg(feature = "gzip")] +use crate::bolts::compress::GzipCompressor; +use crate::{ + bolts::serdeany::SerdeAnyMap, + corpus::{Corpus, CorpusId, InMemoryCorpus, Testcase}, + inputs::{Input, UsesInput}, + state::HasMetadata, + Error, +}; + +/// The [`Testcase`] metadata that'll be stored to disk +#[cfg(feature = "std")] +#[derive(Debug, Serialize)] +pub struct InMemoryOnDiskMetadata<'a> { + metadata: &'a SerdeAnyMap, + exec_time: &'a Option, + executions: &'a usize, +} + +/// A corpus able to store [`Testcase`]s to disk, while also keeping all of them in memory. +/// +/// Metadata is written to a `..metadata` file in the same folder by default. +#[cfg(feature = "std")] +#[derive(Default, Serialize, Deserialize, Clone, Debug)] +#[serde(bound = "I: serde::de::DeserializeOwned")] +pub struct InMemoryOnDiskCorpus +where + I: Input, +{ + inner: InMemoryCorpus, + dir_path: PathBuf, + meta_format: Option, +} + +impl UsesInput for InMemoryOnDiskCorpus +where + I: Input, +{ + type Input = I; +} + +impl Corpus for InMemoryOnDiskCorpus +where + I: Input, +{ + /// Returns the number of elements + #[inline] + fn count(&self) -> usize { + self.inner.count() + } + + /// Add an entry to the corpus and return its index + #[inline] + fn add(&mut self, testcase: Testcase) -> Result { + let idx = self.inner.add(testcase)?; + self.save_testcase(&mut self.get(idx).unwrap().borrow_mut(), idx)?; + Ok(idx) + } + + /// Replaces the testcase at the given idx + #[inline] + fn replace(&mut self, idx: CorpusId, testcase: Testcase) -> Result, Error> { + let entry = self.inner.replace(idx, testcase)?; + self.remove_testcase(&entry)?; + self.save_testcase(&mut self.get(idx).unwrap().borrow_mut(), idx)?; + Ok(entry) + } + + /// Removes an entry from the corpus, returning it if it was present. + #[inline] + fn remove(&mut self, idx: CorpusId) -> Result, Error> { + let entry = self.inner.remove(idx)?; + self.remove_testcase(&entry)?; + Ok(entry) + } + + /// Get by id + #[inline] + fn get(&self, idx: CorpusId) -> Result<&RefCell>, Error> { + self.inner.get(idx) + } + + /// Current testcase scheduled + #[inline] + fn current(&self) -> &Option { + self.inner.current() + } + + /// Current testcase scheduled (mutable) + #[inline] + fn current_mut(&mut self) -> &mut Option { + self.inner.current_mut() + } + + #[inline] + fn next(&self, idx: CorpusId) -> Option { + self.inner.next(idx) + } + + #[inline] + fn prev(&self, idx: CorpusId) -> Option { + self.inner.prev(idx) + } + + #[inline] + fn first(&self) -> Option { + self.inner.first() + } + + #[inline] + fn last(&self) -> Option { + self.inner.last() + } + + #[inline] + fn nth(&self, nth: usize) -> CorpusId { + self.inner.nth(nth) + } +} + +impl InMemoryOnDiskCorpus +where + I: Input, +{ + /// Creates an [`InMemoryOnDiskCorpus`]. + /// + /// This corpus stores all testcases to disk, and keeps all of them in memory, as well. + /// + /// By default, it stores metadata for each [`Testcase`] as prettified json. + /// Metadata will be written to a file named `..metadata` + /// The metadata may include objective reason, specific information for a fuzz job, and more. + /// + /// If you don't want metadata, use [`InMemoryOnDiskCorpus::no_meta`]. + /// To pick a different metadata format, use [`InMemoryOnDiskCorpus::with_meta_format`]. + /// + /// Will error, if [`std::fs::create_dir_all()`] failed for `dir_path`. + pub fn new

(dir_path: P) -> Result + where + P: AsRef, + { + Self::_new(dir_path.as_ref(), Some(OnDiskMetadataFormat::JsonPretty)) + } + + /// Creates the [`InMemoryOnDiskCorpus`] specifying the format in which `Metadata` will be saved to disk. + /// + /// Will error, if [`std::fs::create_dir_all()`] failed for `dir_path`. + pub fn with_meta_format

( + dir_path: P, + meta_format: OnDiskMetadataFormat, + ) -> Result + where + P: AsRef, + { + Self::_new(dir_path.as_ref(), Some(meta_format)) + } + + /// Creates an [`InMemoryOnDiskCorpus`] that will not store .metadata files + /// + /// Will error, if [`std::fs::create_dir_all()`] failed for `dir_path`. + pub fn no_meta

(dir_path: P) -> Result + where + P: AsRef, + { + Self::_new(dir_path.as_ref(), None) + } + + /// Private fn to crate a new corpus at the given (non-generic) path with the given optional `meta_format` + fn _new(dir_path: &Path, meta_format: Option) -> Result { + fs::create_dir_all(dir_path)?; + Ok(InMemoryOnDiskCorpus { + inner: InMemoryCorpus::new(), + dir_path: dir_path.into(), + meta_format, + }) + } + + fn save_testcase(&self, testcase: &mut Testcase, idx: CorpusId) -> Result<(), Error> { + if testcase.filename().is_none() { + // TODO walk entry metadata to ask for pieces of filename (e.g. :havoc in AFL) + let file_orig = testcase.input().as_ref().unwrap().generate_name(idx.0); + let mut file = file_orig.clone(); + + let mut ctr = 2; + let filename = loop { + let lockfile = format!(".{file}.lafl_lock"); + // try to create lockfile. + + if OpenOptions::new() + .write(true) + .create_new(true) + .open(self.dir_path.join(lockfile)) + .is_ok() + { + break self.dir_path.join(file); + } + + file = format!("{file_orig}-{ctr}"); + ctr += 1; + }; + + let filename_str = filename.to_str().expect("Invalid Path"); + testcase.set_filename(filename_str.into()); + }; + if self.meta_format.is_some() { + let mut filename = PathBuf::from(testcase.filename().as_ref().unwrap()); + filename.set_file_name(format!( + ".{}.metadata", + filename.file_name().unwrap().to_string_lossy() + )); + let mut tmpfile_name = PathBuf::from(&filename); + tmpfile_name.set_file_name(format!( + ".{}.tmp", + tmpfile_name.file_name().unwrap().to_string_lossy() + )); + + let ondisk_meta = OnDiskMetadata { + metadata: testcase.metadata(), + exec_time: testcase.exec_time(), + executions: testcase.executions(), + }; + + let mut tmpfile = File::create(&tmpfile_name)?; + + let serialized = match self.meta_format.as_ref().unwrap() { + OnDiskMetadataFormat::Postcard => postcard::to_allocvec(&ondisk_meta)?, + OnDiskMetadataFormat::Json => serde_json::to_vec(&ondisk_meta)?, + OnDiskMetadataFormat::JsonPretty => serde_json::to_vec_pretty(&ondisk_meta)?, + #[cfg(feature = "gzip")] + OnDiskMetadataFormat::JsonGzip => GzipCompressor::new(0) + .compress(&serde_json::to_vec_pretty(&ondisk_meta)?)? + .unwrap(), + }; + tmpfile.write_all(&serialized)?; + fs::rename(&tmpfile_name, &filename)?; + } + testcase + .store_input() + .expect("Could not save testcase to disk"); + Ok(()) + } + + fn remove_testcase(&self, testcase: &Testcase) -> Result<(), Error> { + if let Some(filename) = testcase.filename() { + fs::remove_file(filename)?; + } + if self.meta_format.is_some() { + let mut filename = PathBuf::from(testcase.filename().as_ref().unwrap()); + filename.set_file_name(format!( + ".{}.metadata", + filename.file_name().unwrap().to_string_lossy() + )); + fs::remove_file(filename)?; + } + Ok(()) + } +} + +#[cfg(feature = "python")] +/// `InMemoryOnDiskCorpus` Python bindings +pub mod pybind { + use alloc::string::String; + use std::path::PathBuf; + + use pyo3::prelude::*; + use serde::{Deserialize, Serialize}; + + use crate::{ + corpus::{pybind::PythonCorpus, InMemoryOnDiskCorpus}, + inputs::BytesInput, + }; + + #[pyclass(unsendable, name = "InMemoryOnDiskCorpus")] + #[allow(clippy::unsafe_derive_deserialize)] + #[derive(Serialize, Deserialize, Debug, Clone)] + /// Python class for InMemoryOnDiskCorpus + pub struct PythonInMemoryOnDiskCorpus { + /// Rust wrapped InMemoryOnDiskCorpus object + pub inner: InMemoryOnDiskCorpus, + } + + #[pymethods] + impl PythonInMemoryOnDiskCorpus { + #[new] + fn new(path: String) -> Self { + Self { + inner: InMemoryOnDiskCorpus::new(PathBuf::from(path)).unwrap(), + } + } + + fn as_corpus(slf: Py) -> PythonCorpus { + PythonCorpus::new_in_memory_on_disk(slf) + } + } + /// Register the classes to the python module + pub fn register(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + Ok(()) + } +} diff --git a/libafl/src/corpus/mod.rs b/libafl/src/corpus/mod.rs index 4116128e5e..d1ff5afe42 100644 --- a/libafl/src/corpus/mod.rs +++ b/libafl/src/corpus/mod.rs @@ -6,6 +6,11 @@ pub use testcase::{SchedulerTestcaseMetaData, Testcase}; pub mod inmemory; pub use inmemory::InMemoryCorpus; +#[cfg(feature = "std")] +pub mod inmemory_ondisk; +#[cfg(feature = "std")] +pub use inmemory_ondisk::InMemoryOnDiskCorpus; + #[cfg(feature = "std")] pub mod ondisk; #[cfg(feature = "std")] @@ -173,6 +178,7 @@ pub mod pybind { use crate::{ corpus::{ cached::pybind::PythonCachedOnDiskCorpus, inmemory::pybind::PythonInMemoryCorpus, + inmemory_ondisk::pybind::PythonInMemoryOnDiskCorpus, ondisk::pybind::PythonOnDiskCorpus, testcase::pybind::PythonTestcaseWrapper, Corpus, CorpusId, Testcase, }, @@ -185,6 +191,7 @@ pub mod pybind { InMemory(Py), CachedOnDisk(Py), OnDisk(Py), + InMemoryOnDisk(Py), } /// Corpus Trait binding @@ -204,6 +211,7 @@ pub mod pybind { PythonCorpusWrapper, { InMemory, + InMemoryOnDisk, CachedOnDisk, OnDisk } @@ -220,6 +228,7 @@ pub mod pybind { PythonCorpusWrapper, { InMemory, + InMemoryOnDisk, CachedOnDisk, OnDisk } @@ -253,6 +262,16 @@ pub mod pybind { } } + #[staticmethod] + #[must_use] + pub fn new_in_memory_on_disk( + py_in_memory_on_disk_corpus: Py, + ) -> Self { + Self { + wrapper: PythonCorpusWrapper::InMemoryOnDisk(py_in_memory_on_disk_corpus), + } + } + #[pyo3(name = "count")] fn pycount(&self) -> usize { self.count() diff --git a/libafl/src/corpus/ondisk.rs b/libafl/src/corpus/ondisk.rs index 113ef5e139..1364d08d9f 100644 --- a/libafl/src/corpus/ondisk.rs +++ b/libafl/src/corpus/ondisk.rs @@ -1,40 +1,35 @@ -//! The ondisk corpus stores [`Testcase`]s to disk. -//! Additionally, all of them are kept in memory. -//! For a lower memory footprint, consider using [`crate::corpus::CachedOnDiskCorpus`] -//! which only stores a certain number of testcases and removes additional ones in a FIFO manner. +//! The ondisk corpus stores all [`Testcase`]s to disk. +//! It never keeps any of them in memory. +//! This is a good solution for solutions that are never reused, and for very memory-constraint environments. +//! For any other occasions, consider using [`crate::corpus::CachedOnDiskCorpus`] +//! which stores a certain number of testcases in memory and removes additional ones in a FIFO manner. use core::{cell::RefCell, time::Duration}; -#[cfg(feature = "std")] -use std::{fs, fs::File, io::Write}; -use std::{ - fs::OpenOptions, - path::{Path, PathBuf}, -}; +use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; -#[cfg(feature = "gzip")] -use crate::bolts::compress::GzipCompressor; +use super::CachedOnDiskCorpus; use crate::{ bolts::serdeany::SerdeAnyMap, - corpus::{Corpus, CorpusId, InMemoryCorpus, Testcase}, + corpus::{Corpus, CorpusId, Testcase}, inputs::{Input, UsesInput}, - state::HasMetadata, Error, }; /// Options for the the format of the on-disk metadata #[cfg(feature = "std")] -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Default, Debug, Clone, Serialize, Deserialize)] pub enum OnDiskMetadataFormat { /// A binary-encoded postcard Postcard, /// JSON Json, /// JSON formatted for readability + #[default] JsonPretty, - #[cfg(feature = "gzip")] /// The same as [`OnDiskMetadataFormat::JsonPretty`], but compressed + #[cfg(feature = "gzip")] JsonGzip, } @@ -42,9 +37,12 @@ pub enum OnDiskMetadataFormat { #[cfg(feature = "std")] #[derive(Debug, Serialize)] pub struct OnDiskMetadata<'a> { - metadata: &'a SerdeAnyMap, - exec_time: &'a Option, - executions: &'a usize, + /// The dynamic metadata [`SerdeAnyMap`] stored to disk + pub metadata: &'a SerdeAnyMap, + /// The exec time for this [`Testcase`] + pub exec_time: &'a Option, + /// The amount of executions for this [`Testcase`] + pub executions: &'a usize, } /// A corpus able to store [`Testcase`]s to disk, and load them from disk, when they are being used. @@ -57,9 +55,10 @@ pub struct OnDiskCorpus where I: Input, { - inner: InMemoryCorpus, + /// The root directory backing this corpus dir_path: PathBuf, - meta_format: Option, + /// We wrapp a cached corpus and set its size to 1. + inner: CachedOnDiskCorpus, } impl UsesInput for OnDiskCorpus @@ -82,26 +81,19 @@ where /// Add an entry to the corpus and return its index #[inline] fn add(&mut self, testcase: Testcase) -> Result { - let idx = self.inner.add(testcase)?; - self.save_testcase(&mut self.get(idx).unwrap().borrow_mut(), idx)?; - Ok(idx) + self.inner.add(testcase) } /// Replaces the testcase at the given idx #[inline] fn replace(&mut self, idx: CorpusId, testcase: Testcase) -> Result, Error> { - let entry = self.inner.replace(idx, testcase)?; - self.remove_testcase(&entry)?; - self.save_testcase(&mut self.get(idx).unwrap().borrow_mut(), idx)?; - Ok(entry) + self.inner.replace(idx, testcase) } /// Removes an entry from the corpus, returning it if it was present. #[inline] fn remove(&mut self, idx: CorpusId) -> Result, Error> { - let entry = self.inner.remove(idx)?; - self.remove_testcase(&entry)?; - Ok(entry) + self.inner.remove(idx) } /// Get by id @@ -154,13 +146,12 @@ where { /// Creates an [`OnDiskCorpus`]. /// - /// This corpus stores all testcases to disk, and keeps all of them in memory, as well. + /// This corpus stores all testcases to disk. /// /// By default, it stores metadata for each [`Testcase`] as prettified json. /// Metadata will be written to a file named `..metadata` /// The metadata may include objective reason, specific information for a fuzz job, and more. /// - /// If you don't want metadata, use [`OnDiskCorpus::no_meta`]. /// To pick a different metadata format, use [`OnDiskCorpus::with_meta_format`]. /// /// Will error, if [`std::fs::create_dir_all()`] failed for `dir_path`. @@ -168,7 +159,7 @@ where where P: AsRef, { - Self::_new(dir_path.as_ref(), Some(OnDiskMetadataFormat::JsonPretty)) + Self::_new(dir_path.as_ref(), OnDiskMetadataFormat::JsonPretty) } /// Creates the [`OnDiskCorpus`] specifying the format in which `Metadata` will be saved to disk. @@ -181,108 +172,16 @@ where where P: AsRef, { - Self::_new(dir_path.as_ref(), Some(meta_format)) - } - - /// Creates an [`OnDiskCorpus`] that will not store .metadata files - /// - /// Will error, if [`std::fs::create_dir_all()`] failed for `dir_path`. - pub fn no_meta

(dir_path: P) -> Result - where - P: AsRef, - { - Self::_new(dir_path.as_ref(), None) + Self::_new(dir_path.as_ref(), meta_format) } /// Private fn to crate a new corpus at the given (non-generic) path with the given optional `meta_format` - fn _new(dir_path: &Path, meta_format: Option) -> Result { - fs::create_dir_all(dir_path)?; + fn _new(dir_path: &Path, meta_format: OnDiskMetadataFormat) -> Result { Ok(OnDiskCorpus { - inner: InMemoryCorpus::new(), dir_path: dir_path.into(), - meta_format, + inner: CachedOnDiskCorpus::with_meta_format(dir_path, 1, meta_format)?, }) } - - fn save_testcase(&self, testcase: &mut Testcase, idx: CorpusId) -> Result<(), Error> { - if testcase.filename().is_none() { - // TODO walk entry metadata to ask for pieces of filename (e.g. :havoc in AFL) - let file_orig = testcase.input().as_ref().unwrap().generate_name(idx.0); - let mut file = file_orig.clone(); - - let mut ctr = 2; - let filename = loop { - let lockfile = format!(".{file}.lafl_lock"); - // try to create lockfile. - - if OpenOptions::new() - .write(true) - .create_new(true) - .open(self.dir_path.join(lockfile)) - .is_ok() - { - break self.dir_path.join(file); - } - - file = format!("{file_orig}-{ctr}"); - ctr += 1; - }; - - let filename_str = filename.to_str().expect("Invalid Path"); - testcase.set_filename(filename_str.into()); - }; - if self.meta_format.is_some() { - let mut filename = PathBuf::from(testcase.filename().as_ref().unwrap()); - filename.set_file_name(format!( - ".{}.metadata", - filename.file_name().unwrap().to_string_lossy() - )); - let mut tmpfile_name = PathBuf::from(&filename); - tmpfile_name.set_file_name(format!( - ".{}.tmp", - tmpfile_name.file_name().unwrap().to_string_lossy() - )); - - let ondisk_meta = OnDiskMetadata { - metadata: testcase.metadata(), - exec_time: testcase.exec_time(), - executions: testcase.executions(), - }; - - let mut tmpfile = File::create(&tmpfile_name)?; - - let serialized = match self.meta_format.as_ref().unwrap() { - OnDiskMetadataFormat::Postcard => postcard::to_allocvec(&ondisk_meta)?, - OnDiskMetadataFormat::Json => serde_json::to_vec(&ondisk_meta)?, - OnDiskMetadataFormat::JsonPretty => serde_json::to_vec_pretty(&ondisk_meta)?, - #[cfg(feature = "gzip")] - OnDiskMetadataFormat::JsonGzip => GzipCompressor::new(0) - .compress(&serde_json::to_vec_pretty(&ondisk_meta)?)? - .unwrap(), - }; - tmpfile.write_all(&serialized)?; - fs::rename(&tmpfile_name, &filename)?; - } - testcase - .store_input() - .expect("Could not save testcase to disk"); - Ok(()) - } - - fn remove_testcase(&self, testcase: &Testcase) -> Result<(), Error> { - if let Some(filename) = testcase.filename() { - fs::remove_file(filename)?; - } - if self.meta_format.is_some() { - let mut filename = PathBuf::from(testcase.filename().as_ref().unwrap()); - filename.set_file_name(format!( - ".{}.metadata", - filename.file_name().unwrap().to_string_lossy() - )); - fs::remove_file(filename)?; - } - Ok(()) - } } #[cfg(feature = "python")]