Fix OOM restarts with LlmpShouldSaveState (#1974)

* LlmpSaveState and OOM restarts

* clippy

* clippy

* rename
Andrea Fioraldi 2024-03-30 17:05:03 +01:00 committed by GitHub
parent 69d7d7b1e3
commit 45d47214c2
3 changed files with 83 additions and 18 deletions

View File

@@ -9,10 +9,13 @@ static GLOBAL: MiMalloc = MiMalloc;
 use core::time::Duration;
 use std::{env, net::SocketAddr, path::PathBuf};
 
-use clap::{self, Parser};
+use clap::Parser;
 use libafl::{
     corpus::{Corpus, InMemoryOnDiskCorpus, OnDiskCorpus},
-    events::{launcher::Launcher, EventConfig, EventRestarter, LlmpRestartingEventManager},
+    events::{
+        launcher::Launcher, llmp::LlmpShouldSaveState, EventConfig, EventRestarter,
+        LlmpRestartingEventManager,
+    },
     executors::{inprocess::InProcessExecutor, ExitKind},
     feedback_or, feedback_or_fast,
     feedbacks::{CrashFeedback, MaxMapFeedback, TimeFeedback, TimeoutFeedback},
@@ -279,7 +282,11 @@ pub extern "C" fn libafl_main() {
         .broker_port(broker_port)
         .remote_broker_addr(opt.remote_broker_addr)
         .stdout_file(Some("/dev/null"))
-        .serialize_state(!opt.reload_corpus)
+        .serialize_state(if opt.reload_corpus {
+            LlmpShouldSaveState::OOMSafeNever
+        } else {
+            LlmpShouldSaveState::OOMSafeOnRestart
+        })
         .build()
         .launch()
     {
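
For existing callers, note that `serialize_state` now takes the new enum instead of a `bool`. A hedged migration sketch (this helper and its `oom_safe` flag are illustrative, not part of this PR's API):

use libafl::events::llmp::LlmpShouldSaveState;

// Hypothetical helper mapping the old boolean flag onto the new enum;
// pass oom_safe = true to opt into the new OOM-resistant behavior.
fn save_state_policy(serialize: bool, oom_safe: bool) -> LlmpShouldSaveState {
    match (serialize, oom_safe) {
        (true, false) => LlmpShouldSaveState::OnRestart,
        (false, false) => LlmpShouldSaveState::Never,
        (true, true) => LlmpShouldSaveState::OOMSafeOnRestart,
        (false, true) => LlmpShouldSaveState::OOMSafeNever,
    }
}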

View File

@@ -49,7 +49,7 @@ use crate::events::{CentralizedEventManager, CentralizedLlmpEventBroker};
 #[cfg(feature = "std")]
 use crate::{
     events::{
-        llmp::{LlmpRestartingEventManager, ManagerKind, RestartingMgr},
+        llmp::{LlmpRestartingEventManager, LlmpShouldSaveState, ManagerKind, RestartingMgr},
         EventConfig,
     },
     monitors::Monitor,
@@ -126,8 +126,8 @@ where
     #[builder(default = DEFAULT_CLIENT_TIMEOUT_SECS)]
     client_timeout: Duration,
     /// Tell the manager to serialize or not the state on restart
-    #[builder(default = true)]
-    serialize_state: bool,
+    #[builder(default = LlmpShouldSaveState::OnRestart)]
+    serialize_state: LlmpShouldSaveState,
     #[builder(setter(skip), default = PhantomData)]
     phantom_data: PhantomData<(&'a S, &'a SP, EMH)>,
 }
@@ -493,8 +493,8 @@ where
     #[builder(default = true)]
     spawn_broker: bool,
     /// Tell the manager to serialize or not the state on restart
-    #[builder(default = true)]
-    serialize_state: bool,
+    #[builder(default = LlmpShouldSaveState::OnRestart)]
+    serialize_state: LlmpShouldSaveState,
     /// The duration for the llmp client timeout
     #[builder(default = DEFAULT_CLIENT_TIMEOUT_SECS)]
     client_timeout: Duration,

View File

@@ -892,6 +892,41 @@ where
     }
 }
 
+/// Specify if the State must be persistent over restarts
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum LlmpShouldSaveState {
+    /// Always save and restore the state on restart (not OOM resistant)
+    OnRestart,
+    /// Never save the state (not OOM resistant)
+    Never,
+    /// Best-effort save and restore the state on restart (OOM safe)
+    /// This adds additional runtime costs when processing events
+    OOMSafeOnRestart,
+    /// Never save the state (OOM safe)
+    /// This adds additional runtime costs when processing events
+    OOMSafeNever,
+}
+
+impl LlmpShouldSaveState {
+    /// Check if the state must be saved `on_restart()`
+    #[must_use]
+    pub fn on_restart(&self) -> bool {
+        matches!(
+            self,
+            LlmpShouldSaveState::OnRestart | LlmpShouldSaveState::OOMSafeOnRestart
+        )
+    }
+
+    /// Check if the policy is OOM safe
+    #[must_use]
+    pub fn oom_safe(&self) -> bool {
+        matches!(
+            self,
+            LlmpShouldSaveState::OOMSafeOnRestart | LlmpShouldSaveState::OOMSafeNever
+        )
+    }
+}
+
 /// A manager that can restart on the fly, storing states in-between (in `on_restart`)
 #[cfg(feature = "std")]
 #[derive(Debug)]
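
A quick illustration of how the two predicates classify the four variants (standalone sketch, using the `libafl::events::llmp` path from the import change above):

use libafl::events::llmp::LlmpShouldSaveState;

fn main() {
    // The default keeps the old behavior: save on restart, no OOM safety.
    assert!(LlmpShouldSaveState::OnRestart.on_restart());
    assert!(!LlmpShouldSaveState::OnRestart.oom_safe());

    // The OOM-safe variants trade extra work per processed event for
    // surviving a SIGKILL from the kernel's OOM killer.
    assert!(LlmpShouldSaveState::OOMSafeOnRestart.on_restart());
    assert!(LlmpShouldSaveState::OOMSafeOnRestart.oom_safe());
    assert!(!LlmpShouldSaveState::OOMSafeNever.on_restart());
    assert!(LlmpShouldSaveState::OOMSafeNever.oom_safe());
}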
@@ -906,7 +941,7 @@ where
     /// The staterestorer to serialize the state for the next runner
     staterestorer: StateRestorer<SP>,
     /// Decide if the state restorer must save the serialized state
-    save_state: bool,
+    save_state: LlmpShouldSaveState,
 }
 
 #[cfg(all(feature = "std", feature = "adaptive_serialization"))]
@@ -980,7 +1015,9 @@ where
         event: Event<<Self::State as UsesInput>::Input>,
     ) -> Result<(), Error> {
         // Check if we are going to crash in the event, in which case we store our current state for the next runner
-        self.llmp_mgr.fire(state, event)
+        self.llmp_mgr.fire(state, event)?;
+        self.intermediate_save()?;
+        Ok(())
     }
 
     fn serialize_observers<OT>(&mut self, observers: &OT) -> Result<Option<Vec<u8>>, Error>
@@ -1016,7 +1053,11 @@ where
         // First, reset the page to 0 so the next iteration can read from the beginning of this page
         self.staterestorer.reset();
         self.staterestorer.save(&(
-            if self.save_state { Some(state) } else { None },
+            if self.save_state.on_restart() {
+                Some(state)
+            } else {
+                None
+            },
             &self.llmp_mgr.describe()?,
         ))?;
@@ -1044,7 +1085,9 @@ where
     Z: EvaluatorObservers<E::Observers, State = S> + ExecutionProcessor<E::Observers>, //CE: CustomEvent<I>,
 {
     fn process(&mut self, fuzzer: &mut Z, state: &mut S, executor: &mut E) -> Result<usize, Error> {
-        self.llmp_mgr.process(fuzzer, state, executor)
+        let res = self.llmp_mgr.process(fuzzer, state, executor)?;
+        self.intermediate_save()?;
+        Ok(res)
     }
 }
@@ -1089,7 +1132,7 @@ where
         Self {
             llmp_mgr,
             staterestorer,
-            save_state: true,
+            save_state: LlmpShouldSaveState::OnRestart,
         }
     }
@@ -1097,7 +1140,7 @@ where
     pub fn with_save_state(
         llmp_mgr: LlmpEventManager<EMH, S, SP>,
         staterestorer: StateRestorer<SP>,
-        save_state: bool,
+        save_state: LlmpShouldSaveState,
     ) -> Self {
         Self {
             llmp_mgr,
@@ -1115,6 +1158,17 @@ where
     pub fn staterestorer_mut(&mut self) -> &mut StateRestorer<SP> {
         &mut self.staterestorer
     }
+
+    /// Save LLMP state and empty state in staterestorer
+    pub fn intermediate_save(&mut self) -> Result<(), Error> {
+        // First, reset the page to 0 so the next iteration can read from the beginning of this page
+        if self.save_state.oom_safe() {
+            self.staterestorer.reset();
+            self.staterestorer
+                .save(&(None::<S>, &self.llmp_mgr.describe()?))?;
+        }
+        Ok(())
+    }
 }
 
 /// The kind of manager we're creating right now
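
The staterestorer payload is a pair of (optional serialized state, LLMP client description): `on_restart()` stores `Some(state)` only when the policy asks for it, while `intermediate_save()` always stores `None`, so an OOM-killed child still leaves behind a page the successor can use to re-attach to the broker. A minimal mock of the two payload shapes (stand-in types, not the exact LibAFL signatures):

// Stand-in for the result of llmp_mgr.describe() (hypothetical type).
struct LlmpClientDescription;

// on_restart(): persist the full state only if the policy requests it.
fn restart_payload<S>(state: S, save: bool) -> (Option<S>, LlmpClientDescription) {
    (if save { Some(state) } else { None }, LlmpClientDescription)
}

// intermediate_save() in the OOM-safe modes: the state is omitted and only
// the cheap LLMP description is written, keeping the page valid between events.
fn intermediate_payload<S>() -> (Option<S>, LlmpClientDescription) {
    (None, LlmpClientDescription)
}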
@@ -1202,8 +1256,8 @@ where
     #[builder(default = None)]
     exit_cleanly_after: Option<NonZeroUsize>,
     /// Tell the manager to serialize or not the state on restart
-    #[builder(default = true)]
-    serialize_state: bool,
+    #[builder(default = LlmpShouldSaveState::OnRestart)]
+    serialize_state: LlmpShouldSaveState,
     /// The timeout duration used for llmp client timeout
     #[builder(default = DEFAULT_CLIENT_TIMEOUT_SECS)]
     client_timeout: Duration,
@@ -1380,7 +1434,7 @@ where
             compiler_fence(Ordering::SeqCst);
 
             #[allow(clippy::manual_assert)]
-            if !staterestorer.has_content() && self.serialize_state {
+            if !staterestorer.has_content() && !self.serialize_state.oom_safe() {
                 #[cfg(unix)]
                 if child_status == 137 {
                     // Out of Memory, see https://tldp.org/LDP/abs/html/exitcodes.html
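
The 137 checked here is the conventional exit status of a child killed by SIGKILL (128 + signal number 9), which is how the Linux OOM killer terminates a process; a trivial sketch of that check:

// 137 = 128 + SIGKILL (signal 9): the shell convention for a SIGKILLed
// child, which is what the kernel OOM killer delivers.
fn looks_oom_killed(child_status: i32) -> bool {
    child_status == 128 + 9
}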
@@ -1450,7 +1504,11 @@ where
             )
         };
 
         // We reset the staterestorer, the next staterestorer and receiver (after crash) will reuse the page from the initial message.
-        mgr.staterestorer.reset();
+        if self.serialize_state.oom_safe() {
+            mgr.intermediate_save()?;
+        } else {
+            mgr.staterestorer.reset();
+        }
 
         /* TODO: Not sure if this is needed
         // We commit an empty NO_RESTART message to this buf, against infinite loops,