diff --git a/fuzzers/inprocess/libfuzzer_libpng/src/lib.rs b/fuzzers/inprocess/libfuzzer_libpng/src/lib.rs index 09d5357bfd..36cb52df62 100644 --- a/fuzzers/inprocess/libfuzzer_libpng/src/lib.rs +++ b/fuzzers/inprocess/libfuzzer_libpng/src/lib.rs @@ -62,6 +62,8 @@ pub extern "C" fn libafl_main() { #[cfg(not(test))] fn fuzz(corpus_dirs: &[PathBuf], objective_dir: PathBuf, broker_port: u16) -> Result<(), Error> { // 'While the stats are state, they are usually used in the broker - which is likely never restarted + + use libafl::feedbacks::simd::{SimdImplmentation, SimdMapFeedback}; let monitor = MultiMonitor::new(|s| println!("{s}")); // The restarting state will spawn the same process again as child, then restarted it each time it crashes. @@ -93,8 +95,7 @@ fn fuzz(corpus_dirs: &[PathBuf], objective_dir: PathBuf, broker_port: u16) -> Re // Create an observation channel to keep track of the execution time let time_observer = TimeObserver::new("time"); - let map_feedback = MaxMapFeedback::new(&edges_observer); - + let map_feedback = SimdMapFeedback::new(MaxMapFeedback::new(&edges_observer)); let calibration = CalibrationStage::new(&map_feedback); // Feedback to rate the interestingness of an input diff --git a/libafl/Cargo.toml b/libafl/Cargo.toml index afdef5a0d7..2b1f227d00 100644 --- a/libafl/Cargo.toml +++ b/libafl/Cargo.toml @@ -39,6 +39,7 @@ default = [ "regex", "serdeany_autoreg", "libafl_bolts/xxh3", + "stable_simd", ] document-features = ["dep:document-features"] @@ -195,6 +196,9 @@ nautilus = [ "regex", ] +## Use the best SIMD implementation by our [benchmark](https://github.com/wtdcode/libafl_simd_bench) +stable_simd = ["libafl_bolts/stable_simd"] + [[example]] name = "tui_mock" path = "./examples/tui_mock/main.rs" diff --git a/libafl/src/executors/sand.rs b/libafl/src/executors/sand.rs index ea4bf09979..48c24525e4 100644 --- a/libafl/src/executors/sand.rs +++ b/libafl/src/executors/sand.rs @@ -9,6 +9,7 @@ use core::marker::PhantomData; use libafl_bolts::{ 
AsIter, Error, Named, hash_std, + simd::std_simplify_map, tuples::{Handle, MatchName, MatchNameRef}, }; @@ -148,14 +149,10 @@ where let kind = self.executor.run_target(fuzzer, state, mgr, input)?; let ot = self.executor.observers(); let ob = ot.get(&self.ob_ref).unwrap().as_ref(); - let initial = ob.initial(); let mut covs = ob.to_vec(); match self.pattern { SANDExecutionPattern::SimplifiedTrace => { - // TODO: SIMD Optimizations - for it in &mut covs { - *it = if *it == initial { 0x1 } else { 0x80 }; - } + std_simplify_map(&mut covs); } SANDExecutionPattern::UniqueTrace => { classify_counts(covs.as_mut_slice()); diff --git a/libafl/src/feedbacks/map.rs b/libafl/src/feedbacks/map.rs index ae882134ef..dd3a157e39 100644 --- a/libafl/src/feedbacks/map.rs +++ b/libafl/src/feedbacks/map.rs @@ -1,8 +1,6 @@ //! Map feedback, maximizing or minimizing maps, for example the afl-style map observer. use alloc::{borrow::Cow, vec::Vec}; -#[rustversion::nightly] -use core::simd::prelude::SimdOrd; use core::{ fmt::Debug, marker::PhantomData, @@ -10,9 +8,9 @@ use core::{ }; #[rustversion::nightly] -use libafl_bolts::AsSlice; +use libafl_bolts::simd::std_covmap_is_interesting; use libafl_bolts::{ - AsIter, HasRefCnt, Named, + AsIter, AsSlice, HasRefCnt, Named, tuples::{Handle, Handled, MatchName, MatchNameRef}, }; use num_traits::PrimInt; @@ -548,7 +546,7 @@ where observers: &OT, _exit_kind: &ExitKind, ) -> Result { - Ok(self.is_interesting_u8_simd_optimized(state, observers)) + Ok(self.is_interesting_u8_simd_optimized(state, observers, std_covmap_is_interesting)) } } @@ -604,117 +602,6 @@ where } } -/// Specialize for the common coverage map size, maximization of u8s -#[rustversion::nightly] -impl MapFeedback -where - O: MapObserver + for<'a> AsSlice<'a, Entry = u8> + for<'a> AsIter<'a, Item = u8>, - C: CanTrack + AsRef, -{ - fn is_interesting_u8_simd_optimized(&mut self, state: &mut S, observers: &OT) -> bool - where - S: HasNamedMetadata, - OT: MatchName, - { - // 128 bits 
vectors - type VectorType = core::simd::u8x16; - - let mut interesting = false; - // TODO Replace with match_name_type when stable - let observer = observers.get(&self.map_ref).expect("MapObserver not found. This is likely because you entered the crash handler with the wrong executor/observer").as_ref(); - - let map_state = state - .named_metadata_map_mut() - .get_mut::>(&self.name) - .unwrap(); - let size = observer.usable_count(); - let len = observer.len(); - if map_state.history_map.len() < len { - map_state.history_map.resize(len, u8::default()); - } - - let map = observer.as_slice(); - debug_assert!(map.len() >= size); - - let history_map = map_state.history_map.as_slice(); - - // Non vector implementation for reference - /*for (i, history) in history_map.iter_mut().enumerate() { - let item = map[i]; - let reduced = MaxReducer::reduce(*history, item); - if DifferentIsNovel::is_novel(*history, reduced) { - *history = reduced; - interesting = true; - if self.novelties.is_some() { - self.novelties.as_mut().unwrap().push(i); - } - } - }*/ - - let steps = size / VectorType::LEN; - let left = size % VectorType::LEN; - - if let Some(novelties) = self.novelties.as_mut() { - novelties.clear(); - for step in 0..steps { - let i = step * VectorType::LEN; - let history = VectorType::from_slice(&history_map[i..]); - let items = VectorType::from_slice(&map[i..]); - - if items.simd_max(history) != history { - interesting = true; - unsafe { - for j in i..(i + VectorType::LEN) { - let item = *map.get_unchecked(j); - if item > *history_map.get_unchecked(j) { - novelties.push(j); - } - } - } - } - } - - for j in (size - left)..size { - unsafe { - let item = *map.get_unchecked(j); - if item > *history_map.get_unchecked(j) { - interesting = true; - novelties.push(j); - } - } - } - } else { - for step in 0..steps { - let i = step * VectorType::LEN; - let history = VectorType::from_slice(&history_map[i..]); - let items = VectorType::from_slice(&map[i..]); - - if 
items.simd_max(history) != history { - interesting = true; - break; - } - } - - if !interesting { - for j in (size - left)..size { - unsafe { - let item = *map.get_unchecked(j); - if item > *history_map.get_unchecked(j) { - interesting = true; - break; - } - } - } - } - } - #[cfg(feature = "track_hit_feedbacks")] - { - self.last_result = Some(interesting); - } - interesting - } -} - impl HasObserverHandle for MapFeedback { type Observer = C; @@ -789,6 +676,67 @@ where } } +/// Specialize for the common coverage map size, maximization of u8s +impl MapFeedback +where + O: MapObserver + for<'a> AsSlice<'a, Entry = u8> + for<'a> AsIter<'a, Item = u8>, + C: CanTrack + AsRef, +{ + #[allow(dead_code)] // this is true on stable wihout "stable_simd" + pub(crate) fn is_interesting_u8_simd_optimized( + &mut self, + state: &mut S, + observers: &OT, + simd: F, + ) -> bool + where + S: HasNamedMetadata, + OT: MatchName, + F: FnOnce(&[u8], &[u8], bool) -> (bool, Vec), + { + // TODO Replace with match_name_type when stable + let observer = observers.get(&self.map_ref).expect("MapObserver not found. 
This is likely because you entered the crash handler with the wrong executor/observer").as_ref(); + + let map_state = state + .named_metadata_map_mut() + .get_mut::>(&self.name) + .unwrap(); + let size = observer.usable_count(); + let len = observer.len(); + if map_state.history_map.len() < len { + map_state.history_map.resize(len, u8::default()); + } + + let map = observer.as_slice(); + debug_assert!(map.len() >= size); + + let history_map = map_state.history_map.as_slice(); + + // Non vector implementation for reference + /*for (i, history) in history_map.iter_mut().enumerate() { + let item = map[i]; + let reduced = MaxReducer::reduce(*history, item); + if DifferentIsNovel::is_novel(*history, reduced) { + *history = reduced; + interesting = true; + if self.novelties.is_some() { + self.novelties.as_mut().unwrap().push(i); + } + } + }*/ + + let (interesting, novelties) = simd(history_map, &map, self.novelties.is_some()); + if let Some(nov) = self.novelties.as_mut() { + *nov = novelties; + } + #[cfg(feature = "track_hit_feedbacks")] + { + self.last_result = Some(interesting); + } + interesting + } +} + #[cfg(test)] mod tests { use crate::feedbacks::{AllIsNovel, IsNovel, NextPow2IsNovel}; diff --git a/libafl/src/feedbacks/mod.rs b/libafl/src/feedbacks/mod.rs index 60a6217227..9460be35ee 100644 --- a/libafl/src/feedbacks/mod.rs +++ b/libafl/src/feedbacks/mod.rs @@ -46,6 +46,8 @@ pub mod map; pub mod nautilus; #[cfg(feature = "std")] pub mod new_hash_feedback; +#[cfg(feature = "stable_simd")] +pub mod simd; #[cfg(feature = "std")] pub mod stdio; pub mod transferred; diff --git a/libafl/src/feedbacks/simd.rs b/libafl/src/feedbacks/simd.rs new file mode 100644 index 0000000000..e64ffcc9bc --- /dev/null +++ b/libafl/src/feedbacks/simd.rs @@ -0,0 +1,195 @@ +//! SIMD accelerated map feedback with stable Rust. 
+ +use alloc::{borrow::Cow, vec::Vec}; +use core::{ + fmt::Debug, + ops::{Deref, DerefMut}, +}; + +use libafl_bolts::{ + AsIter, AsSlice, Error, Named, + simd::{ + covmap_is_interesting_naive, covmap_is_interesting_u8x16, covmap_is_interesting_u8x32, + std_covmap_is_interesting, + }, + tuples::{Handle, MatchName}, +}; +use serde::{Serialize, de::DeserializeOwned}; + +use super::{ + DifferentIsNovel, Feedback, HasObserverHandle, MapFeedback, MaxReducer, StateInitializer, +}; +#[cfg(feature = "introspection")] +use crate::state::HasClientPerfMonitor; +use crate::{ + HasNamedMetadata, + corpus::Testcase, + events::EventFirer, + executors::ExitKind, + observers::{CanTrack, MapObserver}, + state::HasExecutions, +}; + +/// The coverage map SIMD acceleration to use. +/// Benchmark is available at <https://github.com/wtdcode/libafl_simd_bench> +#[derive(Debug, Clone, Default, Copy)] +pub enum SimdImplmentation { + /// The u8x16 implementation from wide, usually the fastest + #[default] + WideU8x16, + /// The u8x32 implementation from wide, slightly slower than u8x16 (~1%) + WideU8x32, + /// Naive implementation, reference only + Naive, +} + +impl SimdImplmentation { + fn dispatch_simd(self) -> CoverageMapFunPtr { + match self { + SimdImplmentation::WideU8x16 => covmap_is_interesting_u8x16, + SimdImplmentation::WideU8x32 => covmap_is_interesting_u8x32, + SimdImplmentation::Naive => covmap_is_interesting_naive, + } + } +} + +type CoverageMapFunPtr = fn(&[u8], &[u8], bool) -> (bool, Vec); + +/// Stable Rust wrapper for SIMD accelerated map feedback. Unfortunately, we have to +/// keep this until specialization is stabilized (not yet since 2016). +#[derive(Debug, Clone)] +pub struct SimdMapFeedback { + map: MapFeedback, + simd: CoverageMapFunPtr, +} + +impl SimdMapFeedback { + /// Wraps an existing map and enables SIMD acceleration. This will use standard SIMD + /// implementation, which might vary based on target architecture according to our + /// benchmark. 
+ #[must_use] + pub fn new(map: MapFeedback) -> Self { + Self { + map, + simd: std_covmap_is_interesting, + } + } + + /// Wraps an existing map and enable SIMD acceleration according to arguments. + #[must_use] + pub fn with_simd( + map: MapFeedback, + simd: SimdImplmentation, + ) -> Self { + Self { + map, + simd: simd.dispatch_simd(), + } + } +} + +impl Deref for SimdMapFeedback { + type Target = MapFeedback; + fn deref(&self) -> &Self::Target { + &self.map + } +} + +impl DerefMut for SimdMapFeedback { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.map + } +} + +impl StateInitializer for SimdMapFeedback +where + O: MapObserver, + O::Entry: 'static + Default + Debug + DeserializeOwned + Serialize, + S: HasNamedMetadata, +{ + fn init_state(&mut self, state: &mut S) -> Result<(), Error> { + self.map.init_state(state) + } +} + +impl HasObserverHandle for SimdMapFeedback { + type Observer = C; + + #[inline] + fn observer_handle(&self) -> &Handle { + self.map.observer_handle() + } +} + +impl Named for SimdMapFeedback { + #[inline] + fn name(&self) -> &Cow<'static, str> { + self.map.name() + } +} + +// Delegate implementations to inner mapping except is_interesting +impl Feedback for SimdMapFeedback +where + C: CanTrack + AsRef, + EM: EventFirer, + O: MapObserver + for<'a> AsSlice<'a, Entry = u8> + for<'a> AsIter<'a, Item = u8>, + OT: MatchName, + S: HasNamedMetadata + HasExecutions, +{ + fn is_interesting( + &mut self, + state: &mut S, + _manager: &mut EM, + _input: &I, + observers: &OT, + _exit_kind: &ExitKind, + ) -> Result { + let res = self + .map + .is_interesting_u8_simd_optimized(state, observers, self.simd); + Ok(res) + } + + #[cfg(feature = "introspection")] + fn is_interesting_introspection( + &mut self, + state: &mut S, + manager: &mut EM, + input: &I, + observers: &OT, + exit_kind: &ExitKind, + ) -> Result + where + S: HasClientPerfMonitor, + { + self.map + .is_interesting_introspection(state, manager, input, observers, exit_kind) + } + + 
#[cfg(feature = "track_hit_feedbacks")] + fn last_result(&self) -> Result { + // cargo +nightly doc asks so + as Feedback>::last_result( + &self.map, + ) + } + + #[cfg(feature = "track_hit_feedbacks")] + fn append_hit_feedbacks(&self, list: &mut Vec>) -> Result<(), Error> { + // cargo +nightly doc asks so + as Feedback>::append_hit_feedbacks(&self.map, list) + } + + #[inline] + fn append_metadata( + &mut self, + state: &mut S, + manager: &mut EM, + observers: &OT, + testcase: &mut Testcase, + ) -> Result<(), Error> { + self.map + .append_metadata(state, manager, observers, testcase) + } +} diff --git a/libafl_bolts/Cargo.toml b/libafl_bolts/Cargo.toml index a49af1173c..48fd55070f 100644 --- a/libafl_bolts/Cargo.toml +++ b/libafl_bolts/Cargo.toml @@ -52,6 +52,7 @@ std = [ "uds", "serial_test", "alloc", + "stable_simd", ] ## Enables all features that allocate in `no_std` @@ -117,9 +118,20 @@ llmp_debug = ["alloc", "std"] ## Reduces the initial map size for llmp llmp_small_maps = ["alloc"] +#! ### Stable SIMD features + +## Use the best SIMD implementation by our [benchmark](https://github.com/wtdcode/libafl_simd_bench) +stable_simd = ["alloc", "wide"] + [build-dependencies] rustversion = { workspace = true } +[dev-dependencies] +clap = { version = "4.5", features = ["derive", "env"] } +rand = "0.9.0" +chrono = "0.4.40" +itertools = "0.14.0" + [dependencies] libafl_derive = { workspace = true, default-features = true, optional = true } static_assertions = { workspace = true } @@ -165,6 +177,10 @@ serial_test = { workspace = true, optional = true, default-features = false, fea "logging", ] } +# optional stable simd, pin to a commit due to `u8x32` not released yet. Switch as long as next release is out! 
+wide = { git = "https://github.com/Lokathor/wide", rev = "71b5df0b2620da753836fafce5f99076181a49fe", optional = true } +rustversion = { workspace = true } + # Document all features of this crate (for `cargo doc`) document-features = { workspace = true, optional = true } @@ -212,3 +228,11 @@ mach = "0.3.2" name = "llmp_test" path = "./examples/llmp_test/main.rs" required-features = ["std"] + + +[[example]] +name = "simd" +path = "./examples/simd/simd.rs" +bench = true +harness = false +required-features = ["std", "stable_simd"] diff --git a/libafl_bolts/examples/simd/simd.rs b/libafl_bolts/examples/simd/simd.rs new file mode 100644 index 0000000000..7651cd041d --- /dev/null +++ b/libafl_bolts/examples/simd/simd.rs @@ -0,0 +1,200 @@ +use chrono::Utc; +use clap::Parser; +use itertools::Itertools; +use libafl_bolts::simd::{ + covmap_is_interesting_naive, covmap_is_interesting_u8x16, covmap_is_interesting_u8x32, + simplify_map_naive, simplify_map_u8x16, simplify_map_u8x32, +}; +use rand::{RngCore, rngs::ThreadRng}; + +#[derive(Parser)] +struct Cli { + #[arg(short, long, default_value_t = 2097152, env = "LIBAFL_BENCH_MAP_SIZE")] + pub map: usize, + #[arg(short, long, default_value_t = 32768, env = "LIBAFL_BENCH_ROUNDS")] + pub rounds: usize, + #[arg(short, long, env = "LIBAFL_BENCH_CORRECTNESS")] + pub validate: bool, + #[arg(short, long)] + pub bench: bool, // ?? Cargo sends this?? 
+} + +fn random_bits(map: &mut [u8], rng: &mut ThreadRng) { + // randomly set a bit since coverage map is usually sparse enough + let rng = rng.next_u64() as usize; + let bytes_idx = (rng / 8) % map.len(); + let bits_idx = rng % 8; + map[bytes_idx] |= 1 << bits_idx; +} + +fn clean_vectors(map: &mut [u8]) { + for it in map.iter_mut() { + *it = 0; + } +} + +struct SimplifyMapInput { + name: String, + func: fn(&mut [u8]), + map: Vec, + rounds: usize, + validate: bool, + rng: ThreadRng, +} + +impl SimplifyMapInput { + fn from_cli(name: &str, f: fn(&mut [u8]), cli: &Cli, rng: &ThreadRng) -> Self { + Self { + name: name.to_string(), + func: f, + map: vec![0; cli.map], + rng: rng.clone(), + rounds: cli.rounds, + validate: cli.validate, + } + } + fn measure_simplify_input(mut self) -> Vec { + println!("Running {}", &self.name); + let mut outs = vec![]; + println!("warm up..."); + for _ in 0..16 { + (self.func)(&mut self.map); + } + clean_vectors(&mut self.map); + for _ in 0..self.rounds { + random_bits(&mut self.map, &mut self.rng); + let before = Utc::now(); + + if self.validate { + let mut mp = self.map.clone(); + (self.func)(&mut self.map); + simplify_map_naive(&mut mp); + + assert!( + mp == self.map, + "Incorrect covmap impl. 
{:?} vs\n{:?}", + mp, + self.map + ); + } else { + (self.func)(&mut self.map); + } + let after = Utc::now(); + outs.push(after - before); + } + + outs + } +} + +type CovFuncPtr = fn(&[u8], &[u8], bool) -> (bool, Vec); + +struct CovInput { + name: String, + func: CovFuncPtr, + hist: Vec, + map: Vec, + rounds: usize, + validate: bool, + rng: ThreadRng, +} + +impl CovInput { + fn from_cli(name: &str, f: CovFuncPtr, cli: &Cli, rng: &ThreadRng) -> Self { + CovInput { + name: name.to_string(), + func: f, + hist: vec![0; cli.map], + map: vec![0; cli.map], + rng: rng.clone(), + rounds: cli.rounds, + validate: cli.validate, + } + } + fn measure_cov(mut self) -> Vec { + println!("Running {}", &self.name); + let mut outs = vec![]; + println!("warm up..."); + for _ in 0..16 { + (self.func)(&self.hist, &self.map, true); + } + clean_vectors(&mut self.hist); + clean_vectors(&mut self.map); + for _ in 0..self.rounds { + random_bits(&mut self.map, &mut self.rng); + let before = Utc::now(); + let (interesting, novelties) = (self.func)(&self.hist, &self.map, true); + if self.validate { + let (canonical_interesting, canonical_novelties) = + covmap_is_interesting_naive(&self.hist, &self.map, true); + + assert!( + canonical_interesting == interesting && novelties == canonical_novelties, + "Incorrect covmap impl. 
{canonical_interesting} vs {interesting}, {canonical_novelties:?} vs\n{novelties:?}" + ); + } + let after = Utc::now(); + outs.push(after - before); + } + + outs + } +} + +#[allow(clippy::cast_precision_loss)] +fn printout(ty: &str, tms: &[chrono::TimeDelta]) { + let tms = tms + .iter() + .map(|t| t.to_std().unwrap().as_secs_f64()) + .collect_vec(); + let mean = tms.iter().sum::<f64>() / tms.len() as f64; + let min = tms.iter().fold(f64::INFINITY, |acc, x| acc.min(*x)); // seed with +inf: a 0.0 seed always wins against non-negative durations + let max = tms.iter().fold(0f64, |acc, x| acc.max(*x)); + let std = (tms + .iter() + .fold(0f64, |acc, x| acc + (*x - mean) * (*x - mean)) + / (tms.len() - 1) as f64) + .sqrt(); + let sum: f64 = tms.into_iter().sum(); + println!( + "{}: avg {:.03}, min {:.03}, max {:.03}, std {:.03}, sum {:.03}", + ty, + mean * 1000.0, + min * 1000.0, + max * 1000.0, + std * 1000.0, + sum * 1000.0 + ); +} + +fn main() { + // Bench with `taskset -c 3 cargo bench --example simd` + // Validate with `cargo bench --example simd -- --validate --rounds 8192` + let cli = Cli::parse(); + + let rng = rand::rng(); + + let simpls = [ + SimplifyMapInput::from_cli("naive simplify_map", simplify_map_naive, &cli, &rng), + SimplifyMapInput::from_cli("u8x16 simplify_map", simplify_map_u8x16, &cli, &rng), + SimplifyMapInput::from_cli("u8x32 simplify_map", simplify_map_u8x32, &cli, &rng), + ]; + + for bench in simpls { + let name = bench.name.clone(); + let outs = bench.measure_simplify_input(); + printout(&name, &outs); + } + + let benches = [ + CovInput::from_cli("naive cov", covmap_is_interesting_naive, &cli, &rng), + CovInput::from_cli("u8x16 cov", covmap_is_interesting_u8x16, &cli, &rng), + CovInput::from_cli("u8x32 cov", covmap_is_interesting_u8x32, &cli, &rng), + ]; + + for bench in benches { + let name = bench.name.clone(); + let outs = bench.measure_cov(); + printout(&name, &outs); + } +} diff --git a/libafl_bolts/src/lib.rs b/libafl_bolts/src/lib.rs index deb675f4d9..e41ad4715c 100644 --- a/libafl_bolts/src/lib.rs +++ 
b/libafl_bolts/src/lib.rs @@ -122,6 +122,8 @@ pub mod target_args; #[cfg(all(feature = "std", unix))] pub use target_args::*; +pub mod simd; + /// The purpose of this module is to alleviate imports of the bolts by adding a glob import. #[cfg(feature = "prelude")] pub mod bolts_prelude { diff --git a/libafl_bolts/src/simd.rs b/libafl_bolts/src/simd.rs new file mode 100644 index 0000000000..329baedb0d --- /dev/null +++ b/libafl_bolts/src/simd.rs @@ -0,0 +1,295 @@ +//! Module for SIMD assisted methods. + +#[cfg(feature = "alloc")] +use alloc::{vec, vec::Vec}; + +/// `simplify_map` naive implementation. In most cases, this can be auto-vectorized. +pub fn simplify_map_naive(map: &mut [u8]) { + for it in map.iter_mut() { + *it = if *it == 0 { 0x1 } else { 0x80 }; + } +} + +/// `simplify_map` implementation by u8x16, worse performance compared to LLVM +/// auto-vectorization but faster if LLVM doesn't vectorize. +#[cfg(feature = "wide")] +pub fn simplify_map_u8x16(map: &mut [u8]) { + type VectorType = wide::u8x16; + const N: usize = VectorType::LANES as usize; + let size = map.len(); + let steps = size / N; + let left = size % N; + let lhs = VectorType::new([0x1; N]); + let rhs = VectorType::new([0x80; N]); + + for step in 0..steps { + let i = step * N; + let mp = VectorType::new(map[i..(i + N)].try_into().unwrap()); + + let mask = mp.cmp_eq(VectorType::ZERO); + let out = mask.blend(lhs, rhs); + map[i..i + N].copy_from_slice(out.as_array_ref()); + } + + #[allow(clippy::needless_range_loop)] + for j in (size - left)..size { + map[j] = if map[j] == 0 { 0x1 } else { 0x80 } + } +} + +/// `simplify_map` implementation by u8x32, achieving comparable performance with +/// LLVM auto-vectorization. 
+#[cfg(feature = "wide")] +pub fn simplify_map_u8x32(map: &mut [u8]) { + use wide::CmpEq; + + type VectorType = wide::u8x32; + const N: usize = VectorType::LANES as usize; + let size = map.len(); + let steps = size / N; + let left = size % N; + let lhs = VectorType::new([0x01; 32]); + let rhs = VectorType::new([0x80; 32]); + + for step in 0..steps { + let i = step * N; + let mp = VectorType::new(map[i..i + N].try_into().unwrap()); + + let mask = mp.cmp_eq(VectorType::ZERO); + let out = mask.blend(lhs, rhs); + unsafe { + out.as_array_ref() + .as_ptr() + .copy_to_nonoverlapping(map.as_mut_ptr().add(i), N); + } + } + + #[allow(clippy::needless_range_loop)] + for j in (size - left)..size { + map[j] = if map[j] == 0 { 0x1 } else { 0x80 } + } +} + +/// The std implementation of `simplify_map`. Use the fastest implementation by benchmark by default. +pub fn std_simplify_map(map: &mut [u8]) { + #[cfg(not(feature = "wide"))] + simplify_map_naive(map); + + #[cfg(feature = "wide")] + simplify_map_u8x32(map); +} + +/// Coverage map interesting implementation by u8x16. Slightly faster than nightly simd. 
+#[cfg(all(feature = "alloc", feature = "wide"))] +#[must_use] +pub fn covmap_is_interesting_u8x16( + hist: &[u8], + map: &[u8], + collect_novelties: bool, +) -> (bool, Vec) { + type VectorType = wide::u8x16; + let mut novelties = vec![]; + let mut interesting = false; + let size = map.len(); + let steps = size / VectorType::LANES as usize; + let left = size % VectorType::LANES as usize; + + if collect_novelties { + for step in 0..steps { + let i = step * VectorType::LANES as usize; + let history = + VectorType::new(hist[i..i + VectorType::LANES as usize].try_into().unwrap()); + let items = VectorType::new(map[i..i + VectorType::LANES as usize].try_into().unwrap()); + + if items.max(history) != history { + interesting = true; + unsafe { + for j in i..(i + VectorType::LANES as usize) { + let item = *map.get_unchecked(j); + if item > *hist.get_unchecked(j) { + novelties.push(j); + } + } + } + } + } + + for j in (size - left)..size { + unsafe { + let item = *map.get_unchecked(j); + if item > *hist.get_unchecked(j) { + interesting = true; + novelties.push(j); + } + } + } + } else { + for step in 0..steps { + let i = step * VectorType::LANES as usize; + let history = + VectorType::new(hist[i..i + VectorType::LANES as usize].try_into().unwrap()); + let items = VectorType::new(map[i..i + VectorType::LANES as usize].try_into().unwrap()); + + if items.max(history) != history { + interesting = true; + break; + } + } + + if !interesting { + for j in (size - left)..size { + unsafe { + let item = *map.get_unchecked(j); + if item > *hist.get_unchecked(j) { + interesting = true; + break; + } + } + } + } + } + + (interesting, novelties) +} + +/// Coverage map interesting implementation by u8x32. Slightly faster than nightly simd but slightly +/// slower than u8x16 version. 
+#[cfg(all(feature = "alloc", feature = "wide"))] +#[must_use] +pub fn covmap_is_interesting_u8x32( + hist: &[u8], + map: &[u8], + collect_novelties: bool, +) -> (bool, Vec) { + type VectorType = wide::u8x32; + const N: usize = VectorType::LANES as usize; + let mut novelties = vec![]; + let mut interesting = false; + let size = map.len(); + let steps = size / N; + let left = size % N; + + if collect_novelties { + for step in 0..steps { + let i = step * N; + let history = VectorType::new(hist[i..i + N].try_into().unwrap()); + let items = VectorType::new(map[i..i + N].try_into().unwrap()); + + if items.max(history) != history { + interesting = true; + unsafe { + // Break into two loops so that LLVM will vectorize both loops. + // Or LLVM won't vectorize them and is super slow. We need a few + // extra intrinsic to wide and safe_arch to vectorize this manually. + for j in i..(i + N / 2) { + let item = *map.get_unchecked(j); + if item > *hist.get_unchecked(j) { + novelties.push(j); + } + } + + for j in (i + N / 2)..(i + N) { + let item = *map.get_unchecked(j); + if item > *hist.get_unchecked(j) { + novelties.push(j); + } + } + } + } + } + + for j in (size - left)..size { + unsafe { + let item = *map.get_unchecked(j); + if item > *hist.get_unchecked(j) { + interesting = true; + novelties.push(j); + } + } + } + } else { + for step in 0..steps { + let i = step * N; + let history = VectorType::new(hist[i..i + N].try_into().unwrap()); + let items = VectorType::new(map[i..i + N].try_into().unwrap()); + + if items.max(history) != history { + interesting = true; + break; + } + } + + if !interesting { + for j in (size - left)..size { + unsafe { + let item = *map.get_unchecked(j); + if item > *hist.get_unchecked(j) { + interesting = true; + break; + } + } + } + } + } + + (interesting, novelties) +} + +/// Coverage map interesting naive implementation. Do not use it unless you have strong reasons to do so. 
+#[cfg(feature = "alloc")] +#[must_use] +pub fn covmap_is_interesting_naive( + hist: &[u8], + map: &[u8], + collect_novelties: bool, +) -> (bool, Vec) { + let mut novelties = vec![]; + let mut interesting = false; + let initial = 0; + if collect_novelties { + for (i, item) in map.iter().enumerate().filter(|(_, item)| **item != initial) { + let existing = unsafe { *hist.get_unchecked(i) }; + let reduced = existing.max(*item); + if existing != reduced { + interesting = true; + novelties.push(i); + } + } + } else { + for (i, item) in map.iter().enumerate().filter(|(_, item)| **item != initial) { + let existing = unsafe { *hist.get_unchecked(i) }; + let reduced = existing.max(*item); + if existing != reduced { + interesting = true; + break; + } + } + } + + (interesting, novelties) +} + +/// Standard coverage map interesting implementation. Use the available fastest implementation by default. +#[cfg(feature = "alloc")] +#[allow(unused_variables)] // or we fail cargo doc +#[must_use] +pub fn std_covmap_is_interesting( + hist: &[u8], + map: &[u8], + collect_novelties: bool, +) -> (bool, Vec) { + #[cfg(not(feature = "wide"))] + return covmap_is_interesting_naive(hist, map, collect_novelties); + + #[cfg(feature = "wide")] + { + // Supported by benchmark: + // - on aarch64, u8x32 is 15% faster than u8x16 + // - on amd64, u8x16 is 10% faster compared to the u8x32 + #[cfg(target_arch = "aarch64")] + return covmap_is_interesting_u8x32(hist, map, collect_novelties); + + #[cfg(not(target_arch = "aarch64"))] + return covmap_is_interesting_u8x16(hist, map, collect_novelties); + } +}