diff --git a/fuzzers/inprocess/libfuzzer_libpng/src/lib.rs b/fuzzers/inprocess/libfuzzer_libpng/src/lib.rs
index 09d5357bfd..36cb52df62 100644
--- a/fuzzers/inprocess/libfuzzer_libpng/src/lib.rs
+++ b/fuzzers/inprocess/libfuzzer_libpng/src/lib.rs
@@ -62,6 +62,8 @@ pub extern "C" fn libafl_main() {
 #[cfg(not(test))]
 fn fuzz(corpus_dirs: &[PathBuf], objective_dir: PathBuf, broker_port: u16) -> Result<(), Error> {
     // 'While the stats are state, they are usually used in the broker - which is likely never restarted
+
+    use libafl::feedbacks::simd::{SimdImplmentation, SimdMapFeedback};
     let monitor = MultiMonitor::new(|s| println!("{s}"));
 
     // The restarting state will spawn the same process again as child, then restarted it each time it crashes.
@@ -93,8 +95,7 @@ fn fuzz(corpus_dirs: &[PathBuf], objective_dir: PathBuf, broker_port: u16) -> Re
     // Create an observation channel to keep track of the execution time
     let time_observer = TimeObserver::new("time");
 
-    let map_feedback = MaxMapFeedback::new(&edges_observer);
-
+    let map_feedback = SimdMapFeedback::new(MaxMapFeedback::new(&edges_observer));
     let calibration = CalibrationStage::new(&map_feedback);
 
     // Feedback to rate the interestingness of an input
diff --git a/libafl/Cargo.toml b/libafl/Cargo.toml
index afdef5a0d7..2b1f227d00 100644
--- a/libafl/Cargo.toml
+++ b/libafl/Cargo.toml
@@ -39,6 +39,7 @@ default = [
   "regex",
   "serdeany_autoreg",
   "libafl_bolts/xxh3",
+  "stable_simd",
 ]
 document-features = ["dep:document-features"]
 
@@ -195,6 +196,9 @@ nautilus = [
   "regex",
 ]
 
+## Use the best SIMD implementation by our [benchmark](https://github.com/wtdcode/libafl_simd_bench)
+stable_simd = ["libafl_bolts/stable_simd"]
+
 [[example]]
 name = "tui_mock"
 path = "./examples/tui_mock/main.rs"
diff --git a/libafl/src/executors/sand.rs b/libafl/src/executors/sand.rs
index ea4bf09979..48c24525e4 100644
--- a/libafl/src/executors/sand.rs
+++ b/libafl/src/executors/sand.rs
@@ -9,6 +9,7 @@ use core::marker::PhantomData;
 
 use libafl_bolts::{
     AsIter, Error, Named, hash_std,
+    simd::std_simplify_map,
     tuples::{Handle, MatchName, MatchNameRef},
 };
 
@@ -148,14 +149,10 @@ where
         let kind = self.executor.run_target(fuzzer, state, mgr, input)?;
         let ot = self.executor.observers();
         let ob = ot.get(&self.ob_ref).unwrap().as_ref();
-        let initial = ob.initial();
         let mut covs = ob.to_vec();
         match self.pattern {
             SANDExecutionPattern::SimplifiedTrace => {
-                // TODO: SIMD Optimizations
-                for it in &mut covs {
-                    *it = if *it == initial { 0x1 } else { 0x80 };
-                }
+                std_simplify_map(&mut covs);
             }
             SANDExecutionPattern::UniqueTrace => {
                 classify_counts(covs.as_mut_slice());
diff --git a/libafl/src/feedbacks/map.rs b/libafl/src/feedbacks/map.rs
index ae882134ef..dd3a157e39 100644
--- a/libafl/src/feedbacks/map.rs
+++ b/libafl/src/feedbacks/map.rs
@@ -1,8 +1,6 @@
 //! Map feedback, maximizing or minimizing maps, for example the afl-style map observer.
 
 use alloc::{borrow::Cow, vec::Vec};
-#[rustversion::nightly]
-use core::simd::prelude::SimdOrd;
 use core::{
     fmt::Debug,
     marker::PhantomData,
@@ -10,9 +8,9 @@ use core::{
 };
 
 #[rustversion::nightly]
-use libafl_bolts::AsSlice;
+use libafl_bolts::simd::std_covmap_is_interesting;
 use libafl_bolts::{
-    AsIter, HasRefCnt, Named,
+    AsIter, AsSlice, HasRefCnt, Named,
     tuples::{Handle, Handled, MatchName, MatchNameRef},
 };
 use num_traits::PrimInt;
@@ -548,7 +546,7 @@ where
         observers: &OT,
         _exit_kind: &ExitKind,
     ) -> Result<bool, Error> {
-        Ok(self.is_interesting_u8_simd_optimized(state, observers))
+        Ok(self.is_interesting_u8_simd_optimized(state, observers, std_covmap_is_interesting))
     }
 }
 
@@ -604,117 +602,6 @@ where
     }
 }
 
-/// Specialize for the common coverage map size, maximization of u8s
-#[rustversion::nightly]
-impl<C, O> MapFeedback<C, DifferentIsNovel, O, MaxReducer>
-where
-    O: MapObserver<Entry = u8> + for<'a> AsSlice<'a, Entry = u8> + for<'a> AsIter<'a, Item = u8>,
-    C: CanTrack + AsRef<O>,
-{
-    fn is_interesting_u8_simd_optimized<S, OT>(&mut self, state: &mut S, observers: &OT) -> bool
-    where
-        S: HasNamedMetadata,
-        OT: MatchName,
-    {
-        // 128 bits vectors
-        type VectorType = core::simd::u8x16;
-
-        let mut interesting = false;
-        // TODO Replace with match_name_type when stable
-        let observer = observers.get(&self.map_ref).expect("MapObserver not found. This is likely because you entered the crash handler with the wrong executor/observer").as_ref();
-
-        let map_state = state
-            .named_metadata_map_mut()
-            .get_mut::<MapFeedbackMetadata<u8>>(&self.name)
-            .unwrap();
-        let size = observer.usable_count();
-        let len = observer.len();
-        if map_state.history_map.len() < len {
-            map_state.history_map.resize(len, u8::default());
-        }
-
-        let map = observer.as_slice();
-        debug_assert!(map.len() >= size);
-
-        let history_map = map_state.history_map.as_slice();
-
-        // Non vector implementation for reference
-        /*for (i, history) in history_map.iter_mut().enumerate() {
-            let item = map[i];
-            let reduced = MaxReducer::reduce(*history, item);
-            if DifferentIsNovel::is_novel(*history, reduced) {
-                *history = reduced;
-                interesting = true;
-                if self.novelties.is_some() {
-                    self.novelties.as_mut().unwrap().push(i);
-                }
-            }
-        }*/
-
-        let steps = size / VectorType::LEN;
-        let left = size % VectorType::LEN;
-
-        if let Some(novelties) = self.novelties.as_mut() {
-            novelties.clear();
-            for step in 0..steps {
-                let i = step * VectorType::LEN;
-                let history = VectorType::from_slice(&history_map[i..]);
-                let items = VectorType::from_slice(&map[i..]);
-
-                if items.simd_max(history) != history {
-                    interesting = true;
-                    unsafe {
-                        for j in i..(i + VectorType::LEN) {
-                            let item = *map.get_unchecked(j);
-                            if item > *history_map.get_unchecked(j) {
-                                novelties.push(j);
-                            }
-                        }
-                    }
-                }
-            }
-
-            for j in (size - left)..size {
-                unsafe {
-                    let item = *map.get_unchecked(j);
-                    if item > *history_map.get_unchecked(j) {
-                        interesting = true;
-                        novelties.push(j);
-                    }
-                }
-            }
-        } else {
-            for step in 0..steps {
-                let i = step * VectorType::LEN;
-                let history = VectorType::from_slice(&history_map[i..]);
-                let items = VectorType::from_slice(&map[i..]);
-
-                if items.simd_max(history) != history {
-                    interesting = true;
-                    break;
-                }
-            }
-
-            if !interesting {
-                for j in (size - left)..size {
-                    unsafe {
-                        let item = *map.get_unchecked(j);
-                        if item > *history_map.get_unchecked(j) {
-                            interesting = true;
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-        #[cfg(feature = "track_hit_feedbacks")]
-        {
-            self.last_result = Some(interesting);
-        }
-        interesting
-    }
-}
-
 impl<C, N, O, R> HasObserverHandle for MapFeedback<C, N, O, R> {
     type Observer = C;
 
@@ -789,6 +676,67 @@ where
     }
 }
 
+/// Specialize for the common coverage map size, maximization of u8s
+impl<C, O> MapFeedback<C, DifferentIsNovel, O, MaxReducer>
+where
+    O: MapObserver<Entry = u8> + for<'a> AsSlice<'a, Entry = u8> + for<'a> AsIter<'a, Item = u8>,
+    C: CanTrack + AsRef<O>,
+{
+    /// Compares the observed `u8` map against the stored history map through `simd` and
+    /// returns its verdict. Replaces the recorded novelties when novelty tracking is enabled.
+    #[allow(dead_code)] // this is true on stable without "stable_simd"
+    pub(crate) fn is_interesting_u8_simd_optimized<S, OT, F>(
+        &mut self,
+        state: &mut S,
+        observers: &OT,
+        simd: F,
+    ) -> bool
+    where
+        S: HasNamedMetadata,
+        OT: MatchName,
+        F: FnOnce(&[u8], &[u8], bool) -> (bool, Vec<usize>),
+    {
+        // TODO Replace with match_name_type when stable
+        let observer = observers.get(&self.map_ref).expect("MapObserver not found. This is likely because you entered the crash handler with the wrong executor/observer").as_ref();
+
+        let map_state = state
+            .named_metadata_map_mut()
+            .get_mut::<MapFeedbackMetadata<u8>>(&self.name)
+            .unwrap();
+        let size = observer.usable_count();
+        let len = observer.len();
+        if map_state.history_map.len() < len {
+            map_state.history_map.resize(len, u8::default());
+        }
+
+        let map = observer.as_slice();
+        debug_assert!(map.len() >= size);
+        let history_map = map_state.history_map.as_slice();
+        // Non vector implementation for reference
+        /*for (i, history) in history_map.iter_mut().enumerate() {
+            let item = map[i];
+            let reduced = MaxReducer::reduce(*history, item);
+            if DifferentIsNovel::is_novel(*history, reduced) {
+                *history = reduced;
+                interesting = true;
+                if self.novelties.is_some() {
+                    self.novelties.as_mut().unwrap().push(i);
+                }
+            }
+        }*/
+        // Restrict the comparison to the `usable_count()` prefix of the observed map.
+        let (interesting, novelties) = simd(history_map, &map[..size], self.novelties.is_some());
+        if let Some(nov) = self.novelties.as_mut() {
+            *nov = novelties;
+        }
+        #[cfg(feature = "track_hit_feedbacks")]
+        {
+            self.last_result = Some(interesting);
+        }
+        interesting
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::feedbacks::{AllIsNovel, IsNovel, NextPow2IsNovel};
diff --git a/libafl/src/feedbacks/mod.rs b/libafl/src/feedbacks/mod.rs
index 60a6217227..9460be35ee 100644
--- a/libafl/src/feedbacks/mod.rs
+++ b/libafl/src/feedbacks/mod.rs
@@ -46,6 +46,8 @@ pub mod map;
 pub mod nautilus;
 #[cfg(feature = "std")]
 pub mod new_hash_feedback;
+#[cfg(feature = "stable_simd")]
+pub mod simd;
 #[cfg(feature = "std")]
 pub mod stdio;
 pub mod transferred;
diff --git a/libafl/src/feedbacks/simd.rs b/libafl/src/feedbacks/simd.rs
new file mode 100644
index 0000000000..e64ffcc9bc
--- /dev/null
+++ b/libafl/src/feedbacks/simd.rs
@@ -0,0 +1,195 @@
+//! SIMD accelerated map feedback with stable Rust.
+
+use alloc::{borrow::Cow, vec::Vec};
+use core::{
+    fmt::Debug,
+    ops::{Deref, DerefMut},
+};
+
+use libafl_bolts::{
+    AsIter, AsSlice, Error, Named,
+    simd::{
+        covmap_is_interesting_naive, covmap_is_interesting_u8x16, covmap_is_interesting_u8x32,
+        std_covmap_is_interesting,
+    },
+    tuples::{Handle, MatchName},
+};
+use serde::{Serialize, de::DeserializeOwned};
+
+use super::{
+    DifferentIsNovel, Feedback, HasObserverHandle, MapFeedback, MaxReducer, StateInitializer,
+};
+#[cfg(feature = "introspection")]
+use crate::state::HasClientPerfMonitor;
+use crate::{
+    HasNamedMetadata,
+    corpus::Testcase,
+    events::EventFirer,
+    executors::ExitKind,
+    observers::{CanTrack, MapObserver},
+    state::HasExecutions,
+};
+
+/// The coverage map SIMD acceleration to use.
+/// Benchmark is available at <https://github.com/wtdcode/libafl_simd_bench>
+#[derive(Debug, Clone, Default, Copy)]
+pub enum SimdImplmentation { // NOTE(review): misspelled name; a rename would break the public API
+    /// The `u8x16` implementation built on `wide`, usually the fastest
+    #[default]
+    WideU8x16,
+    /// The `u8x32` implementation built on `wide`, slightly slower than `u8x16` (~1%)
+    WideU8x32,
+    /// Naive implementation, reference only
+    Naive,
+}
+
+impl SimdImplmentation {
+    fn dispatch_simd(self) -> CoverageMapFunPtr { // picks the matching `libafl_bolts::simd` routine
+        match self {
+            SimdImplmentation::WideU8x16 => covmap_is_interesting_u8x16,
+            SimdImplmentation::WideU8x32 => covmap_is_interesting_u8x32,
+            SimdImplmentation::Naive => covmap_is_interesting_naive,
+        }
+    }
+}
+
+type CoverageMapFunPtr = fn(&[u8], &[u8], bool) -> (bool, Vec<usize>); // (hist, map, collect_novelties)
+
+/// Stable Rust wrapper for SIMD accelerated map feedback. Unfortunately, we have to
+/// keep this until specialization is stabilized (not yet since 2016).
+#[derive(Debug, Clone)]
+pub struct SimdMapFeedback<C, O> {
+    map: MapFeedback<C, DifferentIsNovel, O, MaxReducer>, // wrapped feedback that most calls delegate to
+    simd: CoverageMapFunPtr, // comparison routine handed to the wrapped feedback in `is_interesting`
+}
+
+impl<C, O> SimdMapFeedback<C, O> {
+    /// Wraps an existing map feedback and enables SIMD acceleration. This uses the standard
+    /// SIMD implementation, which might vary based on target architecture according to our
+    /// benchmark.
+    #[must_use]
+    pub fn new(map: MapFeedback<C, DifferentIsNovel, O, MaxReducer>) -> Self {
+        Self {
+            map,
+            simd: std_covmap_is_interesting,
+        }
+    }
+
+    /// Wraps an existing map feedback and enables the SIMD implementation selected by `simd`.
+    #[must_use]
+    pub fn with_simd(
+        map: MapFeedback<C, DifferentIsNovel, O, MaxReducer>,
+        simd: SimdImplmentation,
+    ) -> Self {
+        Self {
+            map,
+            simd: simd.dispatch_simd(),
+        }
+    }
+}
+
+impl<C, O> Deref for SimdMapFeedback<C, O> {
+    type Target = MapFeedback<C, DifferentIsNovel, O, MaxReducer>;
+    fn deref(&self) -> &Self::Target {
+        &self.map // shared access to the wrapped feedback
+    }
+}
+
+impl<C, O> DerefMut for SimdMapFeedback<C, O> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.map // mutable access to the wrapped feedback
+    }
+}
+
+impl<C, O, S> StateInitializer<S> for SimdMapFeedback<C, O>
+where
+    O: MapObserver,
+    O::Entry: 'static + Default + Debug + DeserializeOwned + Serialize,
+    S: HasNamedMetadata,
+{
+    fn init_state(&mut self, state: &mut S) -> Result<(), Error> {
+        self.map.init_state(state) // state setup is left entirely to the wrapped feedback
+    }
+}
+
+impl<C, O> HasObserverHandle for SimdMapFeedback<C, O> {
+    type Observer = C;
+
+    #[inline]
+    fn observer_handle(&self) -> &Handle<C> {
+        self.map.observer_handle() // same handle as the wrapped feedback
+    }
+}
+
+impl<C, O> Named for SimdMapFeedback<C, O> {
+    #[inline]
+    fn name(&self) -> &Cow<'static, str> {
+        self.map.name() // the wrapper reports the wrapped feedback's name
+    }
+}
+
+// Delegates everything to the wrapped `MapFeedback`, except `is_interesting`, which runs `self.simd`.
+impl<C, O, EM, I, OT, S> Feedback<EM, I, OT, S> for SimdMapFeedback<C, O>
+where
+    C: CanTrack + AsRef<O>,
+    EM: EventFirer<I, S>,
+    O: MapObserver<Entry = u8> + for<'a> AsSlice<'a, Entry = u8> + for<'a> AsIter<'a, Item = u8>,
+    OT: MatchName,
+    S: HasNamedMetadata + HasExecutions,
+{
+    fn is_interesting(
+        &mut self,
+        state: &mut S,
+        _manager: &mut EM,
+        _input: &I,
+        observers: &OT,
+        _exit_kind: &ExitKind,
+    ) -> Result<bool, Error> {
+        let res = self
+            .map
+            .is_interesting_u8_simd_optimized(state, observers, self.simd); // comparison done by the selected routine
+        Ok(res)
+    }
+
+    #[cfg(feature = "introspection")]
+    fn is_interesting_introspection(
+        &mut self,
+        state: &mut S,
+        manager: &mut EM,
+        input: &I,
+        observers: &OT,
+        exit_kind: &ExitKind,
+    ) -> Result<bool, Error>
+    where
+        S: HasClientPerfMonitor,
+    {
+        self.map // NOTE(review): presumably ends up in the inner `is_interesting`, bypassing `self.simd` — confirm
+            .is_interesting_introspection(state, manager, input, observers, exit_kind)
+    }
+
+    #[cfg(feature = "track_hit_feedbacks")]
+    fn last_result(&self) -> Result<bool, Error> {
+        // cargo +nightly doc asks so
+        <MapFeedback<C, DifferentIsNovel, O, MaxReducer> as Feedback<EM, I, OT, S>>::last_result(
+            &self.map,
+        )
+    }
+
+    #[cfg(feature = "track_hit_feedbacks")]
+    fn append_hit_feedbacks(&self, list: &mut Vec<Cow<'static, str>>) -> Result<(), Error> {
+        // cargo +nightly doc asks so
+        <MapFeedback<C, DifferentIsNovel, O, MaxReducer> as Feedback<EM, I, OT, S>>::append_hit_feedbacks(&self.map, list)
+    }
+
+    #[inline]
+    fn append_metadata(
+        &mut self,
+        state: &mut S,
+        manager: &mut EM,
+        observers: &OT,
+        testcase: &mut Testcase<I>,
+    ) -> Result<(), Error> {
+        self.map
+            .append_metadata(state, manager, observers, testcase)
+    }
+}
diff --git a/libafl_bolts/Cargo.toml b/libafl_bolts/Cargo.toml
index a49af1173c..48fd55070f 100644
--- a/libafl_bolts/Cargo.toml
+++ b/libafl_bolts/Cargo.toml
@@ -52,6 +52,7 @@ std = [
   "uds",
   "serial_test",
   "alloc",
+  "stable_simd",
 ]
 
 ## Enables all features that allocate in `no_std`
@@ -117,9 +118,20 @@ llmp_debug = ["alloc", "std"]
 ## Reduces the initial map size for llmp
 llmp_small_maps = ["alloc"]
 
+#! ### Stable SIMD features
+
+## Use the best SIMD implementation by our [benchmark](https://github.com/wtdcode/libafl_simd_bench)
+stable_simd = ["alloc", "wide"]
+
 [build-dependencies]
 rustversion = { workspace = true }
 
+[dev-dependencies]
+clap = { version = "4.5", features = ["derive", "env"] }
+rand = "0.9.0"
+chrono = "0.4.40"
+itertools = "0.14.0"
+
 [dependencies]
 libafl_derive = { workspace = true, default-features = true, optional = true }
 static_assertions = { workspace = true }
@@ -165,6 +177,10 @@ serial_test = { workspace = true, optional = true, default-features = false, fea
   "logging",
 ] }
 
+# Optional stable SIMD. Pinned to a git commit because `u8x32` is not in a released version yet; switch to the crates.io release as soon as it is out.
+wide = { git = "https://github.com/Lokathor/wide", rev = "71b5df0b2620da753836fafce5f99076181a49fe", optional = true }
+rustversion = { workspace = true }
+
 # Document all features of this crate (for `cargo doc`)
 document-features = { workspace = true, optional = true }
 
@@ -212,3 +228,11 @@ mach = "0.3.2"
 name = "llmp_test"
 path = "./examples/llmp_test/main.rs"
 required-features = ["std"]
+
+
+[[example]]
+name = "simd"
+path = "./examples/simd/simd.rs"
+bench = true
+harness = false
+required-features = ["std", "stable_simd"]
diff --git a/libafl_bolts/examples/simd/simd.rs b/libafl_bolts/examples/simd/simd.rs
new file mode 100644
index 0000000000..7651cd041d
--- /dev/null
+++ b/libafl_bolts/examples/simd/simd.rs
@@ -0,0 +1,200 @@
+use chrono::Utc;
+use clap::Parser;
+use itertools::Itertools;
+use libafl_bolts::simd::{
+    covmap_is_interesting_naive, covmap_is_interesting_u8x16, covmap_is_interesting_u8x32,
+    simplify_map_naive, simplify_map_u8x16, simplify_map_u8x32,
+};
+use rand::{RngCore, rngs::ThreadRng};
+
+#[derive(Parser)]
+struct Cli {
+    #[arg(short, long, default_value_t = 2097152, env = "LIBAFL_BENCH_MAP_SIZE")]
+    pub map: usize, // length in bytes of every benchmarked map
+    #[arg(short, long, default_value_t = 32768, env = "LIBAFL_BENCH_ROUNDS")]
+    pub rounds: usize, // number of timed iterations per implementation
+    #[arg(short, long, env = "LIBAFL_BENCH_CORRECTNESS")]
+    pub validate: bool, // also cross-check every result against the naive implementation
+    #[arg(short, long)]
+    pub bench: bool, // never read; accepted because `cargo bench` passes `--bench` to the binary
+}
+
+fn random_bits(map: &mut [u8], rng: &mut ThreadRng) {
+    // randomly set a bit since coverage map is usually sparse enough
+    let rng = rng.next_u64() as usize; // shadows the generator with a single random draw
+    let bytes_idx = (rng / 8) % map.len(); // NOTE: remainder by zero (panic) when `map` is empty
+    let bits_idx = rng % 8;
+    map[bytes_idx] |= 1 << bits_idx; // bits are only ever set here, never cleared
+}
+
+fn clean_vectors(map: &mut [u8]) {
+    // Overwrite every byte in place; the length of the
+    // slice is left untouched.
+    map.fill(0);
+}
+
+struct SimplifyMapInput {
+    name: String, // label printed with the results
+    func: fn(&mut [u8]), // `simplify_map` implementation under test
+    map: Vec<u8>, // buffer the implementation rewrites in place
+    rounds: usize, // number of timed iterations
+    validate: bool, // compare each round against `simplify_map_naive`
+    rng: ThreadRng, // source of the random bit set before each round
+}
+
+impl SimplifyMapInput {
+    fn from_cli(name: &str, f: fn(&mut [u8]), cli: &Cli, rng: &ThreadRng) -> Self {
+        Self {
+            name: name.to_string(),
+            func: f,
+            map: vec![0; cli.map], // starts as an all-zero map of the requested size
+            rng: rng.clone(),
+            rounds: cli.rounds,
+            validate: cli.validate,
+        }
+    }
+    fn measure_simplify_input(mut self) -> Vec<chrono::TimeDelta> {
+        println!("Running {}", &self.name);
+        let mut outs = vec![];
+        println!("warm up...");
+        for _ in 0..16 {
+            (self.func)(&mut self.map);
+        }
+        clean_vectors(&mut self.map); // discard what the warm-up wrote
+        for _ in 0..self.rounds {
+            random_bits(&mut self.map, &mut self.rng);
+            let before = Utc::now(); // NOTE(review): wall clock; a monotonic clock may suit timing better
+
+            if self.validate {
+                let mut mp = self.map.clone(); // the copy and the naive run below fall inside the timed span
+                (self.func)(&mut self.map);
+                simplify_map_naive(&mut mp);
+
+                assert!(
+                    mp == self.map,
+                    "Incorrect covmap impl. {:?} vs\n{:?}",
+                    mp,
+                    self.map
+                );
+            } else {
+                (self.func)(&mut self.map); // the map is rewritten in place and carried into the next round
+            }
+            let after = Utc::now();
+            outs.push(after - before);
+        }
+
+        outs
+    }
+}
+
+type CovFuncPtr = fn(&[u8], &[u8], bool) -> (bool, Vec<usize>); // (hist, map, collect_novelties)
+
+struct CovInput {
+    name: String, // label printed with the results
+    func: CovFuncPtr, // coverage comparison implementation under test
+    hist: Vec<u8>, // history map passed as the first argument
+    map: Vec<u8>, // current map passed as the second argument
+    rounds: usize, // number of timed iterations
+    validate: bool, // compare each round against `covmap_is_interesting_naive`
+    rng: ThreadRng, // source of the random bit set before each round
+}
+
+impl CovInput {
+    fn from_cli(name: &str, f: CovFuncPtr, cli: &Cli, rng: &ThreadRng) -> Self {
+        CovInput {
+            name: name.to_string(),
+            func: f,
+            hist: vec![0; cli.map], // both maps start zeroed and share the requested size
+            map: vec![0; cli.map],
+            rng: rng.clone(),
+            rounds: cli.rounds,
+            validate: cli.validate,
+        }
+    }
+    fn measure_cov(mut self) -> Vec<chrono::TimeDelta> {
+        println!("Running {}", &self.name);
+        let mut outs = vec![];
+        println!("warm up...");
+        for _ in 0..16 {
+            (self.func)(&self.hist, &self.map, true); // warm-up result is discarded
+        }
+        clean_vectors(&mut self.hist);
+        clean_vectors(&mut self.map);
+        for _ in 0..self.rounds {
+            random_bits(&mut self.map, &mut self.rng); // `hist` is never updated, so set bits accumulate as novelties
+            let before = Utc::now(); // NOTE(review): wall clock; a monotonic clock may suit timing better
+            let (interesting, novelties) = (self.func)(&self.hist, &self.map, true);
+            if self.validate {
+                let (canonical_interesting, canonical_novelties) =
+                    covmap_is_interesting_naive(&self.hist, &self.map, true); // falls inside the timed span
+
+                assert!(
+                    canonical_interesting == interesting && novelties == canonical_novelties,
+                    "Incorrect covmap impl. {canonical_interesting} vs {interesting}, {canonical_novelties:?} vs\n{novelties:?}"
+                );
+            }
+            let after = Utc::now();
+            outs.push(after - before);
+        }
+
+        outs
+    }
+}
+
+/// Prints mean, min, max, sample standard deviation and sum of `tms` in milliseconds.
+#[allow(clippy::cast_precision_loss)]
+fn printout(ty: &str, tms: &[chrono::TimeDelta]) {
+    let tms = tms
+        .iter()
+        .map(|t| t.to_std().unwrap().as_secs_f64())
+        .collect_vec();
+    let mean = tms.iter().sum::<f64>() / tms.len() as f64;
+    let min = tms.iter().fold(f64::INFINITY, |acc, x| acc.min(*x)); // a seed of 0 would always win
+    let max = tms.iter().fold(0f64, |acc, x| acc.max(*x));
+    let std = (tms
+        .iter()
+        .fold(0f64, |acc, x| acc + (*x - mean) * (*x - mean))
+        / tms.len().saturating_sub(1).max(1) as f64) // n - 1, clamped to 1 below two samples
+        .sqrt();
+    let sum: f64 = tms.into_iter().sum();
+    println!(
+        "{ty}: avg {:.03}, min {:.03}, max {:.03}, std {:.03}, sum {:.03}",
+        mean * 1000.0,
+        min * 1000.0,
+        max * 1000.0,
+        std * 1000.0,
+        sum * 1000.0
+    );
+}
+
+fn main() {
+    // Bench with `taskset -c 3 cargo bench --example simd`
+    // Validate with `cargo bench --example simd -- --validate --rounds 8192`
+    let cli = Cli::parse();
+
+    let rng = rand::rng(); // cloned into every benchmark input below
+
+    let simpls = [
+        SimplifyMapInput::from_cli("naive simplify_map", simplify_map_naive, &cli, &rng),
+        SimplifyMapInput::from_cli("u8x16 simplify_map", simplify_map_u8x16, &cli, &rng),
+        SimplifyMapInput::from_cli("u8x32 simplify_map", simplify_map_u8x32, &cli, &rng),
+    ];
+
+    for bench in simpls {
+        let name = bench.name.clone(); // `measure_simplify_input` consumes `bench`
+        let outs = bench.measure_simplify_input();
+        printout(&name, &outs);
+    }
+
+    let benches = [
+        CovInput::from_cli("naive cov", covmap_is_interesting_naive, &cli, &rng),
+        CovInput::from_cli("u8x16 cov", covmap_is_interesting_u8x16, &cli, &rng),
+        CovInput::from_cli("u8x32 cov", covmap_is_interesting_u8x32, &cli, &rng),
+    ];
+
+    for bench in benches {
+        let name = bench.name.clone(); // `measure_cov` consumes `bench`
+        let outs = bench.measure_cov();
+        printout(&name, &outs);
+    }
+}
diff --git a/libafl_bolts/src/lib.rs b/libafl_bolts/src/lib.rs
index deb675f4d9..e41ad4715c 100644
--- a/libafl_bolts/src/lib.rs
+++ b/libafl_bolts/src/lib.rs
@@ -122,6 +122,8 @@ pub mod target_args;
 #[cfg(all(feature = "std", unix))]
 pub use target_args::*;
 
+pub mod simd;
+
 /// The purpose of this module is to alleviate imports of the bolts by adding a glob import.
 #[cfg(feature = "prelude")]
 pub mod bolts_prelude {
diff --git a/libafl_bolts/src/simd.rs b/libafl_bolts/src/simd.rs
new file mode 100644
index 0000000000..329baedb0d
--- /dev/null
+++ b/libafl_bolts/src/simd.rs
@@ -0,0 +1,295 @@
+//! Module for SIMD assisted methods.
+
+#[cfg(feature = "alloc")]
+use alloc::{vec, vec::Vec};
+
+/// Naive `simplify_map`: zero bytes become `0x1`, all others `0x80`. Usually auto-vectorized.
+pub fn simplify_map_naive(map: &mut [u8]) {
+    for byte in map {
+        *byte = if *byte != 0 { 0x80 } else { 0x1 };
+    }
+}
+
+/// `simplify_map` implementation by u8x16, worse performance compared to LLVM
+/// auto-vectorization but faster if LLVM doesn't vectorize.
+#[cfg(feature = "wide")]
+pub fn simplify_map_u8x16(map: &mut [u8]) {
+    type VectorType = wide::u8x16;
+    const N: usize = VectorType::LANES as usize;
+    let size = map.len();
+    let steps = size / N; // number of full N-byte chunks
+    let left = size % N; // trailing bytes that do not fill a chunk
+    let lhs = VectorType::new([0x1; N]);
+    let rhs = VectorType::new([0x80; N]);
+
+    for step in 0..steps {
+        let i = step * N;
+        let mp = VectorType::new(map[i..(i + N)].try_into().unwrap());
+
+        let mask = mp.cmp_eq(VectorType::ZERO);
+        let out = mask.blend(lhs, rhs); // presumably `lhs` for zero lanes, `rhs` otherwise — confirm
+        map[i..i + N].copy_from_slice(out.as_array_ref());
+    }
+
+    #[allow(clippy::needless_range_loop)]
+    for j in (size - left)..size { // tail bytes, same mapping as `simplify_map_naive`
+        map[j] = if map[j] == 0 { 0x1 } else { 0x80 }
+    }
+}
+
+/// `simplify_map` implementation by u8x32, achieving comparable performance with
+/// LLVM auto-vectorization.
+#[cfg(feature = "wide")]
+pub fn simplify_map_u8x32(map: &mut [u8]) {
+    use wide::CmpEq;
+
+    type VectorType = wide::u8x32;
+    const N: usize = VectorType::LANES as usize;
+    let size = map.len();
+    let steps = size / N; // number of full N-byte chunks
+    let left = size % N; // trailing bytes that do not fill a chunk
+    let lhs = VectorType::new([0x01; 32]);
+    let rhs = VectorType::new([0x80; 32]);
+
+    for step in 0..steps {
+        let i = step * N;
+        let mp = VectorType::new(map[i..i + N].try_into().unwrap());
+
+        let mask = mp.cmp_eq(VectorType::ZERO);
+        let out = mask.blend(lhs, rhs); // presumably `lhs` for zero lanes, `rhs` otherwise — confirm
+        unsafe { // SAFETY: `i + N <= size` as `step < size / N`; `out` is a local, so no overlap
+            out.as_array_ref()
+                .as_ptr()
+                .copy_to_nonoverlapping(map.as_mut_ptr().add(i), N);
+        }
+    }
+
+    #[allow(clippy::needless_range_loop)]
+    for j in (size - left)..size { // tail bytes, same mapping as `simplify_map_naive`
+        map[j] = if map[j] == 0 { 0x1 } else { 0x80 }
+    }
+}
+
+/// The std implementation of `simplify_map`. Uses the fastest implementation by benchmark by default.
+pub fn std_simplify_map(map: &mut [u8]) {
+    #[cfg(not(feature = "wide"))]
+    simplify_map_naive(map);
+
+    #[cfg(feature = "wide")]
+    simplify_map_u8x32(map);
+}
+
+/// Coverage map interesting implementation by u8x16. Slightly faster than nightly simd.
+/// Panics if `hist` is shorter than `map`.
+#[cfg(all(feature = "alloc", feature = "wide"))]
+#[must_use]
+pub fn covmap_is_interesting_u8x16(
+    hist: &[u8],
+    map: &[u8],
+    collect_novelties: bool,
+) -> (bool, Vec<usize>) {
+    type VectorType = wide::u8x16;
+    let mut novelties = vec![];
+    let mut interesting = false;
+    let size = map.len();
+    assert!(hist.len() >= size, "history map is shorter than the coverage map");
+    let steps = size / VectorType::LANES as usize;
+    let left = size % VectorType::LANES as usize;
+
+    if collect_novelties {
+        for step in 0..steps {
+            let i = step * VectorType::LANES as usize;
+            let history =
+                VectorType::new(hist[i..i + VectorType::LANES as usize].try_into().unwrap());
+            let items = VectorType::new(map[i..i + VectorType::LANES as usize].try_into().unwrap());
+            if items.max(history) != history {
+                interesting = true;
+                unsafe { // SAFETY: `j < size`, and `size <= hist.len()` is asserted above
+                    for j in i..(i + VectorType::LANES as usize) {
+                        let item = *map.get_unchecked(j);
+                        if item > *hist.get_unchecked(j) {
+                            novelties.push(j);
+                        }
+                    }
+                }
+            }
+        }
+
+        for j in (size - left)..size {
+            unsafe { // SAFETY: `j < size`, and `size <= hist.len()` is asserted above
+                let item = *map.get_unchecked(j);
+                if item > *hist.get_unchecked(j) {
+                    interesting = true;
+                    novelties.push(j);
+                }
+            }
+        }
+    } else {
+        for step in 0..steps {
+            let i = step * VectorType::LANES as usize;
+            let history =
+                VectorType::new(hist[i..i + VectorType::LANES as usize].try_into().unwrap());
+            let items = VectorType::new(map[i..i + VectorType::LANES as usize].try_into().unwrap());
+            if items.max(history) != history {
+                interesting = true;
+                break;
+            }
+        }
+
+        if !interesting {
+            for j in (size - left)..size {
+                unsafe { // SAFETY: `j < size`, and `size <= hist.len()` is asserted above
+                    let item = *map.get_unchecked(j);
+                    if item > *hist.get_unchecked(j) {
+                        interesting = true;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    (interesting, novelties)
+}
+
+/// Coverage map interesting implementation by u8x32. Slightly faster than nightly simd but slightly
+/// slower than u8x16 version. Panics if `hist` is shorter than `map`.
+#[cfg(all(feature = "alloc", feature = "wide"))]
+#[must_use]
+pub fn covmap_is_interesting_u8x32(
+    hist: &[u8],
+    map: &[u8],
+    collect_novelties: bool,
+) -> (bool, Vec<usize>) {
+    type VectorType = wide::u8x32;
+    const N: usize = VectorType::LANES as usize;
+    let mut novelties = vec![];
+    let mut interesting = false;
+    let size = map.len();
+    assert!(hist.len() >= size, "history map is shorter than the coverage map");
+    let steps = size / N;
+    let left = size % N;
+
+    if collect_novelties {
+        for step in 0..steps {
+            let i = step * N;
+            let history = VectorType::new(hist[i..i + N].try_into().unwrap());
+            let items = VectorType::new(map[i..i + N].try_into().unwrap());
+            if items.max(history) != history {
+                interesting = true;
+                unsafe { // SAFETY: `j < size`, and `size <= hist.len()` is asserted above
+                    // Break into two loops so that LLVM will vectorize both loops.
+                    // Otherwise LLVM won't vectorize them and this is super slow. We need a few
+                    // extra intrinsics in wide and safe_arch to vectorize this manually.
+                    for j in i..(i + N / 2) {
+                        let item = *map.get_unchecked(j);
+                        if item > *hist.get_unchecked(j) {
+                            novelties.push(j);
+                        }
+                    }
+
+                    for j in (i + N / 2)..(i + N) {
+                        let item = *map.get_unchecked(j);
+                        if item > *hist.get_unchecked(j) {
+                            novelties.push(j);
+                        }
+                    }
+                }
+            }
+        }
+
+        for j in (size - left)..size {
+            unsafe { // SAFETY: `j < size`, and `size <= hist.len()` is asserted above
+                let item = *map.get_unchecked(j);
+                if item > *hist.get_unchecked(j) {
+                    interesting = true;
+                    novelties.push(j);
+                }
+            }
+        }
+    } else {
+        for step in 0..steps {
+            let i = step * N;
+            let history = VectorType::new(hist[i..i + N].try_into().unwrap());
+            let items = VectorType::new(map[i..i + N].try_into().unwrap());
+
+            if items.max(history) != history {
+                interesting = true;
+                break;
+            }
+        }
+
+        if !interesting {
+            for j in (size - left)..size {
+                unsafe { // SAFETY: `j < size`, and `size <= hist.len()` is asserted above
+                    let item = *map.get_unchecked(j);
+                    if item > *hist.get_unchecked(j) {
+                        interesting = true;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    (interesting, novelties)
+}
+
+/// Coverage map interesting naive implementation. Do not use it unless you have strong reasons
+/// to do so. Panics if `hist` is shorter than `map`.
+#[cfg(feature = "alloc")]
+#[must_use]
+pub fn covmap_is_interesting_naive(
+    hist: &[u8],
+    map: &[u8],
+    collect_novelties: bool,
+) -> (bool, Vec<usize>) {
+    assert!(hist.len() >= map.len(), "history map is shorter than the coverage map");
+    let mut novelties = vec![];
+    let mut interesting = false;
+    if collect_novelties {
+        for (i, item) in map.iter().enumerate().filter(|(_, item)| **item != 0) {
+            let existing = unsafe { *hist.get_unchecked(i) }; // SAFETY: i < map.len() <= hist.len()
+            let reduced = existing.max(*item);
+            if existing != reduced {
+                interesting = true;
+                novelties.push(i);
+            }
+        }
+    } else {
+        for (i, item) in map.iter().enumerate().filter(|(_, item)| **item != 0) {
+            let existing = unsafe { *hist.get_unchecked(i) }; // SAFETY: i < map.len() <= hist.len()
+            let reduced = existing.max(*item);
+            if existing != reduced {
+                interesting = true;
+                break;
+            }
+        }
+    }
+    (interesting, novelties)
+}
+
+/// Standard coverage map interesting implementation. Uses the fastest available implementation by default.
+#[cfg(feature = "alloc")]
+#[allow(unused_variables)] // or we fail cargo doc
+#[must_use]
+pub fn std_covmap_is_interesting(
+    hist: &[u8],
+    map: &[u8],
+    collect_novelties: bool,
+) -> (bool, Vec<usize>) {
+    #[cfg(not(feature = "wide"))]
+    return covmap_is_interesting_naive(hist, map, collect_novelties);
+
+    #[cfg(feature = "wide")]
+    {
+        // Supported by benchmark:
+        // - on aarch64, u8x32 is 15% faster than u8x16
+        // - on amd64, u8x16 is 10% faster compared to the u8x32
+        #[cfg(target_arch = "aarch64")]
+        return covmap_is_interesting_u8x32(hist, map, collect_novelties);
+
+        #[cfg(not(target_arch = "aarch64"))]
+        return covmap_is_interesting_u8x16(hist, map, collect_novelties);
+    }
+}