Unicode-preserving mutators (#1542)

* create the string classification stage * modify API to pre-group * preserving mutator * more meaningful test * subproperty mutators + some fixes * document, finalise, integrate with libafl_libfuzzer * add example, fix for weird range select * fix for introspection * fix fuzzer build * speed optimisation: allow, but do not require, stacking * property => category * token replacement * fixup: rare case where rust does not agree on valid character * fix CI again * again again * take two: dynamic unicode discovery * oops * fix: last byte is never selected * opt: bias to smaller unicode categories * fix test * opt: precompute regions and fix tests * cache and allow stacking * document and update libafl_libfuzzer * oops, use reverse * fix bolts clippy error * fixup part 2 * clippy * part 2 * clippy warning allow * clippy complaint * use alloc not std --------- Co-authored-by: toka <tokazerkje@outlook.com>
2023-11-21 00:41:16 +01:00 · 2023-11-21 00:41:16 +01:00 · 281524dbf9
commit 281524dbf9
parent 1e96652ed2
16 changed files with 1037 additions and 14 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@ -73,6 +73,8 @@ jobs:
      run: command -v llvm-config-15 && clang-15 -v
    - name: Add nightly rustfmt and clippy
      run: rustup toolchain install nightly --component rustfmt --component clippy --component miri --allow-downgrade
    - name: Install ucd-generate
      run: cargo install -f ucd-generate
    - uses: actions/checkout@v3
    - uses: Swatinem/rust-cache@v2
@ -135,6 +137,8 @@ jobs:
      run: command -v llvm-config-15 && clang-15 -v
    - name: Install cargo-hack
      run: curl -LsSf https://github.com/taiki-e/cargo-hack/releases/latest/download/cargo-hack-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
    - name: Install ucd-generate
      run: cargo install -f ucd-generate
    - name: Add nightly
      run: rustup toolchain install nightly --allow-downgrade
    - uses: actions/checkout@v3
@ -222,6 +226,8 @@ jobs:
    - name: Install cxxbridge
      if: runner.os == 'macOS'
      run: cargo install cxxbridge-cmd
    - name: Install ucd-generate
      run: cargo install -f ucd-generate
    - name: Install python (macOS)
      # Removing macOS things already installed in CI against failed linking
      if: runner.os == 'macOS'
@ -384,6 +390,8 @@ jobs:
        toolchain: stable
    - name: Add nightly rustfmt and clippy
      run: rustup toolchain install nightly --component rustfmt --component clippy --allow-downgrade
    - name: Install ucd-generate
      run: cargo install -f ucd-generate
    - name: Install deps
      run: brew install z3 gtk+3
    - name: Install cxxbridge
@ -453,6 +461,7 @@ jobs:
          freebsd-version
          . "$HOME/.cargo/env"
          rustup toolchain install nightly
          cargo install -f ucd-generate
          export LLVM_CONFIG=/usr/local/bin/llvm-config16
          pwd
          ls -lah
--- a/fuzzers/baby_fuzzer_unicode/.gitignore
+++ b/fuzzers/baby_fuzzer_unicode/.gitignore
@ -0,0 +1 @@
 libpng-*
--- a/fuzzers/baby_fuzzer_unicode/Cargo.toml
+++ b/fuzzers/baby_fuzzer_unicode/Cargo.toml
@ -0,0 +1,24 @@
 [package]
 name = "baby_fuzzer_unicode"
 version = "0.10.0"
 authors = ["Andrea Fioraldi <andreafioraldi@gmail.com>", "Dominik Maier <domenukk@gmail.com>"]
 edition = "2021"
 [features]
 default = ["std"]
 tui = []
 std = []
 [profile.dev]
 panic = "abort"
 [profile.release]
 panic = "abort"
 lto = true
 codegen-units = 1
 opt-level = 3
 debug = true
 [dependencies]
 libafl = { path = "../../libafl/", features = ["unicode"] }
 libafl_bolts = { path = "../../libafl_bolts/" }
--- a/fuzzers/baby_fuzzer_unicode/README.md
+++ b/fuzzers/baby_fuzzer_unicode/README.md
@ -0,0 +1,15 @@
 # Baby fuzzer: unicode
 This is a minimalistic example about how to create a libafl based fuzzer.
 It runs on a single core until a crash occurs and then exits.
 The tested program is a simple Rust function without any instrumentation.
 For real fuzzing, you will want to add some sort of coverage or other feedback.
 You can run this example using `cargo run`, and you can enable the TUI feature by running `cargo run --features tui`.
 ## Unicode
 This fuzzer uses mutators which preserve unicode properties. For programs which have string-heavy inputs, you may
 consider using the same strategy.
--- a/fuzzers/baby_fuzzer_unicode/src/main.rs
+++ b/fuzzers/baby_fuzzer_unicode/src/main.rs
@ -0,0 +1,138 @@
 #[cfg(windows)]
 use std::ptr::write_volatile;
 use std::{path::PathBuf, ptr::write};
 #[cfg(feature = "tui")]
 use libafl::monitors::tui::{ui::TuiUI, TuiMonitor};
 #[cfg(not(feature = "tui"))]
 use libafl::monitors::SimpleMonitor;
 use libafl::{
    corpus::{InMemoryCorpus, OnDiskCorpus},
    events::SimpleEventManager,
    executors::{inprocess::InProcessExecutor, ExitKind},
    feedbacks::{CrashFeedback, MaxMapFeedback},
    fuzzer::{Fuzzer, StdFuzzer},
    inputs::{BytesInput, HasTargetBytes},
    mutators::{StdScheduledMutator, StringCategoryRandMutator, StringSubcategoryRandMutator},
    observers::StdMapObserver,
    schedulers::QueueScheduler,
    stages::{mutational::StdMutationalStage, StringIdentificationStage},
    state::StdState,
    Evaluator,
 };
 use libafl_bolts::{current_nanos, rands::StdRand, tuples::tuple_list, AsSlice};
 /// Coverage map with explicit assignments due to the lack of instrumentation
 static mut SIGNALS: [u8; 64] = [0; 64];
 static mut SIGNALS_PTR: *mut u8 = unsafe { SIGNALS.as_mut_ptr() };
 /// Marks entry `idx` of the signals map as hit by storing `1` in it.
 ///
 /// No bounds check is performed here, so `idx` must be smaller than the length of `SIGNALS`.
 fn signals_set(idx: usize) {
    // SAFETY: `SIGNALS_PTR` points at the first element of `SIGNALS`; the write stays inside the
    // array as long as the caller passes `idx < SIGNALS.len()`.
    unsafe {
        let slot = SIGNALS_PTR.add(idx);
        write(slot, 1);
    }
 }
 /// Entry point of the example: fuzzes a small in-process harness using the unicode-preserving
 /// string mutators until the harness crashes.
 #[allow(clippy::similar_names, clippy::manual_assert)]
 pub fn main() {
    // The closure that we want to fuzz
    let mut harness = |input: &BytesInput| {
        let target = input.target_bytes();
        let buf = target.as_slice();
        let goal = b"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
        let mut i = 0;
        // Set one signal per leading input byte that matches `goal`
        for _ in buf.iter().zip(goal).take_while(|(b, c)| b == c) {
            signals_set(i);
            i += 1;
        }
        // The whole goal was matched: trigger the artificial crash
        if i == goal.len() {
            #[cfg(unix)]
            panic!("Artificial bug triggered =)");
            #[cfg(windows)]
            unsafe {
                write_volatile(0 as *mut u32, 0);
            }
        }
        ExitKind::Ok
    };
    // Create an observation channel using the signals map
    let observer = unsafe { StdMapObserver::from_mut_ptr("signals", SIGNALS_PTR, SIGNALS.len()) };
    // Feedback to rate the interestingness of an input
    let mut feedback = MaxMapFeedback::new(&observer);
    // A feedback to choose if an input is a solution or not
    let mut objective = CrashFeedback::new();
    // create a State from scratch
    let mut state = StdState::new(
        // RNG
        StdRand::with_seed(current_nanos()),
        // Corpus that will be evolved, we keep it in memory for performance
        InMemoryCorpus::new(),
        // Corpus in which we store solutions (crashes in this example),
        // on disk so the user can get them after stopping the fuzzer
        OnDiskCorpus::new(PathBuf::from("./crashes")).unwrap(),
        // States of the feedbacks.
        // The feedbacks can report the data that should persist in the State.
        &mut feedback,
        // Same for objective feedbacks
        &mut objective,
    )
    .unwrap();
    // The Monitor trait defines how the fuzzer stats are displayed to the user
    #[cfg(not(feature = "tui"))]
    let mon = SimpleMonitor::new(|s| println!("{s}"));
    #[cfg(feature = "tui")]
    let ui = TuiUI::with_version(String::from("Baby Fuzzer"), String::from("0.0.1"), false);
    #[cfg(feature = "tui")]
    let mon = TuiMonitor::new(ui);
    // The event manager handles the various events generated during the fuzzing loop
    // such as the notification of the addition of a new item to the corpus
    let mut mgr = SimpleEventManager::new(mon);
    // A queue policy to get testcases from the corpus
    let scheduler = QueueScheduler::new();
    // A fuzzer with feedbacks and a corpus scheduler
    let mut fuzzer = StdFuzzer::new(scheduler, feedback, objective);
    // Create the executor for an in-process function with just one observer
    let mut executor = InProcessExecutor::new(
        &mut harness,
        tuple_list!(observer),
        &mut fuzzer,
        &mut state,
        &mut mgr,
    )
    .expect("Failed to create the Executor");
    // Evaluate a single initial input consisting of the byte `a`
    fuzzer
        .evaluate_input(
            &mut state,
            &mut executor,
            &mut mgr,
            BytesInput::new(vec![b'a']),
        )
        .unwrap();
    // Setup a scheduled mutator made of the unicode-preserving string mutators.
    // NOTE(review): the subcategory mutator is listed four times, presumably to weight it more
    // heavily than the category mutator -- confirm.
    let mutator = StdScheduledMutator::new(tuple_list!(
        StringCategoryRandMutator,
        StringSubcategoryRandMutator,
        StringSubcategoryRandMutator,
        StringSubcategoryRandMutator,
        StringSubcategoryRandMutator
    ));
    // The identification stage is placed before the mutational stage in the tuple
    let mut stages = tuple_list!(
        StringIdentificationStage::new(),
        StdMutationalStage::transforming(mutator)
    );
    fuzzer
        .fuzz_loop(&mut stages, &mut executor, &mut state, &mut mgr)
        .expect("Error in the fuzzing loop");
 }
--- a/libafl/Cargo.toml
+++ b/libafl/Cargo.toml
@ -77,6 +77,9 @@ concolic_mutation = ["z3"]
 ## Enable the fancy TuiMonitor for a terminal UI using crossterm
 tui_monitor = ["ratatui", "crossterm"]
 ## Enables `StringIdentificationStage` and associated mutators, which allow for mutations which preserve the Unicode property data
 unicode = ["libafl_bolts/alloc", "ahash/std", "serde/rc", "bitvec"]
 #! ## LibAFL-Bolts Features
@ -126,7 +129,9 @@ agpl = ["nautilus"]
 nautilus = ["grammartec", "std", "serde_json/std"]
 [build-dependencies]
 reqwest = { version = "0.11", features = ["blocking"] }
 rustversion = "1.0"
 zip = "0.6"
 [dev-dependencies]
 serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
@ -172,7 +177,9 @@ z3 = { version = "0.12.0", features = ["static-link-z3"], optional = true } # fo
 pyo3 = { version = "0.18", optional = true, features = ["serde", "macros"] }
 concat-idents = { version = "1.1.3", optional = true }
-libcasr = { version = "2.7", optional = true}
+libcasr = { version = "2.7", optional = true }
 bitvec = { version = "1.0", optional = true, features = ["serde"] } # used for string range storage
 # optional-dev deps (change when target.'cfg(accessible(::std))'.test-dependencies will be stable)
 serial_test = { version = "2", optional = true, default-features = false, features = ["logging"] }
--- a/libafl/build.rs
+++ b/libafl/build.rs
@ -1,14 +1,69 @@
 use std::error::Error;
 // Build script entry point used on nightly toolchains: emits the `nightly` cfg and, when the
 // `unicode` feature is enabled, generates the unicode category tables.
 #[rustversion::nightly]
-fn main() {
+fn main() -> Result<(), Box<dyn Error>> {
    println!("cargo:rerun-if-changed=build.rs");
    println!("cargo:rustc-cfg=nightly");
    #[cfg(feature = "unicode")]
    {
        build_unicode_property_map()?;
    }
    Ok(())
 }
 // Build script entry point used on non-nightly toolchains: fails the build if the `docrs` cfg is
 // set or the `nautilus` feature is enabled and, when the `unicode` feature is enabled, generates
 // the unicode category tables.
 #[rustversion::not(nightly)]
-fn main() {
+fn main() -> Result<(), Box<dyn Error>> {
    println!("cargo:rerun-if-changed=build.rs");
    assert!(
        cfg!(all(not(docrs), not(feature = "nautilus"))),
        "The 'nautilus' feature of libafl requires a nightly compiler"
    );
    #[cfg(feature = "unicode")]
    {
        build_unicode_property_map()?;
    }
    Ok(())
 }
 /// Generates `unicode_categories.rs` in `OUT_DIR`.
 ///
 /// The Unicode Character Database archive is downloaded and unpacked into `OUT_DIR/ucd-dir`, then
 /// `ucd-generate general-category` is run over it with its standard output redirected into the
 /// generated file.
 ///
 /// # Errors
 ///
 /// Returns an error if the download fails or answers with an HTTP error status, if the archive
 /// cannot be written or extracted, or if `ucd-generate` cannot be started or exits unsuccessfully.
 #[cfg(feature = "unicode")]
 fn build_unicode_property_map() -> Result<(), Box<dyn Error>> {
    use std::{
        env,
        fs::File,
        io::{BufWriter, Write},
        path::PathBuf,
        process::{Command, Stdio},
    };
    let out_dir = PathBuf::from(env::var_os("OUT_DIR").unwrap());
    let ucd_dir = out_dir.join("ucd-dir");
    let generated_file = out_dir.join("unicode_categories.rs");
    std::fs::create_dir_all(&ucd_dir)?;
    let zip_path = ucd_dir.join("ucd.zip");
    // reject HTTP error responses up front; otherwise an error page would be stored as the archive
    // and only surface later as an unrelated-looking zip parsing failure
    let mut response =
        reqwest::blocking::get("https://www.unicode.org/Public/zipped/latest/UCD.zip")?
            .error_for_status()?;
    let mut ucd_file = BufWriter::new(File::create(&zip_path)?);
    // stream the body to disk instead of buffering the whole archive in memory first
    response.copy_to(&mut ucd_file)?;
    ucd_file.flush()?;
    drop(ucd_file);
    let mut zip_file = zip::ZipArchive::new(File::open(&zip_path)?)?;
    zip_file.extract(&ucd_dir)?;
    drop(zip_file);
    std::fs::remove_file(zip_path)?;
    let status = Command::new("ucd-generate")
        .arg("general-category")
        .arg(ucd_dir.as_os_str())
        .stdout(Stdio::from(File::create(generated_file)?))
        .status()
        .map_err(|e| {
            format!(
                "failed to run `ucd-generate` (is it installed? try `cargo install ucd-generate`): {e}"
            )
        })?;
    if !status.success() {
        return Err(format!("`ucd-generate general-category` failed: {status}").into());
    }
    Ok(())
 }
--- a/libafl/src/mutators/mod.rs
+++ b/libafl/src/mutators/mod.rs
@ -20,6 +20,11 @@ pub use grimoire::*;
 pub mod tuneable;
 pub use tuneable::*;
 #[cfg(feature = "unicode")]
 pub mod string;
 #[cfg(feature = "unicode")]
 pub use string::*;
 #[cfg(feature = "nautilus")]
 pub mod nautilus;
 use alloc::vec::Vec;
--- a/libafl/src/mutators/string.rs
+++ b/libafl/src/mutators/string.rs
@ -0,0 +1,595 @@
 //! Mutators for preserving string categories, which may be useful for certain targets which are primarily string-oriented.
 use alloc::vec::Vec;
 use core::{
    cmp::{Ordering, Reverse},
    ops::Range,
 };
 use libafl_bolts::{rands::Rand, Error, HasLen, Named};
 use crate::{
    corpus::{CorpusId, HasTestcase, Testcase},
    inputs::{BytesInput, HasBytesVec},
    mutators::{rand_range, MutationResult, Mutator, Tokens},
    stages::{
        extract_metadata,
        mutational::{MutatedTransform, MutatedTransformPost},
        StringIdentificationMetadata,
    },
    state::{HasCorpus, HasMaxSize, HasMetadata, HasRand},
 };
 /// Input which contains the context necessary to perform unicode mutations
 pub type UnicodeInput = (BytesInput, StringIdentificationMetadata);
 /// Lets a [`BytesInput`] testcase be mutated as a [`UnicodeInput`], i.e. together with its
 /// [`StringIdentificationMetadata`].
 impl<S> MutatedTransform<BytesInput, S> for UnicodeInput
 where
    S: HasCorpus<Input = BytesInput> + HasTestcase,
 {
    type Post = StringIdentificationMetadata;
    /// Clones the testcase's input and its string identification metadata into a pair.
    ///
    /// Fails if the input cannot be loaded or the testcase carries no such metadata.
    fn try_transform_from(
        base: &mut Testcase<BytesInput>,
        state: &S,
        _corpus_idx: CorpusId,
    ) -> Result<Self, Error> {
        let bytes_input = base.load_input(state.corpus())?.clone();
        let string_meta = base.metadata::<StringIdentificationMetadata>().cloned()?;
        Ok((bytes_input, string_meta))
    }
    /// Splits the pair back into the input and the metadata to attach after execution.
    fn try_transform_into(self, _state: &S) -> Result<(BytesInput, Self::Post), Error> {
        let (bytes_input, string_meta) = self;
        Ok((bytes_input, string_meta))
    }
 }
 /// Attaches the metadata computed during mutation to the testcase it belongs to, if any.
 impl<S> MutatedTransformPost<S> for StringIdentificationMetadata
 where
    S: HasTestcase,
 {
    /// Adds `self` as metadata of the testcase at `corpus_idx`; does nothing when `corpus_idx`
    /// is `None`.
    fn post_exec(
        self,
        state: &mut S,
        _stage_idx: i32,
        corpus_idx: Option<CorpusId>,
    ) -> Result<(), Error> {
        match corpus_idx {
            Some(id) => {
                state.testcase_mut(id)?.add_metadata(self);
                Ok(())
            }
            None => Ok(()),
        }
    }
 }
 const MAX_CHARS: usize = 16;
 /// Picks a random byte index in `bytes` and returns the `(start, length)` of one of the
 /// pre-computed regions in `meta` whose bit for that index is set.
 ///
 /// Returns `None` if no region marks the chosen index. When several regions qualify, the choice
 /// is biased towards regions with more set bits.
 ///
 /// `bytes` must not be empty.
 fn choose_start<R: Rand>(
    rand: &mut R,
    bytes: &[u8],
    meta: &StringIdentificationMetadata,
 ) -> Option<(usize, usize)> {
    let idx = rand.below(bytes.len() as u64) as usize;
    // collect every region in which `idx` is marked
    let mut candidates = Vec::new();
    for (start, marks) in meta.ranges() {
        let marked = match idx.checked_sub(*start) {
            Some(offset) if offset < marks.len() => marks[offset],
            _ => false,
        };
        if marked {
            candidates.push((*start, marks));
        }
    }
    if candidates.len() > 1 {
        // ascending by number of set bits; the square root of a uniform draw from [0, n^2) leans
        // towards the high indices, i.e. the regions with the most set bits
        candidates.sort_by_cached_key(|(_, marks)| marks.count_ones());
        let pick = libafl_bolts::math::integer_sqrt(
            rand.below((candidates.len() * candidates.len()) as u64),
        ) as usize;
        return Some((candidates[pick].0, candidates[pick].1.len()));
    }
    candidates.first().map(|&(start, marks)| (start, marks.len()))
 }
 /// Finds the inclusive `(min, max)` range in `haystack` which contains `needle`.
 ///
 /// `haystack` is binary-searched, so it is expected to be sorted and non-overlapping. Returns
 /// `None` if no range contains `needle`.
 fn get_subcategory<T: Ord + Copy>(needle: T, haystack: &[(T, T)]) -> Option<(T, T)> {
    let position = haystack.binary_search_by(|&(low, high)| {
        if needle < low {
            // this range lies entirely above the needle
            Ordering::Greater
        } else if needle > high {
            // this range lies entirely below the needle
            Ordering::Less
        } else {
            Ordering::Equal
        }
    });
    match position {
        Ok(found) => Some(haystack[found]),
        Err(_) => None,
    }
 }
 /// Grows a byte range around the char at `chars[idx]` for as long as the neighbouring chars
 /// satisfy `predicate`.
 ///
 /// `chars` holds `(byte offset, char)` pairs. The char at `idx` itself is always part of the
 /// result and is not tested against `predicate`. The returned range is expressed in the byte
 /// offsets stored in `chars`.
 fn find_range<F: Fn(char) -> bool>(
    chars: &[(usize, char)],
    idx: usize,
    predicate: F,
 ) -> Range<usize> {
    // walk backwards, remembering the earliest matching char
    let mut earliest = None;
    for &(offset, c) in chars[..idx].iter().rev() {
        if !predicate(c) {
            break;
        }
        earliest = Some(offset);
    }
    let (anchor_offset, anchor_char) = chars[idx];
    let start = earliest.unwrap_or(anchor_offset);
    // walk forwards, pushing the end past every matching char
    let mut end = anchor_offset + anchor_char.len_utf8();
    for &(offset, c) in &chars[(idx + 1)..] {
        if !predicate(c) {
            break;
        }
        end = offset + c.len_utf8();
    }
    start..end
 }
 /// Picks a random char of `string`, selects one of the unicode categories containing it, and
 /// returns the byte range (relative to `string`) of the contiguous run of chars around it which
 /// belong to that category, together with the category's ranges.
 ///
 /// `string` must not be empty.
 fn choose_category_range<R: Rand>(
    rand: &mut R,
    string: &str,
 ) -> (Range<usize>, &'static [(u32, u32)]) {
    let chars = string.char_indices().collect::<Vec<_>>();
    let idx = rand.below(chars.len() as u64) as usize;
    let c = chars[idx].1;
    // figure out the categories for this char
    let expanded = c as u32;
    // each category is stored next to its name so that the pair stays matched once the list is
    // sorted below (the name is only used for the debug output in tests)
    let mut categories = Vec::new();
    for (name, category) in unicode_categories::BY_NAME {
        if get_subcategory(expanded, category).is_some() {
            categories.push((name, category));
        }
    }
    // ok -- we want to bias towards smaller regions to keep the mutations "tight" to original
    // we sort the options by descending length, then pick isqrt of below(n^2)
    categories.sort_by_cached_key(|(_, cat)| {
        Reverse(
            cat.iter()
                .map(|&(min, max)| (max - min + 1) as usize)
                .sum::<usize>(),
        )
    });
    let options = categories.len() * categories.len();
    let selected_idx = libafl_bolts::math::integer_sqrt(rand.below(options as u64)) as usize;
    let (_name, selected) = categories[selected_idx];
    #[cfg(test)]
    println!("category for `{c}' ({}): {}", c as u32, _name);
    (
        find_range(&chars, idx, |c| {
            get_subcategory(c as u32, selected).is_some()
        }),
        selected,
    )
 }
 /// Picks a random char of `string`, selects one of the unicode subcategories (a single
 /// inclusive codepoint range of a category) containing it, and returns the byte range (relative
 /// to `string`) of the contiguous run of chars around it which fall into that subcategory,
 /// together with the subcategory itself.
 ///
 /// `string` must not be empty.
 fn choose_subcategory_range<R: Rand>(rand: &mut R, string: &str) -> (Range<usize>, (u32, u32)) {
    let chars = string.char_indices().collect::<Vec<_>>();
    let idx = rand.below(chars.len() as u64) as usize;
    let c = chars[idx].1;
    // figure out the categories for this char
    let expanded = c as u32;
    // each subcategory is stored next to the name of its category so that the pair stays matched
    // once the list is sorted below (the name is only used for the debug output in tests)
    let mut subcategories = Vec::new();
    for (name, category) in unicode_categories::BY_NAME {
        if let Some(subcategory) = get_subcategory(expanded, category) {
            subcategories.push((name, subcategory));
        }
    }
    // bias towards smaller regions: sort by descending size, then pick isqrt of below(n^2),
    // which leans towards the end of the list
    subcategories.sort_by_key(|&(_, (min, max))| Reverse(max - min + 1));
    let options = subcategories.len() * subcategories.len();
    let selected_idx = libafl_bolts::math::integer_sqrt(rand.below(options as u64)) as usize;
    let (_name, selected) = subcategories[selected_idx];
    #[cfg(test)]
    println!(
        "subcategory for `{c}' ({}): {} ({:?})",
        c as u32, _name, selected
    );
    (
        find_range(&chars, idx, |c| {
            let expanded = c as u32;
            selected.0 <= expanded && expanded <= selected.1
        }),
        selected,
    )
 }
 /// Replaces a random sub-range of `range` in the input with chars produced by `char_gen`, then
 /// recomputes the string metadata of the input.
 ///
 /// `range` is given in byte offsets into the whole input. The replacement is at most
 /// `MAX_CHARS - 1` bytes long; a target length of zero removes the selected bytes.
 ///
 /// Returns [`MutationResult::Skipped`] if the selected sub-range is empty or if the mutated
 /// input could exceed the state's maximum size.
 fn rand_replace_range<S: HasRand + HasMaxSize, F: Fn(&mut S) -> char>(
    state: &mut S,
    input: &mut UnicodeInput,
    range: Range<usize>,
    char_gen: F,
 ) -> MutationResult {
    let temp_range = rand_range(state, range.end - range.start, MAX_CHARS);
    let range = (range.start + temp_range.start)..(range.start + temp_range.end);
    // the sub-range may cut a char in half: shrink it to the longest valid UTF-8 prefix
    let range = match core::str::from_utf8(&input.0.bytes()[range.clone()]) {
        Ok(_) => range,
        Err(e) => range.start..(range.start + e.valid_up_to()),
    };
    #[cfg(test)]
    println!(
        "mutating range: {:?} ({:?})",
        range,
        core::str::from_utf8(&input.0.bytes()[range.clone()])
    );
    if range.start == range.end {
        return MutationResult::Skipped;
    }
    let replace_len = state.rand_mut().below(MAX_CHARS as u64) as usize;
    let orig_len = range.end - range.start;
    if input.0.len() - orig_len + replace_len > state.max_size() {
        return MutationResult::Skipped;
    }
    let mut replacement = Vec::with_capacity(replace_len);
    let mut dest = [0u8; 4];
    loop {
        let new_c = char_gen(state);
        // stop as soon as the next char would overshoot the target length
        if replacement.len() + new_c.len_utf8() > replace_len {
            break;
        }
        new_c.encode_utf8(&mut dest);
        replacement.extend_from_slice(&dest[..new_c.len_utf8()]);
        // `new_c` is already part of `replacement` here, so compare the length as-is
        if replacement.len() == replace_len {
            break; // nailed it
        }
    }
    input.0.bytes_mut().splice(range, replacement);
    input.1 = extract_metadata(input.0.bytes());
    MutationResult::Mutated
 }
 /// Unicode category data, as used by string analysis and mutators.
 ///
 /// The contents are not checked in: they are included from `unicode_categories.rs` in the build
 /// output directory (`OUT_DIR`).
 pub mod unicode_categories {
    // the lints below are silenced because the included file is not written by hand
    #![allow(unused)]
    #![allow(missing_docs)]
    #![allow(clippy::redundant_static_lifetimes)]
    include!(concat!(env!("OUT_DIR"), "/unicode_categories.rs"));
 }
 /// Mutator which randomly replaces a randomly selected range of bytes with bytes that preserve the
 /// range's category
 #[derive(Debug, Default)]
 pub struct StringCategoryRandMutator;
 impl Named for StringCategoryRandMutator {
    fn name(&self) -> &str {
        "string-category-rand"
    }
 }
 impl<S> Mutator<UnicodeInput, S> for StringCategoryRandMutator
 where
    S: HasRand + HasMaxSize,
 {
    /// Selects a string region of the input, then a category-contiguous run of chars within it,
    /// and replaces part of that run with random chars drawn from the same category.
    ///
    /// Skips empty inputs and inputs in which no string region covers the chosen index.
    fn mutate(
        &mut self,
        state: &mut S,
        input: &mut UnicodeInput,
        _stage_idx: i32,
    ) -> Result<MutationResult, Error> {
        if input.0.bytes().is_empty() {
            return Ok(MutationResult::Skipped);
        }
        let bytes = input.0.bytes();
        let meta = &input.1;
        if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) {
            let substring = core::str::from_utf8(&bytes[base..][..len])?;
            let (range, category) = choose_category_range(state.rand_mut(), substring);
            // the range is relative to `substring`, which starts at `base`; rebase it so that it
            // addresses the same bytes in the whole input
            let range = (base + range.start)..(base + range.end);
            #[cfg(test)]
            println!(
                "{:?} => {:?}",
                range,
                core::str::from_utf8(&bytes[range.clone()])
            );
            // total number of codepoints covered by the category
            let options: u64 = category
                .iter()
                .map(|&(start, end)| u64::from(end) - u64::from(start) + 1)
                .sum();
            // draw a codepoint uniformly from the category; retry from scratch whenever the draw
            // is not a valid `char`
            let char_gen = |state: &mut S| loop {
                let mut selected = state.rand_mut().below(options);
                for &(min, max) in category {
                    if let Some(next_selected) =
                        selected.checked_sub(u64::from(max) - u64::from(min) + 1)
                    {
                        selected = next_selected;
                    } else if let Some(new_c) = char::from_u32(selected as u32 + min) {
                        return new_c;
                    } else {
                        break;
                    }
                }
            };
            return Ok(rand_replace_range(state, input, range, char_gen));
        }
        Ok(MutationResult::Skipped)
    }
 }
 /// Mutator which randomly replaces a randomly selected range of bytes with bytes that preserve the
 /// range's subcategory
 #[derive(Debug, Default)]
 pub struct StringSubcategoryRandMutator;
 impl Named for StringSubcategoryRandMutator {
    fn name(&self) -> &str {
        "string-subcategory-rand"
    }
 }
 impl<S> Mutator<UnicodeInput, S> for StringSubcategoryRandMutator
 where
    S: HasRand + HasMaxSize,
 {
    /// Selects a string region of the input, then a subcategory-contiguous run of chars within
    /// it, and replaces part of that run with random chars drawn from the same subcategory.
    ///
    /// Skips empty inputs and inputs in which no string region covers the chosen index.
    fn mutate(
        &mut self,
        state: &mut S,
        input: &mut UnicodeInput,
        _stage_idx: i32,
    ) -> Result<MutationResult, Error> {
        if input.0.bytes().is_empty() {
            return Ok(MutationResult::Skipped);
        }
        let bytes = input.0.bytes();
        let meta = &input.1;
        if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) {
            let substring = core::str::from_utf8(&bytes[base..][..len])?;
            let (range, subcategory) = choose_subcategory_range(state.rand_mut(), substring);
            // the range is relative to `substring`, which starts at `base`; rebase it so that it
            // addresses the same bytes in the whole input
            let range = (base + range.start)..(base + range.end);
            #[cfg(test)]
            println!(
                "{:?} => {:?}",
                range,
                core::str::from_utf8(&bytes[range.clone()])
            );
            // number of codepoints in the (inclusive) subcategory range
            let options: u64 = u64::from(subcategory.1) - u64::from(subcategory.0) + 1;
            // draw codepoints from the subcategory until one is a valid `char`
            let char_gen = |state: &mut S| loop {
                let selected = state.rand_mut().below(options);
                if let Some(new_c) = char::from_u32(selected as u32 + subcategory.0) {
                    return new_c;
                }
            };
            return Ok(rand_replace_range(state, input, range, char_gen));
        }
        Ok(MutationResult::Skipped)
    }
 }
 /// Mutator which randomly replaces a full category-contiguous region of chars with a random token
 #[derive(Debug, Default)]
 pub struct StringCategoryTokenReplaceMutator;
 impl Named for StringCategoryTokenReplaceMutator {
    fn name(&self) -> &str {
        "string-category-token-replace"
    }
 }
 impl<S> Mutator<UnicodeInput, S> for StringCategoryTokenReplaceMutator
 where
    S: HasRand + HasMaxSize + HasMetadata,
 {
    /// Selects a string region of the input, then a category-contiguous run of chars within it,
    /// and replaces that whole run with a randomly chosen token from the state's [`Tokens`].
    ///
    /// Skips empty inputs, states without (or with empty) [`Tokens`] metadata, inputs in which
    /// no string region covers the chosen index, and replacements which would exceed the
    /// state's maximum size.
    fn mutate(
        &mut self,
        state: &mut S,
        input: &mut UnicodeInput,
        _stage_idx: i32,
    ) -> Result<MutationResult, Error> {
        if input.0.bytes().is_empty() {
            return Ok(MutationResult::Skipped);
        }
        let tokens_len = match state.metadata_map().get::<Tokens>() {
            Some(tokens) if !tokens.tokens().is_empty() => tokens.tokens().len(),
            _ => return Ok(MutationResult::Skipped),
        };
        let token_idx = state.rand_mut().below(tokens_len as u64) as usize;
        let bytes = input.0.bytes();
        let meta = &input.1;
        if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) {
            let substring = core::str::from_utf8(&bytes[base..][..len])?;
            let (range, _) = choose_category_range(state.rand_mut(), substring);
            // the range is relative to `substring`, which starts at `base`; rebase it so that it
            // addresses the same bytes in the whole input
            let range = (base + range.start)..(base + range.end);
            #[cfg(test)]
            println!(
                "{:?} => {:?}",
                range,
                core::str::from_utf8(&bytes[range.clone()])
            );
            // presence of the tokens metadata was established above
            let meta = state.metadata_map().get::<Tokens>().unwrap();
            let token = &meta.tokens()[token_idx];
            if input.0.len() - (range.end - range.start) + token.len() > state.max_size() {
                return Ok(MutationResult::Skipped);
            }
            input.0.bytes_mut().splice(range, token.iter().copied());
            input.1 = extract_metadata(input.0.bytes());
            return Ok(MutationResult::Mutated);
        }
        Ok(MutationResult::Skipped)
    }
 }
 /// Mutator which randomly replaces a full subcategory-contiguous region of chars with a random token
 #[derive(Debug, Default)]
 pub struct StringSubcategoryTokenReplaceMutator;
 impl Named for StringSubcategoryTokenReplaceMutator {
    fn name(&self) -> &str {
        "string-subcategory-replace"
    }
 }
 impl<S> Mutator<UnicodeInput, S> for StringSubcategoryTokenReplaceMutator
 where
    S: HasRand + HasMaxSize + HasMetadata,
 {
    /// Selects a string region of the input, then a subcategory-contiguous run of chars within
    /// it, and replaces that whole run with a randomly chosen token from the state's [`Tokens`].
    ///
    /// Skips empty inputs, states without (or with empty) [`Tokens`] metadata, inputs in which
    /// no string region covers the chosen index, and replacements which would exceed the
    /// state's maximum size.
    fn mutate(
        &mut self,
        state: &mut S,
        input: &mut UnicodeInput,
        _stage_idx: i32,
    ) -> Result<MutationResult, Error> {
        if input.0.bytes().is_empty() {
            return Ok(MutationResult::Skipped);
        }
        let tokens_len = match state.metadata_map().get::<Tokens>() {
            Some(tokens) if !tokens.tokens().is_empty() => tokens.tokens().len(),
            _ => return Ok(MutationResult::Skipped),
        };
        let token_idx = state.rand_mut().below(tokens_len as u64) as usize;
        let bytes = input.0.bytes();
        let meta = &input.1;
        if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) {
            let substring = core::str::from_utf8(&bytes[base..][..len])?;
            let (range, _) = choose_subcategory_range(state.rand_mut(), substring);
            // the range is relative to `substring`, which starts at `base`; rebase it so that it
            // addresses the same bytes in the whole input
            let range = (base + range.start)..(base + range.end);
            #[cfg(test)]
            println!(
                "{:?} => {:?}",
                range,
                core::str::from_utf8(&bytes[range.clone()])
            );
            // presence of the tokens metadata was established above
            let meta = state.metadata_map().get::<Tokens>().unwrap();
            let token = &meta.tokens()[token_idx];
            if input.0.len() - (range.end - range.start) + token.len() > state.max_size() {
                return Ok(MutationResult::Skipped);
            }
            input.0.bytes_mut().splice(range, token.iter().copied());
            input.1 = extract_metadata(input.0.bytes());
            return Ok(MutationResult::Mutated);
        }
        Ok(MutationResult::Skipped)
    }
 }
 // Smoke tests: repeatedly mutate an ASCII hex string and check that the result is still valid
 // UTF-8 after every round.
 #[cfg(test)]
 mod test {
    use libafl_bolts::rands::StdRand;
    use super::*;
    use crate::{corpus::NopCorpus, stages::extract_metadata, state::StdState};
    // a not-so-useful test for this
    /// Applies `StringCategoryRandMutator` 4096 times to the same, evolving input.
    #[test]
    fn mutate_hex() {
        let result: Result<(), Error> = (|| {
            let hex = "0123456789abcdef0123456789abcdef";
            let mut bytes = BytesInput::from(hex.as_bytes());
            let mut mutator = StringCategoryRandMutator;
            // fixed seed, so every run draws the same random sequence
            let mut state = StdState::new(
                StdRand::with_seed(0),
                NopCorpus::<BytesInput>::new(),
                NopCorpus::new(),
                &mut (),
                &mut (),
            )?;
            for _ in 0..(1 << 12) {
                let metadata = extract_metadata(bytes.bytes());
                let mut input = (bytes, metadata);
                // the mutation result (including errors) is deliberately ignored
                let _ = mutator.mutate(&mut state, &mut input, 0);
                // panics if the mutation produced invalid UTF-8
                println!("{:?}", core::str::from_utf8(input.0.bytes()).unwrap());
                bytes = input.0;
            }
            Ok(())
        })();
        if let Err(e) = result {
            panic!("failed with error: {e}");
        }
    }
    /// Applies `StringSubcategoryRandMutator` 4096 times to the same, evolving input.
    #[test]
    fn mutate_hex_subcat() {
        let result: Result<(), Error> = (|| {
            let hex = "0123456789abcdef0123456789abcdef";
            let mut bytes = BytesInput::from(hex.as_bytes());
            let mut mutator = StringSubcategoryRandMutator;
            // fixed seed, so every run draws the same random sequence
            let mut state = StdState::new(
                StdRand::with_seed(0),
                NopCorpus::<BytesInput>::new(),
                NopCorpus::new(),
                &mut (),
                &mut (),
            )?;
            for _ in 0..(1 << 12) {
                let metadata = extract_metadata(bytes.bytes());
                let mut input = (bytes, metadata);
                // the mutation result (including errors) is deliberately ignored
                let _ = mutator.mutate(&mut state, &mut input, 0);
                // panics if the mutation produced invalid UTF-8
                println!("{:?}", core::str::from_utf8(input.0.bytes()).unwrap());
                bytes = input.0;
            }
            Ok(())
        })();
        if let Err(e) = result {
            panic!("failed with error: {e}");
        }
    }
 }
--- a/libafl/src/stages/mod.rs
+++ b/libafl/src/stages/mod.rs
@ -49,6 +49,11 @@ pub use concolic::ConcolicTracingStage;
 #[cfg(feature = "std")]
 pub use concolic::SimpleConcolicMutationalStage;
 #[cfg(feature = "unicode")]
 pub mod string;
 #[cfg(feature = "unicode")]
 pub use string::*;
 #[cfg(feature = "std")]
 pub mod sync;
 #[cfg(feature = "std")]
@ -56,6 +61,7 @@ pub use sync::*;
 #[cfg(feature = "std")]
 pub mod dump;
 use core::{convert::From, marker::PhantomData};
 #[cfg(feature = "std")]
--- a/libafl/src/stages/string.rs
+++ b/libafl/src/stages/string.rs
@ -0,0 +1,128 @@
 //! Stages which perform analysis common to Unicode-style mutations
 use alloc::{collections::VecDeque, rc::Rc, vec::Vec};
 use core::marker::PhantomData;
 use bitvec::{bitvec, vec::BitVec};
 use libafl_bolts::{impl_serdeany, Error};
 use serde::{Deserialize, Serialize};
 use crate::{
    corpus::{CorpusId, HasTestcase},
    inputs::{BytesInput, HasBytesVec, UsesInput},
    stages::Stage,
    state::{HasCorpus, HasMetadata, UsesState},
 };
 /// Metadata which stores the list of pre-computed string-like ranges in the input
 #[derive(Debug, Default, Serialize, Deserialize, Clone)]
 pub struct StringIdentificationMetadata {
    // Each entry pairs the offset at which a string-like region starts in the input with a bit
    // vector holding one bit per byte of that region; a set bit marks a byte at which a
    // character begins. The list sits behind an `Rc`, so cloning the metadata shares it.
    ranges: Rc<Vec<(usize, BitVec)>>,
 }
 // Registers the metadata type with the `SerdeAny` machinery via `impl_serdeany!`.
 impl_serdeany!(StringIdentificationMetadata);
 impl StringIdentificationMetadata {
    /// Borrows the pre-computed string-like ranges found in the input
    #[must_use]
    pub fn ranges(&self) -> &Vec<(usize, BitVec)> {
        // deref coercion turns the `&Rc<Vec<_>>` into the `&Vec<_>` being returned
        &self.ranges
    }
 }
 /// Identifies the regions of `bytes` which decode as UTF-8.
 ///
 /// Every entry of the returned metadata is a pair `(start, entries)`: `start` is the offset in
 /// `bytes` at which a non-empty run of valid UTF-8 begins, and `entries` holds one bit per byte
 /// of that run, set exactly at the offsets (relative to `start`) where a character begins.
 /// Runs which decode to an empty string are not recorded, and empty input yields no ranges.
 pub(crate) fn extract_metadata(bytes: &[u8]) -> StringIdentificationMetadata {
    let mut ranges = Vec::new();
    if !bytes.is_empty() {
        // worklist of absolute offsets into `bytes` from which decoding is still to be attempted
        let mut queue = VecDeque::new();
        let mut visited = bitvec![0; bytes.len()];
        queue.push_back(0);
        while let Some(i) = queue.pop_front() {
            if i >= bytes.len() || visited[i] {
                // if we've already visited a particular entry, then we already know its range(s)
                continue;
            }
            visited.set(i, true); // we always visit the current entry
            let s = match core::str::from_utf8(&bytes[i..]) {
                Ok(s) => s,
                Err(e) => {
                    let valid = e.valid_up_to();
                    queue.push_back(i + valid + 1); // push to the next region
                    // cannot fail: `valid_up_to` is the length of the prefix which did decode
                    core::str::from_utf8(&bytes[i..][..valid]).unwrap()
                }
            };
            if !s.is_empty() {
                let mut entries = bitvec![0; s.len()];
                for (c_idx, _) in s.char_indices() {
                    entries.set(c_idx, true);
                    visited.set(i + c_idx, true);
                }
                for unset in entries.iter_zeros() {
                    // each unset index potentially represents a new UTF-8 start point; it is
                    // relative to `i`, whereas the queue holds absolute offsets into `bytes`
                    queue.push_back(i + unset);
                }
                ranges.push((i, entries));
            }
        }
    }
    StringIdentificationMetadata {
        ranges: Rc::new(ranges),
    }
 }
 /// Stage which identifies potential strings in the provided input
 #[derive(Debug)]
 pub struct StringIdentificationStage<S> {
    // zero-sized marker tying the stage to its state type `S`
    phantom: PhantomData<S>,
 }
 impl<S> Default for StringIdentificationStage<S> {
    /// Builds the same value as [`StringIdentificationStage::new`]
    fn default() -> Self {
        Self {
            phantom: PhantomData,
        }
    }
 }
 impl<S> StringIdentificationStage<S> {
    /// Create a new instance of the string identification stage
    #[must_use]
    pub fn new() -> Self {
        let phantom = PhantomData;
        Self { phantom }
    }
 }
 // Binds the stage's associated `State` type to its type parameter `S`.
 impl<S> UsesState for StringIdentificationStage<S>
 where
    S: UsesInput,
 {
    type State = S;
 }
 impl<S, E, EM, Z> Stage<E, EM, Z> for StringIdentificationStage<S>
 where
    S: HasTestcase<Input = BytesInput> + HasCorpus,
    E: UsesState<State = S>,
    EM: UsesState<State = S>,
    Z: UsesState<State = S>,
 {
    /// Attaches [`StringIdentificationMetadata`] to the testcase at `corpus_idx`.
    ///
    /// A testcase which already carries the metadata is left untouched. Fetching the testcase
    /// and loading its input are the fallible steps; their errors are propagated to the caller.
    fn perform(
        &mut self,
        _fuzzer: &mut Z,
        _executor: &mut E,
        state: &mut Self::State,
        _manager: &mut EM,
        corpus_idx: CorpusId,
    ) -> Result<(), Error> {
        let mut testcase = state.testcase_mut(corpus_idx)?;
        if !testcase.has_metadata::<StringIdentificationMetadata>() {
            // only analyse inputs which have no stored ranges yet
            let metadata = extract_metadata(testcase.load_input(state.corpus())?.bytes());
            testcase.add_metadata(metadata);
        }
        Ok(())
    }
 }
--- a/libafl_bolts/src/lib.rs
+++ b/libafl_bolts/src/lib.rs
@ -169,11 +169,6 @@ use log::{Metadata, Record};
 /// out of `libafl_bolts` into `libafl::events::launcher`.
 pub mod launcher {}
 // Re-export derive(SerdeAny)
 #[cfg(feature = "libafl_derive")]
 #[allow(unused_imports)]
 #[macro_use]
 extern crate libafl_derive;
 use core::{
    array::TryFromSliceError,
    fmt::{self, Display},
@ -190,6 +185,7 @@ pub use libafl_derive::SerdeAny;
 use {
    alloc::string::{FromUtf8Error, String},
    core::cell::{BorrowError, BorrowMutError},
    core::str::Utf8Error,
 };
 /// We need fixed names for many parts of this lib.
@ -505,6 +501,14 @@ impl From<FromUtf8Error> for Error {
    }
 }
 /// Turns a [`Utf8Error`] into an error built with `Error::unknown`, embedding the debug
 /// representation of the source error in the message.
 #[cfg(feature = "alloc")]
 impl From<Utf8Error> for Error {
    #[allow(unused_variables)]
    fn from(err: Utf8Error) -> Self {
        let message = format!("Could not convert byte / utf-8: {err:?}");
        Self::unknown(message)
    }
 }
 #[cfg(feature = "std")]
 impl From<VarError> for Error {
    #[allow(unused_variables)]
--- a/libafl_bolts/src/serdeany.rs
+++ b/libafl_bolts/src/serdeany.rs
@ -85,7 +85,7 @@ macro_rules! create_serde_registry_for_trait {
                Error,
            };
-            /// Visitor object used internally for the [`SerdeAny`] registry.
+            /// Visitor object used internally for the [`crate::serdeany::SerdeAny`] registry.
            #[derive(Debug)]
            pub struct BoxDynVisitor {}
            #[allow(unused_qualifications)]
@ -319,7 +319,7 @@ macro_rules! create_serde_registry_for_trait {
                }
            }
-            /// A serializable [`HashMap`] wrapper for [`SerdeAny`] types, addressable by name.
+            /// A serializable [`HashMap`] wrapper for [`crate::serdeany::SerdeAny`] types, addressable by name.
            #[allow(clippy::unsafe_derive_deserialize)]
            #[allow(unused_qualifications)]
            #[derive(Debug, Serialize, Deserialize)]
--- a/libafl_libfuzzer/libafl_libfuzzer_runtime/Cargo.toml
+++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/Cargo.toml
@ -30,7 +30,7 @@ path = "src/lib.rs"
 crate-type = ["staticlib", "rlib"]
 [dependencies]
-libafl = { path = "../../libafl", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "regex", "errors_backtrace", "serdeany_autoreg", "tui_monitor"] }
+libafl = { path = "../../libafl", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "regex", "errors_backtrace", "serdeany_autoreg", "tui_monitor", "unicode"] }
 libafl_bolts = { path = "../../libafl_bolts", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "serdeany_autoreg", "errors_backtrace"] }
 libafl_targets = { path = "../../libafl_targets", features = ["sancov_8bit", "sancov_cmplog", "libfuzzer", "libfuzzer_oom", "libfuzzer_define_run_driver", "libfuzzer_interceptors", "sanitizers_flags", "whole_archive"] }
--- a/libafl_libfuzzer/libafl_libfuzzer_runtime/src/lib.rs
+++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/src/lib.rs
@ -166,7 +166,8 @@ macro_rules! fuzz_with {
            mutators::{
                GrimoireExtensionMutator, GrimoireRecursiveReplacementMutator, GrimoireRandomDeleteMutator,
                GrimoireStringReplacementMutator, havoc_crossover, havoc_mutations, havoc_mutations_no_crossover,
-                I2SRandReplace, StdScheduledMutator, Tokens, tokens_mutations
+                I2SRandReplace, StdScheduledMutator, StringCategoryRandMutator, StringSubcategoryRandMutator,
                StringCategoryTokenReplaceMutator, StringSubcategoryTokenReplaceMutator, Tokens, tokens_mutations
            },
            observers::{stacktrace::BacktraceObserver, TimeObserver},
            schedulers::{
@ -174,7 +175,7 @@ macro_rules! fuzz_with {
            },
            stages::{
                CalibrationStage, GeneralizationStage, IfStage, StdMutationalStage,
-                StdPowerMutationalStage, TracingStage,
+                StdPowerMutationalStage, StringIdentificationStage, TracingStage,
            },
            state::{HasCorpus, StdState},
            StdFuzzer,
@ -224,7 +225,7 @@ macro_rules! fuzz_with {
            // Set up a generalization stage for grimoire
            let generalization = GeneralizationStage::new(&edges_observer);
-            let generalization = IfStage::new(|_, _, _, _, _| Ok(grimoire.into()), (generalization, ()));
+            let generalization = IfStage::new(|_, _, _, _, _| Ok(grimoire.into()), tuple_list!(generalization));
            let calibration = CalibrationStage::new(&map_feedback);
@ -296,6 +297,32 @@ macro_rules! fuzz_with {
            });
            state.metadata_map_mut().insert_boxed(grimoire_metadata);
            // Set up a string category analysis stage for unicode mutations
            let unicode_used = $options.unicode();
            let string_mutator = StdScheduledMutator::new(
                tuple_list!(
                    StringCategoryRandMutator,
                    StringSubcategoryRandMutator,
                    StringSubcategoryRandMutator,
                    StringSubcategoryRandMutator,
                    StringSubcategoryRandMutator,
                )
            );
            let string_replace_mutator = StdScheduledMutator::new(
                tuple_list!(
                    StringCategoryTokenReplaceMutator,
                    StringSubcategoryTokenReplaceMutator,
                    StringSubcategoryTokenReplaceMutator,
                    StringSubcategoryTokenReplaceMutator,
                    StringSubcategoryTokenReplaceMutator,
                )
            );
            let string_power = StdMutationalStage::transforming(string_mutator);
            let string_replace_power = StdMutationalStage::transforming(string_replace_mutator);
            let string_analysis = StringIdentificationStage::new();
            let string_analysis = IfStage::new(|_, _, _, _, _| Ok((unicode_used && mutator_status.std_mutational).into()), tuple_list!(string_analysis, string_power, string_replace_power));
            // Attempt to use tokens from libfuzzer dicts
            if !state.has_metadata::<Tokens>() {
                let mut toks = if let Some(tokens) = $options.dict() {
@ -466,6 +493,7 @@ macro_rules! fuzz_with {
                calibration,
                generalization,
                tracing,
                string_analysis,
                i2s,
                cm_i2s,
                std_power,
--- a/libafl_libfuzzer/libafl_libfuzzer_runtime/src/options.rs
+++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/src/options.rs
@ -107,6 +107,7 @@ pub struct LibfuzzerOptions {
    artifact_prefix: ArtifactPrefix,
    timeout: Duration,
    grimoire: Option<bool>,
    unicode: bool,
    forks: Option<usize>,
    dict: Option<Tokens>,
    dirs: Vec<PathBuf>,
@ -162,6 +163,10 @@ impl LibfuzzerOptions {
        self.grimoire
    }
    /// Returns the value of the `unicode` option.
    pub fn unicode(&self) -> bool {
        self.unicode
    }
    /// Returns the value of the `forks` option, or `None` when none is stored.
    pub fn forks(&self) -> Option<usize> {
        self.forks
    }
@ -230,6 +235,7 @@ struct LibfuzzerOptionsBuilder<'a> {
    artifact_prefix: Option<&'a str>,
    timeout: Option<Duration>,
    grimoire: Option<bool>,
    unicode: Option<bool>,
    forks: Option<usize>,
    dict: Option<&'a str>,
    dirs: Vec<&'a str>,
@ -292,6 +298,7 @@ impl<'a> LibfuzzerOptionsBuilder<'a> {
                            }
                        }
                        "grimoire" => self.grimoire = Some(parse_or_bail!(name, value, u64) > 0),
                        "unicode" => self.unicode = Some(parse_or_bail!(name, value, u64) > 0),
                        "artifact_prefix" => {
                            self.artifact_prefix = Some(value);
                        }
@ -349,6 +356,7 @@ impl<'a> LibfuzzerOptionsBuilder<'a> {
                .unwrap_or_default(),
            timeout: self.timeout.unwrap_or(Duration::from_secs(1200)),
            grimoire: self.grimoire,
            unicode: self.unicode.unwrap_or(true),
            forks: self.forks,
            dict: self.dict.map(|path| {
                Tokens::from_file(path).expect("Couldn't load tokens from specified dictionary")