Unicode-preserving mutators (#1542)

* create the string classification stage * modify API to pre-group * preserving mutator * more meaningful test * subproperty mutators + some fixes * document, finalise, integrate with libafl_libfuzzer * add example, fix for weird range select * fix for introspection * fix fuzzer build * speed optimisation: allow, but do not require, stacking * property => category * token replacement * fixup: rare case where rust does not agree on valid character * fix CI again * again again * take two: dynamic unicode discovery * oops * fix: last byte is never selected * opt: bias to smaller unicode categories * fix test * opt: precompute regions and fix tests * cache and allow stacking * document and update libafl_libfuzzer * oops, use reverse * fix bolts clippy error * fixup part 2 * clippy * part 2 * clippy warning allow * clippy complaint * use alloc not std --------- Co-authored-by: toka <tokazerkje@outlook.com>
2023-11-21 00:41:16 +01:00 · 2023-11-21 00:41:16 +01:00 · 281524dbf9
commit 281524dbf9
parent 1e96652ed2
16 changed files with 1037 additions and 14 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@ -73,6 +73,8 @@ jobs:
      run: command -v llvm-config-15 && clang-15 -v
    - name: Add nightly rustfmt and clippy
      run: rustup toolchain install nightly --component rustfmt --component clippy --component miri --allow-downgrade
+    - name: Install ucd-generate
+      run: cargo install -f ucd-generate
    - uses: actions/checkout@v3
    - uses: Swatinem/rust-cache@v2

@ -135,6 +137,8 @@ jobs:
      run: command -v llvm-config-15 && clang-15 -v
    - name: Install cargo-hack
      run: curl -LsSf https://github.com/taiki-e/cargo-hack/releases/latest/download/cargo-hack-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
+    - name: Install ucd-generate
+      run: cargo install -f ucd-generate
    - name: Add nightly
      run: rustup toolchain install nightly --allow-downgrade
    - uses: actions/checkout@v3
@ -222,6 +226,8 @@ jobs:
    - name: Install cxxbridge
      if: runner.os == 'macOS'
      run: cargo install cxxbridge-cmd
+    - name: Install ucd-generate
+      run: cargo install -f ucd-generate
    - name: Install python (macOS)
      # Removing macOS things already installed in CI against failed linking
      if: runner.os == 'macOS'
@ -384,6 +390,8 @@ jobs:
        toolchain: stable
    - name: Add nightly rustfmt and clippy
      run: rustup toolchain install nightly --component rustfmt --component clippy --allow-downgrade
+    - name: Install ucd-generate
+      run: cargo install -f ucd-generate
    - name: Install deps
      run: brew install z3 gtk+3
    - name: Install cxxbridge
@ -453,6 +461,7 @@ jobs:
          freebsd-version
          . "$HOME/.cargo/env"
          rustup toolchain install nightly
+          cargo install -f ucd-generate
          export LLVM_CONFIG=/usr/local/bin/llvm-config16
          pwd
          ls -lah
--- a/fuzzers/baby_fuzzer_unicode/.gitignore
+++ b/fuzzers/baby_fuzzer_unicode/.gitignore
@ -0,0 +1 @@
+libpng-*
--- a/fuzzers/baby_fuzzer_unicode/Cargo.toml
+++ b/fuzzers/baby_fuzzer_unicode/Cargo.toml
@ -0,0 +1,24 @@
+[package]
+name = "baby_fuzzer_unicode"
+version = "0.10.0"
+authors = ["Andrea Fioraldi <andreafioraldi@gmail.com>", "Dominik Maier <domenukk@gmail.com>"]
+edition = "2021"
+
+[features]
+default = ["std"]
+tui = []
+std = []
+
+[profile.dev]
+panic = "abort"
+
+[profile.release]
+panic = "abort"
+lto = true
+codegen-units = 1
+opt-level = 3
+debug = true
+
+[dependencies]
+libafl = { path = "../../libafl/", features = ["unicode"] }
+libafl_bolts = { path = "../../libafl_bolts/" }
--- a/fuzzers/baby_fuzzer_unicode/README.md
+++ b/fuzzers/baby_fuzzer_unicode/README.md
@ -0,0 +1,15 @@
+# Baby fuzzer: unicode
+
+This is a minimalistic example about how to create a libafl based fuzzer.
+
+It runs on a single core until a crash occurs and then exits.
+
+The tested program is a simple Rust function without any instrumentation.
+For real fuzzing, you will want to add some sort of coverage or other feedback.
+
+You can run this example using `cargo run`, and you can enable the TUI feature by running `cargo run --features tui`.
+
+## Unicode
+
+This fuzzer uses mutators which preserve unicode properties. For programs which have string-heavy inputs, you may
+consider using the same strategy.
--- a/fuzzers/baby_fuzzer_unicode/src/main.rs
+++ b/fuzzers/baby_fuzzer_unicode/src/main.rs
@ -0,0 +1,138 @@
+#[cfg(windows)]
+use std::ptr::write_volatile;
+use std::{path::PathBuf, ptr::write};
+
+#[cfg(feature = "tui")]
+use libafl::monitors::tui::{ui::TuiUI, TuiMonitor};
+#[cfg(not(feature = "tui"))]
+use libafl::monitors::SimpleMonitor;
+use libafl::{
+    corpus::{InMemoryCorpus, OnDiskCorpus},
+    events::SimpleEventManager,
+    executors::{inprocess::InProcessExecutor, ExitKind},
+    feedbacks::{CrashFeedback, MaxMapFeedback},
+    fuzzer::{Fuzzer, StdFuzzer},
+    inputs::{BytesInput, HasTargetBytes},
+    mutators::{StdScheduledMutator, StringCategoryRandMutator, StringSubcategoryRandMutator},
+    observers::StdMapObserver,
+    schedulers::QueueScheduler,
+    stages::{mutational::StdMutationalStage, StringIdentificationStage},
+    state::StdState,
+    Evaluator,
+};
+use libafl_bolts::{current_nanos, rands::StdRand, tuples::tuple_list, AsSlice};
+
+/// Coverage map with explicit assignments due to the lack of instrumentation
+static mut SIGNALS: [u8; 64] = [0; 64];
+/// Raw pointer to the first entry of `SIGNALS`; used by `signals_set` and by the map observer
+static mut SIGNALS_PTR: *mut u8 = unsafe { SIGNALS.as_mut_ptr() };
+
+/// Assign a signal to the signals map by writing `1` at offset `idx`.
+///
+/// `idx` is not bounds-checked here: callers must keep it below the 64 entries of `SIGNALS`.
+fn signals_set(idx: usize) {
+    // SAFETY: relies on the caller passing an in-bounds `idx`; the write goes through the raw
+    // pointer and is unchecked.
+    unsafe { write(SIGNALS_PTR.add(idx), 1) };
+}
+
+/// Entry point: wires up a toy byte-comparison harness, a signals-map observer and the
+/// unicode-preserving mutators, then runs the fuzzing loop.
+#[allow(clippy::similar_names, clippy::manual_assert)]
+pub fn main() {
+    // The closure that we want to fuzz
+    let mut harness = |input: &BytesInput| {
+        let target = input.target_bytes();
+        let buf = target.as_slice();
+        let goal = b"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
+        let mut i = 0;
+        // set one signal per leading byte that matches `goal`, so a longer matching prefix
+        // lights up more of the map
+        for _ in buf.iter().zip(goal).take_while(|(b, c)| b == c) {
+            signals_set(i);
+            i += 1;
+        }
+        // the whole goal was matched: trigger the artificial crash
+        if i == goal.len() {
+            #[cfg(unix)]
+            panic!("Artificial bug triggered =)");
+
+            #[cfg(windows)]
+            unsafe {
+                write_volatile(0 as *mut u32, 0);
+            }
+        }
+        ExitKind::Ok
+    };
+
+    // Create an observation channel using the signals map
+    let observer = unsafe { StdMapObserver::from_mut_ptr("signals", SIGNALS_PTR, SIGNALS.len()) };
+
+    // Feedback to rate the interestingness of an input
+    let mut feedback = MaxMapFeedback::new(&observer);
+
+    // A feedback to choose if an input is a solution or not
+    let mut objective = CrashFeedback::new();
+
+    // create a State from scratch
+    let mut state = StdState::new(
+        // RNG
+        StdRand::with_seed(current_nanos()),
+        // Corpus that will be evolved, we keep it in memory for performance
+        InMemoryCorpus::new(),
+        // Corpus in which we store solutions (crashes in this example),
+        // on disk so the user can get them after stopping the fuzzer
+        OnDiskCorpus::new(PathBuf::from("./crashes")).unwrap(),
+        // States of the feedbacks.
+        // The feedbacks can report the data that should persist in the State.
+        &mut feedback,
+        // Same for objective feedbacks
+        &mut objective,
+    )
+    .unwrap();
+
+    // The Monitor trait defines how the fuzzer stats are displayed to the user
+    #[cfg(not(feature = "tui"))]
+    let mon = SimpleMonitor::new(|s| println!("{s}"));
+    #[cfg(feature = "tui")]
+    let ui = TuiUI::with_version(String::from("Baby Fuzzer"), String::from("0.0.1"), false);
+    #[cfg(feature = "tui")]
+    let mon = TuiMonitor::new(ui);
+
+    // The event manager handles the various events generated during the fuzzing loop
+    // such as the notification of the addition of a new item to the corpus
+    let mut mgr = SimpleEventManager::new(mon);
+
+    // A queue policy to get testcases from the corpus
+    let scheduler = QueueScheduler::new();
+
+    // A fuzzer with feedbacks and a corpus scheduler
+    let mut fuzzer = StdFuzzer::new(scheduler, feedback, objective);
+
+    // Create the executor for an in-process function with just one observer
+    let mut executor = InProcessExecutor::new(
+        &mut harness,
+        tuple_list!(observer),
+        &mut fuzzer,
+        &mut state,
+        &mut mgr,
+    )
+    .expect("Failed to create the Executor");
+
+    // Seed the fuzzer with a single initial input: the one-byte string "a"
+    fuzzer
+        .evaluate_input(
+            &mut state,
+            &mut executor,
+            &mut mgr,
+            BytesInput::new(vec![b'a']),
+        )
+        .unwrap();
+
+    // Setup a mutational stage with the unicode-preserving mutators.
+    // The subcategory mutator is listed four times, presumably to weight it more heavily
+    // when the scheduled mutator picks among the entries.
+    let mutator = StdScheduledMutator::new(tuple_list!(
+        StringCategoryRandMutator,
+        StringSubcategoryRandMutator,
+        StringSubcategoryRandMutator,
+        StringSubcategoryRandMutator,
+        StringSubcategoryRandMutator
+    ));
+    // The identification stage is placed before the mutational stage; the mutators require
+    // string metadata on the testcase, which this stage presumably attaches.
+    let mut stages = tuple_list!(
+        StringIdentificationStage::new(),
+        StdMutationalStage::transforming(mutator)
+    );
+
+    fuzzer
+        .fuzz_loop(&mut stages, &mut executor, &mut state, &mut mgr)
+        .expect("Error in the fuzzing loop");
+}
--- a/libafl/Cargo.toml
+++ b/libafl/Cargo.toml
@ -77,6 +77,9 @@ concolic_mutation = ["z3"]
 ## Enable the fancy TuiMonitor for a termanal UI using crossterm
 tui_monitor = ["ratatui", "crossterm"]

+## Enables `StringIdentificationStage` and associated mutators, which allow for mutations which preserve the Unicode property data
+unicode = ["libafl_bolts/alloc", "ahash/std", "serde/rc", "bitvec"]
+

 #! ## LibAFL-Bolts Features

@ -126,7 +129,9 @@ agpl = ["nautilus"]
 nautilus = ["grammartec", "std", "serde_json/std"]

 [build-dependencies]
+reqwest = { version = "0.11", features = ["blocking"] }
 rustversion = "1.0"
+zip = "0.6"

 [dev-dependencies]
 serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
@ -172,7 +177,9 @@ z3 = { version = "0.12.0", features = ["static-link-z3"], optional = true } # fo
 pyo3 = { version = "0.18", optional = true, features = ["serde", "macros"] }
 concat-idents = { version = "1.1.3", optional = true }

-libcasr = { version = "2.7", optional = true}
+libcasr = { version = "2.7", optional = true }
+
+bitvec = { version = "1.0", optional = true, features = ["serde"] } # used for string range storage

 # optional-dev deps (change when target.'cfg(accessible(::std))'.test-dependencies will be stable)
 serial_test = { version = "2", optional = true, default-features = false, features = ["logging"] }
--- a/libafl/build.rs
+++ b/libafl/build.rs
@ -1,14 +1,69 @@
+use std::error::Error;
+
 #[rustversion::nightly]
-fn main() {
+fn main() -> Result<(), Box<dyn Error>> {
    println!("cargo:rerun-if-changed=build.rs");
    println!("cargo:rustc-cfg=nightly");
+    #[cfg(feature = "unicode")]
+    {
+        build_unicode_property_map()?;
+    }
+    Ok(())
 }

 #[rustversion::not(nightly)]
-fn main() {
+fn main() -> Result<(), Box<dyn Error>> {
    println!("cargo:rerun-if-changed=build.rs");
    assert!(
        cfg!(all(not(docrs), not(feature = "nautilus"))),
        "The 'nautilus' feature of libafl requires a nightly compiler"
    );
+    #[cfg(feature = "unicode")]
+    {
+        build_unicode_property_map()?;
+    }
+    Ok(())
+}
+
+/// Generates `unicode_categories.rs` in `OUT_DIR` for the `unicode` feature.
+///
+/// Downloads the latest Unicode Character Database archive, unpacks it into
+/// `OUT_DIR/ucd-dir` and runs the external `ucd-generate general-category` tool over it,
+/// capturing the tool's stdout as the generated source file.
+///
+/// # Errors
+///
+/// Fails if `OUT_DIR` is unset, if the download returns an error status or cannot be read,
+/// if any file or archive operation fails, if `ucd-generate` cannot be spawned, or if it
+/// exits unsuccessfully.
+#[cfg(feature = "unicode")]
+fn build_unicode_property_map() -> Result<(), Box<dyn Error>> {
+    use std::{
+        env,
+        fs::{self, File},
+        path::PathBuf,
+        process::{Command, Stdio},
+    };
+
+    let out_dir = PathBuf::from(
+        env::var_os("OUT_DIR").ok_or("OUT_DIR is not set; this must run as a cargo build script")?,
+    );
+    let ucd_dir = out_dir.join("ucd-dir");
+    let generated_file = out_dir.join("unicode_categories.rs");
+
+    fs::create_dir_all(&ucd_dir)?;
+
+    // Reject HTTP error responses up front instead of saving an error page as `ucd.zip`.
+    // `bytes()` buffers the whole body, so it can be written out in a single call.
+    let zip_path = ucd_dir.join("ucd.zip");
+    let archive = reqwest::blocking::get("https://www.unicode.org/Public/zipped/latest/UCD.zip")?
+        .error_for_status()?
+        .bytes()?;
+    fs::write(&zip_path, &archive)?;
+
+    let mut zip_file = zip::ZipArchive::new(File::open(&zip_path)?)?;
+    zip_file.extract(&ucd_dir)?;
+    drop(zip_file);
+
+    fs::remove_file(zip_path)?;
+
+    let status = Command::new("ucd-generate")
+        .arg("general-category")
+        .arg(ucd_dir.as_os_str())
+        .stdout(Stdio::from(File::create(generated_file)?))
+        .status()
+        .map_err(|e| {
+            format!("failed to run `ucd-generate` (install it with `cargo install ucd-generate`): {e}")
+        })?;
+    if !status.success() {
+        return Err(format!("`ucd-generate general-category` failed: {status}").into());
+    }
+
+    Ok(())
 }
--- a/libafl/src/mutators/mod.rs
+++ b/libafl/src/mutators/mod.rs
@ -20,6 +20,11 @@ pub use grimoire::*;
 pub mod tuneable;
 pub use tuneable::*;

+#[cfg(feature = "unicode")]
+pub mod string;
+#[cfg(feature = "unicode")]
+pub use string::*;
+
 #[cfg(feature = "nautilus")]
 pub mod nautilus;
 use alloc::vec::Vec;
--- a/libafl/src/mutators/string.rs
+++ b/libafl/src/mutators/string.rs
@ -0,0 +1,595 @@
+//! Mutators for preserving string categories, which may be useful for certain targets which are primarily string-oriented.
+use alloc::vec::Vec;
+use core::{
+    cmp::{Ordering, Reverse},
+    ops::Range,
+};
+
+use libafl_bolts::{rands::Rand, Error, HasLen, Named};
+
+use crate::{
+    corpus::{CorpusId, HasTestcase, Testcase},
+    inputs::{BytesInput, HasBytesVec},
+    mutators::{rand_range, MutationResult, Mutator, Tokens},
+    stages::{
+        extract_metadata,
+        mutational::{MutatedTransform, MutatedTransformPost},
+        StringIdentificationMetadata,
+    },
+    state::{HasCorpus, HasMaxSize, HasMetadata, HasRand},
+};
+
+/// Input which contains the context necessary to perform unicode mutations
+///
+/// The first element is the raw input, the second the string metadata computed for it.
+pub type UnicodeInput = (BytesInput, StringIdentificationMetadata);
+
+/// Lets a mutational stage hand a `BytesInput` testcase to the unicode mutators together with
+/// its string metadata, and turn the mutated pair back into an input plus post-exec data.
+impl<S> MutatedTransform<BytesInput, S> for UnicodeInput
+where
+    S: HasCorpus<Input = BytesInput> + HasTestcase,
+{
+    /// The (re-computed) metadata travels with the mutated input as the post-exec payload
+    type Post = StringIdentificationMetadata;
+
+    /// Clones the testcase's input and its `StringIdentificationMetadata`.
+    ///
+    /// Returns an error if the input cannot be loaded or if the testcase carries no
+    /// `StringIdentificationMetadata`.
+    fn try_transform_from(
+        base: &mut Testcase<BytesInput>,
+        state: &S,
+        _corpus_idx: CorpusId,
+    ) -> Result<Self, Error> {
+        let input = base.load_input(state.corpus())?.clone();
+        let metadata = base.metadata::<StringIdentificationMetadata>().cloned()?;
+        Ok((input, metadata))
+    }
+
+    /// The tuple already has the `(input, post)` shape, so it is returned unchanged.
+    fn try_transform_into(self, _state: &S) -> Result<(BytesInput, Self::Post), Error> {
+        Ok(self)
+    }
+}
+
+/// After execution, stores the already-computed string metadata on the testcase that was
+/// created for the mutated input, if there is one.
+impl<S> MutatedTransformPost<S> for StringIdentificationMetadata
+where
+    S: HasTestcase,
+{
+    fn post_exec(
+        self,
+        state: &mut S,
+        _stage_idx: i32,
+        corpus_idx: Option<CorpusId>,
+    ) -> Result<(), Error> {
+        match corpus_idx {
+            // the input entered the corpus: attach the metadata to its testcase
+            Some(new_idx) => {
+                state.testcase_mut(new_idx)?.add_metadata(self);
+                Ok(())
+            }
+            // nothing was added, so there is nowhere to store the metadata
+            None => Ok(()),
+        }
+    }
+}
+
+const MAX_CHARS: usize = 16;
+
+/// Picks a random byte position in `bytes` and selects one of the metadata ranges whose bit
+/// for that position is set.
+///
+/// Returns the selected range's start offset and the length of its bit vector, or `None` if
+/// no range qualifies. With several candidates, they are sorted by ascending number of set
+/// bits and the index is drawn as `isqrt(below(n * n))`, which favours the later entries.
+fn choose_start<R: Rand>(
+    rand: &mut R,
+    bytes: &[u8],
+    meta: &StringIdentificationMetadata,
+) -> Option<(usize, usize)> {
+    let pos = rand.below(bytes.len() as u64) as usize;
+
+    let mut candidates: Vec<_> = meta
+        .ranges()
+        .iter()
+        .filter_map(|(start, bits)| {
+            // position relative to this range; ranges starting after `pos` are rejected
+            let offset = pos.checked_sub(*start)?;
+            if offset < bits.len() && bits[offset] {
+                Some((*start, bits))
+            } else {
+                None
+            }
+        })
+        .collect();
+
+    let pick = match candidates.len() {
+        0 => return None,
+        1 => 0,
+        n => {
+            // bias towards longer strings
+            candidates.sort_by_cached_key(|(_, bits)| bits.count_ones());
+            libafl_bolts::math::integer_sqrt(rand.below((n * n) as u64)) as usize
+        }
+    };
+
+    let (start, bits) = candidates[pick];
+    Some((start, bits.len()))
+}
+
+/// Binary-searches `haystack`, a list of inclusive `(min, max)` ranges, for the range that
+/// contains `needle`.
+///
+/// The search assumes the ranges are sorted and non-overlapping. Returns the matching range,
+/// or `None` if no range contains `needle`.
+fn get_subcategory<T: Ord + Copy>(needle: T, haystack: &[(T, T)]) -> Option<(T, T)> {
+    let located = haystack.binary_search_by(|&(lo, hi)| {
+        if needle < lo {
+            // this range lies entirely above the needle
+            Ordering::Greater
+        } else if needle > hi {
+            // this range lies entirely below the needle
+            Ordering::Less
+        } else {
+            Ordering::Equal
+        }
+    });
+
+    match located {
+        Ok(pos) => Some(haystack[pos]),
+        Err(_) => None,
+    }
+}
+
+/// Grows a byte range around the character at `chars[idx]` for as long as the neighbouring
+/// characters satisfy `predicate`.
+///
+/// `chars` holds `(byte_offset, char)` pairs. The character at `idx` itself is always included
+/// and is not tested. The returned range is expressed in the byte offsets stored in `chars`.
+fn find_range<F: Fn(char) -> bool>(
+    chars: &[(usize, char)],
+    idx: usize,
+    predicate: F,
+) -> Range<usize> {
+    let (pivot_offset, pivot_char) = chars[idx];
+
+    // walk backwards and discover
+    let mut start = pivot_offset;
+    for &(offset, c) in chars[..idx].iter().rev() {
+        if !predicate(c) {
+            break;
+        }
+        start = offset;
+    }
+
+    // walk forwards; the end is one past the last byte of the last accepted char
+    let mut end = pivot_offset + pivot_char.len_utf8();
+    for &(offset, c) in &chars[(idx + 1)..] {
+        if !predicate(c) {
+            break;
+        }
+        end = offset + c.len_utf8();
+    }
+
+    start..end
+}
+
+/// Picks a random character of `string`, selects one of the unicode categories containing it,
+/// and returns the contiguous run of characters around it that belong to that category.
+///
+/// The returned range holds byte offsets relative to the start of `string`; the second element
+/// is the selected category as a list of inclusive `(min, max)` code point ranges.
+fn choose_category_range<R: Rand>(
+    rand: &mut R,
+    string: &str,
+) -> (Range<usize>, &'static [(u32, u32)]) {
+    let chars = string.char_indices().collect::<Vec<_>>();
+    let idx = rand.below(chars.len() as u64) as usize;
+    let c = chars[idx].1;
+
+    // figure out the categories for this char
+    let expanded = c as u32;
+    // `names` is only kept in test builds, for the debug output below
+    #[cfg(test)]
+    let mut names = Vec::new();
+    let mut categories = Vec::new();
+    for (_name, category) in unicode_categories::BY_NAME {
+        if get_subcategory(expanded, category).is_some() {
+            #[cfg(test)]
+            names.push(_name);
+            categories.push(category);
+        }
+    }
+
+    // ok -- we want to bias towards smaller regions to keep the mutations "tight" to original
+    // we sort the options by descending length, then pick isqrt of below(n^2)
+
+    // the sort key is the total number of code points covered by the category
+    categories.sort_by_cached_key(|cat| {
+        Reverse(
+            cat.iter()
+                .map(|&(min, max)| (max - min + 1) as usize)
+                .sum::<usize>(),
+        )
+    });
+    let options = categories.len() * categories.len();
+    let selected_idx = libafl_bolts::math::integer_sqrt(rand.below(options as u64)) as usize;
+
+    let selected = categories[selected_idx];
+
+    // NOTE(review): `names` is not reordered together with `categories`, so after the sort the
+    // printed name may not belong to the selected category -- confirm.
+    #[cfg(test)]
+    println!("category for `{c}' ({}): {}", c as u32, names[selected_idx]);
+
+    (
+        find_range(&chars, idx, |c| {
+            get_subcategory(c as u32, selected).is_some()
+        }),
+        selected,
+    )
+}
+
+/// Picks a random character of `string`, selects one of the category sub-ranges containing it,
+/// and returns the contiguous run of characters around it that fall into that sub-range.
+///
+/// The returned range holds byte offsets relative to the start of `string`; the second element
+/// is the selected sub-range as inclusive `(min, max)` code points.
+fn choose_subcategory_range<R: Rand>(rand: &mut R, string: &str) -> (Range<usize>, (u32, u32)) {
+    let chars = string.char_indices().collect::<Vec<_>>();
+    let idx = rand.below(chars.len() as u64) as usize;
+    let c = chars[idx].1;
+
+    // figure out the categories for this char
+    let expanded = c as u32;
+    // `names` is only kept in test builds, for the debug output below
+    #[cfg(test)]
+    let mut names = Vec::new();
+    let mut subcategories = Vec::new();
+    for (_name, category) in unicode_categories::BY_NAME {
+        if let Some(subcategory) = get_subcategory(expanded, category) {
+            #[cfg(test)]
+            names.push(_name);
+            subcategories.push(subcategory);
+        }
+    }
+
+    // see reasoning for selection pattern in choose_category_range
+
+    // largest sub-range first, so the isqrt-based pick below favours the smaller ones
+    subcategories.sort_by_key(|&(min, max)| Reverse(max - min + 1));
+    let options = subcategories.len() * subcategories.len();
+    let selected_idx = libafl_bolts::math::integer_sqrt(rand.below(options as u64)) as usize;
+    let selected = subcategories[selected_idx];
+
+    // NOTE(review): `names` is not reordered together with `subcategories`, so after the sort
+    // the printed name may not belong to the selected sub-range -- confirm.
+    #[cfg(test)]
+    println!(
+        "subcategory for `{c}' ({}): {} ({:?})",
+        c as u32, names[selected_idx], selected
+    );
+
+    (
+        find_range(&chars, idx, |c| {
+            let expanded = c as u32;
+            selected.0 <= expanded && expanded <= selected.1
+        }),
+        selected,
+    )
+}
+
+/// Replaces part of `range` (a byte range into `input.0`) with freshly generated characters.
+///
+/// A random sub-range inside `range` is picked via `rand_range` (called with `MAX_CHARS` as its
+/// limit) and trimmed to its longest valid UTF-8 prefix. That sub-range is replaced by
+/// characters drawn from `char_gen` until a random byte budget (below `MAX_CHARS`) is filled as
+/// far as whole characters allow. Afterwards the string metadata in `input.1` is recomputed.
+///
+/// Returns `MutationResult::Skipped` when the trimmed sub-range is empty or when the result
+/// would exceed `state.max_size()`, and `MutationResult::Mutated` otherwise.
+fn rand_replace_range<S: HasRand + HasMaxSize, F: Fn(&mut S) -> char>(
+    state: &mut S,
+    input: &mut UnicodeInput,
+    range: Range<usize>,
+    char_gen: F,
+) -> MutationResult {
+    let temp_range = rand_range(state, range.end - range.start, MAX_CHARS);
+    let range = (range.start + temp_range.start)..(range.start + temp_range.end);
+    // the random sub-range may cut a multi-byte char; keep only the valid UTF-8 prefix
+    let range = match core::str::from_utf8(&input.0.bytes()[range.clone()]) {
+        Ok(_) => range,
+        Err(e) => range.start..(range.start + e.valid_up_to()),
+    };
+
+    #[cfg(test)]
+    println!(
+        "mutating range: {:?} ({:?})",
+        range,
+        core::str::from_utf8(&input.0.bytes()[range.clone()])
+    );
+    if range.start == range.end {
+        return MutationResult::Skipped;
+    }
+
+    // byte budget for the replacement
+    let replace_len = state.rand_mut().below(MAX_CHARS as u64) as usize;
+    let orig_len = range.end - range.start;
+    if input.0.len() - orig_len + replace_len > state.max_size() {
+        return MutationResult::Skipped;
+    }
+
+    let mut replacement = Vec::with_capacity(replace_len);
+    let mut dest = [0u8; 4];
+
+    loop {
+        let new_c = char_gen(state);
+        // stop as soon as the next char would overshoot the budget
+        if replacement.len() + new_c.len_utf8() > replace_len {
+            break;
+        }
+        new_c.encode_utf8(&mut dest);
+        replacement.extend_from_slice(&dest[..new_c.len_utf8()]);
+        // `replacement` already contains `new_c` here, so its length is compared directly;
+        // adding `new_c.len_utf8()` again would stop one character short of the budget
+        if replacement.len() == replace_len {
+            break; // nailed it
+        }
+    }
+
+    input.0.bytes_mut().splice(range, replacement);
+    input.1 = extract_metadata(input.0.bytes());
+
+    MutationResult::Mutated
+}
+
+/// Unicode category data, as used by string analysis and mutators.
+///
+/// The contents are not checked in: they are included from `unicode_categories.rs` in the
+/// build's `OUT_DIR`. The mutators in this file iterate over its `BY_NAME` table.
+pub mod unicode_categories {
+    // the lints below are silenced because the included file is generated code
+    #![allow(unused)]
+    #![allow(missing_docs)]
+    #![allow(clippy::redundant_static_lifetimes)]
+
+    include!(concat!(env!("OUT_DIR"), "/unicode_categories.rs"));
+}
+
+/// Mutator which randomly replaces a randomly selected range of bytes with bytes that preserve the
+/// range's category
+#[derive(Debug, Default)]
+pub struct StringCategoryRandMutator;
+
+impl Named for StringCategoryRandMutator {
+    fn name(&self) -> &str {
+        "string-category-rand"
+    }
+}
+
+impl<S> Mutator<UnicodeInput, S> for StringCategoryRandMutator
+where
+    S: HasRand + HasMaxSize,
+{
+    /// Picks a string region of the input, then a category-contiguous run of chars inside it,
+    /// and replaces part of that run with random chars from the same category.
+    ///
+    /// Skips empty inputs and inputs for which no string region covers the chosen position.
+    /// Returns an error if the chosen region is not valid UTF-8.
+    fn mutate(
+        &mut self,
+        state: &mut S,
+        input: &mut UnicodeInput,
+        _stage_idx: i32,
+    ) -> Result<MutationResult, Error> {
+        if input.0.bytes().is_empty() {
+            return Ok(MutationResult::Skipped);
+        }
+
+        let bytes = input.0.bytes();
+        let meta = &input.1;
+        if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) {
+            let substring = core::str::from_utf8(&bytes[base..][..len])?;
+            let (range, category) = choose_category_range(state.rand_mut(), substring);
+            // `range` is relative to `substring`, which starts at `base`; rebase it so that it
+            // addresses the same bytes in the whole input
+            let range = (base + range.start)..(base + range.end);
+            #[cfg(test)]
+            println!(
+                "{:?} => {:?}",
+                range,
+                core::str::from_utf8(&bytes[range.clone()])
+            );
+
+            // total number of code points covered by the category
+            let options: u64 = category
+                .iter()
+                .map(|&(start, end)| u64::from(end) - u64::from(start) + 1)
+                .sum();
+            // draw an index into the category and walk its ranges to find the code point;
+            // if that code point is not a valid `char`, draw again
+            let char_gen = |state: &mut S| loop {
+                let mut selected = state.rand_mut().below(options);
+                for &(min, max) in category {
+                    if let Some(next_selected) =
+                        selected.checked_sub(u64::from(max) - u64::from(min) + 1)
+                    {
+                        selected = next_selected;
+                    } else if let Some(new_c) = char::from_u32(selected as u32 + min) {
+                        return new_c;
+                    } else {
+                        break;
+                    }
+                }
+            };
+
+            return Ok(rand_replace_range(state, input, range, char_gen));
+        }
+
+        Ok(MutationResult::Skipped)
+    }
+}
+
+/// Mutator which randomly replaces a randomly selected range of bytes with bytes that preserve the
+/// range's subcategory
+#[derive(Debug, Default)]
+pub struct StringSubcategoryRandMutator;
+
+impl Named for StringSubcategoryRandMutator {
+    fn name(&self) -> &str {
+        "string-subcategory-rand"
+    }
+}
+
+impl<S> Mutator<UnicodeInput, S> for StringSubcategoryRandMutator
+where
+    S: HasRand + HasMaxSize,
+{
+    /// Picks a string region of the input, then a subcategory-contiguous run of chars inside
+    /// it, and replaces part of that run with random chars from the same subcategory.
+    ///
+    /// Skips empty inputs and inputs for which no string region covers the chosen position.
+    /// Returns an error if the chosen region is not valid UTF-8.
+    fn mutate(
+        &mut self,
+        state: &mut S,
+        input: &mut UnicodeInput,
+        _stage_idx: i32,
+    ) -> Result<MutationResult, Error> {
+        if input.0.bytes().is_empty() {
+            return Ok(MutationResult::Skipped);
+        }
+
+        let bytes = input.0.bytes();
+        let meta = &input.1;
+        if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) {
+            let substring = core::str::from_utf8(&bytes[base..][..len])?;
+            let (range, subcategory) = choose_subcategory_range(state.rand_mut(), substring);
+            // `range` is relative to `substring`, which starts at `base`; rebase it so that it
+            // addresses the same bytes in the whole input
+            let range = (base + range.start)..(base + range.end);
+            #[cfg(test)]
+            println!(
+                "{:?} => {:?}",
+                range,
+                core::str::from_utf8(&bytes[range.clone()])
+            );
+
+            // number of code points in the inclusive subcategory range
+            let options: u64 = u64::from(subcategory.1) - u64::from(subcategory.0) + 1;
+            // draw code points from the subcategory until one is a valid `char`
+            let char_gen = |state: &mut S| loop {
+                let selected = state.rand_mut().below(options);
+                if let Some(new_c) = char::from_u32(selected as u32 + subcategory.0) {
+                    return new_c;
+                }
+            };
+
+            return Ok(rand_replace_range(state, input, range, char_gen));
+        }
+
+        Ok(MutationResult::Skipped)
+    }
+}
+
+/// Mutator which randomly replaces a full category-contiguous region of chars with a random token
+#[derive(Debug, Default)]
+pub struct StringCategoryTokenReplaceMutator;
+
+impl Named for StringCategoryTokenReplaceMutator {
+    fn name(&self) -> &str {
+        "string-category-token-replace"
+    }
+}
+
+impl<S> Mutator<UnicodeInput, S> for StringCategoryTokenReplaceMutator
+where
+    S: HasRand + HasMaxSize + HasMetadata,
+{
+    /// Picks a string region of the input, then a category-contiguous run of chars inside it,
+    /// and replaces that whole run with a randomly chosen token from the state's `Tokens`.
+    ///
+    /// Skips empty inputs, states without (or with empty) `Tokens` metadata, inputs for which
+    /// no string region covers the chosen position, and results exceeding `max_size`.
+    /// Returns an error if the chosen region is not valid UTF-8.
+    fn mutate(
+        &mut self,
+        state: &mut S,
+        input: &mut UnicodeInput,
+        _stage_idx: i32,
+    ) -> Result<MutationResult, Error> {
+        if input.0.bytes().is_empty() {
+            return Ok(MutationResult::Skipped);
+        }
+
+        let tokens_len = match state.metadata_map().get::<Tokens>() {
+            Some(tokens) if !tokens.tokens().is_empty() => tokens.tokens().len(),
+            _ => return Ok(MutationResult::Skipped),
+        };
+        let token_idx = state.rand_mut().below(tokens_len as u64) as usize;
+
+        let bytes = input.0.bytes();
+        let meta = &input.1;
+        if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) {
+            let substring = core::str::from_utf8(&bytes[base..][..len])?;
+            let (range, _) = choose_category_range(state.rand_mut(), substring);
+            // `range` is relative to `substring`, which starts at `base`; rebase it so that it
+            // addresses the same bytes in the whole input
+            let range = (base + range.start)..(base + range.end);
+
+            #[cfg(test)]
+            println!(
+                "{:?} => {:?}",
+                range,
+                core::str::from_utf8(&bytes[range.clone()])
+            );
+
+            // the metadata was found non-empty above; it is looked up again because the RNG
+            // needed mutable access to the state in between
+            let meta = state.metadata_map().get::<Tokens>().unwrap();
+            let token = &meta.tokens()[token_idx];
+
+            if input.0.len() - (range.end - range.start) + token.len() > state.max_size() {
+                return Ok(MutationResult::Skipped);
+            }
+
+            input.0.bytes_mut().splice(range, token.iter().copied());
+            input.1 = extract_metadata(input.0.bytes());
+            return Ok(MutationResult::Mutated);
+        }
+
+        Ok(MutationResult::Skipped)
+    }
+}
+
+/// Mutator which randomly replaces a full subcategory-contiguous region of chars with a random token
+#[derive(Debug, Default)]
+pub struct StringSubcategoryTokenReplaceMutator;
+
+impl Named for StringSubcategoryTokenReplaceMutator {
+    // NOTE(review): the category variant is named "string-category-token-replace"; this name
+    // lacks the "-token" part. Left unchanged because it is a runtime-visible string.
+    fn name(&self) -> &str {
+        "string-subcategory-replace"
+    }
+}
+
+impl<S> Mutator<UnicodeInput, S> for StringSubcategoryTokenReplaceMutator
+where
+    S: HasRand + HasMaxSize + HasMetadata,
+{
+    /// Picks a string region of the input, then a subcategory-contiguous run of chars inside
+    /// it, and replaces that whole run with a randomly chosen token from the state's `Tokens`.
+    ///
+    /// Skips empty inputs, states without (or with empty) `Tokens` metadata, inputs for which
+    /// no string region covers the chosen position, and results exceeding `max_size`.
+    /// Returns an error if the chosen region is not valid UTF-8.
+    fn mutate(
+        &mut self,
+        state: &mut S,
+        input: &mut UnicodeInput,
+        _stage_idx: i32,
+    ) -> Result<MutationResult, Error> {
+        if input.0.bytes().is_empty() {
+            return Ok(MutationResult::Skipped);
+        }
+
+        let tokens_len = match state.metadata_map().get::<Tokens>() {
+            Some(tokens) if !tokens.tokens().is_empty() => tokens.tokens().len(),
+            _ => return Ok(MutationResult::Skipped),
+        };
+        let token_idx = state.rand_mut().below(tokens_len as u64) as usize;
+
+        let bytes = input.0.bytes();
+        let meta = &input.1;
+        if let Some((base, len)) = choose_start(state.rand_mut(), bytes, meta) {
+            let substring = core::str::from_utf8(&bytes[base..][..len])?;
+            let (range, _) = choose_subcategory_range(state.rand_mut(), substring);
+            // `range` is relative to `substring`, which starts at `base`; rebase it so that it
+            // addresses the same bytes in the whole input
+            let range = (base + range.start)..(base + range.end);
+
+            #[cfg(test)]
+            println!(
+                "{:?} => {:?}",
+                range,
+                core::str::from_utf8(&bytes[range.clone()])
+            );
+
+            // the metadata was found non-empty above; it is looked up again because the RNG
+            // needed mutable access to the state in between
+            let meta = state.metadata_map().get::<Tokens>().unwrap();
+            let token = &meta.tokens()[token_idx];
+
+            if input.0.len() - (range.end - range.start) + token.len() > state.max_size() {
+                return Ok(MutationResult::Skipped);
+            }
+
+            input.0.bytes_mut().splice(range, token.iter().copied());
+            input.1 = extract_metadata(input.0.bytes());
+            return Ok(MutationResult::Mutated);
+        }
+
+        Ok(MutationResult::Skipped)
+    }
+}
+
+// Smoke tests: both repeatedly mutate an all-ASCII hex string and only check that nothing
+// panics and that the bytes remain valid UTF-8 (via the `unwrap` in the `println!`).
+#[cfg(test)]
+mod test {
+    use libafl_bolts::rands::StdRand;
+
+    use super::*;
+    use crate::{corpus::NopCorpus, stages::extract_metadata, state::StdState};
+
+    // a not-so-useful test for this
+    #[test]
+    fn mutate_hex() {
+        let result: Result<(), Error> = (|| {
+            let hex = "0123456789abcdef0123456789abcdef";
+            let mut bytes = BytesInput::from(hex.as_bytes());
+
+            let mut mutator = StringCategoryRandMutator;
+
+            // fixed seed, so the run is reproducible
+            let mut state = StdState::new(
+                StdRand::with_seed(0),
+                NopCorpus::<BytesInput>::new(),
+                NopCorpus::new(),
+                &mut (),
+                &mut (),
+            )?;
+
+            for _ in 0..(1 << 12) {
+                // the metadata is recomputed from scratch before every mutation
+                let metadata = extract_metadata(bytes.bytes());
+                let mut input = (bytes, metadata);
+                // the mutation result (including errors) is deliberately ignored
+                let _ = mutator.mutate(&mut state, &mut input, 0);
+                println!("{:?}", core::str::from_utf8(input.0.bytes()).unwrap());
+                bytes = input.0;
+            }
+
+            Ok(())
+        })();
+
+        if let Err(e) = result {
+            panic!("failed with error: {e}");
+        }
+    }
+
+    // same as `mutate_hex`, but for the subcategory-preserving mutator
+    #[test]
+    fn mutate_hex_subcat() {
+        let result: Result<(), Error> = (|| {
+            let hex = "0123456789abcdef0123456789abcdef";
+            let mut bytes = BytesInput::from(hex.as_bytes());
+
+            let mut mutator = StringSubcategoryRandMutator;
+
+            // fixed seed, so the run is reproducible
+            let mut state = StdState::new(
+                StdRand::with_seed(0),
+                NopCorpus::<BytesInput>::new(),
+                NopCorpus::new(),
+                &mut (),
+                &mut (),
+            )?;
+
+            for _ in 0..(1 << 12) {
+                // the metadata is recomputed from scratch before every mutation
+                let metadata = extract_metadata(bytes.bytes());
+                let mut input = (bytes, metadata);
+                // the mutation result (including errors) is deliberately ignored
+                let _ = mutator.mutate(&mut state, &mut input, 0);
+                println!("{:?}", core::str::from_utf8(input.0.bytes()).unwrap());
+                bytes = input.0;
+            }
+
+            Ok(())
+        })();
+
+        if let Err(e) = result {
+            panic!("failed with error: {e}");
+        }
+    }
+}
--- a/libafl/src/stages/mod.rs
+++ b/libafl/src/stages/mod.rs
@ -49,6 +49,11 @@ pub use concolic::ConcolicTracingStage;
 #[cfg(feature = "std")]
 pub use concolic::SimpleConcolicMutationalStage;

+#[cfg(feature = "unicode")]
+pub mod string;
+#[cfg(feature = "unicode")]
+pub use string::*;
+
 #[cfg(feature = "std")]
 pub mod sync;
 #[cfg(feature = "std")]
@ -56,6 +61,7 @@ pub use sync::*;

 #[cfg(feature = "std")]
 pub mod dump;
+
 use core::{convert::From, marker::PhantomData};

 #[cfg(feature = "std")]
--- a/libafl/src/stages/string.rs
+++ b/libafl/src/stages/string.rs
@ -0,0 +1,128 @@
+//! Stages which perform analysis common to Unicode-style mutations
+
+use alloc::{collections::VecDeque, rc::Rc, vec::Vec};
+use core::marker::PhantomData;
+
+use bitvec::{bitvec, vec::BitVec};
+use libafl_bolts::{impl_serdeany, Error};
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    corpus::{CorpusId, HasTestcase},
+    inputs::{BytesInput, HasBytesVec, UsesInput},
+    stages::Stage,
+    state::{HasCorpus, HasMetadata, UsesState},
+};
+
+/// Metadata which stores the list of pre-computed string-like ranges in the input
+#[derive(Debug, Default, Serialize, Deserialize, Clone)]
+pub struct StringIdentificationMetadata {
+    // Each entry is `(start, char_starts)`: `start` is the byte offset in the input at which a
+    // region of valid UTF-8 begins, and `char_starts` holds one bit per byte of that region,
+    // set at every byte which begins a character. The list is held behind an `Rc`, so cloning
+    // the metadata shares the list instead of copying it.
+    ranges: Rc<Vec<(usize, BitVec)>>,
+}
+
+impl_serdeany!(StringIdentificationMetadata);
+
+impl StringIdentificationMetadata {
+    /// Borrow the pre-computed string-like ranges which were found in the input
+    #[must_use]
+    pub fn ranges(&self) -> &Vec<(usize, BitVec)> {
+        // deref coercion takes us from `&Rc<Vec<_>>` to `&Vec<_>`
+        &self.ranges
+    }
+}
+
+/// Identify the regions of valid UTF-8 contained in `bytes`.
+///
+/// The returned metadata holds one `(start, char_starts)` pair per region: `start` is the
+/// absolute byte offset at which the region begins, and `char_starts` has one bit per byte of
+/// the region, set at each byte which begins a character. Empty input yields no ranges.
+pub(crate) fn extract_metadata(bytes: &[u8]) -> StringIdentificationMetadata {
+    let mut ranges = Vec::new();
+
+    if !bytes.is_empty() {
+        // work list of absolute byte offsets from which decoding is still to be attempted
+        let mut queue = VecDeque::new();
+        let mut visited = bitvec![0; bytes.len()];
+        queue.push_back(0);
+
+        while let Some(i) = queue.pop_front() {
+            if i >= bytes.len() || visited[i] {
+                // if we've already visited a particular entry, then we already know its range(s)
+                continue;
+            }
+            visited.set(i, true); // we always visit the current entry
+            let s = core::str::from_utf8(&bytes[i..]).unwrap_or_else(|e| {
+                queue.push_back(i + e.valid_up_to() + 1); // push to the next region
+                // cannot fail: the first `valid_up_to` bytes were just reported as valid UTF-8
+                core::str::from_utf8(&bytes[i..][..e.valid_up_to()]).unwrap()
+            });
+            if !s.is_empty() {
+                let mut entries = bitvec![0; s.len()];
+                for (c_idx, _) in s.char_indices() {
+                    entries.set(c_idx, true);
+                    visited.set(i + c_idx, true);
+                }
+                for unset in entries.iter_zeros() {
+                    // each unset index potentially represents a new UTF-8 start point; `unset`
+                    // is relative to `s`, which begins at `i`, so translate it to an absolute
+                    // offset before queueing it
+                    queue.push_back(i + unset);
+                }
+                ranges.push((i, entries));
+            }
+        }
+    }
+
+    StringIdentificationMetadata {
+        ranges: Rc::new(ranges),
+    }
+}
+
+/// Stage which identifies potential strings in the provided input
+#[derive(Debug)]
+pub struct StringIdentificationStage<S> {
+    // zero-sized marker which ties the stage to its state type `S`; no `S` value is stored
+    phantom: PhantomData<S>,
+}
+
+impl<S> Default for StringIdentificationStage<S> {
+    /// Produce a stage holding nothing but its type marker
+    fn default() -> Self {
+        Self {
+            phantom: PhantomData,
+        }
+    }
+}
+
+impl<S> StringIdentificationStage<S> {
+    /// Construct the string identification stage; the stage itself carries no data
+    #[must_use]
+    pub fn new() -> Self {
+        let phantom = PhantomData;
+        Self { phantom }
+    }
+}
+
+// The stage operates on whichever state type `S` it was instantiated with.
+impl<S: UsesInput> UsesState for StringIdentificationStage<S> {
+    type State = S;
+}
+
+impl<S, E, EM, Z> Stage<E, EM, Z> for StringIdentificationStage<S>
+where
+    S: HasTestcase<Input = BytesInput> + HasCorpus,
+    E: UsesState<State = S>,
+    EM: UsesState<State = S>,
+    Z: UsesState<State = S>,
+{
+    /// Attach [`StringIdentificationMetadata`] to the testcase at `corpus_idx`, unless the
+    /// testcase already carries it. The fuzzer, executor and manager are not used.
+    fn perform(
+        &mut self,
+        _fuzzer: &mut Z,
+        _executor: &mut E,
+        state: &mut Self::State,
+        _manager: &mut EM,
+        corpus_idx: CorpusId,
+    ) -> Result<(), Error> {
+        let mut testcase = state.testcase_mut(corpus_idx)?;
+
+        // only analyse testcases which do not hold the metadata yet
+        if !testcase.has_metadata::<StringIdentificationMetadata>() {
+            let metadata = extract_metadata(testcase.load_input(state.corpus())?.bytes());
+            testcase.add_metadata(metadata);
+        }
+
+        Ok(())
+    }
+}
--- a/libafl_bolts/src/lib.rs
+++ b/libafl_bolts/src/lib.rs
@ -169,11 +169,6 @@ use log::{Metadata, Record};
 /// out of `libafl_bolts` into `libafl::events::launcher`.
 pub mod launcher {}

-// Re-export derive(SerdeAny)
-#[cfg(feature = "libafl_derive")]
-#[allow(unused_imports)]
-#[macro_use]
-extern crate libafl_derive;
 use core::{
    array::TryFromSliceError,
    fmt::{self, Display},
@ -190,6 +185,7 @@ pub use libafl_derive::SerdeAny;
 use {
    alloc::string::{FromUtf8Error, String},
    core::cell::{BorrowError, BorrowMutError},
+    core::str::Utf8Error,
 };

 /// We need fixed names for many parts of this lib.
@ -505,6 +501,14 @@ impl From<FromUtf8Error> for Error {
    }
 }

+/// Converts a [`Utf8Error`] into an [`Error`] built with `Error::unknown`; the message embeds
+/// the debug representation of the original error.
+#[cfg(feature = "alloc")]
+impl From<Utf8Error> for Error {
+    // the same attribute appears on the neighbouring `From<VarError>` impl; `err` is used here
+    #[allow(unused_variables)]
+    fn from(err: Utf8Error) -> Self {
+        Self::unknown(format!("Could not convert byte / utf-8: {err:?}"))
+    }
+}
+
 #[cfg(feature = "std")]
 impl From<VarError> for Error {
    #[allow(unused_variables)]
--- a/libafl_bolts/src/serdeany.rs
+++ b/libafl_bolts/src/serdeany.rs
@ -85,7 +85,7 @@ macro_rules! create_serde_registry_for_trait {
                Error,
            };

-            /// Visitor object used internally for the [`SerdeAny`] registry.
+            /// Visitor object used internally for the [`crate::serdeany::SerdeAny`] registry.
            #[derive(Debug)]
            pub struct BoxDynVisitor {}
            #[allow(unused_qualifications)]
@ -319,7 +319,7 @@ macro_rules! create_serde_registry_for_trait {
                }
            }

-            /// A serializable [`HashMap`] wrapper for [`SerdeAny`] types, addressable by name.
+            /// A serializable [`HashMap`] wrapper for [`crate::serdeany::SerdeAny`] types, addressable by name.
            #[allow(clippy::unsafe_derive_deserialize)]
            #[allow(unused_qualifications)]
            #[derive(Debug, Serialize, Deserialize)]
--- a/libafl_libfuzzer/libafl_libfuzzer_runtime/Cargo.toml
+++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/Cargo.toml
@ -30,7 +30,7 @@ path = "src/lib.rs"
 crate-type = ["staticlib", "rlib"]

 [dependencies]
-libafl = { path = "../../libafl", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "regex", "errors_backtrace", "serdeany_autoreg", "tui_monitor"] }
+libafl = { path = "../../libafl", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "regex", "errors_backtrace", "serdeany_autoreg", "tui_monitor", "unicode"] }
 libafl_bolts = { path = "../../libafl_bolts", default-features = false, features = ["std", "derive", "llmp_compression", "rand_trait", "serdeany_autoreg", "errors_backtrace"] }
 libafl_targets = { path = "../../libafl_targets", features = ["sancov_8bit", "sancov_cmplog", "libfuzzer", "libfuzzer_oom", "libfuzzer_define_run_driver", "libfuzzer_interceptors", "sanitizers_flags", "whole_archive"] }

--- a/libafl_libfuzzer/libafl_libfuzzer_runtime/src/lib.rs
+++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/src/lib.rs
@ -166,7 +166,8 @@ macro_rules! fuzz_with {
            mutators::{
                GrimoireExtensionMutator, GrimoireRecursiveReplacementMutator, GrimoireRandomDeleteMutator,
                GrimoireStringReplacementMutator, havoc_crossover, havoc_mutations, havoc_mutations_no_crossover,
-                I2SRandReplace, StdScheduledMutator, Tokens, tokens_mutations
+                I2SRandReplace, StdScheduledMutator, StringCategoryRandMutator, StringSubcategoryRandMutator,
+                StringCategoryTokenReplaceMutator, StringSubcategoryTokenReplaceMutator, Tokens, tokens_mutations
            },
            observers::{stacktrace::BacktraceObserver, TimeObserver},
            schedulers::{
@ -174,7 +175,7 @@ macro_rules! fuzz_with {
            },
            stages::{
                CalibrationStage, GeneralizationStage, IfStage, StdMutationalStage,
-                StdPowerMutationalStage, TracingStage,
+                StdPowerMutationalStage, StringIdentificationStage, TracingStage,
            },
            state::{HasCorpus, StdState},
            StdFuzzer,
@ -224,7 +225,7 @@ macro_rules! fuzz_with {

            // Set up a generalization stage for grimoire
            let generalization = GeneralizationStage::new(&edges_observer);
-            let generalization = IfStage::new(|_, _, _, _, _| Ok(grimoire.into()), (generalization, ()));
+            let generalization = IfStage::new(|_, _, _, _, _| Ok(grimoire.into()), tuple_list!(generalization));

            let calibration = CalibrationStage::new(&map_feedback);

@ -296,6 +297,32 @@ macro_rules! fuzz_with {
            });
            state.metadata_map_mut().insert_boxed(grimoire_metadata);

+            // Set up a string category analysis stage for unicode mutations
+            let unicode_used = $options.unicode();
+            let string_mutator = StdScheduledMutator::new(
+                tuple_list!(
+                    StringCategoryRandMutator,
+                    StringSubcategoryRandMutator,
+                    StringSubcategoryRandMutator,
+                    StringSubcategoryRandMutator,
+                    StringSubcategoryRandMutator,
+                )
+            );
+            let string_replace_mutator = StdScheduledMutator::new(
+                tuple_list!(
+                    StringCategoryTokenReplaceMutator,
+                    StringSubcategoryTokenReplaceMutator,
+                    StringSubcategoryTokenReplaceMutator,
+                    StringSubcategoryTokenReplaceMutator,
+                    StringSubcategoryTokenReplaceMutator,
+                )
+            );
+            let string_power = StdMutationalStage::transforming(string_mutator);
+            let string_replace_power = StdMutationalStage::transforming(string_replace_mutator);
+
+            let string_analysis = StringIdentificationStage::new();
+            let string_analysis = IfStage::new(|_, _, _, _, _| Ok((unicode_used && mutator_status.std_mutational).into()), tuple_list!(string_analysis, string_power, string_replace_power));
+
            // Attempt to use tokens from libfuzzer dicts
            if !state.has_metadata::<Tokens>() {
                let mut toks = if let Some(tokens) = $options.dict() {
@ -466,6 +493,7 @@ macro_rules! fuzz_with {
                calibration,
                generalization,
                tracing,
+                string_analysis,
                i2s,
                cm_i2s,
                std_power,
--- a/libafl_libfuzzer/libafl_libfuzzer_runtime/src/options.rs
+++ b/libafl_libfuzzer/libafl_libfuzzer_runtime/src/options.rs
@ -107,6 +107,7 @@ pub struct LibfuzzerOptions {
    artifact_prefix: ArtifactPrefix,
    timeout: Duration,
    grimoire: Option<bool>,
+    unicode: bool,
    forks: Option<usize>,
    dict: Option<Tokens>,
    dirs: Vec<PathBuf>,
@ -162,6 +163,10 @@ impl LibfuzzerOptions {
        self.grimoire
    }

+    /// Whether the `unicode` option is enabled
+    pub fn unicode(&self) -> bool {
+        self.unicode
+    }
+
    pub fn forks(&self) -> Option<usize> {
        self.forks
    }
@ -230,6 +235,7 @@ struct LibfuzzerOptionsBuilder<'a> {
    artifact_prefix: Option<&'a str>,
    timeout: Option<Duration>,
    grimoire: Option<bool>,
+    unicode: Option<bool>,
    forks: Option<usize>,
    dict: Option<&'a str>,
    dirs: Vec<&'a str>,
@ -292,6 +298,7 @@ impl<'a> LibfuzzerOptionsBuilder<'a> {
                            }
                        }
                        "grimoire" => self.grimoire = Some(parse_or_bail!(name, value, u64) > 0),
+                        "unicode" => self.unicode = Some(parse_or_bail!(name, value, u64) > 0),
                        "artifact_prefix" => {
                            self.artifact_prefix = Some(value);
                        }
@ -349,6 +356,7 @@ impl<'a> LibfuzzerOptionsBuilder<'a> {
                .unwrap_or_default(),
            timeout: self.timeout.unwrap_or(Duration::from_secs(1200)),
            grimoire: self.grimoire,
+            unicode: self.unicode.unwrap_or(true),
            forks: self.forks,
            dict: self.dict.map(|path| {
                Tokens::from_file(path).expect("Couldn't load tokens from specified dictionary")