Port gramatron preprocessing to Rust (#341)

* gramatron random mut

* import String from alloc

* gramatron

* grammar preprocess scripts

* clippy

* fix construct_automata.py

* splice mutator

* fix

* clippy

* recursion mutator

* recursion mut in example

* clippy

* fix

* clippy

* grammars

* fix gramatron

* fmt
Authored by Andrea Fioraldi on 2021-10-28 10:37:31 +02:00, committed via GitHub
parent 7eb293e087
commit 2055eabede
19 changed files with 413 additions and 23 deletions


@@ -27,4 +27,5 @@ default-members = [
 exclude = [
     "fuzzers",
     "bindings",
+    "scripts",
 ]


@@ -20,4 +20,4 @@ debug = true
 [dependencies]
 libafl = { path = "../../libafl/" }
-serde_json = "1.0.68"
+postcard = "0.7"

Binary file not shown.


@@ -14,7 +14,7 @@ use libafl::{
     events::SimpleEventManager,
     executors::{inprocess::InProcessExecutor, ExitKind},
     feedbacks::{CrashFeedback, MapFeedbackState, MaxMapFeedback},
-    fuzzer::{Evaluator, Fuzzer, StdFuzzer},
+    fuzzer::{Fuzzer, StdFuzzer},
     generators::{Automaton, GramatronGenerator},
     inputs::GramatronInput,
     mutators::{
@@ -38,8 +38,10 @@ fn signals_set(idx: usize) {
 fn read_automaton_from_file<P: AsRef<Path>>(path: P) -> Automaton {
     let file = fs::File::open(path).unwrap();
-    let reader = BufReader::new(file);
-    serde_json::from_reader(reader).unwrap()
+    let mut reader = BufReader::new(file);
+    let mut buffer = Vec::new();
+    reader.read_to_end(&mut buffer).unwrap();
+    postcard::from_bytes(&buffer).unwrap()
 }

 #[allow(clippy::similar_names)]
@@ -104,9 +106,28 @@ pub fn main() {
     )
     .expect("Failed to create the Executor");

-    let mut generator =
-        GramatronGenerator::new(read_automaton_from_file(PathBuf::from("auto.json")));
+    let automaton = read_automaton_from_file(PathBuf::from("auto.postcard"));
+    let mut generator = GramatronGenerator::new(&automaton);
+
+    // Use this code to profile the generator performance
+    /*
+    use libafl::generators::Generator;
+    use std::collections::HashSet;
+    let mut set = HashSet::new();
+    let st = libafl::bolts::current_milliseconds();
+    let mut b = vec![];
+    let mut c = 0;
+    for _ in 0..100000000 {
+        let i = generator.generate(&mut state).unwrap();
+        i.unparse(&mut b);
+        set.insert(b.clone());
+        c += b.len();
+    }
+    println!("{} / {}", c, libafl::bolts::current_milliseconds() - st);
+    println!("{} / 100000000", set.len());
+    return;
+    */
+
     // Generate 8 initial inputs
     state
         .generate_initial_inputs_forced(&mut fuzzer, &mut executor, &mut generator, &mut mgr, 8)


@@ -134,7 +134,7 @@ where
                     .build()
                     .launch()?;

-                (self.run_client)(state, mgr, bind_to.id)?;
+                (self.run_client)(state, mgr, bind_to.id).expect("Client closure failed");
                 break;
             }
         };


@@ -23,8 +23,8 @@ use alloc::boxed::Box;
 use alloc::string::ToString;
 use core::{marker::PhantomData, time::Duration};

-/// Send a stats update all 3 (or more) seconds
-const STATS_TIMEOUT_DEFAULT: Duration = Duration::from_millis(3 * 1000);
+/// Send a stats update every 15 (or more) seconds
+const STATS_TIMEOUT_DEFAULT: Duration = Duration::from_secs(15);

 /// Holds a scheduler
 pub trait HasCorpusScheduler<CS, I, S>


@@ -10,14 +10,13 @@ use crate::{
     Error,
 };

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
 pub struct Trigger {
-    pub id: String,
     pub dest: usize,
     pub term: String,
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
 pub struct Automaton {
     pub final_state: usize,
     pub init_state: usize,
@@ -26,16 +25,16 @@ pub struct Automaton {

 #[derive(Clone, Debug)]
 /// Generates random inputs from a grammar automaton
-pub struct GramatronGenerator<R, S>
+pub struct GramatronGenerator<'a, R, S>
 where
     R: Rand,
     S: HasRand<R>,
 {
-    automaton: Automaton,
+    automaton: &'a Automaton,
     phantom: PhantomData<(R, S)>,
 }

-impl<R, S> Generator<GramatronInput, S> for GramatronGenerator<R, S>
+impl<'a, R, S> Generator<GramatronInput, S> for GramatronGenerator<'a, R, S>
 where
     R: Rand,
     S: HasRand<R>,
@@ -51,14 +50,14 @@ where
     }
 }

-impl<R, S> GramatronGenerator<R, S>
+impl<'a, R, S> GramatronGenerator<'a, R, S>
 where
     R: Rand,
     S: HasRand<R>,
 {
     /// Returns a new [`GramatronGenerator`]
     #[must_use]
-    pub fn new(automaton: Automaton) -> Self {
+    pub fn new(automaton: &'a Automaton) -> Self {
         Self {
             automaton,
             phantom: PhantomData,

@@ -18,7 +18,7 @@ where
     S: HasRand<R> + HasMetadata,
     R: Rand,
 {
-    generator: &'a GramatronGenerator<R, S>,
+    generator: &'a GramatronGenerator<'a, R, S>,
 }

 impl<'a, R, S> Mutator<GramatronInput, S> for GramatronRandomMutator<'a, R, S>

@@ -61,7 +61,7 @@ where
 {
     /// Creates a new [`GramatronRandomMutator`].
     #[must_use]
-    pub fn new(generator: &'a GramatronGenerator<R, S>) -> Self {
+    pub fn new(generator: &'a GramatronGenerator<'a, R, S>) -> Self {
         Self { generator }
     }
 }


@@ -0,0 +1,17 @@
# Gramatron preprocessing scripts

This folder contains the scripts needed to convert a grammar (some examples are in the `grammars/` subfolder) into a serialized Automaton.

First, convert the grammar to GNF (Greibach Normal Form) using the `gnf_converter.py` Python script.
Then feed its output to the `construct_automata` crate.
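
For reference, the GNF file consumed by `construct_automata` is a JSON object mapping each nonterminal to an array of productions, where every production is a quoted terminal optionally followed by nonterminal names (an `r` prefix marks regex terminals), and the special `Start` key names the start symbol. A minimal, purely hypothetical grammar could look like this:

```
{
  "Start": ["PROGRAM"],
  "PROGRAM": ["'puts ' EXPR"],
  "EXPR": ["'1'", "'1 + ' EXPR"]
}
```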
Here is an example using the Ruby grammar:

```
./gnf_converter.py --gf grammars/ruby_grammar.json --out ruby_gnf.json --start PROGRAM
cd construct_automata
RUSTFLAGS="-C target-cpu=native" cargo run --release -- --grammar-file ../ruby_gnf.json --output ../ruby_automaton.postcard
```

You can add the `--limit` flag to bound the stack size, as described in the Gramatron paper.
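
The serialized automaton can then be deserialized with `postcard` at fuzzer startup. Below is a minimal sketch of such a loader, mirroring the one in this commit's `gramatron` example fuzzer; the file name and the `main` scaffolding are assumptions for illustration:

```
use std::{
    fs,
    io::{BufReader, Read},
    path::Path,
};

use libafl::generators::Automaton;

/// Deserialize a postcard-encoded `Automaton` from disk.
fn read_automaton_from_file<P: AsRef<Path>>(path: P) -> Automaton {
    let file = fs::File::open(path).expect("failed to open the automaton file");
    let mut reader = BufReader::new(file);
    let mut buffer = Vec::new();
    reader
        .read_to_end(&mut buffer)
        .expect("failed to read the automaton file");
    postcard::from_bytes(&buffer).expect("failed to deserialize the automaton")
}

fn main() {
    let automaton = read_automaton_from_file("ruby_automaton.postcard");
    println!("loaded automaton with {} states", automaton.pda.len());
    // The generator only borrows the automaton, so the automaton must outlive it:
    // let mut generator = GramatronGenerator::new(&automaton);
}
```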


@@ -0,0 +1,14 @@
[package]
name = "construct_automata"
version = "0.1.0"
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde_json = "1.0"
regex = "1"
postcard = "0.7"
lazy_static = "1.4.0"
libafl = { path = "../../../libafl" }
clap = { version = "3.0.0-beta.2", features = ["yaml"] }


@@ -0,0 +1,22 @@
name: construct_automata
version: "0.1.0"
author: "Andrea Fioraldi <andreafioraldi@gmail.com>"
about: Generate a serialized Automaton using a json GNF grammar

args:
  - grammar:
      short: g
      long: grammar-file
      value_name: GRAMMAR
      required: true
      takes_value: true
  - output:
      short: o
      long: output
      value_name: OUTPUT
      required: true
      takes_value: true
  - limit:
      short: l
      long: limit
      value_name: LIMIT
      takes_value: true


@@ -0,0 +1,314 @@
use clap::{load_yaml, App};
use lazy_static::lazy_static;
use regex::Regex;
use serde_json::Value;
use std::collections::{HashMap, HashSet, VecDeque};
use std::{
    fs,
    io::{BufReader, Write},
    path::Path,
    rc::Rc,
};

use libafl::generators::gramatron::{Automaton, Trigger};

fn read_grammar_from_file<P: AsRef<Path>>(path: P) -> Value {
    let file = fs::File::open(path).unwrap();
    let reader = BufReader::new(file);
    serde_json::from_reader(reader).unwrap()
}

/// A worklist entry: a state and the stack contents still left to expand
#[derive(Debug)]
struct Element {
    pub state: usize,
    pub items: Rc<VecDeque<String>>,
}

#[derive(Default, Debug, Clone, PartialEq, Eq, Hash)]
struct Transition {
    pub source: usize,
    pub dest: usize,
    pub ss: Vec<String>,
    pub terminal: String,
    pub is_regex: bool,
    pub stack: Rc<VecDeque<String>>,
}

#[derive(Default)]
struct Stacks {
    /// Maps each state to its stack contents
    pub q: HashMap<usize, VecDeque<String>>,
    /// Maps each state to its sorted stack contents, used to detect recursive states
    pub s: HashMap<usize, Vec<String>>,
}
fn tokenize(rule: &str) -> (String, Vec<String>, bool) {
    lazy_static! {
        static ref RE: Regex = Regex::new(r"([r])*'([\s\S]+)'([\s\S]*)").unwrap();
    }
    let cap = RE.captures(rule).unwrap();
    let is_regex = cap.get(1).is_some();
    let terminal = cap.get(2).unwrap().as_str().to_owned();
    let ss = cap.get(3).map_or(vec![], |m| {
        m.as_str()
            .split_whitespace()
            .map(|x| x.to_owned())
            .collect()
    });
    if terminal == "\\n" {
        ("\n".into(), ss, is_regex)
    } else {
        (terminal, ss, is_regex)
    }
}
fn prepare_transitions(
    grammar: &Value,
    pda: &mut Vec<Transition>,
    state_stacks: &mut Stacks,
    state_count: &mut usize,
    worklist: &mut VecDeque<Element>,
    element: Element,
    stack_limit: usize,
) {
    if element.items.is_empty() {
        return; // Final state was encountered, pop from worklist without doing anything
    }

    let state = element.state;
    let nonterminal = &element.items[0];
    let rules = grammar[nonterminal].as_array().unwrap();
    // let mut i = 0;
    'rules_loop: for rule in rules {
        let rule = rule.as_str().unwrap();
        let (terminal, ss, is_regex) = tokenize(rule);
        let dest = *state_count;

        // println!("Rule \"{}\", {} over {}", &rule, i, rules.len());

        // Create a state stack for the new state
        let mut state_stack = state_stacks
            .q
            .get(&state)
            .map_or(VecDeque::new(), |x| x.clone());
        if !state_stack.is_empty() {
            state_stack.pop_front();
        }
        for symbol in ss.iter().rev() {
            state_stack.push_front(symbol.clone());
        }
        let mut state_stack_sorted: Vec<_> = state_stack.iter().cloned().collect();
        state_stack_sorted.sort();

        let mut transition = Transition {
            source: state,
            dest,
            ss,
            terminal,
            is_regex,
            stack: Rc::new(state_stack.clone()),
        };

        // Check if a recursive transition state is being created; if so, make a backward
        // edge and don't add anything to the worklist
        for (key, val) in state_stacks.s.iter() {
            if state_stack_sorted == *val {
                transition.dest = *key;
                // i += 1;
                pda.push(transition.clone());

                // If a recursive transition is exercised, don't add the same transition as a
                // new edge, continue onto the next transitions
                continue 'rules_loop;
            }
        }

        // If the generated state has a stack size > stack_limit then that state is abandoned
        // and not added to the FSA or the worklist for further expansion
        if stack_limit > 0 && transition.stack.len() > stack_limit {
            // TODO add to unexpanded_rules
            continue;
        }

        // Create transitions for the non-recursive relations and add to the worklist
        worklist.push_back(Element {
            state: dest,
            items: transition.stack.clone(),
        });
        state_stacks.q.insert(dest, state_stack);
        state_stacks.s.insert(dest, state_stack_sorted);
        pda.push(transition);
        println!("worklist size: {}", worklist.len());
        *state_count += 1;
        // i += 1;
    }
}
fn get_states(pda: &[Transition]) -> (HashSet<usize>, HashSet<usize>, HashSet<usize>) {
    let mut source = HashSet::new();
    let mut dest = HashSet::new();
    for transition in pda {
        source.insert(transition.source);
        dest.insert(transition.dest);
    }
    let all = source.union(&dest).copied().collect();
    // Returns (all states, final states with no outgoing edge, initial states with no incoming edge)
    (
        all,
        dest.difference(&source).copied().collect(),
        source.difference(&dest).copied().collect(),
    )
}
fn postprocess(pda: &[Transition], stack_limit: usize) -> Automaton {
    let mut num_transition = 0;
    let (states, finals, initial) = get_states(pda);
    assert!(initial.len() == 1);

    println!("# transitions: {}", pda.len());
    println!("# states: {}", states.len());
    println!("initial state: {:?}", &initial);
    println!("final states: {:?}", &finals);

    let mut memoized = Vec::with_capacity(states.len());
    //let mut memoized_unique = Vec::with_capacity(states.len());
    if stack_limit > 0 {
        let mut culled_pda = Vec::with_capacity(pda.len());
        let mut blocklist = HashSet::new();
        //let mut culled_pda_unique = HashSet::new();

        for final_state in &finals {
            pda.iter().for_each(|transition| {
                if transition.dest == *final_state && !transition.stack.is_empty() {
                    blocklist.insert(transition.dest);
                } else {
                    culled_pda.push(transition);
                    //culled_pda_unique.insert(transition);
                }
            });
        }

        // println!("culled_pda size: {} pda size: {}", culled_pda.len(), pda.len());

        let culled_finals: HashSet<usize> = finals.difference(&blocklist).copied().collect();
        assert!(culled_finals.len() == 1);

        culled_pda.iter().for_each(|transition| {
            if blocklist.contains(&transition.dest) {
                return;
            }
            num_transition += 1;
            let state = transition.source;
            if state >= memoized.len() {
                memoized.resize(state + 1, vec![]);
            }
            memoized[state].push(Trigger {
                dest: transition.dest,
                term: transition.terminal.clone(),
            });
            if num_transition % 4096 == 0 {
                println!(
                    "processed {} transitions over {}",
                    num_transition,
                    culled_pda.len()
                );
            }
        });

        /*
        culled_pda_unique.iter().for_each(|transition| {
            if blocklist.contains(&transition.dest) {
                return;
            }
            num_transition += 1;
            let state = transition.source;
            if state >= memoized_unique.len() {
                memoized_unique.resize(state + 1, vec![]);
            }
            memoized_unique[state].push(Trigger { dest: transition.dest, term: transition.terminal.clone() });
        });
        */

        Automaton {
            init_state: initial.iter().next().cloned().unwrap(),
            final_state: culled_finals.iter().next().cloned().unwrap(),
            pda: memoized,
        }
    } else {
        // Running FSA construction in exact approximation mode, so postprocess the PDA directly
        pda.iter().for_each(|transition| {
            num_transition += 1;
            let state = transition.source;
            if state >= memoized.len() {
                memoized.resize(state + 1, vec![]);
            }
            memoized[state].push(Trigger {
                dest: transition.dest,
                term: transition.terminal.clone(),
            });
            if num_transition % 4096 == 0 {
                println!(
                    "processed {} transitions over {}",
                    num_transition,
                    pda.len()
                );
            }
        });

        Automaton {
            init_state: initial.iter().next().cloned().unwrap(),
            final_state: finals.iter().next().cloned().unwrap(),
            pda: memoized,
        }
    }
}
fn main() {
    let yaml = load_yaml!("clap-config.yaml");
    let matches = App::from(yaml).get_matches();

    let grammar_file = matches.value_of("grammar").unwrap();
    let output_file = matches.value_of("output").unwrap();
    let stack_limit = matches.value_of_t::<usize>("limit").unwrap_or(0);

    let mut worklist = VecDeque::new();
    let mut state_count = 1;
    let mut state_stacks = Stacks::default();
    let mut pda = vec![];

    let grammar = read_grammar_from_file(grammar_file);
    let start_symbol = grammar["Start"][0].as_str().unwrap().to_owned();
    let mut start_vec = VecDeque::new();
    start_vec.push_back(start_symbol);
    worklist.push_back(Element {
        state: 0,
        items: Rc::new(start_vec),
    });

    while let Some(element) = worklist.pop_front() {
        prepare_transitions(
            &grammar,
            &mut pda,
            &mut state_stacks,
            &mut state_count,
            &mut worklist,
            element,
            stack_limit,
        );
    }

    state_stacks.q.clear();
    state_stacks.s.clear();

    let transformed = postprocess(&pda, stack_limit);
    let serialized = postcard::to_allocvec(&transformed).unwrap();
    let mut file = fs::File::create(output_file).unwrap();
    file.write_all(&serialized).unwrap();
}

scripts/gramatron/gnf_converter.py Normal file → Executable file

@@ -186,15 +186,20 @@ def remove_mixed(grammar):
         for rhs in rules:
             # tokens = rhs.split(' ')
             regen_rule = []
+            # print('---------------------')
+            # print(rhs)
             tokens = gettokens(rhs)
             if len(gettokens(rhs)) == 1:
                 new_grammar[lhs].append(rhs)
                 continue
             for token in tokens:
+                # print(token, isTerminal(token), regen_rule)
                 # Identify if there is a terminal in the RHS
                 if isTerminal(token):
                     # Check if a corresponding nonterminal already exists
-                    nonterminal = terminal_exist(token, new_grammar)
+                    # nonterminal = terminal_exist(token, new_grammar)
+                    nonterminal = None
+                    # TODO(andrea) disabled ATM, further investigation using the Ruby grammar needed
                     if nonterminal:
                         regen_rule.append(nonterminal)
                     else:

3 file diffs suppressed because one or more lines are too long