Port gramatron preprocessing to Rust (#341)

* gramatron random mut

* import String from alloc

* gramatron

* grammar preprocess scripts

* clippy

* fix construct_automata.py

* splice mutator

* fix

* clippy

* recursion mutator

* recursion mut in example

* clippy

* fix

* clippy

* grammars

* fix gramatron

* fmt
Authored by Andrea Fioraldi on 2021-10-28 10:37:31 +02:00, committed via GitHub
parent 7eb293e087
commit 2055eabede
19 changed files with 413 additions and 23 deletions


@@ -27,4 +27,5 @@ default-members = [
 exclude = [
     "fuzzers",
     "bindings",
+    "scripts",
 ]


@@ -20,4 +20,4 @@ debug = true
 [dependencies]
 libafl = { path = "../../libafl/" }
-serde_json = "1.0.68"
+postcard = "0.7"

Binary file not shown.


@@ -14,7 +14,7 @@ use libafl::{
     events::SimpleEventManager,
     executors::{inprocess::InProcessExecutor, ExitKind},
     feedbacks::{CrashFeedback, MapFeedbackState, MaxMapFeedback},
-    fuzzer::{Evaluator, Fuzzer, StdFuzzer},
+    fuzzer::{Fuzzer, StdFuzzer},
     generators::{Automaton, GramatronGenerator},
     inputs::GramatronInput,
     mutators::{
@@ -38,8 +38,10 @@ fn signals_set(idx: usize) {
 fn read_automaton_from_file<P: AsRef<Path>>(path: P) -> Automaton {
     let file = fs::File::open(path).unwrap();
-    let reader = BufReader::new(file);
-    serde_json::from_reader(reader).unwrap()
+    let mut reader = BufReader::new(file);
+    let mut buffer = Vec::new();
+    reader.read_to_end(&mut buffer).unwrap();
+    postcard::from_bytes(&buffer).unwrap()
 }

 #[allow(clippy::similar_names)]
@@ -104,9 +106,28 @@ pub fn main() {
     )
     .expect("Failed to create the Executor");

-    let mut generator =
-        GramatronGenerator::new(read_automaton_from_file(PathBuf::from("auto.json")));
+    let automaton = read_automaton_from_file(PathBuf::from("auto.postcard"));
+    let mut generator = GramatronGenerator::new(&automaton);
+
+    // Use this code to profile the generator performance
+    /*
+    use libafl::generators::Generator;
+    use std::collections::HashSet;
+    let mut set = HashSet::new();
+    let st = libafl::bolts::current_milliseconds();
+    let mut b = vec![];
+    let mut c = 0;
+    for _ in 0..100000000 {
+        let i = generator.generate(&mut state).unwrap();
+        i.unparse(&mut b);
+        set.insert(b.clone());
+        c += b.len();
+    }
+    println!("{} / {}", c, libafl::bolts::current_milliseconds() - st);
+    println!("{} / 100000000", set.len());
+    return;
+    */
+
     // Generate 8 initial inputs
     state
         .generate_initial_inputs_forced(&mut fuzzer, &mut executor, &mut generator, &mut mgr, 8)


@@ -134,7 +134,7 @@ where
                     .build()
                     .launch()?;

-                (self.run_client)(state, mgr, bind_to.id)?;
+                (self.run_client)(state, mgr, bind_to.id).expect("Client closure failed");
                 break;
             }
         };


@@ -23,8 +23,8 @@ use alloc::boxed::Box;
 use alloc::string::ToString;
 use core::{marker::PhantomData, time::Duration};

-/// Send a stats update all 3 (or more) seconds
-const STATS_TIMEOUT_DEFAULT: Duration = Duration::from_millis(3 * 1000);
+/// Send a stats update every 15 (or more) seconds
+const STATS_TIMEOUT_DEFAULT: Duration = Duration::from_secs(15);

 /// Holds a scheduler
 pub trait HasCorpusScheduler<CS, I, S>


@@ -10,14 +10,13 @@ use crate::{
     Error,
 };

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
 pub struct Trigger {
-    pub id: String,
     pub dest: usize,
     pub term: String,
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
 pub struct Automaton {
     pub final_state: usize,
     pub init_state: usize,
@@ -26,16 +25,16 @@ pub struct Automaton {

 #[derive(Clone, Debug)]
 /// Generates random inputs from a grammar automaton
-pub struct GramatronGenerator<R, S>
+pub struct GramatronGenerator<'a, R, S>
 where
     R: Rand,
     S: HasRand<R>,
 {
-    automaton: Automaton,
+    automaton: &'a Automaton,
     phantom: PhantomData<(R, S)>,
 }

-impl<R, S> Generator<GramatronInput, S> for GramatronGenerator<R, S>
+impl<'a, R, S> Generator<GramatronInput, S> for GramatronGenerator<'a, R, S>
 where
     R: Rand,
     S: HasRand<R>,
@@ -51,14 +50,14 @@ where
     }
 }

-impl<R, S> GramatronGenerator<R, S>
+impl<'a, R, S> GramatronGenerator<'a, R, S>
 where
     R: Rand,
     S: HasRand<R>,
 {
     /// Returns a new [`GramatronGenerator`]
     #[must_use]
-    pub fn new(automaton: Automaton) -> Self {
+    pub fn new(automaton: &'a Automaton) -> Self {
         Self {
             automaton,
             phantom: PhantomData,

@@ -18,7 +18,7 @@ where
     S: HasRand<R> + HasMetadata,
     R: Rand,
 {
-    generator: &'a GramatronGenerator<R, S>,
+    generator: &'a GramatronGenerator<'a, R, S>,
 }

 impl<'a, R, S> Mutator<GramatronInput, S> for GramatronRandomMutator<'a, R, S>

@@ -61,7 +61,7 @@ where
 {
     /// Creates a new [`GramatronRandomMutator`].
     #[must_use]
-    pub fn new(generator: &'a GramatronGenerator<R, S>) -> Self {
+    pub fn new(generator: &'a GramatronGenerator<'a, R, S>) -> Self {
         Self { generator }
     }
 }


@@ -0,0 +1,17 @@
# Gramatron preprocessing scripts

This folder contains the scripts needed to convert a grammar (some examples are in the `grammars/` subfolder) into a serialized Automaton.

First, convert the grammar to GNF (Greibach Normal Form) using the `gnf_converter.py` Python script.
Then feed its output to the `construct_automata` crate.
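
For reference, the GNF file consumed by `construct_automata` is a JSON object mapping each nonterminal to an array of productions, where every production is a quoted terminal optionally followed by nonterminal names (an `r` prefix marks regex terminals), and the special `Start` key names the start symbol. A minimal, purely hypothetical grammar could look like this:

```
{
  "Start": ["PROGRAM"],
  "PROGRAM": ["'puts ' EXPR"],
  "EXPR": ["'1'", "'1 + ' EXPR"]
}
```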
Here is an example using the Ruby grammar:

```
./gnf_converter.py --gf grammars/ruby_grammar.json --out ruby_gnf.json --start PROGRAM
cd construct_automata
RUSTFLAGS="-C target-cpu=native" cargo run --release -- --grammar-file ../ruby_gnf.json --output ../ruby_automaton.postcard
```

You can add the `--limit` flag to bound the stack size, as described in the Gramatron paper.
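
The serialized automaton can then be deserialized with `postcard` at fuzzer startup. Below is a minimal sketch of such a loader, mirroring the one in this commit's `gramatron` example fuzzer; the file name and the `main` scaffolding are assumptions for illustration:

```
use std::{
    fs,
    io::{BufReader, Read},
    path::Path,
};

use libafl::generators::Automaton;

/// Deserialize a postcard-encoded `Automaton` from disk.
fn read_automaton_from_file<P: AsRef<Path>>(path: P) -> Automaton {
    let file = fs::File::open(path).expect("failed to open the automaton file");
    let mut reader = BufReader::new(file);
    let mut buffer = Vec::new();
    reader
        .read_to_end(&mut buffer)
        .expect("failed to read the automaton file");
    postcard::from_bytes(&buffer).expect("failed to deserialize the automaton")
}

fn main() {
    let automaton = read_automaton_from_file("ruby_automaton.postcard");
    println!("loaded automaton with {} states", automaton.pda.len());
    // The generator only borrows the automaton, so the automaton must outlive it:
    // let mut generator = GramatronGenerator::new(&automaton);
}
```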


@@ -0,0 +1,14 @@
[package]
name = "construct_automata"
version = "0.1.0"
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde_json = "1.0"
regex = "1"
postcard = "0.7"
lazy_static = "1.4.0"
libafl = { path = "../../../libafl" }
clap = { version = "3.0.0-beta.2", features = ["yaml"] }


@@ -0,0 +1,22 @@
name: construct_automata
version: "0.1.0"
author: "Andrea Fioraldi <andreafioraldi@gmail.com>"
about: Generate a serialized Automaton using a json GNF grammar

args:
  - grammar:
      short: g
      long: grammar-file
      value_name: GRAMMAR
      required: true
      takes_value: true
  - output:
      short: o
      long: output
      value_name: OUTPUT
      required: true
      takes_value: true
  - limit:
      short: l
      long: limit
      value_name: LIMIT
      takes_value: true


@@ -0,0 +1,314 @@
use clap::{load_yaml, App};
use lazy_static::lazy_static;
use regex::Regex;
use serde_json::Value;
use std::collections::{HashMap, HashSet, VecDeque};
use std::{
    fs,
    io::{BufReader, Write},
    path::Path,
    rc::Rc,
};

use libafl::generators::gramatron::{Automaton, Trigger};

fn read_grammar_from_file<P: AsRef<Path>>(path: P) -> Value {
    let file = fs::File::open(path).unwrap();
    let reader = BufReader::new(file);
    serde_json::from_reader(reader).unwrap()
}

/// A worklist entry: a state and the stack contents still left to expand
#[derive(Debug)]
struct Element {
    pub state: usize,
    pub items: Rc<VecDeque<String>>,
}

#[derive(Default, Debug, Clone, PartialEq, Eq, Hash)]
struct Transition {
    pub source: usize,
    pub dest: usize,
    pub ss: Vec<String>,
    pub terminal: String,
    pub is_regex: bool,
    pub stack: Rc<VecDeque<String>>,
}

#[derive(Default)]
struct Stacks {
    /// Maps each state to its stack contents
    pub q: HashMap<usize, VecDeque<String>>,
    /// Maps each state to its sorted stack contents, used to detect recursive states
    pub s: HashMap<usize, Vec<String>>,
}
fn tokenize(rule: &str) -> (String, Vec<String>, bool) {
    lazy_static! {
        static ref RE: Regex = Regex::new(r"([r])*'([\s\S]+)'([\s\S]*)").unwrap();
    }
    let cap = RE.captures(rule).unwrap();
    let is_regex = cap.get(1).is_some();
    let terminal = cap.get(2).unwrap().as_str().to_owned();
    let ss = cap.get(3).map_or(vec![], |m| {
        m.as_str()
            .split_whitespace()
            .map(|x| x.to_owned())
            .collect()
    });
    if terminal == "\\n" {
        ("\n".into(), ss, is_regex)
    } else {
        (terminal, ss, is_regex)
    }
}
fn prepare_transitions(
    grammar: &Value,
    pda: &mut Vec<Transition>,
    state_stacks: &mut Stacks,
    state_count: &mut usize,
    worklist: &mut VecDeque<Element>,
    element: Element,
    stack_limit: usize,
) {
    if element.items.is_empty() {
        return; // Final state was encountered, pop from worklist without doing anything
    }

    let state = element.state;
    let nonterminal = &element.items[0];
    let rules = grammar[nonterminal].as_array().unwrap();
    // let mut i = 0;
    'rules_loop: for rule in rules {
        let rule = rule.as_str().unwrap();
        let (terminal, ss, is_regex) = tokenize(rule);
        let dest = *state_count;

        // println!("Rule \"{}\", {} over {}", &rule, i, rules.len());

        // Create a state stack for the new state
        let mut state_stack = state_stacks
            .q
            .get(&state)
            .map_or(VecDeque::new(), |x| x.clone());
        if !state_stack.is_empty() {
            state_stack.pop_front();
        }
        for symbol in ss.iter().rev() {
            state_stack.push_front(symbol.clone());
        }
        let mut state_stack_sorted: Vec<_> = state_stack.iter().cloned().collect();
        state_stack_sorted.sort();

        let mut transition = Transition {
            source: state,
            dest,
            ss,
            terminal,
            is_regex,
            stack: Rc::new(state_stack.clone()),
        };

        // Check if a recursive transition state is being created; if so, make a backward
        // edge and don't add anything to the worklist
        for (key, val) in state_stacks.s.iter() {
            if state_stack_sorted == *val {
                transition.dest = *key;
                // i += 1;
                pda.push(transition.clone());

                // If a recursive transition is exercised, don't add the same transition as a
                // new edge, continue onto the next transitions
                continue 'rules_loop;
            }
        }

        // If the generated state has a stack size > stack_limit then that state is abandoned
        // and not added to the FSA or the worklist for further expansion
        if stack_limit > 0 && transition.stack.len() > stack_limit {
            // TODO add to unexpanded_rules
            continue;
        }

        // Create transitions for the non-recursive relations and add to the worklist
        worklist.push_back(Element {
            state: dest,
            items: transition.stack.clone(),
        });
        state_stacks.q.insert(dest, state_stack);
        state_stacks.s.insert(dest, state_stack_sorted);
        pda.push(transition);
        println!("worklist size: {}", worklist.len());
        *state_count += 1;
        // i += 1;
    }
}
fn get_states(pda: &[Transition]) -> (HashSet<usize>, HashSet<usize>, HashSet<usize>) {
    let mut source = HashSet::new();
    let mut dest = HashSet::new();
    for transition in pda {
        source.insert(transition.source);
        dest.insert(transition.dest);
    }
    let all = source.union(&dest).copied().collect();
    // Returns (all states, final states with no outgoing edge, initial states with no incoming edge)
    (
        all,
        dest.difference(&source).copied().collect(),
        source.difference(&dest).copied().collect(),
    )
}
fn postprocess(pda: &[Transition], stack_limit: usize) -> Automaton {
    let mut num_transition = 0;
    let (states, finals, initial) = get_states(pda);
    assert!(initial.len() == 1);

    println!("# transitions: {}", pda.len());
    println!("# states: {}", states.len());
    println!("initial state: {:?}", &initial);
    println!("final states: {:?}", &finals);

    let mut memoized = Vec::with_capacity(states.len());
    //let mut memoized_unique = Vec::with_capacity(states.len());
    if stack_limit > 0 {
        let mut culled_pda = Vec::with_capacity(pda.len());
        let mut blocklist = HashSet::new();
        //let mut culled_pda_unique = HashSet::new();

        for final_state in &finals {
            pda.iter().for_each(|transition| {
                if transition.dest == *final_state && !transition.stack.is_empty() {
                    blocklist.insert(transition.dest);
                } else {
                    culled_pda.push(transition);
                    //culled_pda_unique.insert(transition);
                }
            });
        }

        // println!("culled_pda size: {} pda size: {}", culled_pda.len(), pda.len());

        let culled_finals: HashSet<usize> = finals.difference(&blocklist).copied().collect();
        assert!(culled_finals.len() == 1);

        culled_pda.iter().for_each(|transition| {
            if blocklist.contains(&transition.dest) {
                return;
            }
            num_transition += 1;
            let state = transition.source;
            if state >= memoized.len() {
                memoized.resize(state + 1, vec![]);
            }
            memoized[state].push(Trigger {
                dest: transition.dest,
                term: transition.terminal.clone(),
            });
            if num_transition % 4096 == 0 {
                println!(
                    "processed {} transitions over {}",
                    num_transition,
                    culled_pda.len()
                );
            }
        });

        /*
        culled_pda_unique.iter().for_each(|transition| {
            if blocklist.contains(&transition.dest) {
                return;
            }
            num_transition += 1;
            let state = transition.source;
            if state >= memoized_unique.len() {
                memoized_unique.resize(state + 1, vec![]);
            }
            memoized_unique[state].push(Trigger { dest: transition.dest, term: transition.terminal.clone() });
        });
        */

        Automaton {
            init_state: initial.iter().next().cloned().unwrap(),
            final_state: culled_finals.iter().next().cloned().unwrap(),
            pda: memoized,
        }
    } else {
        // Running FSA construction in exact approximation mode, so postprocess the PDA directly
        pda.iter().for_each(|transition| {
            num_transition += 1;
            let state = transition.source;
            if state >= memoized.len() {
                memoized.resize(state + 1, vec![]);
            }
            memoized[state].push(Trigger {
                dest: transition.dest,
                term: transition.terminal.clone(),
            });
            if num_transition % 4096 == 0 {
                println!(
                    "processed {} transitions over {}",
                    num_transition,
                    pda.len()
                );
            }
        });

        Automaton {
            init_state: initial.iter().next().cloned().unwrap(),
            final_state: finals.iter().next().cloned().unwrap(),
            pda: memoized,
        }
    }
}
fn main() {
    let yaml = load_yaml!("clap-config.yaml");
    let matches = App::from(yaml).get_matches();

    let grammar_file = matches.value_of("grammar").unwrap();
    let output_file = matches.value_of("output").unwrap();
    let stack_limit = matches.value_of_t::<usize>("limit").unwrap_or(0);

    let mut worklist = VecDeque::new();
    let mut state_count = 1;
    let mut state_stacks = Stacks::default();
    let mut pda = vec![];

    let grammar = read_grammar_from_file(grammar_file);
    let start_symbol = grammar["Start"][0].as_str().unwrap().to_owned();
    let mut start_vec = VecDeque::new();
    start_vec.push_back(start_symbol);
    worklist.push_back(Element {
        state: 0,
        items: Rc::new(start_vec),
    });

    while let Some(element) = worklist.pop_front() {
        prepare_transitions(
            &grammar,
            &mut pda,
            &mut state_stacks,
            &mut state_count,
            &mut worklist,
            element,
            stack_limit,
        );
    }

    state_stacks.q.clear();
    state_stacks.s.clear();

    let transformed = postprocess(&pda, stack_limit);
    let serialized = postcard::to_allocvec(&transformed).unwrap();
    let mut file = fs::File::create(output_file).unwrap();
    file.write_all(&serialized).unwrap();
}

scripts/gramatron/gnf_converter.py Normal file → Executable file

@@ -186,15 +186,20 @@ def remove_mixed(grammar):
         for rhs in rules:
             # tokens = rhs.split(' ')
             regen_rule = []
+            # print('---------------------')
+            # print(rhs)
             tokens = gettokens(rhs)
             if len(gettokens(rhs)) == 1:
                 new_grammar[lhs].append(rhs)
                 continue
             for token in tokens:
+                # print(token, isTerminal(token), regen_rule)
                 # Identify if there is a terminal in the RHS
                 if isTerminal(token):
                     # Check if a corresponding nonterminal already exists
-                    nonterminal = terminal_exist(token, new_grammar)
+                    # nonterminal = terminal_exist(token, new_grammar)
+                    nonterminal = None
+                    # TODO(andrea) disabled ATM, further investigation using the Ruby grammar needed
                     if nonterminal:
                         regen_rule.append(nonterminal)
                     else:

3 file diffs suppressed because one or more lines are too long