From e32b3eae9398a381a9449475adbba23213e5034b Mon Sep 17 00:00:00 2001 From: Valentin Huber Date: Sat, 9 Nov 2024 19:13:51 +0100 Subject: [PATCH] Introducing Launcher::overcommit, improving CI formatting (#2670) * introducing Launcher::overcommit * removing unnecessary cfg restrictions and clippy allows * improving warning for wrong clang-format version * installing black in the format CI * Enforcing python formatting in CI * extending formatting using black on all python files * printing diff on black failure * preferring python's black over system black * moving to LLVM 19 for formatting --- .github/workflows/build_and_test.yml | 2 + bindings/pylibafl/test.py | 6 +- fuzzers/binary_only/python_qemu/fuzzer.py | 18 +- .../dynamic_analysis/concatenator.py | 20 +- libafl/src/events/launcher.rs | 179 ++++++------- scripts/fmt_all.sh | 21 +- utils/cfg_builder/build.py | 13 +- utils/gramatron/construct_automata.py | 248 ++++++++++-------- utils/gramatron/gnf_converter.py | 99 +++---- utils/libafl_fmt/Cargo.toml | 1 + utils/libafl_fmt/src/main.rs | 37 ++- 11 files changed, 345 insertions(+), 299 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 27433d1f36..7bd69e987a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -198,6 +198,8 @@ jobs: run: rustup component add --toolchain nightly-x86_64-unknown-linux-gnu rustfmt - uses: Swatinem/rust-cache@v2 with: { shared-key: "ubuntu" } + - name: Installing black + run: python3 -m pip install black - name: Format Check run: ./scripts/fmt_all.sh check diff --git a/bindings/pylibafl/test.py b/bindings/pylibafl/test.py index 41d90c9e3e..1ad3ef8c1e 100644 --- a/bindings/pylibafl/test.py +++ b/bindings/pylibafl/test.py @@ -3,5 +3,7 @@ import ctypes import platform print("Starting to fuzz from python!") -fuzzer = sugar.InMemoryBytesCoverageSugar(input_dirs=["./in"], output_dir="out", broker_port=1337, cores=[0,1]) -fuzzer.run(lambda b: print("foo")) \ No newline at end of file +fuzzer = sugar.InMemoryBytesCoverageSugar( + input_dirs=["./in"], output_dir="out", broker_port=1337, cores=[0, 1] +) +fuzzer.run(lambda b: print("foo")) diff --git a/fuzzers/binary_only/python_qemu/fuzzer.py b/fuzzers/binary_only/python_qemu/fuzzer.py index 71fc023580..295159cad5 100644 --- a/fuzzers/binary_only/python_qemu/fuzzer.py +++ b/fuzzers/binary_only/python_qemu/fuzzer.py @@ -4,31 +4,32 @@ from pylibafl import sugar, qemu import lief MAX_SIZE = 0x100 -BINARY_PATH = './a.out' +BINARY_PATH = "./a.out" -emu = qemu.Qemu(['qemu-x86_64', BINARY_PATH], []) +emu = qemu.Qemu(["qemu-x86_64", BINARY_PATH], []) elf = lief.parse(BINARY_PATH) test_one_input = elf.get_function_address("LLVMFuzzerTestOneInput") if elf.is_pie: test_one_input += emu.load_addr() -print('LLVMFuzzerTestOneInput @ 0x%x' % test_one_input) +print("LLVMFuzzerTestOneInput @ 0x%x" % test_one_input) emu.set_breakpoint(test_one_input) emu.run() sp = emu.read_reg(qemu.regs.Rsp) -print('SP = 0x%x' % sp) +print("SP = 0x%x" % sp) -retaddr = int.from_bytes(emu.read_mem(sp, 8), 'little') -print('RET = 0x%x' % retaddr) +retaddr = int.from_bytes(emu.read_mem(sp, 8), "little") +print("RET = 0x%x" % retaddr) inp = emu.map_private(0, MAX_SIZE, qemu.mmap.ReadWrite) -assert(inp > 0) +assert inp > 0 emu.remove_breakpoint(test_one_input) emu.set_breakpoint(retaddr) + def harness(b): if len(b) > MAX_SIZE: b = b[:MAX_SIZE] @@ -39,5 +40,6 @@ def harness(b): emu.write_reg(qemu.regs.Rip, test_one_input) emu.run() -fuzz = 
sugar.QemuBytesCoverageSugar(['./in'], './out', 3456, [0,1,2,3]) + +fuzz = sugar.QemuBytesCoverageSugar(["./in"], "./out", 3456, [0, 1, 2, 3]) fuzz.run(emu, harness) diff --git a/fuzzers/inprocess/dynamic_analysis/concatenator.py b/fuzzers/inprocess/dynamic_analysis/concatenator.py index 72f09f56c6..f24d06ad1e 100755 --- a/fuzzers/inprocess/dynamic_analysis/concatenator.py +++ b/fuzzers/inprocess/dynamic_analysis/concatenator.py @@ -4,33 +4,35 @@ import os import json import sys + def concatenate_json_files(input_dir): json_files = [] for root, dirs, files in os.walk(input_dir): for file in files: - if file.endswith('.json'): + if file.endswith(".json"): json_files.append(os.path.join(root, file)) - + data = dict() for json_file in json_files: - with open(json_file, 'r') as file: + with open(json_file, "r") as file: if os.stat(json_file).st_size == 0: # skip empty file else json.load() fails continue json_data = json.load(file) print(type(json_data), file) data = data | json_data - - output_file = os.path.join(os.getcwd(), 'concatenated.json') - with open(output_file, 'w') as file: + + output_file = os.path.join(os.getcwd(), "concatenated.json") + with open(output_file, "w") as file: json.dump([data], file) - + print(f"JSON files concatenated successfully! Output file: {output_file}") -if __name__ == '__main__': + +if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: python script.py ") sys.exit(1) - + input_directory = sys.argv[1] concatenate_json_files(input_directory) diff --git a/libafl/src/events/launcher.rs b/libafl/src/events/launcher.rs index fcd75cafd2..9dfaced771 100644 --- a/libafl/src/events/launcher.rs +++ b/libafl/src/events/launcher.rs @@ -108,24 +108,27 @@ pub struct Launcher<'a, CF, MT, SP> { broker_port: u16, /// The list of cores to run on cores: &'a Cores, + /// The number of clients to spawn on each core + #[builder(default = 1)] + overcommit: usize, /// A file name to write all client output to - #[cfg(all(unix, feature = "std"))] + #[cfg(unix)] #[builder(default = None)] stdout_file: Option<&'a str>, /// The time in milliseconds to delay between child launches #[builder(default = 10)] launch_delay: u64, /// The actual, opened, `stdout_file` - so that we keep it open until the end - #[cfg(all(unix, feature = "std", feature = "fork"))] + #[cfg(all(unix, feature = "fork"))] #[builder(setter(skip), default = None)] opened_stdout_file: Option, /// A file name to write all client stderr output to. If not specified, output is sent to /// `stdout_file`. 
- #[cfg(all(unix, feature = "std"))] + #[cfg(unix)] #[builder(default = None)] stderr_file: Option<&'a str>, /// The actual, opened, `stdout_file` - so that we keep it open until the end - #[cfg(all(unix, feature = "std", feature = "fork"))] + #[cfg(all(unix, feature = "fork"))] #[builder(setter(skip), default = None)] opened_stderr_file: Option, /// The `ip:port` address of another broker to connect our new broker to for multi-machine @@ -172,17 +175,10 @@ where SP: ShMemProvider, { /// Launch the broker and the clients and fuzz - #[cfg(all(unix, feature = "std", feature = "fork"))] - pub fn launch(&mut self) -> Result<(), Error> - where - S: State + HasExecutions, - CF: FnOnce(Option, LlmpRestartingEventManager<(), S, SP>, CoreId) -> Result<(), Error>, - { - Self::launch_with_hooks(self, tuple_list!()) - } - - /// Launch the broker and the clients and fuzz - #[cfg(all(feature = "std", any(windows, not(feature = "fork"))))] + #[cfg(all( + feature = "std", + any(windows, not(feature = "fork"), all(unix, feature = "fork")) + ))] #[allow(unused_mut, clippy::match_wild_err_arm)] pub fn launch(&mut self) -> Result<(), Error> where @@ -200,9 +196,8 @@ where SP: ShMemProvider, { /// Launch the broker and the clients and fuzz with a user-supplied hook - #[cfg(all(unix, feature = "std", feature = "fork"))] - #[allow(clippy::similar_names)] - #[allow(clippy::too_many_lines)] + #[cfg(all(unix, feature = "fork"))] + #[allow(clippy::similar_names, clippy::too_many_lines)] pub fn launch_with_hooks(&mut self, hooks: EMH) -> Result<(), Error> where S: State + HasExecutions, @@ -221,8 +216,7 @@ where )); } - let core_ids = get_core_ids().unwrap(); - let num_cores = core_ids.len(); + let core_ids = get_core_ids()?; let mut handles = vec![]; log::info!("spawning on cores: {:?}", self.cores); @@ -234,66 +228,63 @@ where .stderr_file .map(|filename| File::create(filename).unwrap()); - #[cfg(feature = "std")] let debug_output = std::env::var(LIBAFL_DEBUG_OUTPUT).is_ok(); // Spawn clients let mut index = 0_u64; - for (id, bind_to) in core_ids.iter().enumerate().take(num_cores) { + for (id, bind_to) in core_ids.iter().enumerate() { if self.cores.ids.iter().any(|&x| x == id.into()) { - index += 1; - self.shmem_provider.pre_fork()?; - // # Safety - // Fork is safe in general, apart from potential side effects to the OS and other threads - match unsafe { fork() }? { - ForkResult::Parent(child) => { - self.shmem_provider.post_fork(false)?; - handles.push(child.pid); - #[cfg(feature = "std")] - log::info!("child spawned and bound to core {id}"); - } - ForkResult::Child => { - // # Safety - // A call to `getpid` is safe. - log::info!("{:?} PostFork", unsafe { libc::getpid() }); - self.shmem_provider.post_fork(true)?; + for _ in 0..self.overcommit { + index += 1; + self.shmem_provider.pre_fork()?; + // # Safety + // Fork is safe in general, apart from potential side effects to the OS and other threads + match unsafe { fork() }? { + ForkResult::Parent(child) => { + self.shmem_provider.post_fork(false)?; + handles.push(child.pid); + log::info!("child spawned and bound to core {id}"); + } + ForkResult::Child => { + // # Safety + // A call to `getpid` is safe. 
+ log::info!("{:?} PostFork", unsafe { libc::getpid() }); + self.shmem_provider.post_fork(true)?; - #[cfg(feature = "std")] - std::thread::sleep(Duration::from_millis(index * self.launch_delay)); + std::thread::sleep(Duration::from_millis(index * self.launch_delay)); - #[cfg(feature = "std")] - if !debug_output { - if let Some(file) = &self.opened_stdout_file { - dup2(file.as_raw_fd(), libc::STDOUT_FILENO)?; - if let Some(stderr) = &self.opened_stderr_file { - dup2(stderr.as_raw_fd(), libc::STDERR_FILENO)?; - } else { - dup2(file.as_raw_fd(), libc::STDERR_FILENO)?; + if !debug_output { + if let Some(file) = &self.opened_stdout_file { + dup2(file.as_raw_fd(), libc::STDOUT_FILENO)?; + if let Some(stderr) = &self.opened_stderr_file { + dup2(stderr.as_raw_fd(), libc::STDERR_FILENO)?; + } else { + dup2(file.as_raw_fd(), libc::STDERR_FILENO)?; + } } } + + // Fuzzer client. keeps retrying the connection to broker till the broker starts + let builder = RestartingMgr::::builder() + .shmem_provider(self.shmem_provider.clone()) + .broker_port(self.broker_port) + .kind(ManagerKind::Client { + cpu_core: Some(*bind_to), + }) + .configuration(self.configuration) + .serialize_state(self.serialize_state) + .hooks(hooks); + let builder = builder.time_ref(self.time_ref.clone()); + let (state, mgr) = builder.build().launch()?; + + return (self.run_client.take().unwrap())(state, mgr, *bind_to); } - - // Fuzzer client. keeps retrying the connection to broker till the broker starts - let builder = RestartingMgr::::builder() - .shmem_provider(self.shmem_provider.clone()) - .broker_port(self.broker_port) - .kind(ManagerKind::Client { - cpu_core: Some(*bind_to), - }) - .configuration(self.configuration) - .serialize_state(self.serialize_state) - .hooks(hooks); - let builder = builder.time_ref(self.time_ref.clone()); - let (state, mgr) = builder.build().launch()?; - - return (self.run_client.take().unwrap())(state, mgr, *bind_to); - } - }; + }; + } } } if self.spawn_broker { - #[cfg(feature = "std")] log::info!("I am broker!!."); // TODO we don't want always a broker here, think about using different laucher process to spawn different configurations @@ -337,7 +328,7 @@ where } /// Launch the broker and the clients and fuzz - #[cfg(all(feature = "std", any(windows, not(feature = "fork"))))] + #[cfg(any(windows, not(feature = "fork")))] #[allow(unused_mut, clippy::match_wild_err_arm, clippy::too_many_lines)] pub fn launch_with_hooks(&mut self, hooks: EMH) -> Result<(), Error> where @@ -381,7 +372,7 @@ where log::info!("spawning on cores: {:?}", self.cores); let debug_output = std::env::var("LIBAFL_DEBUG_OUTPUT").is_ok(); - #[cfg(all(feature = "std", unix))] + #[cfg(unix)] { // Set own stdout and stderr as set by the user if !debug_output { @@ -404,32 +395,34 @@ where //spawn clients for (id, _) in core_ids.iter().enumerate().take(num_cores) { if self.cores.ids.iter().any(|&x| x == id.into()) { - // Forward own stdio to child processes, if requested by user - let (mut stdout, mut stderr) = (Stdio::null(), Stdio::null()); - #[cfg(all(feature = "std", unix))] - { - if self.stdout_file.is_some() || self.stderr_file.is_some() { - stdout = Stdio::inherit(); - stderr = Stdio::inherit(); - }; + for _ in 0..self.overcommit { + // Forward own stdio to child processes, if requested by user + let (mut stdout, mut stderr) = (Stdio::null(), Stdio::null()); + #[cfg(unix)] + { + if self.stdout_file.is_some() || self.stderr_file.is_some() { + stdout = Stdio::inherit(); + stderr = Stdio::inherit(); + }; + } + + 
std::thread::sleep(Duration::from_millis( + id as u64 * self.launch_delay, + )); + + std::env::set_var(_AFL_LAUNCHER_CLIENT, id.to_string()); + let mut child = startable_self()?; + let child = (if debug_output { + &mut child + } else { + child.stdout(stdout); + child.stderr(stderr) + }) + .spawn()?; + handles.push(child); } - - #[cfg(feature = "std")] - std::thread::sleep(Duration::from_millis(id as u64 * self.launch_delay)); - - std::env::set_var(_AFL_LAUNCHER_CLIENT, id.to_string()); - let mut child = startable_self()?; - let child = (if debug_output { - &mut child - } else { - child.stdout(stdout); - child.stderr(stderr) - }) - .spawn()?; - handles.push(child); } } - handles } Err(_) => panic!("Env variables are broken, received non-unicode!"), @@ -444,7 +437,6 @@ where } if self.spawn_broker { - #[cfg(feature = "std")] log::info!("I am broker!!."); let builder = RestartingMgr::::builder() @@ -620,8 +612,7 @@ where /// Launch a Centralized-based fuzzer. /// - `main_inner_mgr_builder` will be called to build the inner manager of the main node. /// - `secondary_inner_mgr_builder` will be called to build the inner manager of the secondary nodes. - #[allow(clippy::similar_names)] - #[allow(clippy::too_many_lines)] + #[allow(clippy::similar_names, clippy::too_many_lines)] pub fn launch_generic( &mut self, main_inner_mgr_builder: EMB, diff --git a/scripts/fmt_all.sh b/scripts/fmt_all.sh index e3a9a56b4b..963f15c4ce 100755 --- a/scripts/fmt_all.sh +++ b/scripts/fmt_all.sh @@ -11,16 +11,21 @@ else cargo run --manifest-path "$LIBAFL_DIR/utils/libafl_fmt/Cargo.toml" --release -- --verbose || exit 1 fi -if command -v black > /dev/null; then - echo "[*] Formatting python files" - if ! black "$SCRIPT_DIR" - then - echo "Python format failed." - exit 1 - fi +if python3 -m black --version > /dev/null; then + BLACK_COMMAND="python3 -m black" +elif command -v black > /dev/null; then + BLACK_COMMAND="black" +fi +if [ -n "$BLACK_COMMAND" ]; then + echo "[*] Formatting python files" + if [ "$1" = "check" ]; then + $BLACK_COMMAND --check --diff "$LIBAFL_DIR" || exit 1 + else + $BLACK_COMMAND "$LIBAFL_DIR" || exit 1 + fi else - echo "Warning: python black not found. Formatting skipped for python." + echo -e "\n\033[1;33mWarning\033[0m: python black not found. 
Formatting skipped for python.\n" fi if [ "$1" != "check" ]; then diff --git a/utils/cfg_builder/build.py b/utils/cfg_builder/build.py index 070f15e30b..0d60fb6a55 100644 --- a/utils/cfg_builder/build.py +++ b/utils/cfg_builder/build.py @@ -7,7 +7,7 @@ import sys cfg = dict() -if 'CFG_OUTPUT_PATH' not in os.environ: +if "CFG_OUTPUT_PATH" not in os.environ: sys.exit("CFG_OUTPUT_PATH not set") input_path = os.environ["CFG_OUTPUT_PATH"] @@ -31,7 +31,7 @@ for mname, module in cfg.items(): fnname2SG = dict() # First, add all the intra-procedural edges - for (fname, v) in module['edges'].items(): + for fname, v in module["edges"].items(): if fname not in fname2id: GG.add_node(f_ids, label=fname) @@ -41,8 +41,7 @@ for mname, module in cfg.items(): sz = len(v) for idx in range(node_ids, node_ids + sz): G.add_node(idx) - G.nodes[idx]['label'] = mname + ' ' + \ - fname + ' ' + str(idx - node_ids) + G.nodes[idx]["label"] = mname + " " + fname + " " + str(idx - node_ids) node_id_list = list(range(node_ids, node_ids + sz)) node_ids += sz SG = G.subgraph(node_id_list) @@ -52,14 +51,14 @@ for mname, module in cfg.items(): G.add_edge(node_id_list[src], node_id_list[item]) # Next, build inter-procedural edges - for (fname, calls) in module['calls'].items(): - for (idx, target_fns) in calls.items(): + for fname, calls in module["calls"].items(): + for idx, target_fns in calls.items(): # G.nodes isn't sorted src = sorted(fnname2SG[fname].nodes())[0] + int(idx) for target_fn in target_fns: if target_fn in fnname2SG: - offset = module['entries'][target_fn] + offset = module["entries"][target_fn] dst = sorted(fnname2SG[target_fn].nodes)[0] + offset diff --git a/utils/gramatron/construct_automata.py b/utils/gramatron/construct_automata.py index 0cb87c3c75..126b3f84e8 100644 --- a/utils/gramatron/construct_automata.py +++ b/utils/gramatron/construct_automata.py @@ -8,36 +8,37 @@ import sys import json import re from collections import defaultdict + # import pygraphviz as pgv gram_data = None state_count = 1 pda = [] worklist = [] -state_stacks = {} +state_stacks = {} # === If user provides upper bound on the stack size during FSA creation === # Specifies the upper bound to which the stack is allowed to grow # If for any generated state, the stack size is >= stack_limit then this # state is not expanded further. 
-stack_limit = None +stack_limit = None # Holds the set of unexpanded rules owing to the user-passed stack constraint limit unexpanded_rules = set() + def main(grammar, limit): global worklist, gram_data, stack_limit - current = '0' + current = "0" stack_limit = limit if stack_limit: - print ('[X] Operating in bounded stack mode') + print("[X] Operating in bounded stack mode") - with open(grammar, 'r') as fd: + with open(grammar, "r") as fd: gram_data = json.load(fd) start_symbol = gram_data["Start"][0] worklist.append([current, [start_symbol]]) # print (grammar) - filename = (grammar.split('/')[-1]).split('.')[0] - + filename = (grammar.split("/")[-1]).split(".")[0] while worklist: # Take an element from the worklist @@ -45,69 +46,78 @@ def main(grammar, limit): # print ('Worklist:', worklist) element = worklist.pop(0) prep_transitions(element) - - pda_file = filename + '_transition.json' - graph_file = filename + '.png' + + pda_file = filename + "_transition.json" + graph_file = filename + ".png" # print ('XXXXXXXXXXXXXXXX') # print ('PDA file:%s Png graph file:%s' % (pda_file, graph_file)) # XXX Commented out because visualization of current version of PHP causes segfault # Create the graph and dump the transitions to a file # create_graph(filename) transformed = postprocess() - with open(filename + '_automata.json', 'w+') as fd: + with open(filename + "_automata.json", "w+") as fd: json.dump(transformed, fd) - with open(filename + '_transition.json', 'w+') as fd: + with open(filename + "_transition.json", "w+") as fd: json.dump(pda, fd) if not unexpanded_rules: - print ('[X] No unexpanded rules, absolute FSA formed') + print("[X] No unexpanded rules, absolute FSA formed") exit(0) else: - print ('[X] Certain rules were not expanded due to stack size limit. Inexact approximation has been created and the disallowed rules have been put in {}_disallowed.json'.format(filename)) - print ('[X] Number of unexpanded rules:', len(unexpanded_rules)) - with open(filename + '_disallowed.json', 'w+') as fd: + print( + "[X] Certain rules were not expanded due to stack size limit. 
Inexact approximation has been created and the disallowed rules have been put in {}_disallowed.json".format( + filename + ) + ) + print("[X] Number of unexpanded rules:", len(unexpanded_rules)) + with open(filename + "_disallowed.json", "w+") as fd: json.dump(list(unexpanded_rules), fd) + def create_graph(filename): - ''' + """ Creates a DOT representation of the PDA - ''' + """ global pda - G = pgv.AGraph(strict = False, directed = True) + G = pgv.AGraph(strict=False, directed=True) for transition in pda: - print ('Transition:', transition) - G.add_edge(transition['source'], transition['dest'], - label = 'Term:{}'.format(transition['terminal'])) - G.layout(prog = 'dot') - print ('Do it up 2') - G.draw(filename + '.png') + print("Transition:", transition) + G.add_edge( + transition["source"], + transition["dest"], + label="Term:{}".format(transition["terminal"]), + ) + G.layout(prog="dot") + print("Do it up 2") + G.draw(filename + ".png") + def prep_transitions(element): - ''' + """ Generates transitions - ''' + """ global gram_data, state_count, pda, worklist, state_stacks, stack_limit, unexpanded_rules state = element[0] try: - nonterminal = element[1][0] + nonterminal = element[1][0] except IndexError: # Final state was encountered, pop from worklist without doing anything return rules = gram_data[nonterminal] count = 1 for rule in rules: - isRecursive = False + isRecursive = False # print ('Current state:', state) terminal, ss, termIsRegex = tokenize(rule) transition = get_template() - transition['trigger'] = '_'.join([state, str(count)]) - transition['source'] = state - transition['dest'] = str(state_count) - transition['ss'] = ss - transition['terminal'] = terminal - transition['rule'] = "{} -> {}".format(nonterminal, rule ) + transition["trigger"] = "_".join([state, str(count)]) + transition["source"] = state + transition["dest"] = str(state_count) + transition["ss"] = ss + transition["terminal"] = terminal + transition["rule"] = "{} -> {}".format(nonterminal, rule) if termIsRegex: - transition['termIsRegex'] = True - + transition["termIsRegex"] = True + # Creating a state stack for the new state try: state_stack = state_stacks[state][:] @@ -118,7 +128,7 @@ def prep_transitions(element): if ss: for symbol in ss[::-1]: state_stack.insert(0, symbol) - transition['stack'] = state_stack + transition["stack"] = state_stack # Check if a recursive transition state being created, if so make a backward # edge and don't add anything to the worklist @@ -128,38 +138,39 @@ def prep_transitions(element): # print ('Stack:', sorted(stack)) # print ('State stack:', sorted(state_stack)) if sorted(stack) == sorted(state_stack): - transition['dest'] = state_element + transition["dest"] = state_element # print ('Recursive:', transition) pda.append(transition) count += 1 isRecursive = True - break + break # If a recursive transition exercised don't add the same transition as a new # edge, continue onto the next transitions if isRecursive: continue - + # If the generated state has a stack size > stack_limit then that state is abandoned # and not added to the FSA or the worklist for further expansion if stack_limit: - if (len(transition['stack']) > stack_limit): - unexpanded_rules.add(transition['rule']) + if len(transition["stack"]) > stack_limit: + unexpanded_rules.add(transition["rule"]) continue # Create transitions for the non-recursive relations and add to the worklist # print ('Normal:', transition) # print ('State2:', state) pda.append(transition) - worklist.append([transition['dest'], 
transition['stack']]) - state_stacks[transition['dest']] = state_stack + worklist.append([transition["dest"], transition["stack"]]) + state_stacks[transition["dest"]] = state_stack state_count += 1 count += 1 + def tokenize(rule): - ''' + """ Gets the terminal and the corresponding stack symbols from a rule in GNF form - ''' - pattern = re.compile("([r])*\'([\s\S]+)\'([\s\S]*)") + """ + pattern = re.compile("([r])*'([\s\S]+)'([\s\S]*)") terminal = None ss = None termIsRegex = False @@ -176,34 +187,35 @@ def tokenize(rule): return terminal, ss, termIsRegex + def get_template(): transition_template = { - 'trigger':None, - 'source': None, - 'dest': None, - 'termIsRegex': False, - 'terminal' : None, - 'stack': [] - } + "trigger": None, + "source": None, + "dest": None, + "termIsRegex": False, + "terminal": None, + "stack": [], + } return transition_template + def postprocess1(): - ''' + """ Creates a representation to be passed on to the C-module - ''' + """ global pda final_struct = {} # Supporting data structures for if stack limit is imposed culled_pda = [] culled_final = [] - num_transitions = 0 # Keep track of number of transitions - + num_transitions = 0 # Keep track of number of transitions states, final, initial = _get_states() memoized = [[]] * len(states) - print (initial) - assert len(initial) == 1, 'More than one init state found' + print(initial) + assert len(initial) == 1, "More than one init state found" # Cull transitions to states which were not expanded owing to the stack limit if stack_limit: @@ -211,63 +223,67 @@ def postprocess1(): blocklist = [] for final_state in final: for transition in pda: - if (transition["dest"] == final_state) and (len(transition["stack"]) > 0): + if (transition["dest"] == final_state) and ( + len(transition["stack"]) > 0 + ): blocklist.append(transition["dest"]) continue else: culled_pda.append(transition) - + culled_final = [state for state in final if state not in blocklist] - assert len(culled_final) == 1, 'More than one final state found' + assert len(culled_final) == 1, "More than one final state found" for transition in culled_pda: state = transition["source"] if transition["dest"] in blocklist: - continue + continue num_transitions += 1 - memoized[int(state)].append((transition["trigger"], - int(transition["dest"]), transition["terminal"])) + memoized[int(state)].append( + (transition["trigger"], int(transition["dest"]), transition["terminal"]) + ) final_struct["init_state"] = int(initial) final_struct["final_state"] = int(culled_final[0]) # The reason we do this is because when states are culled, the indexing is # still relative to the actual number of states hence we keep numstates recorded # as the original number of states - print ('[X] Actual Number of states:', len(memoized)) - print ('[X] Number of transitions:', num_transitions) - print ('[X] Original Number of states:', len(states)) + print("[X] Actual Number of states:", len(memoized)) + print("[X] Number of transitions:", num_transitions) + print("[X] Original Number of states:", len(states)) final_struct["pda"] = memoized return final_struct - + # Running FSA construction in exact approximation mode and postprocessing it like so for transition in pda: - state = transition["source"] - memoized[int(state)].append((transition["trigger"], - int(transition["dest"]), transition["terminal"])) + state = transition["source"] + memoized[int(state)].append( + (transition["trigger"], int(transition["dest"]), transition["terminal"]) + ) final_struct["init_state"] = int(initial) 
final_struct["final_state"] = int(final[0]) - print ('[X] Actual Number of states:', len(memoized)) + print("[X] Actual Number of states:", len(memoized)) final_struct["pda"] = memoized return final_struct + def postprocess(): - ''' + """ Creates a representation to be passed on to the C-module - ''' + """ global pda final_struct = {} memoized = defaultdict(list) # Supporting data structures for if stack limit is imposed culled_pda = [] culled_final = [] - num_transitions = 0 # Keep track of number of transitions - + num_transitions = 0 # Keep track of number of transitions states, final, initial = _get_states() - print (initial) - assert len(initial) == 1, 'More than one init state found' + print(initial) + assert len(initial) == 1, "More than one init state found" # Cull transitions to states which were not expanded owing to the stack limit if stack_limit: @@ -275,49 +291,51 @@ def postprocess(): blocklist = [] for final_state in final: for transition in pda: - if (transition["dest"] == final_state) and (len(transition["stack"]) > 0): + if (transition["dest"] == final_state) and ( + len(transition["stack"]) > 0 + ): blocklist.append(transition["dest"]) continue else: culled_pda.append(transition) - + culled_final = [state for state in final if state not in blocklist] - assert len(culled_final) == 1, 'More than one final state found' + assert len(culled_final) == 1, "More than one final state found" for transition in culled_pda: state = transition["source"] if transition["dest"] in blocklist: - continue + continue num_transitions += 1 - memoized[int(state)].append([transition["trigger"], int(transition["dest"]), - transition["terminal"]]) - - - + memoized[int(state)].append( + [transition["trigger"], int(transition["dest"]), transition["terminal"]] + ) + final_struct["init_state"] = int(initial) final_struct["final_state"] = int(culled_final[0]) # The reason we do this is because when states are culled, the indexing is # still relative to the actual number of states hence we keep numstates recorded # as the original number of states - print ('[X] Actual Number of states:', len(memoized.keys())) - print ('[X] Number of transitions:', num_transitions) - print ('[X] Original Number of states:', len(states)) - #final_struct["numstates"] = len(states) - memoized_list = [[]]*len(states) + print("[X] Actual Number of states:", len(memoized.keys())) + print("[X] Number of transitions:", num_transitions) + print("[X] Original Number of states:", len(states)) + # final_struct["numstates"] = len(states) + memoized_list = [[]] * len(states) else: # Running FSA construction in exact approximation mode and postprocessing it like so for transition in pda: - state = transition["source"] - memoized[int(state)].append([transition["trigger"], int(transition["dest"]), - transition["terminal"]]) + state = transition["source"] + memoized[int(state)].append( + [transition["trigger"], int(transition["dest"]), transition["terminal"]] + ) final_struct["init_state"] = int(initial) final_struct["final_state"] = int(final[0]) - print ('[X] Actual Number of states:', len(memoized.keys())) - #final_struct["numstates"] = len(memoized.keys()) - memoized_list = [[]]*len(memoized.keys()) - + print("[X] Actual Number of states:", len(memoized.keys())) + # final_struct["numstates"] = len(memoized.keys()) + memoized_list = [[]] * len(memoized.keys()) + for k in memoized.keys(): memoized_list[k] = memoized[k] final_struct["pda"] = memoized_list @@ -333,19 +351,23 @@ def _get_states(): dest.add(transition["dest"]) source_copy = 
source.copy() source_copy.update(dest) - return list(source_copy), list(dest.difference(source)), str(''.join(list(source.difference(dest)))) + return ( + list(source_copy), + list(dest.difference(source)), + str("".join(list(source.difference(dest)))), + ) -if __name__ == '__main__': + +if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description = 'Script to convert GNF grammar to PDA') + + parser = argparse.ArgumentParser(description="Script to convert GNF grammar to PDA") + parser.add_argument("--gf", type=str, help="Location of GNF grammar") parser.add_argument( - '--gf', - type = str, - help = 'Location of GNF grammar') - parser.add_argument( - '--limit', - type = int, - default = None, - help = 'Specify the upper bound for the stack size') + "--limit", + type=int, + default=None, + help="Specify the upper bound for the stack size", + ) args = parser.parse_args() main(args.gf, args.limit) diff --git a/utils/gramatron/gnf_converter.py b/utils/gramatron/gnf_converter.py index 0bc70d1555..37434a7197 100755 --- a/utils/gramatron/gnf_converter.py +++ b/utils/gramatron/gnf_converter.py @@ -16,17 +16,18 @@ DEBUG = False NONTERMINALSET = [] COUNT = 1 + def convert_to_gnf(grammar, start): if DEBUG: - with open('debug_preprocess.json', 'w+') as fd: + with open("debug_preprocess.json", "w+") as fd: json.dump(grammar, fd) - grammar = remove_unit(grammar) # eliminates unit productions + grammar = remove_unit(grammar) # eliminates unit productions if DEBUG: - with open('debug_unit.json', 'w+') as fd: + with open("debug_unit.json", "w+") as fd: json.dump(grammar, fd) - grammar = remove_mixed(grammar) # eliminate terminals existing with non-terminals + grammar = remove_mixed(grammar) # eliminate terminals existing with non-terminals if DEBUG: - with open('debug_mixed.json', 'w+') as fd: + with open("debug_mixed.json", "w+") as fd: json.dump(grammar, fd) grammar = gnf(grammar) @@ -35,12 +36,13 @@ def convert_to_gnf(grammar, start): # with open('debug_gnf_reachable.json', 'w+') as fd: # json.dump(reachable_grammar, fd) if DEBUG: - with open('debug_gnf.json', 'w+') as fd: + with open("debug_gnf.json", "w+") as fd: json.dump(grammar, fd) grammar["Start"] = [start] return grammar + def remove_left_recursion(grammar): # Remove the left recursion in the grammar rules. 
# This algorithm is adopted from @@ -69,10 +71,10 @@ def remove_left_recursion(grammar): r.append(new_rule) left_recursion = [r[1:] + [new_rule] for r in left_recursion] left_recursion.append(["' '"]) - new_grammar[lhs] = [' '.join(rule) for rule in others] - new_grammar[new_rule] = [' '.join(rule) for rule in left_recursion] + new_grammar[lhs] = [" ".join(rule) for rule in others] + new_grammar[new_rule] = [" ".join(rule) for rule in left_recursion] else: - new_grammar[lhs] = [' '.join(rule) for rule in others] + new_grammar[lhs] = [" ".join(rule) for rule in others] no_left_recursion = True for lhs, rules in old_grammar.items(): for rule in rules: @@ -88,10 +90,11 @@ def remove_left_recursion(grammar): new_grammar = defaultdict(list) return new_grammar + def get_reachable(grammar, start): - ''' + """ Returns a grammar without dead rules - ''' + """ reachable_nt = set() worklist = list() processed = set() @@ -113,9 +116,10 @@ def get_reachable(grammar, start): def gettokens(rule): - pattern = re.compile("([^\s\"\']+)|\"([^\"]*)\"|\'([^\']*)\'") + pattern = re.compile("([^\s\"']+)|\"([^\"]*)\"|'([^']*)'") return [matched.group(0) for matched in pattern.finditer(rule)] + def gnf(grammar): old_grammar = copy.deepcopy(grammar) new_grammar = defaultdict(list) @@ -129,7 +133,7 @@ def gnf(grammar): new_grammar[lhs].append(rule) continue startoken = tokens[0] - assert(startoken != lhs) + assert startoken != lhs endrule = tokens[1:] if not isTerminal(startoken): newrules = [] @@ -139,7 +143,7 @@ def gnf(grammar): temprule.insert(0, extension) newrules.append(temprule) for newnew in newrules: - new_grammar[lhs].append(' '.join(newnew)) + new_grammar[lhs].append(" ".join(newnew)) else: new_grammar[lhs].append(rule) isgnf = True @@ -163,7 +167,7 @@ def process_antlr4_grammar(data): productions = [] production = [] for line in data: - if line != '\n': + if line != "\n": production.append(line) else: productions.append(production) @@ -172,16 +176,17 @@ def process_antlr4_grammar(data): for production in productions: rules = [] init = production[0] - nonterminal = init.split(':')[0] - rules.append(strip_chars(init.split(':')[1]).strip('| ')) + nonterminal = init.split(":")[0] + rules.append(strip_chars(init.split(":")[1]).strip("| ")) for production_rule in production[1:]: - rules.append(strip_chars(production_rule.split('|')[0])) + rules.append(strip_chars(production_rule.split("|")[0])) final_rule_set[nonterminal] = rules # for line in data: # if line != '\n': # production.append(line) return final_rule_set + def remove_unit(grammar): nounitproductions = False old_grammar = copy.deepcopy(grammar) @@ -213,19 +218,21 @@ def remove_unit(grammar): new_grammar = defaultdict(list) return new_grammar + def isTerminal(rule): # pattern = re.compile("([r]*\'[\s\S]+\')") - pattern = re.compile("\'(.*?)\'") + pattern = re.compile("'(.*?)'") match = pattern.match(rule) if match: return True else: return False + def remove_mixed(grammar): - ''' + """ Remove rules where there are terminals mixed in with non-terminals - ''' + """ new_grammar = defaultdict(list) for lhs, rules in grammar.items(): for rhs in rules: @@ -248,17 +255,20 @@ def remove_mixed(grammar): regen_rule.append(new_nonterm) else: regen_rule.append(token) - new_grammar[lhs].append(' '.join(regen_rule)) + new_grammar[lhs].append(" ".join(regen_rule)) return new_grammar + def strip_chars(rule): - return rule.strip('\n\t ') + return rule.strip("\n\t ") + def get_nonterminal(): global COUNT COUNT += 1 return f"GeneratedTermVar{COUNT}" + def 
terminal_exist(token, grammar): for nonterminal, rules in grammar.items(): if token in rules and len(token) == 1: @@ -269,42 +279,37 @@ def terminal_exist(token, grammar): def main(grammar_file, out, start): grammar = None # If grammar file is a preprocessed NT file, then skip preprocessing - if '.json' in grammar_file: - with open(grammar_file, 'r') as fd: + if ".json" in grammar_file: + with open(grammar_file, "r") as fd: grammar = json.load(fd) - elif '.g4' in grammar_file: - with open(grammar_file, 'r') as fd: + elif ".g4" in grammar_file: + with open(grammar_file, "r") as fd: data = fd.readlines() grammar = process_antlr4_grammar(data) else: - raise('Unknwown file format passed. Accepts (.g4/.json)') + raise ("Unknwown file format passed. Accepts (.g4/.json)") grammar = convert_to_gnf(grammar, start) - with open(out, 'w+') as fd: + with open(out, "w+") as fd: json.dump(grammar, fd) -if __name__ == '__main__': + +if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description = 'Script to convert grammar to GNF form') + + parser = argparse.ArgumentParser( + description="Script to convert grammar to GNF form" + ) parser.add_argument( - '--gf', - type = str, - required = True, - help = 'Location of grammar file') + "--gf", type=str, required=True, help="Location of grammar file" + ) parser.add_argument( - '--out', - type = str, - required = True, - help = 'Location of output file') + "--out", type=str, required=True, help="Location of output file" + ) + parser.add_argument("--start", type=str, required=True, help="Start token") parser.add_argument( - '--start', - type = str, - required = True, - help = 'Start token') - parser.add_argument( - '--debug', - action='store_true', - help = 'Write intermediate states to debug files') + "--debug", action="store_true", help="Write intermediate states to debug files" + ) args = parser.parse_args() DEBUG = args.debug diff --git a/utils/libafl_fmt/Cargo.toml b/utils/libafl_fmt/Cargo.toml index 3b774ca07d..9f6ead251e 100644 --- a/utils/libafl_fmt/Cargo.toml +++ b/utils/libafl_fmt/Cargo.toml @@ -20,3 +20,4 @@ tokio = { version = "1.38", features = [ clap = { version = "4.5", features = ["derive"] } exitcode = "1.1" which = "6.0" +colored = "2.1.0" diff --git a/utils/libafl_fmt/src/main.rs b/utils/libafl_fmt/src/main.rs index 1b2913f06b..8617fc9708 100644 --- a/utils/libafl_fmt/src/main.rs +++ b/utils/libafl_fmt/src/main.rs @@ -78,12 +78,13 @@ use std::{ }; use clap::Parser; +use colored::Colorize; use regex::RegexSet; use tokio::{process::Command, task::JoinSet}; use walkdir::{DirEntry, WalkDir}; use which::which; -const REF_LLVM_VERSION: u32 = 18; +const REF_LLVM_VERSION: u32 = 19; fn is_workspace_toml(path: &Path) -> bool { for line in read_to_string(path).unwrap().lines() { @@ -249,20 +250,29 @@ async fn main() -> io::Result<()> { tokio_joinset.spawn(run_cargo_fmt(project, cli.check, cli.verbose)); } - let ref_clang_format = format!("clang-format-{REF_LLVM_VERSION}"); + let reference_clang_format = format!("clang-format-{REF_LLVM_VERSION}"); + let unspecified_clang_format = "clang-format"; + + let (clang, warning) = if which(&reference_clang_format).is_ok() { + (Some(reference_clang_format.as_str()), None) + } else if which(unspecified_clang_format).is_ok() { + let version = Command::new(unspecified_clang_format) + .arg("--version") + .output() + .await? + .stdout; - let (clang, warning) = if which(ref_clang_format.clone()).is_ok() { - // can't use 18 for ci. 
- (Some(ref_clang_format), None) - } else if which("clang-format").is_ok() { ( - Some("clang-format".to_string()), - Some("using clang-format, could provide a different result from clang-format-17"), + Some(unspecified_clang_format), + Some(format!( + "using {}, could provide a different result from clang-format-17", + from_utf8(&version).unwrap().replace('\n', "") + )), ) } else { ( None, - Some("clang-format not found. Skipping C formatting..."), + Some("clang-format not found. Skipping C formatting...".to_string()), ) }; // println!("Using {:#?} to format...", clang); @@ -277,7 +287,12 @@ async fn main() -> io::Result<()> { .collect(); for c_file in c_files_to_fmt { - tokio_joinset.spawn(run_clang_fmt(c_file, clang.clone(), cli.check, cli.verbose)); + tokio_joinset.spawn(run_clang_fmt( + c_file, + clang.to_string(), + cli.check, + cli.verbose, + )); } } @@ -292,7 +307,7 @@ async fn main() -> io::Result<()> { } if let Some(warning) = warning { - println!("Warning: {warning}"); + println!("\n{}: {}\n", "Warning".yellow().bold(), warning); } if cli.check {
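
Below, for reference, is a minimal sketch of how the new `overcommit` builder option is meant to be used from a fuzzer binary. It follows the usual `Launcher` setup found in the LibAFL examples; the per-client fuzzer construction inside `run_client` is elided, and the core list ("0-3"), broker port (1337), and `EventConfig` name are placeholder values rather than anything defined by this patch, so treat it as a sketch, not a complete compiling program:

use libafl::{
    events::{EventConfig, Launcher},
    monitors::MultiMonitor,
    Error,
};
use libafl_bolts::{
    core_affinity::Cores,
    shmem::{ShMemProvider, StdShMemProvider},
};

pub fn main() {
    // Shared memory provider used by the broker and the spawned clients.
    let shmem_provider = StdShMemProvider::new().expect("Failed to init shared memory");

    // Aggregates and prints the statistics reported by all clients.
    let monitor = MultiMonitor::new(|s| println!("{s}"));

    // Placeholder core list: bind clients to cores 0-3.
    let cores = Cores::from_cmdline("0-3").expect("Failed to parse core list");

    // Per-client entry point: build the fuzzer, executor, and stages in here.
    let mut run_client = |_state: Option<_>, _mgr, _core_id| {
        // ... fuzzer setup and fuzz loop elided ...
        Ok(())
    };

    match Launcher::builder()
        .shmem_provider(shmem_provider)
        .configuration(EventConfig::from_name("default"))
        .monitor(monitor)
        .run_client(&mut run_client)
        .cores(&cores)
        // New in this patch: spawn two clients on every selected core
        // instead of the default of one.
        .overcommit(2)
        .broker_port(1337)
        .build()
        .launch()
    {
        Ok(()) | Err(Error::ShuttingDown) => println!("Fuzzing stopped."),
        Err(err) => panic!("Failed to run launcher: {err:?}"),
    }
}

With the default `overcommit` of 1 the behavior is unchanged from the previous `Launcher`. The option applies to both launch paths touched by this patch, since the fork-based (Unix) and process-spawning (Windows / non-fork) client loops each iterate `0..self.overcommit` per selected core.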