Introducing Launcher::overcommit, improving CI formatting (#2670)
* Introducing Launcher::overcommit
* Removing unnecessary cfg restrictions and clippy allows
* Improving the warning for a wrong clang-format version
* Installing black in the format CI
* Enforcing Python formatting in CI
* Extending formatting with black to all Python files
* Printing the diff on black failure
* Preferring Python's black over the system black
* Moving to LLVM 19 for formatting
This commit is contained in: parent 8617fa6603, commit e32b3eae93

.github/workflows/build_and_test.yml (vendored, 2 lines added)
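The headline change is the new `overcommit` option on `Launcher`: instead of exactly one client per entry in `cores`, the launcher now spawns `overcommit` clients on each selected core (default 1, so existing setups are unchanged). A minimal usage sketch; the `.overcommit(..)` setter follows from the `#[builder(default = 1)]` field added in the launcher.rs diff below, while the surrounding monitor/client wiring follows common LibAFL examples and may differ between versions:

    // Sketch only: everything except `.overcommit(2)` is illustrative setup.
    let cores = Cores::from_cmdline("0-3")?;
    Launcher::builder()
        .shmem_provider(StdShMemProvider::new()?)
        .configuration(EventConfig::from_name("default"))
        .monitor(monitor)
        .run_client(&mut run_client)
        .cores(&cores)
        .broker_port(1337)
        .overcommit(2) // new: spawn 2 clients per core instead of 1
        .build()
        .launch()?;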
@@ -198,6 +198,8 @@ jobs:
       run: rustup component add --toolchain nightly-x86_64-unknown-linux-gnu rustfmt
     - uses: Swatinem/rust-cache@v2
       with: { shared-key: "ubuntu" }
+    - name: Installing black
+      run: python3 -m pip install black
     - name: Format Check
       run: ./scripts/fmt_all.sh check
@@ -3,5 +3,7 @@ import ctypes
 import platform
 
 print("Starting to fuzz from python!")
-fuzzer = sugar.InMemoryBytesCoverageSugar(input_dirs=["./in"], output_dir="out", broker_port=1337, cores=[0,1])
+fuzzer = sugar.InMemoryBytesCoverageSugar(
+    input_dirs=["./in"], output_dir="out", broker_port=1337, cores=[0, 1]
+)
 fuzzer.run(lambda b: print("foo"))
@@ -4,31 +4,32 @@ from pylibafl import sugar, qemu
 import lief
 
 MAX_SIZE = 0x100
-BINARY_PATH = './a.out'
+BINARY_PATH = "./a.out"
 
-emu = qemu.Qemu(['qemu-x86_64', BINARY_PATH], [])
+emu = qemu.Qemu(["qemu-x86_64", BINARY_PATH], [])
 
 elf = lief.parse(BINARY_PATH)
 test_one_input = elf.get_function_address("LLVMFuzzerTestOneInput")
 if elf.is_pie:
     test_one_input += emu.load_addr()
-print('LLVMFuzzerTestOneInput @ 0x%x' % test_one_input)
+print("LLVMFuzzerTestOneInput @ 0x%x" % test_one_input)
 
 emu.set_breakpoint(test_one_input)
 emu.run()
 
 sp = emu.read_reg(qemu.regs.Rsp)
-print('SP = 0x%x' % sp)
+print("SP = 0x%x" % sp)
 
-retaddr = int.from_bytes(emu.read_mem(sp, 8), 'little')
-print('RET = 0x%x' % retaddr)
+retaddr = int.from_bytes(emu.read_mem(sp, 8), "little")
+print("RET = 0x%x" % retaddr)
 
 inp = emu.map_private(0, MAX_SIZE, qemu.mmap.ReadWrite)
-assert(inp > 0)
+assert inp > 0
 
 emu.remove_breakpoint(test_one_input)
 emu.set_breakpoint(retaddr)
 
+
 def harness(b):
     if len(b) > MAX_SIZE:
         b = b[:MAX_SIZE]
@@ -39,5 +40,6 @@ def harness(b):
     emu.write_reg(qemu.regs.Rip, test_one_input)
     emu.run()
 
-fuzz = sugar.QemuBytesCoverageSugar(['./in'], './out', 3456, [0,1,2,3])
+
+fuzz = sugar.QemuBytesCoverageSugar(["./in"], "./out", 3456, [0, 1, 2, 3])
 fuzz.run(emu, harness)
@@ -4,16 +4,17 @@ import os
 import json
 import sys
 
+
 def concatenate_json_files(input_dir):
     json_files = []
     for root, dirs, files in os.walk(input_dir):
         for file in files:
-            if file.endswith('.json'):
+            if file.endswith(".json"):
                 json_files.append(os.path.join(root, file))
 
     data = dict()
     for json_file in json_files:
-        with open(json_file, 'r') as file:
+        with open(json_file, "r") as file:
             if os.stat(json_file).st_size == 0:
                 # skip empty file else json.load() fails
                 continue
@@ -21,13 +22,14 @@ def concatenate_json_files(input_dir):
             print(type(json_data), file)
             data = data | json_data
 
-    output_file = os.path.join(os.getcwd(), 'concatenated.json')
-    with open(output_file, 'w') as file:
+    output_file = os.path.join(os.getcwd(), "concatenated.json")
+    with open(output_file, "w") as file:
         json.dump([data], file)
 
     print(f"JSON files concatenated successfully! Output file: {output_file}")
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     if len(sys.argv) != 2:
         print("Usage: python script.py <directory_path>")
         sys.exit(1)
libafl/src/events/launcher.rs

@@ -108,24 +108,27 @@ pub struct Launcher<'a, CF, MT, SP> {
     broker_port: u16,
     /// The list of cores to run on
     cores: &'a Cores,
+    /// The number of clients to spawn on each core
+    #[builder(default = 1)]
+    overcommit: usize,
     /// A file name to write all client output to
-    #[cfg(all(unix, feature = "std"))]
+    #[cfg(unix)]
    #[builder(default = None)]
     stdout_file: Option<&'a str>,
     /// The time in milliseconds to delay between child launches
     #[builder(default = 10)]
     launch_delay: u64,
     /// The actual, opened, `stdout_file` - so that we keep it open until the end
-    #[cfg(all(unix, feature = "std", feature = "fork"))]
+    #[cfg(all(unix, feature = "fork"))]
     #[builder(setter(skip), default = None)]
     opened_stdout_file: Option<File>,
     /// A file name to write all client stderr output to. If not specified, output is sent to
     /// `stdout_file`.
-    #[cfg(all(unix, feature = "std"))]
+    #[cfg(unix)]
     #[builder(default = None)]
     stderr_file: Option<&'a str>,
     /// The actual, opened, `stdout_file` - so that we keep it open until the end
-    #[cfg(all(unix, feature = "std", feature = "fork"))]
+    #[cfg(all(unix, feature = "fork"))]
     #[builder(setter(skip), default = None)]
     opened_stderr_file: Option<File>,
     /// The `ip:port` address of another broker to connect our new broker to for multi-machine
@@ -172,17 +175,10 @@ where
     SP: ShMemProvider,
 {
-    /// Launch the broker and the clients and fuzz
-    #[cfg(all(unix, feature = "std", feature = "fork"))]
-    pub fn launch<S>(&mut self) -> Result<(), Error>
-    where
-        S: State + HasExecutions,
-        CF: FnOnce(Option<S>, LlmpRestartingEventManager<(), S, SP>, CoreId) -> Result<(), Error>,
-    {
-        Self::launch_with_hooks(self, tuple_list!())
-    }
-
     /// Launch the broker and the clients and fuzz
-    #[cfg(all(feature = "std", any(windows, not(feature = "fork"))))]
+    #[cfg(all(
+        feature = "std",
+        any(windows, not(feature = "fork"), all(unix, feature = "fork"))
+    ))]
     #[allow(unused_mut, clippy::match_wild_err_arm)]
     pub fn launch<S>(&mut self) -> Result<(), Error>
     where
@@ -200,9 +196,8 @@ where
     SP: ShMemProvider,
 {
     /// Launch the broker and the clients and fuzz with a user-supplied hook
-    #[cfg(all(unix, feature = "std", feature = "fork"))]
-    #[allow(clippy::similar_names)]
-    #[allow(clippy::too_many_lines)]
+    #[cfg(all(unix, feature = "fork"))]
+    #[allow(clippy::similar_names, clippy::too_many_lines)]
     pub fn launch_with_hooks<EMH, S>(&mut self, hooks: EMH) -> Result<(), Error>
     where
         S: State + HasExecutions,
@@ -221,8 +216,7 @@ where
             ));
         }
 
-        let core_ids = get_core_ids().unwrap();
-        let num_cores = core_ids.len();
+        let core_ids = get_core_ids()?;
         let mut handles = vec![];
 
         log::info!("spawning on cores: {:?}", self.cores);
@@ -234,13 +228,13 @@ where
             .stderr_file
             .map(|filename| File::create(filename).unwrap());
 
-        #[cfg(feature = "std")]
         let debug_output = std::env::var(LIBAFL_DEBUG_OUTPUT).is_ok();
 
         // Spawn clients
         let mut index = 0_u64;
-        for (id, bind_to) in core_ids.iter().enumerate().take(num_cores) {
+        for (id, bind_to) in core_ids.iter().enumerate() {
             if self.cores.ids.iter().any(|&x| x == id.into()) {
+                for _ in 0..self.overcommit {
                 index += 1;
                 self.shmem_provider.pre_fork()?;
                 // # Safety
@@ -249,7 +243,6 @@ where
                 ForkResult::Parent(child) => {
                     self.shmem_provider.post_fork(false)?;
                     handles.push(child.pid);
-                    #[cfg(feature = "std")]
                     log::info!("child spawned and bound to core {id}");
                 }
                 ForkResult::Child => {
@@ -258,10 +251,8 @@ where
                     log::info!("{:?} PostFork", unsafe { libc::getpid() });
                     self.shmem_provider.post_fork(true)?;
 
-                    #[cfg(feature = "std")]
                     std::thread::sleep(Duration::from_millis(index * self.launch_delay));
 
-                    #[cfg(feature = "std")]
                     if !debug_output {
                         if let Some(file) = &self.opened_stdout_file {
                             dup2(file.as_raw_fd(), libc::STDOUT_FILENO)?;
@@ -291,9 +282,9 @@ where
                 };
             }
+            }
         }
 
         if self.spawn_broker {
-            #[cfg(feature = "std")]
             log::info!("I am broker!!.");
 
             // TODO we don't want always a broker here, think about using different laucher process to spawn different configurations
@@ -337,7 +328,7 @@ where
     }
 
     /// Launch the broker and the clients and fuzz
-    #[cfg(all(feature = "std", any(windows, not(feature = "fork"))))]
+    #[cfg(any(windows, not(feature = "fork")))]
     #[allow(unused_mut, clippy::match_wild_err_arm, clippy::too_many_lines)]
     pub fn launch_with_hooks<EMH, S>(&mut self, hooks: EMH) -> Result<(), Error>
     where
@@ -381,7 +372,7 @@ where
         log::info!("spawning on cores: {:?}", self.cores);
 
         let debug_output = std::env::var("LIBAFL_DEBUG_OUTPUT").is_ok();
-        #[cfg(all(feature = "std", unix))]
+        #[cfg(unix)]
         {
             // Set own stdout and stderr as set by the user
             if !debug_output {
@@ -404,9 +395,10 @@ where
                 //spawn clients
                 for (id, _) in core_ids.iter().enumerate().take(num_cores) {
                     if self.cores.ids.iter().any(|&x| x == id.into()) {
+                        for _ in 0..self.overcommit {
                         // Forward own stdio to child processes, if requested by user
                         let (mut stdout, mut stderr) = (Stdio::null(), Stdio::null());
-                        #[cfg(all(feature = "std", unix))]
+                        #[cfg(unix)]
                         {
                             if self.stdout_file.is_some() || self.stderr_file.is_some() {
                                 stdout = Stdio::inherit();
@@ -414,8 +406,9 @@ where
                             };
                         }
 
-                        #[cfg(feature = "std")]
-                        std::thread::sleep(Duration::from_millis(id as u64 * self.launch_delay));
+                        std::thread::sleep(Duration::from_millis(
+                            id as u64 * self.launch_delay,
+                        ));
 
                         std::env::set_var(_AFL_LAUNCHER_CLIENT, id.to_string());
                         let mut child = startable_self()?;
@@ -429,7 +422,7 @@ where
                         handles.push(child);
                     }
                 }
-
+                }
                 handles
             }
             Err(_) => panic!("Env variables are broken, received non-unicode!"),
@@ -444,7 +437,6 @@ where
         }
 
         if self.spawn_broker {
-            #[cfg(feature = "std")]
             log::info!("I am broker!!.");
 
             let builder = RestartingMgr::<EMH, MT, S, SP>::builder()
@@ -620,8 +612,7 @@ where
     /// Launch a Centralized-based fuzzer.
     /// - `main_inner_mgr_builder` will be called to build the inner manager of the main node.
     /// - `secondary_inner_mgr_builder` will be called to build the inner manager of the secondary nodes.
-    #[allow(clippy::similar_names)]
-    #[allow(clippy::too_many_lines)]
+    #[allow(clippy::similar_names, clippy::too_many_lines)]
     pub fn launch_generic<EM, EMB, S>(
         &mut self,
         main_inner_mgr_builder: EMB,
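Taken together, the launcher hunks above replace the flat per-core spawn with a nested loop over `self.overcommit`, on both the Unix fork path and the respawn path used on Windows or without the `fork` feature. A standalone sketch of the resulting spawn shape (a simplification; the real code forks or re-executes itself and binds each child to its core):

    /// Sketch: expand a core selection into one entry per client to spawn.
    /// With overcommit > 1, several clients share the same core id.
    fn spawn_plan(num_cores: usize, selected: &[usize], overcommit: usize) -> Vec<usize> {
        let mut plan = Vec::new();
        for id in 0..num_cores {
            if selected.contains(&id) {
                for _ in 0..overcommit {
                    plan.push(id);
                }
            }
        }
        plan
    }

    // spawn_plan(4, &[0, 1], 2) == vec![0, 0, 1, 1]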
scripts/fmt_all.sh

@@ -11,16 +11,21 @@ else
     cargo run --manifest-path "$LIBAFL_DIR/utils/libafl_fmt/Cargo.toml" --release -- --verbose || exit 1
 fi
 
-if command -v black > /dev/null; then
-    echo "[*] Formatting python files"
-    if ! black "$SCRIPT_DIR"
-    then
-        echo "Python format failed."
-        exit 1
+if python3 -m black --version > /dev/null; then
+    BLACK_COMMAND="python3 -m black"
+elif command -v black > /dev/null; then
+    BLACK_COMMAND="black"
+fi
+
+if [ -n "$BLACK_COMMAND" ]; then
+    echo "[*] Formatting python files"
+    if [ "$1" = "check" ]; then
+        $BLACK_COMMAND --check --diff "$LIBAFL_DIR" || exit 1
+    else
+        $BLACK_COMMAND "$LIBAFL_DIR" || exit 1
     fi
 else
-    echo "Warning: python black not found. Formatting skipped for python."
+    echo -e "\n\033[1;33mWarning\033[0m: python black not found. Formatting skipped for python.\n"
 fi
 
 if [ "$1" != "check" ]; then
@@ -7,7 +7,7 @@ import sys
 
 cfg = dict()
 
-if 'CFG_OUTPUT_PATH' not in os.environ:
+if "CFG_OUTPUT_PATH" not in os.environ:
     sys.exit("CFG_OUTPUT_PATH not set")
 
 input_path = os.environ["CFG_OUTPUT_PATH"]
@@ -31,7 +31,7 @@ for mname, module in cfg.items():
     fnname2SG = dict()
     # First, add all the intra-procedural edges
 
-    for (fname, v) in module['edges'].items():
+    for fname, v in module["edges"].items():
 
         if fname not in fname2id:
             GG.add_node(f_ids, label=fname)
@@ -41,8 +41,7 @@ for mname, module in cfg.items():
         sz = len(v)
         for idx in range(node_ids, node_ids + sz):
             G.add_node(idx)
-            G.nodes[idx]['label'] = mname + ' ' + \
-                fname + ' ' + str(idx - node_ids)
+            G.nodes[idx]["label"] = mname + " " + fname + " " + str(idx - node_ids)
         node_id_list = list(range(node_ids, node_ids + sz))
         node_ids += sz
         SG = G.subgraph(node_id_list)
@@ -52,14 +51,14 @@ for mname, module in cfg.items():
                 G.add_edge(node_id_list[src], node_id_list[item])
 
     # Next, build inter-procedural edges
-    for (fname, calls) in module['calls'].items():
-        for (idx, target_fns) in calls.items():
+    for fname, calls in module["calls"].items():
+        for idx, target_fns in calls.items():
             # G.nodes isn't sorted
 
             src = sorted(fnname2SG[fname].nodes())[0] + int(idx)
             for target_fn in target_fns:
                 if target_fn in fnname2SG:
-                    offset = module['entries'][target_fn]
+                    offset = module["entries"][target_fn]
 
                     dst = sorted(fnname2SG[target_fn].nodes)[0] + offset
 
@@ -8,6 +8,7 @@ import sys
 import json
 import re
+from collections import defaultdict
 
 # import pygraphviz as pgv
 
 gram_data = None
@@ -24,20 +25,20 @@ stack_limit = None
 # Holds the set of unexpanded rules owing to the user-passed stack constraint limit
 unexpanded_rules = set()
 
+
 def main(grammar, limit):
     global worklist, gram_data, stack_limit
-    current = '0'
+    current = "0"
     stack_limit = limit
     if stack_limit:
-        print ('[X] Operating in bounded stack mode')
+        print("[X] Operating in bounded stack mode")
 
-    with open(grammar, 'r') as fd:
+    with open(grammar, "r") as fd:
         gram_data = json.load(fd)
     start_symbol = gram_data["Start"][0]
     worklist.append([current, [start_symbol]])
     # print (grammar)
-    filename = (grammar.split('/')[-1]).split('.')[0]
+    filename = (grammar.split("/")[-1]).split(".")[0]
 
     while worklist:
         # Take an element from the worklist
@@ -46,45 +47,54 @@ def main(grammar, limit):
         element = worklist.pop(0)
         prep_transitions(element)
 
-    pda_file = filename + '_transition.json'
-    graph_file = filename + '.png'
+    pda_file = filename + "_transition.json"
+    graph_file = filename + ".png"
     # print ('XXXXXXXXXXXXXXXX')
     # print ('PDA file:%s Png graph file:%s' % (pda_file, graph_file))
     # XXX Commented out because visualization of current version of PHP causes segfault
     # Create the graph and dump the transitions to a file
     # create_graph(filename)
     transformed = postprocess()
-    with open(filename + '_automata.json', 'w+') as fd:
+    with open(filename + "_automata.json", "w+") as fd:
         json.dump(transformed, fd)
-    with open(filename + '_transition.json', 'w+') as fd:
+    with open(filename + "_transition.json", "w+") as fd:
         json.dump(pda, fd)
     if not unexpanded_rules:
-        print ('[X] No unexpanded rules, absolute FSA formed')
+        print("[X] No unexpanded rules, absolute FSA formed")
         exit(0)
     else:
-        print ('[X] Certain rules were not expanded due to stack size limit. Inexact approximation has been created and the disallowed rules have been put in {}_disallowed.json'.format(filename))
-        print ('[X] Number of unexpanded rules:', len(unexpanded_rules))
-        with open(filename + '_disallowed.json', 'w+') as fd:
+        print(
+            "[X] Certain rules were not expanded due to stack size limit. Inexact approximation has been created and the disallowed rules have been put in {}_disallowed.json".format(
+                filename
+            )
+        )
+        print("[X] Number of unexpanded rules:", len(unexpanded_rules))
+        with open(filename + "_disallowed.json", "w+") as fd:
             json.dump(list(unexpanded_rules), fd)
 
+
 def create_graph(filename):
-    '''
+    """
     Creates a DOT representation of the PDA
-    '''
+    """
     global pda
     G = pgv.AGraph(strict=False, directed=True)
     for transition in pda:
-        print ('Transition:', transition)
-        G.add_edge(transition['source'], transition['dest'],
-                label = 'Term:{}'.format(transition['terminal']))
-    G.layout(prog = 'dot')
-    print ('Do it up 2')
-    G.draw(filename + '.png')
+        print("Transition:", transition)
+        G.add_edge(
+            transition["source"],
+            transition["dest"],
+            label="Term:{}".format(transition["terminal"]),
+        )
+    G.layout(prog="dot")
+    print("Do it up 2")
+    G.draw(filename + ".png")
 
+
 def prep_transitions(element):
-    '''
+    """
     Generates transitions
-    '''
+    """
     global gram_data, state_count, pda, worklist, state_stacks, stack_limit, unexpanded_rules
     state = element[0]
     try:
@@ -99,14 +109,14 @@ def prep_transitions(element):
         # print ('Current state:', state)
         terminal, ss, termIsRegex = tokenize(rule)
         transition = get_template()
-        transition['trigger'] = '_'.join([state, str(count)])
-        transition['source'] = state
-        transition['dest'] = str(state_count)
-        transition['ss'] = ss
-        transition['terminal'] = terminal
-        transition['rule'] = "{} -> {}".format(nonterminal, rule )
+        transition["trigger"] = "_".join([state, str(count)])
+        transition["source"] = state
+        transition["dest"] = str(state_count)
+        transition["ss"] = ss
+        transition["terminal"] = terminal
+        transition["rule"] = "{} -> {}".format(nonterminal, rule)
         if termIsRegex:
-            transition['termIsRegex'] = True
+            transition["termIsRegex"] = True
 
         # Creating a state stack for the new state
         try:
@@ -118,7 +128,7 @@ def prep_transitions(element):
         if ss:
             for symbol in ss[::-1]:
                 state_stack.insert(0, symbol)
-        transition['stack'] = state_stack
+        transition["stack"] = state_stack
 
         # Check if a recursive transition state being created, if so make a backward
         # edge and don't add anything to the worklist
@@ -128,7 +138,7 @@ def prep_transitions(element):
             # print ('Stack:', sorted(stack))
             # print ('State stack:', sorted(state_stack))
             if sorted(stack) == sorted(state_stack):
-                transition['dest'] = state_element
+                transition["dest"] = state_element
                 # print ('Recursive:', transition)
                 pda.append(transition)
                 count += 1
@@ -142,24 +152,25 @@ def prep_transitions(element):
         # If the generated state has a stack size > stack_limit then that state is abandoned
         # and not added to the FSA or the worklist for further expansion
         if stack_limit:
-            if (len(transition['stack']) > stack_limit):
-                unexpanded_rules.add(transition['rule'])
+            if len(transition["stack"]) > stack_limit:
+                unexpanded_rules.add(transition["rule"])
                 continue
 
         # Create transitions for the non-recursive relations and add to the worklist
         # print ('Normal:', transition)
         # print ('State2:', state)
         pda.append(transition)
-        worklist.append([transition['dest'], transition['stack']])
-        state_stacks[transition['dest']] = state_stack
+        worklist.append([transition["dest"], transition["stack"]])
+        state_stacks[transition["dest"]] = state_stack
         state_count += 1
         count += 1
 
+
 def tokenize(rule):
-    '''
+    """
     Gets the terminal and the corresponding stack symbols from a rule in GNF form
-    '''
-    pattern = re.compile("([r])*\'([\s\S]+)\'([\s\S]*)")
+    """
+    pattern = re.compile("([r])*'([\s\S]+)'([\s\S]*)")
     terminal = None
     ss = None
     termIsRegex = False
@@ -176,21 +187,23 @@ def tokenize(rule):
 
     return terminal, ss, termIsRegex
 
+
 def get_template():
     transition_template = {
-        'trigger':None,
-        'source': None,
-        'dest': None,
-        'termIsRegex': False,
-        'terminal' : None,
-        'stack': []
+        "trigger": None,
+        "source": None,
+        "dest": None,
+        "termIsRegex": False,
+        "terminal": None,
+        "stack": [],
     }
     return transition_template
 
+
 def postprocess1():
-    '''
+    """
     Creates a representation to be passed on to the C-module
-    '''
+    """
     global pda
     final_struct = {}
     # Supporting data structures for if stack limit is imposed
@@ -198,12 +211,11 @@ def postprocess1():
     culled_final = []
     num_transitions = 0 # Keep track of number of transitions
 
-
     states, final, initial = _get_states()
     memoized = [[]] * len(states)
 
     print(initial)
-    assert len(initial) == 1, 'More than one init state found'
+    assert len(initial) == 1, "More than one init state found"
 
     # Cull transitions to states which were not expanded owing to the stack limit
     if stack_limit:
@@ -211,7 +223,9 @@ def postprocess1():
         blocklist = []
         for final_state in final:
             for transition in pda:
-                if (transition["dest"] == final_state) and (len(transition["stack"]) > 0):
+                if (transition["dest"] == final_state) and (
+                    len(transition["stack"]) > 0
+                ):
                     blocklist.append(transition["dest"])
                     continue
                 else:
@@ -219,42 +233,45 @@ def postprocess1():
 
         culled_final = [state for state in final if state not in blocklist]
 
-        assert len(culled_final) == 1, 'More than one final state found'
+        assert len(culled_final) == 1, "More than one final state found"
 
         for transition in culled_pda:
             state = transition["source"]
             if transition["dest"] in blocklist:
                 continue
             num_transitions += 1
-            memoized[int(state)].append((transition["trigger"],
-                int(transition["dest"]), transition["terminal"]))
+            memoized[int(state)].append(
+                (transition["trigger"], int(transition["dest"]), transition["terminal"])
+            )
         final_struct["init_state"] = int(initial)
         final_struct["final_state"] = int(culled_final[0])
         # The reason we do this is because when states are culled, the indexing is
         # still relative to the actual number of states hence we keep numstates recorded
         # as the original number of states
-        print ('[X] Actual Number of states:', len(memoized))
-        print ('[X] Number of transitions:', num_transitions)
-        print ('[X] Original Number of states:', len(states))
+        print("[X] Actual Number of states:", len(memoized))
+        print("[X] Number of transitions:", num_transitions)
+        print("[X] Original Number of states:", len(states))
         final_struct["pda"] = memoized
         return final_struct
 
     # Running FSA construction in exact approximation mode and postprocessing it like so
     for transition in pda:
         state = transition["source"]
-        memoized[int(state)].append((transition["trigger"],
-            int(transition["dest"]), transition["terminal"]))
+        memoized[int(state)].append(
+            (transition["trigger"], int(transition["dest"]), transition["terminal"])
+        )
 
     final_struct["init_state"] = int(initial)
     final_struct["final_state"] = int(final[0])
-    print ('[X] Actual Number of states:', len(memoized))
+    print("[X] Actual Number of states:", len(memoized))
     final_struct["pda"] = memoized
     return final_struct
 
+
 def postprocess():
-    '''
+    """
     Creates a representation to be passed on to the C-module
-    '''
+    """
     global pda
     final_struct = {}
     memoized = defaultdict(list)
@@ -263,11 +280,10 @@ def postprocess():
     culled_final = []
     num_transitions = 0 # Keep track of number of transitions
 
-
     states, final, initial = _get_states()
 
     print(initial)
-    assert len(initial) == 1, 'More than one init state found'
+    assert len(initial) == 1, "More than one init state found"
 
     # Cull transitions to states which were not expanded owing to the stack limit
     if stack_limit:
@@ -275,7 +291,9 @@ def postprocess():
         blocklist = []
         for final_state in final:
             for transition in pda:
-                if (transition["dest"] == final_state) and (len(transition["stack"]) > 0):
+                if (transition["dest"] == final_state) and (
+                    len(transition["stack"]) > 0
+                ):
                     blocklist.append(transition["dest"])
                     continue
                 else:
@@ -283,38 +301,38 @@ def postprocess():
 
         culled_final = [state for state in final if state not in blocklist]
 
-        assert len(culled_final) == 1, 'More than one final state found'
+        assert len(culled_final) == 1, "More than one final state found"
 
         for transition in culled_pda:
             state = transition["source"]
             if transition["dest"] in blocklist:
                 continue
             num_transitions += 1
-            memoized[int(state)].append([transition["trigger"], int(transition["dest"]),
-                transition["terminal"]])
-
+            memoized[int(state)].append(
+                [transition["trigger"], int(transition["dest"]), transition["terminal"]]
+            )
 
         final_struct["init_state"] = int(initial)
         final_struct["final_state"] = int(culled_final[0])
         # The reason we do this is because when states are culled, the indexing is
         # still relative to the actual number of states hence we keep numstates recorded
         # as the original number of states
-        print ('[X] Actual Number of states:', len(memoized.keys()))
-        print ('[X] Number of transitions:', num_transitions)
-        print ('[X] Original Number of states:', len(states))
+        print("[X] Actual Number of states:", len(memoized.keys()))
+        print("[X] Number of transitions:", num_transitions)
+        print("[X] Original Number of states:", len(states))
         # final_struct["numstates"] = len(states)
         memoized_list = [[]] * len(states)
     else:
         # Running FSA construction in exact approximation mode and postprocessing it like so
         for transition in pda:
             state = transition["source"]
-            memoized[int(state)].append([transition["trigger"], int(transition["dest"]),
-                transition["terminal"]])
+            memoized[int(state)].append(
+                [transition["trigger"], int(transition["dest"]), transition["terminal"]]
+            )
 
         final_struct["init_state"] = int(initial)
         final_struct["final_state"] = int(final[0])
-        print ('[X] Actual Number of states:', len(memoized.keys()))
+        print("[X] Actual Number of states:", len(memoized.keys()))
         # final_struct["numstates"] = len(memoized.keys())
         memoized_list = [[]] * len(memoized.keys())
 
@@ -333,19 +351,23 @@ def _get_states():
         dest.add(transition["dest"])
     source_copy = source.copy()
     source_copy.update(dest)
-    return list(source_copy), list(dest.difference(source)), str(''.join(list(source.difference(dest))))
+    return (
+        list(source_copy),
+        list(dest.difference(source)),
+        str("".join(list(source.difference(dest)))),
+    )
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     import argparse
-    parser = argparse.ArgumentParser(description = 'Script to convert GNF grammar to PDA')
-    parser.add_argument(
-        '--gf',
-        type = str,
-        help = 'Location of GNF grammar')
+
+    parser = argparse.ArgumentParser(description="Script to convert GNF grammar to PDA")
+    parser.add_argument("--gf", type=str, help="Location of GNF grammar")
     parser.add_argument(
-        '--limit',
+        "--limit",
         type=int,
         default=None,
-        help = 'Specify the upper bound for the stack size')
+        help="Specify the upper bound for the stack size",
+    )
     args = parser.parse_args()
     main(args.gf, args.limit)
@@ -16,17 +16,18 @@ DEBUG = False
 NONTERMINALSET = []
 COUNT = 1
 
+
 def convert_to_gnf(grammar, start):
     if DEBUG:
-        with open('debug_preprocess.json', 'w+') as fd:
+        with open("debug_preprocess.json", "w+") as fd:
             json.dump(grammar, fd)
     grammar = remove_unit(grammar) # eliminates unit productions
     if DEBUG:
-        with open('debug_unit.json', 'w+') as fd:
+        with open("debug_unit.json", "w+") as fd:
             json.dump(grammar, fd)
     grammar = remove_mixed(grammar) # eliminate terminals existing with non-terminals
     if DEBUG:
-        with open('debug_mixed.json', 'w+') as fd:
+        with open("debug_mixed.json", "w+") as fd:
             json.dump(grammar, fd)
     grammar = gnf(grammar)
@@ -35,12 +36,13 @@ def convert_to_gnf(grammar, start):
     # with open('debug_gnf_reachable.json', 'w+') as fd:
     #     json.dump(reachable_grammar, fd)
     if DEBUG:
-        with open('debug_gnf.json', 'w+') as fd:
+        with open("debug_gnf.json", "w+") as fd:
             json.dump(grammar, fd)
 
     grammar["Start"] = [start]
     return grammar
 
+
 def remove_left_recursion(grammar):
     # Remove the left recursion in the grammar rules.
     # This algorithm is adopted from
@@ -69,10 +71,10 @@ def remove_left_recursion(grammar):
             r.append(new_rule)
         left_recursion = [r[1:] + [new_rule] for r in left_recursion]
         left_recursion.append(["' '"])
-        new_grammar[lhs] = [' '.join(rule) for rule in others]
-        new_grammar[new_rule] = [' '.join(rule) for rule in left_recursion]
+        new_grammar[lhs] = [" ".join(rule) for rule in others]
+        new_grammar[new_rule] = [" ".join(rule) for rule in left_recursion]
     else:
-        new_grammar[lhs] = [' '.join(rule) for rule in others]
+        new_grammar[lhs] = [" ".join(rule) for rule in others]
     no_left_recursion = True
     for lhs, rules in old_grammar.items():
         for rule in rules:
@@ -88,10 +90,11 @@ def remove_left_recursion(grammar):
     new_grammar = defaultdict(list)
     return new_grammar
 
+
 def get_reachable(grammar, start):
-    '''
+    """
     Returns a grammar without dead rules
-    '''
+    """
     reachable_nt = set()
     worklist = list()
     processed = set()
@@ -113,9 +116,10 @@ def get_reachable(grammar, start):
 
+
 def gettokens(rule):
-    pattern = re.compile("([^\s\"\']+)|\"([^\"]*)\"|\'([^\']*)\'")
+    pattern = re.compile("([^\s\"']+)|\"([^\"]*)\"|'([^']*)'")
     return [matched.group(0) for matched in pattern.finditer(rule)]
 
 
 def gnf(grammar):
     old_grammar = copy.deepcopy(grammar)
     new_grammar = defaultdict(list)
@@ -129,7 +133,7 @@ def gnf(grammar):
             new_grammar[lhs].append(rule)
             continue
         startoken = tokens[0]
-        assert(startoken != lhs)
+        assert startoken != lhs
         endrule = tokens[1:]
         if not isTerminal(startoken):
             newrules = []
@@ -139,7 +143,7 @@ def gnf(grammar):
                 temprule.insert(0, extension)
                 newrules.append(temprule)
             for newnew in newrules:
-                new_grammar[lhs].append(' '.join(newnew))
+                new_grammar[lhs].append(" ".join(newnew))
         else:
             new_grammar[lhs].append(rule)
     isgnf = True
@@ -163,7 +167,7 @@ def process_antlr4_grammar(data):
     productions = []
     production = []
     for line in data:
-        if line != '\n':
+        if line != "\n":
             production.append(line)
         else:
             productions.append(production)
@@ -172,16 +176,17 @@ def process_antlr4_grammar(data):
     for production in productions:
         rules = []
         init = production[0]
-        nonterminal = init.split(':')[0]
-        rules.append(strip_chars(init.split(':')[1]).strip('| '))
+        nonterminal = init.split(":")[0]
+        rules.append(strip_chars(init.split(":")[1]).strip("| "))
         for production_rule in production[1:]:
-            rules.append(strip_chars(production_rule.split('|')[0]))
+            rules.append(strip_chars(production_rule.split("|")[0]))
         final_rule_set[nonterminal] = rules
     # for line in data:
     #     if line != '\n':
     #         production.append(line)
     return final_rule_set
 
+
 def remove_unit(grammar):
     nounitproductions = False
     old_grammar = copy.deepcopy(grammar)
@@ -213,19 +218,21 @@ def remove_unit(grammar):
     new_grammar = defaultdict(list)
     return new_grammar
 
+
 def isTerminal(rule):
     # pattern = re.compile("([r]*\'[\s\S]+\')")
-    pattern = re.compile("\'(.*?)\'")
+    pattern = re.compile("'(.*?)'")
     match = pattern.match(rule)
     if match:
         return True
     else:
         return False
 
+
 def remove_mixed(grammar):
-    '''
+    """
     Remove rules where there are terminals mixed in with non-terminals
-    '''
+    """
     new_grammar = defaultdict(list)
     for lhs, rules in grammar.items():
         for rhs in rules:
@@ -248,17 +255,20 @@ def remove_mixed(grammar):
                     regen_rule.append(new_nonterm)
                 else:
                     regen_rule.append(token)
-            new_grammar[lhs].append(' '.join(regen_rule))
+            new_grammar[lhs].append(" ".join(regen_rule))
     return new_grammar
 
+
 def strip_chars(rule):
-    return rule.strip('\n\t ')
+    return rule.strip("\n\t ")
 
+
 def get_nonterminal():
     global COUNT
     COUNT += 1
     return f"GeneratedTermVar{COUNT}"
 
+
 def terminal_exist(token, grammar):
     for nonterminal, rules in grammar.items():
         if token in rules and len(token) == 1:
@@ -269,42 +279,37 @@ def terminal_exist(token, grammar):
 def main(grammar_file, out, start):
     grammar = None
     # If grammar file is a preprocessed NT file, then skip preprocessing
-    if '.json' in grammar_file:
-        with open(grammar_file, 'r') as fd:
+    if ".json" in grammar_file:
+        with open(grammar_file, "r") as fd:
             grammar = json.load(fd)
-    elif '.g4' in grammar_file:
-        with open(grammar_file, 'r') as fd:
+    elif ".g4" in grammar_file:
+        with open(grammar_file, "r") as fd:
             data = fd.readlines()
         grammar = process_antlr4_grammar(data)
     else:
-        raise('Unknwown file format passed. Accepts (.g4/.json)')
+        raise ("Unknwown file format passed. Accepts (.g4/.json)")
 
     grammar = convert_to_gnf(grammar, start)
-    with open(out, 'w+') as fd:
+    with open(out, "w+") as fd:
         json.dump(grammar, fd)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     import argparse
-    parser = argparse.ArgumentParser(description = 'Script to convert grammar to GNF form')
+
+    parser = argparse.ArgumentParser(
+        description="Script to convert grammar to GNF form"
+    )
     parser.add_argument(
-        '--gf',
-        type = str,
-        required = True,
-        help = 'Location of grammar file')
+        "--gf", type=str, required=True, help="Location of grammar file"
+    )
     parser.add_argument(
-        '--out',
-        type = str,
-        required = True,
-        help = 'Location of output file')
-    parser.add_argument(
-        '--start',
-        type = str,
-        required = True,
-        help = 'Start token')
+        "--out", type=str, required=True, help="Location of output file"
+    )
+    parser.add_argument("--start", type=str, required=True, help="Start token")
     parser.add_argument(
-        '--debug',
-        action='store_true',
-        help = 'Write intermediate states to debug files')
+        "--debug", action="store_true", help="Write intermediate states to debug files"
+    )
     args = parser.parse_args()
     DEBUG = args.debug
 
utils/libafl_fmt/Cargo.toml

@@ -20,3 +20,4 @@ tokio = { version = "1.38", features = [
 clap = { version = "4.5", features = ["derive"] }
 exitcode = "1.1"
 which = "6.0"
+colored = "2.1.0"
utils/libafl_fmt/src/main.rs

@@ -78,12 +78,13 @@ use std::{
 };
 
 use clap::Parser;
+use colored::Colorize;
 use regex::RegexSet;
 use tokio::{process::Command, task::JoinSet};
 use walkdir::{DirEntry, WalkDir};
 use which::which;
 
-const REF_LLVM_VERSION: u32 = 18;
+const REF_LLVM_VERSION: u32 = 19;
 
 fn is_workspace_toml(path: &Path) -> bool {
     for line in read_to_string(path).unwrap().lines() {
@@ -249,20 +250,29 @@ async fn main() -> io::Result<()> {
         tokio_joinset.spawn(run_cargo_fmt(project, cli.check, cli.verbose));
     }
 
-    let ref_clang_format = format!("clang-format-{REF_LLVM_VERSION}");
+    let reference_clang_format = format!("clang-format-{REF_LLVM_VERSION}");
+    let unspecified_clang_format = "clang-format";
+
+    let (clang, warning) = if which(&reference_clang_format).is_ok() {
+        (Some(reference_clang_format.as_str()), None)
+    } else if which(unspecified_clang_format).is_ok() {
+        let version = Command::new(unspecified_clang_format)
+            .arg("--version")
+            .output()
+            .await?
+            .stdout;
 
-    let (clang, warning) = if which(ref_clang_format.clone()).is_ok() {
-        // can't use 18 for ci.
-        (Some(ref_clang_format), None)
-    } else if which("clang-format").is_ok() {
         (
-            Some("clang-format".to_string()),
-            Some("using clang-format, could provide a different result from clang-format-17"),
+            Some(unspecified_clang_format),
+            Some(format!(
+                "using {}, could provide a different result from clang-format-17",
+                from_utf8(&version).unwrap().replace('\n', "")
+            )),
         )
     } else {
         (
             None,
-            Some("clang-format not found. Skipping C formatting..."),
+            Some("clang-format not found. Skipping C formatting...".to_string()),
        )
     };
     // println!("Using {:#?} to format...", clang);
@@ -277,7 +287,12 @@ async fn main() -> io::Result<()> {
             .collect();
 
         for c_file in c_files_to_fmt {
-            tokio_joinset.spawn(run_clang_fmt(c_file, clang.clone(), cli.check, cli.verbose));
+            tokio_joinset.spawn(run_clang_fmt(
+                c_file,
+                clang.to_string(),
+                cli.check,
+                cli.verbose,
+            ));
         }
     }
 
@@ -292,7 +307,7 @@ async fn main() -> io::Result<()> {
     }
 
     if let Some(warning) = warning {
-        println!("Warning: {warning}");
+        println!("\n{}: {}\n", "Warning".yellow().bold(), warning);
     }
 
     if cli.check {
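The libafl_fmt changes above bump the pinned binary to clang-format-19 and make the fallback warning report the version that was actually found, probed via `clang-format --version`. A blocking sketch of the same probe using only std (the diff itself runs tokio's `Command` inside async main):

    use std::process::Command;
    use std::str::from_utf8;

    /// Sketch: prefer the pinned binary, otherwise fall back to plain
    /// `clang-format` and report which version the fallback actually is.
    fn pick_clang_format(pinned: &str) -> (Option<String>, Option<String>) {
        if Command::new(pinned).arg("--version").output().is_ok() {
            (Some(pinned.to_string()), None)
        } else if let Ok(out) = Command::new("clang-format").arg("--version").output() {
            let version = from_utf8(&out.stdout).unwrap_or("").replace('\n', "");
            (
                Some("clang-format".to_string()),
                Some(format!("using {version}, could differ from {pinned}")),
            )
        } else {
            (
                None,
                Some("clang-format not found. Skipping C formatting...".to_string()),
            )
        }
    }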