Fix gnf_converter.py script (#616)

This commit is contained in:
Ao Li 2022-05-10 10:48:48 -07:00 committed by GitHub
parent a02b90be44
commit 9e382c4177
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 66 additions and 65 deletions

View File

@ -28,10 +28,6 @@ def convert_to_gnf(grammar, start):
if DEBUG:
with open('debug_mixed.json', 'w+') as fd:
json.dump(grammar, fd)
grammar = break_rules(grammar) # eliminate rules with more than two non-terminals
if DEBUG:
with open('debug_break.json', 'w+') as fd:
json.dump(grammar, fd)
grammar = gnf(grammar)
# Dump GNF form of the grammar with only reachable rules
@ -45,6 +41,53 @@ def convert_to_gnf(grammar, start):
grammar["Start"] = [start]
return grammar
def remove_left_recursion(grammar):
    """Remove direct left recursion from the rules of ``grammar``.

    For every left-hand side ``A`` that has at least one alternative whose
    first token is ``A`` itself, a fresh nonterminal ``N`` is obtained from
    ``get_nonterminal()`` and the alternatives are rewritten as::

        A -> <other> N     for each alternative not starting with A
        N -> <rest> N      for each alternative "A <rest>"
        N -> ' '

    Left-hand sides without such an alternative keep their rules (re-joined
    from their tokens with single spaces).

    This algorithm is adapted from
    https://www.geeksforgeeks.org/introduction-of-parsing-ambiguity-and-parsers-set-1/

    Note that the implementation does not guarantee completeness: indirect
    recursion similar to ``{ "A": ["BC"], "B": ["AD"] }`` is not removed.
    Therefore this function needs to be called each time the rules are
    updated.

    Args:
        grammar: mapping from a nonterminal to its list of rule strings.
            The mapping is only read, never modified.

    Returns:
        A ``defaultdict(list)`` containing the rewritten rules.
    """
    # A single pass over the grammar is all the earlier implementation ever
    # performed: its surrounding ``while`` loop reset the wrong variable
    # (``left_recursion`` instead of ``no_left_recursion``), so the exit flag
    # stayed True and the re-scan after the pass had no effect. The dead loop
    # is removed here; the produced grammar is unchanged.
    new_grammar = defaultdict(list)
    for lhs, rules in grammar.items():
        recursive_tails = []  # token lists of "lhs <rest>" rules, head dropped
        others = []           # token lists of all remaining rules
        for rule in rules:
            tokens = gettokens(rule)
            if tokens[0] == lhs:
                recursive_tails.append(tokens[1:])
            else:
                others.append(tokens)

        if not recursive_tails:
            new_grammar[lhs] = [' '.join(tokens) for tokens in others]
            continue

        new_rule = get_nonterminal()
        new_grammar[lhs] = [' '.join(tokens + [new_rule]) for tokens in others]
        new_grammar[new_rule] = [
            ' '.join(tail + [new_rule]) for tail in recursive_tails
        ]
        # Terminating alternative of the helper nonterminal
        # (presumably stands in for the empty production -- TODO confirm).
        new_grammar[new_rule].append("' '")
    return new_grammar
def get_reachable(grammar, start):
'''
Returns a grammar without dead rules
@ -78,6 +121,7 @@ def gnf(grammar):
new_grammar = defaultdict(list)
isgnf = False
while not isgnf:
old_grammar = remove_left_recursion(old_grammar)
for lhs, rules in old_grammar.items():
for rule in rules:
tokens = gettokens(rule)
@ -85,6 +129,7 @@ def gnf(grammar):
new_grammar[lhs].append(rule)
continue
startoken = tokens[0]
assert(startoken != lhs)
endrule = tokens[1:]
if not isTerminal(startoken):
newrules = []
@ -184,22 +229,17 @@ def remove_mixed(grammar):
new_grammar = defaultdict(list)
for lhs, rules in grammar.items():
for rhs in rules:
# tokens = rhs.split(' ')
regen_rule = []
# print('---------------------')
# print(rhs)
tokens = gettokens(rhs)
if len(gettokens(rhs)) == 1:
if len(tokens) == 1:
new_grammar[lhs].append(rhs)
continue
for token in tokens:
regen_rule = [tokens[0]]
for token in tokens[1:]:
# print(token, isTerminal(token), regen_rule)
# Identify if there is a terminal in the RHS
if isTerminal(token):
# Check if a corresponding nonterminal already exists
# nonterminal = terminal_exist(token, new_grammar)
nonterminal = None
# TODO(andrea) disabled ATM, further investigation using the Ruby grammar needed
nonterminal = terminal_exist(token, new_grammar)
if nonterminal:
regen_rule.append(nonterminal)
else:
@ -211,60 +251,17 @@ def remove_mixed(grammar):
new_grammar[lhs].append(' '.join(regen_rule))
return new_grammar
def break_rules(grammar):
    """Shorten rules until no non-terminal rule has more than two tokens.

    A rule is shortened when ``gettokens`` yields more than two tokens for
    it and ``isTerminal`` is false for the whole rule string. All tokens but
    the last are moved into a separate rule: an existing nonterminal reported
    by ``terminal_exist`` is reused when available, otherwise a new one is
    taken from ``get_nonterminal()``. The original rule then becomes
    ``"<nonterminal> <last token>"``. Passes are repeated over the result
    until no rule needs shortening.

    Args:
        grammar: mapping from a nonterminal to its list of rule strings.
            A deep copy is processed; the argument is left untouched.

    Returns:
        A ``defaultdict(list)`` with the shortened rules.
    """
    pending = copy.deepcopy(grammar)
    while True:
        result = defaultdict(list)
        for lhs, rules in pending.items():
            for rhs in rules:
                tokens = gettokens(rhs)
                # Short rules and terminal rules are carried over unchanged.
                if len(tokens) <= 2 or isTerminal(rhs):
                    result[lhs].append(rhs)
                    continue
                prefix = ' '.join(tokens[:-1])
                holder = terminal_exist(prefix, result)
                if not holder:
                    holder = get_nonterminal()
                    result[holder].append(prefix)
                result[lhs].append(' '.join([holder, tokens[-1]]))

        # Another pass is needed while any produced rule is still too long.
        still_long = any(
            len(gettokens(rhs)) > 2 and (not isTerminal(rhs))
            for rules in result.values()
            for rhs in rules
        )
        if not still_long:
            return result
        pending = copy.deepcopy(result)
def strip_chars(rule):
    """Return ``rule`` without leading/trailing newlines, tabs and spaces."""
    trimmed = rule.strip('\n\t ')
    return trimmed
def get_nonterminal():
    """Pop and return the first entry of the module-level ``NONTERMINALSET``.

    When the pool is empty, ``_repopulate()`` is called first to refill it.
    """
    global NONTERMINALSET
    if not NONTERMINALSET:
        _repopulate()
    return NONTERMINALSET.pop(0)
def _repopulate():
    """Refill ``NONTERMINALSET`` and advance ``COUNT``.

    The pool is rebuilt from every ``COUNT``-letter combination of
    ``ascii_uppercase`` (each joined into one string); ``COUNT`` is then
    incremented and a ``GeneratedTermVar<COUNT>`` label is returned.

    NOTE(review): the caller visible in this file ignores the return value;
    confirm that the trailing ``return`` is intended.
    """
    global COUNT
    global NONTERMINALSET
    letter_groups = combinations(ascii_uppercase, COUNT)
    NONTERMINALSET = [''.join(group) for group in letter_groups]
    COUNT += 1
    return f"GeneratedTermVar{COUNT}"
def terminal_exist(token, grammar):
    """Look up the nonterminal that already produces ``token``.

    Args:
        token: rule string to search for.
        grammar: mapping from a nonterminal to its list of rule strings.

    Returns:
        The first nonterminal (in the mapping's iteration order) whose rule
        list contains ``token``, provided ``len(token) == 1``; otherwise
        ``None``.
    """
    candidates = (
        nonterminal
        for nonterminal, rules in grammar.items()
        if token in rules and len(token) == 1
    )
    return next(candidates, None)

View File

@ -0,0 +1,4 @@
{
"B": ["'a'", "'b'"],
"A": ["A 'a'", "'a'"]
}