Fix gnf_converter.py script (#616)
This commit is contained in:
parent
a02b90be44
commit
9e382c4177
@ -28,10 +28,6 @@ def convert_to_gnf(grammar, start):
|
|||||||
if DEBUG:
|
if DEBUG:
|
||||||
with open('debug_mixed.json', 'w+') as fd:
|
with open('debug_mixed.json', 'w+') as fd:
|
||||||
json.dump(grammar, fd)
|
json.dump(grammar, fd)
|
||||||
grammar = break_rules(grammar) # eliminate rules with more than two non-terminals
|
|
||||||
if DEBUG:
|
|
||||||
with open('debug_break.json', 'w+') as fd:
|
|
||||||
json.dump(grammar, fd)
|
|
||||||
grammar = gnf(grammar)
|
grammar = gnf(grammar)
|
||||||
|
|
||||||
# Dump GNF form of the grammar with only reachable rules
|
# Dump GNF form of the grammar with only reachable rules
|
||||||
@ -45,6 +41,53 @@ def convert_to_gnf(grammar, start):
|
|||||||
grammar["Start"] = [start]
|
grammar["Start"] = [start]
|
||||||
return grammar
|
return grammar
|
||||||
|
|
||||||
|
def remove_left_recursion(grammar):
    """Remove direct left recursion from the grammar rules.

    This algorithm is adopted from
    https://www.geeksforgeeks.org/introduction-of-parsing-ambiguity-and-parsers-set-1/
    Note that the current implementation does not guarantee completeness
    and will not remove indirect recursions similar to
    { "A": ["BC"], "B": ["AD"] }.  Therefore, we need to call this
    function each time the rules are updated.

    :param grammar: dict mapping LHS nonterminal -> list of rule strings.
    :return: defaultdict(list) with direct left recursion rewritten.
    """
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    no_left_recursion = False
    while not no_left_recursion:
        for lhs, rules in old_grammar.items():
            left_recursion = []
            others = []
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    left_recursion.append(tokens)
                else:
                    others.append(tokens)
            if left_recursion:
                # Classic rewrite:  A -> A a | b   becomes
                #   A  -> b A'
                #   A' -> a A' | ' '   (epsilon)
                new_rule = get_nonterminal()
                for r in others:
                    r.append(new_rule)
                left_recursion = [r[1:] + [new_rule] for r in left_recursion]
                left_recursion.append(["' '"])
                new_grammar[lhs] = [' '.join(rule) for rule in others]
                new_grammar[new_rule] = [' '.join(rule) for rule in left_recursion]
            else:
                new_grammar[lhs] = [' '.join(rule) for rule in others]
        # Re-scan for any remaining direct left recursion; another pass is
        # needed only if some rule still starts with its own LHS.
        no_left_recursion = True
        for lhs, rules in old_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    # BUG FIX: this previously assigned `left_recursion = False`,
                    # clobbering the local list and never clearing the exit flag,
                    # so the re-scan could never trigger another pass.
                    no_left_recursion = False
                    break
            else:
                continue
            break
        if not no_left_recursion:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar
|
||||||
|
|
||||||
def get_reachable(grammar, start):
|
def get_reachable(grammar, start):
|
||||||
'''
|
'''
|
||||||
Returns a grammar without dead rules
|
Returns a grammar without dead rules
|
||||||
@ -78,6 +121,7 @@ def gnf(grammar):
|
|||||||
new_grammar = defaultdict(list)
|
new_grammar = defaultdict(list)
|
||||||
isgnf = False
|
isgnf = False
|
||||||
while not isgnf:
|
while not isgnf:
|
||||||
|
old_grammar = remove_left_recursion(old_grammar)
|
||||||
for lhs, rules in old_grammar.items():
|
for lhs, rules in old_grammar.items():
|
||||||
for rule in rules:
|
for rule in rules:
|
||||||
tokens = gettokens(rule)
|
tokens = gettokens(rule)
|
||||||
@ -85,6 +129,7 @@ def gnf(grammar):
|
|||||||
new_grammar[lhs].append(rule)
|
new_grammar[lhs].append(rule)
|
||||||
continue
|
continue
|
||||||
startoken = tokens[0]
|
startoken = tokens[0]
|
||||||
|
assert(startoken != lhs)
|
||||||
endrule = tokens[1:]
|
endrule = tokens[1:]
|
||||||
if not isTerminal(startoken):
|
if not isTerminal(startoken):
|
||||||
newrules = []
|
newrules = []
|
||||||
@ -184,22 +229,17 @@ def remove_mixed(grammar):
|
|||||||
new_grammar = defaultdict(list)
|
new_grammar = defaultdict(list)
|
||||||
for lhs, rules in grammar.items():
|
for lhs, rules in grammar.items():
|
||||||
for rhs in rules:
|
for rhs in rules:
|
||||||
# tokens = rhs.split(' ')
|
|
||||||
regen_rule = []
|
|
||||||
# print('---------------------')
|
|
||||||
# print(rhs)
|
|
||||||
tokens = gettokens(rhs)
|
tokens = gettokens(rhs)
|
||||||
if len(gettokens(rhs)) == 1:
|
if len(tokens) == 1:
|
||||||
new_grammar[lhs].append(rhs)
|
new_grammar[lhs].append(rhs)
|
||||||
continue
|
continue
|
||||||
for token in tokens:
|
regen_rule = [tokens[0]]
|
||||||
|
for token in tokens[1:]:
|
||||||
# print(token, isTerminal(token), regen_rule)
|
# print(token, isTerminal(token), regen_rule)
|
||||||
# Identify if there is a terminal in the RHS
|
# Identify if there is a terminal in the RHS
|
||||||
if isTerminal(token):
|
if isTerminal(token):
|
||||||
# Check if a corresponding nonterminal already exists
|
# Check if a corresponding nonterminal already exists
|
||||||
# nonterminal = terminal_exist(token, new_grammar)
|
nonterminal = terminal_exist(token, new_grammar)
|
||||||
nonterminal = None
|
|
||||||
# TODO(andrea) disabled ATM, further investigation using the Ruby grammar needed
|
|
||||||
if nonterminal:
|
if nonterminal:
|
||||||
regen_rule.append(nonterminal)
|
regen_rule.append(nonterminal)
|
||||||
else:
|
else:
|
||||||
@ -211,60 +251,17 @@ def remove_mixed(grammar):
|
|||||||
new_grammar[lhs].append(' '.join(regen_rule))
|
new_grammar[lhs].append(' '.join(regen_rule))
|
||||||
return new_grammar
|
return new_grammar
|
||||||
|
|
||||||
def break_rules(grammar):
    """Split productions with more than two tokens into binary rules.

    Repeatedly factors the prefix (all tokens but the last) of any long
    rule into a dedicated nonterminal, reusing an existing nonterminal
    when one already produces the same prefix, until no rule has more
    than two tokens.

    :param grammar: dict mapping LHS nonterminal -> list of rule strings.
    :return: defaultdict(list) where every non-terminal rule has <= 2 tokens.
    """
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    done = False
    while not done:
        for lhs, rules in old_grammar.items():
            for rhs in rules:
                tokens = gettokens(rhs)
                if len(tokens) <= 2 or isTerminal(rhs):
                    # Already binary (or a terminal rule): keep verbatim.
                    new_grammar[lhs].append(rhs)
                    continue
                # Factor everything but the last token into its own rule.
                prefix = ' '.join(tokens[:-1])
                nonterminal = terminal_exist(prefix, new_grammar)
                if not nonterminal:
                    nonterminal = get_nonterminal()
                    new_grammar[nonterminal].append(prefix)
                new_grammar[lhs].append(' '.join([nonterminal, tokens[-1]]))
        # One splitting pass shortens rules by a single token, so loop
        # until a full scan finds nothing left to break.
        done = True
        for rules in new_grammar.values():
            if any(len(gettokens(r)) > 2 and not isTerminal(r) for r in rules):
                done = False
                break
        if not done:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar
|
|
||||||
|
|
||||||
def strip_chars(rule):
    """Trim surrounding whitespace (newlines, tabs, spaces) from a rule string."""
    return rule.strip(" \t\n")
|
||||||
|
|
||||||
def get_nonterminal():
    """Mint a fresh nonterminal name, unique per invocation.

    Bumps the module-level COUNT counter and derives the name from it,
    so every call yields a name not handed out before.
    """
    global COUNT
    COUNT = COUNT + 1
    return "GeneratedTermVar%d" % COUNT
|
||||||
|
|
||||||
def terminal_exist(token, grammar):
    """Find a nonterminal whose productions already contain `token`.

    Only single-character tokens are eligible for reuse; anything longer
    is never matched.  Returns the matching LHS name, or None when no
    reusable nonterminal exists.
    """
    if len(token) != 1:
        return None
    for lhs, productions in grammar.items():
        if token in productions:
            return lhs
    return None
|
||||||
|
|
||||||
|
4
utils/gramatron/grammars/test1.json
Normal file
4
utils/gramatron/grammars/test1.json
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"B": ["'a'", "'b'"],
|
||||||
|
"A": ["A 'a'", "'a'"]
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user