397 lines
13 KiB
C++
397 lines
13 KiB
C++
//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This class can produce a generic deterministic finite state automaton (DFA),
|
|
// given a set of possible states and transitions.
|
|
//
|
|
// The input transitions can be nondeterministic - this class will produce the
|
|
// deterministic equivalent state machine.
|
|
//
|
|
// The generated code can run the DFA and produce an accepted / not accepted
|
|
// state and also produce, given a sequence of transitions that results in an
|
|
// accepted state, the sequence of intermediate states. This is useful if the
|
|
// initial automaton was nondeterministic - it allows mapping back from the DFA
|
|
// to the NFA.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
#define DEBUG_TYPE "dfa-emitter"
|
|
|
|
#include "DFAEmitter.h"
|
|
#include "CodeGenTarget.h"
|
|
#include "SequenceToOffsetTable.h"
|
|
#include "TableGenBackends.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/StringExtras.h"
|
|
#include "llvm/ADT/UniqueVector.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include "llvm/TableGen/Record.h"
|
|
#include "llvm/TableGen/TableGenBackend.h"
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
#include <map>
|
|
#include <set>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
using namespace llvm;
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// DfaEmitter implementation. This is independent of the GenAutomaton backend.
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
|
|
Actions.insert(A);
|
|
NfaStates.insert(From);
|
|
NfaStates.insert(To);
|
|
NfaTransitions[{From, A}].push_back(To);
|
|
++NumNfaTransitions;
|
|
}
|
|
|
|
void DfaEmitter::visitDfaState(const DfaState &DS) {
|
|
// For every possible action...
|
|
auto FromId = DfaStates.idFor(DS);
|
|
for (action_type A : Actions) {
|
|
DfaState NewStates;
|
|
DfaTransitionInfo TI;
|
|
// For every represented state, word pair in the original NFA...
|
|
for (state_type FromState : DS) {
|
|
// If this action is possible from this state add the transitioned-to
|
|
// states to NewStates.
|
|
auto I = NfaTransitions.find({FromState, A});
|
|
if (I == NfaTransitions.end())
|
|
continue;
|
|
for (state_type &ToState : I->second) {
|
|
NewStates.push_back(ToState);
|
|
TI.emplace_back(FromState, ToState);
|
|
}
|
|
}
|
|
if (NewStates.empty())
|
|
continue;
|
|
// Sort and unique.
|
|
sort(NewStates);
|
|
NewStates.erase(std::unique(NewStates.begin(), NewStates.end()),
|
|
NewStates.end());
|
|
sort(TI);
|
|
TI.erase(std::unique(TI.begin(), TI.end()), TI.end());
|
|
unsigned ToId = DfaStates.insert(NewStates);
|
|
DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI));
|
|
}
|
|
}
|
|
|
|
void DfaEmitter::constructDfa() {
|
|
DfaState Initial(1, /*NFA initial state=*/0);
|
|
DfaStates.insert(Initial);
|
|
|
|
// Note that UniqueVector starts indices at 1, not zero.
|
|
unsigned DfaStateId = 1;
|
|
while (DfaStateId <= DfaStates.size()) {
|
|
DfaState S = DfaStates[DfaStateId];
|
|
visitDfaState(S);
|
|
DfaStateId++;
|
|
}
|
|
}
|
|
|
|
void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
|
|
constructDfa();
|
|
|
|
OS << "// Input NFA has " << NfaStates.size() << " states with "
|
|
<< NumNfaTransitions << " transitions.\n";
|
|
OS << "// Generated DFA has " << DfaStates.size() << " states with "
|
|
<< DfaTransitions.size() << " transitions.\n\n";
|
|
|
|
// Implementation note: We don't bake a simple std::pair<> here as it requires
|
|
// significantly more effort to parse. A simple test with a large array of
|
|
// struct-pairs (N=100000) took clang-10 6s to parse. The same array of
|
|
// std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
|
|
// define the pair type.
|
|
//
|
|
// FIXME: It may make sense to emit these as ULEB sequences instead of
|
|
// pairs of uint64_t.
|
|
OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
|
|
OS << "// transition implies a set of NFA transitions. These are referred\n";
|
|
OS << "// to by index in " << Name << "Transitions[].\n";
|
|
|
|
SequenceToOffsetTable<DfaTransitionInfo> Table;
|
|
std::map<DfaTransitionInfo, unsigned> EmittedIndices;
|
|
for (auto &T : DfaTransitions)
|
|
Table.add(T.second.second);
|
|
Table.layout();
|
|
OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name
|
|
<< "TransitionInfo = {{\n";
|
|
Table.emit(
|
|
OS,
|
|
[](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
|
|
OS << "{" << P.first << ", " << P.second << "}";
|
|
},
|
|
"{0ULL, 0ULL}");
|
|
|
|
OS << "}};\n\n";
|
|
|
|
OS << "// A transition in the generated " << Name << " DFA.\n";
|
|
OS << "struct " << Name << "Transition {\n";
|
|
OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n";
|
|
OS << " ";
|
|
printActionType(OS);
|
|
OS << " Action; // The input symbol that causes this transition.\n";
|
|
OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n";
|
|
OS << " unsigned InfoIdx; // Start index into " << Name
|
|
<< "TransitionInfo.\n";
|
|
OS << "};\n\n";
|
|
|
|
OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
|
|
OS << "// The initial state is 1, not zero.\n";
|
|
OS << "const std::array<" << Name << "Transition, "
|
|
<< DfaTransitions.size() << "> " << Name << "Transitions = {{\n";
|
|
for (auto &KV : DfaTransitions) {
|
|
dfa_state_type From = KV.first.first;
|
|
dfa_state_type To = KV.second.first;
|
|
action_type A = KV.first.second;
|
|
unsigned InfoIdx = Table.get(KV.second.second);
|
|
OS << " {" << From << ", ";
|
|
printActionValue(A, OS);
|
|
OS << ", " << To << ", " << InfoIdx << "},\n";
|
|
}
|
|
OS << "\n}};\n\n";
|
|
}
|
|
|
|
void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
|
|
|
|
void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// AutomatonEmitter implementation
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
namespace {
|
|
// FIXME: This entire discriminated union could be removed with c++17:
|
|
// using Action = std::variant<Record *, unsigned, std::string>;
|
|
struct Action {
|
|
Record *R = nullptr;
|
|
unsigned I = 0;
|
|
std::string S;
|
|
|
|
Action() = default;
|
|
Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {}
|
|
|
|
void print(raw_ostream &OS) const {
|
|
if (R)
|
|
OS << R->getName();
|
|
else if (!S.empty())
|
|
OS << '"' << S << '"';
|
|
else
|
|
OS << I;
|
|
}
|
|
bool operator<(const Action &Other) const {
|
|
return std::make_tuple(R, I, S) <
|
|
std::make_tuple(Other.R, Other.I, Other.S);
|
|
}
|
|
};
|
|
|
|
using ActionTuple = std::vector<Action>;
|
|
class Automaton;
|
|
|
|
class Transition {
|
|
uint64_t NewState;
|
|
// The tuple of actions that causes this transition.
|
|
ActionTuple Actions;
|
|
// The types of the actions; this is the same across all transitions.
|
|
SmallVector<std::string, 4> Types;
|
|
|
|
public:
|
|
Transition(Record *R, Automaton *Parent);
|
|
const ActionTuple &getActions() { return Actions; }
|
|
SmallVector<std::string, 4> getTypes() { return Types; }
|
|
|
|
bool canTransitionFrom(uint64_t State);
|
|
uint64_t transitionFrom(uint64_t State);
|
|
};
|
|
|
|
class Automaton {
|
|
RecordKeeper &Records;
|
|
Record *R;
|
|
std::vector<Transition> Transitions;
|
|
/// All possible action tuples, uniqued.
|
|
UniqueVector<ActionTuple> Actions;
|
|
/// The fields within each Transition object to find the action symbols.
|
|
std::vector<StringRef> ActionSymbolFields;
|
|
|
|
public:
|
|
Automaton(RecordKeeper &Records, Record *R);
|
|
void emit(raw_ostream &OS);
|
|
|
|
ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
|
|
/// If the type of action A has been overridden (there exists a field
|
|
/// "TypeOf_A") return that, otherwise return the empty string.
|
|
StringRef getActionSymbolType(StringRef A);
|
|
};
|
|
|
|
class AutomatonEmitter {
|
|
RecordKeeper &Records;
|
|
|
|
public:
|
|
AutomatonEmitter(RecordKeeper &R) : Records(R) {}
|
|
void run(raw_ostream &OS);
|
|
};
|
|
|
|
/// A DfaEmitter implementation that can print our variant action type.
|
|
class CustomDfaEmitter : public DfaEmitter {
|
|
const UniqueVector<ActionTuple> &Actions;
|
|
std::string TypeName;
|
|
|
|
public:
|
|
CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
|
|
: Actions(Actions), TypeName(TypeName) {}
|
|
|
|
void printActionType(raw_ostream &OS) override;
|
|
void printActionValue(action_type A, raw_ostream &OS) override;
|
|
};
|
|
} // namespace
|
|
|
|
void AutomatonEmitter::run(raw_ostream &OS) {
|
|
for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) {
|
|
Automaton A(Records, R);
|
|
OS << "#ifdef GET_" << R->getName() << "_DECL\n";
|
|
A.emit(OS);
|
|
OS << "#endif // GET_" << R->getName() << "_DECL\n";
|
|
}
|
|
}
|
|
|
|
Automaton::Automaton(RecordKeeper &Records, Record *R)
|
|
: Records(Records), R(R) {
|
|
LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
|
|
ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields");
|
|
}
|
|
|
|
void Automaton::emit(raw_ostream &OS) {
|
|
StringRef TransitionClass = R->getValueAsString("TransitionClass");
|
|
for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) {
|
|
assert(T->isSubClassOf("Transition"));
|
|
Transitions.emplace_back(T, this);
|
|
Actions.insert(Transitions.back().getActions());
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size()
|
|
<< "\n");
|
|
LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size()
|
|
<< " potential transitions.\n");
|
|
|
|
StringRef Name = R->getName();
|
|
|
|
CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action");
|
|
// Starting from the initial state, build up a list of possible states and
|
|
// transitions.
|
|
std::deque<uint64_t> Worklist(1, 0);
|
|
std::set<uint64_t> SeenStates;
|
|
unsigned NumTransitions = 0;
|
|
SeenStates.insert(Worklist.front());
|
|
while (!Worklist.empty()) {
|
|
uint64_t State = Worklist.front();
|
|
Worklist.pop_front();
|
|
for (Transition &T : Transitions) {
|
|
if (!T.canTransitionFrom(State))
|
|
continue;
|
|
uint64_t NewState = T.transitionFrom(State);
|
|
if (SeenStates.emplace(NewState).second)
|
|
Worklist.emplace_back(NewState);
|
|
++NumTransitions;
|
|
Emitter.addTransition(State, NewState, Actions.idFor(T.getActions()));
|
|
}
|
|
}
|
|
LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size()
|
|
<< " states with " << NumTransitions << " transitions.\n");
|
|
|
|
const auto &ActionTypes = Transitions.back().getTypes();
|
|
OS << "// The type of an action in the " << Name << " automaton.\n";
|
|
if (ActionTypes.size() == 1) {
|
|
OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
|
|
} else {
|
|
OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ")
|
|
<< ">;\n";
|
|
}
|
|
OS << "\n";
|
|
|
|
Emitter.emit(Name, OS);
|
|
}
|
|
|
|
StringRef Automaton::getActionSymbolType(StringRef A) {
|
|
Twine Ty = "TypeOf_" + A;
|
|
if (!R->getValue(Ty.str()))
|
|
return "";
|
|
return R->getValueAsString(Ty.str());
|
|
}
|
|
|
|
Transition::Transition(Record *R, Automaton *Parent) {
|
|
BitsInit *NewStateInit = R->getValueAsBitsInit("NewState");
|
|
NewState = 0;
|
|
assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
|
|
"State cannot be represented in 64 bits!");
|
|
for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
|
|
if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) {
|
|
if (Bit->getValue())
|
|
NewState |= 1ULL << I;
|
|
}
|
|
}
|
|
|
|
for (StringRef A : Parent->getActionSymbolFields()) {
|
|
RecordVal *SymbolV = R->getValue(A);
|
|
if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) {
|
|
Actions.emplace_back(R->getValueAsDef(A), 0, "");
|
|
Types.emplace_back(Ty->getAsString());
|
|
} else if (isa<IntRecTy>(SymbolV->getType())) {
|
|
Actions.emplace_back(nullptr, R->getValueAsInt(A), "");
|
|
Types.emplace_back("unsigned");
|
|
} else if (isa<StringRecTy>(SymbolV->getType())) {
|
|
Actions.emplace_back(nullptr, 0, std::string(R->getValueAsString(A)));
|
|
Types.emplace_back("std::string");
|
|
} else {
|
|
report_fatal_error("Unhandled symbol type!");
|
|
}
|
|
|
|
StringRef TypeOverride = Parent->getActionSymbolType(A);
|
|
if (!TypeOverride.empty())
|
|
Types.back() = std::string(TypeOverride);
|
|
}
|
|
}
|
|
|
|
bool Transition::canTransitionFrom(uint64_t State) {
|
|
if ((State & NewState) == 0)
|
|
// The bits we want to set are not set;
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
uint64_t Transition::transitionFrom(uint64_t State) {
|
|
return State | NewState;
|
|
}
|
|
|
|
void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
|
|
|
|
void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
|
|
const ActionTuple &AT = Actions[A];
|
|
if (AT.size() > 1)
|
|
OS << "std::make_tuple(";
|
|
bool First = true;
|
|
for (const auto &SingleAction : AT) {
|
|
if (!First)
|
|
OS << ", ";
|
|
First = false;
|
|
SingleAction.print(OS);
|
|
}
|
|
if (AT.size() > 1)
|
|
OS << ")";
|
|
}
|
|
|
|
namespace llvm {
|
|
|
|
void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) {
|
|
AutomatonEmitter(RK).run(OS);
|
|
}
|
|
|
|
} // namespace llvm
|