351 lines
13 KiB
C++
351 lines
13 KiB
C++
//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file defines the Suffix Tree class and Suffix Tree Node struct.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
#ifndef LLVM_SUPPORT_SUFFIXTREE_H
|
|
#define LLVM_SUPPORT_SUFFIXTREE_H
|
|
|
|
#include "llvm/ADT/ArrayRef.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/Support/Allocator.h"
|
|
#include <vector>
|
|
|
|
namespace llvm {
|
|
|
|
/// Represents an undefined index in the suffix tree.
|
|
const unsigned EmptyIdx = -1;
|
|
|
|
/// A node in a suffix tree which represents a substring or suffix.
|
|
///
|
|
/// Each node has either no children or at least two children, with the root
|
|
/// being a exception in the empty tree.
|
|
///
|
|
/// Children are represented as a map between unsigned integers and nodes. If
|
|
/// a node N has a child M on unsigned integer k, then the mapping represented
|
|
/// by N is a proper prefix of the mapping represented by M. Note that this,
|
|
/// although similar to a trie is somewhat different: each node stores a full
|
|
/// substring of the full mapping rather than a single character state.
|
|
///
|
|
/// Each internal node contains a pointer to the internal node representing
|
|
/// the same string, but with the first character chopped off. This is stored
|
|
/// in \p Link. Each leaf node stores the start index of its respective
|
|
/// suffix in \p SuffixIdx.
|
|
struct SuffixTreeNode {
|
|
|
|
/// The children of this node.
|
|
///
|
|
/// A child existing on an unsigned integer implies that from the mapping
|
|
/// represented by the current node, there is a way to reach another
|
|
/// mapping by tacking that character on the end of the current string.
|
|
llvm::DenseMap<unsigned, SuffixTreeNode *> Children;
|
|
|
|
/// The start index of this node's substring in the main string.
|
|
unsigned StartIdx = EmptyIdx;
|
|
|
|
/// The end index of this node's substring in the main string.
|
|
///
|
|
/// Every leaf node must have its \p EndIdx incremented at the end of every
|
|
/// step in the construction algorithm. To avoid having to update O(N)
|
|
/// nodes individually at the end of every step, the end index is stored
|
|
/// as a pointer.
|
|
unsigned *EndIdx = nullptr;
|
|
|
|
/// For leaves, the start index of the suffix represented by this node.
|
|
///
|
|
/// For all other nodes, this is ignored.
|
|
unsigned SuffixIdx = EmptyIdx;
|
|
|
|
/// For internal nodes, a pointer to the internal node representing
|
|
/// the same sequence with the first character chopped off.
|
|
///
|
|
/// This acts as a shortcut in Ukkonen's algorithm. One of the things that
|
|
/// Ukkonen's algorithm does to achieve linear-time construction is
|
|
/// keep track of which node the next insert should be at. This makes each
|
|
/// insert O(1), and there are a total of O(N) inserts. The suffix link
|
|
/// helps with inserting children of internal nodes.
|
|
///
|
|
/// Say we add a child to an internal node with associated mapping S. The
|
|
/// next insertion must be at the node representing S - its first character.
|
|
/// This is given by the way that we iteratively build the tree in Ukkonen's
|
|
/// algorithm. The main idea is to look at the suffixes of each prefix in the
|
|
/// string, starting with the longest suffix of the prefix, and ending with
|
|
/// the shortest. Therefore, if we keep pointers between such nodes, we can
|
|
/// move to the next insertion point in O(1) time. If we don't, then we'd
|
|
/// have to query from the root, which takes O(N) time. This would make the
|
|
/// construction algorithm O(N^2) rather than O(N).
|
|
SuffixTreeNode *Link = nullptr;
|
|
|
|
/// The length of the string formed by concatenating the edge labels from the
|
|
/// root to this node.
|
|
unsigned ConcatLen = 0;
|
|
|
|
/// Returns true if this node is a leaf.
|
|
bool isLeaf() const { return SuffixIdx != EmptyIdx; }
|
|
|
|
/// Returns true if this node is the root of its owning \p SuffixTree.
|
|
bool isRoot() const { return StartIdx == EmptyIdx; }
|
|
|
|
/// Return the number of elements in the substring associated with this node.
|
|
size_t size() const {
|
|
|
|
// Is it the root? If so, it's the empty string so return 0.
|
|
if (isRoot())
|
|
return 0;
|
|
|
|
assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
|
|
|
|
// Size = the number of elements in the string.
|
|
// For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
|
|
return *EndIdx - StartIdx + 1;
|
|
}
|
|
|
|
SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
|
|
: StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
|
|
|
|
SuffixTreeNode() {}
|
|
};
|
|
|
|
/// A data structure for fast substring queries.
|
|
///
|
|
/// Suffix trees represent the suffixes of their input strings in their leaves.
|
|
/// A suffix tree is a type of compressed trie structure where each node
|
|
/// represents an entire substring rather than a single character. Each leaf
|
|
/// of the tree is a suffix.
|
|
///
|
|
/// A suffix tree can be seen as a type of state machine where each state is a
|
|
/// substring of the full string. The tree is structured so that, for a string
|
|
/// of length N, there are exactly N leaves in the tree. This structure allows
|
|
/// us to quickly find repeated substrings of the input string.
|
|
///
|
|
/// In this implementation, a "string" is a vector of unsigned integers.
|
|
/// These integers may result from hashing some data type. A suffix tree can
|
|
/// contain 1 or many strings, which can then be queried as one large string.
|
|
///
|
|
/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
|
|
/// suffix tree construction. Ukkonen's algorithm is explained in more detail
|
|
/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
|
|
/// paper is available at
|
|
///
|
|
/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
|
|
class SuffixTree {
|
|
public:
|
|
/// Each element is an integer representing an instruction in the module.
|
|
llvm::ArrayRef<unsigned> Str;
|
|
|
|
/// A repeated substring in the tree.
|
|
struct RepeatedSubstring {
|
|
/// The length of the string.
|
|
unsigned Length;
|
|
|
|
/// The start indices of each occurrence.
|
|
std::vector<unsigned> StartIndices;
|
|
};
|
|
|
|
private:
|
|
/// Maintains each node in the tree.
|
|
llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
|
|
|
|
/// The root of the suffix tree.
|
|
///
|
|
/// The root represents the empty string. It is maintained by the
|
|
/// \p NodeAllocator like every other node in the tree.
|
|
SuffixTreeNode *Root = nullptr;
|
|
|
|
/// Maintains the end indices of the internal nodes in the tree.
|
|
///
|
|
/// Each internal node is guaranteed to never have its end index change
|
|
/// during the construction algorithm; however, leaves must be updated at
|
|
/// every step. Therefore, we need to store leaf end indices by reference
|
|
/// to avoid updating O(N) leaves at every step of construction. Thus,
|
|
/// every internal node must be allocated its own end index.
|
|
llvm::BumpPtrAllocator InternalEndIdxAllocator;
|
|
|
|
/// The end index of each leaf in the tree.
|
|
unsigned LeafEndIdx = -1;
|
|
|
|
/// Helper struct which keeps track of the next insertion point in
|
|
/// Ukkonen's algorithm.
|
|
struct ActiveState {
|
|
/// The next node to insert at.
|
|
SuffixTreeNode *Node = nullptr;
|
|
|
|
/// The index of the first character in the substring currently being added.
|
|
unsigned Idx = EmptyIdx;
|
|
|
|
/// The length of the substring we have to add at the current step.
|
|
unsigned Len = 0;
|
|
};
|
|
|
|
/// The point the next insertion will take place at in the
|
|
/// construction algorithm.
|
|
ActiveState Active;
|
|
|
|
/// Allocate a leaf node and add it to the tree.
|
|
///
|
|
/// \param Parent The parent of this node.
|
|
/// \param StartIdx The start index of this node's associated string.
|
|
/// \param Edge The label on the edge leaving \p Parent to this node.
|
|
///
|
|
/// \returns A pointer to the allocated leaf node.
|
|
SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
|
|
unsigned Edge);
|
|
|
|
/// Allocate an internal node and add it to the tree.
|
|
///
|
|
/// \param Parent The parent of this node. Only null when allocating the root.
|
|
/// \param StartIdx The start index of this node's associated string.
|
|
/// \param EndIdx The end index of this node's associated string.
|
|
/// \param Edge The label on the edge leaving \p Parent to this node.
|
|
///
|
|
/// \returns A pointer to the allocated internal node.
|
|
SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
|
|
unsigned EndIdx, unsigned Edge);
|
|
|
|
/// Set the suffix indices of the leaves to the start indices of their
|
|
/// respective suffixes.
|
|
void setSuffixIndices();
|
|
|
|
/// Construct the suffix tree for the prefix of the input ending at
|
|
/// \p EndIdx.
|
|
///
|
|
/// Used to construct the full suffix tree iteratively. At the end of each
|
|
/// step, the constructed suffix tree is either a valid suffix tree, or a
|
|
/// suffix tree with implicit suffixes. At the end of the final step, the
|
|
/// suffix tree is a valid tree.
|
|
///
|
|
/// \param EndIdx The end index of the current prefix in the main string.
|
|
/// \param SuffixesToAdd The number of suffixes that must be added
|
|
/// to complete the suffix tree at the current phase.
|
|
///
|
|
/// \returns The number of suffixes that have not been added at the end of
|
|
/// this step.
|
|
unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
|
|
|
|
public:
|
|
/// Construct a suffix tree from a sequence of unsigned integers.
|
|
///
|
|
/// \param Str The string to construct the suffix tree for.
|
|
SuffixTree(const std::vector<unsigned> &Str);
|
|
|
|
/// Iterator for finding all repeated substrings in the suffix tree.
|
|
struct RepeatedSubstringIterator {
|
|
private:
|
|
/// The current node we're visiting.
|
|
SuffixTreeNode *N = nullptr;
|
|
|
|
/// The repeated substring associated with this node.
|
|
RepeatedSubstring RS;
|
|
|
|
/// The nodes left to visit.
|
|
std::vector<SuffixTreeNode *> ToVisit;
|
|
|
|
/// The minimum length of a repeated substring to find.
|
|
/// Since we're outlining, we want at least two instructions in the range.
|
|
/// FIXME: This may not be true for targets like X86 which support many
|
|
/// instruction lengths.
|
|
const unsigned MinLength = 2;
|
|
|
|
/// Move the iterator to the next repeated substring.
|
|
void advance() {
|
|
// Clear the current state. If we're at the end of the range, then this
|
|
// is the state we want to be in.
|
|
RS = RepeatedSubstring();
|
|
N = nullptr;
|
|
|
|
// Each leaf node represents a repeat of a string.
|
|
std::vector<SuffixTreeNode *> LeafChildren;
|
|
|
|
// Continue visiting nodes until we find one which repeats more than once.
|
|
while (!ToVisit.empty()) {
|
|
SuffixTreeNode *Curr = ToVisit.back();
|
|
ToVisit.pop_back();
|
|
LeafChildren.clear();
|
|
|
|
// Keep track of the length of the string associated with the node. If
|
|
// it's too short, we'll quit.
|
|
unsigned Length = Curr->ConcatLen;
|
|
|
|
// Iterate over each child, saving internal nodes for visiting, and
|
|
// leaf nodes in LeafChildren. Internal nodes represent individual
|
|
// strings, which may repeat.
|
|
for (auto &ChildPair : Curr->Children) {
|
|
// Save all of this node's children for processing.
|
|
if (!ChildPair.second->isLeaf())
|
|
ToVisit.push_back(ChildPair.second);
|
|
|
|
// It's not an internal node, so it must be a leaf. If we have a
|
|
// long enough string, then save the leaf children.
|
|
else if (Length >= MinLength)
|
|
LeafChildren.push_back(ChildPair.second);
|
|
}
|
|
|
|
// The root never represents a repeated substring. If we're looking at
|
|
// that, then skip it.
|
|
if (Curr->isRoot())
|
|
continue;
|
|
|
|
// Do we have any repeated substrings?
|
|
if (LeafChildren.size() >= 2) {
|
|
// Yes. Update the state to reflect this, and then bail out.
|
|
N = Curr;
|
|
RS.Length = Length;
|
|
for (SuffixTreeNode *Leaf : LeafChildren)
|
|
RS.StartIndices.push_back(Leaf->SuffixIdx);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// At this point, either NewRS is an empty RepeatedSubstring, or it was
|
|
// set in the above loop. Similarly, N is either nullptr, or the node
|
|
// associated with NewRS.
|
|
}
|
|
|
|
public:
|
|
/// Return the current repeated substring.
|
|
RepeatedSubstring &operator*() { return RS; }
|
|
|
|
RepeatedSubstringIterator &operator++() {
|
|
advance();
|
|
return *this;
|
|
}
|
|
|
|
RepeatedSubstringIterator operator++(int I) {
|
|
RepeatedSubstringIterator It(*this);
|
|
advance();
|
|
return It;
|
|
}
|
|
|
|
bool operator==(const RepeatedSubstringIterator &Other) const {
|
|
return N == Other.N;
|
|
}
|
|
bool operator!=(const RepeatedSubstringIterator &Other) const {
|
|
return !(*this == Other);
|
|
}
|
|
|
|
RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
|
|
// Do we have a non-null node?
|
|
if (N) {
|
|
// Yes. At the first step, we need to visit all of N's children.
|
|
// Note: This means that we visit N last.
|
|
ToVisit.push_back(N);
|
|
advance();
|
|
}
|
|
}
|
|
};
|
|
|
|
typedef RepeatedSubstringIterator iterator;
|
|
iterator begin() { return iterator(Root); }
|
|
iterator end() { return iterator(nullptr); }
|
|
};
|
|
|
|
} // namespace llvm
|
|
|
|
#endif // LLVM_SUPPORT_SUFFIXTREE_H
|