C++ Paste by Aho Corasick
Description: None
|
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <queue>
#include <map>

// Aho-Corasick keyword tree.
//
// Builds a trie over a fixed set of patterns, augments it with failure and
// output links, and then reports every occurrence of every pattern in a text
// in a single left-to-right scan.
//
// The tree owns all of its nodes; copying is disabled (declared, not defined).
class KeywordTree
{
public:
    // Called once per match.
    //   patternNr - index of the matched pattern in the vector given to the
    //               constructor
    //   position  - 0-based index in the searched text where the match starts
    typedef void (*foundCallback)(int patternNr, int position);

    // Builds the tree for `patterns`. Empty patterns are ignored (see
    // insertPattern). If the same pattern appears more than once, only the
    // last index is reported.
    KeywordTree(const std::vector<std::string>& patterns)
        : root_(new Node(0, '\0'))
    {
        lengths_.reserve(patterns.size());
        int n = 0;
        for (std::vector<std::string>::const_iterator it = patterns.begin();
             it != patterns.end(); ++it, ++n) {
            insertPattern(*it, n);
            lengths_.push_back(static_cast<int>(it->length()));
        }
        computeLinks();
    }

    ~KeywordTree();

    // Scans `text` and invokes `cb` for every pattern occurrence. For a given
    // end position the pattern ending at the current node is reported first,
    // followed by the shorter patterns reachable via output links.
    // A null `cb` makes the call a no-op.
    void findOccurences(const std::string& text, foundCallback cb);

private:
    // Not copyable: the tree holds raw owning pointers.
    KeywordTree(const KeywordTree& kwt);
    KeywordTree& operator=(const KeywordTree& rhs);

    void insertPattern(const std::string& pattern, int patternNr);
    void computeLinks();

    struct Node
    {
        typedef std::map<char, Node*> Childs;
        typedef std::map<char, Node*>::iterator ChildIterator;

        Node(Node* parent, char edgeChar)
            : parent_(parent)
            , failure_(0)   // set by computeLinks()
            , output_(0)    // set by computeLinks(); 0 means "no further output"
            , definesPatternNr_(-1)
            , edgeChar_(edgeChar)
        {}

        bool hasChild(char ch) const
        {
            return childs_.find(ch) != childs_.end();
        }

        Node* parent_;
        Node* failure_;         // node to continue from when no edge matches
        Node* output_;          // nearest pattern node on the failure chain, or 0
        int definesPatternNr_;  // pattern index ending here, or -1
        char edgeChar_;         // char of edge pointing to this node
        Childs childs_;
    };

    void freeTree(Node* n);

    Node* root_;
    std::vector<int> lengths_;  // lengths_[patternNr] == length of that pattern
};

KeywordTree::~KeywordTree()
{
    freeTree(root_);
}

// Adds `pattern` to the trie and marks its final node with `patternNr`.
// An empty pattern would mark the root, which the scan can never report
// consistently, so it is skipped.
void KeywordTree::insertPattern(const std::string& pattern, int patternNr)
{
    if (pattern.empty()) {
        return;
    }

    Node* n = root_;
    for (std::string::const_iterator it = pattern.begin(); it != pattern.end(); ++it) {
        Node::ChildIterator ch = n->childs_.find(*it);
        if (ch != n->childs_.end()) {
            n = ch->second;                     // follow existing path
        } else {
            Node* created = new Node(n, *it);   // extend path
            n->childs_[*it] = created;
            n = created;
        }
    }
    n->definesPatternNr_ = patternNr;
}

// Computes failure and output links breadth-first, so that the links of every
// node's failure target are already final when the node itself is processed.
void KeywordTree::computeLinks()
{
    root_->failure_ = root_;

    // Depth-1 nodes always fail to the root and have no output link; seed the
    // queue with their children (depth 2).
    std::queue<Node*> pending;
    for (Node::ChildIterator top = root_->childs_.begin(); top != root_->childs_.end(); ++top) {
        Node* first = top->second;
        first->failure_ = root_;
        for (Node::ChildIterator it = first->childs_.begin(); it != first->childs_.end(); ++it) {
            pending.push(it->second);
        }
    }

    // Depth >= 2.
    while (!pending.empty()) {
        Node* n = pending.front();
        pending.pop();

        // Walk the parent's failure chain until a node with an edge for this
        // node's character is found, or the root is reached.
        const char ch = n->edgeChar_;
        Node* m = n->parent_->failure_;
        while (m != root_ && !m->hasChild(ch)) {
            m = m->failure_;
        }
        Node::ChildIterator target = m->childs_.find(ch);
        n->failure_ = (target != m->childs_.end()) ? target->second : root_;

        // The output link skips failure targets that do not end a pattern.
        if (n->failure_->definesPatternNr_ != -1) {
            n->output_ = n->failure_;
        } else {
            n->output_ = n->failure_->output_;
        }

        for (Node::ChildIterator it = n->childs_.begin(); it != n->childs_.end(); ++it) {
            pending.push(it->second);
        }
    }
}

// Deletes the subtree rooted at `n`, children first.
void KeywordTree::freeTree(Node* n)
{
    if (n == 0) {
        return;
    }
    for (Node::ChildIterator ch = n->childs_.begin(); ch != n->childs_.end(); ++ch) {
        freeTree(ch->second);
    }
    delete n;
}

void KeywordTree::findOccurences(const std::string& text, foundCallback cb)
{
    if (cb == 0) {
        return;
    }

    Node* n = root_;
    for (std::string::size_type i = 0; i < text.length(); ++i) {
        const char ch = text[i];

        // Fall back along failure links until some node has an edge for ch.
        Node::ChildIterator next = n->childs_.find(ch);
        while (next == n->childs_.end() && n != root_) {
            n = n->failure_;
            next = n->childs_.find(ch);
        }
        if (next == n->childs_.end()) {
            continue;   // at the root with no matching edge: skip this char
        }
        n = next->second;

        // i is the index of the last matched character, so a pattern of
        // length len starts at i - len + 1.
        const int last = static_cast<int>(i);
        if (n->definesPatternNr_ != -1) {
            cb(n->definesPatternNr_, last - lengths_[n->definesPatternNr_] + 1);
        }
        for (Node* m = n->output_; m != 0; m = m->output_) {
            cb(m->definesPatternNr_, last - lengths_[m->definesPatternNr_] + 1);
        }
    }
}

// Patterns taken from the command line; read by cb() to print match text.
std::vector<std::string> patterns;

// Match callback: prints the matched pattern and its start position.
void cb(int patternNr, int position)
{
    std::cout << "found " << patterns[patternNr] << " @ " << position << "\n";
}

// Usage: <program> <file> [pattern ...]
// Echoes every line of <file> and lists the pattern occurrences found in it.
int main(int argc, char** argv)
{
    if (argc < 2) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "kwt")
                  << " <file> [pattern ...]\n";
        return 1;
    }

    for (int i = 2; i < argc; ++i) {
        patterns.push_back(argv[i]);
    }
    KeywordTree kwt(patterns);

    std::ifstream inf(argv[1]);
    if (!inf) {
        std::cerr << "cannot open " << argv[1] << "\n";
        return 1;
    }

    // std::getline on a std::string has no line-length limit, unlike a fixed
    // char buffer, which would stop the loop at the first over-long line.
    std::string line;
    while (std::getline(inf, line)) {
        std::cout << line << "\n";
        kwt.findOccurences(line, cb);
    }
    return 0;
}