| // Copyright 2013 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "base/substring_set_matcher/substring_set_matcher.h" |
| |
| #include <stddef.h> |
| |
| #include <algorithm> |
| #include <queue> |
| |
| #ifdef __SSE2__ |
| #include <immintrin.h> |
| #include "base/bits.h" |
| #endif |
| |
| #include "base/check_op.h" |
| #include "base/containers/contains.h" |
| #include "base/containers/queue.h" |
| #include "base/numerics/checked_math.h" |
| #include "base/trace_event/memory_usage_estimator.h" // no-presubmit-check |
| |
| namespace base { |
| |
| namespace { |
| |
| // Compare MatcherStringPattern instances based on their string patterns. |
| bool ComparePatterns(const MatcherStringPattern* a, |
| const MatcherStringPattern* b) { |
| return a->pattern() < b->pattern(); |
| } |
| |
| std::vector<const MatcherStringPattern*> GetVectorOfPointers( |
| const std::vector<MatcherStringPattern>& patterns) { |
| std::vector<const MatcherStringPattern*> pattern_pointers; |
| pattern_pointers.reserve(patterns.size()); |
| |
| for (const MatcherStringPattern& pattern : patterns) |
| pattern_pointers.push_back(&pattern); |
| |
| return pattern_pointers; |
| } |
| |
| } // namespace |
| |
| bool SubstringSetMatcher::Build( |
| const std::vector<MatcherStringPattern>& patterns) { |
| return Build(GetVectorOfPointers(patterns)); |
| } |
| |
| bool SubstringSetMatcher::Build( |
| std::vector<const MatcherStringPattern*> patterns) { |
| // Ensure there are no duplicate IDs and all pattern strings are distinct. |
| #if DCHECK_IS_ON() |
| { |
| std::set<MatcherStringPattern::ID> ids; |
| std::set<std::string> pattern_strings; |
| for (const MatcherStringPattern* pattern : patterns) { |
| CHECK(!base::Contains(ids, pattern->id())); |
| CHECK(!base::Contains(pattern_strings, pattern->pattern())); |
| ids.insert(pattern->id()); |
| pattern_strings.insert(pattern->pattern()); |
| } |
| } |
| #endif |
| |
| // Check that all the match labels fit into an edge. |
| for (const MatcherStringPattern* pattern : patterns) { |
| if (pattern->id() >= kInvalidNodeID) { |
| return false; |
| } |
| } |
| |
| // Compute the total number of tree nodes needed. |
| std::sort(patterns.begin(), patterns.end(), ComparePatterns); |
| NodeID tree_size = GetTreeSize(patterns); |
| if (tree_size >= kInvalidNodeID) { |
| return false; |
| } |
| tree_.reserve(GetTreeSize(patterns)); |
| BuildAhoCorasickTree(patterns); |
| |
| // Sanity check that no new allocations happened in the tree and our computed |
| // size was correct. |
| DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns))); |
| |
| is_empty_ = patterns.empty() && tree_.size() == 1u; |
| return true; |
| } |
| |
| SubstringSetMatcher::SubstringSetMatcher() = default; |
| SubstringSetMatcher::~SubstringSetMatcher() = default; |
| |
| bool SubstringSetMatcher::Match( |
| const std::string& text, |
| std::set<MatcherStringPattern::ID>* matches) const { |
| const size_t old_number_of_matches = matches->size(); |
| |
| // Handle patterns matching the empty string. |
| const AhoCorasickNode* const root = &tree_[kRootID]; |
| AccumulateMatchesForNode(root, matches); |
| |
| const AhoCorasickNode* current_node = root; |
| for (const char c : text) { |
| NodeID child = current_node->GetEdge(static_cast<unsigned char>(c)); |
| |
| // If the child not can't be found, progressively iterate over the longest |
| // proper suffix of the string represented by the current node. In a sense |
| // we are pruning prefixes from the text. |
| while (child == kInvalidNodeID && current_node != root) { |
| current_node = &tree_[current_node->failure()]; |
| child = current_node->GetEdge(static_cast<unsigned char>(c)); |
| } |
| |
| if (child != kInvalidNodeID) { |
| // The string represented by |child| is the longest possible suffix of the |
| // current position of |text| in the trie. |
| current_node = &tree_[child]; |
| AccumulateMatchesForNode(current_node, matches); |
| } else { |
| // The empty string is the longest possible suffix of the current position |
| // of |text| in the trie. |
| DCHECK_EQ(root, current_node); |
| } |
| } |
| |
| return old_number_of_matches != matches->size(); |
| } |
| |
| bool SubstringSetMatcher::AnyMatch(const std::string& text) const { |
| // Handle patterns matching the empty string. |
| const AhoCorasickNode* const root = &tree_[kRootID]; |
| if (root->has_outputs()) { |
| return true; |
| } |
| |
| const AhoCorasickNode* current_node = root; |
| for (const char c : text) { |
| NodeID child = current_node->GetEdge(static_cast<unsigned char>(c)); |
| |
| // If the child not can't be found, progressively iterate over the longest |
| // proper suffix of the string represented by the current node. In a sense |
| // we are pruning prefixes from the text. |
| while (child == kInvalidNodeID && current_node != root) { |
| current_node = &tree_[current_node->failure()]; |
| child = current_node->GetEdge(static_cast<unsigned char>(c)); |
| } |
| |
| if (child != kInvalidNodeID) { |
| // The string represented by |child| is the longest possible suffix of the |
| // current position of |text| in the trie. |
| current_node = &tree_[child]; |
| if (current_node->has_outputs()) { |
| return true; |
| } |
| } else { |
| // The empty string is the longest possible suffix of the current position |
| // of |text| in the trie. |
| DCHECK_EQ(root, current_node); |
| } |
| } |
| |
| return false; |
| } |
| |
| size_t SubstringSetMatcher::EstimateMemoryUsage() const { |
| return base::trace_event::EstimateMemoryUsage(tree_); |
| } |
| |
| // static |
| constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID; |
| constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID; |
| |
| SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize( |
| const std::vector<const MatcherStringPattern*>& patterns) const { |
| DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns)); |
| |
| base::CheckedNumeric<NodeID> result = 1u; // 1 for the root node. |
| if (patterns.empty()) |
| return result.ValueOrDie(); |
| |
| auto last = patterns.begin(); |
| auto current = last + 1; |
| // For the first pattern, each letter is a label of an edge to a new node. |
| result += (*last)->pattern().size(); |
| |
| // For the subsequent patterns, only count the edges which were not counted |
| // yet. For this it suffices to test against the previous pattern, because the |
| // patterns are sorted. |
| for (; current != patterns.end(); ++last, ++current) { |
| const std::string& last_pattern = (*last)->pattern(); |
| const std::string& current_pattern = (*current)->pattern(); |
| size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size()); |
| |
| size_t common_prefix = 0; |
| while (common_prefix < prefix_bound && |
| last_pattern[common_prefix] == current_pattern[common_prefix]) { |
| ++common_prefix; |
| } |
| |
| result -= common_prefix; |
| result += current_pattern.size(); |
| } |
| |
| return result.ValueOrDie(); |
| } |
| |
| void SubstringSetMatcher::BuildAhoCorasickTree( |
| const SubstringPatternVector& patterns) { |
| DCHECK(tree_.empty()); |
| |
| // Initialize root node of tree. |
| tree_.emplace_back(); |
| |
| // Build the initial trie for all the patterns. |
| for (const MatcherStringPattern* pattern : patterns) |
| InsertPatternIntoAhoCorasickTree(pattern); |
| |
| CreateFailureAndOutputEdges(); |
| } |
| |
| void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| const MatcherStringPattern* pattern) { |
| const std::string& text = pattern->pattern(); |
| const std::string::const_iterator text_end = text.end(); |
| |
| // Iterators on the tree and the text. |
| AhoCorasickNode* current_node = &tree_[kRootID]; |
| std::string::const_iterator i = text.begin(); |
| |
| // Follow existing paths for as long as possible. |
| while (i != text_end) { |
| NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i)); |
| if (child == kInvalidNodeID) |
| break; |
| current_node = &tree_[child]; |
| ++i; |
| } |
| |
| // Create new nodes if necessary. |
| while (i != text_end) { |
| tree_.emplace_back(); |
| current_node->SetEdge(static_cast<unsigned char>(*i), |
| static_cast<NodeID>(tree_.size() - 1)); |
| current_node = &tree_.back(); |
| ++i; |
| } |
| |
| // Register match. |
| current_node->SetMatchID(pattern->id()); |
| } |
| |
| void SubstringSetMatcher::CreateFailureAndOutputEdges() { |
| base::queue<AhoCorasickNode*> queue; |
| |
| // Initialize the failure edges for |root| and its children. |
| AhoCorasickNode* const root = &tree_[0]; |
| |
| root->SetOutputLink(kInvalidNodeID); |
| |
| NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID; |
| |
| for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) { |
| const AhoCorasickEdge& edge = root->edges()[edge_idx]; |
| if (edge.label >= kFirstSpecialLabel) { |
| continue; |
| } |
| AhoCorasickNode* child = &tree_[edge.node_id]; |
| // Failure node is kept as the root. |
| child->SetOutputLink(root_output_link); |
| queue.push(child); |
| } |
| |
| // Do a breadth first search over the trie to create failure edges. We |
| // maintain the invariant that any node in |queue| has had its |failure_| and |
| // |output_link_| edge already initialized. |
| while (!queue.empty()) { |
| AhoCorasickNode* current_node = queue.front(); |
| queue.pop(); |
| |
| // Compute the failure and output edges of children using the failure edges |
| // of the current node. |
| for (unsigned edge_idx = 0; edge_idx < current_node->num_edges(); |
| ++edge_idx) { |
| const AhoCorasickEdge& edge = current_node->edges()[edge_idx]; |
| if (edge.label >= kFirstSpecialLabel) { |
| continue; |
| } |
| AhoCorasickNode* child = &tree_[edge.node_id]; |
| |
| const AhoCorasickNode* failure_candidate_parent = |
| &tree_[current_node->failure()]; |
| NodeID failure_candidate_id = |
| failure_candidate_parent->GetEdge(edge.label); |
| while (failure_candidate_id == kInvalidNodeID && |
| failure_candidate_parent != root) { |
| failure_candidate_parent = &tree_[failure_candidate_parent->failure()]; |
| failure_candidate_id = failure_candidate_parent->GetEdge(edge.label); |
| } |
| |
| if (failure_candidate_id == kInvalidNodeID) { |
| DCHECK_EQ(root, failure_candidate_parent); |
| // |failure_candidate| is invalid and we can't proceed further since we |
| // have reached the root. Hence the longest proper suffix of this string |
| // represented by this node is the empty string (represented by root). |
| failure_candidate_id = kRootID; |
| } else { |
| child->SetFailure(failure_candidate_id); |
| } |
| |
| const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id]; |
| // Now |failure_candidate| is |child|'s longest possible proper suffix in |
| // the trie. We also know that since we are doing a breadth first search, |
| // we would have established |failure_candidate|'s output link by now. |
| // Hence we can define |child|'s output link as follows: |
| child->SetOutputLink(failure_candidate->IsEndOfPattern() |
| ? failure_candidate_id |
| : failure_candidate->output_link()); |
| |
| queue.push(child); |
| } |
| } |
| } |
| |
| void SubstringSetMatcher::AccumulateMatchesForNode( |
| const AhoCorasickNode* node, |
| std::set<MatcherStringPattern::ID>* matches) const { |
| DCHECK(matches); |
| |
| if (!node->has_outputs()) { |
| // Fast reject. |
| return; |
| } |
| if (node->IsEndOfPattern()) |
| matches->insert(node->GetMatchID()); |
| |
| NodeID node_id = node->output_link(); |
| while (node_id != kInvalidNodeID) { |
| node = &tree_[node_id]; |
| matches->insert(node->GetMatchID()); |
| node_id = node->output_link(); |
| } |
| } |
| |
| SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() { |
| static_assert(kNumInlineEdges == 2, "Code below needs updating"); |
| edges_.inline_edges[0].label = kEmptyLabel; |
| edges_.inline_edges[1].label = kEmptyLabel; |
| } |
| |
| SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() { |
| if (edges_capacity_ != 0) { |
| delete[] edges_.edges; |
| } |
| } |
| |
| SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) { |
| *this = std::move(other); |
| } |
| |
| SubstringSetMatcher::AhoCorasickNode& |
| SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) { |
| if (edges_capacity_ != 0) { |
| // Delete the old heap allocation if needed. |
| delete[] edges_.edges; |
| } |
| if (other.edges_capacity_ == 0) { |
| static_assert(kNumInlineEdges == 2, "Code below needs updating"); |
| edges_.inline_edges[0] = other.edges_.inline_edges[0]; |
| edges_.inline_edges[1] = other.edges_.inline_edges[1]; |
| } else { |
| // Move over the heap allocation. |
| edges_.edges = other.edges_.edges; |
| other.edges_.edges = nullptr; |
| } |
| num_free_edges_ = other.num_free_edges_; |
| edges_capacity_ = other.edges_capacity_; |
| return *this; |
| } |
| |
| SubstringSetMatcher::NodeID |
| SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const { |
| DCHECK(edges_capacity_ != 0); |
| #ifdef __SSE2__ |
| const __m128i lbl = _mm_set1_epi32(static_cast<int>(label)); |
| const __m128i mask = _mm_set1_epi32(0x1ff); |
| for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) { |
| const __m128i four = _mm_loadu_si128( |
| reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx])); |
| const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl); |
| const uint32_t match_mask = static_cast<uint32_t>(_mm_movemask_epi8(match)); |
| if (match_mask != 0) { |
| if (match_mask & 0x1u) { |
| return edges_.edges[edge_idx].node_id; |
| } |
| if (match_mask & 0x10u) { |
| return edges_.edges[edge_idx + 1].node_id; |
| } |
| if (match_mask & 0x100u) { |
| return edges_.edges[edge_idx + 2].node_id; |
| } |
| DCHECK(match_mask & 0x1000u); |
| return edges_.edges[edge_idx + 3].node_id; |
| } |
| } |
| #else |
| for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) { |
| const AhoCorasickEdge& edge = edges_.edges[edge_idx]; |
| if (edge.label == label) |
| return edge.node_id; |
| } |
| #endif |
| return kInvalidNodeID; |
| } |
| |
| void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label, |
| NodeID node) { |
| DCHECK_LT(node, kInvalidNodeID); |
| |
| #if DCHECK_IS_ON() |
| // We don't support overwriting existing edges. |
| for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) { |
| DCHECK_NE(label, edges()[edge_idx].label); |
| } |
| #endif |
| |
| if (edges_capacity_ == 0 && num_free_edges_ > 0) { |
| // Still space in the inline storage, so use that. |
| edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node}; |
| if (label == kFailureNodeLabel) { |
| // Make sure that kFailureNodeLabel is first. |
| // NOTE: We don't use std::swap here, because the compiler doesn't |
| // understand that inline_edges[] is 4-aligned and can give |
| // a warning or error. |
| AhoCorasickEdge temp = edges_.inline_edges[0]; |
| edges_.inline_edges[0] = edges_.inline_edges[num_edges()]; |
| edges_.inline_edges[num_edges()] = temp; |
| } |
| --num_free_edges_; |
| return; |
| } |
| |
| if (num_free_edges_ == 0) { |
| // We are out of space, so double our capacity (unless that would cause |
| // num_free_edges_ to overflow). This can either be because we are |
| // converting from inline to heap storage, or because we are increasing the |
| // size of our heap storage. |
| unsigned old_capacity = |
| edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_; |
| unsigned new_capacity = std::min(old_capacity * 2, kEmptyLabel + 1); |
| DCHECK_EQ(0u, new_capacity % 4); |
| AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity]; |
| memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity); |
| for (unsigned edge_idx = old_capacity; edge_idx < new_capacity; |
| ++edge_idx) { |
| new_edges[edge_idx].label = kEmptyLabel; |
| } |
| if (edges_capacity_ != 0) { |
| delete[] edges_.edges; |
| } |
| edges_.edges = new_edges; |
| // These casts are safe due to the DCHECK above. |
| edges_capacity_ = static_cast<uint16_t>(new_capacity); |
| num_free_edges_ = static_cast<uint8_t>(new_capacity - old_capacity); |
| } |
| |
| // Insert the new edge at the end of our heap storage. |
| edges_.edges[num_edges()] = AhoCorasickEdge{label, node}; |
| if (label == kFailureNodeLabel) { |
| // Make sure that kFailureNodeLabel is first. |
| std::swap(edges_.edges[0], edges_.edges[num_edges()]); |
| } |
| --num_free_edges_; |
| } |
| |
| void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) { |
| DCHECK_NE(kInvalidNodeID, node); |
| if (node != kRootID) { |
| SetEdge(kFailureNodeLabel, node); |
| } |
| } |
| |
| size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const { |
| if (edges_capacity_ == 0) { |
| return 0; |
| } else { |
| return base::trace_event::EstimateMemoryUsage(edges_.edges, |
| edges_capacity_); |
| } |
| } |
| |
| } // namespace base |