// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/regexp/experimental/experimental-compiler.h"

#include "src/regexp/experimental/experimental.h"
#include "src/zone/zone-list-inl.h"

namespace v8 {
namespace internal {

namespace {

// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;

class CanBeHandledVisitor final : private RegExpVisitor {
  // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
 public:
  static bool Check(RegExpTree* tree, JSRegExp::Flags flags,
                    int capture_count) {
    if (!AreSuitableFlags(flags)) return false;
    CanBeHandledVisitor visitor;
    tree->Accept(&visitor, nullptr);
    return visitor.result_;
  }

 private:
  CanBeHandledVisitor() = default;

  static bool AreSuitableFlags(JSRegExp::Flags flags) {
    // TODO(mbid, v8:10765): We should be able to support all flags in the
    // future.
    static constexpr JSRegExp::Flags kAllowedFlags =
        JSRegExp::kGlobal | JSRegExp::kSticky | JSRegExp::kMultiline |
        JSRegExp::kDotAll | JSRegExp::kLinear;
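    // Note that neither kIgnoreCase nor kUnicode is allowed, so patterns
    // using the 'i' or 'u' flag are currently rejected.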
    // We support Unicode iff kUnicode is among the supported flags.
    STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
                  ((kAllowedFlags & JSRegExp::kUnicode) != 0));
    return (flags & ~kAllowedFlags) == 0;
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    for (RegExpTree* alt : *node->alternatives()) {
      alt->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& el : *node->elements()) {
      el.tree()->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Finite but large values of `min()` and `max()` are bad for the
    // breadth-first engine because finite (optional) repetition is dealt with
    // by replicating the bytecode of the body of the quantifier. The number
    // of replications grows exponentially in how deeply quantifiers are
    // nested.
    // `replication_factor_` keeps track of how often the current node will
    // have to be replicated in the generated bytecode, and we don't allow
    // this to exceed some small value.
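    // For example, in /(a{2}){3}/ the bytecode for `a` is replicated
    // 2 * 3 = 6 times, which is fine, whereas /(a{5}){7}/ would require
    // 5 * 7 = 35 > kMaxReplicationFactor copies and is rejected.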
    static constexpr int kMaxReplicationFactor = 16;

    // First we rule out values for min and max that are too big even before
    // taking into account the ambient replication_factor_. This also guards
    // against overflows in `local_replication` or `replication_factor_`.
    if (node->min() > kMaxReplicationFactor ||
        (node->max() != RegExpTree::kInfinity &&
         node->max() > kMaxReplicationFactor)) {
      result_ = false;
      return nullptr;
    }

    // Save the current replication factor so that it can be restored if we
    // return with `result_ == true`.
    int before_replication_factor = replication_factor_;

    int local_replication;
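    // An unbounded quantifier is compiled into `min()` mandatory copies of
    // the body followed by a star, so the body occurs `min() + 1` times in
    // the bytecode; a bounded quantifier is compiled into `max()` copies
    // (`min()` mandatory plus `max() - min()` optional ones). See
    // `CompileVisitor::VisitQuantifier` below.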
    if (node->max() == RegExpTree::kInfinity) {
      local_replication = node->min() + 1;
    } else {
      local_replication = node->max();
    }

    replication_factor_ *= local_replication;
    if (replication_factor_ > kMaxReplicationFactor) {
      result_ = false;
      return nullptr;
    }

    switch (node->quantifier_type()) {
      case RegExpQuantifier::GREEDY:
      case RegExpQuantifier::NON_GREEDY:
        break;
      case RegExpQuantifier::POSSESSIVE:
        // TODO(mbid, v8:10765): It's not clear to me whether this can be
        // supported in breadth-first mode. Re2 doesn't support it.
        result_ = false;
        return nullptr;
    }

    node->body()->Accept(this, nullptr);
    replication_factor_ = before_replication_factor;
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid, v8:10765): This will be hard to support, but not impossible
    // I think. See product automata.
    result_ = false;
    return nullptr;
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // This can't be implemented without backtracking.
    result_ = false;
    return nullptr;
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

 private:
  // See comment in `VisitQuantifier`:
  int replication_factor_ = 1;

  bool result_ = true;
};

}  // namespace

bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
                                              JSRegExp::Flags flags,
                                              int capture_count) {
  return CanBeHandledVisitor::Check(tree, flags, capture_count);
}

namespace {

// A label in bytecode which starts with no known address. The address *must*
// be bound with `Bind` before the label goes out of scope.
// Implemented as a linked list through the `payload.pc` of FORK and JMP
// instructions.
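// For example, after a JMP and a FORK targeting the same unbound label have
// been emitted at indices 0 and 2, the list looks like this:
//
//   0: JMP  -1    (end of the patch list)
//   1: ...
//   2: FORK  0    (unbound_patch_list_begin_ == 2)
//
// A subsequent `Bind` at index 5 walks the list and overwrites both
// `payload.pc` fields with 5.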
struct Label {
 public:
  Label() = default;
  ~Label() {
    DCHECK_EQ(state_, BOUND);
    DCHECK_GE(bound_index_, 0);
  }

  // Don't copy, don't move. Moving could be implemented, but it's not
  // needed anywhere.
  Label(const Label&) = delete;
  Label& operator=(const Label&) = delete;

 private:
  friend class BytecodeAssembler;

  // UNBOUND implies unbound_patch_list_begin_.
  // BOUND implies bound_index_.
  enum { UNBOUND, BOUND } state_ = UNBOUND;
  union {
    int unbound_patch_list_begin_ = -1;
    int bound_index_;
  };
};

class BytecodeAssembler {
 public:
  // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed
  // from the `tree` size we're going to compile?
  explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}

  ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }

  void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }

  void Assertion(RegExpAssertion::AssertionType t) {
    code_.Add(RegExpInstruction::Assertion(t), zone_);
  }

  void ClearRegister(int32_t register_index) {
    code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
  }

  void ConsumeRange(uc16 from, uc16 to) {
    code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
  }

  void ConsumeAnyChar() {
    code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
  }

  void Fork(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
  }

  void Jmp(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
  }

  void SetRegisterToCp(int32_t register_index) {
    code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
  }

  void Bind(Label& target) {
    DCHECK_EQ(target.state_, Label::UNBOUND);

    int index = code_.length();

    while (target.unbound_patch_list_begin_ != -1) {
      RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
      DCHECK(inst.opcode == RegExpInstruction::FORK ||
             inst.opcode == RegExpInstruction::JMP);

      target.unbound_patch_list_begin_ = inst.payload.pc;
      inst.payload.pc = index;
    }

    target.state_ = Label::BOUND;
    target.bound_index_ = index;
  }

  void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }

 private:
  void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
    RegExpInstruction result;
    result.opcode = op;

    if (target.state_ == Label::BOUND) {
      result.payload.pc = target.bound_index_;
    } else {
      DCHECK_EQ(target.state_, Label::UNBOUND);
      int new_list_begin = code_.length();
      DCHECK_GE(new_list_begin, 0);

      result.payload.pc = target.unbound_patch_list_begin_;

      target.unbound_patch_list_begin_ = new_list_begin;
    }

    code_.Add(result, zone_);
  }

  Zone* zone_;
  ZoneList<RegExpInstruction> code_;
};

class CompileVisitor : private RegExpVisitor {
 public:
  static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
                                             JSRegExp::Flags flags,
                                             Zone* zone) {
    CompileVisitor compiler(zone);

    if ((flags & JSRegExp::kSticky) == 0 && !tree->IsAnchoredAtStart()) {
      // The match is not anchored, i.e. may start at any input position, so
      // we emit a preamble corresponding to /.*?/. This skips an arbitrary
      // prefix in the input non-greedily.
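      // With `CompileNonGreedyStar` below, the preamble assembles to:
      //
      //   0: FORK 2
      //   1: JMP 4
      //   2: CONSUME_ANY_CHAR
      //   3: FORK 2
      //   4: ...   (the rest of the bytecode)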
      compiler.CompileNonGreedyStar(
          [&]() { compiler.assembler_.ConsumeAnyChar(); });
    }

    compiler.assembler_.SetRegisterToCp(0);
    tree->Accept(&compiler, nullptr);
    compiler.assembler_.SetRegisterToCp(1);
    compiler.assembler_.Accept();

    return std::move(compiler.assembler_).IntoCode();
  }

 private:
  explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}

  // Generate a disjunction of code fragments compiled by a function
  // `gen_alt`. `gen_alt` is called repeatedly with argument `int i = 0, 1,
  // ..., alt_num - 1` and should build code corresponding to the ith
  // alternative.
  template <class F>
  void CompileDisjunction(int alt_num, F&& gen_alt) {
    // A disjunction a1 | ... | an is compiled into
    //
    //     FORK tail1
    //     <a1>
    //     JMP end
    //   tail1:
    //     FORK tail2
    //     <a2>
    //     JMP end
    //   tail2:
    //     ...
    //     ...
    //   tail{n-1}:
    //     <an>
    //   end:
    //
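    // For example, /a|b|c/ assembles to (indices relative to the start of
    // the disjunction's code):
    //
    //   0: FORK 3
    //   1: CONSUME_RANGE a-a
    //   2: JMP 7
    //   3: FORK 6
    //   4: CONSUME_RANGE b-b
    //   5: JMP 7
    //   6: CONSUME_RANGE c-c
    //   7: (end)
    //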
    // By the semantics of the FORK instruction (see the definition and
    // semantics of `RegExpInstruction`), a forked thread has lower priority
    // than the thread that spawned it. The code generated here thus gives
    // the thread matching alternative a1 the highest priority, followed by
    // the thread for a2, and so on.

    if (alt_num == 0) {
      // The empty disjunction. This can never match.
      assembler_.Fail();
      return;
    }

    Label end;

    for (int i = 0; i != alt_num - 1; ++i) {
      Label tail;
      assembler_.Fork(tail);
      gen_alt(i);
      assembler_.Jmp(end);
      assembler_.Bind(tail);
    }

    gen_alt(alt_num - 1);

    assembler_.Bind(end);
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    ZoneList<RegExpTree*>& alts = *node->alternatives();
    CompileDisjunction(alts.length(),
                       [&](int i) { alts[i]->Accept(this, nullptr); });
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
    }
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    assembler_.Assertion(node->assertion_type());
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    // A character class is compiled as a disjunction over its
    // `CharacterRange`s.
    ZoneList<CharacterRange>* ranges = node->ranges(zone_);
    CharacterRange::Canonicalize(ranges);
    if (node->is_negated()) {
      // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
      // union of k intervals is a union of at most k + 1 intervals.
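      // For example, the canonicalized ranges of /[^bd]/ are
      // {[b,b], [d,d]}, whose complement consists of the three intervals
      // [0x0000,a], [c,c] and [e,0x10FFFF].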
      ZoneList<CharacterRange>* negated =
          zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
      CharacterRange::Negate(ranges, negated, zone_);
      DCHECK_LE(negated->length(), ranges->length() + 1);
      ranges = negated;
    }

    CompileDisjunction(ranges->length(), [&](int i) {
      // We don't support UTF-16 for now, so we can only consume ranges that
      // are specified by (complements of) ranges with uc16 bounds.
      STATIC_ASSERT(kMaxSupportedCodepoint <= std::numeric_limits<uc16>::max());

      uc32 from = (*ranges)[i].from();
      DCHECK_LE(from, kMaxSupportedCodepoint);
      uc16 from_uc16 = static_cast<uc16>(from);

      uc32 to = (*ranges)[i].to();
      DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == String::kMaxCodePoint);
      uc16 to_uc16 = static_cast<uc16>(std::min(to, kMaxSupportedCodepoint));

      assembler_.ConsumeRange(from_uc16, to_uc16);
    });
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    for (uc16 c : node->data()) {
      assembler_.ConsumeRange(c, c);
    }
    return nullptr;
  }

  void ClearRegisters(Interval indices) {
    if (indices.is_empty()) return;
    DCHECK_EQ(indices.from() % 2, 0);
    DCHECK_EQ(indices.to() % 2, 1);
    for (int i = indices.from(); i <= indices.to(); i += 2) {
      // It suffices to clear the register containing the `begin` of a capture
      // because this indicates that the capture is undefined, regardless of
      // the value in the `end` register.
      assembler_.ClearRegister(i);
    }
  }

  // Emit bytecode corresponding to /<emit_body>*/.
  template <class F>
  void CompileGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //   begin:
    //     FORK end
    //     <body>
    //     JMP begin
    //   end:
    //     ...
    //
    // This is greedy because a forked thread has lower priority than the
    // thread that spawned it.
    Label begin;
    Label end;

    assembler_.Bind(begin);
    assembler_.Fork(end);
    emit_body();
    assembler_.Jmp(begin);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>*?/.
  template <class F>
  void CompileNonGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //     FORK body
    //     JMP end
    //   body:
    //     <body>
    //     FORK body
    //   end:
    //     ...

    Label body;
    Label end;

    assembler_.Fork(body);
    assembler_.Jmp(end);

    assembler_.Bind(body);
    emit_body();
    assembler_.Fork(body);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
  template <class F>
  void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK end
    //     <body>
    //     FORK end
    //     <body>
    //     ...
    //     ...
    //     FORK end
    //     <body>
    //   end:
    //     ...
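    //
    // For example, with a body consuming a single `a` and
    // max_repetition_num == 2 (i.e. /a{0,2}/), this assembles to (indices
    // relative to the start of this fragment):
    //
    //   0: FORK 4
    //   1: CONSUME_RANGE a-a
    //   2: FORK 4
    //   3: CONSUME_RANGE a-a
    //   4: (end)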
    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      assembler_.Fork(end);
      emit_body();
    }
    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
  template <class F>
  void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK body0
    //     JMP end
    //   body0:
    //     <body>
    //     FORK body1
    //     JMP end
    //   body1:
    //     <body>
    //     ...
    //     ...
    //   body{max_repetition_num - 1}:
    //     <body>
    //   end:
    //     ...

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      Label body;
      assembler_.Fork(body);
      assembler_.Jmp(end);

      assembler_.Bind(body);
      emit_body();
    }
    assembler_.Bind(end);
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Emit the body, but clear the registers occurring in the body first.
    //
    // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
    // and b) clear them. For example, we don't have to capture anything in
    // the first node->min() - 1 repetitions: those captures are always
    // overwritten by the last mandatory repetition. Similarly, if
    // node->min() == 0, we don't have to clear registers before the first
    // (optional) repetition.
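    //
    // For example, /(x){2,4}/ first emits the body (with its register
    // bookkeeping) twice, then hands the two remaining optional repetitions
    // to `CompileGreedyRepetition`; /(x){2,}/ instead finishes with
    // `CompileGreedyStar`.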
    Interval body_registers = node->body()->CaptureRegisters();
    auto emit_body = [&]() {
      ClearRegisters(body_registers);
      node->body()->Accept(this, nullptr);
    };

    // First repeat the body `min()` times.
    for (int i = 0; i != node->min(); ++i) emit_body();

    switch (node->quantifier_type()) {
      case RegExpQuantifier::POSSESSIVE:
        UNREACHABLE();
      case RegExpQuantifier::GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileGreedyRepetition(emit_body, node->max() - node->min());
        }
        break;
      }
      case RegExpQuantifier::NON_GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileNonGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileNonGreedyRepetition(emit_body, node->max() - node->min());
        }
      }
    }
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    int index = node->index();
    int start_register = RegExpCapture::StartRegister(index);
    int end_register = RegExpCapture::EndRegister(index);
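    // Capture group i is stored in registers 2 * i (start) and 2 * i + 1
    // (end); registers 0 and 1, which are set in `Compile` above, delimit
    // the whole match.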
    assembler_.SetRegisterToCp(start_register);
    node->body()->Accept(this, nullptr);
    assembler_.SetRegisterToCp(end_register);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid,v8:10765): Support this case.
    UNREACHABLE();
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    UNREACHABLE();
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& text_el : *node->elements()) {
      text_el.tree()->Accept(this, nullptr);
    }
    return nullptr;
  }

 private:
  Zone* zone_;
  BytecodeAssembler assembler_;
};

}  // namespace

ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
    RegExpTree* tree, JSRegExp::Flags flags, Zone* zone) {
  return CompileVisitor::Compile(tree, flags, zone);
}

}  // namespace internal
}  // namespace v8