// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/regexp/experimental/experimental-compiler.h"
#include "src/regexp/experimental/experimental.h"
#include "src/zone/zone-list-inl.h"
namespace v8 {
namespace internal {
namespace {
// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;
class CanBeHandledVisitor final : private RegExpVisitor {
// Visitor to implement `ExperimentalRegExp::CanBeHandled`.
public:
static bool Check(RegExpTree* tree, JSRegExp::Flags flags,
int capture_count) {
if (!AreSuitableFlags(flags)) return false;
CanBeHandledVisitor visitor;
tree->Accept(&visitor, nullptr);
return visitor.result_;
}
private:
CanBeHandledVisitor() = default;
static bool AreSuitableFlags(JSRegExp::Flags flags) {
// TODO(mbid, v8:10765): We should be able to support all flags in the
// future.
static constexpr JSRegExp::Flags kAllowedFlags =
JSRegExp::kGlobal | JSRegExp::kSticky | JSRegExp::kMultiline |
JSRegExp::kDotAll | JSRegExp::kLinear;
// We support Unicode iff kUnicode is among the allowed flags.
STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
((kAllowedFlags & JSRegExp::kUnicode) != 0));
return (flags & ~kAllowedFlags) == 0;
}
void* VisitDisjunction(RegExpDisjunction* node, void*) override {
for (RegExpTree* alt : *node->alternatives()) {
alt->Accept(this, nullptr);
if (!result_) {
return nullptr;
}
}
return nullptr;
}
void* VisitAlternative(RegExpAlternative* node, void*) override {
for (RegExpTree* child : *node->nodes()) {
child->Accept(this, nullptr);
if (!result_) {
return nullptr;
}
}
return nullptr;
}
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
result_ = result_ && AreSuitableFlags(node->flags());
return nullptr;
}
void* VisitAssertion(RegExpAssertion* node, void*) override {
result_ = result_ && AreSuitableFlags(node->flags());
return nullptr;
}
void* VisitAtom(RegExpAtom* node, void*) override {
result_ = result_ && AreSuitableFlags(node->flags());
return nullptr;
}
void* VisitText(RegExpText* node, void*) override {
for (TextElement& el : *node->elements()) {
el.tree()->Accept(this, nullptr);
if (!result_) {
return nullptr;
}
}
return nullptr;
}
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
// Finite but large values of `min()` and `max()` are bad for the
// breadth-first engine because finite (optional) repetition is dealt with
// by replicating the bytecode of the body of the quantifier. The number
// of replications grows exponentially in how deeply quantifiers are nested.
// `replication_factor_` keeps track of how often the current node will
// have to be replicated in the generated bytecode, and we don't allow this
// to exceed some small value.
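// For example, in /(a{5}){5}/ the body of the outer quantifier is replicated
// 5 times, and each copy replicates /a/ another 5 times, i.e. 25 copies of
// /a/ in total, so the pattern is rejected here (25 > kMaxReplicationFactor).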
static constexpr int kMaxReplicationFactor = 16;
// First we rule out values for min and max that are too big even before
// taking into account the ambient replication_factor_. This also guards
// against overflows in `local_replication` or `replication_factor_`.
if (node->min() > kMaxReplicationFactor ||
(node->max() != RegExpTree::kInfinity &&
node->max() > kMaxReplicationFactor)) {
result_ = false;
return nullptr;
}
// Save the current replication factor so that it can be restored if we
// return with `result_ == true`.
int before_replication_factor = replication_factor_;
int local_replication;
if (node->max() == RegExpTree::kInfinity) {
local_replication = node->min() + 1;
} else {
local_replication = node->max();
}
replication_factor_ *= local_replication;
if (replication_factor_ > kMaxReplicationFactor) {
result_ = false;
return nullptr;
}
switch (node->quantifier_type()) {
case RegExpQuantifier::GREEDY:
case RegExpQuantifier::NON_GREEDY:
break;
case RegExpQuantifier::POSSESSIVE:
// TODO(mbid, v8:10765): It's not clear to me whether this can be
// supported in breadth-first mode. Re2 doesn't support it.
result_ = false;
return nullptr;
}
node->body()->Accept(this, nullptr);
replication_factor_ = before_replication_factor;
return nullptr;
}
void* VisitCapture(RegExpCapture* node, void*) override {
node->body()->Accept(this, nullptr);
return nullptr;
}
void* VisitGroup(RegExpGroup* node, void*) override {
node->body()->Accept(this, nullptr);
return nullptr;
}
void* VisitLookaround(RegExpLookaround* node, void*) override {
// TODO(mbid, v8:10765): This will be hard to support, but not impossible I
// think. See product automata.
result_ = false;
return nullptr;
}
void* VisitBackReference(RegExpBackReference* node, void*) override {
// This can't be implemented without backtracking.
result_ = false;
return nullptr;
}
void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
private:
// See comment in `VisitQuantifier`:
int replication_factor_ = 1;
bool result_ = true;
};
} // namespace
bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
JSRegExp::Flags flags,
int capture_count) {
return CanBeHandledVisitor::Check(tree, flags, capture_count);
}
namespace {
// A label in bytecode which starts with no known address. The address *must*
// be bound with `Bind` before the label goes out of scope.
// Implemented as a linked list through the `payload.pc` of FORK and JMP
// instructions.
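// A sketch of typical use together with the `BytecodeAssembler` below: a
// label may be targeted by `Fork`/`Jmp` before it is bound, in which case the
// emitted instructions are chained into a patch list that `Bind` resolves,
// e.g.
//
//   Label done;
//   assembler.Fork(done);  // forward reference, patched by Bind(done)
//   ...
//   assembler.Bind(done);  // done: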
struct Label {
public:
Label() = default;
~Label() {
DCHECK_EQ(state_, BOUND);
DCHECK_GE(bound_index_, 0);
}
// Don't copy, don't move. Moving could be implemented, but it's not
// needed anywhere.
Label(const Label&) = delete;
Label& operator=(const Label&) = delete;
private:
friend class BytecodeAssembler;
// UNBOUND implies unbound_patch_list_begin_.
// BOUND implies bound_index_.
enum { UNBOUND, BOUND } state_ = UNBOUND;
union {
int unbound_patch_list_begin_ = -1;
int bound_index_;
};
};
class BytecodeAssembler {
public:
// TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
// the `tree` size we're going to compile?
explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}
ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }
void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }
void Assertion(RegExpAssertion::AssertionType t) {
code_.Add(RegExpInstruction::Assertion(t), zone_);
}
void ClearRegister(int32_t register_index) {
code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
}
void ConsumeRange(uc16 from, uc16 to) {
code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
}
void ConsumeAnyChar() {
code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
}
void Fork(Label& target) {
LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
}
void Jmp(Label& target) {
LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
}
void SetRegisterToCp(int32_t register_index) {
code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
}
void Bind(Label& target) {
DCHECK_EQ(target.state_, Label::UNBOUND);
int index = code_.length();
while (target.unbound_patch_list_begin_ != -1) {
RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
DCHECK(inst.opcode == RegExpInstruction::FORK ||
inst.opcode == RegExpInstruction::JMP);
target.unbound_patch_list_begin_ = inst.payload.pc;
inst.payload.pc = index;
}
target.state_ = Label::BOUND;
target.bound_index_ = index;
}
void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }
private:
void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
RegExpInstruction result;
result.opcode = op;
if (target.state_ == Label::BOUND) {
result.payload.pc = target.bound_index_;
} else {
DCHECK_EQ(target.state_, Label::UNBOUND);
int new_list_begin = code_.length();
DCHECK_GE(new_list_begin, 0);
result.payload.pc = target.unbound_patch_list_begin_;
target.unbound_patch_list_begin_ = new_list_begin;
}
code_.Add(result, zone_);
}
Zone* zone_;
ZoneList<RegExpInstruction> code_;
};
class CompileVisitor : private RegExpVisitor {
public:
static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
JSRegExp::Flags flags,
Zone* zone) {
CompileVisitor compiler(zone);
if ((flags & JSRegExp::kSticky) == 0 && !tree->IsAnchoredAtStart()) {
// The match is not anchored, i.e. may start at any input position, so we
// emit a preamble corresponding to /.*?/. This skips an arbitrary
// prefix in the input non-greedily.
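// With `ConsumeAnyChar` as the body, `CompileNonGreedyStar` (see below)
// emits roughly
//
//   FORK body
//   JMP end
// body:
//   <ConsumeAnyChar>
//   FORK body
// end: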
compiler.CompileNonGreedyStar(
[&]() { compiler.assembler_.ConsumeAnyChar(); });
}
compiler.assembler_.SetRegisterToCp(0);
tree->Accept(&compiler, nullptr);
compiler.assembler_.SetRegisterToCp(1);
compiler.assembler_.Accept();
return std::move(compiler.assembler_).IntoCode();
}
private:
explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}
// Generate a disjunction of code fragments compiled by a function `gen_alt`.
// `gen_alt` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
// 1` and should build code corresponding to the i-th alternative.
template <class F>
void CompileDisjunction(int alt_num, F&& gen_alt) {
// An alternative a1 | ... | an is compiled into
//
//   FORK tail1
//   <a1>
//   JMP end
// tail1:
//   FORK tail2
//   <a2>
//   JMP end
// tail2:
//   ...
//   ...
// tail{n - 1}:
//   <an>
// end:
//
// By the semantics of the FORK instruction (see above at definition and
// semantics), a forked thread has lower priority than the thread that
// spawned it. This means that with the code we're generating here, the
// thread matching the alternative a1 has indeed highest priority, followed
// by the thread for a2 and so on.
if (alt_num == 0) {
// The empty disjunction. This can never match.
assembler_.Fail();
return;
}
Label end;
for (int i = 0; i != alt_num - 1; ++i) {
Label tail;
assembler_.Fork(tail);
gen_alt(i);
assembler_.Jmp(end);
assembler_.Bind(tail);
}
gen_alt(alt_num - 1);
assembler_.Bind(end);
}
void* VisitDisjunction(RegExpDisjunction* node, void*) override {
ZoneList<RegExpTree*>& alts = *node->alternatives();
CompileDisjunction(alts.length(),
[&](int i) { alts[i]->Accept(this, nullptr); });
return nullptr;
}
void* VisitAlternative(RegExpAlternative* node, void*) override {
for (RegExpTree* child : *node->nodes()) {
child->Accept(this, nullptr);
}
return nullptr;
}
void* VisitAssertion(RegExpAssertion* node, void*) override {
assembler_.Assertion(node->assertion_type());
return nullptr;
}
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
// A character class is compiled as Disjunction over its `CharacterRange`s.
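// For example, /[a-cz]/ canonicalizes to the ranges ['a'-'c'] and ['z'-'z']
// and thus compiles to a disjunction of `ConsumeRange('a', 'c')` and
// `ConsumeRange('z', 'z')`.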
ZoneList<CharacterRange>* ranges = node->ranges(zone_);
CharacterRange::Canonicalize(ranges);
if (node->is_negated()) {
// The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
// union of k intervals is a union of at most k + 1 intervals.
ZoneList<CharacterRange>* negated =
zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
CharacterRange::Negate(ranges, negated, zone_);
DCHECK_LE(negated->length(), ranges->length() + 1);
ranges = negated;
}
CompileDisjunction(ranges->length(), [&](int i) {
// We don't support utf16 for now, so we only handle ranges that can be
// specified by (complements of) ranges with uc16 bounds.
STATIC_ASSERT(kMaxSupportedCodepoint <= std::numeric_limits<uc16>::max());
uc32 from = (*ranges)[i].from();
DCHECK_LE(from, kMaxSupportedCodepoint);
uc16 from_uc16 = static_cast<uc16>(from);
uc32 to = (*ranges)[i].to();
DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == String::kMaxCodePoint);
uc16 to_uc16 = static_cast<uc16>(std::min(to, kMaxSupportedCodepoint));
assembler_.ConsumeRange(from_uc16, to_uc16);
});
return nullptr;
}
void* VisitAtom(RegExpAtom* node, void*) override {
for (uc16 c : node->data()) {
assembler_.ConsumeRange(c, c);
}
return nullptr;
}
void ClearRegisters(Interval indices) {
if (indices.is_empty()) return;
DCHECK_EQ(indices.from() % 2, 0);
DCHECK_EQ(indices.to() % 2, 1);
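// Capture group i occupies the adjacent registers 2 * i (start) and
// 2 * i + 1 (end), cf. `RegExpCapture::StartRegister`/`EndRegister`, hence
// the even/odd bounds above and the step of 2 below.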
for (int i = indices.from(); i <= indices.to(); i += 2) {
// It suffices to clear the register containing the `begin` of a capture
// because this indicates that the capture is undefined, regardless of
// the value in the `end` register.
assembler_.ClearRegister(i);
}
}
// Emit bytecode corresponding to /<emit_body>*/.
template <class F>
void CompileGreedyStar(F&& emit_body) {
// This is compiled into
//
// begin:
//   FORK end
//   <body>
//   JMP begin
// end:
//   ...
//
// This is greedy because a forked thread has lower priority than the
// thread that spawned it.
Label begin;
Label end;
assembler_.Bind(begin);
assembler_.Fork(end);
emit_body();
assembler_.Jmp(begin);
assembler_.Bind(end);
}
// Emit bytecode corresponding to /<emit_body>*?/.
template <class F>
void CompileNonGreedyStar(F&& emit_body) {
// This is compiled into
//
//   FORK body
//   JMP end
// body:
//   <body>
//   FORK body
// end:
//   ...
Label body;
Label end;
assembler_.Fork(body);
assembler_.Jmp(end);
assembler_.Bind(body);
emit_body();
assembler_.Fork(body);
assembler_.Bind(end);
}
// Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
template <class F>
void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
// This is compiled into
//
//   FORK end
//   <body>
//   FORK end
//   <body>
//   ...
//   ...
//   FORK end
//   <body>
// end:
//   ...
Label end;
for (int i = 0; i != max_repetition_num; ++i) {
assembler_.Fork(end);
emit_body();
}
assembler_.Bind(end);
}
// Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
template <class F>
void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
// This is compiled into
//
//   FORK body0
//   JMP end
// body0:
//   <body>
//   FORK body1
//   JMP end
// body1:
//   <body>
//   ...
//   ...
// body{max_repetition_num - 1}:
//   <body>
// end:
//   ...
Label end;
for (int i = 0; i != max_repetition_num; ++i) {
Label body;
assembler_.Fork(body);
assembler_.Jmp(end);
assembler_.Bind(body);
emit_body();
}
assembler_.Bind(end);
}
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
// Emit the body, but clear registers occurring in the body first.
//
// TODO(mbid,v8:10765): It's not always necessary to a) capture registers
// and b) clear them. For example, we don't have to capture anything for
// the first 4 repetitions if node->min() >= 5, and then we don't have to
// clear registers in the first node->min() repetitions.
// Later, and if node->min() == 0, we don't have to clear registers before
// the first optional repetition.
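// The quantifier is compiled by emitting the body `min()` times
// unconditionally and then handling the remaining optional repetitions with
// one of the helpers above; e.g. /(?:x){2,4}/ (greedy) becomes <x><x>
// followed by `CompileGreedyRepetition(emit_body, 2)`.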
Interval body_registers = node->body()->CaptureRegisters();
auto emit_body = [&]() {
ClearRegisters(body_registers);
node->body()->Accept(this, nullptr);
};
// First repeat the body `min()` times.
for (int i = 0; i != node->min(); ++i) emit_body();
switch (node->quantifier_type()) {
case RegExpQuantifier::POSSESSIVE:
UNREACHABLE();
case RegExpQuantifier::GREEDY: {
if (node->max() == RegExpTree::kInfinity) {
CompileGreedyStar(emit_body);
} else {
DCHECK_NE(node->max(), RegExpTree::kInfinity);
CompileGreedyRepetition(emit_body, node->max() - node->min());
}
break;
}
case RegExpQuantifier::NON_GREEDY: {
if (node->max() == RegExpTree::kInfinity) {
CompileNonGreedyStar(emit_body);
} else {
DCHECK_NE(node->max(), RegExpTree::kInfinity);
CompileNonGreedyRepetition(emit_body, node->max() - node->min());
}
}
}
return nullptr;
}
void* VisitCapture(RegExpCapture* node, void*) override {
int index = node->index();
int start_register = RegExpCapture::StartRegister(index);
int end_register = RegExpCapture::EndRegister(index);
assembler_.SetRegisterToCp(start_register);
node->body()->Accept(this, nullptr);
assembler_.SetRegisterToCp(end_register);
return nullptr;
}
void* VisitGroup(RegExpGroup* node, void*) override {
node->body()->Accept(this, nullptr);
return nullptr;
}
void* VisitLookaround(RegExpLookaround* node, void*) override {
// TODO(mbid,v8:10765): Support this case.
UNREACHABLE();
}
void* VisitBackReference(RegExpBackReference* node, void*) override {
UNREACHABLE();
}
void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
void* VisitText(RegExpText* node, void*) override {
for (TextElement& text_el : *node->elements()) {
text_el.tree()->Accept(this, nullptr);
}
return nullptr;
}
private:
Zone* zone_;
BytecodeAssembler assembler_;
};
} // namespace
ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
RegExpTree* tree, JSRegExp::Flags flags, Zone* zone) {
return CompileVisitor::Compile(tree, flags, zone);
}
} // namespace internal
} // namespace v8