| // Copyright 2020 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "src/regexp/experimental/experimental-interpreter.h" |
| |
| #include "src/base/optional.h" |
| #include "src/objects/fixed-array-inl.h" |
| #include "src/objects/string-inl.h" |
| #include "src/regexp/experimental/experimental.h" |
| #include "src/strings/char-predicates-inl.h" |
| #include "src/zone/zone-allocator.h" |
| #include "src/zone/zone-list-inl.h" |
| |
| namespace v8 { |
| namespace internal { |
| |
| namespace { |
| |
| constexpr int kUndefinedRegisterValue = -1; |
| |
| template <class Character> |
| bool SatisfiesAssertion(RegExpAssertion::AssertionType type, |
| Vector<const Character> context, int position) { |
| DCHECK_LE(position, context.length()); |
| DCHECK_GE(position, 0); |
| |
| switch (type) { |
| case RegExpAssertion::START_OF_INPUT: |
| return position == 0; |
| case RegExpAssertion::END_OF_INPUT: |
| return position == context.length(); |
| case RegExpAssertion::START_OF_LINE: |
| if (position == 0) return true; |
| return unibrow::IsLineTerminator(context[position - 1]); |
| case RegExpAssertion::END_OF_LINE: |
| if (position == context.length()) return true; |
| return unibrow::IsLineTerminator(context[position]); |
| case RegExpAssertion::BOUNDARY: |
| if (context.length() == 0) { |
| return false; |
| } else if (position == 0) { |
| return IsRegExpWord(context[position]); |
| } else if (position == context.length()) { |
| return IsRegExpWord(context[position - 1]); |
| } else { |
| return IsRegExpWord(context[position - 1]) != |
| IsRegExpWord(context[position]); |
| } |
| case RegExpAssertion::NON_BOUNDARY: |
| return !SatisfiesAssertion(RegExpAssertion::BOUNDARY, context, position); |
| } |
| } |
| |
| Vector<RegExpInstruction> ToInstructionVector( |
| ByteArray raw_bytes, const DisallowHeapAllocation& no_gc) { |
| RegExpInstruction* inst_begin = |
| reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress()); |
| int inst_num = raw_bytes.length() / sizeof(RegExpInstruction); |
| DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length()); |
| return Vector<RegExpInstruction>(inst_begin, inst_num); |
| } |
| |
| template <class Character> |
| Vector<const Character> ToCharacterVector(String str, |
| const DisallowHeapAllocation& no_gc); |
| |
| template <> |
| Vector<const uint8_t> ToCharacterVector<uint8_t>( |
| String str, const DisallowHeapAllocation& no_gc) { |
| DCHECK(str.IsFlat()); |
| String::FlatContent content = str.GetFlatContent(no_gc); |
| DCHECK(content.IsOneByte()); |
| return content.ToOneByteVector(); |
| } |
| |
| template <> |
| Vector<const uc16> ToCharacterVector<uc16>( |
| String str, const DisallowHeapAllocation& no_gc) { |
| DCHECK(str.IsFlat()); |
| String::FlatContent content = str.GetFlatContent(no_gc); |
| DCHECK(content.IsTwoByte()); |
| return content.ToUC16Vector(); |
| } |
| |
| template <class Character> |
| class NfaInterpreter { |
| // Executes a bytecode program in breadth-first mode, without backtracking. |
| // `Character` can be instantiated with `uint8_t` or `uc16` for one byte or |
| // two byte input strings. |
| // |
| // In contrast to the backtracking implementation, this has linear time |
| // complexity in the length of the input string. Breadth-first mode means |
| // that threads are executed in lockstep with respect to their input |
| // position, i.e. the threads share a common input index. This is similar |
| // to breadth-first simulation of a non-deterministic finite automaton (nfa), |
| // hence the name of the class. |
| // |
| // To follow the semantics of a backtracking VM implementation, we have to be |
| // careful about whether we stop execution when a thread executes ACCEPT. |
| // For example, consider execution of the bytecode generated by the regexp |
| // |
| // r = /abc|..|[a-c]{10,}/ |
| // |
| // on input "abcccccccccccccc". Clearly the three alternatives |
| // - /abc/ |
| // - /../ |
| // - /[a-c]{10,}/ |
| // all match this input. A backtracking implementation will report "abc" as |
| // match, because it explores the first alternative before the others. |
| // |
| // However, if we execute breadth first, then we execute the 3 threads |
| // - t1, which tries to match /abc/ |
| // - t2, which tries to match /../ |
| // - t3, which tries to match /[a-c]{10,}/ |
| // in lockstep i.e. by iterating over the input and feeding all threads one |
| // character at a time. t2 will execute an ACCEPT after two characters, |
| // while t1 will only execute ACCEPT after three characters. Thus we find a |
| // match for the second alternative before a match of the first alternative. |
| // |
| // This shows that we cannot always stop searching as soon as some thread t |
| // executes ACCEPT: If there is a thread u with higher priority than t, then |
| // it must be finished first. If u produces a match, then we can discard the |
| // match of t because matches produced by threads with higher priority are |
| // preferred over matches of threads with lower priority. On the other hand, |
| // we are allowed to abort all threads with lower priority than t if t |
| // produces a match: Such threads can only produce worse matches. In the |
| // example above, we can abort t3 after two characters because of t2's match. |
| // |
| // Thus the interpreter keeps track of a priority-ordered list of threads. |
| // If a thread ACCEPTs, all threads with lower priority are discarded, and |
| // the search continues with the threads with higher priority. If no threads |
| // with high priority are left, we return the match that was produced by the |
| // ACCEPTing thread with highest priority. |
| public: |
| NfaInterpreter(Isolate* isolate, RegExp::CallOrigin call_origin, |
| ByteArray bytecode, int register_count_per_match, String input, |
| int32_t input_index, Zone* zone) |
| : isolate_(isolate), |
| call_origin_(call_origin), |
| bytecode_object_(bytecode), |
| bytecode_(ToInstructionVector(bytecode, no_gc_)), |
| register_count_per_match_(register_count_per_match), |
| input_object_(input), |
| input_(ToCharacterVector<Character>(input, no_gc_)), |
| input_index_(input_index), |
| pc_last_input_index_(zone->NewArray<int>(bytecode.length()), |
| bytecode.length()), |
| active_threads_(0, zone), |
| blocked_threads_(0, zone), |
| register_array_allocator_(zone), |
| best_match_registers_(base::nullopt), |
| zone_(zone) { |
| DCHECK(!bytecode_.empty()); |
| DCHECK_GE(input_index_, 0); |
| DCHECK_LE(input_index_, input_.length()); |
| |
| std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); |
| } |
| |
| // Finds matches and writes their concatenated capture registers to |
| // `output_registers`. `output_registers[i]` has to be valid for all i < |
| // output_register_count. The search continues until all remaining matches |
| // have been found or there is no space left in `output_registers`. Returns |
| // the number of matches found. |
| int FindMatches(int32_t* output_registers, int output_register_count) { |
| const int max_match_num = output_register_count / register_count_per_match_; |
| |
| int match_num = 0; |
| while (match_num != max_match_num) { |
| int err_code = FindNextMatch(); |
| if (err_code != RegExp::kInternalRegExpSuccess) return err_code; |
| |
| if (!FoundMatch()) break; |
| |
| Vector<int> registers = *best_match_registers_; |
| output_registers = |
| std::copy(registers.begin(), registers.end(), output_registers); |
| |
| ++match_num; |
| |
| const int match_begin = registers[0]; |
| const int match_end = registers[1]; |
| DCHECK_LE(match_begin, match_end); |
| const int match_length = match_end - match_begin; |
| if (match_length != 0) { |
| SetInputIndex(match_end); |
| } else if (match_end == input_.length()) { |
| // Zero-length match, input exhausted. |
| SetInputIndex(match_end); |
| break; |
| } else { |
| // Zero-length match, more input. We don't want to report more matches |
| // here endlessly, so we advance by 1. |
| SetInputIndex(match_end + 1); |
| |
| // TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to |
| // the next codepoint, not to the next code unit. See also |
| // `RegExpUtils::AdvanceStringIndex`. |
| STATIC_ASSERT(!ExperimentalRegExp::kSupportsUnicode); |
| } |
| } |
| |
| return match_num; |
| } |
| |
| private: |
| // The state of a "thread" executing experimental regexp bytecode. (Not to |
| // be confused with an OS thread.) |
| struct InterpreterThread { |
| // This thread's program counter, i.e. the index within `bytecode_` of the |
| // next instruction to be executed. |
| int pc; |
| // Pointer to the array of registers, which is always size |
| // `register_count_per_match_`. Should be deallocated with |
| // `register_array_allocator_`. |
| int* register_array_begin; |
| }; |
| |
| // Handles pending interrupts if there are any. Returns |
| // RegExp::kInternalRegExpSuccess if execution can continue, and an error |
| // code otherwise. |
| int HandleInterrupts() { |
| StackLimitCheck check(isolate_); |
| if (call_origin_ == RegExp::CallOrigin::kFromJs) { |
| // Direct calls from JavaScript can be interrupted in two ways: |
| // 1. A real stack overflow, in which case we let the caller throw the |
| // exception. |
| // 2. The stack guard was used to interrupt execution for another purpose, |
| // forcing the call through the runtime system. |
| if (check.JsHasOverflowed()) { |
| return RegExp::kInternalRegExpException; |
| } else if (check.InterruptRequested()) { |
| return RegExp::kInternalRegExpRetry; |
| } |
| } else { |
| DCHECK(call_origin_ == RegExp::CallOrigin::kFromRuntime); |
| HandleScope handles(isolate_); |
| Handle<ByteArray> bytecode_handle(bytecode_object_, isolate_); |
| Handle<String> input_handle(input_object_, isolate_); |
| |
| if (check.JsHasOverflowed()) { |
| // We abort the interpreter now anyway, so gc can't invalidate any |
| // pointers. |
| AllowHeapAllocation yes_gc; |
| isolate_->StackOverflow(); |
| return RegExp::kInternalRegExpException; |
| } else if (check.InterruptRequested()) { |
| // TODO(mbid): Is this really equivalent to whether the string is |
| // one-byte or two-byte? A comment at the declaration of |
| // IsOneByteRepresentationUnderneath says that this might fail for |
| // external strings. |
| const bool was_one_byte = |
| String::IsOneByteRepresentationUnderneath(input_object_); |
| |
| Object result; |
| { |
| AllowHeapAllocation yes_gc; |
| result = isolate_->stack_guard()->HandleInterrupts(); |
| } |
| if (result.IsException(isolate_)) { |
| return RegExp::kInternalRegExpException; |
| } |
| |
| // If we changed between a LATIN1 and a UC16 string, we need to restart |
| // regexp matching with the appropriate template instantiation of |
| // RawMatch. |
| if (String::IsOneByteRepresentationUnderneath(*input_handle) != |
| was_one_byte) { |
| return RegExp::kInternalRegExpRetry; |
| } |
| |
| // Update objects and pointers in case they have changed during gc. |
| bytecode_object_ = *bytecode_handle; |
| bytecode_ = ToInstructionVector(bytecode_object_, no_gc_); |
| input_object_ = *input_handle; |
| input_ = ToCharacterVector<Character>(input_object_, no_gc_); |
| } |
| } |
| return RegExp::kInternalRegExpSuccess; |
| } |
| |
| // Change the current input index for future calls to `FindNextMatch`. |
| void SetInputIndex(int new_input_index) { |
| DCHECK_GE(input_index_, 0); |
| DCHECK_LE(input_index_, input_.length()); |
| |
| input_index_ = new_input_index; |
| } |
| |
| // Find the next match and return the corresponding capture registers and |
| // write its capture registers to `best_match_registers_`. The search starts |
| // at the current `input_index_`. Returns RegExp::kInternalRegExpSuccess if |
| // execution could finish regularly (with or without a match) and an error |
| // code due to interrupt otherwise. |
| int FindNextMatch() { |
| DCHECK(active_threads_.is_empty()); |
| // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_` |
| // here? As long as |
| // |
| // pc_last_input_index_[pc] < input_index_ |
| // |
| // for all possible program counters pc that are reachable without input |
| // from pc = 0 and |
| // |
| // pc_last_input_index_[k] <= input_index_ |
| // |
| // for all k > 0 hold I think everything should be fine. Maybe we can do |
| // something about this in `SetInputIndex`. |
| std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); |
| |
| // Clean up left-over data from a previous call to FindNextMatch. |
| for (InterpreterThread t : blocked_threads_) { |
| DestroyThread(t); |
| } |
| blocked_threads_.DropAndClear(); |
| |
| for (InterpreterThread t : active_threads_) { |
| DestroyThread(t); |
| } |
| active_threads_.DropAndClear(); |
| |
| if (best_match_registers_.has_value()) { |
| FreeRegisterArray(best_match_registers_->begin()); |
| best_match_registers_ = base::nullopt; |
| } |
| |
| // All threads start at bytecode 0. |
| active_threads_.Add( |
| InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, zone_); |
| // Run the initial thread, potentially forking new threads, until every |
| // thread is blocked without further input. |
| RunActiveThreads(); |
| |
| // We stop if one of the following conditions hold: |
| // - We have exhausted the entire input. |
| // - We have found a match at some point, and there are no remaining |
| // threads with higher priority than the thread that produced the match. |
| // Threads with low priority have been aborted earlier, and the remaining |
| // threads are blocked here, so the latter simply means that |
| // `blocked_threads_` is empty. |
| while (input_index_ != input_.length() && |
| !(FoundMatch() && blocked_threads_.is_empty())) { |
| DCHECK(active_threads_.is_empty()); |
| uc16 input_char = input_[input_index_]; |
| ++input_index_; |
| |
| static constexpr int kTicksBetweenInterruptHandling = 64; |
| if (input_index_ % kTicksBetweenInterruptHandling == 0) { |
| int err_code = HandleInterrupts(); |
| if (err_code != RegExp::kInternalRegExpSuccess) return err_code; |
| } |
| |
| // We unblock all blocked_threads_ by feeding them the input char. |
| FlushBlockedThreads(input_char); |
| |
| // Run all threads until they block or accept. |
| RunActiveThreads(); |
| } |
| |
| return RegExp::kInternalRegExpSuccess; |
| } |
| |
| // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT |
| // instruction, or its PC value was already processed. |
| // - If processing of `t` can't continue because of CONSUME_RANGE, it is |
| // pushed on `blocked_threads_`. |
| // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and |
| // the current input index. All remaining `active_threads_` are discarded. |
| void RunActiveThread(InterpreterThread t) { |
| while (true) { |
| if (IsPcProcessed(t.pc)) return; |
| MarkPcProcessed(t.pc); |
| |
| RegExpInstruction inst = bytecode_[t.pc]; |
| switch (inst.opcode) { |
| case RegExpInstruction::CONSUME_RANGE: { |
| blocked_threads_.Add(t, zone_); |
| return; |
| } |
| case RegExpInstruction::ASSERTION: |
| if (!SatisfiesAssertion(inst.payload.assertion_type, input_, |
| input_index_)) { |
| DestroyThread(t); |
| return; |
| } |
| ++t.pc; |
| break; |
| case RegExpInstruction::FORK: { |
| InterpreterThread fork{inst.payload.pc, |
| NewRegisterArrayUninitialized()}; |
| Vector<int> fork_registers = GetRegisterArray(fork); |
| Vector<int> t_registers = GetRegisterArray(t); |
| DCHECK_EQ(fork_registers.length(), t_registers.length()); |
| std::copy(t_registers.begin(), t_registers.end(), |
| fork_registers.begin()); |
| active_threads_.Add(fork, zone_); |
| |
| ++t.pc; |
| break; |
| } |
| case RegExpInstruction::JMP: |
| t.pc = inst.payload.pc; |
| break; |
| case RegExpInstruction::ACCEPT: |
| if (best_match_registers_.has_value()) { |
| FreeRegisterArray(best_match_registers_->begin()); |
| } |
| best_match_registers_ = GetRegisterArray(t); |
| |
| for (InterpreterThread s : active_threads_) { |
| FreeRegisterArray(s.register_array_begin); |
| } |
| active_threads_.DropAndClear(); |
| return; |
| case RegExpInstruction::SET_REGISTER_TO_CP: |
| GetRegisterArray(t)[inst.payload.register_index] = input_index_; |
| ++t.pc; |
| break; |
| case RegExpInstruction::CLEAR_REGISTER: |
| GetRegisterArray(t)[inst.payload.register_index] = |
| kUndefinedRegisterValue; |
| ++t.pc; |
| break; |
| } |
| } |
| } |
| |
| // Run each active thread until it can't continue without further input. |
| // `active_threads_` is empty afterwards. `blocked_threads_` are sorted from |
| // low to high priority. |
| void RunActiveThreads() { |
| while (!active_threads_.is_empty()) { |
| RunActiveThread(active_threads_.RemoveLast()); |
| } |
| } |
| |
| // Unblock all blocked_threads_ by feeding them an `input_char`. Should only |
| // be called with `input_index_` pointing to the character *after* |
| // `input_char` so that `pc_last_input_index_` is updated correctly. |
| void FlushBlockedThreads(uc16 input_char) { |
| // The threads in blocked_threads_ are sorted from high to low priority, |
| // but active_threads_ needs to be sorted from low to high priority, so we |
| // need to activate blocked threads in reverse order. |
| for (int i = blocked_threads_.length() - 1; i >= 0; --i) { |
| InterpreterThread t = blocked_threads_[i]; |
| RegExpInstruction inst = bytecode_[t.pc]; |
| DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE); |
| RegExpInstruction::Uc16Range range = inst.payload.consume_range; |
| if (input_char >= range.min && input_char <= range.max) { |
| ++t.pc; |
| active_threads_.Add(t, zone_); |
| } else { |
| DestroyThread(t); |
| } |
| } |
| blocked_threads_.DropAndClear(); |
| } |
| |
| bool FoundMatch() const { return best_match_registers_.has_value(); } |
| |
| Vector<int> GetRegisterArray(InterpreterThread t) { |
| return Vector<int>(t.register_array_begin, register_count_per_match_); |
| } |
| |
| int* NewRegisterArrayUninitialized() { |
| return register_array_allocator_.allocate(register_count_per_match_); |
| } |
| |
| int* NewRegisterArray(int fill_value) { |
| int* array_begin = NewRegisterArrayUninitialized(); |
| int* array_end = array_begin + register_count_per_match_; |
| std::fill(array_begin, array_end, fill_value); |
| return array_begin; |
| } |
| |
| void FreeRegisterArray(int* register_array_begin) { |
| register_array_allocator_.deallocate(register_array_begin, |
| register_count_per_match_); |
| } |
| |
| void DestroyThread(InterpreterThread t) { |
| FreeRegisterArray(t.register_array_begin); |
| } |
| |
| // It is redundant to have two threads t, t0 execute at the same PC value, |
| // because one of t, t0 matches iff the other does. We can thus discard |
| // the one with lower priority. We check whether a thread executed at some |
| // PC value by recording for every possible value of PC what the value of |
| // input_index_ was the last time a thread executed at PC. If a thread |
| // tries to continue execution at a PC value that we have seen before at |
| // the current input index, we abort it. (We execute threads with higher |
| // priority first, so the second thread is guaranteed to have lower |
| // priority.) |
| // |
| // Check whether we've seen an active thread with a given pc value since the |
| // last increment of `input_index_`. |
| bool IsPcProcessed(int pc) { |
| DCHECK_LE(pc_last_input_index_[pc], input_index_); |
| return pc_last_input_index_[pc] == input_index_; |
| } |
| |
| // Mark a pc as having been processed since the last increment of |
| // `input_index_`. |
| void MarkPcProcessed(int pc) { |
| DCHECK_LE(pc_last_input_index_[pc], input_index_); |
| pc_last_input_index_[pc] = input_index_; |
| } |
| |
| Isolate* const isolate_; |
| |
| const RegExp::CallOrigin call_origin_; |
| |
| const DisallowHeapAllocation no_gc_; |
| |
| ByteArray bytecode_object_; |
| Vector<const RegExpInstruction> bytecode_; |
| |
| // Number of registers used per thread. |
| const int register_count_per_match_; |
| |
| String input_object_; |
| Vector<const Character> input_; |
| int input_index_; |
| |
| // pc_last_input_index_[k] records the value of input_index_ the last |
| // time a thread t such that t.pc == k was activated, i.e. put on |
| // active_threads_. Thus pc_last_input_index.size() == bytecode.size(). See |
| // also `RunActiveThread`. |
| Vector<int> pc_last_input_index_; |
| |
| // Active threads can potentially (but not necessarily) continue without |
| // input. Sorted from low to high priority. |
| ZoneList<InterpreterThread> active_threads_; |
| |
| // The pc of a blocked thread points to an instruction that consumes a |
| // character. Sorted from high to low priority (so the opposite of |
| // `active_threads_`). |
| ZoneList<InterpreterThread> blocked_threads_; |
| |
| // RecyclingZoneAllocator maintains a linked list through freed allocations |
| // for reuse if possible. |
| RecyclingZoneAllocator<int> register_array_allocator_; |
| |
| // The register array of the best match found so far during the current |
| // search. If several threads ACCEPTed, then this will be the register array |
| // of the accepting thread with highest priority. Should be deallocated with |
| // `register_array_allocator_`. |
| base::Optional<Vector<int>> best_match_registers_; |
| |
| Zone* zone_; |
| }; |
| |
| } // namespace |
| |
| int ExperimentalRegExpInterpreter::FindMatches( |
| Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray bytecode, |
| int register_count_per_match, String input, int start_index, |
| int32_t* output_registers, int output_register_count, Zone* zone) { |
| DCHECK(input.IsFlat()); |
| DisallowHeapAllocation no_gc; |
| |
| if (input.GetFlatContent(no_gc).IsOneByte()) { |
| NfaInterpreter<uint8_t> interpreter(isolate, call_origin, bytecode, |
| register_count_per_match, input, |
| start_index, zone); |
| return interpreter.FindMatches(output_registers, output_register_count); |
| } else { |
| DCHECK(input.GetFlatContent(no_gc).IsTwoByte()); |
| NfaInterpreter<uc16> interpreter(isolate, call_origin, bytecode, |
| register_count_per_match, input, |
| start_index, zone); |
| return interpreter.FindMatches(output_registers, output_register_count); |
| } |
| } |
| |
| } // namespace internal |
| } // namespace v8 |