| // Copyright 2012 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "src/regexp/regexp.h" |
| |
| #include "src/codegen/compilation-cache.h" |
| #include "src/heap/heap-inl.h" |
| #include "src/objects/js-regexp-inl.h" |
| #include "src/regexp/regexp-bytecode-generator.h" |
| #include "src/regexp/regexp-compiler.h" |
| #include "src/regexp/regexp-dotprinter.h" |
| #include "src/regexp/regexp-interpreter.h" |
| #include "src/regexp/regexp-macro-assembler-arch.h" |
| #include "src/regexp/regexp-parser.h" |
| #include "src/strings/string-search.h" |
| |
| namespace v8 { |
| namespace internal { |
| |
| using namespace regexp_compiler_constants; // NOLINT(build/namespaces) |
| |
| class RegExpImpl final : public AllStatic { |
| public: |
| // Returns a string representation of a regular expression. |
| // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4. |
| // This function calls the garbage collector if necessary. |
| static Handle<String> ToString(Handle<Object> value); |
| |
| // Prepares a JSRegExp object with Irregexp-specific data. |
| static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> pattern, JSRegExp::Flags flags, |
| int capture_register_count); |
| |
| static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> pattern, JSRegExp::Flags flags, |
| Handle<String> match_pattern); |
| |
| static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, |
| Handle<String> subject, int index, int32_t* output, |
| int output_size); |
| |
| static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp, |
| Handle<String> subject, int index, |
| Handle<RegExpMatchInfo> last_match_info); |
| |
| // Execute a regular expression on the subject, starting from index. |
| // If matching succeeds, return the number of matches. This can be larger |
| // than one in the case of global regular expressions. |
| // The captures and subcaptures are stored into the registers vector. |
| // If matching fails, returns RE_FAILURE. |
| // If execution fails, sets a pending exception and returns RE_EXCEPTION. |
| static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, |
| Handle<String> subject, int index, int32_t* output, |
| int output_size); |
| |
| // Execute an Irregexp bytecode pattern. |
| // On a successful match, the result is a JSArray containing |
| // captured positions. On a failure, the result is the null value. |
| // Returns an empty handle in case of an exception. |
| V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec( |
| Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, |
| int index, Handle<RegExpMatchInfo> last_match_info); |
| |
| static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> sample_subject, bool is_one_byte); |
| static inline bool EnsureCompiledIrregexp(Isolate* isolate, |
| Handle<JSRegExp> re, |
| Handle<String> sample_subject, |
| bool is_one_byte); |
| |
| // Returns true on success, false on failure. |
| static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input, |
| JSRegExp::Flags flags, Handle<String> pattern, |
| Handle<String> sample_subject, bool is_one_byte); |
| |
| // For acting on the JSRegExp data FixedArray. |
| static int IrregexpMaxRegisterCount(FixedArray re); |
| static void SetIrregexpMaxRegisterCount(FixedArray re, int value); |
| static void SetIrregexpCaptureNameMap(FixedArray re, |
| Handle<FixedArray> value); |
| static int IrregexpNumberOfCaptures(FixedArray re); |
| static int IrregexpNumberOfRegisters(FixedArray re); |
| static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte); |
| static Code IrregexpNativeCode(FixedArray re, bool is_one_byte); |
| }; |
| |
| V8_WARN_UNUSED_RESULT |
| static inline MaybeHandle<Object> ThrowRegExpException( |
| Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, |
| Handle<String> error_text) { |
| THROW_NEW_ERROR( |
| isolate, |
| NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text), |
| Object); |
| } |
| |
| inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> error_text) { |
| USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate), |
| error_text)); |
| } |
| |
| // Identifies the sort of regexps where the regexp engine is faster |
| // than the code used for atom matches. |
| static bool HasFewDifferentCharacters(Handle<String> pattern) { |
| int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); |
| if (length <= kPatternTooShortForBoyerMoore) return false; |
| const int kMod = 128; |
| bool character_found[kMod]; |
| int different = 0; |
| memset(&character_found[0], 0, sizeof(character_found)); |
| for (int i = 0; i < length; i++) { |
| int ch = (pattern->Get(i) & (kMod - 1)); |
| if (!character_found[ch]) { |
| character_found[ch] = true; |
| different++; |
| // We declare a regexp low-alphabet if it has at least 3 times as many |
| // characters as it has different characters. |
| if (different * 3 > length) return false; |
| } |
| } |
| return true; |
| } |
| |
| // Generic RegExp methods. Dispatches to implementation specific methods. |
| |
| // static |
| MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> pattern, |
| JSRegExp::Flags flags) { |
| DCHECK(pattern->IsFlat()); |
| |
| Zone zone(isolate->allocator(), ZONE_NAME); |
| CompilationCache* compilation_cache = isolate->compilation_cache(); |
| MaybeHandle<FixedArray> maybe_cached = |
| compilation_cache->LookupRegExp(pattern, flags); |
| Handle<FixedArray> cached; |
| if (maybe_cached.ToHandle(&cached)) { |
| re->set_data(*cached); |
| return re; |
| } |
| |
| PostponeInterruptsScope postpone(isolate); |
| RegExpCompileData parse_result; |
| FlatStringReader reader(isolate, pattern); |
| DCHECK(!isolate->has_pending_exception()); |
| if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, |
| &parse_result)) { |
| // Throw an exception if we fail to parse the pattern. |
| return ThrowRegExpException(isolate, re, pattern, parse_result.error); |
| } |
| |
| bool has_been_compiled = false; |
| |
| if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) && |
| !HasFewDifferentCharacters(pattern)) { |
| // Parse-tree is a single atom that is equal to the pattern. |
| RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern); |
| has_been_compiled = true; |
| } else if (parse_result.tree->IsAtom() && !IsSticky(flags) && |
| parse_result.capture_count == 0) { |
| RegExpAtom* atom = parse_result.tree->AsAtom(); |
| Vector<const uc16> atom_pattern = atom->data(); |
| Handle<String> atom_string; |
| ASSIGN_RETURN_ON_EXCEPTION( |
| isolate, atom_string, |
| isolate->factory()->NewStringFromTwoByte(atom_pattern), Object); |
| if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) { |
| RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string); |
| has_been_compiled = true; |
| } |
| } |
| if (!has_been_compiled) { |
| RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags, |
| parse_result.capture_count); |
| } |
| DCHECK(re->data().IsFixedArray()); |
| // Compilation succeeded so the data is set on the regexp |
| // and we can store it in the cache. |
| Handle<FixedArray> data(FixedArray::cast(re->data()), isolate); |
| compilation_cache->PutRegExp(pattern, flags, data); |
| |
| return re; |
| } |
| |
| // static |
| MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp, |
| Handle<String> subject, int index, |
| Handle<RegExpMatchInfo> last_match_info) { |
| switch (regexp->TypeTag()) { |
| case JSRegExp::ATOM: |
| return RegExpImpl::AtomExec(isolate, regexp, subject, index, |
| last_match_info); |
| case JSRegExp::IRREGEXP: { |
| return RegExpImpl::IrregexpExec(isolate, regexp, subject, index, |
| last_match_info); |
| } |
| default: |
| UNREACHABLE(); |
| } |
| } |
| |
| // RegExp Atom implementation: Simple string search using indexOf. |
| |
| void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> pattern, JSRegExp::Flags flags, |
| Handle<String> match_pattern) { |
| isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags, |
| match_pattern); |
| } |
| |
| static void SetAtomLastCapture(Isolate* isolate, |
| Handle<RegExpMatchInfo> last_match_info, |
| String subject, int from, int to) { |
| SealHandleScope shs(isolate); |
| last_match_info->SetNumberOfCaptureRegisters(2); |
| last_match_info->SetLastSubject(subject); |
| last_match_info->SetLastInput(subject); |
| last_match_info->SetCapture(0, from); |
| last_match_info->SetCapture(1, to); |
| } |
| |
| int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, |
| Handle<String> subject, int index, int32_t* output, |
| int output_size) { |
| DCHECK_LE(0, index); |
| DCHECK_LE(index, subject->length()); |
| |
| subject = String::Flatten(isolate, subject); |
| DisallowHeapAllocation no_gc; // ensure vectors stay valid |
| |
| String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); |
| int needle_len = needle.length(); |
| DCHECK(needle.IsFlat()); |
| DCHECK_LT(0, needle_len); |
| |
| if (index + needle_len > subject->length()) { |
| return RegExp::RE_FAILURE; |
| } |
| |
| for (int i = 0; i < output_size; i += 2) { |
| String::FlatContent needle_content = needle.GetFlatContent(no_gc); |
| String::FlatContent subject_content = subject->GetFlatContent(no_gc); |
| DCHECK(needle_content.IsFlat()); |
| DCHECK(subject_content.IsFlat()); |
| // dispatch on type of strings |
| index = |
| (needle_content.IsOneByte() |
| ? (subject_content.IsOneByte() |
| ? SearchString(isolate, subject_content.ToOneByteVector(), |
| needle_content.ToOneByteVector(), index) |
| : SearchString(isolate, subject_content.ToUC16Vector(), |
| needle_content.ToOneByteVector(), index)) |
| : (subject_content.IsOneByte() |
| ? SearchString(isolate, subject_content.ToOneByteVector(), |
| needle_content.ToUC16Vector(), index) |
| : SearchString(isolate, subject_content.ToUC16Vector(), |
| needle_content.ToUC16Vector(), index))); |
| if (index == -1) { |
| return i / 2; // Return number of matches. |
| } else { |
| output[i] = index; |
| output[i + 1] = index + needle_len; |
| index += needle_len; |
| } |
| } |
| return output_size / 2; |
| } |
| |
| Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> subject, int index, |
| Handle<RegExpMatchInfo> last_match_info) { |
| static const int kNumRegisters = 2; |
| STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); |
| int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); |
| |
| int res = |
| AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters); |
| |
| if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value(); |
| |
| DCHECK_EQ(res, RegExp::RE_SUCCESS); |
| SealHandleScope shs(isolate); |
| SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0], |
| output_registers[1]); |
| return last_match_info; |
| } |
| |
| // Irregexp implementation. |
| |
| // Ensures that the regexp object contains a compiled version of the |
| // source for either one-byte or two-byte subject strings. |
| // If the compiled version doesn't already exist, it is compiled |
| // from the source pattern. |
| // If compilation fails, an exception is thrown and this function |
| // returns false. |
| bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> sample_subject, |
| bool is_one_byte) { |
| Object compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); |
| if (compiled_code != Smi::FromInt(JSRegExp::kUninitializedValue)) { |
| DCHECK(FLAG_regexp_interpret_all ? compiled_code.IsByteArray() |
| : compiled_code.IsCode()); |
| return true; |
| } |
| return CompileIrregexp(isolate, re, sample_subject, is_one_byte); |
| } |
| |
| bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> sample_subject, |
| bool is_one_byte) { |
| // Compile the RegExp. |
| Zone zone(isolate->allocator(), ZONE_NAME); |
| PostponeInterruptsScope postpone(isolate); |
| #ifdef DEBUG |
| Object entry = re->DataAt(JSRegExp::code_index(is_one_byte)); |
| // When arriving here entry can only be a smi representing an uncompiled |
| // regexp. |
| DCHECK(entry.IsSmi()); |
| int entry_value = Smi::ToInt(entry); |
| DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value); |
| #endif |
| |
| JSRegExp::Flags flags = re->GetFlags(); |
| |
| Handle<String> pattern(re->Pattern(), isolate); |
| pattern = String::Flatten(isolate, pattern); |
| RegExpCompileData compile_data; |
| FlatStringReader reader(isolate, pattern); |
| if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, |
| &compile_data)) { |
| // Throw an exception if we fail to parse the pattern. |
| // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. |
| USE(ThrowRegExpException(isolate, re, pattern, compile_data.error)); |
| return false; |
| } |
| const bool compilation_succeeded = |
| Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject, |
| is_one_byte); |
| if (!compilation_succeeded) { |
| DCHECK(!compile_data.error.is_null()); |
| ThrowRegExpException(isolate, re, compile_data.error); |
| return false; |
| } |
| |
| Handle<FixedArray> data = |
| Handle<FixedArray>(FixedArray::cast(re->data()), isolate); |
| data->set(JSRegExp::code_index(is_one_byte), compile_data.code); |
| SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map); |
| int register_max = IrregexpMaxRegisterCount(*data); |
| if (compile_data.register_count > register_max) { |
| SetIrregexpMaxRegisterCount(*data, compile_data.register_count); |
| } |
| |
| return true; |
| } |
| |
| int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) { |
| return Smi::cast(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex)).value(); |
| } |
| |
| void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) { |
| re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); |
| } |
| |
| void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray re, |
| Handle<FixedArray> value) { |
| if (value.is_null()) { |
| re.set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero); |
| } else { |
| re.set(JSRegExp::kIrregexpCaptureNameMapIndex, *value); |
| } |
| } |
| |
| int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) { |
| return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex)); |
| } |
| |
| int RegExpImpl::IrregexpNumberOfRegisters(FixedArray re) { |
| return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex)); |
| } |
| |
| ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) { |
| return ByteArray::cast(re.get(JSRegExp::code_index(is_one_byte))); |
| } |
| |
| Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) { |
| return Code::cast(re.get(JSRegExp::code_index(is_one_byte))); |
| } |
| |
| void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, |
| Handle<String> pattern, |
| JSRegExp::Flags flags, int capture_count) { |
| // Initialize compiled code entries to null. |
| isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern, |
| flags, capture_count); |
| } |
| |
| // static |
| int RegExp::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp, |
| Handle<String> subject) { |
| DCHECK(subject->IsFlat()); |
| |
| // Check representation of the underlying storage. |
| bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); |
| if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject, |
| is_one_byte)) { |
| return -1; |
| } |
| |
| DisallowHeapAllocation no_gc; |
| FixedArray data = FixedArray::cast(regexp->data()); |
| if (FLAG_regexp_interpret_all) { |
| // Byte-code regexp needs space allocated for all its registers. |
| // The result captures are copied to the start of the registers array |
| // if the match succeeds. This way those registers are not clobbered |
| // when we set the last match info from last successful match. |
| return RegExpImpl::IrregexpNumberOfRegisters(data) + |
| (RegExpImpl::IrregexpNumberOfCaptures(data) + 1) * 2; |
| } else { |
| // Native regexp only needs room to output captures. Registers are handled |
| // internally. |
| return (RegExpImpl::IrregexpNumberOfCaptures(data) + 1) * 2; |
| } |
| } |
| |
| int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, |
| Handle<String> subject, int index, |
| int32_t* output, int output_size) { |
| Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); |
| |
| DCHECK_LE(0, index); |
| DCHECK_LE(index, subject->length()); |
| DCHECK(subject->IsFlat()); |
| |
| bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); |
| |
| if (!FLAG_regexp_interpret_all) { |
| DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
| do { |
| EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte); |
| Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); |
| // The stack is used to allocate registers for the compiled regexp code. |
| // This means that in case of failure, the output registers array is left |
| // untouched and contains the capture results from the previous successful |
| // match. We can use that to set the last match info lazily. |
| int res = NativeRegExpMacroAssembler::Match(code, subject, output, |
| output_size, index, isolate); |
| if (res != NativeRegExpMacroAssembler::RETRY) { |
| DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || |
| isolate->has_pending_exception()); |
| STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == |
| RegExp::RE_SUCCESS); |
| STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == |
| RegExp::RE_FAILURE); |
| STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) == |
| RegExp::RE_EXCEPTION); |
| return res; |
| } |
| // If result is RETRY, the string has changed representation, and we |
| // must restart from scratch. |
| // In this case, it means we must make sure we are prepared to handle |
| // the, potentially, different subject (the string can switch between |
| // being internal and external, and even between being Latin1 and UC16, |
| // but the characters are always the same). |
| RegExp::IrregexpPrepare(isolate, regexp, subject); |
| is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); |
| } while (true); |
| UNREACHABLE(); |
| } else { |
| DCHECK(FLAG_regexp_interpret_all); |
| DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); |
| // We must have done EnsureCompiledIrregexp, so we can get the number of |
| // registers. |
| int number_of_capture_registers = |
| (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; |
| int32_t* raw_output = &output[number_of_capture_registers]; |
| |
| do { |
| // We do not touch the actual capture result registers until we know there |
| // has been a match so that we can use those capture results to set the |
| // last match info. |
| for (int i = number_of_capture_registers - 1; i >= 0; i--) { |
| raw_output[i] = -1; |
| } |
| Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), |
| isolate); |
| |
| IrregexpInterpreter::Result result = IrregexpInterpreter::Match( |
| isolate, byte_codes, subject, raw_output, index); |
| DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION, |
| isolate->has_pending_exception()); |
| |
| switch (result) { |
| case IrregexpInterpreter::SUCCESS: |
| // Copy capture results to the start of the registers array. |
| MemCopy(output, raw_output, |
| number_of_capture_registers * sizeof(int32_t)); |
| return result; |
| case IrregexpInterpreter::EXCEPTION: |
| case IrregexpInterpreter::FAILURE: |
| return result; |
| case IrregexpInterpreter::RETRY: |
| // The string has changed representation, and we must restart the |
| // match. |
| is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); |
| EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte); |
| break; |
| } |
| } while (true); |
| UNREACHABLE(); |
| } |
| } |
| |
| MaybeHandle<Object> RegExpImpl::IrregexpExec( |
| Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, |
| int previous_index, Handle<RegExpMatchInfo> last_match_info) { |
| DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); |
| |
| subject = String::Flatten(isolate, subject); |
| |
| // Prepare space for the return values. |
| #ifdef DEBUG |
| if (FLAG_regexp_interpret_all && FLAG_trace_regexp_bytecodes) { |
| String pattern = regexp->Pattern(); |
| PrintF("\n\nRegexp match: /%s/\n\n", pattern.ToCString().get()); |
| PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); |
| } |
| #endif |
| int required_registers = RegExp::IrregexpPrepare(isolate, regexp, subject); |
| if (required_registers < 0) { |
| // Compiling failed with an exception. |
| DCHECK(isolate->has_pending_exception()); |
| return MaybeHandle<Object>(); |
| } |
| |
| int32_t* output_registers = nullptr; |
| if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { |
| output_registers = NewArray<int32_t>(required_registers); |
| } |
| std::unique_ptr<int32_t[]> auto_release(output_registers); |
| if (output_registers == nullptr) { |
| output_registers = isolate->jsregexp_static_offsets_vector(); |
| } |
| |
| int res = |
| RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index, |
| output_registers, required_registers); |
| if (res == RegExp::RE_SUCCESS) { |
| int capture_count = |
| IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); |
| return RegExp::SetLastMatchInfo(isolate, last_match_info, subject, |
| capture_count, output_registers); |
| } |
| if (res == RegExp::RE_EXCEPTION) { |
| DCHECK(isolate->has_pending_exception()); |
| return MaybeHandle<Object>(); |
| } |
| DCHECK(res == RegExp::RE_FAILURE); |
| return isolate->factory()->null_value(); |
| } |
| |
| // static |
| Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo( |
| Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, |
| Handle<String> subject, int capture_count, int32_t* match) { |
| // This is the only place where match infos can grow. If, after executing the |
| // regexp, RegExpExecStub finds that the match info is too small, it restarts |
| // execution in RegExpImpl::Exec, which finally grows the match info right |
| // here. |
| |
| int capture_register_count = (capture_count + 1) * 2; |
| Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures( |
| isolate, last_match_info, capture_register_count); |
| result->SetNumberOfCaptureRegisters(capture_register_count); |
| |
| if (*result != *last_match_info) { |
| if (*last_match_info == *isolate->regexp_last_match_info()) { |
| // This inner condition is only needed for special situations like the |
| // regexp fuzzer, where we pass our own custom RegExpMatchInfo to |
| // RegExpImpl::Exec; there actually want to bypass the Isolate's match |
| // info and execute the regexp without side effects. |
| isolate->native_context()->set_regexp_last_match_info(*result); |
| } |
| } |
| |
| DisallowHeapAllocation no_allocation; |
| if (match != nullptr) { |
| for (int i = 0; i < capture_register_count; i += 2) { |
| result->SetCapture(i, match[i]); |
| result->SetCapture(i + 1, match[i + 1]); |
| } |
| } |
| result->SetLastSubject(*subject); |
| result->SetLastInput(*subject); |
| return result; |
| } |
| |
| // static |
| void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) { |
| DotPrinter::DotPrint(label, node); |
| } |
| |
| namespace { |
| |
| // Returns true if we've either generated too much irregex code within this |
| // isolate, or the pattern string is too long. |
| bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) { |
| // Limit the space regexps take up on the heap. In order to limit this we |
| // would like to keep track of the amount of regexp code on the heap. This |
| // is not tracked, however. As a conservative approximation we track the |
| // total regexp code compiled including code that has subsequently been freed |
| // and the total executable memory at any point. |
| static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB; |
| static constexpr size_t kRegExpCompiledLimit = 1 * MB; |
| |
| Heap* heap = isolate->heap(); |
| if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true; |
| return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit && |
| heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit); |
| } |
| |
| } // namespace |
| |
| // static |
| bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone, |
| RegExpCompileData* data, JSRegExp::Flags flags, |
| Handle<String> pattern, |
| Handle<String> sample_subject, |
| bool is_one_byte) { |
| return RegExpImpl::Compile(isolate, zone, data, flags, pattern, |
| sample_subject, is_one_byte); |
| } |
| |
| bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data, |
| JSRegExp::Flags flags, Handle<String> pattern, |
| Handle<String> sample_subject, bool is_one_byte) { |
| if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
| data->error = |
| isolate->factory()->NewStringFromAsciiChecked("RegExp too big"); |
| return false; |
| } |
| |
| bool is_sticky = IsSticky(flags); |
| bool is_global = IsGlobal(flags); |
| bool is_unicode = IsUnicode(flags); |
| RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte); |
| |
| if (compiler.optimize()) { |
| compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern)); |
| } |
| |
| // Sample some characters from the middle of the string. |
| static const int kSampleSize = 128; |
| |
| sample_subject = String::Flatten(isolate, sample_subject); |
| int chars_sampled = 0; |
| int half_way = (sample_subject->length() - kSampleSize) / 2; |
| for (int i = Max(0, half_way); |
| i < sample_subject->length() && chars_sampled < kSampleSize; |
| i++, chars_sampled++) { |
| compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); |
| } |
| |
| // Wrap the body of the regexp in capture #0. |
| RegExpNode* captured_body = |
| RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept()); |
| RegExpNode* node = captured_body; |
| bool is_end_anchored = data->tree->IsAnchoredAtEnd(); |
| bool is_start_anchored = data->tree->IsAnchoredAtStart(); |
| int max_length = data->tree->max_match(); |
| if (!is_start_anchored && !is_sticky) { |
| // Add a .*? at the beginning, outside the body capture, unless |
| // this expression is anchored at the beginning or sticky. |
| JSRegExp::Flags default_flags = JSRegExp::Flags(); |
| RegExpNode* loop_node = RegExpQuantifier::ToNode( |
| 0, RegExpTree::kInfinity, false, |
| new (zone) RegExpCharacterClass('*', default_flags), &compiler, |
| captured_body, data->contains_anchor); |
| |
| if (data->contains_anchor) { |
| // Unroll loop once, to take care of the case that might start |
| // at the start of input. |
| ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone); |
| first_step_node->AddAlternative(GuardedAlternative(captured_body)); |
| first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( |
| new (zone) RegExpCharacterClass('*', default_flags), false, |
| loop_node))); |
| node = first_step_node; |
| } else { |
| node = loop_node; |
| } |
| } |
| if (is_one_byte) { |
| node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); |
| // Do it again to propagate the new nodes to places where they were not |
| // put because they had not been calculated yet. |
| if (node != nullptr) { |
| node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); |
| } |
| } else if (is_unicode && (is_global || is_sticky)) { |
| node = RegExpCompiler::OptionallyStepBackToLeadSurrogate(&compiler, node, |
| flags); |
| } |
| |
| if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone); |
| data->node = node; |
| Analysis analysis(isolate, is_one_byte); |
| analysis.EnsureAnalyzed(node); |
| if (analysis.has_failed()) { |
| data->error = |
| isolate->factory()->NewStringFromAsciiChecked(analysis.error_message()); |
| return false; |
| } |
| |
| // Create the correct assembler for the architecture. |
| std::unique_ptr<RegExpMacroAssembler> macro_assembler; |
| if (!FLAG_regexp_interpret_all) { |
| // Native regexp implementation. |
| DCHECK(!FLAG_jitless); |
| |
| NativeRegExpMacroAssembler::Mode mode = |
| is_one_byte ? NativeRegExpMacroAssembler::LATIN1 |
| : NativeRegExpMacroAssembler::UC16; |
| |
| #if V8_TARGET_ARCH_IA32 |
| macro_assembler.reset(new RegExpMacroAssemblerIA32( |
| isolate, zone, mode, (data->capture_count + 1) * 2)); |
| #elif V8_TARGET_ARCH_X64 |
| macro_assembler.reset(new RegExpMacroAssemblerX64( |
| isolate, zone, mode, (data->capture_count + 1) * 2)); |
| #elif V8_TARGET_ARCH_ARM |
| macro_assembler.reset(new RegExpMacroAssemblerARM( |
| isolate, zone, mode, (data->capture_count + 1) * 2)); |
| #elif V8_TARGET_ARCH_ARM64 |
| macro_assembler.reset(new RegExpMacroAssemblerARM64( |
| isolate, zone, mode, (data->capture_count + 1) * 2)); |
| #elif V8_TARGET_ARCH_S390 |
| macro_assembler.reset(new RegExpMacroAssemblerS390( |
| isolate, zone, mode, (data->capture_count + 1) * 2)); |
| #elif V8_TARGET_ARCH_PPC |
| macro_assembler.reset(new RegExpMacroAssemblerPPC( |
| isolate, zone, mode, (data->capture_count + 1) * 2)); |
| #elif V8_TARGET_ARCH_MIPS |
| macro_assembler.reset(new RegExpMacroAssemblerMIPS( |
| isolate, zone, mode, (data->capture_count + 1) * 2)); |
| #elif V8_TARGET_ARCH_MIPS64 |
| macro_assembler.reset(new RegExpMacroAssemblerMIPS( |
| isolate, zone, mode, (data->capture_count + 1) * 2)); |
| #else |
| #error "Unsupported architecture" |
| #endif |
| } else { |
| DCHECK(FLAG_regexp_interpret_all); |
| |
| // Interpreted regexp implementation. |
| macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone)); |
| } |
| |
| macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern)); |
| |
| // Inserted here, instead of in Assembler, because it depends on information |
| // in the AST that isn't replicated in the Node structure. |
| static const int kMaxBacksearchLimit = 1024; |
| if (is_end_anchored && !is_start_anchored && !is_sticky && |
| max_length < kMaxBacksearchLimit) { |
| macro_assembler->SetCurrentPositionFromEnd(max_length); |
| } |
| |
| if (is_global) { |
| RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; |
| if (data->tree->min_match() > 0) { |
| mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; |
| } else if (is_unicode) { |
| mode = RegExpMacroAssembler::GLOBAL_UNICODE; |
| } |
| macro_assembler->set_global_mode(mode); |
| } |
| |
| RegExpCompiler::CompilationResult result = compiler.Assemble( |
| isolate, macro_assembler.get(), node, data->capture_count, pattern); |
| |
| if (FLAG_correctness_fuzzer_suppressions && |
| strncmp(result.error_message, "Stack overflow", 15) == 0) { |
| FATAL("Aborting on stack overflow"); |
| } |
| |
| if (result.error_message != nullptr) { |
| data->error = |
| isolate->factory()->NewStringFromAsciiChecked(result.error_message); |
| } |
| data->code = result.code; |
| data->register_count = result.num_registers; |
| |
| return result.Succeeded(); |
| } |
| |
| RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp, |
| Handle<String> subject, Isolate* isolate) |
| : register_array_(nullptr), |
| register_array_size_(0), |
| regexp_(regexp), |
| subject_(subject), |
| isolate_(isolate) { |
| bool interpreted = FLAG_regexp_interpret_all; |
| |
| if (regexp_->TypeTag() == JSRegExp::ATOM) { |
| static const int kAtomRegistersPerMatch = 2; |
| registers_per_match_ = kAtomRegistersPerMatch; |
| // There is no distinction between interpreted and native for atom regexps. |
| interpreted = false; |
| } else { |
| registers_per_match_ = RegExp::IrregexpPrepare(isolate_, regexp_, subject_); |
| if (registers_per_match_ < 0) { |
| num_matches_ = -1; // Signal exception. |
| return; |
| } |
| } |
| |
| DCHECK(IsGlobal(regexp->GetFlags())); |
| if (!interpreted) { |
| register_array_size_ = |
| Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); |
| max_matches_ = register_array_size_ / registers_per_match_; |
| } else { |
| // Global loop in interpreted regexp is not implemented. We choose |
| // the size of the offsets vector so that it can only store one match. |
| register_array_size_ = registers_per_match_; |
| max_matches_ = 1; |
| } |
| |
| if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { |
| register_array_ = NewArray<int32_t>(register_array_size_); |
| } else { |
| register_array_ = isolate->jsregexp_static_offsets_vector(); |
| } |
| |
| // Set state so that fetching the results the first time triggers a call |
| // to the compiled regexp. |
| current_match_index_ = max_matches_ - 1; |
| num_matches_ = max_matches_; |
| DCHECK_LE(2, registers_per_match_); // Each match has at least one capture. |
| DCHECK_GE(register_array_size_, registers_per_match_); |
| int32_t* last_match = |
| ®ister_array_[current_match_index_ * registers_per_match_]; |
| last_match[0] = -1; |
| last_match[1] = 0; |
| } |
| |
| RegExpGlobalCache::~RegExpGlobalCache() { |
| // Deallocate the register array if we allocated it in the constructor |
| // (as opposed to using the existing jsregexp_static_offsets_vector). |
| if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { |
| DeleteArray(register_array_); |
| } |
| } |
| |
| int RegExpGlobalCache::AdvanceZeroLength(int last_index) { |
| if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() && |
| unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && |
| unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { |
| // Advance over the surrogate pair. |
| return last_index + 2; |
| } |
| return last_index + 1; |
| } |
| |
| int32_t* RegExpGlobalCache::FetchNext() { |
| current_match_index_++; |
| if (current_match_index_ >= num_matches_) { |
| // Current batch of results exhausted. |
| // Fail if last batch was not even fully filled. |
| if (num_matches_ < max_matches_) { |
| num_matches_ = 0; // Signal failed match. |
| return nullptr; |
| } |
| |
| int32_t* last_match = |
| ®ister_array_[(current_match_index_ - 1) * registers_per_match_]; |
| int last_end_index = last_match[1]; |
| |
| if (regexp_->TypeTag() == JSRegExp::ATOM) { |
| num_matches_ = |
| RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index, |
| register_array_, register_array_size_); |
| } else { |
| int last_start_index = last_match[0]; |
| if (last_start_index == last_end_index) { |
| // Zero-length match. Advance by one code point. |
| last_end_index = AdvanceZeroLength(last_end_index); |
| } |
| if (last_end_index > subject_->length()) { |
| num_matches_ = 0; // Signal failed match. |
| return nullptr; |
| } |
| num_matches_ = RegExpImpl::IrregexpExecRaw( |
| isolate_, regexp_, subject_, last_end_index, register_array_, |
| register_array_size_); |
| } |
| |
| if (num_matches_ <= 0) return nullptr; |
| current_match_index_ = 0; |
| return register_array_; |
| } else { |
| return ®ister_array_[current_match_index_ * registers_per_match_]; |
| } |
| } |
| |
| int32_t* RegExpGlobalCache::LastSuccessfulMatch() { |
| int index = current_match_index_ * registers_per_match_; |
| if (num_matches_ == 0) { |
| // After a failed match we shift back by one result. |
| index -= registers_per_match_; |
| } |
| return ®ister_array_[index]; |
| } |
| |
| Object RegExpResultsCache::Lookup(Heap* heap, String key_string, |
| Object key_pattern, |
| FixedArray* last_match_cache, |
| ResultsCacheType type) { |
| FixedArray cache; |
| if (!key_string.IsInternalizedString()) return Smi::kZero; |
| if (type == STRING_SPLIT_SUBSTRINGS) { |
| DCHECK(key_pattern.IsString()); |
| if (!key_pattern.IsInternalizedString()) return Smi::kZero; |
| cache = heap->string_split_cache(); |
| } else { |
| DCHECK(type == REGEXP_MULTIPLE_INDICES); |
| DCHECK(key_pattern.IsFixedArray()); |
| cache = heap->regexp_multiple_cache(); |
| } |
| |
| uint32_t hash = key_string.Hash(); |
| uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & |
| ~(kArrayEntriesPerCacheEntry - 1)); |
| if (cache.get(index + kStringOffset) != key_string || |
| cache.get(index + kPatternOffset) != key_pattern) { |
| index = |
| ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); |
| if (cache.get(index + kStringOffset) != key_string || |
| cache.get(index + kPatternOffset) != key_pattern) { |
| return Smi::kZero; |
| } |
| } |
| |
| *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset)); |
| return cache.get(index + kArrayOffset); |
| } |
| |
| void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string, |
| Handle<Object> key_pattern, |
| Handle<FixedArray> value_array, |
| Handle<FixedArray> last_match_cache, |
| ResultsCacheType type) { |
| Factory* factory = isolate->factory(); |
| Handle<FixedArray> cache; |
| if (!key_string->IsInternalizedString()) return; |
| if (type == STRING_SPLIT_SUBSTRINGS) { |
| DCHECK(key_pattern->IsString()); |
| if (!key_pattern->IsInternalizedString()) return; |
| cache = factory->string_split_cache(); |
| } else { |
| DCHECK(type == REGEXP_MULTIPLE_INDICES); |
| DCHECK(key_pattern->IsFixedArray()); |
| cache = factory->regexp_multiple_cache(); |
| } |
| |
| uint32_t hash = key_string->Hash(); |
| uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & |
| ~(kArrayEntriesPerCacheEntry - 1)); |
| if (cache->get(index + kStringOffset) == Smi::kZero) { |
| cache->set(index + kStringOffset, *key_string); |
| cache->set(index + kPatternOffset, *key_pattern); |
| cache->set(index + kArrayOffset, *value_array); |
| cache->set(index + kLastMatchOffset, *last_match_cache); |
| } else { |
| uint32_t index2 = |
| ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); |
| if (cache->get(index2 + kStringOffset) == Smi::kZero) { |
| cache->set(index2 + kStringOffset, *key_string); |
| cache->set(index2 + kPatternOffset, *key_pattern); |
| cache->set(index2 + kArrayOffset, *value_array); |
| cache->set(index2 + kLastMatchOffset, *last_match_cache); |
| } else { |
| cache->set(index2 + kStringOffset, Smi::kZero); |
| cache->set(index2 + kPatternOffset, Smi::kZero); |
| cache->set(index2 + kArrayOffset, Smi::kZero); |
| cache->set(index2 + kLastMatchOffset, Smi::kZero); |
| cache->set(index + kStringOffset, *key_string); |
| cache->set(index + kPatternOffset, *key_pattern); |
| cache->set(index + kArrayOffset, *value_array); |
| cache->set(index + kLastMatchOffset, *last_match_cache); |
| } |
| } |
| // If the array is a reasonably short list of substrings, convert it into a |
| // list of internalized strings. |
| if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) { |
| for (int i = 0; i < value_array->length(); i++) { |
| Handle<String> str(String::cast(value_array->get(i)), isolate); |
| Handle<String> internalized_str = factory->InternalizeString(str); |
| value_array->set(i, *internalized_str); |
| } |
| } |
| // Convert backing store to a copy-on-write array. |
| value_array->set_map_no_write_barrier( |
| ReadOnlyRoots(isolate).fixed_cow_array_map()); |
| } |
| |
| void RegExpResultsCache::Clear(FixedArray cache) { |
| for (int i = 0; i < kRegExpResultsCacheSize; i++) { |
| cache.set(i, Smi::kZero); |
| } |
| } |
| |
| } // namespace internal |
| } // namespace v8 |