| // Copyright 2019 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef V8_REGEXP_REGEXP_COMPILER_H_ |
| #define V8_REGEXP_REGEXP_COMPILER_H_ |
| |
| #include <bitset> |
| |
| #include "src/base/small-vector.h" |
| #include "src/regexp/regexp-nodes.h" |
| |
| namespace v8 { |
| namespace internal { |
| |
| class DynamicBitSet; |
| class Isolate; |
| |
| namespace regexp_compiler_constants { |
| |
| // The '2' variant is has inclusive from and exclusive to. |
| // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, |
| // which include WhiteSpace (7.2) or LineTerminator (7.3) values. |
| constexpr uc32 kRangeEndMarker = 0x110000; |
| constexpr int kSpaceRanges[] = { |
| '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, |
| 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, |
| 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; |
| constexpr int kSpaceRangeCount = arraysize(kSpaceRanges); |
| |
| constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', |
| '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; |
| constexpr int kWordRangeCount = arraysize(kWordRanges); |
| constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; |
| constexpr int kDigitRangeCount = arraysize(kDigitRanges); |
| constexpr int kSurrogateRanges[] = {kLeadSurrogateStart, |
| kLeadSurrogateStart + 1, kRangeEndMarker}; |
| constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges); |
| constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, |
| 0x2028, 0x202A, kRangeEndMarker}; |
| constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); |
| |
| // More makes code generation slower, less makes V8 benchmark score lower. |
| constexpr int kMaxLookaheadForBoyerMoore = 8; |
| // In a 3-character pattern you can maximally step forwards 3 characters |
| // at a time, which is not always enough to pay for the extra logic. |
| constexpr int kPatternTooShortForBoyerMoore = 2; |
| |
| } // namespace regexp_compiler_constants |
| |
| inline bool IgnoreCase(JSRegExp::Flags flags) { |
| return (flags & JSRegExp::kIgnoreCase) != 0; |
| } |
| |
| inline bool IsUnicode(JSRegExp::Flags flags) { |
| return (flags & JSRegExp::kUnicode) != 0; |
| } |
| |
| inline bool IsSticky(JSRegExp::Flags flags) { |
| return (flags & JSRegExp::kSticky) != 0; |
| } |
| |
| inline bool IsGlobal(JSRegExp::Flags flags) { |
| return (flags & JSRegExp::kGlobal) != 0; |
| } |
| |
| inline bool DotAll(JSRegExp::Flags flags) { |
| return (flags & JSRegExp::kDotAll) != 0; |
| } |
| |
| inline bool Multiline(JSRegExp::Flags flags) { |
| return (flags & JSRegExp::kMultiline) != 0; |
| } |
| |
| inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { |
| // Both unicode and ignore_case flags are set. We need to use ICU to find |
| // the closure over case equivalents. |
| return IsUnicode(flags) && IgnoreCase(flags); |
| } |
| |
| // Details of a quick mask-compare check that can look ahead in the |
| // input stream. |
| class QuickCheckDetails { |
| public: |
| QuickCheckDetails() |
| : characters_(0), mask_(0), value_(0), cannot_match_(false) {} |
| explicit QuickCheckDetails(int characters) |
| : characters_(characters), mask_(0), value_(0), cannot_match_(false) {} |
| bool Rationalize(bool one_byte); |
| // Merge in the information from another branch of an alternation. |
| void Merge(QuickCheckDetails* other, int from_index); |
| // Advance the current position by some amount. |
| void Advance(int by, bool one_byte); |
| void Clear(); |
| bool cannot_match() { return cannot_match_; } |
| void set_cannot_match() { cannot_match_ = true; } |
| struct Position { |
| Position() : mask(0), value(0), determines_perfectly(false) {} |
| uc32 mask; |
| uc32 value; |
| bool determines_perfectly; |
| }; |
| int characters() { return characters_; } |
| void set_characters(int characters) { characters_ = characters; } |
| Position* positions(int index) { |
| DCHECK_LE(0, index); |
| DCHECK_GT(characters_, index); |
| return positions_ + index; |
| } |
| uint32_t mask() { return mask_; } |
| uint32_t value() { return value_; } |
| |
| private: |
| // How many characters do we have quick check information from. This is |
| // the same for all branches of a choice node. |
| int characters_; |
| Position positions_[4]; |
| // These values are the condensate of the above array after Rationalize(). |
| uint32_t mask_; |
| uint32_t value_; |
| // If set to true, there is no way this quick check can match at all. |
| // E.g., if it requires to be at the start of the input, and isn't. |
| bool cannot_match_; |
| }; |
| |
| // Improve the speed that we scan for an initial point where a non-anchored |
| // regexp can match by using a Boyer-Moore-like table. This is done by |
| // identifying non-greedy non-capturing loops in the nodes that eat any |
| // character one at a time. For example in the middle of the regexp |
| // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly |
| // inserted at the start of any non-anchored regexp. |
| // |
| // When we have found such a loop we look ahead in the nodes to find the set of |
| // characters that can come at given distances. For example for the regexp |
| // /.?foo/ we know that there are at least 3 characters ahead of us, and the |
| // sets of characters that can occur are [any, [f, o], [o]]. We find a range in |
| // the lookahead info where the set of characters is reasonably constrained. In |
| // our example this is from index 1 to 2 (0 is not constrained). We can now |
| // look 3 characters ahead and if we don't find one of [f, o] (the union of |
| // [f, o] and [o]) then we can skip forwards by the range size (in this case 2). |
| // |
| // For Unicode input strings we do the same, but modulo 128. |
| // |
| // We also look at the first string fed to the regexp and use that to get a hint |
| // of the character frequencies in the inputs. This affects the assessment of |
| // whether the set of characters is 'reasonably constrained'. |
| // |
| // We also have another lookahead mechanism (called quick check in the code), |
| // which uses a wide load of multiple characters followed by a mask and compare |
| // to determine whether a match is possible at this point. |
| enum ContainedInLattice { |
| kNotYet = 0, |
| kLatticeIn = 1, |
| kLatticeOut = 2, |
| kLatticeUnknown = 3 // Can also mean both in and out. |
| }; |
| |
| inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { |
| return static_cast<ContainedInLattice>(a | b); |
| } |
| |
| class BoyerMoorePositionInfo : public ZoneObject { |
| public: |
| bool at(int i) const { return map_[i]; } |
| |
| static constexpr int kMapSize = 128; |
| static constexpr int kMask = kMapSize - 1; |
| |
| int map_count() const { return map_count_; } |
| |
| void Set(int character); |
| void SetInterval(const Interval& interval); |
| void SetAll(); |
| |
| bool is_non_word() { return w_ == kLatticeOut; } |
| bool is_word() { return w_ == kLatticeIn; } |
| |
| using Bitset = std::bitset<kMapSize>; |
| Bitset raw_bitset() const { return map_; } |
| |
| private: |
| Bitset map_; |
| int map_count_ = 0; // Number of set bits in the map. |
| ContainedInLattice w_ = kNotYet; // The \w character class. |
| }; |
| |
| class BoyerMooreLookahead : public ZoneObject { |
| public: |
| BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); |
| |
| int length() { return length_; } |
| int max_char() { return max_char_; } |
| RegExpCompiler* compiler() { return compiler_; } |
| |
| int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); } |
| |
| BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } |
| |
| void Set(int map_number, int character) { |
| if (character > max_char_) return; |
| BoyerMoorePositionInfo* info = bitmaps_->at(map_number); |
| info->Set(character); |
| } |
| |
| void SetInterval(int map_number, const Interval& interval) { |
| if (interval.from() > max_char_) return; |
| BoyerMoorePositionInfo* info = bitmaps_->at(map_number); |
| if (interval.to() > max_char_) { |
| info->SetInterval(Interval(interval.from(), max_char_)); |
| } else { |
| info->SetInterval(interval); |
| } |
| } |
| |
| void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); } |
| |
| void SetRest(int from_map) { |
| for (int i = from_map; i < length_; i++) SetAll(i); |
| } |
| void EmitSkipInstructions(RegExpMacroAssembler* masm); |
| |
| private: |
| // This is the value obtained by EatsAtLeast. If we do not have at least this |
| // many characters left in the sample string then the match is bound to fail. |
| // Therefore it is OK to read a character this far ahead of the current match |
| // point. |
| int length_; |
| RegExpCompiler* compiler_; |
| // 0xff for Latin1, 0xffff for UTF-16. |
| int max_char_; |
| ZoneList<BoyerMoorePositionInfo*>* bitmaps_; |
| |
| int GetSkipTable(int min_lookahead, int max_lookahead, |
| Handle<ByteArray> boolean_skip_table); |
| bool FindWorthwhileInterval(int* from, int* to); |
| int FindBestInterval(int max_number_of_chars, int old_biggest_points, |
| int* from, int* to); |
| }; |
| |
| // There are many ways to generate code for a node. This class encapsulates |
| // the current way we should be generating. In other words it encapsulates |
| // the current state of the code generator. The effect of this is that we |
| // generate code for paths that the matcher can take through the regular |
| // expression. A given node in the regexp can be code-generated several times |
| // as it can be part of several traces. For example for the regexp: |
| // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part |
| // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code |
| // to match foo is generated only once (the traces have a common prefix). The |
| // code to store the capture is deferred and generated (twice) after the places |
| // where baz has been matched. |
| class Trace { |
| public: |
| // A value for a property that is either known to be true, know to be false, |
| // or not known. |
| enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 }; |
| |
| class DeferredAction { |
| public: |
| DeferredAction(ActionNode::ActionType action_type, int reg) |
| : action_type_(action_type), reg_(reg), next_(nullptr) {} |
| DeferredAction* next() { return next_; } |
| bool Mentions(int reg); |
| int reg() { return reg_; } |
| ActionNode::ActionType action_type() { return action_type_; } |
| |
| private: |
| ActionNode::ActionType action_type_; |
| int reg_; |
| DeferredAction* next_; |
| friend class Trace; |
| }; |
| |
| class DeferredCapture : public DeferredAction { |
| public: |
| DeferredCapture(int reg, bool is_capture, Trace* trace) |
| : DeferredAction(ActionNode::STORE_POSITION, reg), |
| cp_offset_(trace->cp_offset()), |
| is_capture_(is_capture) {} |
| int cp_offset() { return cp_offset_; } |
| bool is_capture() { return is_capture_; } |
| |
| private: |
| int cp_offset_; |
| bool is_capture_; |
| void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } |
| }; |
| |
| class DeferredSetRegisterForLoop : public DeferredAction { |
| public: |
| DeferredSetRegisterForLoop(int reg, int value) |
| : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg), |
| value_(value) {} |
| int value() { return value_; } |
| |
| private: |
| int value_; |
| }; |
| |
| class DeferredClearCaptures : public DeferredAction { |
| public: |
| explicit DeferredClearCaptures(Interval range) |
| : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {} |
| Interval range() { return range_; } |
| |
| private: |
| Interval range_; |
| }; |
| |
| class DeferredIncrementRegister : public DeferredAction { |
| public: |
| explicit DeferredIncrementRegister(int reg) |
| : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {} |
| }; |
| |
| Trace() |
| : cp_offset_(0), |
| actions_(nullptr), |
| backtrack_(nullptr), |
| stop_node_(nullptr), |
| loop_label_(nullptr), |
| characters_preloaded_(0), |
| bound_checked_up_to_(0), |
| flush_budget_(100), |
| at_start_(UNKNOWN) {} |
| |
| // End the trace. This involves flushing the deferred actions in the trace |
| // and pushing a backtrack location onto the backtrack stack. Once this is |
| // done we can start a new trace or go to one that has already been |
| // generated. |
| void Flush(RegExpCompiler* compiler, RegExpNode* successor); |
| int cp_offset() { return cp_offset_; } |
| DeferredAction* actions() { return actions_; } |
| // A trivial trace is one that has no deferred actions or other state that |
| // affects the assumptions used when generating code. There is no recorded |
| // backtrack location in a trivial trace, so with a trivial trace we will |
| // generate code that, on a failure to match, gets the backtrack location |
| // from the backtrack stack rather than using a direct jump instruction. We |
| // always start code generation with a trivial trace and non-trivial traces |
| // are created as we emit code for nodes or add to the list of deferred |
| // actions in the trace. The location of the code generated for a node using |
| // a trivial trace is recorded in a label in the node so that gotos can be |
| // generated to that code. |
| bool is_trivial() { |
| return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 && |
| characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && |
| quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; |
| } |
| TriBool at_start() { return at_start_; } |
| void set_at_start(TriBool at_start) { at_start_ = at_start; } |
| Label* backtrack() { return backtrack_; } |
| Label* loop_label() { return loop_label_; } |
| RegExpNode* stop_node() { return stop_node_; } |
| int characters_preloaded() { return characters_preloaded_; } |
| int bound_checked_up_to() { return bound_checked_up_to_; } |
| int flush_budget() { return flush_budget_; } |
| QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } |
| bool mentions_reg(int reg); |
| // Returns true if a deferred position store exists to the specified |
| // register and stores the offset in the out-parameter. Otherwise |
| // returns false. |
| bool GetStoredPosition(int reg, int* cp_offset); |
| // These set methods and AdvanceCurrentPositionInTrace should be used only on |
| // new traces - the intention is that traces are immutable after creation. |
| void add_action(DeferredAction* new_action) { |
| DCHECK(new_action->next_ == nullptr); |
| new_action->next_ = actions_; |
| actions_ = new_action; |
| } |
| void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } |
| void set_stop_node(RegExpNode* node) { stop_node_ = node; } |
| void set_loop_label(Label* label) { loop_label_ = label; } |
| void set_characters_preloaded(int count) { characters_preloaded_ = count; } |
| void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } |
| void set_flush_budget(int to) { flush_budget_ = to; } |
| void set_quick_check_performed(QuickCheckDetails* d) { |
| quick_check_performed_ = *d; |
| } |
| void InvalidateCurrentCharacter(); |
| void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); |
| |
| private: |
| int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone); |
| void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register, |
| const DynamicBitSet& affected_registers, |
| DynamicBitSet* registers_to_pop, |
| DynamicBitSet* registers_to_clear, Zone* zone); |
| void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, |
| const DynamicBitSet& registers_to_pop, |
| const DynamicBitSet& registers_to_clear); |
| int cp_offset_; |
| DeferredAction* actions_; |
| Label* backtrack_; |
| RegExpNode* stop_node_; |
| Label* loop_label_; |
| int characters_preloaded_; |
| int bound_checked_up_to_; |
| QuickCheckDetails quick_check_performed_; |
| int flush_budget_; |
| TriBool at_start_; |
| }; |
| |
| class GreedyLoopState { |
| public: |
| explicit GreedyLoopState(bool not_at_start); |
| |
| Label* label() { return &label_; } |
| Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; } |
| |
| private: |
| Label label_; |
| Trace counter_backtrack_trace_; |
| }; |
| |
| struct PreloadState { |
| static const int kEatsAtLeastNotYetInitialized = -1; |
| bool preload_is_current_; |
| bool preload_has_checked_bounds_; |
| int preload_characters_; |
| int eats_at_least_; |
| void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; } |
| }; |
| |
| // Analysis performs assertion propagation and computes eats_at_least_ values. |
| // See the comments on AssertionPropagator and EatsAtLeastPropagator for more |
| // details. |
| RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); |
| |
| class FrequencyCollator { |
| public: |
| FrequencyCollator() : total_samples_(0) { |
| for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { |
| frequencies_[i] = CharacterFrequency(i); |
| } |
| } |
| |
| void CountCharacter(int character) { |
| int index = (character & RegExpMacroAssembler::kTableMask); |
| frequencies_[index].Increment(); |
| total_samples_++; |
| } |
| |
| // Does not measure in percent, but rather per-128 (the table size from the |
| // regexp macro assembler). |
| int Frequency(int in_character) { |
| DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); |
| if (total_samples_ < 1) return 1; // Division by zero. |
| int freq_in_per128 = |
| (frequencies_[in_character].counter() * 128) / total_samples_; |
| return freq_in_per128; |
| } |
| |
| private: |
| class CharacterFrequency { |
| public: |
| CharacterFrequency() : counter_(0), character_(-1) {} |
| explicit CharacterFrequency(int character) |
| : counter_(0), character_(character) {} |
| |
| void Increment() { counter_++; } |
| int counter() { return counter_; } |
| int character() { return character_; } |
| |
| private: |
| int counter_; |
| int character_; |
| }; |
| |
| private: |
| CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; |
| int total_samples_; |
| }; |
| |
| class RegExpCompiler { |
| public: |
| RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, |
| bool is_one_byte); |
| |
| int AllocateRegister() { |
| if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { |
| reg_exp_too_big_ = true; |
| return next_register_; |
| } |
| return next_register_++; |
| } |
| |
| // Lookarounds to match lone surrogates for unicode character class matches |
| // are never nested. We can therefore reuse registers. |
| int UnicodeLookaroundStackRegister() { |
| if (unicode_lookaround_stack_register_ == kNoRegister) { |
| unicode_lookaround_stack_register_ = AllocateRegister(); |
| } |
| return unicode_lookaround_stack_register_; |
| } |
| |
| int UnicodeLookaroundPositionRegister() { |
| if (unicode_lookaround_position_register_ == kNoRegister) { |
| unicode_lookaround_position_register_ = AllocateRegister(); |
| } |
| return unicode_lookaround_position_register_; |
| } |
| |
| struct CompilationResult final { |
| explicit CompilationResult(RegExpError err) : error(err) {} |
| CompilationResult(Handle<Object> code, int registers) |
| : code(code), num_registers(registers) {} |
| |
| static CompilationResult RegExpTooBig() { |
| return CompilationResult(RegExpError::kTooLarge); |
| } |
| |
| bool Succeeded() const { return error == RegExpError::kNone; } |
| |
| const RegExpError error = RegExpError::kNone; |
| Handle<Object> code; |
| int num_registers = 0; |
| }; |
| |
| CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler, |
| RegExpNode* start, int capture_count, |
| Handle<String> pattern); |
| |
| // Preprocessing is the final step of node creation before analysis |
| // and assembly. It includes: |
| // - Wrapping the body of the regexp in capture 0. |
| // - Inserting the implicit .* before/after the regexp if necessary. |
| // - If the input is a one-byte string, filtering out nodes that can't match. |
| // - Fixing up regexp matches that start within a surrogate pair. |
| RegExpNode* PreprocessRegExp(RegExpCompileData* data, JSRegExp::Flags flags, |
| bool is_one_byte); |
| |
| // If the regexp matching starts within a surrogate pair, step back to the |
| // lead surrogate and start matching from there. |
| RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success, |
| JSRegExp::Flags flags); |
| |
| inline void AddWork(RegExpNode* node) { |
| if (!node->on_work_list() && !node->label()->is_bound()) { |
| node->set_on_work_list(true); |
| work_list_->push_back(node); |
| } |
| } |
| |
| static const int kImplementationOffset = 0; |
| static const int kNumberOfRegistersOffset = 0; |
| static const int kCodeOffset = 1; |
| |
| RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } |
| EndNode* accept() { return accept_; } |
| |
| static const int kMaxRecursion = 100; |
| inline int recursion_depth() { return recursion_depth_; } |
| inline void IncrementRecursionDepth() { recursion_depth_++; } |
| inline void DecrementRecursionDepth() { recursion_depth_--; } |
| |
| void SetRegExpTooBig() { reg_exp_too_big_ = true; } |
| |
| inline bool one_byte() { return one_byte_; } |
| inline bool optimize() { return optimize_; } |
| inline void set_optimize(bool value) { optimize_ = value; } |
| inline bool limiting_recursion() { return limiting_recursion_; } |
| inline void set_limiting_recursion(bool value) { |
| limiting_recursion_ = value; |
| } |
| bool read_backward() { return read_backward_; } |
| void set_read_backward(bool value) { read_backward_ = value; } |
| FrequencyCollator* frequency_collator() { return &frequency_collator_; } |
| |
| int current_expansion_factor() { return current_expansion_factor_; } |
| void set_current_expansion_factor(int value) { |
| current_expansion_factor_ = value; |
| } |
| |
| Isolate* isolate() const { return isolate_; } |
| Zone* zone() const { return zone_; } |
| |
| static const int kNoRegister = -1; |
| |
| private: |
| EndNode* accept_; |
| int next_register_; |
| int unicode_lookaround_stack_register_; |
| int unicode_lookaround_position_register_; |
| ZoneVector<RegExpNode*>* work_list_; |
| int recursion_depth_; |
| RegExpMacroAssembler* macro_assembler_; |
| bool one_byte_; |
| bool reg_exp_too_big_; |
| bool limiting_recursion_; |
| bool optimize_; |
| bool read_backward_; |
| int current_expansion_factor_; |
| FrequencyCollator frequency_collator_; |
| Isolate* isolate_; |
| Zone* zone_; |
| }; |
| |
| // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. |
| class UnicodeRangeSplitter { |
| public: |
| V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base); |
| |
| static constexpr int kInitialSize = 8; |
| using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>; |
| |
| const CharacterRangeVector* bmp() const { return &bmp_; } |
| const CharacterRangeVector* lead_surrogates() const { |
| return &lead_surrogates_; |
| } |
| const CharacterRangeVector* trail_surrogates() const { |
| return &trail_surrogates_; |
| } |
| const CharacterRangeVector* non_bmp() const { return &non_bmp_; } |
| |
| private: |
| void AddRange(CharacterRange range); |
| |
| CharacterRangeVector bmp_; |
| CharacterRangeVector lead_surrogates_; |
| CharacterRangeVector trail_surrogates_; |
| CharacterRangeVector non_bmp_; |
| }; |
| |
| // We need to check for the following characters: 0x39C 0x3BC 0x178. |
| // TODO(jgruber): Move to CharacterRange. |
| bool RangeContainsLatin1Equivalents(CharacterRange range); |
| |
| } // namespace internal |
| } // namespace v8 |
| |
| #endif // V8_REGEXP_REGEXP_COMPILER_H_ |