|  | // Copyright 2011 the V8 project authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "src/parsing/scanner-character-streams.h" | 
|  |  | 
|  | #include <memory> | 
|  | #include <vector> | 
|  |  | 
|  | #include "include/v8.h" | 
|  | #include "src/common/globals.h" | 
|  | #include "src/handles/handles.h" | 
|  | #include "src/logging/counters.h" | 
|  | #include "src/objects/objects-inl.h" | 
|  | #include "src/parsing/scanner.h" | 
|  | #include "src/strings/unicode-inl.h" | 
|  |  | 
|  | namespace v8 { | 
|  | namespace internal { | 
|  |  | 
|  | class ScopedExternalStringLock { | 
|  | public: | 
|  | explicit ScopedExternalStringLock(ExternalString string) { | 
|  | DCHECK(!string.is_null()); | 
|  | if (string.IsExternalOneByteString()) { | 
|  | resource_ = ExternalOneByteString::cast(string).resource(); | 
|  | } else { | 
|  | DCHECK(string.IsExternalTwoByteString()); | 
|  | resource_ = ExternalTwoByteString::cast(string).resource(); | 
|  | } | 
|  | DCHECK(resource_); | 
|  | resource_->Lock(); | 
|  | } | 
|  |  | 
|  | // Copying a lock increases the locking depth. | 
|  | ScopedExternalStringLock(const ScopedExternalStringLock& other) V8_NOEXCEPT | 
|  | : resource_(other.resource_) { | 
|  | resource_->Lock(); | 
|  | } | 
|  |  | 
|  | ~ScopedExternalStringLock() { resource_->Unlock(); } | 
|  |  | 
|  | private: | 
|  | // Not nullptr. | 
|  | const v8::String::ExternalStringResourceBase* resource_; | 
|  | }; | 
|  |  | 
|  | namespace { | 
|  | const unibrow::uchar kUtf8Bom = 0xFEFF; | 
|  | }  // namespace | 
|  |  | 
// A half-open span [start, end) of characters of type Char.
template <typename Char>
struct Range {
  const Char* start;
  const Char* end;

  // Number of characters between start and end.
  size_t length() const { return static_cast<size_t>(end - start); }

  // True when the address of start is not a multiple of sizeof(Char). For
  // single-byte characters this is always false.
  bool unaligned_start() const {
    return reinterpret_cast<intptr_t>(start) % sizeof(Char) != 0;
  }
};
|  |  | 
|  | // A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString. | 
|  | template <typename Char> | 
|  | class OnHeapStream { | 
|  | public: | 
|  | using String = typename CharTraits<Char>::String; | 
|  |  | 
|  | OnHeapStream(Handle<String> string, size_t start_offset, size_t end) | 
|  | : string_(string), start_offset_(start_offset), length_(end) {} | 
|  |  | 
|  | OnHeapStream(const OnHeapStream&) V8_NOEXCEPT : start_offset_(0), length_(0) { | 
|  | UNREACHABLE(); | 
|  | } | 
|  |  | 
|  | // The no_gc argument is only here because of the templated way this class | 
|  | // is used along with other implementations that require V8 heap access. | 
|  | Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats, | 
|  | DisallowHeapAllocation* no_gc) { | 
|  | return {&string_->GetChars(*no_gc)[start_offset_ + Min(length_, pos)], | 
|  | &string_->GetChars(*no_gc)[start_offset_ + length_]}; | 
|  | } | 
|  |  | 
|  | static const bool kCanBeCloned = false; | 
|  | static const bool kCanAccessHeap = true; | 
|  |  | 
|  | private: | 
|  | Handle<String> string_; | 
|  | const size_t start_offset_; | 
|  | const size_t length_; | 
|  | }; | 
|  |  | 
|  | // A Char stream backed by an off-heap ExternalOneByteString or | 
|  | // ExternalTwoByteString. | 
|  | template <typename Char> | 
|  | class ExternalStringStream { | 
|  | using ExternalString = typename CharTraits<Char>::ExternalString; | 
|  |  | 
|  | public: | 
|  | ExternalStringStream(ExternalString string, size_t start_offset, | 
|  | size_t length) | 
|  | : lock_(string), | 
|  | data_(string.GetChars() + start_offset), | 
|  | length_(length) {} | 
|  |  | 
|  | ExternalStringStream(const ExternalStringStream& other) V8_NOEXCEPT | 
|  | : lock_(other.lock_), | 
|  | data_(other.data_), | 
|  | length_(other.length_) {} | 
|  |  | 
|  | // The no_gc argument is only here because of the templated way this class | 
|  | // is used along with other implementations that require V8 heap access. | 
|  | Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats, | 
|  | DisallowHeapAllocation* no_gc = nullptr) { | 
|  | return {&data_[Min(length_, pos)], &data_[length_]}; | 
|  | } | 
|  |  | 
|  | static const bool kCanBeCloned = true; | 
|  | static const bool kCanAccessHeap = false; | 
|  |  | 
|  | private: | 
|  | ScopedExternalStringLock lock_; | 
|  | const Char* const data_; | 
|  | const size_t length_; | 
|  | }; | 
|  |  | 
|  | // A Char stream backed by a C array. Testing only. | 
|  | template <typename Char> | 
|  | class TestingStream { | 
|  | public: | 
|  | TestingStream(const Char* data, size_t length) | 
|  | : data_(data), length_(length) {} | 
|  | // The no_gc argument is only here because of the templated way this class | 
|  | // is used along with other implementations that require V8 heap access. | 
|  | Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats, | 
|  | DisallowHeapAllocation* no_gc = nullptr) { | 
|  | return {&data_[Min(length_, pos)], &data_[length_]}; | 
|  | } | 
|  |  | 
|  | static const bool kCanBeCloned = true; | 
|  | static const bool kCanAccessHeap = false; | 
|  |  | 
|  | private: | 
|  | const Char* const data_; | 
|  | const size_t length_; | 
|  | }; | 
|  |  | 
|  | // A Char stream backed by multiple source-stream provided off-heap chunks. | 
|  | template <typename Char> | 
|  | class ChunkedStream { | 
|  | public: | 
|  | explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source) | 
|  | : source_(source) {} | 
|  |  | 
|  | ChunkedStream(const ChunkedStream&) V8_NOEXCEPT { | 
|  | // TODO(rmcilroy): Implement cloning for chunked streams. | 
|  | UNREACHABLE(); | 
|  | } | 
|  |  | 
|  | // The no_gc argument is only here because of the templated way this class | 
|  | // is used along with other implementations that require V8 heap access. | 
|  | Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats, | 
|  | DisallowHeapAllocation* no_gc = nullptr) { | 
|  | Chunk chunk = FindChunk(pos, stats); | 
|  | size_t buffer_end = chunk.length; | 
|  | size_t buffer_pos = Min(buffer_end, pos - chunk.position); | 
|  | return {&chunk.data[buffer_pos], &chunk.data[buffer_end]}; | 
|  | } | 
|  |  | 
|  | ~ChunkedStream() { | 
|  | for (Chunk& chunk : chunks_) delete[] chunk.data; | 
|  | } | 
|  |  | 
|  | static const bool kCanBeCloned = false; | 
|  | static const bool kCanAccessHeap = false; | 
|  |  | 
|  | private: | 
|  | struct Chunk { | 
|  | Chunk(const Char* const data, size_t position, size_t length) | 
|  | : data(data), position(position), length(length) {} | 
|  | const Char* const data; | 
|  | // The logical position of data. | 
|  | const size_t position; | 
|  | const size_t length; | 
|  | size_t end_position() const { return position + length; } | 
|  | }; | 
|  |  | 
|  | Chunk FindChunk(size_t position, RuntimeCallStats* stats) { | 
|  | while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0}, stats); | 
|  |  | 
|  | // Walk forwards while the position is in front of the current chunk. | 
|  | while (position >= chunks_.back().end_position() && | 
|  | chunks_.back().length > 0) { | 
|  | FetchChunk(chunks_.back().end_position(), stats); | 
|  | } | 
|  |  | 
|  | // Walk backwards. | 
|  | for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend(); | 
|  | ++reverse_it) { | 
|  | if (reverse_it->position <= position) return *reverse_it; | 
|  | } | 
|  |  | 
|  | UNREACHABLE(); | 
|  | } | 
|  |  | 
|  | virtual void ProcessChunk(const uint8_t* data, size_t position, | 
|  | size_t length) { | 
|  | // Incoming data has to be aligned to Char size. | 
|  | DCHECK_EQ(0, length % sizeof(Char)); | 
|  | chunks_.emplace_back(reinterpret_cast<const Char*>(data), position, | 
|  | length / sizeof(Char)); | 
|  | } | 
|  |  | 
|  | void FetchChunk(size_t position, RuntimeCallStats* stats) { | 
|  | const uint8_t* data = nullptr; | 
|  | size_t length; | 
|  | { | 
|  | RuntimeCallTimerScope scope(stats, | 
|  | RuntimeCallCounterId::kGetMoreDataCallback); | 
|  | length = source_->GetMoreData(&data); | 
|  | } | 
|  | ProcessChunk(data, position, length); | 
|  | } | 
|  |  | 
|  | ScriptCompiler::ExternalSourceStream* source_; | 
|  |  | 
|  | protected: | 
|  | std::vector<struct Chunk> chunks_; | 
|  | }; | 
|  |  | 
|  | // Provides a buffered utf-16 view on the bytes from the underlying ByteStream. | 
|  | // Chars are buffered if either the underlying stream isn't utf-16 or the | 
|  | // underlying utf-16 stream might move (is on-heap). | 
|  | template <template <typename T> class ByteStream> | 
|  | class BufferedCharacterStream : public Utf16CharacterStream { | 
|  | public: | 
|  | template <class... TArgs> | 
|  | BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) { | 
|  | buffer_pos_ = pos; | 
|  | } | 
|  |  | 
|  | bool can_be_cloned() const final { | 
|  | return ByteStream<uint16_t>::kCanBeCloned; | 
|  | } | 
|  |  | 
|  | std::unique_ptr<Utf16CharacterStream> Clone() const override { | 
|  | CHECK(can_be_cloned()); | 
|  | return std::unique_ptr<Utf16CharacterStream>( | 
|  | new BufferedCharacterStream<ByteStream>(*this)); | 
|  | } | 
|  |  | 
|  | protected: | 
|  | bool ReadBlock() final { | 
|  | size_t position = pos(); | 
|  | buffer_pos_ = position; | 
|  | buffer_start_ = &buffer_[0]; | 
|  | buffer_cursor_ = buffer_start_; | 
|  |  | 
|  | DisallowHeapAllocation no_gc; | 
|  | Range<uint8_t> range = | 
|  | byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc); | 
|  | if (range.length() == 0) { | 
|  | buffer_end_ = buffer_start_; | 
|  | return false; | 
|  | } | 
|  |  | 
|  | size_t length = Min(kBufferSize, range.length()); | 
|  | i::CopyChars(buffer_, range.start, length); | 
|  | buffer_end_ = &buffer_[length]; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool can_access_heap() const final { | 
|  | return ByteStream<uint8_t>::kCanAccessHeap; | 
|  | } | 
|  |  | 
|  | private: | 
|  | BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other) | 
|  | : byte_stream_(other.byte_stream_) {} | 
|  |  | 
|  | static const size_t kBufferSize = 512; | 
|  | uc16 buffer_[kBufferSize]; | 
|  | ByteStream<uint8_t> byte_stream_; | 
|  | }; | 
|  |  | 
|  | // Provides a unbuffered utf-16 view on the bytes from the underlying | 
|  | // ByteStream. | 
|  | template <template <typename T> class ByteStream> | 
|  | class UnbufferedCharacterStream : public Utf16CharacterStream { | 
|  | public: | 
|  | template <class... TArgs> | 
|  | UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) { | 
|  | buffer_pos_ = pos; | 
|  | } | 
|  |  | 
|  | bool can_access_heap() const final { | 
|  | return ByteStream<uint16_t>::kCanAccessHeap; | 
|  | } | 
|  |  | 
|  | bool can_be_cloned() const final { | 
|  | return ByteStream<uint16_t>::kCanBeCloned; | 
|  | } | 
|  |  | 
|  | std::unique_ptr<Utf16CharacterStream> Clone() const override { | 
|  | return std::unique_ptr<Utf16CharacterStream>( | 
|  | new UnbufferedCharacterStream<ByteStream>(*this)); | 
|  | } | 
|  |  | 
|  | protected: | 
|  | bool ReadBlock() final { | 
|  | size_t position = pos(); | 
|  | buffer_pos_ = position; | 
|  | DisallowHeapAllocation no_gc; | 
|  | Range<uint16_t> range = | 
|  | byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc); | 
|  | buffer_start_ = range.start; | 
|  | buffer_end_ = range.end; | 
|  | buffer_cursor_ = buffer_start_; | 
|  | if (range.length() == 0) return false; | 
|  |  | 
|  | DCHECK(!range.unaligned_start()); | 
|  | DCHECK_LE(buffer_start_, buffer_end_); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other) | 
|  | : byte_stream_(other.byte_stream_) {} | 
|  |  | 
|  | ByteStream<uint16_t> byte_stream_; | 
|  | }; | 
|  |  | 
|  | // Provides a unbuffered utf-16 view on the bytes from the underlying | 
|  | // ByteStream. | 
|  | class RelocatingCharacterStream final | 
|  | : public UnbufferedCharacterStream<OnHeapStream> { | 
|  | public: | 
|  | template <class... TArgs> | 
|  | RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args) | 
|  | : UnbufferedCharacterStream<OnHeapStream>(pos, args...), | 
|  | isolate_(isolate) { | 
|  | isolate->heap()->AddGCEpilogueCallback(UpdateBufferPointersCallback, | 
|  | v8::kGCTypeAll, this); | 
|  | } | 
|  |  | 
|  | private: | 
|  | ~RelocatingCharacterStream() final { | 
|  | isolate_->heap()->RemoveGCEpilogueCallback(UpdateBufferPointersCallback, | 
|  | this); | 
|  | } | 
|  |  | 
|  | static void UpdateBufferPointersCallback(v8::Isolate* v8_isolate, | 
|  | v8::GCType type, | 
|  | v8::GCCallbackFlags flags, | 
|  | void* stream) { | 
|  | reinterpret_cast<RelocatingCharacterStream*>(stream) | 
|  | ->UpdateBufferPointers(); | 
|  | } | 
|  |  | 
|  | void UpdateBufferPointers() { | 
|  | DisallowHeapAllocation no_gc; | 
|  | Range<uint16_t> range = | 
|  | byte_stream_.GetDataAt(buffer_pos_, runtime_call_stats(), &no_gc); | 
|  | if (range.start != buffer_start_) { | 
|  | buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start; | 
|  | buffer_start_ = range.start; | 
|  | buffer_end_ = range.end; | 
|  | } | 
|  | } | 
|  |  | 
|  | Isolate* isolate_; | 
|  | }; | 
|  |  | 
|  | // ---------------------------------------------------------------------------- | 
|  | // BufferedUtf16CharacterStreams | 
|  | // | 
|  | // A buffered character stream based on a random access character | 
|  | // source (ReadBlock can be called with pos() pointing to any position, | 
|  | // even positions before the current). | 
|  | // | 
|  | // TODO(verwaest): Remove together with Utf8 external streaming streams. | 
|  | class BufferedUtf16CharacterStream : public Utf16CharacterStream { | 
|  | public: | 
|  | BufferedUtf16CharacterStream(); | 
|  |  | 
|  | protected: | 
|  | static const size_t kBufferSize = 512; | 
|  |  | 
|  | bool ReadBlock() final; | 
|  |  | 
|  | // FillBuffer should read up to kBufferSize characters at position and store | 
|  | // them into buffer_[0..]. It returns the number of characters stored. | 
|  | virtual size_t FillBuffer(size_t position) = 0; | 
|  |  | 
|  | // Fixed sized buffer that this class reads from. | 
|  | // The base class' buffer_start_ should always point to buffer_. | 
|  | uc16 buffer_[kBufferSize]; | 
|  | }; | 
|  |  | 
|  | BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() | 
|  | : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {} | 
|  |  | 
|  | bool BufferedUtf16CharacterStream::ReadBlock() { | 
|  | DCHECK_EQ(buffer_start_, buffer_); | 
|  |  | 
|  | size_t position = pos(); | 
|  | buffer_pos_ = position; | 
|  | buffer_cursor_ = buffer_; | 
|  | buffer_end_ = buffer_ + FillBuffer(position); | 
|  | DCHECK_EQ(pos(), position); | 
|  | DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize); | 
|  | return buffer_cursor_ < buffer_end_; | 
|  | } | 
|  |  | 
|  | // ---------------------------------------------------------------------------- | 
|  | // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data. | 
|  | // | 
|  | // This implementation is fairly complex, since data arrives in chunks which | 
|  | // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given | 
|  | // character position is tricky because the byte position cannot be derived | 
|  | // from the character position. | 
|  | // | 
|  | // TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side | 
|  | // instead so we don't need to buffer. | 
|  |  | 
|  | class Utf8ExternalStreamingStream final : public BufferedUtf16CharacterStream { | 
|  | public: | 
|  | Utf8ExternalStreamingStream( | 
|  | ScriptCompiler::ExternalSourceStream* source_stream) | 
|  | : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}), | 
|  | source_stream_(source_stream) {} | 
|  | ~Utf8ExternalStreamingStream() final { | 
|  | for (const Chunk& chunk : chunks_) delete[] chunk.data; | 
|  | } | 
|  |  | 
|  | bool can_access_heap() const final { return false; } | 
|  |  | 
|  | bool can_be_cloned() const final { return false; } | 
|  |  | 
|  | std::unique_ptr<Utf16CharacterStream> Clone() const override { | 
|  | UNREACHABLE(); | 
|  | } | 
|  |  | 
|  | protected: | 
|  | size_t FillBuffer(size_t position) final; | 
|  |  | 
|  | private: | 
|  | // A position within the data stream. It stores: | 
|  | // - The 'physical' position (# of bytes in the stream), | 
|  | // - the 'logical' position (# of ucs-2 characters, also within the stream), | 
|  | // - a possibly incomplete utf-8 char at the current 'physical' position. | 
|  | struct StreamPosition { | 
|  | size_t bytes; | 
|  | size_t chars; | 
|  | uint32_t incomplete_char; | 
|  | unibrow::Utf8::State state; | 
|  | }; | 
|  |  | 
|  | // Position contains a StreamPosition and the index of the chunk the position | 
|  | // points into. (The chunk_no could be derived from pos, but that'd be | 
|  | // an expensive search through all chunks.) | 
|  | struct Position { | 
|  | size_t chunk_no; | 
|  | StreamPosition pos; | 
|  | }; | 
|  |  | 
|  | // A chunk in the list of chunks, containing: | 
|  | // - The chunk data (data pointer and length), and | 
|  | // - the position at the first byte of the chunk. | 
|  | struct Chunk { | 
|  | const uint8_t* data; | 
|  | size_t length; | 
|  | StreamPosition start; | 
|  | }; | 
|  |  | 
|  | // Within the current chunk, skip forward from current_ towards position. | 
|  | bool SkipToPosition(size_t position); | 
|  | // Within the current chunk, fill the buffer_ (while it has capacity). | 
|  | void FillBufferFromCurrentChunk(); | 
|  | // Fetch a new chunk (assuming current_ is at the end of the current data). | 
|  | bool FetchChunk(); | 
|  | // Search through the chunks and set current_ to point to the given position. | 
|  | // (This call is potentially expensive.) | 
|  | void SearchPosition(size_t position); | 
|  |  | 
|  | std::vector<Chunk> chunks_; | 
|  | Position current_; | 
|  | ScriptCompiler::ExternalSourceStream* source_stream_; | 
|  | }; | 
|  |  | 
// Advances current_ within the chunk it points into, decoding utf-8 from the
// byte offset of current_ and counting utf-16 code units, until the character
// count reaches |position| or the chunk's bytes are used up.
// Returns true iff current_.pos.chars equals |position| afterwards. When all
// bytes of the chunk were consumed, current_.chunk_no is advanced by one.
|  | bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) { | 
|  | DCHECK_LE(current_.pos.chars, position);  // We can only skip forward. | 
|  |  | 
|  | // Already there? Then return immediately. | 
|  | if (current_.pos.chars == position) return true; | 
|  |  | 
|  | const Chunk& chunk = chunks_[current_.chunk_no]; | 
|  | DCHECK(current_.pos.bytes >= chunk.start.bytes); | 
|  |  | 
// NOTE(review): the decoder state is seeded from the chunk's start position
// rather than from current_.pos; the callers visible in this file set
// current_ to the chunk start before calling - confirm no other caller exists.
|  | unibrow::Utf8::State state = chunk.start.state; | 
|  | uint32_t incomplete_char = chunk.start.incomplete_char; | 
|  | size_t it = current_.pos.bytes - chunk.start.bytes; | 
|  | const uint8_t* cursor = &chunk.data[it]; | 
|  | const uint8_t* end = &chunk.data[chunk.length]; | 
|  |  | 
|  | size_t chars = current_.pos.chars; | 
|  |  | 
// Within the first three bytes of the stream and with no characters counted
// yet: decode one complete character and do not count it if it is the BOM.
|  | if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) { | 
|  | while (cursor < end) { | 
|  | unibrow::uchar t = | 
|  | unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); | 
|  | if (t == unibrow::Utf8::kIncomplete) continue; | 
|  | if (t != kUtf8Bom) { | 
|  | chars++; | 
// Values above kMaxNonSurrogateCharCode count as two utf-16 code units.
|  | if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++; | 
|  | } | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
// Main loop: count decoded characters until |position| is reached or the
// chunk ends. Incomplete sequences consume bytes without counting.
|  | while (cursor < end && chars < position) { | 
|  | unibrow::uchar t = | 
|  | unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); | 
|  | if (t != unibrow::Utf8::kIncomplete) { | 
|  | chars++; | 
|  | if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++; | 
|  | } | 
|  | } | 
|  |  | 
// Publish the new position, including any partially decoded character.
|  | current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data); | 
|  | current_.pos.chars = chars; | 
|  | current_.pos.incomplete_char = incomplete_char; | 
|  | current_.pos.state = state; | 
|  | current_.chunk_no += (cursor == end); | 
|  |  | 
|  | return current_.pos.chars == position; | 
|  | } | 
|  |  | 
// Decodes utf-8 bytes of the current chunk, starting at current_.pos.bytes,
// into buffer_ starting at buffer_end_. Stops when the chunk's bytes are used
// up or the buffer has no room left for two more code units. Afterwards
// current_ and buffer_end_ reflect what was consumed and produced; when all
// bytes of the chunk were consumed, current_.chunk_no is advanced by one.
|  | void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { | 
|  | DCHECK_LT(current_.chunk_no, chunks_.size()); | 
|  | DCHECK_EQ(buffer_start_, buffer_cursor_); | 
|  | DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize); | 
|  |  | 
|  | const Chunk& chunk = chunks_[current_.chunk_no]; | 
|  |  | 
|  | // The buffer_ is writable, but buffer_*_ members are const. So we get a | 
|  | // non-const pointer into buffer that points to the same char as buffer_end_. | 
|  | uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_); | 
|  | DCHECK_EQ(output_cursor, buffer_end_); | 
|  |  | 
|  | unibrow::Utf8::State state = current_.pos.state; | 
|  | uint32_t incomplete_char = current_.pos.incomplete_char; | 
|  |  | 
|  | // If the current chunk is the last (empty) chunk we'll have to process | 
|  | // any left-over, partial characters. | 
|  | if (chunk.length == 0) { | 
|  | unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state); | 
|  | if (t != unibrow::Utf8::kBufferEmpty) { | 
|  | DCHECK_EQ(t, unibrow::Utf8::kBadChar); | 
// Emit the single resulting character and clear the pending partial char.
|  | *output_cursor = static_cast<uc16>(t); | 
|  | buffer_end_++; | 
|  | current_.pos.chars++; | 
|  | current_.pos.incomplete_char = 0; | 
|  | current_.pos.state = state; | 
|  | } | 
|  | return; | 
|  | } | 
|  |  | 
|  | size_t it = current_.pos.bytes - chunk.start.bytes; | 
|  | const uint8_t* cursor = chunk.data + it; | 
|  | const uint8_t* end = chunk.data + chunk.length; | 
|  |  | 
|  | // Deal with possible BOM. | 
// Only applies within the first three bytes of the stream with no characters
// produced yet: decode exactly one complete character and drop it if it is
// the BOM; otherwise copy it like any other character.
|  | if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) { | 
|  | while (cursor < end) { | 
|  | unibrow::uchar t = | 
|  | unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); | 
|  | if (V8_LIKELY(t < kUtf8Bom)) { | 
|  | *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case. | 
|  | } else if (t == unibrow::Utf8::kIncomplete) { | 
|  | continue; | 
|  | } else if (t == kUtf8Bom) { | 
|  | // BOM detected at beginning of the stream. Don't copy it. | 
|  | } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { | 
|  | *(output_cursor++) = static_cast<uc16>(t); | 
|  | } else { | 
|  | *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t); | 
|  | *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t); | 
|  | } | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
// The "+ 1" in the loop condition keeps room for the two code units that the
// surrogate-pair branch below writes.
|  | const uint16_t* max_buffer_end = buffer_start_ + kBufferSize; | 
|  | while (cursor < end && output_cursor + 1 < max_buffer_end) { | 
|  | unibrow::uchar t = | 
|  | unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); | 
|  | if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) { | 
|  | *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case. | 
|  | } else if (t == unibrow::Utf8::kIncomplete) { | 
|  | continue; | 
|  | } else { | 
|  | *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t); | 
|  | *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t); | 
|  | } | 
|  | // Fast path for ascii sequences. | 
// Reached only after a complete character was emitted. NonAsciiStart is
// defined elsewhere; presumably it returns the length of the leading ASCII
// run within the given limit - TODO confirm.
|  | size_t remaining = end - cursor; | 
|  | size_t max_buffer = max_buffer_end - output_cursor; | 
|  | int max_length = static_cast<int>(Min(remaining, max_buffer)); | 
|  | DCHECK_EQ(state, unibrow::Utf8::State::kAccept); | 
|  | int ascii_length = NonAsciiStart(cursor, max_length); | 
|  | CopyChars(output_cursor, cursor, ascii_length); | 
|  | cursor += ascii_length; | 
|  | output_cursor += ascii_length; | 
|  | } | 
|  |  | 
// Publish the new stream position; the number of characters produced is the
// distance the output cursor moved past the old buffer_end_.
|  | current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data); | 
|  | current_.pos.chars += (output_cursor - buffer_end_); | 
|  | current_.pos.incomplete_char = incomplete_char; | 
|  | current_.pos.state = state; | 
|  | current_.chunk_no += (cursor == end); | 
|  |  | 
|  | buffer_end_ = output_cursor; | 
|  | } | 
|  |  | 
|  | bool Utf8ExternalStreamingStream::FetchChunk() { | 
|  | RuntimeCallTimerScope scope(runtime_call_stats(), | 
|  | RuntimeCallCounterId::kGetMoreDataCallback); | 
|  | DCHECK_EQ(current_.chunk_no, chunks_.size()); | 
|  | DCHECK(chunks_.empty() || chunks_.back().length != 0); | 
|  |  | 
|  | const uint8_t* chunk = nullptr; | 
|  | size_t length = source_stream_->GetMoreData(&chunk); | 
|  | chunks_.push_back({chunk, length, current_.pos}); | 
|  | return length > 0; | 
|  | } | 
|  |  | 
// Sets current_ to the stream position whose character count equals
// |position|, fetching further chunks from the source when the position lies
// beyond the data received so far. If the data ends before |position|,
// current_ is left pointing at the terminating (zero-length) chunk.
|  | void Utf8ExternalStreamingStream::SearchPosition(size_t position) { | 
|  | // If current_ already points to the right position, we're done. | 
|  | // | 
|  | // This is expected to be the common case, since we typically call | 
|  | // FillBuffer right after the current buffer. | 
|  | if (current_.pos.chars == position) return; | 
|  |  | 
|  | // No chunks. Fetch at least one, so we can assume !chunks_.empty() below. | 
|  | if (chunks_.empty()) { | 
|  | DCHECK_EQ(current_.chunk_no, 0u); | 
|  | DCHECK_EQ(current_.pos.bytes, 0u); | 
|  | DCHECK_EQ(current_.pos.chars, 0u); | 
|  | FetchChunk(); | 
|  | } | 
|  |  | 
|  | // Search for the last chunk whose start position is less or equal to | 
|  | // position. | 
|  | size_t chunk_no = chunks_.size() - 1; | 
|  | while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) { | 
|  | chunk_no--; | 
|  | } | 
|  |  | 
|  | // Did we find the terminating (zero-length) chunk? Then we're seeking | 
|  | // behind the end of the data, and position does not exist. | 
|  | // Set current_ to point to the terminating chunk. | 
|  | if (chunks_[chunk_no].length == 0) { | 
|  | current_ = {chunk_no, chunks_[chunk_no].start}; | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Did we find the non-last chunk? Then our position must be within chunk_no. | 
|  | if (chunk_no + 1 < chunks_.size()) { | 
|  | // Fancy-pants optimization for ASCII chunks within a utf-8 stream. | 
|  | // (Many web sites declare utf-8 encoding, but use only (or almost only) the | 
|  | //  ASCII subset for their JavaScript sources. We can exploit this, by | 
|  | //  checking whether the # bytes in a chunk are equal to the # chars, and if | 
|  | //  so avoid the expensive SkipToPosition.) | 
|  | bool ascii_only_chunk = | 
|  | chunks_[chunk_no].start.incomplete_char == 0 && | 
|  | (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) == | 
|  | (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars); | 
|  | if (ascii_only_chunk) { | 
// One byte per character: the byte offset equals the character offset.
|  | size_t skip = position - chunks_[chunk_no].start.chars; | 
|  | current_ = {chunk_no, | 
|  | {chunks_[chunk_no].start.bytes + skip, | 
|  | chunks_[chunk_no].start.chars + skip, 0, | 
|  | unibrow::Utf8::State::kAccept}}; | 
|  | } else { | 
// Restart at the chunk's first byte and decode forward to |position|.
|  | current_ = {chunk_no, chunks_[chunk_no].start}; | 
|  | SkipToPosition(position); | 
|  | } | 
|  |  | 
|  | // Since position was within the chunk, SkipToPosition should have found | 
|  | // something. | 
|  | DCHECK_EQ(position, current_.pos.chars); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // What's left: We're in the last, non-terminating chunk. Our position | 
|  | // may be in the chunk, but it may also be in 'future' chunks, which we'll | 
|  | // have to obtain. | 
|  | DCHECK_EQ(chunk_no, chunks_.size() - 1); | 
|  | current_ = {chunk_no, chunks_[chunk_no].start}; | 
|  | bool have_more_data = true; | 
|  | bool found = SkipToPosition(position); | 
// Each failed skip consumed its chunk entirely, so fetch the next one and
// continue skipping until the position is found or the source is exhausted.
|  | while (have_more_data && !found) { | 
|  | DCHECK_EQ(current_.chunk_no, chunks_.size()); | 
|  | have_more_data = FetchChunk(); | 
|  | found = have_more_data && SkipToPosition(position); | 
|  | } | 
|  |  | 
|  | // We'll return with a position != the desired position only if we're out | 
|  | // of data. In that case, we'll point to the terminating chunk. | 
|  | DCHECK_EQ(found, current_.pos.chars == position); | 
|  | DCHECK_EQ(have_more_data, chunks_.back().length != 0); | 
|  | DCHECK_IMPLIES(!found, !have_more_data); | 
|  | DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1); | 
|  | } | 
|  |  | 
|  | size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) { | 
|  | buffer_cursor_ = buffer_; | 
|  | buffer_end_ = buffer_; | 
|  |  | 
|  | SearchPosition(position); | 
|  | bool out_of_data = current_.chunk_no != chunks_.size() && | 
|  | chunks_[current_.chunk_no].length == 0 && | 
|  | current_.pos.incomplete_char == 0; | 
|  |  | 
|  | if (out_of_data) return 0; | 
|  |  | 
|  | // Fill the buffer, until we have at least one char (or are out of data). | 
|  | // (The embedder might give us 1-byte blocks within a utf-8 char, so we | 
|  | //  can't guarantee progress with one chunk. Thus we iterate.) | 
|  | while (!out_of_data && buffer_cursor_ == buffer_end_) { | 
|  | // At end of current data, but there might be more? Then fetch it. | 
|  | if (current_.chunk_no == chunks_.size()) { | 
|  | out_of_data = !FetchChunk(); | 
|  | } | 
|  | FillBufferFromCurrentChunk(); | 
|  | } | 
|  |  | 
|  | DCHECK_EQ(current_.pos.chars - position, | 
|  | static_cast<size_t>(buffer_end_ - buffer_cursor_)); | 
|  | return buffer_end_ - buffer_cursor_; | 
|  | } | 
|  |  | 
|  | // ---------------------------------------------------------------------------- | 
|  | // ScannerStream: Create stream instances. | 
|  |  | 
|  | Utf16CharacterStream* ScannerStream::For(Isolate* isolate, | 
|  | Handle<String> data) { | 
|  | return ScannerStream::For(isolate, data, 0, data->length()); | 
|  | } | 
|  |  | 
|  | Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data, | 
|  | int start_pos, int end_pos) { | 
|  | DCHECK_GE(start_pos, 0); | 
|  | DCHECK_LE(start_pos, end_pos); | 
|  | DCHECK_LE(end_pos, data->length()); | 
|  | size_t start_offset = 0; | 
|  | if (data->IsSlicedString()) { | 
|  | SlicedString string = SlicedString::cast(*data); | 
|  | start_offset = string.offset(); | 
|  | String parent = string.parent(); | 
|  | if (parent.IsThinString()) parent = ThinString::cast(parent).actual(); | 
|  | data = handle(parent, isolate); | 
|  | } else { | 
|  | data = String::Flatten(isolate, data); | 
|  | } | 
|  | if (data->IsExternalOneByteString()) { | 
|  | return new BufferedCharacterStream<ExternalStringStream>( | 
|  | static_cast<size_t>(start_pos), ExternalOneByteString::cast(*data), | 
|  | start_offset, static_cast<size_t>(end_pos)); | 
|  | } else if (data->IsExternalTwoByteString()) { | 
|  | return new UnbufferedCharacterStream<ExternalStringStream>( | 
|  | static_cast<size_t>(start_pos), ExternalTwoByteString::cast(*data), | 
|  | start_offset, static_cast<size_t>(end_pos)); | 
|  | } else if (data->IsSeqOneByteString()) { | 
|  | return new BufferedCharacterStream<OnHeapStream>( | 
|  | static_cast<size_t>(start_pos), Handle<SeqOneByteString>::cast(data), | 
|  | start_offset, static_cast<size_t>(end_pos)); | 
|  | } else if (data->IsSeqTwoByteString()) { | 
|  | return new RelocatingCharacterStream( | 
|  | isolate, static_cast<size_t>(start_pos), | 
|  | Handle<SeqTwoByteString>::cast(data), start_offset, | 
|  | static_cast<size_t>(end_pos)); | 
|  | } else { | 
|  | UNREACHABLE(); | 
|  | } | 
|  | } | 
|  |  | 
|  | std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting( | 
|  | const char* data) { | 
|  | return ScannerStream::ForTesting(data, strlen(data)); | 
|  | } | 
|  |  | 
|  | std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting( | 
|  | const char* data, size_t length) { | 
|  | if (data == nullptr) { | 
|  | DCHECK_EQ(length, 0); | 
|  |  | 
|  | // We don't want to pass in a null pointer into the the character stream, | 
|  | // because then the one-past-the-end pointer is undefined, so instead pass | 
|  | // through this static array. | 
|  | static const char non_null_empty_string[1] = {0}; | 
|  | data = non_null_empty_string; | 
|  | } | 
|  |  | 
|  | return std::unique_ptr<Utf16CharacterStream>( | 
|  | new BufferedCharacterStream<TestingStream>( | 
|  | 0, reinterpret_cast<const uint8_t*>(data), length)); | 
|  | } | 
|  |  | 
|  | std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting( | 
|  | const uint16_t* data, size_t length) { | 
|  | if (data == nullptr) { | 
|  | DCHECK_EQ(length, 0); | 
|  |  | 
|  | // We don't want to pass in a null pointer into the the character stream, | 
|  | // because then the one-past-the-end pointer is undefined, so instead pass | 
|  | // through this static array. | 
|  | static const uint16_t non_null_empty_uint16_t_string[1] = {0}; | 
|  | data = non_null_empty_uint16_t_string; | 
|  | } | 
|  |  | 
|  | return std::unique_ptr<Utf16CharacterStream>( | 
|  | new UnbufferedCharacterStream<TestingStream>(0, data, length)); | 
|  | } | 
|  |  | 
|  | Utf16CharacterStream* ScannerStream::For( | 
|  | ScriptCompiler::ExternalSourceStream* source_stream, | 
|  | v8::ScriptCompiler::StreamedSource::Encoding encoding) { | 
|  | switch (encoding) { | 
|  | case v8::ScriptCompiler::StreamedSource::TWO_BYTE: | 
|  | return new UnbufferedCharacterStream<ChunkedStream>( | 
|  | static_cast<size_t>(0), source_stream); | 
|  | case v8::ScriptCompiler::StreamedSource::ONE_BYTE: | 
|  | return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0), | 
|  | source_stream); | 
|  | case v8::ScriptCompiler::StreamedSource::UTF8: | 
|  | return new Utf8ExternalStreamingStream(source_stream); | 
|  | } | 
|  | UNREACHABLE(); | 
|  | } | 
|  |  | 
|  | }  // namespace internal | 
|  | }  // namespace v8 |