blob: 15ba6e2fcec73221194597fbf3a02b73d8fbac74 [file] [log] [blame]
/*
* Copyright (C) 2022 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SRC_TRACE_PROCESSOR_UTIL_STREAMING_LINE_READER_H_
#define SRC_TRACE_PROCESSOR_UTIL_STREAMING_LINE_READER_H_
#include <functional>
#include <vector>
#include "perfetto/ext/base/string_view.h"
namespace perfetto {
namespace trace_processor {
namespace util {
// A streaming line tokenizer for efficiently processing large text files on a
// line-by-line basis. It's designed to be used in conjunction with ZipReader to
// stream lines out of a compressed file (think of a bugreport) without having
// to decompress the whole file in memory upfront.
// Internally it deals with the necessary buffering and line-merging across
// different chunks.
// Usage:
// - The caller should pass a callback into the ctor. The callback is invoked
// whenever a batch of lines has been tokenized. This happens after calls to
// either BeginWrite()+EndWrite() or Tokenize(). In order to avoid too much
// virtual dispatch overhead, the callback argument is a vector of lines, not
// a single line.
// - The caller can call either:
// - Tokenize(whole input): this exists to avoid a copy in the case of
// non-compressed (STORE) files in a zip archive.
// - A sequence of BeginWrite() + EndWrite() as follows:
// - BeginWrite(n) guarantees that the caller can write at least `n` chars.
// `n` is typically the size of the decompression buffer passed to zlib.
// - The caller writes at most `n` bytes into the pointer returned by
// BeginWrite().
// - The caller calls EndWrite(m) passing the number of bytes actually
// written (`m` <= `n`).
// NOTE:
// This implementation slightly diverges from base::StringSplitter as follows:
// 1. It does NOT skip empty lines. SS coalesces empty tokens, this doesn't.
// 2. It won't output the last line unless it terminates with a \n. SS doesn't
// tell the difference between "foo\nbar" and "foo\nbar\n". This is
// fundamental for streaming, where we cannot tell upfront if we got the end.
class StreamingLineReader {
public:
// Note: the lifetime of the lines passed in the vector argument is valid only
// for the duration of the callback. Don't retain the StringView(s) passed.
using LinesCallback =
std::function<void(const std::vector<base::StringView>&)>;
explicit StreamingLineReader(LinesCallback);
~StreamingLineReader();
// This can be used when the whole input is known upfront and we just need
// splitting. This exist mostly for convenience when processing uncompressed
// (STORE) files in zip archives. If you just need a tokenizer outside of the
// context of a zip file, you are better off just using base::StringSplitter.
size_t Tokenize(base::StringView input);
// Reserves `write_buf_size` bytes into the internal buffer. The caller is
// expected to write at most `write_buf_size` on the returned pointer and
// then call EndWrite().
char* BeginWrite(size_t write_buf_size);
// Finishes the write reporting the number of bytes actually written, which
// must be <= `write_buf_size`. If one or more lines can be tokenized, this
// will cause one or more calls to the LinesCallback.
void EndWrite(size_t size_written);
private:
std::vector<char> buf_;
LinesCallback lines_callback_;
size_t size_before_write_ = 0;
};
} // namespace util
} // namespace trace_processor
} // namespace perfetto
#endif // SRC_TRACE_PROCESSOR_UTIL_STREAMING_LINE_READER_H_