blob: 124b461ab03d18b5b2481bbef649d632d3f74f01 [file] [log] [blame]
/*
* Copyright (C) 2021 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_
#define SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_
#include <stdint.h>
#include "perfetto/base/compiler.h"
#include "perfetto/base/logging.h"
#include "perfetto/protozero/proto_utils.h"
namespace protozero {
// A helper class for schema-less tokenizing of protobuf messages.
// This class takes a stream of proto-encoded bytes, pushed one by one in input
// via Push(octet), and returns a stream of tokens (each Push() call can return
// 0 or 1 token).
// A "token" contains metadata about a field, specifically: its ID, its wire
// type and:
// - For varint and fixed32/64 fields: its payload.
// - For string and bytes fields: the length of its payload.
// In this case the caller is supposed to "eat" those N bytes before calling
// Push() again.
// Note that this class cannot differentiate between a string/bytes field or
// a submessage, because they are encoded in the same way. The caller is
// supposed to know whether a field can be recursed into by just keep calling
// Push() or is a string that should be skipped.
// This is inline to allow the compiler to see through the Push method and
// avoid a function call for each byte.
class MessageTokenizer {
public:
struct Token {
uint32_t field_id; // 0 == not valid.
proto_utils::ProtoWireType type;
// For kLengthDelimited, |value| represent the length of the payload.
uint64_t value;
inline bool valid() const { return field_id != 0; }
bool operator==(const Token& o) const {
return field_id == o.field_id && type == o.type && value == o.value;
}
};
// Pushes a byte in input and returns a token, only when getting to the last
// byte of each field. Specifically:
// - For varint and fixed32 fields, the Token is returned after the last byte
// of the numeric payload is pushed.
// - For length-delimited fields, this returns after the last byte of the
// length is pushed (i.e. right before the payload starts). The caller is
// expected to either skip the next |value| bytes (in the case of a string
// or bytes fields) or keep calling Push, in the case of a submessage.
inline Token Push(uint8_t octet) {
using protozero::proto_utils::ProtoWireType;
// Parsing a fixed32/64 field is the only case where we don't have to do
// any varint decoding. This is why this block is before the remaining
// switch statement below (all the rest is a varint).
if (PERFETTO_UNLIKELY(state_ == kFixedIntValue)) {
PERFETTO_DCHECK(fixed_int_bits_ == 32 || fixed_int_bits_ == 64);
fixed_int_value_ |= static_cast<uint64_t>(octet) << fixed_int_shift_;
fixed_int_shift_ += 8;
if (fixed_int_shift_ < fixed_int_bits_)
return Token{}; // Intermediate byte of a fixed32/64.
auto wire_type = fixed_int_bits_ == 32 ? ProtoWireType::kFixed32
: ProtoWireType::kFixed64;
uint64_t fixed_int_value = fixed_int_value_;
fixed_int_value_ = fixed_int_shift_ = fixed_int_bits_ = 0;
state_ = kFieldPreamble;
return Token{field_id_, wire_type, fixed_int_value};
}
// At this point either we are: (i) parsing a field preamble; (ii) parsing a
// varint field paylod; (iii) parsing the length of a length-delimited
// field. In all cases, we need to decode a varint before proceeding.
varint_ |= static_cast<uint64_t>(octet & 0x7F) << varint_shift_;
if (octet & 0x80) {
varint_shift_ += 7;
if (PERFETTO_UNLIKELY(varint_shift_ >= 64)) {
varint_shift_ = 0;
state_ = kInvalidVarInt;
}
return Token{}; // Still parsing a varint.
}
uint64_t varint = varint_;
varint_ = 0;
varint_shift_ = 0;
switch (state_) {
case kFieldPreamble: {
auto field_type = static_cast<uint32_t>(varint & 7u); // 7 = 0..0111
field_id_ = static_cast<uint32_t>(varint >> 3);
// The field type is legit, now check it's well formed and within
// boundaries.
if (field_type == static_cast<uint32_t>(ProtoWireType::kVarInt)) {
state_ = kVarIntValue;
} else if (field_type ==
static_cast<uint32_t>(ProtoWireType::kFixed32) ||
field_type ==
static_cast<uint32_t>(ProtoWireType::kFixed64)) {
state_ = kFixedIntValue;
fixed_int_shift_ = 0;
fixed_int_value_ = 0;
fixed_int_bits_ =
field_type == static_cast<uint32_t>(ProtoWireType::kFixed32) ? 32
: 64;
} else if (field_type ==
static_cast<uint32_t>(ProtoWireType::kLengthDelimited)) {
state_ = kLenDelimited;
} else {
state_ = kInvalidFieldType;
}
return Token{};
}
case kVarIntValue: {
// Return the varint field payload and go back to the next field.
state_ = kFieldPreamble;
return Token{field_id_, ProtoWireType::kVarInt, varint};
}
case kLenDelimited: {
const auto payload_len = varint;
if (payload_len > protozero::proto_utils::kMaxMessageLength) {
state_ = kMessageTooBig;
return Token{};
}
state_ = kFieldPreamble;
// At this point the caller is expected to consume the next
// |payload_len| bytes.
return Token{field_id_, ProtoWireType::kLengthDelimited, payload_len};
}
case kFixedIntValue:
// Unreacheable because of the if before the switch.
PERFETTO_DCHECK(false);
break;
// Unrecoverable error states.
case kInvalidFieldType:
case kMessageTooBig:
case kInvalidVarInt:
break;
} // switch(state_)
return Token{}; // Keep GCC happy.
}
// Returns true if the tokenizer FSM has reached quiescence (i.e. if we are
// NOT in the middle of parsing a field).
bool idle() const {
return state_ == kFieldPreamble && varint_shift_ == 0 &&
fixed_int_shift_ == 0;
}
// Only for reporting parser errors in the trace.
uint32_t state() const { return static_cast<uint32_t>(state_); }
private:
enum State {
kFieldPreamble = 0, // Parsing the varint for the field preamble.
kVarIntValue = 1, // Parsing the payload of a varint field.
kFixedIntValue = 2, // Parsing the payload of a fixed32/64 field.
kLenDelimited = 3, // Parsing the length of a length-delimited field.
// Unrecoverable error states:
kInvalidFieldType = 4, // Encountered an invalid field type.
kMessageTooBig = 5, // Size of the length delimited message was too big.
kInvalidVarInt = 6, // Varint larger than 64 bits.
};
State state_ = kFieldPreamble;
uint32_t field_id_ = 0;
uint64_t varint_ = 0;
uint32_t varint_shift_ = 0;
uint32_t fixed_int_shift_ = 0;
uint32_t fixed_int_bits_ = 0;
uint64_t fixed_int_value_ = 0;
};
} // namespace protozero
#endif // SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_