/*
* Copyright 2012 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "media/filters/shell_flv_parser.h"
#include <inttypes.h>
#include <limits>
#include "base/stringprintf.h"
#include "media/base/endian_util.h"
namespace media {
// "FLV" as hex ASCII codes
static const uint32 kFLV = 0x00464c56;
// FLV configuration, such as the AVCConfigRecord and the AudioSpecificConfig,
// should precede any actual encoded data and should be near the top of the
// file. This constant describes how far into the file we're willing to
// traverse without encountering metadata or encoded video keyframe data
// before giving up.
static const uint64 kMetadataMaxBytes = 4 * 1024 * 1024;
static const uint8 kAudioTagType = 8;
static const uint8 kVideoTagType = 9;
static const uint8 kScriptDataObjectTagType = 18;
// size of standard FLV tag
static const int kTagSize = 11;
// To limit reads we download a bit of extra data along with each FLV tag
// header to make sure we get all of the tag information we might need,
// allowing us to download the actual encoded data directly into a decoder
// buffer with proper alignment on a subsequent read. This value is calculated
// as the sum of the FLV tag size, the VIDEODATA tag size, and the
// AVCVIDEOPACKET tag size, which is the maximum amount of tag data that we
// will need to parse before downloading encoded A/V data.
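// Concretely, kTagDownloadSize = kTagSize (11 bytes) + the 1-byte VIDEODATA
// header + the 4-byte AVCVIDEOPACKET header = 16 bytes.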
static const int kTagDownloadSize = kTagSize + 1 + 4;
// these constants describe the total sub-header sizes for the AAC/AVC
// addendums to tags (AUDIODATA + AACAUDIODATA and VIDEODATA + AVCVIDEOPACKET)
static const int kAudioTagSize = 2;
static const int kVideoTagSize = 5;
// FLV AUDIODATA tag constants
static const uint8 kSoundFormatTypeAAC = 10;
// FLV AACAUDIODATA tag constants
static const uint8 kAACPacketTypeSequence = 0;
static const uint8 kAACPacketTypeRaw = 1;
// FLV VIDEODATA tag constants
static const uint8 kCodecIDAVC = 7;
// FLV AVCVIDEODATA tag constants
static const uint8 kAVCPacketTypeSequenceHeader = 0;
static const uint8 kAVCPacketTypeNALU = 1;
// Unused:
// static const uint8 kAVCPacketTypeEndOfSequence = 2;
// SCRIPTDATA parsing constants
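// An AMF0 Number is a 1-byte type marker (0x00) followed by an 8-byte
// big-endian IEEE 754 double, 9 bytes in total.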
static const uint8 kAMF0NumberType = 0x00;
static const int kAMF0NumberLength = 9;
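//
// The 9-byte FLV file header consumed by Construct() is laid out as:
// field | type | comment
// ------------------+--------+---------
// signature | uint24 | "FLV" in ASCII
// version | uint8 | FLV file version, unused here
// type flags | uint8 | bit 0 set: video present, bit 2 set: audio present
// data offset | uint32 | byte offset of the FLV body (PreviousTagSize0)
//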
// static
PipelineStatus ShellFLVParser::Construct(
scoped_refptr<ShellDataSourceReader> reader,
const uint8* construction_header,
scoped_refptr<ShellParser>* parser) {
DCHECK(parser);
*parser = NULL;
// look for "FLV" string at top of file, mask off LSB
uint32 FLV = endian_util::load_uint32_big_endian(construction_header) >> 8;
if (FLV != kFLV) {
// Not an flv.
return DEMUXER_ERROR_COULD_NOT_PARSE;
}
// Check for availability of both an audio and a video stream. The audio flag
// is bit 2 and the video flag is bit 0 of the 5th byte of the file header.
if ((construction_header[4] & 0x05) != 0x05) {
return DEMUXER_ERROR_NO_SUPPORTED_STREAMS;
}
// offset of first data tag in stream is next 4 bytes
uint32 data_offset =
endian_util::load_uint32_big_endian(construction_header + 5);
// add four bytes to skip over PreviousTagSize0
data_offset += 4;
// construct an FLV parser
*parser = new ShellFLVParser(reader, data_offset);
return PIPELINE_OK;
}
ShellFLVParser::ShellFLVParser(scoped_refptr<ShellDataSourceReader> reader,
uint32 tag_start_offset)
: ShellAVCParser(reader),
tag_offset_(tag_start_offset),
at_end_of_file_(false) {}
ShellFLVParser::~ShellFLVParser() {}
bool ShellFLVParser::ParseConfig() {
// traverse the file until we either reach the limit of bytes we're willing to
// parse for config info or we've encountered actual keyframe video data.
while (tag_offset_ < kMetadataMaxBytes && time_to_byte_map_.size() == 0) {
if (!ParseNextTag()) {
return false;
}
}
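// We require a valid duration from the metadata; fail if we never found one.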
if (duration_.InMilliseconds() == 0) {
return false;
}
// We may have a valid duration by now, and the reader may know the length of
// the file in bytes; see if we can extrapolate a bitrate from this.
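// (file size in bytes * 8000) / duration in milliseconds = bits per second.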
if (duration_ != kInfiniteDuration() && reader_->FileSize() > 0) {
bits_per_second_ = (uint32)(
((reader_->FileSize() * 8000ULL) / (duration_.InMilliseconds())));
}
return true;
}
scoped_refptr<ShellAU> ShellFLVParser::GetNextAU(DemuxerStream::Type type) {
if (type == DemuxerStream::AUDIO) {
return GetNextAudioAU();
} else if (type == DemuxerStream::VIDEO) {
return GetNextVideoAU();
} else {
NOTREACHED();
}
return NULL;
}
// seeking an flv:
// 1) find the nearest video keyframe before the timestamp:
//    a) If we are seeking into an area we have already parsed then we
//       will find the bounding keyframe.
//    b) If not, we parse the FLV until a) is true.
// 2) set tag_offset_ to the byte offset of the keyframe found in 1)
bool ShellFLVParser::SeekTo(base::TimeDelta timestamp) {
// convert timestamp to millisecond FLV timestamp
uint32 timestamp_flv = (uint32)timestamp.InMilliseconds();
bool found_upper_bound = false;
uint64 seek_byte_offset = tag_offset_;
uint32 seek_timestamp = 0;
// upper_bound returns iterator of first element in container with key > arg
TimeToByteMap::iterator keyframe_in_map =
time_to_byte_map_.upper_bound(timestamp_flv);
// upper_bound returned end(): this is case 1b), or the keyframe is the last
// keyframe before EOS, or the map is empty (error state)
if (keyframe_in_map == time_to_byte_map_.end()) {
// is map empty? This is an error case, we should always have found a
// keyframe during ParseConfig()
if (time_to_byte_map_.size() == 0) {
NOTREACHED() << "empty time to byte map on FLV seek";
return false;
} else {
// start at last keyframe in the map and parse from there
seek_byte_offset = time_to_byte_map_.rbegin()->second;
}
} else {
found_upper_bound = true;
// it's possible timestamp <= first keyframe in map, in which case we
// use the first keyframe in map.
if (keyframe_in_map != time_to_byte_map_.begin()) {
keyframe_in_map--;
}
seek_byte_offset = keyframe_in_map->second;
seek_timestamp = keyframe_in_map->first;
}
// if seek has changed our position in the file jump there now
if (seek_byte_offset != tag_offset_) {
JumpParserTo(seek_byte_offset);
}
// if found_upper_bound is still false we are in case 1b), parse ahead until
// we encounter an upper bound or an eof
while (!found_upper_bound && !at_end_of_file_) {
// save highest keyframe in file in case it becomes the one we want
seek_byte_offset = time_to_byte_map_.rbegin()->second;
seek_timestamp = time_to_byte_map_.rbegin()->first;
// parse next tag in the file
if (!ParseNextTag()) {
return false;
}
// check the last keyframe timestamp; if it's greater than our target
// timestamp we can stop
found_upper_bound = (time_to_byte_map_.rbegin()->first > timestamp_flv);
}
// make sure we have done step 2), jump parser to new keyframe
if (seek_byte_offset != tag_offset_) {
JumpParserTo(seek_byte_offset);
}
DLOG(INFO) << base::StringPrintf("flv parser seeking to timestamp: %" PRId64
" chose keyframe at %u",
timestamp.InMilliseconds(), seek_timestamp);
return true;
}
scoped_refptr<ShellAU> ShellFLVParser::GetNextAudioAU() {
// As audio timestamps are supposed to increase monotonically we need
// only 2 to calculate a duration.
while (next_audio_aus_.size() < 2) {
if (!ParseNextTag()) {
return NULL;
}
}
// There should always be 2 AUs in the queue now, even if they are both EOS.
DCHECK_GE(next_audio_aus_.size(), 2);
// Extract first AU in queue
scoped_refptr<ShellAU> au(next_audio_aus_.front());
next_audio_aus_.pop_front();
// The next timestamp should be greater than or equal to ours; if not,
// something is very funny with this FLV and we won't be able to calculate a
// duration.
if (next_audio_aus_.front()->GetTimestamp() >= au->GetTimestamp()) {
au->SetDuration(next_audio_aus_.front()->GetTimestamp() -
au->GetTimestamp());
} else {
DLOG(ERROR) << "out of order audio timestamps encountered on FLV parsing.";
}
return au;
}
scoped_refptr<ShellAU> ShellFLVParser::GetNextVideoAU() {
while (next_video_aus_.empty()) {
if (!ParseNextTag()) {
return NULL;
}
}
// extract next video AU
scoped_refptr<ShellAU> au(next_video_aus_.front());
next_video_aus_.pop_front();
return au;
}
//
// byte layout of an FLVTAG is:
// field | type | comment
// ------------------+--------+---------
// previous tag size | uint32 | skipped when we advance past the previous tag
// tag type | uint8 | parsing starts here. describes tag type
// tag data size | uint24 | size of tag data payload (everything after this)
// timestamp | uint24 | lower 24 bits of timestamp in milliseconds
// timestamp ext | uint8 | upper 8 bits of timestamp in milliseconds
// stream id | uint24 | always 0
//
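// Note that tag_offset_ always points at the "tag type" byte of the next tag;
// the 4-byte previous tag size field that precedes each tag is accounted for
// when tag_offset_ is advanced at the end of ParseNextTag().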
bool ShellFLVParser::ParseNextTag() {
uint8 tag_buffer[kTagDownloadSize];
if (at_end_of_file_) {
return false;
}
// read the tag header plus enough extra bytes to cover the audio/video
// sub-headers of the next tag
int bytes_read =
reader_->BlockingRead(tag_offset_, kTagDownloadSize, tag_buffer);
// if that was the last tag in the stream detect the EOS and return. This
// is where normal termination of an FLV stream will occur.
if (bytes_read < kTagDownloadSize) {
at_end_of_file_ = true;
// Normal termination of an FLV. Enqueue EOS AUs in both streams.
next_video_aus_.push_back(ShellAU::CreateEndOfStreamAU(
DemuxerStream::VIDEO, video_track_duration_));
next_audio_aus_.push_back(ShellAU::CreateEndOfStreamAU(
DemuxerStream::AUDIO, audio_track_duration_));
return true;
}
// extract the tag data size from the tag header as a uint24. This is the size
// of the attached data field only, not including this header, but including
// the audio and video sub-headers.
uint32 tag_data_size =
endian_util::load_uint32_big_endian(tag_buffer + 1) >> 8;
// extract the timestamp; bytes 4-6 hold the lower 24 bits and byte 7 the
// upper 8 bits, the wonky byte order comes from the standard
int32 timestamp = tag_buffer[4] << 16 | tag_buffer[5] << 8 | tag_buffer[6] |
tag_buffer[7] << 24;
// choose which tag type to parse
bool parse_result = true;
uint8* tag_body = tag_buffer + kTagSize;
switch (tag_buffer[0]) {
case kAudioTagType:
parse_result = ParseAudioDataTag(tag_body, tag_data_size, timestamp);
break;
case kVideoTagType:
parse_result = ParseVideoDataTag(tag_body, tag_data_size, timestamp);
break;
case kScriptDataObjectTagType:
parse_result =
ParseScriptDataObjectTag(tag_body, tag_data_size, timestamp);
break;
default:
DLOG(WARNING) << base::StringPrintf("unsupported FLV TagType %d",
tag_buffer[0]);
break;
}
// advance the read pointer past this tag's header, its data payload, and the
// trailing 4-byte previous tag size field, to the next tag header
tag_offset_ += kTagSize + tag_data_size + 4;
return parse_result;
}
// FLV AUDIODATA tags are packed into a single byte, bit layout is:
// aaaabbcd
// aaaa: 4 bits format enum, AAC is 10 decimal
// bb: 2 bits sample rate enum, AAC is always 3 decimal (44 kHz)
// c: 1 bit sound size, 0 means 8 bit, 1 means 16 bit, AAC is always 1
// d: 1 bit sound type, 0 means mono, 1 means stereo, AAC is always 1
// if this is an AACAUDIODATA tag the next byte in the sequence is 0 if
// this is an AudioSpecificConfig tag or 1 if it is raw AAC frame data.
//
// * NOTE that FLV standard defines fixed values for sample rate, bit
// width, and channel count for AAC samples but the AudioSpecificConfig may
// define those values differently and is authoritative, so we ignore the
// FLV-provided config values.
bool ShellFLVParser::ParseAudioDataTag(uint8* tag,
uint32 size,
uint32 timestamp) {
// Smallest meaningful size for an audio data tag is 4 bytes: one for the
// AUDIODATA tag, one for the AACAUDIODATA tag, and a minimum of 2 bytes of
// data.
if (size < kAudioTagSize + 2) {
return false;
}
// we only support parsing AAC audio data tags
if (((tag[0] >> 4) & 0x0f) != kSoundFormatTypeAAC) {
return false;
}
// now see if this is a config packet or a data packet
if (tag[1] == kAACPacketTypeSequence) { // audio config info
// AudioSpecificConfig records can be longer than two bytes but we extract
// everything we need from the first two bytes, positioned here at index 2
// and 3 in the tag buffer
ParseAudioSpecificConfig(tag[2], tag[3]);
} else if (tag[1] == kAACPacketTypeRaw) { // raw AAC audio
// this is audio data, check timestamp
base::TimeDelta ts = base::TimeDelta::FromMilliseconds(timestamp);
if (ts > audio_track_duration_) {
audio_track_duration_ = ts;
}
// build the AU
size_t prepend_size = CalculatePrependSize(DemuxerStream::AUDIO, true);
scoped_refptr<ShellAU> au = ShellAU::CreateAudioAU(
tag_offset_ + kTagSize + kAudioTagSize, size - kAudioTagSize,
prepend_size, true, ts, kInfiniteDuration(), this);
next_audio_aus_.push_back(au);
}
return true;
}
// FLV VIDEODATA tags are packed into a single byte, bit layout is:
// aaaabbbb
// aaaa: 4 bits frame type enum, 1 is AVC keyframe, 2 is AVC inter-frame
// bbbb: 4 bits codecID, 7 is AVC
// if this is an AVCVIDEOPACKET tag the next 4 bytes comprise the AVCVIDEOPACKET
// tag header:
// field | type | comment
// ------------------+--------+---------
// AVCPacketType | uint8 | 0 is config, 1 is data, 2 is EOS (ignored)
// CompositionTime | int24 | signed time offset, add to FLV timestamp for pts
//
// NOTE that FLV video data is always presented in decode order and
// CompositionTime is not entirely reliable for determining pts, as some
// encoders always set it to zero.
bool ShellFLVParser::ParseVideoDataTag(uint8* tag,
uint32 size,
uint32 timestamp) {
// need at least 5 bytes of tag data, one for the VIDEODATA tag and 4 for
// the AVCVIDEOPACKET tag header that should always follow it
if (size < kVideoTagSize) {
return false;
}
// check for AVC format
if ((tag[0] & 0x0f) != kCodecIDAVC) {
return false;
}
// determine packet type
if (tag[1] == kAVCPacketTypeSequenceHeader) { // video config info
// AVC config record, download and parse
return DownloadAndParseAVCConfigRecord(
tag_offset_ + kTagSize + kVideoTagSize, size);
} else if (tag[1] == kAVCPacketTypeNALU) { // raw AVC data
// should we add this to our keyframe map?
bool is_keyframe = (tag[0] & 0xf0) == 0x10;
// TODO: when we add support for seeking, make sure these numbers are
// consistent with the numbers provided by the time-to-byte manifest.
if (is_keyframe) {
time_to_byte_map_[timestamp] = tag_offset_;
}
// extract 24-bit composition time offset in big-endian for this frame
int32 composition_time_offset = tag[2] * 65536 + tag[3] * 256 + tag[4];
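// NOTE: the FLV spec defines CompositionTime as a signed 24-bit value; this
// read does not sign-extend, so negative composition offsets are not handled.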
// calculate pts from flv timestamp and cts
uint32 pts = timestamp + composition_time_offset;
// FLV standard says that there can be multiple AVC NALUs packed here, so
// we iterate through the tag data payload and enqueue byte offsets for
// each NALU we encounter. Each NALU is preceded by a size field that is
// nal_header_size_ bytes long, followed by the NALU of that size.
uint32 avc_data_size = size - kVideoTagSize;
uint32 avc_tag_offset = 0;
base::TimeDelta ts = base::TimeDelta::FromMilliseconds(pts);
if (ts > video_track_duration_) {
video_track_duration_ = ts;
}
size_t prepend_size =
CalculatePrependSize(DemuxerStream::VIDEO, is_keyframe);
scoped_refptr<ShellAU> au = ShellAU::CreateVideoAU(
tag_offset_ + kTagSize + kVideoTagSize + avc_tag_offset, avc_data_size,
prepend_size, nal_header_size_, is_keyframe, ts, kInfiniteDuration(),
this);
// enqueue data tag
next_video_aus_.push_back(au);
}
return true;
}
// FLV SCRIPTDATA tags are typically serialized in Action Message Format 0
// (AMF0) as a collection of key/value pairs terminated by a special code. We
// only wish to parse the duration and byterate from the SCRIPTDATA, so we use
// a very lightweight parser in ExtractAMF0Number().
bool ShellFLVParser::ParseScriptDataObjectTag(uint8* tag,
uint32 size,
uint32 timestamp) {
scoped_refptr<ShellScopedArray> script_buffer =
ShellBufferFactory::Instance()->AllocateArray(size);
if (!script_buffer || !script_buffer->Get()) {
return false;
}
int bytes_read =
reader_->BlockingRead(tag_offset_ + kTagSize, size, script_buffer->Get());
DCHECK_LE(size, static_cast<uint32>(std::numeric_limits<int32>::max()));
if (bytes_read < static_cast<int>(size)) {
return false;
}
// Attempt to extract the duration from the FLV metadata.
double duration_seconds = 0;
if (!ExtractAMF0Number(script_buffer, "duration", &duration_seconds)) {
// might be worth trying to parse this as AMF3?
return false;
}
duration_ = base::TimeDelta::FromMicroseconds(
duration_seconds * base::Time::kMicrosecondsPerSecond);
// Try for the byterate too, but this is nonfatal if we can't get it.
double byterate = 0;
if (ExtractAMF0Number(script_buffer, "totaldatarate", &byterate)) {
bits_per_second_ = (uint32)(byterate * 8.0);
}
return true;
}
// The SCRIPTDATA tag contains a list of ordered pairs of AMF0 strings followed
// by an arbitrary AMF0 object. Typically there's one object of interest with
// string name 'onMetaData' followed by an anonymous object. In any event we
// will scan the buffer looking only for the provided string, verify that the
// next bytes in the stream describe an AMF0 Number, and extract and return it.
// TODO: Replace this (brittle) code with a proper AMF0 parser.
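// For reference, the byte pattern this scan relies on looks like (using
// "duration" as the name):
// [u16 name length][name bytes][0x00 Number marker][8-byte big-endian double]
// 00 08 'd' 'u' 'r' 'a' 't' 'i' 'o' 'n' 00 xx xx xx xx xx xx xx xx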
bool ShellFLVParser::ExtractAMF0Number(scoped_refptr<ShellScopedArray> amf0,
const char* name,
double* number_out) {
DCHECK(number_out);
// the string will be preceded by a u16 big-endian string length
uint16 name_length = strlen(name);
// there are lots of nonprinting characters and zeros in AMF0, so we'll need
// to search for the string using our own method
int match_offset = 0;
int name_offset = 0;
// the last index in the buffer at which we could still extract the string
// followed by a Number
int search_length = amf0->Size() - (name_length + kAMF0NumberLength);
uint8* search_buffer = amf0->Get();
while (match_offset <= search_length && name_offset < name_length) {
if (search_buffer[match_offset] == name[name_offset]) {
name_offset++; // advance our substring pointer in the event of a match
} else {
name_offset = 0; // reset our substring pointer on a miss
}
match_offset++; // always advance our larger string pointer
}
// If we got a match, name_offset will point past the end of the search
// string and match_offset will point to valid memory with room to
// extract a Number.
if ((name_offset == name_length) &&
(match_offset <= amf0->Size() - kAMF0NumberLength)) {
// make sure the first byte matches the number type code
if (search_buffer[match_offset] != kAMF0NumberType) {
return false;
}
// advance pointer past the number type to the number itself
match_offset++;
// load the big-endian double as a uint64, then reinterpret its bits as a
// double
uint64 num_as_uint =
endian_util::load_uint64_big_endian(search_buffer + match_offset);
*number_out = *((double*)(&num_as_uint));
return true;
}
return false;
}
void ShellFLVParser::JumpParserTo(uint64 byte_offset) {
next_video_aus_.clear();
next_audio_aus_.clear();
at_end_of_file_ = false;
tag_offset_ = byte_offset;
}
} // namespace media