blob: b1c830df5bb617eb0f8c8b8892a70ed9ce30d857 [file] [log] [blame]
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cobalt/media/filters/shell_avc_parser.h"
#include <limits>
#include <vector>
#include "base/logging.h"
#include "base/stringprintf.h"
#include "cobalt/media/base/decoder_buffer.h"
#include "cobalt/media/base/endian_util.h"
#include "cobalt/media/base/media_util.h"
#include "cobalt/media/base/video_types.h"
#include "cobalt/media/filters/shell_au.h"
#include "cobalt/media/filters/shell_rbsp_stream.h"
#include "cobalt/media/formats/mp4/aac.h"
namespace media {
// what's the smallest meaningful AVC config we can parse?
static const int kAVCConfigMinSize = 8;
// lower five bits of first byte in SPS should be 7
static const uint8 kSPSNALType = 7;
ShellAVCParser::ShellAVCParser(scoped_refptr<ShellDataSourceReader> reader,
const scoped_refptr<MediaLog>& media_log)
: ShellParser(reader),
media_log_(media_log),
nal_header_size_(0),
video_prepend_size_(0) {
DCHECK(media_log);
}
ShellAVCParser::~ShellAVCParser() {}
bool ShellAVCParser::Prepend(scoped_refptr<ShellAU> au,
scoped_refptr<DecoderBuffer> buffer) {
// sanity-check inputs
if (!au || !buffer) {
NOTREACHED() << "bad input to Prepend()";
return false;
}
uint8* prepend_buffer = buffer->writable_data();
if (!prepend_buffer) {
NOTREACHED() << "empty/undersized buffer to Prepend()";
return false;
}
if (au->GetType() == DemuxerStream::VIDEO) {
if (au->AddPrepend())
memcpy(prepend_buffer, video_prepend_, video_prepend_size_);
} else if (au->GetType() == DemuxerStream::AUDIO) {
#if defined(COBALT_WIN)
// We use raw AAC instead of ADTS on these platforms.
DCHECK(audio_prepend_.empty());
return true;
#endif
if (audio_prepend_.empty()) // valid ADTS header not available
return false;
// audio, need to copy ADTS header and then add buffer size
uint32 buffer_size = au->GetSize() + audio_prepend_.size();
// we can't express an AU size larger than 13 bits, something's bad here.
if (buffer_size & 0xffffe000) {
return false;
}
memcpy(prepend_buffer, &audio_prepend_[0], audio_prepend_.size());
// OR size into buffer, byte 3 gets 2 MSb of 13-bit size
prepend_buffer[3] |= (uint8)((buffer_size & 0x00001800) >> 11);
// byte 4 gets bits 10-3 of size
prepend_buffer[4] = (uint8)((buffer_size & 0x000007f8) >> 3);
// byte 5 gets bits 2-0 of size
prepend_buffer[5] |= (uint8)((buffer_size & 0x00000007) << 5);
} else {
NOTREACHED() << "unsupported demuxer stream type.";
return false;
}
return true;
}
bool ShellAVCParser::DownloadAndParseAVCConfigRecord(uint64 offset,
uint32 size) {
if (size == 0) {
return false;
}
std::vector<uint8> record_buffer(size);
int bytes_read = reader_->BlockingRead(offset, size, &record_buffer[0]);
DCHECK_LE(size, static_cast<uint32>(std::numeric_limits<int32>::max()));
if (bytes_read < static_cast<int>(size)) {
return false;
}
// ok, successfully downloaded the record, parse it
return ParseAVCConfigRecord(&record_buffer[0], size);
}
// static
bool ShellAVCParser::ParseSPS(const uint8* sps, size_t sps_size,
ShellSPSRecord* record_out) {
DCHECK(sps) << "no sps provided";
DCHECK(record_out) << "no output structure provided";
// first byte is NAL type id, check that it is SPS
if ((*sps & 0x1f) != kSPSNALType) {
DLOG(ERROR) << "bad NAL type on SPS";
return false;
}
// convert SPS NALU to RBSP stream
ShellRBSPStream sps_rbsp(sps + 1, sps_size - 1);
uint8 profile_idc = 0;
if (!sps_rbsp.ReadByte(&profile_idc)) {
DLOG(ERROR) << "failure reading profile_idc from sps RBSP";
return false;
}
// skip 3 constraint flags, 5 reserved bits, and level_idc (16 bits)
sps_rbsp.SkipBytes(2);
// ReadUEV/ReadSEV require a value to be passed by reference but
// there are many times in which we ignore this value.
uint32 disposable_uev = 0;
int32 disposable_sev = 0;
// seq_parameter_set_id
sps_rbsp.ReadUEV(&disposable_uev);
// skip profile-specific encoding information if there
if (profile_idc == 100 || profile_idc == 103 || profile_idc == 110 ||
profile_idc == 122 || profile_idc == 244 || profile_idc == 44 ||
profile_idc == 83 || profile_idc == 86 || profile_idc == 118) {
uint32 chroma_format_idc = 0;
if (!sps_rbsp.ReadUEV(&chroma_format_idc)) {
DLOG(WARNING) << "failure reading chroma_format_idc from sps RBSP";
return false;
}
if (chroma_format_idc == 3) {
// separate_color_plane_flag
sps_rbsp.SkipBits(1);
}
// bit_depth_luma_minus8
sps_rbsp.ReadUEV(&disposable_uev);
// bit_depth_chroma_minus8
sps_rbsp.ReadUEV(&disposable_uev);
// qpprime_y_zero_transform_bypass_flag
sps_rbsp.SkipBits(1);
// seq_scaling_matrix_present_flag
uint8 seq_scaling_matrix_present_flag = 0;
if (!sps_rbsp.ReadBit(&seq_scaling_matrix_present_flag)) {
DLOG(ERROR)
<< "failure reading seq_scaling_matrix_present_flag from sps RBSP";
return false;
}
if (seq_scaling_matrix_present_flag) {
// seq_scaling_list_present_flag[]
sps_rbsp.SkipBits(chroma_format_idc != 3 ? 8 : 12);
}
}
// log2_max_frame_num_minus4
sps_rbsp.ReadUEV(&disposable_uev);
// pic_order_cnt_type
uint32 pic_order_cnt_type = 0;
if (!sps_rbsp.ReadUEV(&pic_order_cnt_type)) {
DLOG(ERROR) << "failure reading pic_order_cnt_type from sps RBSP";
return false;
}
if (pic_order_cnt_type == 0) {
// log2_max_pic_order_cnt_lsb_minus4
sps_rbsp.ReadUEV(&disposable_uev);
} else if (pic_order_cnt_type == 1) {
// delta_pic_order_always_zero_flag
sps_rbsp.SkipBits(1);
// offset_for_non_ref_pic
sps_rbsp.ReadSEV(&disposable_sev);
// offset_for_top_to_bottom_field
sps_rbsp.ReadSEV(&disposable_sev);
// num_ref_frames_in_pic_order_cnt_cycle
uint32 num_ref_frames_in_pic_order_cnt_cycle = 0;
if (!sps_rbsp.ReadUEV(&num_ref_frames_in_pic_order_cnt_cycle)) {
DLOG(ERROR)
<< "failure reading num_ref_frames_in_pic_order_cnt_cycle from sps";
return false;
}
for (uint32 i = 0; i < num_ref_frames_in_pic_order_cnt_cycle; ++i) {
sps_rbsp.ReadSEV(&disposable_sev);
}
}
// number of reference frames used to decode
uint32 num_ref_frames = 0;
if (!sps_rbsp.ReadUEV(&num_ref_frames)) {
DLOG(ERROR) << "failure reading number of ref frames from sps RBSP";
return false;
}
// gaps_in_frame_num_value_allowed_flag
sps_rbsp.SkipBits(1);
// width is calculated from pic_width_in_mbs_minus1
uint32 pic_width_in_mbs_minus1 = 0;
if (!sps_rbsp.ReadUEV(&pic_width_in_mbs_minus1)) {
DLOG(WARNING) << "failure reading image width from sps RBSP";
return false;
}
// 16 pxs per macroblock
uint32 width = (pic_width_in_mbs_minus1 + 1) * 16;
// pic_height_in_map_units_minus1
uint32 pic_height_in_map_units_minus1 = 0;
if (!sps_rbsp.ReadUEV(&pic_height_in_map_units_minus1)) {
DLOG(ERROR)
<< "failure reading pic_height_in_map_uints_minus1 from sps RBSP";
return false;
}
uint8 frame_mbs_only_flag = 0;
if (!sps_rbsp.ReadBit(&frame_mbs_only_flag)) {
DLOG(ERROR) << "failure reading frame_mbs_only_flag from sps RBSP";
return false;
}
uint32 height = (2 - static_cast<uint32>(frame_mbs_only_flag)) *
(pic_height_in_map_units_minus1 + 1) * 16;
if (!frame_mbs_only_flag) {
sps_rbsp.SkipBits(1);
}
// direct_8x8_inference_flag
sps_rbsp.SkipBits(1);
// frame cropping flag
uint8 frame_cropping_flag = 0;
if (!sps_rbsp.ReadBit(&frame_cropping_flag)) {
DLOG(ERROR) << "failure reading frame_cropping_flag from sps RBSP";
return false;
}
// distance in pixels from the associated edge of the media:
//
// <---coded_size---width--------------------->
//
// +------------------------------------------+ ^
// | ^ | |
// | | | |
// | crop_top | |
// | | | |
// | v | height
// | +---------+ | |
// |<--crop_left-->| visible | | |
// | | rect |<--crop_right-->| |
// | +---------+ | |
// | ^ | |
// | | | |
// | crop_bottom | |
// | | | |
// | v | |
// +------------------------------------------+ v
//
uint32 crop_left = 0;
uint32 crop_right = 0;
uint32 crop_top = 0;
uint32 crop_bottom = 0;
// cropping values are stored divided by two
if (frame_cropping_flag) {
if (!sps_rbsp.ReadUEV(&crop_left)) {
DLOG(ERROR) << "failure reading crop_left from sps RBSP";
return false;
}
if (!sps_rbsp.ReadUEV(&crop_right)) {
DLOG(ERROR) << "failure reading crop_right from sps RBSP";
return false;
}
if (!sps_rbsp.ReadUEV(&crop_top)) {
DLOG(ERROR) << "failure reading crop_top from sps RBSP";
return false;
}
if (!sps_rbsp.ReadUEV(&crop_bottom)) {
DLOG(ERROR) << "failure reading crop_bottom from sps RBSP";
return false;
}
crop_left *= 2;
crop_right *= 2;
crop_top *= 2;
crop_bottom *= 2;
}
// remainder of SPS are values we can safely ignore, everything
// checks out, write output structure
int visible_width = width - (crop_left + crop_right);
int visible_height = height - (crop_top + crop_bottom);
record_out->coded_size = gfx::Size(width, height),
record_out->visible_rect =
gfx::Rect(crop_left, crop_top, visible_width, visible_height),
record_out->natural_size = gfx::Size(visible_width, visible_height);
record_out->num_ref_frames = num_ref_frames;
return true;
}
bool ShellAVCParser::ParseAVCConfigRecord(uint8* buffer, uint32 size) {
if (size < kAVCConfigMinSize) {
DLOG(ERROR) << base::StringPrintf("AVC config record bad size: %d", size);
return false;
}
// get the NALU header size
nal_header_size_ = (buffer[4] & 0x03) + 1;
// validate size, needs to be 1, 2 or 4 bytes only
if (nal_header_size_ != 4 && nal_header_size_ != 2 && nal_header_size_ != 1) {
return false;
}
// AVCConfigRecords contain a variable number of SPS NALU
// (Sequence Parameter Set) (Network Abstraction Layer Units)
// from which we can extract width, height, and cropping info.
// That means we need at least 1 SPS NALU in this stream for extraction.
uint8 number_of_sps_nalus = buffer[5] & 0x1f;
if (number_of_sps_nalus == 0) {
DLOG(WARNING) << "got AVCConfigRecord without any SPS NALUs!";
return false;
}
// iterate through SPS NALUs finding one of valid size for our purposes
// (this should usually be the first one), but also advancing through
// the ConfigRecord until we encounter the PPS sets
bool have_valid_sps = false;
int record_offset = 6;
size_t usable_sps_size = 0;
int usable_sps_offset = 0;
for (uint8 i = 0; i < number_of_sps_nalus; i++) {
// make sure we haven't run out of record for the 2-byte size record
DCHECK_LE(size, static_cast<uint32>(std::numeric_limits<int32>::max()));
if (record_offset + 2 > static_cast<int>(size)) {
DLOG(WARNING) << "ran out of AVCConfig record while parsing SPS size.";
return false;
}
// extract 2-byte size of this SPS
size_t sps_size =
endian_util::load_uint16_big_endian(buffer + record_offset);
// advance past the 2-byte size record
record_offset += 2;
// see if we jumped over record size
if (record_offset + sps_size > size) {
DLOG(WARNING) << "ran out of AVCConfig record while parsing SPS blocks.";
return false;
}
if (!have_valid_sps) {
have_valid_sps = true;
// save size and offset for later copying and parsing
usable_sps_size = sps_size;
usable_sps_offset = record_offset;
// continue to iterate through sps records to get to pps which follow
}
record_offset += sps_size;
}
if (!have_valid_sps) {
DLOG(WARNING)
<< "unable to parse a suitable SPS. Perhaps increase max size?";
return false;
}
// we don't strictly require a PPS, so we're even willing to accept that
// this could be the end of the bytestream, but if not the next byte should
// define the number of PPS objects in the record. Not sure if
// specific decoders could decode something without a PPS prepend but this
// doesn't break demuxing so we'll let them complain if that isn't going
// to work for them :)
size_t usable_pps_size = 0;
size_t usable_pps_offset = 0;
bool have_valid_pps = false;
DCHECK_LE(size, static_cast<uint32>(std::numeric_limits<int32>::max()));
if (record_offset + 1 < static_cast<int>(size)) {
uint8 number_of_pps_nalus = buffer[record_offset];
record_offset++;
for (uint8 i = 0; i < number_of_pps_nalus; i++) {
// make sure we don't run out of room for 2-byte size record
DCHECK_LE(size, static_cast<uint32>(std::numeric_limits<int32>::max()));
if (record_offset + 2 >= static_cast<int>(size)) {
DLOG(WARNING) << "ran out of AVCConfig record while parsing PPS size.";
return false;
}
// extract 2-byte size of this PPS
size_t pps_size =
endian_util::load_uint16_big_endian(buffer + record_offset);
record_offset += 2;
// see if there's actually room for this record in the buffer
if (record_offset + pps_size > size) {
DLOG(WARNING)
<< "ran out of AVCConfig record while scanning PPS blocks.";
return false;
}
if (!have_valid_pps) {
have_valid_pps = true;
usable_pps_size = pps_size;
usable_pps_offset = record_offset;
break;
}
}
}
// now we parse the valid SPS we extracted from byte stream earlier.
ShellSPSRecord sps_record;
if (!ParseSPS(buffer + usable_sps_offset, usable_sps_size, &sps_record)) {
DLOG(WARNING) << "error parsing SPS";
return false;
}
// we can now initialize our video decoder config
video_config_.Initialize(kCodecH264,
H264PROFILE_MAIN, // profile is ignored currently
PIXEL_FORMAT_YV12, COLOR_SPACE_HD_REC709,
sps_record.coded_size, sps_record.visible_rect,
sps_record.natural_size, EmptyExtraData(),
Unencrypted());
return BuildAnnexBPrepend(buffer + usable_sps_offset, usable_sps_size,
buffer + usable_pps_offset, usable_pps_size);
}
bool ShellAVCParser::BuildAnnexBPrepend(uint8* sps, uint32 sps_size, uint8* pps,
uint32 pps_size) {
// We will need to attach the sps and pps (if provided) to each keyframe
// video packet, with the AnnexB start code in front of each. Start with
// sps size and start code
video_prepend_size_ = sps_size + kAnnexBStartCodeSize;
if (pps_size > 0) {
// Add pps and pps start code size if needed.
video_prepend_size_ += pps_size + kAnnexBStartCodeSize;
}
// this should be a very rare case for typical videos
if (video_prepend_size_ > kAnnexBPrependMaxSize) {
NOTREACHED() << base::StringPrintf("Bad AnnexB prepend size: %d",
video_prepend_size_);
return false;
}
// start code for sps comes first
endian_util::store_uint32_big_endian(kAnnexBStartCode, video_prepend_);
// followed by sps body
memcpy(video_prepend_ + kAnnexBStartCodeSize, sps, sps_size);
int prepend_offset = kAnnexBStartCodeSize + sps_size;
if (pps_size > 0) {
// pps start code comes next
endian_util::store_uint32_big_endian(kAnnexBStartCode,
video_prepend_ + prepend_offset);
prepend_offset += kAnnexBStartCodeSize;
// followed by pps
memcpy(video_prepend_ + prepend_offset, pps, pps_size);
prepend_offset += pps_size;
}
// make sure we haven't wandered off into memory somewhere
DCHECK_EQ(prepend_offset, video_prepend_size_);
return true;
}
void ShellAVCParser::ParseAudioSpecificConfig(uint8 b0, uint8 b1) {
media::mp4::AAC aac;
std::vector<uint8> aac_config(2);
aac_config[0] = b0;
aac_config[1] = b1;
audio_prepend_.clear();
if (!aac.Parse(aac_config, media_log_) ||
!aac.ConvertEsdsToADTS(&audio_prepend_)) {
DLOG(WARNING) << "Error in parsing AudioSpecificConfig.";
return;
}
// Clear the length, it is 13 bits and stored as ******LL LLLLLLLL LLL*****
// in bytes 3 to 5.
audio_prepend_[3] &= 0xfc;
audio_prepend_[4] = 0;
audio_prepend_[5] &= 0x1f;
#if defined(COBALT_WIN)
// We use raw AAC instead of ADTS on these platforms.
audio_prepend_.clear();
#endif // defined(COBALT_WIN)
const bool kSbrInMimetype = false;
audio_config_.Initialize(
kCodecAAC, kSampleFormatS16, aac.GetChannelLayout(kSbrInMimetype),
aac.GetOutputSamplesPerSecond(kSbrInMimetype), aac.codec_specific_data(),
Unencrypted(), base::TimeDelta(), 0);
}
size_t ShellAVCParser::CalculatePrependSize(DemuxerStream::Type type,
bool is_keyframe) {
size_t prepend_size = 0;
if (type == DemuxerStream::VIDEO) {
bool needs_prepend = is_keyframe;
if (needs_prepend) prepend_size = video_prepend_size_;
} else if (type == DemuxerStream::AUDIO) {
prepend_size = audio_prepend_.size();
} else {
NOTREACHED() << "unsupported stream type";
}
return prepend_size;
}
} // namespace media