/*
 *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
|  |  | 
|  | #ifndef VPX_VP9_SIMPLE_ENCODE_H_ | 
|  | #define VPX_VP9_SIMPLE_ENCODE_H_ | 
|  |  | 
|  | #include <cstddef> | 
|  | #include <cstdint> | 
|  | #include <cstdio> | 
|  | #include <memory> | 
|  | #include <vector> | 
|  |  | 
|  | namespace vp9 { | 
|  |  | 
// Type of a coded frame, as reported through EncodeFrameInfo::frame_type and
// EncodeFrameResult::frame_type.
// The enumerators are numbered consecutively, starting at kFrameTypeKey = 0.
// TODO(angiebird): Add description for each frame type.
enum FrameType {
  kFrameTypeKey = 0,
  kFrameTypeInter,
  kFrameTypeAltRef,
  kFrameTypeOverlay,
  kFrameTypeGolden,
};
|  |  | 
// Reference frame type used by motion vectors and by RefFrameInfo.
// The enumerator values have to be contiguous and start from zero, except for
// kRefFrameTypeNone. kRefFrameTypeMax is the number of real reference frame
// types; it is used to size the per-type arrays in RefFrameInfo.
// TODO(angiebird): Add description for each reference frame type.
enum RefFrameType {
  kRefFrameTypeLast = 0,
  kRefFrameTypePast = 1,
  kRefFrameTypeFuture = 2,
  kRefFrameTypeMax = 3,
  kRefFrameTypeNone = -1,
};
|  |  | 
// Bit flags stored in each entry of the group of pictures map that is passed
// to SimpleEncode::SetExternalGroupOfPicturesMap().
enum GopMapFlag {
  // Indicate this location is the start of a group of pictures.
  kGopMapFlagStart = 1 << 0,
  // Indicate this group of pictures will use an alt ref. Only set this flag
  // when kGopMapFlagStart is set.
  kGopMapFlagUseAltRef = 1 << 1,
};
|  |  | 
// The frame is split into 4x4 blocks.
// This structure contains the partition information of one 4x4 block.
struct PartitionInfo {
  int row;           // row pixel offset of current 4x4 block
  int column;        // column pixel offset of current 4x4 block
  int row_start;     // row pixel offset of the start of the prediction block
  int column_start;  // column pixel offset of the start of the prediction block
  int width;         // prediction block width
  int height;        // prediction block height
};
|  |  | 
// NOTE(review): presumably the number of coded motion vector units per pixel,
// used when converting motion vectors to the pixel-unit values stored in
// MotionVectorInfo -- confirm against the implementation file.
constexpr int kMotionVectorPrecision = 8;
|  |  | 
|  | // In the first pass. The frame is split to 16x16 blocks. | 
|  | // This structure contains the information of each 16x16 block. | 
|  | // In the second pass. The frame is split to 4x4 blocks. | 
|  | // This structure contains the information of each 4x4 block. | 
|  | struct MotionVectorInfo { | 
|  | // Number of valid motion vectors, always 0 if this block is in the key frame. | 
|  | // For inter frames, it could be 1 or 2. | 
|  | int mv_count; | 
|  | // The reference frame for motion vectors. If the second motion vector does | 
|  | // not exist (mv_count = 1), the reference frame is kNoneRefFrame. | 
|  | // Otherwise, the reference frame is either kRefFrameTypeLast, or | 
|  | // kRefFrameTypePast, or kRefFrameTypeFuture. | 
|  | RefFrameType ref_frame[2]; | 
|  | // The row offset of motion vectors in the unit of pixel. | 
|  | // If the second motion vector does not exist, the value is 0. | 
|  | double mv_row[2]; | 
|  | // The column offset of motion vectors in the unit of pixel. | 
|  | // If the second motion vector does not exist, the value is 0. | 
|  | double mv_column[2]; | 
|  | }; | 
|  |  | 
|  | struct RefFrameInfo { | 
|  | int coding_indexes[kRefFrameTypeMax]; | 
|  |  | 
|  | // Indicate whether the reference frames are available or not. | 
|  | // When the reference frame type is not valid, it means either the to-be-coded | 
|  | // frame is a key frame or the reference frame already appears in other | 
|  | // reference frame type. vp9 always keeps three types of reference frame | 
|  | // available.  However, the duplicated reference frames will not be | 
|  | // chosen by the encoder. The priorities of choosing reference frames are | 
|  | // kRefFrameTypeLast > kRefFrameTypePast > kRefFrameTypeFuture. | 
|  | // For example, if kRefFrameTypeLast and kRefFrameTypePast both point to the | 
|  | // same frame, kRefFrameTypePast will be set to invalid. | 
|  | // 1: the ref frame type is available 0: the ref frame type is not available | 
|  | int valid_list[kRefFrameTypeMax]; | 
|  | }; | 
|  |  | 
|  | bool operator==(const RefFrameInfo &a, const RefFrameInfo &b); | 
|  |  | 
|  | struct EncodeFrameInfo { | 
|  | int show_idx; | 
|  |  | 
|  | // Each show or no show frame is assigned with a coding index based on its | 
|  | // coding order (starting from zero) in the coding process of the entire | 
|  | // video. The coding index for each frame is unique. | 
|  | int coding_index; | 
|  | RefFrameInfo ref_frame_info; | 
|  | FrameType frame_type; | 
|  | }; | 
|  |  | 
// This structure is a copy of vp9 |nmv_component_counts|.
// Each fixed-size counter array of the vp9 structure is represented by a
// std::vector (nested for two-dimensional arrays); the vectors are empty until
// they are filled in.
struct NewMotionvectorComponentCounts {
  std::vector<unsigned int> sign;
  std::vector<unsigned int> classes;
  std::vector<unsigned int> class0;
  std::vector<std::vector<unsigned int>> bits;
  std::vector<std::vector<unsigned int>> class0_fp;
  std::vector<unsigned int> fp;
  std::vector<unsigned int> class0_hp;
  std::vector<unsigned int> hp;
};
|  |  | 
|  | // This structure is a copy of vp9 |nmv_context_counts|. | 
|  | struct NewMotionVectorContextCounts { | 
|  | std::vector<unsigned int> joints; | 
|  | std::vector<NewMotionvectorComponentCounts> comps; | 
|  | }; | 
|  |  | 
// N-dimensional arrays of unsigned counters, stored as nested std::vectors.
// Each alias denotes exactly N levels of std::vector around unsigned int.
using UintArray2D = std::vector<std::vector<unsigned int>>;
using UintArray3D = std::vector<UintArray2D>;
using UintArray5D = std::vector<std::vector<UintArray3D>>;
using UintArray6D = std::vector<UintArray5D>;
|  |  | 
|  | // This structure is a copy of vp9 |tx_counts|. | 
|  | struct TransformSizeCounts { | 
|  | // Transform size found in blocks of partition size 32x32. | 
|  | // First dimension: transform size contexts (2). | 
|  | // Second dimension: transform size type (3: 32x32, 16x16, 8x8) | 
|  | UintArray2D p32x32; | 
|  | // Transform size found in blocks of partition size 16x16. | 
|  | // First dimension: transform size contexts (2). | 
|  | // Second dimension: transform size type (2: 16x16, 8x8) | 
|  | UintArray2D p16x16; | 
|  | // Transform size found in blocks of partition size 8x8. | 
|  | // First dimension: transform size contexts (2). | 
|  | // Second dimension: transform size type (1: 8x8) | 
|  | UintArray2D p8x8; | 
|  | // Overall transform size count. | 
|  | std::vector<unsigned int> tx_totals; | 
|  | }; | 
|  |  | 
|  | // This structure is a copy of vp9 |FRAME_COUNTS|. | 
|  | struct FrameCounts { | 
|  | // Intra prediction mode for luma plane. First dimension: block size (4). | 
|  | // Second dimension: intra prediction mode (10). | 
|  | UintArray2D y_mode; | 
|  | // Intra prediction mode for chroma plane. First and second dimension: | 
|  | // intra prediction mode (10). | 
|  | UintArray2D uv_mode; | 
|  | // Partition type. First dimension: partition contexts (16). | 
|  | // Second dimension: partition type (4). | 
|  | UintArray2D partition; | 
|  | // Transform coefficient. | 
|  | UintArray6D coef; | 
|  | // End of block (the position of the last non-zero transform coefficient) | 
|  | UintArray5D eob_branch; | 
|  | // Interpolation filter type. First dimension: switchable filter contexts (4). | 
|  | // Second dimension: filter types (3). | 
|  | UintArray2D switchable_interp; | 
|  | // Inter prediction mode (the motion vector type). | 
|  | // First dimension: inter mode contexts (7). | 
|  | // Second dimension: mode type (4). | 
|  | UintArray2D inter_mode; | 
|  | // Block is intra or inter predicted. First dimension: contexts (4). | 
|  | // Second dimension: type (0 for intra, 1 for inter). | 
|  | UintArray2D intra_inter; | 
|  | // Block is compound predicted (predicted from average of two blocks). | 
|  | // First dimension: contexts (5). | 
|  | // Second dimension: type (0 for single, 1 for compound prediction). | 
|  | UintArray2D comp_inter; | 
|  | // Type of the reference frame. Only one reference frame. | 
|  | // First dimension: context (5). Second dimension: context (2). | 
|  | // Third dimension: count (2). | 
|  | UintArray3D single_ref; | 
|  | // Type of the two reference frames. | 
|  | // First dimension: context (5). Second dimension: count (2). | 
|  | UintArray2D comp_ref; | 
|  | // Block skips transform and quantization, uses prediction as reconstruction. | 
|  | // First dimension: contexts (3). Second dimension: type (0 not skip, 1 skip). | 
|  | UintArray2D skip; | 
|  | // Transform size. | 
|  | TransformSizeCounts tx; | 
|  | // New motion vector. | 
|  | NewMotionVectorContextCounts mv; | 
|  | }; | 
|  |  | 
// A three-plane image. Each plane owns its own pixel buffer and has its own
// width and height.
struct ImageBuffer {
  // The image data is stored in raster order,
  // i.e. image[plane][r][c] =
  // plane_buffer[plane][r * plane_width[plane] + c].
  std::unique_ptr<unsigned char[]> plane_buffer[3];
  int plane_width[3];
  int plane_height[3];
};
|  |  | 
|  | void output_image_buffer(const ImageBuffer &image_buffer, std::FILE *out_file); | 
|  |  | 
|  | struct EncodeFrameResult { | 
|  | int show_idx; | 
|  | FrameType frame_type; | 
|  | int coding_idx; | 
|  | RefFrameInfo ref_frame_info; | 
|  | size_t coding_data_bit_size; | 
|  | size_t coding_data_byte_size; | 
|  | // The EncodeFrame will allocate a buffer, write the coding data into the | 
|  | // buffer and give the ownership of the buffer to coding_data. | 
|  | std::unique_ptr<unsigned char[]> coding_data; | 
|  | double psnr; | 
|  | uint64_t sse; | 
|  | int quantize_index; | 
|  | FrameCounts frame_counts; | 
|  | int num_rows_4x4;  // number of row units, in size of 4. | 
|  | int num_cols_4x4;  // number of column units, in size of 4. | 
|  | // A vector of the partition information of the frame. | 
|  | // The number of elements is |num_rows_4x4| * |num_cols_4x4|. | 
|  | // The frame is divided 4x4 blocks of |num_rows_4x4| rows and | 
|  | // |num_cols_4x4| columns. | 
|  | // Each 4x4 block contains the current pixel position (|row|, |column|), | 
|  | // the start pixel position of the partition (|row_start|, |column_start|), | 
|  | // and the |width|, |height| of the partition. | 
|  | // The current pixel position can be the same as the start pixel position | 
|  | // if the 4x4 block is the top-left block in the partition. Otherwise, they | 
|  | // are different. | 
|  | // Within the same partition, all 4x4 blocks have the same |row_start|, | 
|  | // |column_start|, |width| and |height|. | 
|  | // For example, if the frame is partitioned to a 32x32 block, | 
|  | // starting at (0, 0). Then, there're 64 4x4 blocks within this partition. | 
|  | // They all have the same |row_start|, |column_start|, |width|, |height|, | 
|  | // which can be used to figure out the start of the current partition and | 
|  | // the start of the next partition block. | 
|  | // Horizontal next: |column_start| + |width|, | 
|  | // Vertical next: |row_start| + |height|. | 
|  | std::vector<PartitionInfo> partition_info; | 
|  | // A vector of the motion vector information of the frame. | 
|  | // The number of elements is |num_rows_4x4| * |num_cols_4x4|. | 
|  | // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and | 
|  | // |num_cols_4x4| columns. | 
|  | // Each 4x4 block contains 0 motion vector if this is an intra predicted | 
|  | // frame (for example, the key frame). If the frame is inter predicted, | 
|  | // each 4x4 block contains either 1 or 2 motion vectors. | 
|  | // Similar to partition info, all 4x4 blocks inside the same partition block | 
|  | // share the same motion vector information. | 
|  | std::vector<MotionVectorInfo> motion_vector_info; | 
|  | ImageBuffer coded_frame; | 
|  | }; | 
|  |  | 
|  | struct GroupOfPicture { | 
|  | // This list will be updated internally in StartEncode() and | 
|  | // EncodeFrame()/EncodeFrameWithQuantizeIndex(). | 
|  | // In EncodeFrame()/EncodeFrameWithQuantizeIndex(), the update will only be | 
|  | // triggered when the coded frame is the last one in the previous group of | 
|  | // pictures. | 
|  | std::vector<EncodeFrameInfo> encode_frame_list; | 
|  |  | 
|  | // Indicates the index of the next coding frame in encode_frame_list. | 
|  | // In other words, EncodeFrameInfo of the next coding frame can be | 
|  | // obtained with encode_frame_list[next_encode_frame_index]. | 
|  | // Internally, next_encode_frame_index will be set to zero after the last | 
|  | // frame of the group of pictures is coded. Otherwise, next_encode_frame_index | 
|  | // will be increased after each EncodeFrame()/EncodeFrameWithQuantizeIndex() | 
|  | // call. | 
|  | int next_encode_frame_index; | 
|  |  | 
|  | // Number of show frames in this group of pictures. | 
|  | int show_frame_count; | 
|  |  | 
|  | // The show index/timestamp of the earliest show frame in the group of | 
|  | // pictures. | 
|  | int start_show_index; | 
|  |  | 
|  | // The coding index of the first coding frame in the group of pictures. | 
|  | int start_coding_index; | 
|  |  | 
|  | // Indicates whether this group of pictures starts with a key frame. | 
|  | int first_is_key_frame; | 
|  |  | 
|  | // Indicates whether this group of pictures uses an alt ref. | 
|  | int use_alt_ref; | 
|  |  | 
|  | // Indicates whether previous group of pictures used an alt ref. | 
|  | int last_gop_use_alt_ref; | 
|  | }; | 
|  |  | 
|  | class SimpleEncode { | 
|  | public: | 
|  | // When outfile_path is set, the encoder will output the bitstream in ivf | 
|  | // format. | 
|  | SimpleEncode(int frame_width, int frame_height, int frame_rate_num, | 
|  | int frame_rate_den, int target_bitrate, int num_frames, | 
|  | const char *infile_path, const char *outfile_path = nullptr); | 
|  | ~SimpleEncode(); | 
|  | SimpleEncode(SimpleEncode &) = delete; | 
|  | SimpleEncode &operator=(const SimpleEncode &) = delete; | 
|  |  | 
|  | // Adjusts the encoder's coding speed. | 
|  | // If this function is not called, the encoder will use default encode_speed | 
|  | // 0. Call this function before ComputeFirstPassStats() if needed. | 
|  | // The encode_speed is equivalent to --cpu-used of the vpxenc command. | 
|  | // The encode_speed's range should be [0, 9]. | 
|  | // Setting the encode_speed to a higher level will yield faster coding | 
|  | // at the cost of lower compression efficiency. | 
|  | void SetEncodeSpeed(int encode_speed); | 
|  |  | 
|  | // Makes encoder compute the first pass stats and store it at | 
|  | // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the | 
|  | // first pass stats. | 
|  | void ComputeFirstPassStats(); | 
|  |  | 
|  | // Outputs the first pass stats represented by a 2-D vector. | 
|  | // One can use the frame index at first dimension to retrieve the stats for | 
|  | // each video frame. The stats of each video frame is a vector of 25 double | 
|  | // values. For details, please check FIRSTPASS_STATS in vp9_firstpass.h | 
|  | std::vector<std::vector<double>> ObserveFirstPassStats(); | 
|  |  | 
|  | // Outputs the first pass motion vectors represented by a 2-D vector. | 
|  | // One can use the frame index at first dimension to retrieve the mvs for | 
|  | // each video frame. The frame is divided into 16x16 blocks. The number of | 
|  | // elements is round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4). | 
|  | std::vector<std::vector<MotionVectorInfo>> ObserveFirstPassMotionVectors(); | 
|  |  | 
|  | // Ouputs a copy of key_frame_map_, a binary vector with size equal to the | 
|  | // number of show frames in the video. For each entry in the vector, 1 | 
|  | // indicates the position is a key frame and 0 indicates it's not a key frame. | 
|  | // This function should be called after ComputeFirstPassStats() | 
|  | std::vector<int> ObserveKeyFrameMap() const; | 
|  |  | 
|  | // Sets group of pictures map for coding the entire video. | 
|  | // Each entry in the gop_map corresponds to a show frame in the video. | 
|  | // Therefore, the size of gop_map should equal to the number of show frames in | 
|  | // the entire video. | 
|  | // If a given entry's kGopMapFlagStart is set, it means this is the start of a | 
|  | // gop. Once kGopMapFlagStart is set, one can set kGopMapFlagUseAltRef to | 
|  | // indicate whether this gop use altref. | 
|  | // If a given entry is zero, it means it's in the middle of a gop. | 
|  | // This function should be called only once after ComputeFirstPassStats(), | 
|  | // before StartEncode(). | 
|  | // This API will check and modify the gop_map to satisfy the following | 
|  | // constraints. | 
|  | // 1) Each key frame position should be at the start of a gop. | 
|  | // 2) The last gop should not use an alt ref. | 
|  | void SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size); | 
|  |  | 
|  | // Observe the group of pictures map set through | 
|  | // SetExternalGroupOfPicturesMap(). This function should be called after | 
|  | // SetExternalGroupOfPicturesMap(). | 
|  | std::vector<int> ObserveExternalGroupOfPicturesMap(); | 
|  |  | 
|  | // Initializes the encoder for actual encoding. | 
|  | // This function should be called after ComputeFirstPassStats(). | 
|  | void StartEncode(); | 
|  |  | 
|  | // Frees the encoder. | 
|  | // This function should be called after StartEncode() or EncodeFrame(). | 
|  | void EndEncode(); | 
|  |  | 
|  | // The key frame group size includes one key frame plus the number of | 
|  | // following inter frames. Note that the key frame group size only counts the | 
|  | // show frames. The number of no show frames like alternate refereces are not | 
|  | // counted. | 
|  | int GetKeyFrameGroupSize() const; | 
|  |  | 
|  | // Provides the group of pictures that the next coding frame is in. | 
|  | // Only call this function between StartEncode() and EndEncode() | 
|  | GroupOfPicture ObserveGroupOfPicture() const; | 
|  |  | 
|  | // Gets encode_frame_info for the next coding frame. | 
|  | // Only call this function between StartEncode() and EndEncode() | 
|  | EncodeFrameInfo GetNextEncodeFrameInfo() const; | 
|  |  | 
|  | // Encodes a frame | 
|  | // This function should be called after StartEncode() and before EndEncode(). | 
|  | void EncodeFrame(EncodeFrameResult *encode_frame_result); | 
|  |  | 
|  | // Encodes a frame with a specific quantize index. | 
|  | // This function should be called after StartEncode() and before EndEncode(). | 
|  | void EncodeFrameWithQuantizeIndex(EncodeFrameResult *encode_frame_result, | 
|  | int quantize_index); | 
|  |  | 
|  | // Encode a frame with target frame bits usage. | 
|  | // The encoder will find a quantize index to make the actual frame bits usage | 
|  | // match the target. | 
|  | void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result, | 
|  | int target_frame_bits); | 
|  |  | 
|  | // Gets the number of coding frames for the video. The coding frames include | 
|  | // show frame and no show frame. | 
|  | // This function should be called after ComputeFirstPassStats(). | 
|  | int GetCodingFrameNum() const; | 
|  |  | 
|  | // Gets the total number of pixels of YUV planes per frame. | 
|  | uint64_t GetFramePixelCount() const; | 
|  |  | 
|  | private: | 
|  | // Compute the key frame locations of the video based on first pass stats. | 
|  | // The results are returned as a binary vector with 1s indicating keyframes | 
|  | // and 0s indicating non keyframes. | 
|  | // It has to be called after impl_ptr_->first_pass_stats is computed. | 
|  | std::vector<int> ComputeKeyFrameMap() const; | 
|  |  | 
|  | // Updates key_frame_group_size_, reset key_frame_group_index_ and init | 
|  | // ref_frame_info_. | 
|  | void UpdateKeyFrameGroup(int key_frame_show_index); | 
|  |  | 
|  | // Update key_frame_group_index_. | 
|  | void PostUpdateKeyFrameGroupIndex(FrameType frame_type); | 
|  |  | 
|  | void PostUpdateState(const EncodeFrameResult &encode_frame_result); | 
|  |  | 
|  | class EncodeImpl; | 
|  |  | 
|  | int frame_width_;   // frame width in pixels. | 
|  | int frame_height_;  // frame height in pixels. | 
|  | int frame_rate_num_; | 
|  | int frame_rate_den_; | 
|  | int target_bitrate_; | 
|  | int num_frames_; | 
|  | int encode_speed_; | 
|  |  | 
|  | std::FILE *in_file_; | 
|  | std::FILE *out_file_; | 
|  | std::unique_ptr<EncodeImpl> impl_ptr_; | 
|  |  | 
|  | std::vector<int> key_frame_map_; | 
|  | std::vector<int> gop_map_; | 
|  | GroupOfPicture group_of_picture_; | 
|  |  | 
|  | // The key frame group size includes one key frame plus the number of | 
|  | // following inter frames. Note that the key frame group size only counts the | 
|  | // show frames. The number of no show frames like alternate references are not | 
|  | // counted. | 
|  | int key_frame_group_size_; | 
|  |  | 
|  | // The index for the to-be-coded show frame in the key frame group. | 
|  | int key_frame_group_index_; | 
|  |  | 
|  | // Each show or no show frame is assigned with a coding index based on its | 
|  | // coding order (starting from zero) in the coding process of the entire | 
|  | // video. The coding index of the to-be-coded frame. | 
|  | int frame_coding_index_; | 
|  |  | 
|  | // Number of show frames we have coded so far. | 
|  | int show_frame_count_; | 
|  |  | 
|  | // TODO(angiebird): Do we need to reset ref_frames_info_ when the next key | 
|  | // frame appears? | 
|  | // Reference frames info of the to-be-coded frame. | 
|  | RefFrameInfo ref_frame_info_; | 
|  |  | 
|  | // A 2-D vector of motion vector information of the frame collected | 
|  | // from the first pass. The first dimension is the frame index. | 
|  | // Each frame is divided into 16x16 blocks. The number of elements is | 
|  | // round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4). | 
|  | // Each 16x16 block contains 0 motion vector if this is an intra predicted | 
|  | // frame (for example, the key frame). If the frame is inter predicted, | 
|  | // each 16x16 block contains either 1 or 2 motion vectors. | 
|  | // The first motion vector is always from the LAST_FRAME. | 
|  | // The second motion vector is always from the GOLDEN_FRAME. | 
|  | std::vector<std::vector<MotionVectorInfo>> fp_motion_vector_info_; | 
|  | }; | 
|  |  | 
|  | }  // namespace vp9 | 
|  |  | 
|  | #endif  // VPX_VP9_SIMPLE_ENCODE_H_ |