/*
 * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
| |
| #ifndef VPX_VP9_SIMPLE_ENCODE_H_ |
| #define VPX_VP9_SIMPLE_ENCODE_H_ |
| |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstdio> |
| #include <memory> |
| #include <vector> |
| |
| namespace vp9 { |
| |
// Frame type reported for each coding frame (see EncodeFrameInfo and
// EncodeFrameResult).
// TODO(angiebird): Add description for each frame type.
// The values are spelled out explicitly, like in RefFrameType, so that the
// numbering does not silently change if an enumerator is inserted or moved.
enum FrameType {
  kFrameTypeKey = 0,
  kFrameTypeInter = 1,
  kFrameTypeAltRef = 2,
  kFrameTypeOverlay = 3,
  kFrameTypeGolden = 4,
};
| |
// TODO(angiebird): Add description for each reference frame type.
// The enumerator values have to be contiguous and start from zero, except
// for kRefFrameTypeNone.
// kRefFrameTypeMax is the number of real reference frame types; it is used as
// the length of the per-reference-type arrays in RefFrameInfo.
enum RefFrameType {
  kRefFrameTypeLast = 0,
  kRefFrameTypePast = 1,
  kRefFrameTypeFuture = 2,
  kRefFrameTypeMax = 3,
  kRefFrameTypeNone = -1,
};
| |
// Bit flags stored in each entry of the gop_map that is passed to
// SimpleEncode::SetExternalGroupOfPicturesMap(). The flags occupy distinct
// bits, so they can be combined with bitwise OR.
enum GopMapFlag {
  // Indicate this location is the start of a group of pictures.
  kGopMapFlagStart = 1 << 0,
  // Indicate this group of pictures will use an alt ref. Only set this flag
  // when kGopMapFlagStart is set.
  kGopMapFlagUseAltRef = 1 << 1,
};
| |
// The frame is split into 4x4 blocks.
// This structure contains the information of each 4x4 block.
// All 4x4 blocks that belong to the same prediction block carry the same
// |row_start|, |column_start|, |width| and |height|.
struct PartitionInfo {
  // Row pixel offset of the current 4x4 block.
  int row;
  // Column pixel offset of the current 4x4 block.
  int column;
  // Row pixel offset of the start of the prediction block.
  int row_start;
  // Column pixel offset of the start of the prediction block.
  int column_start;
  // Prediction block width.
  int width;
  // Prediction block height.
  int height;
};
| |
// Precision constant for motion vectors.
// NOTE(review): presumably the number of motion vector units per pixel (i.e.
// 1/8-pel precision), used when converting motion vectors to the pixel-unit
// doubles stored in MotionVectorInfo -- confirm against the code that uses it.
constexpr int kMotionVectorPrecision = 8;
| |
| // In the first pass. The frame is split to 16x16 blocks. |
| // This structure contains the information of each 16x16 block. |
| // In the second pass. The frame is split to 4x4 blocks. |
| // This structure contains the information of each 4x4 block. |
| struct MotionVectorInfo { |
| // Number of valid motion vectors, always 0 if this block is in the key frame. |
| // For inter frames, it could be 1 or 2. |
| int mv_count; |
| // The reference frame for motion vectors. If the second motion vector does |
| // not exist (mv_count = 1), the reference frame is kNoneRefFrame. |
| // Otherwise, the reference frame is either kRefFrameTypeLast, or |
| // kRefFrameTypePast, or kRefFrameTypeFuture. |
| RefFrameType ref_frame[2]; |
| // The row offset of motion vectors in the unit of pixel. |
| // If the second motion vector does not exist, the value is 0. |
| double mv_row[2]; |
| // The column offset of motion vectors in the unit of pixel. |
| // If the second motion vector does not exist, the value is 0. |
| double mv_column[2]; |
| }; |
| |
| struct RefFrameInfo { |
| int coding_indexes[kRefFrameTypeMax]; |
| |
| // Indicate whether the reference frames are available or not. |
| // When the reference frame type is not valid, it means either the to-be-coded |
| // frame is a key frame or the reference frame already appears in other |
| // reference frame type. vp9 always keeps three types of reference frame |
| // available. However, the duplicated reference frames will not be |
| // chosen by the encoder. The priorities of choosing reference frames are |
| // kRefFrameTypeLast > kRefFrameTypePast > kRefFrameTypeFuture. |
| // For example, if kRefFrameTypeLast and kRefFrameTypePast both point to the |
| // same frame, kRefFrameTypePast will be set to invalid. |
| // 1: the ref frame type is available 0: the ref frame type is not available |
| int valid_list[kRefFrameTypeMax]; |
| }; |
| |
| bool operator==(const RefFrameInfo &a, const RefFrameInfo &b); |
| |
| struct EncodeFrameInfo { |
| int show_idx; |
| |
| // Each show or no show frame is assigned with a coding index based on its |
| // coding order (starting from zero) in the coding process of the entire |
| // video. The coding index for each frame is unique. |
| int coding_index; |
| RefFrameInfo ref_frame_info; |
| FrameType frame_type; |
| }; |
| |
// This structure is a copy of vp9 |nmv_component_counts|.
// The fixed-size count arrays of the vp9 structure are represented with
// std::vector here (two nested vectors for the two-dimensional ones).
struct NewMotionvectorComponentCounts {
  std::vector<unsigned int> sign;
  std::vector<unsigned int> classes;
  std::vector<unsigned int> class0;
  std::vector<std::vector<unsigned int>> bits;
  std::vector<std::vector<unsigned int>> class0_fp;
  std::vector<unsigned int> fp;
  std::vector<unsigned int> class0_hp;
  std::vector<unsigned int> hp;
};
| |
| // This structure is a copy of vp9 |nmv_context_counts|. |
| struct NewMotionVectorContextCounts { |
| std::vector<unsigned int> joints; |
| std::vector<NewMotionvectorComponentCounts> comps; |
| }; |
| |
// Multi-dimensional arrays of unsigned counters, expressed as nested
// std::vector. Each higher-rank alias is built from the lower-rank ones, which
// names exactly the same type as spelling out every nesting level.
// Note that there is no 4-D alias; UintArray5D adds two levels to UintArray3D.
using UintArray2D = std::vector<std::vector<unsigned int>>;
using UintArray3D = std::vector<UintArray2D>;
using UintArray5D = std::vector<std::vector<UintArray3D>>;
using UintArray6D = std::vector<UintArray5D>;
| |
| // This structure is a copy of vp9 |tx_counts|. |
| struct TransformSizeCounts { |
| // Transform size found in blocks of partition size 32x32. |
| // First dimension: transform size contexts (2). |
| // Second dimension: transform size type (3: 32x32, 16x16, 8x8) |
| UintArray2D p32x32; |
| // Transform size found in blocks of partition size 16x16. |
| // First dimension: transform size contexts (2). |
| // Second dimension: transform size type (2: 16x16, 8x8) |
| UintArray2D p16x16; |
| // Transform size found in blocks of partition size 8x8. |
| // First dimension: transform size contexts (2). |
| // Second dimension: transform size type (1: 8x8) |
| UintArray2D p8x8; |
| // Overall transform size count. |
| std::vector<unsigned int> tx_totals; |
| }; |
| |
| // This structure is a copy of vp9 |FRAME_COUNTS|. |
| struct FrameCounts { |
| // Intra prediction mode for luma plane. First dimension: block size (4). |
| // Second dimension: intra prediction mode (10). |
| UintArray2D y_mode; |
| // Intra prediction mode for chroma plane. First and second dimension: |
| // intra prediction mode (10). |
| UintArray2D uv_mode; |
| // Partition type. First dimension: partition contexts (16). |
| // Second dimension: partition type (4). |
| UintArray2D partition; |
| // Transform coefficient. |
| UintArray6D coef; |
| // End of block (the position of the last non-zero transform coefficient) |
| UintArray5D eob_branch; |
| // Interpolation filter type. First dimension: switchable filter contexts (4). |
| // Second dimension: filter types (3). |
| UintArray2D switchable_interp; |
| // Inter prediction mode (the motion vector type). |
| // First dimension: inter mode contexts (7). |
| // Second dimension: mode type (4). |
| UintArray2D inter_mode; |
| // Block is intra or inter predicted. First dimension: contexts (4). |
| // Second dimension: type (0 for intra, 1 for inter). |
| UintArray2D intra_inter; |
| // Block is compound predicted (predicted from average of two blocks). |
| // First dimension: contexts (5). |
| // Second dimension: type (0 for single, 1 for compound prediction). |
| UintArray2D comp_inter; |
| // Type of the reference frame. Only one reference frame. |
| // First dimension: context (5). Second dimension: context (2). |
| // Third dimension: count (2). |
| UintArray3D single_ref; |
| // Type of the two reference frames. |
| // First dimension: context (5). Second dimension: count (2). |
| UintArray2D comp_ref; |
| // Block skips transform and quantization, uses prediction as reconstruction. |
| // First dimension: contexts (3). Second dimension: type (0 not skip, 1 skip). |
| UintArray2D skip; |
| // Transform size. |
| TransformSizeCounts tx; |
| // New motion vector. |
| NewMotionVectorContextCounts mv; |
| }; |
| |
// Holds the pixel data of an image with three planes.
struct ImageBuffer {
  // The image data is stored in raster order,
  // i.e. image[plane][r][c] =
  // plane_buffer[plane][r * plane_width[plane] + c].
  // Each plane buffer is owned by this structure.
  std::unique_ptr<unsigned char[]> plane_buffer[3];
  // Width of each plane; also the row stride used in the formula above.
  int plane_width[3];
  // Height of each plane.
  int plane_height[3];
};
| |
| void output_image_buffer(const ImageBuffer &image_buffer, std::FILE *out_file); |
| |
| struct EncodeFrameResult { |
| int show_idx; |
| FrameType frame_type; |
| int coding_idx; |
| RefFrameInfo ref_frame_info; |
| size_t coding_data_bit_size; |
| size_t coding_data_byte_size; |
| // The EncodeFrame will allocate a buffer, write the coding data into the |
| // buffer and give the ownership of the buffer to coding_data. |
| std::unique_ptr<unsigned char[]> coding_data; |
| double psnr; |
| uint64_t sse; |
| int quantize_index; |
| FrameCounts frame_counts; |
| int num_rows_4x4; // number of row units, in size of 4. |
| int num_cols_4x4; // number of column units, in size of 4. |
| // A vector of the partition information of the frame. |
| // The number of elements is |num_rows_4x4| * |num_cols_4x4|. |
| // The frame is divided 4x4 blocks of |num_rows_4x4| rows and |
| // |num_cols_4x4| columns. |
| // Each 4x4 block contains the current pixel position (|row|, |column|), |
| // the start pixel position of the partition (|row_start|, |column_start|), |
| // and the |width|, |height| of the partition. |
| // The current pixel position can be the same as the start pixel position |
| // if the 4x4 block is the top-left block in the partition. Otherwise, they |
| // are different. |
| // Within the same partition, all 4x4 blocks have the same |row_start|, |
| // |column_start|, |width| and |height|. |
| // For example, if the frame is partitioned to a 32x32 block, |
| // starting at (0, 0). Then, there're 64 4x4 blocks within this partition. |
| // They all have the same |row_start|, |column_start|, |width|, |height|, |
| // which can be used to figure out the start of the current partition and |
| // the start of the next partition block. |
| // Horizontal next: |column_start| + |width|, |
| // Vertical next: |row_start| + |height|. |
| std::vector<PartitionInfo> partition_info; |
| // A vector of the motion vector information of the frame. |
| // The number of elements is |num_rows_4x4| * |num_cols_4x4|. |
| // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and |
| // |num_cols_4x4| columns. |
| // Each 4x4 block contains 0 motion vector if this is an intra predicted |
| // frame (for example, the key frame). If the frame is inter predicted, |
| // each 4x4 block contains either 1 or 2 motion vectors. |
| // Similar to partition info, all 4x4 blocks inside the same partition block |
| // share the same motion vector information. |
| std::vector<MotionVectorInfo> motion_vector_info; |
| ImageBuffer coded_frame; |
| }; |
| |
| struct GroupOfPicture { |
| // This list will be updated internally in StartEncode() and |
| // EncodeFrame()/EncodeFrameWithQuantizeIndex(). |
| // In EncodeFrame()/EncodeFrameWithQuantizeIndex(), the update will only be |
| // triggered when the coded frame is the last one in the previous group of |
| // pictures. |
| std::vector<EncodeFrameInfo> encode_frame_list; |
| |
| // Indicates the index of the next coding frame in encode_frame_list. |
| // In other words, EncodeFrameInfo of the next coding frame can be |
| // obtained with encode_frame_list[next_encode_frame_index]. |
| // Internally, next_encode_frame_index will be set to zero after the last |
| // frame of the group of pictures is coded. Otherwise, next_encode_frame_index |
| // will be increased after each EncodeFrame()/EncodeFrameWithQuantizeIndex() |
| // call. |
| int next_encode_frame_index; |
| |
| // Number of show frames in this group of pictures. |
| int show_frame_count; |
| |
| // The show index/timestamp of the earliest show frame in the group of |
| // pictures. |
| int start_show_index; |
| |
| // The coding index of the first coding frame in the group of pictures. |
| int start_coding_index; |
| |
| // Indicates whether this group of pictures starts with a key frame. |
| int first_is_key_frame; |
| |
| // Indicates whether this group of pictures uses an alt ref. |
| int use_alt_ref; |
| |
| // Indicates whether previous group of pictures used an alt ref. |
| int last_gop_use_alt_ref; |
| }; |
| |
| class SimpleEncode { |
| public: |
| // When outfile_path is set, the encoder will output the bitstream in ivf |
| // format. |
| SimpleEncode(int frame_width, int frame_height, int frame_rate_num, |
| int frame_rate_den, int target_bitrate, int num_frames, |
| const char *infile_path, const char *outfile_path = nullptr); |
| ~SimpleEncode(); |
| SimpleEncode(SimpleEncode &) = delete; |
| SimpleEncode &operator=(const SimpleEncode &) = delete; |
| |
| // Adjusts the encoder's coding speed. |
| // If this function is not called, the encoder will use default encode_speed |
| // 0. Call this function before ComputeFirstPassStats() if needed. |
| // The encode_speed is equivalent to --cpu-used of the vpxenc command. |
| // The encode_speed's range should be [0, 9]. |
| // Setting the encode_speed to a higher level will yield faster coding |
| // at the cost of lower compression efficiency. |
| void SetEncodeSpeed(int encode_speed); |
| |
| // Makes encoder compute the first pass stats and store it at |
| // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the |
| // first pass stats. |
| void ComputeFirstPassStats(); |
| |
| // Outputs the first pass stats represented by a 2-D vector. |
| // One can use the frame index at first dimension to retrieve the stats for |
| // each video frame. The stats of each video frame is a vector of 25 double |
| // values. For details, please check FIRSTPASS_STATS in vp9_firstpass.h |
| std::vector<std::vector<double>> ObserveFirstPassStats(); |
| |
| // Outputs the first pass motion vectors represented by a 2-D vector. |
| // One can use the frame index at first dimension to retrieve the mvs for |
| // each video frame. The frame is divided into 16x16 blocks. The number of |
| // elements is round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4). |
| std::vector<std::vector<MotionVectorInfo>> ObserveFirstPassMotionVectors(); |
| |
| // Ouputs a copy of key_frame_map_, a binary vector with size equal to the |
| // number of show frames in the video. For each entry in the vector, 1 |
| // indicates the position is a key frame and 0 indicates it's not a key frame. |
| // This function should be called after ComputeFirstPassStats() |
| std::vector<int> ObserveKeyFrameMap() const; |
| |
| // Sets group of pictures map for coding the entire video. |
| // Each entry in the gop_map corresponds to a show frame in the video. |
| // Therefore, the size of gop_map should equal to the number of show frames in |
| // the entire video. |
| // If a given entry's kGopMapFlagStart is set, it means this is the start of a |
| // gop. Once kGopMapFlagStart is set, one can set kGopMapFlagUseAltRef to |
| // indicate whether this gop use altref. |
| // If a given entry is zero, it means it's in the middle of a gop. |
| // This function should be called only once after ComputeFirstPassStats(), |
| // before StartEncode(). |
| // This API will check and modify the gop_map to satisfy the following |
| // constraints. |
| // 1) Each key frame position should be at the start of a gop. |
| // 2) The last gop should not use an alt ref. |
| void SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size); |
| |
| // Observe the group of pictures map set through |
| // SetExternalGroupOfPicturesMap(). This function should be called after |
| // SetExternalGroupOfPicturesMap(). |
| std::vector<int> ObserveExternalGroupOfPicturesMap(); |
| |
| // Initializes the encoder for actual encoding. |
| // This function should be called after ComputeFirstPassStats(). |
| void StartEncode(); |
| |
| // Frees the encoder. |
| // This function should be called after StartEncode() or EncodeFrame(). |
| void EndEncode(); |
| |
| // The key frame group size includes one key frame plus the number of |
| // following inter frames. Note that the key frame group size only counts the |
| // show frames. The number of no show frames like alternate refereces are not |
| // counted. |
| int GetKeyFrameGroupSize() const; |
| |
| // Provides the group of pictures that the next coding frame is in. |
| // Only call this function between StartEncode() and EndEncode() |
| GroupOfPicture ObserveGroupOfPicture() const; |
| |
| // Gets encode_frame_info for the next coding frame. |
| // Only call this function between StartEncode() and EndEncode() |
| EncodeFrameInfo GetNextEncodeFrameInfo() const; |
| |
| // Encodes a frame |
| // This function should be called after StartEncode() and before EndEncode(). |
| void EncodeFrame(EncodeFrameResult *encode_frame_result); |
| |
| // Encodes a frame with a specific quantize index. |
| // This function should be called after StartEncode() and before EndEncode(). |
| void EncodeFrameWithQuantizeIndex(EncodeFrameResult *encode_frame_result, |
| int quantize_index); |
| |
| // Encode a frame with target frame bits usage. |
| // The encoder will find a quantize index to make the actual frame bits usage |
| // match the target. |
| void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result, |
| int target_frame_bits); |
| |
| // Gets the number of coding frames for the video. The coding frames include |
| // show frame and no show frame. |
| // This function should be called after ComputeFirstPassStats(). |
| int GetCodingFrameNum() const; |
| |
| // Gets the total number of pixels of YUV planes per frame. |
| uint64_t GetFramePixelCount() const; |
| |
| private: |
| // Compute the key frame locations of the video based on first pass stats. |
| // The results are returned as a binary vector with 1s indicating keyframes |
| // and 0s indicating non keyframes. |
| // It has to be called after impl_ptr_->first_pass_stats is computed. |
| std::vector<int> ComputeKeyFrameMap() const; |
| |
| // Updates key_frame_group_size_, reset key_frame_group_index_ and init |
| // ref_frame_info_. |
| void UpdateKeyFrameGroup(int key_frame_show_index); |
| |
| // Update key_frame_group_index_. |
| void PostUpdateKeyFrameGroupIndex(FrameType frame_type); |
| |
| void PostUpdateState(const EncodeFrameResult &encode_frame_result); |
| |
| class EncodeImpl; |
| |
| int frame_width_; // frame width in pixels. |
| int frame_height_; // frame height in pixels. |
| int frame_rate_num_; |
| int frame_rate_den_; |
| int target_bitrate_; |
| int num_frames_; |
| int encode_speed_; |
| |
| std::FILE *in_file_; |
| std::FILE *out_file_; |
| std::unique_ptr<EncodeImpl> impl_ptr_; |
| |
| std::vector<int> key_frame_map_; |
| std::vector<int> gop_map_; |
| GroupOfPicture group_of_picture_; |
| |
| // The key frame group size includes one key frame plus the number of |
| // following inter frames. Note that the key frame group size only counts the |
| // show frames. The number of no show frames like alternate references are not |
| // counted. |
| int key_frame_group_size_; |
| |
| // The index for the to-be-coded show frame in the key frame group. |
| int key_frame_group_index_; |
| |
| // Each show or no show frame is assigned with a coding index based on its |
| // coding order (starting from zero) in the coding process of the entire |
| // video. The coding index of the to-be-coded frame. |
| int frame_coding_index_; |
| |
| // Number of show frames we have coded so far. |
| int show_frame_count_; |
| |
| // TODO(angiebird): Do we need to reset ref_frames_info_ when the next key |
| // frame appears? |
| // Reference frames info of the to-be-coded frame. |
| RefFrameInfo ref_frame_info_; |
| |
| // A 2-D vector of motion vector information of the frame collected |
| // from the first pass. The first dimension is the frame index. |
| // Each frame is divided into 16x16 blocks. The number of elements is |
| // round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4). |
| // Each 16x16 block contains 0 motion vector if this is an intra predicted |
| // frame (for example, the key frame). If the frame is inter predicted, |
| // each 16x16 block contains either 1 or 2 motion vectors. |
| // The first motion vector is always from the LAST_FRAME. |
| // The second motion vector is always from the GOLDEN_FRAME. |
| std::vector<std::vector<MotionVectorInfo>> fp_motion_vector_info_; |
| }; |
| |
| } // namespace vp9 |
| |
| #endif // VPX_VP9_SIMPLE_ENCODE_H_ |