src/third_party/libvpx/vp9/common/vp9_idct.c - cobalt - Git at Google

 /*
  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include <math.h>

 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idct.h"
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"

 // Applies the 4x4 inverse hybrid transform selected by tx_type to input and
 // folds the result into dest.
 //
 //   input   - coefficients, consumed as 4 consecutive rows of 4 values.
 //   dest    - pixel block updated in place; row r, column c lives at
 //             dest[r * stride + c].
 //   stride  - element distance between vertically adjacent dest pixels.
 //   tx_type - index into the transform table below. It is not range checked
 //             here, so callers must pass 0..3.
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   // Static so the table is not rebuilt on every call, matching the
   // file-scope IHT_8 and IHT_16 tables.
   static const transform_2d IHT_4[] = {
     { idct4_c, idct4_c },   // DCT_DCT  = 0
     { iadst4_c, idct4_c },  // ADST_DCT = 1
     { idct4_c, iadst4_c },  // DCT_ADST = 2
     { iadst4_c, iadst4_c }  // ADST_ADST = 3
   };
   const transform_2d *const xform = &IHT_4[tx_type];
   tran_low_t row_pass[4 * 4];
   tran_low_t col_in[4], col_out[4];
   int r, c;

   // Pass 1: run the row transform over each input row.
   for (r = 0; r < 4; ++r) {
     xform->rows(input + r * 4, row_pass + r * 4);
   }

   // Pass 2: gather each column of the intermediate block, run the column
   // transform, then merge ROUND_POWER_OF_TWO(value, 4) into the existing
   // dest pixel through clip_pixel_add().
   for (c = 0; c < 4; ++c) {
     for (r = 0; r < 4; ++r) col_in[r] = row_pass[r * 4 + c];
     xform->cols(col_in, col_out);
     for (r = 0; r < 4; ++r) {
       uint8_t *const px = &dest[r * stride + c];
       *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 4));
     }
   }
 }

 // 1-D transform pairs for the 8x8 inverse hybrid transform, indexed by
 // tx_type (values noted per row). vp9_iht8x8_64_add_c() invokes the selected
 // entry's .rows and .cols members. Which initializer position maps to which
 // member is fixed by the transform_2d declaration, which is not in this file.
 static const transform_2d IHT_8[] = {
   { idct8_c, idct8_c },   // DCT_DCT  = 0
   { iadst8_c, idct8_c },  // ADST_DCT = 1
   { idct8_c, iadst8_c },  // DCT_ADST = 2
   { iadst8_c, iadst8_c }  // ADST_ADST = 3
 };

 // Applies the 8x8 inverse hybrid transform selected by tx_type to input and
 // folds the result into dest.
 //
 //   input   - coefficients, consumed as 8 consecutive rows of 8 values.
 //   dest    - pixel block updated in place; row r, column c lives at
 //             dest[r * stride + c].
 //   stride  - element distance between vertically adjacent dest pixels.
 //   tx_type - index into IHT_8; not range checked here (valid: 0..3).
 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   const transform_2d *const xform = &IHT_8[tx_type];
   tran_low_t row_pass[8 * 8];
   tran_low_t col_in[8], col_out[8];
   int r, c;

   // Pass 1: run the row transform over each input row.
   for (r = 0; r < 8; ++r) {
     xform->rows(input + r * 8, row_pass + r * 8);
   }

   // Pass 2: gather each column of the intermediate block, run the column
   // transform, then merge ROUND_POWER_OF_TWO(value, 5) into the existing
   // dest pixel through clip_pixel_add().
   for (c = 0; c < 8; ++c) {
     for (r = 0; r < 8; ++r) col_in[r] = row_pass[r * 8 + c];
     xform->cols(col_in, col_out);
     for (r = 0; r < 8; ++r) {
       uint8_t *const px = &dest[r * stride + c];
       *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5));
     }
   }
 }

 // 1-D transform pairs for the 16x16 inverse hybrid transform, indexed by
 // tx_type (values noted per row). vp9_iht16x16_256_add_c() invokes the
 // selected entry's .rows and .cols members. Which initializer position maps
 // to which member is fixed by the transform_2d declaration, which is not in
 // this file.
 static const transform_2d IHT_16[] = {
   { idct16_c, idct16_c },   // DCT_DCT  = 0
   { iadst16_c, idct16_c },  // ADST_DCT = 1
   { idct16_c, iadst16_c },  // DCT_ADST = 2
   { iadst16_c, iadst16_c }  // ADST_ADST = 3
 };

 // Applies the 16x16 inverse hybrid transform selected by tx_type to input and
 // folds the result into dest.
 //
 //   input   - coefficients, consumed as 16 consecutive rows of 16 values.
 //   dest    - pixel block updated in place; row r, column c lives at
 //             dest[r * stride + c].
 //   stride  - element distance between vertically adjacent dest pixels.
 //   tx_type - index into IHT_16; not range checked here (valid: 0..3).
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   const transform_2d *const xform = &IHT_16[tx_type];
   tran_low_t row_pass[16 * 16];
   tran_low_t col_in[16], col_out[16];
   int r, c;

   // Pass 1: run the row transform over each input row.
   for (r = 0; r < 16; ++r) {
     xform->rows(input + r * 16, row_pass + r * 16);
   }

   // Pass 2: gather each column of the intermediate block, run the column
   // transform, then merge ROUND_POWER_OF_TWO(value, 6) into the existing
   // dest pixel through clip_pixel_add().
   for (c = 0; c < 16; ++c) {
     for (r = 0; r < 16; ++r) col_in[r] = row_pass[r * 16 + c];
     xform->cols(col_in, col_out);
     for (r = 0; r < 16; ++r) {
       uint8_t *const px = &dest[r * stride + c];
       *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
     }
   }
 }

 // idct
 // Picks the 4x4 inverse DCT-and-add variant from eob:
 //   eob > 1   -> vpx_idct4x4_16_add
 //   otherwise -> vpx_idct4x4_1_add
 // input, dest and stride are forwarded unchanged.
 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob) {
   if (eob <= 1) {
     vpx_idct4x4_1_add(input, dest, stride);
     return;
   }
   vpx_idct4x4_16_add(input, dest, stride);
 }

 // Picks the 4x4 iwht-and-add variant from eob:
 //   eob > 1   -> vpx_iwht4x4_16_add
 //   otherwise -> vpx_iwht4x4_1_add
 // input, dest and stride are forwarded unchanged.
 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob) {
   if (eob <= 1) {
     vpx_iwht4x4_1_add(input, dest, stride);
     return;
   }
   vpx_iwht4x4_16_add(input, dest, stride);
 }

 // Picks an 8x8 inverse DCT-and-add variant from eob, so blocks with few
 // non-zero DCT coefficients can take a simpler calculation:
 //   eob == 1  -> vpx_idct8x8_1_add   (DC-only DCT coefficient)
 //   eob <= 12 -> vpx_idct8x8_12_add
 //   otherwise -> vpx_idct8x8_64_add
 //
 // Note: if dc is 1, input[0] is the reconstructed value and needs no
 // dequantization; dc is then also counted in eob, i.e. eob >= 1.
 void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob) {
   if (eob == 1) {
     vpx_idct8x8_1_add(input, dest, stride);
     return;
   }
   if (eob <= 12) {
     vpx_idct8x8_12_add(input, dest, stride);
     return;
   }
   vpx_idct8x8_64_add(input, dest, stride);
 }

 // Picks a 16x16 inverse DCT-and-add variant from eob, so blocks with few
 // non-zero DCT coefficients can take a simpler calculation:
 //   eob == 1  -> vpx_idct16x16_1_add   (DC-only DCT coefficient)
 //   eob <= 10 -> vpx_idct16x16_10_add
 //   eob <= 38 -> vpx_idct16x16_38_add
 //   otherwise -> vpx_idct16x16_256_add
 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
   if (eob == 1) {
     vpx_idct16x16_1_add(input, dest, stride);
     return;
   }
   if (eob <= 10) {
     vpx_idct16x16_10_add(input, dest, stride);
     return;
   }
   if (eob <= 38) {
     vpx_idct16x16_38_add(input, dest, stride);
     return;
   }
   vpx_idct16x16_256_add(input, dest, stride);
 }

 // Picks a 32x32 inverse DCT-and-add variant from eob:
 //   eob == 1   -> vpx_idct32x32_1_add
 //   eob <= 34  -> vpx_idct32x32_34_add   (non-zero coeff only in the
 //                                         upper-left 8x8)
 //   eob <= 135 -> vpx_idct32x32_135_add  (non-zero coeff only in the
 //                                         upper-left 16x16)
 //   otherwise  -> vpx_idct32x32_1024_add
 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
   if (eob == 1) {
     vpx_idct32x32_1_add(input, dest, stride);
     return;
   }
   if (eob <= 34) {
     vpx_idct32x32_34_add(input, dest, stride);
     return;
   }
   if (eob <= 135) {
     vpx_idct32x32_135_add(input, dest, stride);
     return;
   }
   vpx_idct32x32_1024_add(input, dest, stride);
 }

 // iht
 // 4x4 inverse transform entry point. DCT_DCT goes through vp9_idct4x4_add(),
 // which uses eob; every other tx_type goes to vp9_iht4x4_16_add(), which
 // receives tx_type and does not receive eob.
 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob) {
   if (tx_type != DCT_DCT) {
     vp9_iht4x4_16_add(input, dest, stride, tx_type);
     return;
   }
   vp9_idct4x4_add(input, dest, stride, eob);
 }

 // 8x8 inverse transform entry point. DCT_DCT goes through vp9_idct8x8_add(),
 // which uses eob; every other tx_type goes to vp9_iht8x8_64_add(), which
 // receives tx_type and does not receive eob.
 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob) {
   if (tx_type != DCT_DCT) {
     vp9_iht8x8_64_add(input, dest, stride, tx_type);
     return;
   }
   vp9_idct8x8_add(input, dest, stride, eob);
 }

 // 16x16 inverse transform entry point. DCT_DCT goes through
 // vp9_idct16x16_add(), which uses eob; every other tx_type goes to
 // vp9_iht16x16_256_add(), which receives tx_type and does not receive eob.
 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                       int stride, int eob) {
   if (tx_type != DCT_DCT) {
     vp9_iht16x16_256_add(input, dest, stride, tx_type);
     return;
   }
   vp9_idct16x16_add(input, dest, stride, eob);
 }

 #if CONFIG_VP9_HIGHBITDEPTH

 // High-bitdepth 4x4 inverse hybrid transform: applies the transform pair
 // selected by tx_type to input and folds the result into dest.
 //
 //   input   - coefficients, consumed as 4 consecutive rows of 4 values.
 //   dest    - 16-bit pixel block updated in place; row r, column c lives at
 //             dest[r * stride + c].
 //   stride  - element distance between vertically adjacent dest pixels.
 //   tx_type - index into the transform table below. It is not range checked
 //             here, so callers must pass 0..3.
 //   bd      - forwarded unchanged to the 1-D transforms and to
 //             highbd_clip_pixel_add().
 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int tx_type, int bd) {
   // Static so the table is not rebuilt on every call, matching the
   // file-scope HIGH_IHT_8 and HIGH_IHT_16 tables.
   static const highbd_transform_2d IHT_4[] = {
     { vpx_highbd_idct4_c, vpx_highbd_idct4_c },   // DCT_DCT  = 0
     { vpx_highbd_iadst4_c, vpx_highbd_idct4_c },  // ADST_DCT = 1
     { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },  // DCT_ADST = 2
     { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }  // ADST_ADST = 3
   };
   const highbd_transform_2d *const xform = &IHT_4[tx_type];
   tran_low_t row_pass[4 * 4];
   tran_low_t col_in[4], col_out[4];
   int r, c;

   // Pass 1: run the row transform over each input row.
   for (r = 0; r < 4; ++r) {
     xform->rows(input + r * 4, row_pass + r * 4, bd);
   }

   // Pass 2: gather each column of the intermediate block, run the column
   // transform, then merge ROUND_POWER_OF_TWO(value, 4) into the existing
   // dest pixel through highbd_clip_pixel_add().
   for (c = 0; c < 4; ++c) {
     for (r = 0; r < 4; ++r) col_in[r] = row_pass[r * 4 + c];
     xform->cols(col_in, col_out, bd);
     for (r = 0; r < 4; ++r) {
       uint16_t *const px = &dest[r * stride + c];
       *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 4),
                                   bd);
     }
   }
 }

 // High-bitdepth 1-D transform pairs for the 8x8 inverse hybrid transform,
 // indexed by tx_type (values noted per row). vp9_highbd_iht8x8_64_add_c()
 // invokes the selected entry's .rows and .cols members. Which initializer
 // position maps to which member is fixed by the highbd_transform_2d
 // declaration, which is not in this file.
 static const highbd_transform_2d HIGH_IHT_8[] = {
   { vpx_highbd_idct8_c, vpx_highbd_idct8_c },   // DCT_DCT  = 0
   { vpx_highbd_iadst8_c, vpx_highbd_idct8_c },  // ADST_DCT = 1
   { vpx_highbd_idct8_c, vpx_highbd_iadst8_c },  // DCT_ADST = 2
   { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }  // ADST_ADST = 3
 };

 // High-bitdepth 8x8 inverse hybrid transform: applies the transform pair
 // selected by tx_type to input and folds the result into dest.
 //
 //   input   - coefficients, consumed as 8 consecutive rows of 8 values.
 //   dest    - 16-bit pixel block updated in place; row r, column c lives at
 //             dest[r * stride + c].
 //   stride  - element distance between vertically adjacent dest pixels.
 //   tx_type - index into HIGH_IHT_8; not range checked here (valid: 0..3).
 //   bd      - forwarded unchanged to the 1-D transforms and to
 //             highbd_clip_pixel_add().
 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int tx_type, int bd) {
   const highbd_transform_2d *const xform = &HIGH_IHT_8[tx_type];
   tran_low_t row_pass[8 * 8];
   tran_low_t col_in[8], col_out[8];
   int r, c;

   // Pass 1: run the row transform over each input row.
   for (r = 0; r < 8; ++r) {
     xform->rows(input + r * 8, row_pass + r * 8, bd);
   }

   // Pass 2: gather each column of the intermediate block, run the column
   // transform, then merge ROUND_POWER_OF_TWO(value, 5) into the existing
   // dest pixel through highbd_clip_pixel_add().
   for (c = 0; c < 8; ++c) {
     for (r = 0; r < 8; ++r) col_in[r] = row_pass[r * 8 + c];
     xform->cols(col_in, col_out, bd);
     for (r = 0; r < 8; ++r) {
       uint16_t *const px = &dest[r * stride + c];
       *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5),
                                   bd);
     }
   }
 }

 // High-bitdepth 1-D transform pairs for the 16x16 inverse hybrid transform,
 // indexed by tx_type (values noted per row). vp9_highbd_iht16x16_256_add_c()
 // invokes the selected entry's .rows and .cols members. Which initializer
 // position maps to which member is fixed by the highbd_transform_2d
 // declaration, which is not in this file.
 static const highbd_transform_2d HIGH_IHT_16[] = {
   { vpx_highbd_idct16_c, vpx_highbd_idct16_c },   // DCT_DCT  = 0
   { vpx_highbd_iadst16_c, vpx_highbd_idct16_c },  // ADST_DCT = 1
   { vpx_highbd_idct16_c, vpx_highbd_iadst16_c },  // DCT_ADST = 2
   { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }  // ADST_ADST = 3
 };

 // High-bitdepth 16x16 inverse hybrid transform: applies the transform pair
 // selected by tx_type to input and folds the result into dest.
 //
 //   input   - coefficients, consumed as 16 consecutive rows of 16 values.
 //   dest    - 16-bit pixel block updated in place; row r, column c lives at
 //             dest[r * stride + c].
 //   stride  - element distance between vertically adjacent dest pixels.
 //   tx_type - index into HIGH_IHT_16; not range checked here (valid: 0..3).
 //   bd      - forwarded unchanged to the 1-D transforms and to
 //             highbd_clip_pixel_add().
 void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int tx_type, int bd) {
   const highbd_transform_2d *const xform = &HIGH_IHT_16[tx_type];
   tran_low_t row_pass[16 * 16];
   tran_low_t col_in[16], col_out[16];
   int r, c;

   // Pass 1: run the row transform over each input row.
   for (r = 0; r < 16; ++r) {
     xform->rows(input + r * 16, row_pass + r * 16, bd);
   }

   // Pass 2: gather each column of the intermediate block, run the column
   // transform, then merge ROUND_POWER_OF_TWO(value, 6) into the existing
   // dest pixel through highbd_clip_pixel_add().
   for (c = 0; c < 16; ++c) {
     for (r = 0; r < 16; ++r) col_in[r] = row_pass[r * 16 + c];
     xform->cols(col_in, col_out, bd);
     for (r = 0; r < 16; ++r) {
       uint16_t *const px = &dest[r * stride + c];
       *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6),
                                   bd);
     }
   }
 }

 // idct
 // Picks the high-bitdepth 4x4 inverse DCT-and-add variant from eob:
 //   eob > 1   -> vpx_highbd_idct4x4_16_add
 //   otherwise -> vpx_highbd_idct4x4_1_add
 // input, dest, stride and bd are forwarded unchanged.
 void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd) {
   if (eob <= 1) {
     vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
     return;
   }
   vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
 }

 // Picks the high-bitdepth 4x4 iwht-and-add variant from eob:
 //   eob > 1   -> vpx_highbd_iwht4x4_16_add
 //   otherwise -> vpx_highbd_iwht4x4_1_add
 // input, dest, stride and bd are forwarded unchanged.
 void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd) {
   if (eob <= 1) {
     vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
     return;
   }
   vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
 }

 // Picks a high-bitdepth 8x8 inverse DCT-and-add variant from eob, so blocks
 // with few non-zero DCT coefficients can take a simpler calculation:
 //   eob == 1  -> vpx_highbd_idct8x8_1_add   (DC-only DCT coefficient)
 //   eob <= 12 -> vpx_highbd_idct8x8_12_add
 //   otherwise -> vpx_highbd_idct8x8_64_add
 //
 // Note: if dc is 1, input[0] is the reconstructed value and needs no
 // dequantization; dc is then also counted in eob, i.e. eob >= 1.
 void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd) {
   if (eob == 1) {
     vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
     return;
   }
   if (eob <= 12) {
     vpx_highbd_idct8x8_12_add(input, dest, stride, bd);
     return;
   }
   vpx_highbd_idct8x8_64_add(input, dest, stride, bd);
 }

 // Picks a high-bitdepth 16x16 inverse DCT-and-add variant from eob, so
 // blocks with few non-zero DCT coefficients can take a simpler calculation:
 //   eob == 1  -> vpx_highbd_idct16x16_1_add   (DC-only DCT coefficient)
 //   eob <= 10 -> vpx_highbd_idct16x16_10_add
 //   eob <= 38 -> vpx_highbd_idct16x16_38_add
 //   otherwise -> vpx_highbd_idct16x16_256_add
 void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
                               int stride, int eob, int bd) {
   if (eob == 1) {
     vpx_highbd_idct16x16_1_add(input, dest, stride, bd);
     return;
   }
   if (eob <= 10) {
     vpx_highbd_idct16x16_10_add(input, dest, stride, bd);
     return;
   }
   if (eob <= 38) {
     vpx_highbd_idct16x16_38_add(input, dest, stride, bd);
     return;
   }
   vpx_highbd_idct16x16_256_add(input, dest, stride, bd);
 }

 // Picks a high-bitdepth 32x32 inverse DCT-and-add variant from eob:
 //   eob == 1   -> vpx_highbd_idct32x32_1_add
 //   eob <= 34  -> vpx_highbd_idct32x32_34_add   (non-zero coeff only in the
 //                                                upper-left 8x8)
 //   eob <= 135 -> vpx_highbd_idct32x32_135_add
 //   otherwise  -> vpx_highbd_idct32x32_1024_add
 void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
                               int stride, int eob, int bd) {
   if (eob == 1) {
     vpx_highbd_idct32x32_1_add(input, dest, stride, bd);
     return;
   }
   if (eob <= 34) {
     vpx_highbd_idct32x32_34_add(input, dest, stride, bd);
     return;
   }
   if (eob <= 135) {
     vpx_highbd_idct32x32_135_add(input, dest, stride, bd);
     return;
   }
   vpx_highbd_idct32x32_1024_add(input, dest, stride, bd);
 }

 // iht
 // High-bitdepth 4x4 inverse transform entry point. DCT_DCT goes through
 // vp9_highbd_idct4x4_add(), which uses eob; every other tx_type goes to
 // vp9_highbd_iht4x4_16_add(), which receives tx_type and does not receive
 // eob. bd is forwarded on both paths.
 void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
                            uint16_t *dest, int stride, int eob, int bd) {
   if (tx_type != DCT_DCT) {
     vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
     return;
   }
   vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
 }

 // High-bitdepth 8x8 inverse transform entry point. DCT_DCT goes through
 // vp9_highbd_idct8x8_add(), which uses eob; every other tx_type goes to
 // vp9_highbd_iht8x8_64_add(), which receives tx_type and does not receive
 // eob. bd is forwarded on both paths.
 void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
                            uint16_t *dest, int stride, int eob, int bd) {
   if (tx_type != DCT_DCT) {
     vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
     return;
   }
   vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
 }

 // High-bitdepth 16x16 inverse transform entry point. DCT_DCT goes through
 // vp9_highbd_idct16x16_add(), which uses eob; every other tx_type goes to
 // vp9_highbd_iht16x16_256_add(), which receives tx_type and does not receive
 // eob. bd is forwarded on both paths.
 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
                              uint16_t *dest, int stride, int eob, int bd) {
   if (tx_type != DCT_DCT) {
     vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
     return;
   }
   vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
	/*
	* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/

	#include <math.h>

	#include "./vp9_rtcd.h"
	#include "./vpx_dsp_rtcd.h"
	#include "vp9/common/vp9_blockd.h"
	#include "vp9/common/vp9_idct.h"
	#include "vpx_dsp/inv_txfm.h"
	#include "vpx_ports/mem.h"

	// Applies the 4x4 inverse hybrid transform selected by tx_type to input and
	// folds the result into dest.
	//
	//   input   - coefficients, consumed as 4 consecutive rows of 4 values.
	//   dest    - pixel block updated in place; row r, column c lives at
	//             dest[r * stride + c].
	//   stride  - element distance between vertically adjacent dest pixels.
	//   tx_type - index into the transform table below. It is not range checked
	//             here, so callers must pass 0..3.
	//
	// input and dest are pointers: the body advances input and indexes dest.
	void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
	                         int tx_type) {
	  // Static so the table is not rebuilt on every call, matching the
	  // file-scope IHT_8 and IHT_16 tables.
	  static const transform_2d IHT_4[] = {
	    { idct4_c, idct4_c },   // DCT_DCT  = 0
	    { iadst4_c, idct4_c },  // ADST_DCT = 1
	    { idct4_c, iadst4_c },  // DCT_ADST = 2
	    { iadst4_c, iadst4_c }  // ADST_ADST = 3
	  };

	  int i, j;
	  tran_low_t out[4 * 4];
	  tran_low_t *outptr = out;
	  tran_low_t temp_in[4], temp_out[4];

	  // Inverse transform row vectors.
	  for (i = 0; i < 4; ++i) {
	    IHT_4[tx_type].rows(input, outptr);
	    input += 4;
	    outptr += 4;
	  }

	  // Inverse transform column vectors, then merge the scaled result into
	  // the existing dest pixel through clip_pixel_add().
	  for (i = 0; i < 4; ++i) {
	    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
	    IHT_4[tx_type].cols(temp_in, temp_out);
	    for (j = 0; j < 4; ++j) {
	      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
	                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
	    }
	  }
	}

	// 1-D transform pairs for the 8x8 inverse hybrid transform, indexed by
	// tx_type (values noted per row). vp9_iht8x8_64_add_c() invokes the selected
	// entry's .rows and .cols members. Which initializer position maps to which
	// member is fixed by the transform_2d declaration, which is not in this file.
	static const transform_2d IHT_8[] = {
	{ idct8_c, idct8_c }, // DCT_DCT = 0
	{ iadst8_c, idct8_c }, // ADST_DCT = 1
	{ idct8_c, iadst8_c }, // DCT_ADST = 2
	{ iadst8_c, iadst8_c } // ADST_ADST = 3
	};

	// Applies the 8x8 inverse hybrid transform selected by tx_type to input and
	// folds the result into dest.
	//
	//   input   - coefficients, consumed as 8 consecutive rows of 8 values.
	//   dest    - pixel block updated in place; row r, column c lives at
	//             dest[r * stride + c].
	//   stride  - element distance between vertically adjacent dest pixels.
	//   tx_type - index into IHT_8; not range checked here (valid: 0..3).
	//
	// input and dest are pointers: the body advances input and indexes dest.
	void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
	                         int tx_type) {
	  int i, j;
	  tran_low_t out[8 * 8];
	  tran_low_t *outptr = out;
	  tran_low_t temp_in[8], temp_out[8];
	  const transform_2d ht = IHT_8[tx_type];

	  // Inverse transform row vectors.
	  for (i = 0; i < 8; ++i) {
	    ht.rows(input, outptr);
	    input += 8;
	    outptr += 8;
	  }

	  // Inverse transform column vectors, then merge the scaled result into
	  // the existing dest pixel through clip_pixel_add().
	  for (i = 0; i < 8; ++i) {
	    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
	    ht.cols(temp_in, temp_out);
	    for (j = 0; j < 8; ++j) {
	      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
	                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
	    }
	  }
	}

	// 1-D transform pairs for the 16x16 inverse hybrid transform, indexed by
	// tx_type (values noted per row). vp9_iht16x16_256_add_c() invokes the
	// selected entry's .rows and .cols members. Which initializer position maps
	// to which member is fixed by the transform_2d declaration, which is not in
	// this file.
	static const transform_2d IHT_16[] = {
	{ idct16_c, idct16_c }, // DCT_DCT = 0
	{ iadst16_c, idct16_c }, // ADST_DCT = 1
	{ idct16_c, iadst16_c }, // DCT_ADST = 2
	{ iadst16_c, iadst16_c } // ADST_ADST = 3
	};

	// Applies the 16x16 inverse hybrid transform selected by tx_type to input
	// and folds the result into dest.
	//
	//   input   - coefficients, consumed as 16 consecutive rows of 16 values.
	//   dest    - pixel block updated in place; row r, column c lives at
	//             dest[r * stride + c].
	//   stride  - element distance between vertically adjacent dest pixels.
	//   tx_type - index into IHT_16; not range checked here (valid: 0..3).
	//
	// input and dest are pointers: the body advances input and indexes dest.
	void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
	                            int tx_type) {
	  int i, j;
	  tran_low_t out[16 * 16];
	  tran_low_t *outptr = out;
	  tran_low_t temp_in[16], temp_out[16];
	  const transform_2d ht = IHT_16[tx_type];

	  // Rows
	  for (i = 0; i < 16; ++i) {
	    ht.rows(input, outptr);
	    input += 16;
	    outptr += 16;
	  }

	  // Columns, then merge the scaled result into the existing dest pixel
	  // through clip_pixel_add().
	  for (i = 0; i < 16; ++i) {
	    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
	    ht.cols(temp_in, temp_out);
	    for (j = 0; j < 16; ++j) {
	      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
	                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
	    }
	  }
	}

	// idct
	// Picks the 4x4 inverse DCT-and-add variant from eob:
	//   eob > 1   -> vpx_idct4x4_16_add
	//   otherwise -> vpx_idct4x4_1_add
	// input and dest are pointers and are forwarded unchanged with stride.
	void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
	                     int eob) {
	  if (eob > 1) {
	    vpx_idct4x4_16_add(input, dest, stride);
	  } else {
	    vpx_idct4x4_1_add(input, dest, stride);
	  }
	}

	// Picks the 4x4 iwht-and-add variant from eob:
	//   eob > 1   -> vpx_iwht4x4_16_add
	//   otherwise -> vpx_iwht4x4_1_add
	// input and dest are pointers and are forwarded unchanged with stride.
	void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
	                     int eob) {
	  if (eob > 1) {
	    vpx_iwht4x4_16_add(input, dest, stride);
	  } else {
	    vpx_iwht4x4_1_add(input, dest, stride);
	  }
	}

	// Picks an 8x8 inverse DCT-and-add variant from eob, so blocks with few
	// non-zero DCT coefficients can take a simpler calculation:
	//   eob == 1  -> vpx_idct8x8_1_add   (DC-only DCT coefficient)
	//   eob <= 12 -> vpx_idct8x8_12_add
	//   otherwise -> vpx_idct8x8_64_add
	//
	// Note: if dc is 1, input[0] is the reconstructed value and needs no
	// dequantization; dc is then also counted in eob, i.e. eob >= 1.
	// input and dest are pointers and are forwarded unchanged with stride.
	void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
	                     int eob) {
	  if (eob == 1) {
	    vpx_idct8x8_1_add(input, dest, stride);
	  } else if (eob <= 12) {
	    vpx_idct8x8_12_add(input, dest, stride);
	  } else {
	    vpx_idct8x8_64_add(input, dest, stride);
	  }
	}

	// Picks a 16x16 inverse DCT-and-add variant from eob, so blocks with few
	// non-zero DCT coefficients can take a simpler calculation:
	//   eob == 1  -> vpx_idct16x16_1_add   (DC-only DCT coefficient)
	//   eob <= 10 -> vpx_idct16x16_10_add
	//   eob <= 38 -> vpx_idct16x16_38_add
	//   otherwise -> vpx_idct16x16_256_add
	// input and dest are pointers and are forwarded unchanged with stride.
	void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
	                       int eob) {
	  if (eob == 1) {
	    vpx_idct16x16_1_add(input, dest, stride);
	  } else if (eob <= 10) {
	    vpx_idct16x16_10_add(input, dest, stride);
	  } else if (eob <= 38) {
	    vpx_idct16x16_38_add(input, dest, stride);
	  } else {
	    vpx_idct16x16_256_add(input, dest, stride);
	  }
	}

	// Picks a 32x32 inverse DCT-and-add variant from eob:
	//   eob == 1   -> vpx_idct32x32_1_add
	//   eob <= 34  -> vpx_idct32x32_34_add   (non-zero coeff only in the
	//                                         upper-left 8x8)
	//   eob <= 135 -> vpx_idct32x32_135_add  (non-zero coeff only in the
	//                                         upper-left 16x16)
	//   otherwise  -> vpx_idct32x32_1024_add
	// input and dest are pointers and are forwarded unchanged with stride.
	void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
	                       int eob) {
	  if (eob == 1) {
	    vpx_idct32x32_1_add(input, dest, stride);
	  } else if (eob <= 34) {
	    vpx_idct32x32_34_add(input, dest, stride);
	  } else if (eob <= 135) {
	    vpx_idct32x32_135_add(input, dest, stride);
	  } else {
	    vpx_idct32x32_1024_add(input, dest, stride);
	  }
	}

	// iht
	// 4x4 inverse transform entry point. DCT_DCT goes through vp9_idct4x4_add(),
	// which uses eob; every other tx_type goes to vp9_iht4x4_16_add(), which
	// receives tx_type and does not receive eob.
	// input and dest are pointers and are forwarded unchanged.
	void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
	                    int stride, int eob) {
	  if (tx_type == DCT_DCT) {
	    vp9_idct4x4_add(input, dest, stride, eob);
	  } else {
	    vp9_iht4x4_16_add(input, dest, stride, tx_type);
	  }
	}

	// 8x8 inverse transform entry point. DCT_DCT goes through vp9_idct8x8_add(),
	// which uses eob; every other tx_type goes to vp9_iht8x8_64_add(), which
	// receives tx_type and does not receive eob.
	// input and dest are pointers and are forwarded unchanged.
	void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
	                    int stride, int eob) {
	  if (tx_type == DCT_DCT) {
	    vp9_idct8x8_add(input, dest, stride, eob);
	  } else {
	    vp9_iht8x8_64_add(input, dest, stride, tx_type);
	  }
	}

	// 16x16 inverse transform entry point. DCT_DCT goes through
	// vp9_idct16x16_add(), which uses eob; every other tx_type goes to
	// vp9_iht16x16_256_add(), which receives tx_type and does not receive eob.
	// input and dest are pointers and are forwarded unchanged.
	void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
	                      int stride, int eob) {
	  if (tx_type == DCT_DCT) {
	    vp9_idct16x16_add(input, dest, stride, eob);
	  } else {
	    vp9_iht16x16_256_add(input, dest, stride, tx_type);
	  }
	}

	#if CONFIG_VP9_HIGHBITDEPTH

	// High-bitdepth 4x4 inverse hybrid transform: applies the transform pair
	// selected by tx_type to input and folds the result into dest.
	//
	//   input   - coefficients, consumed as 4 consecutive rows of 4 values.
	//   dest    - 16-bit pixel block updated in place; row r, column c lives at
	//             dest[r * stride + c].
	//   stride  - element distance between vertically adjacent dest pixels.
	//   tx_type - index into the transform table below. It is not range checked
	//             here, so callers must pass 0..3.
	//   bd      - forwarded unchanged to the 1-D transforms and to
	//             highbd_clip_pixel_add().
	//
	// input and dest are pointers: the body advances input and indexes dest.
	void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
	                                int stride, int tx_type, int bd) {
	  // Static so the table is not rebuilt on every call, matching the
	  // file-scope HIGH_IHT_8 and HIGH_IHT_16 tables.
	  static const highbd_transform_2d IHT_4[] = {
	    { vpx_highbd_idct4_c, vpx_highbd_idct4_c },   // DCT_DCT  = 0
	    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c },  // ADST_DCT = 1
	    { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },  // DCT_ADST = 2
	    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }  // ADST_ADST = 3
	  };

	  int i, j;
	  tran_low_t out[4 * 4];
	  tran_low_t *outptr = out;
	  tran_low_t temp_in[4], temp_out[4];

	  // Inverse transform row vectors.
	  for (i = 0; i < 4; ++i) {
	    IHT_4[tx_type].rows(input, outptr, bd);
	    input += 4;
	    outptr += 4;
	  }

	  // Inverse transform column vectors, then merge the scaled result into
	  // the existing dest pixel through highbd_clip_pixel_add().
	  for (i = 0; i < 4; ++i) {
	    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
	    IHT_4[tx_type].cols(temp_in, temp_out, bd);
	    for (j = 0; j < 4; ++j) {
	      dest[j * stride + i] = highbd_clip_pixel_add(
	          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
	    }
	  }
	}

	// High-bitdepth 1-D transform pairs for the 8x8 inverse hybrid transform,
	// indexed by tx_type (values noted per row). vp9_highbd_iht8x8_64_add_c()
	// invokes the selected entry's .rows and .cols members. Which initializer
	// position maps to which member is fixed by the highbd_transform_2d
	// declaration, which is not in this file.
	static const highbd_transform_2d HIGH_IHT_8[] = {
	{ vpx_highbd_idct8_c, vpx_highbd_idct8_c }, // DCT_DCT = 0
	{ vpx_highbd_iadst8_c, vpx_highbd_idct8_c }, // ADST_DCT = 1
	{ vpx_highbd_idct8_c, vpx_highbd_iadst8_c }, // DCT_ADST = 2
	{ vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3
	};

	// High-bitdepth 8x8 inverse hybrid transform: applies the transform pair
	// selected by tx_type to input and folds the result into dest.
	//
	//   input   - coefficients, consumed as 8 consecutive rows of 8 values.
	//   dest    - 16-bit pixel block updated in place; row r, column c lives at
	//             dest[r * stride + c].
	//   stride  - element distance between vertically adjacent dest pixels.
	//   tx_type - index into HIGH_IHT_8; not range checked here (valid: 0..3).
	//   bd      - forwarded unchanged to the 1-D transforms and to
	//             highbd_clip_pixel_add().
	//
	// input and dest are pointers: the body advances input and indexes dest.
	void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
	                                int stride, int tx_type, int bd) {
	  int i, j;
	  tran_low_t out[8 * 8];
	  tran_low_t *outptr = out;
	  tran_low_t temp_in[8], temp_out[8];
	  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];

	  // Inverse transform row vectors.
	  for (i = 0; i < 8; ++i) {
	    ht.rows(input, outptr, bd);
	    input += 8;
	    outptr += 8;
	  }

	  // Inverse transform column vectors, then merge the scaled result into
	  // the existing dest pixel through highbd_clip_pixel_add().
	  for (i = 0; i < 8; ++i) {
	    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
	    ht.cols(temp_in, temp_out, bd);
	    for (j = 0; j < 8; ++j) {
	      dest[j * stride + i] = highbd_clip_pixel_add(
	          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
	    }
	  }
	}

	// High-bitdepth 1-D transform pairs for the 16x16 inverse hybrid transform,
	// indexed by tx_type (values noted per row). vp9_highbd_iht16x16_256_add_c()
	// invokes the selected entry's .rows and .cols members. Which initializer
	// position maps to which member is fixed by the highbd_transform_2d
	// declaration, which is not in this file.
	static const highbd_transform_2d HIGH_IHT_16[] = {
	{ vpx_highbd_idct16_c, vpx_highbd_idct16_c }, // DCT_DCT = 0
	{ vpx_highbd_iadst16_c, vpx_highbd_idct16_c }, // ADST_DCT = 1
	{ vpx_highbd_idct16_c, vpx_highbd_iadst16_c }, // DCT_ADST = 2
	{ vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3
	};

	// High-bitdepth 16x16 inverse hybrid transform: applies the transform pair
	// selected by tx_type to input and folds the result into dest.
	//
	//   input   - coefficients, consumed as 16 consecutive rows of 16 values.
	//   dest    - 16-bit pixel block updated in place; row r, column c lives at
	//             dest[r * stride + c].
	//   stride  - element distance between vertically adjacent dest pixels.
	//   tx_type - index into HIGH_IHT_16; not range checked here (valid: 0..3).
	//   bd      - forwarded unchanged to the 1-D transforms and to
	//             highbd_clip_pixel_add().
	//
	// input and dest are pointers: the body advances input and indexes dest.
	void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
	                                   int stride, int tx_type, int bd) {
	  int i, j;
	  tran_low_t out[16 * 16];
	  tran_low_t *outptr = out;
	  tran_low_t temp_in[16], temp_out[16];
	  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];

	  // Rows
	  for (i = 0; i < 16; ++i) {
	    ht.rows(input, outptr, bd);
	    input += 16;
	    outptr += 16;
	  }

	  // Columns, then merge the scaled result into the existing dest pixel
	  // through highbd_clip_pixel_add().
	  for (i = 0; i < 16; ++i) {
	    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
	    ht.cols(temp_in, temp_out, bd);
	    for (j = 0; j < 16; ++j) {
	      dest[j * stride + i] = highbd_clip_pixel_add(
	          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
	    }
	  }
	}

	// idct
	// Picks the high-bitdepth 4x4 inverse DCT-and-add variant from eob:
	//   eob > 1   -> vpx_highbd_idct4x4_16_add
	//   otherwise -> vpx_highbd_idct4x4_1_add
	// input and dest are pointers; they are forwarded unchanged with stride
	// and bd.
	void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
	                            int eob, int bd) {
	  if (eob > 1) {
	    vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
	  } else {
	    vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
	  }
	}

	// Picks the high-bitdepth 4x4 iwht-and-add variant from eob:
	//   eob > 1   -> vpx_highbd_iwht4x4_16_add
	//   otherwise -> vpx_highbd_iwht4x4_1_add
	// input and dest are pointers; they are forwarded unchanged with stride
	// and bd.
	void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
	                            int eob, int bd) {
	  if (eob > 1) {
	    vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
	  } else {
	    vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
	  }
	}

	// Picks a high-bitdepth 8x8 inverse DCT-and-add variant from eob, so blocks
	// with few non-zero DCT coefficients can take a simpler calculation:
	//   eob == 1  -> vpx_highbd_idct8x8_1_add   (DC-only DCT coefficient)
	//   eob <= 12 -> vpx_highbd_idct8x8_12_add
	//   otherwise -> vpx_highbd_idct8x8_64_add
	//
	// Note: if dc is 1, input[0] is the reconstructed value and needs no
	// dequantization; dc is then also counted in eob, i.e. eob >= 1.
	// input and dest are pointers; they are forwarded unchanged with stride
	// and bd.
	void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
	                            int eob, int bd) {
	  if (eob == 1) {
	    vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
	  } else if (eob <= 12) {
	    vpx_highbd_idct8x8_12_add(input, dest, stride, bd);
	  } else {
	    vpx_highbd_idct8x8_64_add(input, dest, stride, bd);
	  }
	}

	// Picks a high-bitdepth 16x16 inverse DCT-and-add variant from eob, so
	// blocks with few non-zero DCT coefficients can take a simpler calculation:
	//   eob == 1  -> vpx_highbd_idct16x16_1_add   (DC-only DCT coefficient)
	//   eob <= 10 -> vpx_highbd_idct16x16_10_add
	//   eob <= 38 -> vpx_highbd_idct16x16_38_add
	//   otherwise -> vpx_highbd_idct16x16_256_add
	// input and dest are pointers; they are forwarded unchanged with stride
	// and bd.
	void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
	                              int stride, int eob, int bd) {
	  if (eob == 1) {
	    vpx_highbd_idct16x16_1_add(input, dest, stride, bd);
	  } else if (eob <= 10) {
	    vpx_highbd_idct16x16_10_add(input, dest, stride, bd);
	  } else if (eob <= 38) {
	    vpx_highbd_idct16x16_38_add(input, dest, stride, bd);
	  } else {
	    vpx_highbd_idct16x16_256_add(input, dest, stride, bd);
	  }
	}

	// Picks a high-bitdepth 32x32 inverse DCT-and-add variant from eob:
	//   eob == 1   -> vpx_highbd_idct32x32_1_add
	//   eob <= 34  -> vpx_highbd_idct32x32_34_add   (non-zero coeff only in the
	//                                                upper-left 8x8)
	//   eob <= 135 -> vpx_highbd_idct32x32_135_add
	//   otherwise  -> vpx_highbd_idct32x32_1024_add
	// input and dest are pointers; they are forwarded unchanged with stride
	// and bd.
	void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
	                              int stride, int eob, int bd) {
	  if (eob == 1) {
	    vpx_highbd_idct32x32_1_add(input, dest, stride, bd);
	  } else if (eob <= 34) {
	    vpx_highbd_idct32x32_34_add(input, dest, stride, bd);
	  } else if (eob <= 135) {
	    vpx_highbd_idct32x32_135_add(input, dest, stride, bd);
	  } else {
	    vpx_highbd_idct32x32_1024_add(input, dest, stride, bd);
	  }
	}

	// iht
	// High-bitdepth 4x4 inverse transform entry point. DCT_DCT goes through
	// vp9_highbd_idct4x4_add(), which uses eob; every other tx_type goes to
	// vp9_highbd_iht4x4_16_add(), which receives tx_type and does not receive
	// eob. bd is forwarded on both paths.
	void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
	                           uint16_t *dest, int stride, int eob, int bd) {
	  if (tx_type != DCT_DCT) {
	    vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
	    return;
	  }
	  vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
	}

	// High-bitdepth 8x8 inverse transform entry point. DCT_DCT goes through
	// vp9_highbd_idct8x8_add(), which uses eob; every other tx_type goes to
	// vp9_highbd_iht8x8_64_add(), which receives tx_type and does not receive
	// eob. bd is forwarded on both paths.
	void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
	                           uint16_t *dest, int stride, int eob, int bd) {
	  if (tx_type != DCT_DCT) {
	    vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
	    return;
	  }
	  vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
	}

	// High-bitdepth 16x16 inverse transform entry point. DCT_DCT goes through
	// vp9_highbd_idct16x16_add(), which uses eob; every other tx_type goes to
	// vp9_highbd_iht16x16_256_add(), which receives tx_type and does not receive
	// eob. bd is forwarded on both paths.
	void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
	                             uint16_t *dest, int stride, int eob, int bd) {
	  if (tx_type != DCT_DCT) {
	    vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
	    return;
	  }
	  vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
	}
	#endif // CONFIG_VP9_HIGHBITDEPTH