// src/third_party/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c - cobalt - Git at Google

 /*
  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include <smmintrin.h>  // SSE4.1

 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
 #include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"

 // Stage 5 of the 16-point inverse transform, operating on 32-bit lanes.
 // Reads in[0..3], in[5], in[6] and in[8..15]; writes out[0..3], out[5],
 // out[6] and out[8..15]. out[4] and out[7] are not assigned here.
 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
                                              __m128i *const out) {
   int base;

   // Elements 0..3 and 8..11 follow one pattern: the outer and inner pairs are
   // summed into the first two slots and differenced into the last two.
   for (base = 0; base <= 8; base += 8) {
     const __m128i *const src = in + base;
     __m128i *const dst = out + base;
     dst[0] = _mm_add_epi32(src[0], src[3]);
     dst[1] = _mm_add_epi32(src[1], src[2]);
     dst[2] = _mm_sub_epi32(src[1], src[2]);
     dst[3] = _mm_sub_epi32(src[0], src[3]);
   }

   // Elements 5 and 6 come from the cospi16 butterfly helper.
   highbd_butterfly_cospi16_sse4_1(in[6], in[5], out + 6, out + 5);

   // Elements 12..15: differences first (high index minus low), then sums.
   out[12] = _mm_sub_epi32(in[15], in[12]);
   out[13] = _mm_sub_epi32(in[14], in[13]);
   out[14] = _mm_add_epi32(in[14], in[13]);
   out[15] = _mm_add_epi32(in[15], in[12]);
 }

 // Stage 6 of the 16-point inverse transform, operating on 32-bit lanes.
 // Reads in[0..15] and writes every element of out[0..15].
 static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
                                              __m128i *const out) {
   int k;

   // out[k] = in[k] + in[7 - k] for k = 0..3.
   for (k = 0; k < 4; ++k) {
     out[k] = _mm_add_epi32(in[k], in[7 - k]);
   }
   // out[4 + k] = in[3 - k] - in[4 + k] for k = 0..3.
   for (k = 0; k < 4; ++k) {
     out[4 + k] = _mm_sub_epi32(in[3 - k], in[4 + k]);
   }

   // Elements 8, 9, 14 and 15 are copied through unchanged; 10..13 go through
   // the cospi16 butterfly helper in the pairs (13, 10) and (12, 11).
   out[8] = in[8];
   out[9] = in[9];
   highbd_butterfly_cospi16_sse4_1(in[13], in[10], out + 13, out + 10);
   highbd_butterfly_cospi16_sse4_1(in[12], in[11], out + 12, out + 11);
   out[14] = in[14];
   out[15] = in[15];
 }

 // Runs stages 2 through 7 of a 16-point inverse transform over io[0..15].
 // All sixteen inputs are read. The final stage is handed `io` as its
 // destination, so the result presumably replaces the input in place
 // (highbd_idct16_4col_stage7 is defined outside this file - confirm).
 // Arithmetic done directly here is on 32-bit lanes (_mm_*_epi32).
 void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
   __m128i step1[16], step2[16];

   // stage 2
   // The odd-indexed inputs are paired up to produce step2[8..15].
   highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
                           &step2[15]);
   highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
                           &step2[14]);
   highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
                           &step2[13]);
   highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
                           &step2[12]);

   // stage 3
   // io[2], io[14], io[10] and io[6] produce step1[4..7]; step1[8..15] are
   // sums and differences of adjacent stage-2 outputs.
   highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
                           &step1[7]);
   highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
                           &step1[6]);
   step1[8] = _mm_add_epi32(step2[8], step2[9]);
   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
   step1[10] = _mm_sub_epi32(step2[11], step2[10]);
   step1[11] = _mm_add_epi32(step2[11], step2[10]);
   step1[12] = _mm_add_epi32(step2[12], step2[13]);
   step1[13] = _mm_sub_epi32(step2[12], step2[13]);
   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
   step1[15] = _mm_add_epi32(step2[15], step2[14]);

   // stage 4
   highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
   highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
                           &step2[3]);
   highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
                           &step2[9], &step2[14]);
   highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
                           &step2[13], &step2[10]);
   // Each difference must be taken before the matching step1 element is
   // overwritten with the sum. The sums are stored straight into step1[4] and
   // step1[7] because highbd_idct16_4col_stage5() writes into step1 without
   // assigning elements 4 and 7, so these are the values stage 6 reads.
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step1[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
   step1[7] = _mm_add_epi32(step1[7], step1[6]);
   // Elements 8, 11, 12 and 15 pass through stage 4 unchanged.
   step2[8] = step1[8];
   step2[11] = step1[11];
   step2[12] = step1[12];
   step2[15] = step1[15];

   // Stages 5-7 ping-pong between the two scratch arrays and finish in io.
   highbd_idct16_4col_stage5(step2, step1);
   highbd_idct16_4col_stage6(step1, step2);
   highbd_idct16_4col_stage7(step2, io);
 }

 // Reduced-input variant of vpx_highbd_idct16_4col_sse4_1(): only io[0..7]
 // are read, and every first-level butterfly takes a single input
 // (highbd_partial_butterfly_sse4_1). Presumably the caller guarantees that
 // the inputs not read here would have been zero - confirm against callers.
 // `io` is passed to stage 7 as the destination.
 static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
   __m128i step1[16], step2[16];
   __m128i temp1[2];

   // stage 2
   // io[1], io[7], io[5] and io[3] each produce one pair of step2[8..15].
   highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
                                   &step2[15]);
   highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9],
                                   &step2[14]);
   highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10],
                                   &step2[13]);
   highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
                                   &step2[12]);

   // stage 3
   highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
                                   &step1[7]);
   highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5],
                                   &step1[6]);
   step1[8] = _mm_add_epi32(step2[8], step2[9]);
   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
   step1[10] = _mm_sub_epi32(step2[11], step2[10]);
   step1[11] = _mm_add_epi32(step2[11], step2[10]);
   step1[12] = _mm_add_epi32(step2[12], step2[13]);
   step1[13] = _mm_sub_epi32(step2[12], step2[13]);
   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
   step1[15] = _mm_add_epi32(step2[15], step2[14]);

   // stage 4
   // Only io[0] feeds elements 0 and 1 here (the full transform also uses
   // io[8]), so both receive the same value: io[0] scaled by cospi_16_64.
   extend_64bit(io[0], temp1);
   step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
   step2[1] = step2[0];
   highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2],
                                   &step2[3]);
   highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
                           &step2[9], &step2[14]);
   highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
                           &step2[13], &step2[10]);
   // Differences are taken before step1[4] / step1[7] are overwritten with
   // the sums. The sums stay in step1 because stage 5 writes into step1
   // without assigning elements 4 and 7.
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step1[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
   step1[7] = _mm_add_epi32(step1[7], step1[6]);
   step2[8] = step1[8];
   step2[11] = step1[11];
   step2[12] = step1[12];
   step2[15] = step1[15];

   // Stages 5-7 ping-pong between the two scratch arrays and finish in io.
   highbd_idct16_4col_stage5(step2, step1);
   highbd_idct16_4col_stage6(step1, step2);
   highbd_idct16_4col_stage7(step2, io);
 }

 // Most reduced variant of the 16-point column transform: only io[0..3] are
 // read. Presumably the caller guarantees that all other inputs would have
 // been zero - confirm against callers. `io` is passed to stage 7 as the
 // destination.
 static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
   __m128i step1[16], step2[16];
   __m128i temp[2];

   // stage 2
   // Only step2[8], step2[15], step2[11] and step2[12] are produced.
   highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
                                   &step2[15]);
   highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
                                   &step2[12]);

   // stage 3
   highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
                                   &step1[7]);
   // These copies equal the add/sub pairs used in highbd_idct16x16_38_4col()
   // with step2[9], step2[10], step2[13] and step2[14] taken as zero.
   step1[8] = step2[8];
   step1[9] = step2[8];
   step1[10] = step2[11];
   step1[11] = step2[11];
   step1[12] = step2[12];
   step1[13] = step2[12];
   step1[14] = step2[15];
   step1[15] = step2[15];

   // stage 4
   // Elements 0 and 1 both receive io[0] scaled by cospi_16_64; elements 2
   // and 3 are set to zero.
   extend_64bit(io[0], temp);
   step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
   step2[1] = step2[0];
   step2[2] = _mm_setzero_si128();
   step2[3] = _mm_setzero_si128();
   highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
                           &step2[9], &step2[14]);
   highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
                           &step2[13], &step2[10]);
   // step1[5] and step1[6] are never computed in this variant, so elements 5
   // and 6 are plain copies. step1[4] and step1[7] are left as they are: stage
   // 5 writes into step1 without assigning elements 4 and 7.
   step2[5] = step1[4];
   step2[6] = step1[7];
   step2[8] = step1[8];
   step2[11] = step1[11];
   step2[12] = step1[12];
   step2[15] = step1[15];

   // Stages 5-7 ping-pong between the two scratch arrays and finish in io.
   highbd_idct16_4col_stage5(step2, step1);
   highbd_idct16_4col_stage6(step1, step2);
   highbd_idct16_4col_stage7(step2, io);
 }

 // 16x16 inverse transform that loads all 256 coefficients from `input`
 // (row pitch 16) and hands the result to highbd_write_buffer_*() together
 // with `dest`, `stride` and `bd`.
 //
 // Two paths are selected on `bd`:
 //   - bd == 8: two 8-row halves go through the *_pack_* / 16-bit helpers and
 //     idct16_8col() (presumably 16-bit lanes, eight columns per register -
 //     confirm in the helper headers).
 //   - otherwise: four 4-row strips go through the 32-bit 4-column transform.
 // Each path runs the 1-D transform once over the loaded data, transposes,
 // and runs it again before writing.
 void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
                                          uint16_t *dest, int stride, int bd) {
   __m128i cols[16];
   int pass, row;

   if (bd != 8) {
     __m128i strips[4][16];

     // First pass: one strip of 4 input rows (4 * 16 coefficients) at a time.
     for (pass = 0; pass < 4; ++pass) {
       const tran_low_t *const src = input + pass * (4 * 16);
       __m128i *const strip = strips[pass];
       highbd_load_transpose_32bit_8x4(src, 16, strip);
       highbd_load_transpose_32bit_8x4(src + 8, 16, strip + 8);
       vpx_highbd_idct16_4col_sse4_1(strip);
     }

     // Second pass: 4 destination columns per iteration.
     for (pass = 0; pass < 4; ++pass) {
       const int x = 4 * pass;
       uint16_t *const dst = dest + x;
       transpose_32bit_4x4(strips[0] + x, cols + 0);
       transpose_32bit_4x4(strips[1] + x, cols + 4);
       transpose_32bit_4x4(strips[2] + x, cols + 8);
       transpose_32bit_4x4(strips[3] + x, cols + 12);
       vpx_highbd_idct16_4col_sse4_1(cols);

       for (row = 0; row < 16; ++row) {
         highbd_write_buffer_4(dst + row * stride, cols[row], bd);
       }
     }
   } else {
     __m128i halves[2][16];

     // First pass: one half of 8 input rows (128 coefficients) at a time.
     for (pass = 0; pass < 2; ++pass) {
       const tran_low_t *const src = input + pass * 128;
       __m128i *const half = halves[pass];
       highbd_load_pack_transpose_32bit_8x8(src, 16, half);
       highbd_load_pack_transpose_32bit_8x8(src + 8, 16, half + 8);
       idct16_8col(half, half);
     }

     // Second pass: 8 destination columns per iteration.
     for (pass = 0; pass < 2; ++pass) {
       const int x = 8 * pass;
       uint16_t *const dst = dest + x;
       transpose_16bit_8x8(halves[0] + x, cols);
       transpose_16bit_8x8(halves[1] + x, cols + 8);
       idct16_8col(cols, cols);

       for (row = 0; row < 16; ++row) {
         highbd_write_buffer_8(dst + row * stride, cols[row], bd);
       }
     }
   }
 }

 // 16x16 inverse transform variant that loads only the top-left 8x8 block of
 // `input` (row pitch 16). Presumably intended for blocks whose non-zero
 // coefficients all lie in that region - confirm against callers.
 // Results are handed to highbd_write_buffer_*() with `dest`, `stride`, `bd`.
 void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                         int stride, int bd) {
   __m128i cols[16];
   int n, row;

   if (bd != 8) {
     __m128i strips[2][16];

     // First pass: two strips of 4 input rows; only elements 0..7 of each
     // strip are loaded, matching what highbd_idct16x16_38_4col() reads.
     for (n = 0; n < 2; ++n) {
       highbd_load_transpose_32bit_8x4(input + n * (4 * 16), 16, strips[n]);
       highbd_idct16x16_38_4col(strips[n]);
     }

     // Second pass: 4 destination columns per iteration. Only cols[0..7] are
     // filled before the transform, which reads no further than io[7].
     for (n = 0; n < 4; ++n) {
       const int x = 4 * n;
       uint16_t *const dst = dest + x;
       transpose_32bit_4x4(strips[0] + x, cols + 0);
       transpose_32bit_4x4(strips[1] + x, cols + 4);
       highbd_idct16x16_38_4col(cols);

       for (row = 0; row < 16; ++row) {
         highbd_write_buffer_4(dst + row * stride, cols[row], bd);
       }
     }
   } else {
     __m128i in[16], temp[16];

     // First pass: load the 8x8 block into in[0..7] and zero in[8..15].
     highbd_load_pack_transpose_32bit_8x8(input, 16, in);
     for (n = 8; n < 16; ++n) {
       in[n] = _mm_setzero_si128();
     }
     idct16_8col(in, temp);

     // Second pass: 8 destination columns per iteration. The transpose only
     // rewrites in[0..7]; in[8..15] are not reassigned in this function after
     // the zeroing above.
     for (n = 0; n < 2; ++n) {
       const int x = 8 * n;
       uint16_t *const dst = dest + x;
       transpose_16bit_8x8(temp + x, in);
       idct16_8col(in, cols);

       for (row = 0; row < 16; ++row) {
         highbd_write_buffer_8(dst + row * stride, cols[row], bd);
       }
     }
   }
 }

 // 16x16 inverse transform variant for very sparse blocks: the bd == 8 path
 // loads the first four rows of `input` (row pitch 16), the other path loads
 // 4x4 tiles. Presumably intended for blocks whose non-zero coefficients all
 // lie in the top-left 4x4 region - confirm against callers.
 // Results are handed to highbd_write_buffer_*() with `dest`, `stride`, `bd`.
 void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                         int stride, int bd) {
   __m128i cols[16];
   int n, row;

   if (bd != 8) {
     __m128i strips[2][16];

     // First pass over two strips of 4 input rows.
     // NOTE(review): strips[1] is computed here, but the second loop below
     // only ever reads strips[0]; the second iteration looks redundant.
     for (n = 0; n < 2; ++n) {
       highbd_load_transpose_32bit_4x4(input + n * (4 * 16), 16, strips[n]);
       highbd_idct16x16_10_4col(strips[n]);
     }

     // Second pass: 4 destination columns per iteration. Only cols[0..3] are
     // filled before the transform, which reads no further than io[3].
     for (n = 0; n < 4; ++n) {
       const int x = 4 * n;
       uint16_t *const dst = dest + x;
       transpose_32bit_4x4(strips[0] + x, cols);
       highbd_idct16x16_10_4col(cols);

       for (row = 0; row < 16; ++row) {
         highbd_write_buffer_4(dst + row * stride, cols[row], bd);
       }
     }
   } else {
     __m128i in[16], l[16];

     // Load the first four input rows, one register per row.
     for (n = 0; n < 4; ++n) {
       in[n] = load_pack_8_32bit(input + n * 16);
     }

     idct16x16_10_pass1(in, l);

     // Second pass: 8 destination columns per iteration; `in` is reused as
     // the output buffer of pass 2.
     for (n = 0; n < 2; ++n) {
       const int x = 8 * n;
       uint16_t *const dst = dest + x;
       idct16x16_10_pass2(l + x, in);

       for (row = 0; row < 16; ++row) {
         highbd_write_buffer_8(dst + row * stride, in[row], bd);
       }
     }
   }
 }
	/*
	* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/

	#include <smmintrin.h> // SSE4.1

	#include "./vpx_dsp_rtcd.h"
	#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
	#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
	#include "vpx_dsp/x86/inv_txfm_sse2.h"
	#include "vpx_dsp/x86/transpose_sse2.h"
	#include "vpx_dsp/x86/txfm_common_sse2.h"

	// Stage 5 of the 16-point inverse transform, operating on 32-bit lanes.
	// Reads in[0..3], in[5], in[6] and in[8..15]; writes out[0..3], out[5],
	// out[6] and out[8..15]. out[4] and out[7] are not assigned here.
	// NOTE(review): a definition with the same name appears earlier in this
	// file; one of the two copies should be dropped.
	static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
	                                             __m128i *const out) {
	  int base;

	  // Elements 0..3 and 8..11 follow one pattern: the outer and inner pairs
	  // are summed into the first two slots and differenced into the last two.
	  for (base = 0; base <= 8; base += 8) {
	    const __m128i *const src = in + base;
	    __m128i *const dst = out + base;
	    dst[0] = _mm_add_epi32(src[0], src[3]);
	    dst[1] = _mm_add_epi32(src[1], src[2]);
	    dst[2] = _mm_sub_epi32(src[1], src[2]);
	    dst[3] = _mm_sub_epi32(src[0], src[3]);
	  }

	  // Elements 5 and 6 come from the cospi16 butterfly helper.
	  highbd_butterfly_cospi16_sse4_1(in[6], in[5], out + 6, out + 5);

	  // Elements 12..15: differences first (high index minus low), then sums.
	  out[12] = _mm_sub_epi32(in[15], in[12]);
	  out[13] = _mm_sub_epi32(in[14], in[13]);
	  out[14] = _mm_add_epi32(in[14], in[13]);
	  out[15] = _mm_add_epi32(in[15], in[12]);
	}

	// Stage 6 of the 16-point inverse transform, operating on 32-bit lanes.
	// Reads in[0..15] and writes every element of out[0..15].
	// NOTE(review): a definition with the same name appears earlier in this
	// file; one of the two copies should be dropped.
	static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
	                                             __m128i *const out) {
	  int k;

	  // out[k] = in[k] + in[7 - k] for k = 0..3.
	  for (k = 0; k < 4; ++k) {
	    out[k] = _mm_add_epi32(in[k], in[7 - k]);
	  }
	  // out[4 + k] = in[3 - k] - in[4 + k] for k = 0..3.
	  for (k = 0; k < 4; ++k) {
	    out[4 + k] = _mm_sub_epi32(in[3 - k], in[4 + k]);
	  }

	  // Elements 8, 9, 14 and 15 are copied through unchanged; 10..13 go
	  // through the cospi16 butterfly helper in the pairs (13, 10), (12, 11).
	  out[8] = in[8];
	  out[9] = in[9];
	  highbd_butterfly_cospi16_sse4_1(in[13], in[10], out + 13, out + 10);
	  highbd_butterfly_cospi16_sse4_1(in[12], in[11], out + 12, out + 11);
	  out[14] = in[14];
	  out[15] = in[15];
	}

	void vpx_highbd_idct16_4col_sse4_1(__m128i const io /io[16]*/) {
	__m128i step1[16], step2[16];

	// stage 2
	highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
	&step2[15]);
	highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
	&step2[14]);
	highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
	&step2[13]);
	highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
	&step2[12]);

	// stage 3
	highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
	&step1[7]);
	highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
	&step1[6]);
	step1[8] = _mm_add_epi32(step2[8], step2[9]);
	step1[9] = _mm_sub_epi32(step2[8], step2[9]);
	step1[10] = _mm_sub_epi32(step2[11], step2[10]);
	step1[11] = _mm_add_epi32(step2[11], step2[10]);
	step1[12] = _mm_add_epi32(step2[12], step2[13]);
	step1[13] = _mm_sub_epi32(step2[12], step2[13]);
	step1[14] = _mm_sub_epi32(step2[15], step2[14]);
	step1[15] = _mm_add_epi32(step2[15], step2[14]);

	// stage 4
	highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
	highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
	&step2[3]);
	highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
	&step2[9], &step2[14]);
	highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
	&step2[13], &step2[10]);
	step2[5] = _mm_sub_epi32(step1[4], step1[5]);
	step1[4] = _mm_add_epi32(step1[4], step1[5]);
	step2[6] = _mm_sub_epi32(step1[7], step1[6]);
	step1[7] = _mm_add_epi32(step1[7], step1[6]);
	step2[8] = step1[8];
	step2[11] = step1[11];
	step2[12] = step1[12];
	step2[15] = step1[15];

	highbd_idct16_4col_stage5(step2, step1);
	highbd_idct16_4col_stage6(step1, step2);
	highbd_idct16_4col_stage7(step2, io);
	}

	static INLINE void highbd_idct16x16_38_4col(__m128i const io /io[16]*/) {
	__m128i step1[16], step2[16];
	__m128i temp1[2];

	// stage 2
	highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
	&step2[15]);
	highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9],
	&step2[14]);
	highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10],
	&step2[13]);
	highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
	&step2[12]);

	// stage 3
	highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
	&step1[7]);
	highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5],
	&step1[6]);
	step1[8] = _mm_add_epi32(step2[8], step2[9]);
	step1[9] = _mm_sub_epi32(step2[8], step2[9]);
	step1[10] = _mm_sub_epi32(step2[11], step2[10]);
	step1[11] = _mm_add_epi32(step2[11], step2[10]);
	step1[12] = _mm_add_epi32(step2[12], step2[13]);
	step1[13] = _mm_sub_epi32(step2[12], step2[13]);
	step1[14] = _mm_sub_epi32(step2[15], step2[14]);
	step1[15] = _mm_add_epi32(step2[15], step2[14]);

	// stage 4
	extend_64bit(io[0], temp1);
	step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
	step2[1] = step2[0];
	highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2],
	&step2[3]);
	highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
	&step2[9], &step2[14]);
	highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
	&step2[13], &step2[10]);
	step2[5] = _mm_sub_epi32(step1[4], step1[5]);
	step1[4] = _mm_add_epi32(step1[4], step1[5]);
	step2[6] = _mm_sub_epi32(step1[7], step1[6]);
	step1[7] = _mm_add_epi32(step1[7], step1[6]);
	step2[8] = step1[8];
	step2[11] = step1[11];
	step2[12] = step1[12];
	step2[15] = step1[15];

	highbd_idct16_4col_stage5(step2, step1);
	highbd_idct16_4col_stage6(step1, step2);
	highbd_idct16_4col_stage7(step2, io);
	}

	static INLINE void highbd_idct16x16_10_4col(__m128i const io /io[16]*/) {
	__m128i step1[16], step2[16];
	__m128i temp[2];

	// stage 2
	highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
	&step2[15]);
	highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
	&step2[12]);

	// stage 3
	highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
	&step1[7]);
	step1[8] = step2[8];
	step1[9] = step2[8];
	step1[10] = step2[11];
	step1[11] = step2[11];
	step1[12] = step2[12];
	step1[13] = step2[12];
	step1[14] = step2[15];
	step1[15] = step2[15];

	// stage 4
	extend_64bit(io[0], temp);
	step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
	step2[1] = step2[0];
	step2[2] = _mm_setzero_si128();
	step2[3] = _mm_setzero_si128();
	highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
	&step2[9], &step2[14]);
	highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
	&step2[13], &step2[10]);
	step2[5] = step1[4];
	step2[6] = step1[7];
	step2[8] = step1[8];
	step2[11] = step1[11];
	step2[12] = step1[12];
	step2[15] = step1[15];

	highbd_idct16_4col_stage5(step2, step1);
	highbd_idct16_4col_stage6(step1, step2);
	highbd_idct16_4col_stage7(step2, io);
	}

	// 16x16 inverse transform that loads all 256 coefficients from `input`
	// (row pitch 16) and hands the result to highbd_write_buffer_*() together
	// with `dest`, `stride` and `bd`. bd == 8 takes the *_pack_* / 16-bit
	// helper path; any other value takes the 32-bit 4-column path.
	// NOTE(review): a definition with the same name appears earlier in this
	// file; one of the two copies should be dropped.
	void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
	                                         uint16_t *dest, int stride, int bd) {
	  __m128i cols[16];
	  int pass, row;

	  if (bd != 8) {
	    __m128i strips[4][16];

	    // First pass: one strip of 4 input rows (4 * 16 coefficients) at a time.
	    for (pass = 0; pass < 4; ++pass) {
	      const tran_low_t *const src = input + pass * (4 * 16);
	      __m128i *const strip = strips[pass];
	      highbd_load_transpose_32bit_8x4(src, 16, strip);
	      highbd_load_transpose_32bit_8x4(src + 8, 16, strip + 8);
	      vpx_highbd_idct16_4col_sse4_1(strip);
	    }

	    // Second pass: 4 destination columns per iteration.
	    for (pass = 0; pass < 4; ++pass) {
	      const int x = 4 * pass;
	      uint16_t *const dst = dest + x;
	      transpose_32bit_4x4(strips[0] + x, cols + 0);
	      transpose_32bit_4x4(strips[1] + x, cols + 4);
	      transpose_32bit_4x4(strips[2] + x, cols + 8);
	      transpose_32bit_4x4(strips[3] + x, cols + 12);
	      vpx_highbd_idct16_4col_sse4_1(cols);

	      for (row = 0; row < 16; ++row) {
	        highbd_write_buffer_4(dst + row * stride, cols[row], bd);
	      }
	    }
	  } else {
	    __m128i halves[2][16];

	    // First pass: one half of 8 input rows (128 coefficients) at a time.
	    for (pass = 0; pass < 2; ++pass) {
	      const tran_low_t *const src = input + pass * 128;
	      __m128i *const half = halves[pass];
	      highbd_load_pack_transpose_32bit_8x8(src, 16, half);
	      highbd_load_pack_transpose_32bit_8x8(src + 8, 16, half + 8);
	      idct16_8col(half, half);
	    }

	    // Second pass: 8 destination columns per iteration.
	    for (pass = 0; pass < 2; ++pass) {
	      const int x = 8 * pass;
	      uint16_t *const dst = dest + x;
	      transpose_16bit_8x8(halves[0] + x, cols);
	      transpose_16bit_8x8(halves[1] + x, cols + 8);
	      idct16_8col(cols, cols);

	      for (row = 0; row < 16; ++row) {
	        highbd_write_buffer_8(dst + row * stride, cols[row], bd);
	      }
	    }
	  }
	}

	// 16x16 inverse transform variant that loads only the top-left 8x8 block
	// of `input` (row pitch 16). Presumably intended for blocks whose non-zero
	// coefficients all lie in that region - confirm against callers.
	// Results are handed to highbd_write_buffer_*() with `dest`, `stride`, `bd`.
	// `input` and `dest` are pointers: the body indexes and offsets both.
	// NOTE(review): a definition with the same name appears earlier in this
	// file; one of the two copies should be dropped.
	void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
	                                        int stride, int bd) {
	  __m128i cols[16];
	  int n, row;

	  if (bd != 8) {
	    __m128i strips[2][16];

	    // First pass: two strips of 4 input rows; only elements 0..7 of each
	    // strip are loaded, matching what highbd_idct16x16_38_4col() reads.
	    for (n = 0; n < 2; ++n) {
	      highbd_load_transpose_32bit_8x4(input + n * (4 * 16), 16, strips[n]);
	      highbd_idct16x16_38_4col(strips[n]);
	    }

	    // Second pass: 4 destination columns per iteration. Only cols[0..7]
	    // are filled before the transform, which reads no further than io[7].
	    for (n = 0; n < 4; ++n) {
	      const int x = 4 * n;
	      uint16_t *const dst = dest + x;
	      transpose_32bit_4x4(strips[0] + x, cols + 0);
	      transpose_32bit_4x4(strips[1] + x, cols + 4);
	      highbd_idct16x16_38_4col(cols);

	      for (row = 0; row < 16; ++row) {
	        highbd_write_buffer_4(dst + row * stride, cols[row], bd);
	      }
	    }
	  } else {
	    __m128i in[16], temp[16];

	    // First pass: load the 8x8 block into in[0..7] and zero in[8..15].
	    highbd_load_pack_transpose_32bit_8x8(input, 16, in);
	    for (n = 8; n < 16; ++n) {
	      in[n] = _mm_setzero_si128();
	    }
	    idct16_8col(in, temp);

	    // Second pass: 8 destination columns per iteration. The transpose only
	    // rewrites in[0..7]; in[8..15] are not reassigned after the zeroing.
	    for (n = 0; n < 2; ++n) {
	      const int x = 8 * n;
	      uint16_t *const dst = dest + x;
	      transpose_16bit_8x8(temp + x, in);
	      idct16_8col(in, cols);

	      for (row = 0; row < 16; ++row) {
	        highbd_write_buffer_8(dst + row * stride, cols[row], bd);
	      }
	    }
	  }
	}

	// 16x16 inverse transform variant for very sparse blocks: the bd == 8 path
	// loads the first four rows of `input` (row pitch 16), the other path
	// loads 4x4 tiles. Presumably intended for blocks whose non-zero
	// coefficients all lie in the top-left 4x4 region - confirm against callers.
	// Results are handed to highbd_write_buffer_*() with `dest`, `stride`, `bd`.
	// `input` and `dest` are pointers: the body offsets both.
	// NOTE(review): a definition with the same name appears earlier in this
	// file; one of the two copies should be dropped.
	void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
	                                        int stride, int bd) {
	  __m128i cols[16];
	  int n, row;

	  if (bd != 8) {
	    __m128i strips[2][16];

	    // First pass over two strips of 4 input rows.
	    // NOTE(review): strips[1] is computed here, but the second loop below
	    // only ever reads strips[0]; the second iteration looks redundant.
	    for (n = 0; n < 2; ++n) {
	      highbd_load_transpose_32bit_4x4(input + n * (4 * 16), 16, strips[n]);
	      highbd_idct16x16_10_4col(strips[n]);
	    }

	    // Second pass: 4 destination columns per iteration. Only cols[0..3]
	    // are filled before the transform, which reads no further than io[3].
	    for (n = 0; n < 4; ++n) {
	      const int x = 4 * n;
	      uint16_t *const dst = dest + x;
	      transpose_32bit_4x4(strips[0] + x, cols);
	      highbd_idct16x16_10_4col(cols);

	      for (row = 0; row < 16; ++row) {
	        highbd_write_buffer_4(dst + row * stride, cols[row], bd);
	      }
	    }
	  } else {
	    __m128i in[16], l[16];

	    // Load the first four input rows, one register per row.
	    for (n = 0; n < 4; ++n) {
	      in[n] = load_pack_8_32bit(input + n * 16);
	    }

	    idct16x16_10_pass1(in, l);

	    // Second pass: 8 destination columns per iteration; `in` is reused as
	    // the output buffer of pass 2.
	    for (n = 0; n < 2; ++n) {
	      const int x = 8 * n;
	      uint16_t *const dst = dest + x;
	      idct16x16_10_pass2(l + x, in);

	      for (row = 0; row < 16; ++row) {
	        highbd_write_buffer_8(dst + row * stride, in[row], bd);
	      }
	    }
	  }
	}