|  | ; Copyright © 2018-2021, VideoLAN and dav1d authors | 
|  | ; Copyright © 2018, Two Orioles, LLC | 
|  | ; All rights reserved. | 
|  | ; | 
|  | ; Redistribution and use in source and binary forms, with or without | 
|  | ; modification, are permitted provided that the following conditions are met: | 
|  | ; | 
|  | ; 1. Redistributions of source code must retain the above copyright notice, this | 
|  | ;    list of conditions and the following disclaimer. | 
|  | ; | 
|  | ; 2. Redistributions in binary form must reproduce the above copyright notice, | 
|  | ;    this list of conditions and the following disclaimer in the documentation | 
|  | ;    and/or other materials provided with the distribution. | 
|  | ; | 
|  | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | 
|  | ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | 
|  | ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | 
|  | ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR | 
|  | ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | 
|  | ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | 
|  | ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | 
|  | ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
|  | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 
|  | ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
|  |  | 
|  | %include "config.asm" | 
|  | %include "ext/x86/x86inc.asm" | 
|  |  | 
|  |  | 
|  | SECTION_RODATA 16 | 
|  |  | 
|  | deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15 | 
|  |  | 
|  | deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 | 
|  | deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7 | 
|  |  | 
|  | %macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1 | 
|  | pw_%1_m%2:  times 4 dw  %1, -%2 | 
|  | %if %3 != 2 | 
|  | pw_%2_%1:   times 4 dw  %2,  %1 | 
|  | %endif | 
|  | %if %3 | 
|  | pw_m%1_m%2: times 4 dw -%1, -%2 | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | ;adst4 | 
|  | pw_1321_3803:   times 4 dw  1321,  3803 | 
|  | pw_2482_m1321:  times 4 dw  2482, -1321 | 
|  | pw_3344_2482:   times 4 dw  3344,  2482 | 
|  | pw_3344_m3803:  times 4 dw  3344, -3803 | 
|  | pw_3344_m3344:  times 4 dw  3344, -3344 | 
|  | pw_0_3344       times 4 dw     0,  3344 | 
|  | pw_m6688_m3803: times 4 dw -6688, -3803 | 
|  |  | 
|  | COEF_PAIR 2896, 2896 | 
|  | COEF_PAIR 1567, 3784 | 
|  | COEF_PAIR  799, 4017 | 
|  | COEF_PAIR 3406, 2276 | 
|  | COEF_PAIR  401, 4076 | 
|  | COEF_PAIR 1931, 3612 | 
|  | COEF_PAIR 3166, 2598 | 
|  | COEF_PAIR 3920, 1189 | 
|  | COEF_PAIR 3784, 1567, 1 | 
|  | COEF_PAIR  995, 3973 | 
|  | COEF_PAIR 1751, 3703 | 
|  | COEF_PAIR 3513, 2106 | 
|  | COEF_PAIR 3857, 1380 | 
|  | COEF_PAIR 4017,  799, 1 | 
|  | COEF_PAIR  201, 4091 | 
|  | COEF_PAIR 2440, 3290 | 
|  | COEF_PAIR 3035, 2751 | 
|  | COEF_PAIR 4052,  601 | 
|  | COEF_PAIR 2276, 3406, 1 | 
|  | COEF_PAIR 4076,  401, 2 | 
|  | COEF_PAIR 2598, 3166, 2 | 
|  | COEF_PAIR 3612, 1931, 2 | 
|  | COEF_PAIR 1189, 3920, 2 | 
|  |  | 
|  | pd_2048:        times 4 dd  2048 | 
|  | pw_2048:        times 8 dw  2048 | 
|  | pw_m2048:       times 8 dw -2048 | 
|  | pw_4096:        times 8 dw  4096 | 
|  | pw_16384:       times 8 dw  16384 | 
|  | pw_m16384:      times 8 dw  -16384 | 
|  | pw_1697x16:     times 8 dw  1697*16 | 
|  | pw_1697x8:      times 8 dw  1697*8 | 
|  | pw_2896x8:      times 8 dw  2896*8 | 
|  | pw_3344x8:      times 8 dw  3344*8 | 
|  | pw_8192:        times 8 dw  8192 | 
|  | pw_m8192:       times 8 dw -8192 | 
|  | pw_5:           times 8 dw  5 | 
|  | pw_201x8:       times 8 dw   201*8 | 
|  | pw_4091x8:      times 8 dw  4091*8 | 
|  | pw_m2751x8:     times 8 dw -2751*8 | 
|  | pw_3035x8:      times 8 dw  3035*8 | 
|  | pw_1751x8:      times 8 dw  1751*8 | 
|  | pw_3703x8:      times 8 dw  3703*8 | 
|  | pw_m1380x8:     times 8 dw -1380*8 | 
|  | pw_3857x8:      times 8 dw  3857*8 | 
|  | pw_995x8:       times 8 dw   995*8 | 
|  | pw_3973x8:      times 8 dw  3973*8 | 
|  | pw_m2106x8:     times 8 dw -2106*8 | 
|  | pw_3513x8:      times 8 dw  3513*8 | 
|  | pw_2440x8:      times 8 dw  2440*8 | 
|  | pw_3290x8:      times 8 dw  3290*8 | 
|  | pw_m601x8:      times 8 dw  -601*8 | 
|  | pw_4052x8:      times 8 dw  4052*8 | 
|  |  | 
|  | pw_4095x8:      times 8 dw  4095*8 | 
|  | pw_101x8:       times 8 dw   101*8 | 
|  | pw_2967x8:      times 8 dw  2967*8 | 
|  | pw_m2824x8:     times 8 dw -2824*8 | 
|  | pw_3745x8:      times 8 dw  3745*8 | 
|  | pw_1660x8:      times 8 dw  1660*8 | 
|  | pw_3822x8:      times 8 dw  3822*8 | 
|  | pw_m1474x8:     times 8 dw -1474*8 | 
|  | pw_3996x8:      times 8 dw  3996*8 | 
|  | pw_897x8:       times 8 dw   897*8 | 
|  | pw_3461x8:      times 8 dw  3461*8 | 
|  | pw_m2191x8:     times 8 dw -2191*8 | 
|  | pw_3349x8:      times 8 dw  3349*8 | 
|  | pw_2359x8:      times 8 dw  2359*8 | 
|  | pw_4036x8:      times 8 dw  4036*8 | 
|  | pw_m700x8:      times 8 dw  -700*8 | 
|  | pw_4065x8:      times 8 dw  4065*8 | 
|  | pw_501x8:       times 8 dw   501*8 | 
|  | pw_3229x8:      times 8 dw  3229*8 | 
|  | pw_m2520x8:     times 8 dw -2520*8 | 
|  | pw_3564x8:      times 8 dw  3564*8 | 
|  | pw_2019x8:      times 8 dw  2019*8 | 
|  | pw_3948x8:      times 8 dw  3948*8 | 
|  | pw_m1092x8:     times 8 dw -1092*8 | 
|  | pw_3889x8:      times 8 dw  3889*8 | 
|  | pw_1285x8:      times 8 dw  1285*8 | 
|  | pw_3659x8:      times 8 dw  3659*8 | 
|  | pw_m1842x8:     times 8 dw -1842*8 | 
|  | pw_3102x8:      times 8 dw  3102*8 | 
|  | pw_2675x8:      times 8 dw  2675*8 | 
|  | pw_4085x8:      times 8 dw  4085*8 | 
|  | pw_m301x8:      times 8 dw  -301*8 | 
|  |  | 
|  | SECTION .text | 
|  |  | 
|  | %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) | 
|  |  | 
|  | %if ARCH_X86_64 | 
|  | %define o(x) x | 
|  | %else | 
|  | %define o(x) r5-$$+x ; PIC | 
|  | %endif | 
|  |  | 
|  | %macro WRITE_4X4 9  ;src[1-2], tmp[1-3], row[1-4] | 
|  | lea                  r2, [dstq+strideq*2] | 
|  | %assign %%i 1 | 
|  | %rotate 5 | 
|  | %rep 4 | 
|  | %if %1 & 2 | 
|  | CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1) | 
|  | %else | 
|  | CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) | 
|  | %endif | 
|  | %assign %%i %%i + 1 | 
|  | %rotate 1 | 
|  | %endrep | 
|  |  | 
|  | movd                 m%3, [%%row_adr1]        ;dst0 | 
|  | movd                 m%5, [%%row_adr2]        ;dst1 | 
|  | punpckldq            m%3, m%5                 ;high: dst1 :low: dst0 | 
|  | movd                 m%4, [%%row_adr3]        ;dst2 | 
|  | movd                 m%5, [%%row_adr4]        ;dst3 | 
|  | punpckldq            m%4, m%5                 ;high: dst3 :low: dst2 | 
|  |  | 
|  | pxor                 m%5, m%5 | 
|  | punpcklbw            m%3, m%5                 ;extend byte to word | 
|  | punpcklbw            m%4, m%5                 ;extend byte to word | 
|  |  | 
|  | paddw                m%3, m%1                 ;high: dst1 + out1 ;low: dst0 + out0 | 
|  | paddw                m%4, m%2                 ;high: dst3 + out3 ;low: dst2 + out2 | 
|  |  | 
|  | packuswb             m%3, m%4                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 | 
|  |  | 
|  | movd        [%%row_adr1], m%3                  ;store dst0 + out0 | 
|  | pshuflw              m%4, m%3, q1032 | 
|  | movd        [%%row_adr2], m%4                  ;store dst1 + out1 | 
|  | punpckhqdq           m%3, m%3 | 
|  | movd        [%%row_adr3], m%3                  ;store dst2 + out2 | 
|  | psrlq                m%3, 32 | 
|  | movd        [%%row_adr4], m%3                  ;store dst3 + out3 | 
|  | %endmacro | 
|  |  | 
|  | %macro ITX4_END 4-5 2048 ; row[1-4], rnd | 
|  | %if %5 | 
|  | mova                 m2, [o(pw_%5)] | 
|  | pmulhrsw             m0, m2 | 
|  | pmulhrsw             m1, m2 | 
|  | %endif | 
|  |  | 
|  | WRITE_4X4            0, 1, 2, 3, 4, %1, %2, %3, %4 | 
|  | ret | 
|  | %endmacro | 
|  |  | 
|  | ; flags: 1 = swap, 2: coef_regs, 4: no_pack | 
|  | %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags | 
|  | %if %6 & 2 | 
|  | pmaddwd              m%2, m%4, m%1 | 
|  | pmaddwd              m%1, m%5 | 
|  | %elif %6 & 1 | 
|  | pmaddwd              m%2, m%1, [o(pw_%5_%4)] | 
|  | pmaddwd              m%1, [o(pw_%4_m%5)] | 
|  | %else | 
|  | pmaddwd              m%2, m%1, [o(pw_%4_m%5)] | 
|  | pmaddwd              m%1, [o(pw_%5_%4)] | 
|  | %endif | 
|  | paddd                m%2, m%3 | 
|  | paddd                m%1, m%3 | 
|  | psrad                m%2, 12 | 
|  | psrad                m%1, 12 | 
|  | %if %6 & 4 == 0 | 
|  | packssdw             m%1, m%2 | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | %macro IDCT4_1D_PACKED 0-1   ;pw_2896x8 | 
|  | mova                 m3, [o(pd_2048)] | 
|  | punpckhwd            m2, m0, m1            ;unpacked in1 in3 | 
|  | punpcklwd            m0, m1                ;unpacked in0 in2 | 
|  | ITX_MUL2X_PACK        2, 1, 3, 1567, 3784 | 
|  | ITX_MUL2X_PACK        0, 1, 3, 2896, 2896 | 
|  | psubsw               m1, m0, m2            ;high: out2 ;low: out3 | 
|  | paddsw               m0, m2                ;high: out1 ;low: out0 | 
|  | %endmacro | 
|  |  | 
|  | %macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack | 
|  | cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2 | 
|  | %define %%p1 m(i%1_%3_internal_8bpc) | 
|  | %if ARCH_X86_32 | 
|  | LEA                    r5, $$ | 
|  | %endif | 
|  | %if has_epilogue | 
|  | %ifidn %1_%2, dct_dct | 
|  | test                 eobd, eobd | 
|  | jz %%end | 
|  | %endif | 
|  | lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] | 
|  | call %%p1 | 
|  | RET | 
|  | %%end: | 
|  | %else | 
|  | lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] | 
|  | %ifidn %1_%2, dct_dct | 
|  | test                 eobd, eobd | 
|  | jnz %%p1 | 
|  | %else | 
|  | times ((%%end - %%p1) >> 31) & 1 jmp %%p1 | 
|  | ALIGN function_align | 
|  | %%end: | 
|  | %endif | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | %macro INV_TXFM_4X4_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 4x4, 6 | 
|  | %ifidn %1_%2, dct_dct | 
|  | pshuflw              m0, [coeffq], q0000 | 
|  | punpcklqdq           m0, m0 | 
|  | mova                 m1, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m1 | 
|  | mov            [coeffq], eobd                ;0 | 
|  | pmulhrsw             m0, m1 | 
|  | mova                 m1, m0 | 
|  | TAIL_CALL m(iadst_4x4_internal_8bpc).end2 | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | INIT_XMM ssse3 | 
|  | ; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16. | 
|  |  | 
|  | INV_TXFM_4X4_FN dct, dct | 
|  | INV_TXFM_4X4_FN dct, adst | 
|  | INV_TXFM_4X4_FN dct, flipadst | 
|  | INV_TXFM_4X4_FN dct, identity | 
|  |  | 
|  | cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0 | 
|  | mova                 m1, [coeffq+16*1]      ;high: in3 ;low in2 | 
|  |  | 
|  | IDCT4_1D_PACKED | 
|  |  | 
|  | mova                 m2, [o(deint_shuf)] | 
|  | shufps               m3, m0, m1, q1331 | 
|  | shufps               m0, m1, q0220 | 
|  | pshufb               m0, m2                 ;high: in1 ;low: in0 | 
|  | pshufb               m1, m3, m2             ;high: in3 ;low :in2 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | IDCT4_1D_PACKED | 
|  |  | 
|  | pxor                 m2, m2 | 
|  | mova      [coeffq+16*0], m2 | 
|  | mova      [coeffq+16*1], m2                 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); | 
|  |  | 
|  | ITX4_END     0, 1, 3, 2 | 
|  |  | 
|  | INV_TXFM_4X4_FN adst, dct | 
|  | INV_TXFM_4X4_FN adst, adst | 
|  | INV_TXFM_4X4_FN adst, flipadst | 
|  | INV_TXFM_4X4_FN adst, identity | 
|  |  | 
|  | cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m0, [coeffq+16*0] | 
|  | mova                 m1, [coeffq+16*1] | 
|  | call .main | 
|  | punpckhwd            m2, m0, m1 | 
|  | punpcklwd            m0, m1 | 
|  | punpckhwd            m1, m0, m2       ;high: in3 ;low :in2 | 
|  | punpcklwd            m0, m2           ;high: in1 ;low: in0 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | call .main | 
|  |  | 
|  | .end: | 
|  | pxor                 m2, m2 | 
|  | mova      [coeffq+16*0], m2 | 
|  | mova      [coeffq+16*1], m2 | 
|  |  | 
|  | .end2: | 
|  | ITX4_END              0, 1, 2, 3 | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | punpcklwd            m2, m0, m1                ;unpacked in0 in2 | 
|  | punpckhwd            m0, m1                    ;unpacked in1 in3 | 
|  | mova                 m3, m0 | 
|  | pmaddwd              m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 | 
|  | pmaddwd              m0, [o(pw_0_3344)]        ;3344 * in3 | 
|  | paddd                m1, m0                    ;t2 | 
|  | pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 | 
|  | pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2 | 
|  | pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 | 
|  | pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 | 
|  | paddd                m4, m0                    ;t0 + t3 | 
|  | pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3 | 
|  | mova                 m0, [o(pd_2048)] | 
|  | paddd                m1, m0                    ;t2 + 2048 | 
|  | paddd                m2, m0 | 
|  | paddd                m0, m4                    ;t0 + t3 + 2048 | 
|  | paddd                m5, m2                    ;t1 + t3 + 2048 | 
|  | paddd                m2, m4 | 
|  | paddd                m2, m3                    ;t0 + t1 - t3 + 2048 | 
|  | REPX      {psrad x, 12}, m1, m0, m5, m2 | 
|  | packssdw             m0, m5                    ;high: out1 ;low: out0 | 
|  | packssdw             m1, m2                    ;high: out3 ;low: out3 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_4X4_FN flipadst, dct | 
|  | INV_TXFM_4X4_FN flipadst, adst | 
|  | INV_TXFM_4X4_FN flipadst, flipadst | 
|  | INV_TXFM_4X4_FN flipadst, identity | 
|  |  | 
|  | cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m0, [coeffq+16*0] | 
|  | mova                 m1, [coeffq+16*1] | 
|  | call m(iadst_4x4_internal_8bpc).main | 
|  | punpcklwd            m2, m1, m0 | 
|  | punpckhwd            m1, m0 | 
|  | punpcklwd            m0, m1, m2            ;high: in3 ;low :in2 | 
|  | punpckhwd            m1, m2                ;high: in1 ;low: in0 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | call m(iadst_4x4_internal_8bpc).main | 
|  |  | 
|  | .end: | 
|  | pxor                 m2, m2 | 
|  | mova      [coeffq+16*0], m2 | 
|  | mova      [coeffq+16*1], m2 | 
|  |  | 
|  | .end2: | 
|  | ITX4_END              3, 2, 1, 0 | 
|  |  | 
|  | INV_TXFM_4X4_FN identity, dct | 
|  | INV_TXFM_4X4_FN identity, adst | 
|  | INV_TXFM_4X4_FN identity, flipadst | 
|  | INV_TXFM_4X4_FN identity, identity | 
|  |  | 
|  | cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m0, [coeffq+16*0] | 
|  | mova                 m1, [coeffq+16*1] | 
|  | mova                 m3, [o(pw_1697x8)] | 
|  | pmulhrsw             m2, m0, m3 | 
|  | pmulhrsw             m3, m1 | 
|  | paddsw               m0, m2 | 
|  | paddsw               m1, m3 | 
|  | punpckhwd            m2, m0, m1 | 
|  | punpcklwd            m0, m1 | 
|  | punpckhwd            m1, m0, m2            ;high: in3 ;low :in2 | 
|  | punpcklwd            m0, m2                ;high: in1 ;low: in0 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | mova                 m3, [o(pw_1697x8)] | 
|  | pmulhrsw             m2, m3, m0 | 
|  | pmulhrsw             m3, m1 | 
|  | paddsw               m0, m2 | 
|  | paddsw               m1, m3 | 
|  | jmp m(iadst_4x4_internal_8bpc).end | 
|  |  | 
|  | %macro IWHT4_1D_PACKED 0 | 
|  | punpckhqdq           m3, m0, m1            ;low: in1 high: in3 | 
|  | punpcklqdq           m0, m1                ;low: in0 high: in2 | 
|  | psubw                m2, m0, m3            ;low: in0 - in1 high: in2 - in3 | 
|  | paddw                m0, m3                ;low: in0 + in1 high: in2 + in3 | 
|  | punpckhqdq           m2, m2                ;t2 t2 | 
|  | punpcklqdq           m0, m0                ;t0 t0 | 
|  | psubw                m1, m0, m2 | 
|  | psraw                m1, 1                 ;t4 t4 | 
|  | psubw                m1, m3                ;low: t1/out2 high: t3/out1 | 
|  | psubw                m0, m1                ;high: out0 | 
|  | paddw                m2, m1                ;low: out3 | 
|  | %endmacro | 
|  |  | 
|  | INIT_XMM sse2 | 
|  | cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff | 
|  | mova                 m0, [coeffq+16*0] | 
|  | mova                 m1, [coeffq+16*1] | 
|  | pxor                 m2, m2 | 
|  | mova      [coeffq+16*0], m2 | 
|  | mova      [coeffq+16*1], m2 | 
|  | psraw                m0, 2 | 
|  | psraw                m1, 2 | 
|  | IWHT4_1D_PACKED | 
|  | punpckhwd            m0, m1 | 
|  | punpcklwd            m3, m1, m2 | 
|  | punpckhdq            m1, m0, m3 | 
|  | punpckldq            m0, m3 | 
|  | IWHT4_1D_PACKED | 
|  | shufpd               m0, m2, 0x01 | 
|  | ITX4_END              0, 3, 2, 1, 0 | 
|  |  | 
|  | %macro IDCT8_1D_PACKED 0 | 
|  | mova                 m6, [o(pd_2048)] | 
|  | punpckhwd            m4, m0, m3                 ;unpacked in1 in7 | 
|  | punpcklwd            m0, m2                     ;unpacked in0 in4 | 
|  | punpckhwd            m2, m1                     ;unpacked in5 in3 | 
|  | punpcklwd            m1, m3                     ;unpacked in2 in6 | 
|  | ITX_MUL2X_PACK        4, 3, 6,  799, 4017       ;low: t7a high: t4a | 
|  | ITX_MUL2X_PACK        2, 3, 6, 3406, 2276       ;low: t6a high: t5a | 
|  | ITX_MUL2X_PACK        1, 3, 6, 1567, 3784       ;low: t3  high: t2 | 
|  | psubsw               m3, m4, m2                 ;low: t6a high: t5a | 
|  | paddsw               m4, m2                     ;low: t7  high: t4 | 
|  | pshufb               m3, [o(deint_shuf1)] | 
|  | ITX_MUL2X_PACK        0, 2, 6, 2896, 2896       ;low: t0  high: t1 | 
|  | ITX_MUL2X_PACK        3, 2, 6, 2896, 2896       ;low: t6  high: t5 | 
|  | psubsw               m2, m0, m1                 ;low: tmp3 high: tmp2 | 
|  | paddsw               m0, m1                     ;low: tmp0 high: tmp1 | 
|  | punpcklqdq           m1, m4, m3                 ;low: t7   high: t6 | 
|  | punpckhqdq           m4, m3                     ;low: t4   high: t5 | 
|  | psubsw               m3, m0, m1                 ;low: out7 high: out6 | 
|  | paddsw               m0, m1                     ;low: out0 high: out1 | 
|  | paddsw               m1, m2, m4                 ;low: out3 high: out2 | 
|  | psubsw               m2, m4                     ;low: out4 high: out5 | 
|  | %endmacro | 
|  |  | 
|  | ;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 | 
|  | ;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 | 
|  | %macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 | 
|  | punpckhwd           m%4, m%1, m%2 | 
|  | punpcklwd           m%1, m%2 | 
|  | %if %7 < 8 | 
|  | pmaddwd             m%2, m%7, m%1 | 
|  | pmaddwd             m%3, m%7, m%4 | 
|  | %else | 
|  | mova                m%2, [o(pw_%7_%6)] | 
|  | %if %8 | 
|  | pmaddwd             m%3, m%1, m%2 | 
|  | pmaddwd             m%2, m%4 | 
|  | %else | 
|  | pmaddwd             m%3, m%4, m%2 | 
|  | pmaddwd             m%2, m%1 | 
|  | %endif | 
|  | %endif | 
|  | paddd               m%3, m%5 | 
|  | paddd               m%2, m%5 | 
|  | psrad               m%3, 12 | 
|  | psrad               m%2, 12 | 
|  | %if %8 | 
|  | packssdw            m%3, m%2 | 
|  | %else | 
|  | packssdw            m%2, m%3                 ;dst2 | 
|  | %endif | 
|  | %if %7 < 8 | 
|  | pmaddwd             m%4, m%6 | 
|  | pmaddwd             m%1, m%6 | 
|  | %elif %8 | 
|  | mova                m%2, [o(pw_%6_m%7)] | 
|  | pmaddwd             m%4, m%2 | 
|  | pmaddwd             m%1, m%2 | 
|  | %else | 
|  | mova                m%3, [o(pw_%6_m%7)] | 
|  | pmaddwd             m%4, m%3 | 
|  | pmaddwd             m%1, m%3 | 
|  | %endif | 
|  | paddd               m%4, m%5 | 
|  | paddd               m%1, m%5 | 
|  | psrad               m%4, 12 | 
|  | psrad               m%1, 12 | 
|  | packssdw            m%1, m%4                 ;dst1 | 
|  | %endmacro | 
|  |  | 
|  | %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 | 
|  | ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 | 
|  | ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 | 
|  | psubsw              m%3, m%1, m%2                      ;out2 | 
|  | paddsw              m%2, m%1                           ;out1 | 
|  | paddsw              m%1, m%5, m%4                      ;out0 | 
|  | psubsw              m%4, m%5                           ;out3 | 
|  | %endmacro | 
|  |  | 
|  | %macro WRITE_4X8 4 ;row[1-4] | 
|  | WRITE_4X4             0, 1, 4, 5, 6, %1, %2, %3, %4 | 
|  | lea                dstq, [dstq+strideq*4] | 
|  | WRITE_4X4             2, 3, 4, 5, 6, %1, %2, %3, %4 | 
|  | %endmacro | 
|  |  | 
|  | %macro INV_4X8 0 | 
|  | punpckhwd            m4, m2, m3 | 
|  | punpcklwd            m2, m3 | 
|  | punpckhwd            m3, m0, m1 | 
|  | punpcklwd            m0, m1 | 
|  | punpckhdq            m1, m0, m2                  ;low: in2 high: in3 | 
|  | punpckldq            m0, m2                      ;low: in0 high: in1 | 
|  | punpckldq            m2, m3, m4                  ;low: in4 high: in5 | 
|  | punpckhdq            m3, m4                      ;low: in6 high: in7 | 
|  | %endmacro | 
|  |  | 
|  | %macro INV_TXFM_4X8_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 4x8, 8 | 
|  | %ifidn %1_%2, dct_dct | 
|  | pshuflw              m0, [coeffq], q0000 | 
|  | punpcklqdq           m0, m0 | 
|  | mova                 m1, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m1 | 
|  | mov           [coeffq], eobd | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, [o(pw_2048)] | 
|  | mova                 m1, m0 | 
|  | mova                 m2, m0 | 
|  | mova                 m3, m0 | 
|  | TAIL_CALL m(iadst_4x8_internal_8bpc).end3 | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | INIT_XMM ssse3 | 
|  | INV_TXFM_4X8_FN dct, dct | 
|  | INV_TXFM_4X8_FN dct, adst | 
|  | INV_TXFM_4X8_FN dct, flipadst | 
|  | INV_TXFM_4X8_FN dct, identity | 
|  |  | 
|  | cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [coeffq+16*0] | 
|  | pmulhrsw             m1, m3, [coeffq+16*1] | 
|  | pmulhrsw             m2, m3, [coeffq+16*2] | 
|  | pmulhrsw             m3,     [coeffq+16*3] | 
|  |  | 
|  | .pass1: | 
|  | call m(idct_8x4_internal_8bpc).main | 
|  | jmp m(iadst_4x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass2: | 
|  | call .main | 
|  | shufps               m1, m1, q1032 | 
|  | shufps               m3, m3, q1032 | 
|  | mova                 m4, [o(pw_2048)] | 
|  | jmp m(iadst_4x8_internal_8bpc).end2 | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | IDCT8_1D_PACKED | 
|  | ret | 
|  |  | 
|  |  | 
|  | INV_TXFM_4X8_FN adst, dct | 
|  | INV_TXFM_4X8_FN adst, adst | 
|  | INV_TXFM_4X8_FN adst, flipadst | 
|  | INV_TXFM_4X8_FN adst, identity | 
|  |  | 
|  | cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [coeffq+16*0] | 
|  | pmulhrsw             m1, m3, [coeffq+16*1] | 
|  | pmulhrsw             m2, m3, [coeffq+16*2] | 
|  | pmulhrsw             m3,     [coeffq+16*3] | 
|  |  | 
|  | .pass1: | 
|  | call m(iadst_8x4_internal_8bpc).main | 
|  |  | 
|  | .pass1_end: | 
|  | INV_4X8 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | shufps               m0, m0, q1032 | 
|  | shufps               m1, m1, q1032 | 
|  | call .main | 
|  | mova                 m4, [o(pw_2048)] | 
|  | pxor                 m5, m5 | 
|  | psubw                m5, m4 | 
|  |  | 
|  | .end: | 
|  | punpcklqdq           m4, m5 | 
|  |  | 
|  | .end2: | 
|  | pmulhrsw             m0, m4 | 
|  | pmulhrsw             m1, m4 | 
|  | pmulhrsw             m2, m4 | 
|  | pmulhrsw             m3, m4 | 
|  | pxor                 m5, m5 | 
|  | mova      [coeffq+16*0], m5 | 
|  | mova      [coeffq+16*1], m5 | 
|  | mova      [coeffq+16*2], m5 | 
|  | mova      [coeffq+16*3], m5 | 
|  |  | 
|  | .end3: | 
|  | WRITE_4X8             0, 1, 2, 3 | 
|  | RET | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova                 m6, [o(pd_2048)] | 
|  | punpckhwd            m4, m3, m0                ;unpacked in7 in0 | 
|  | punpckhwd            m5, m2, m1                ;unpacked in5 in2 | 
|  | punpcklwd            m1, m2                    ;unpacked in3 in4 | 
|  | punpcklwd            m0, m3                    ;unpacked in1 in6 | 
|  | ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a | 
|  | ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a | 
|  | ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a | 
|  | ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a | 
|  |  | 
|  | psubsw               m3, m4, m1                ;low:  t4    high:  t5 | 
|  | paddsw               m4, m1                    ;low:  t0    high:  t1 | 
|  | psubsw               m2, m5, m0                ;low:  t6    high:  t7 | 
|  | paddsw               m5, m0                    ;low:  t2    high:  t3 | 
|  |  | 
|  | shufps               m1, m3, m2, q1032 | 
|  | punpckhwd            m2, m1 | 
|  | punpcklwd            m3, m1 | 
|  | ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a | 
|  | ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a | 
|  |  | 
|  | psubsw               m1, m4, m5                ;low:  t2    high:  t3 | 
|  | paddsw               m4, m5                    ;low:  out0  high: -out7 | 
|  | psubsw               m5, m3, m2                ;low:  t7    high:  t6 | 
|  | paddsw               m3, m2                    ;low:  out6  high: -out1 | 
|  | shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1 | 
|  | shufps               m3, m4, q3210             ;low:  out6  high: -out7 | 
|  |  | 
|  | mova                 m2, [o(pw_2896_m2896)] | 
|  | mova                 m7, [o(pw_2896_2896)] | 
|  | shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7 | 
|  | shufps               m1, m5, q3210             ;low:  t2    high:  t6 | 
|  | punpcklwd            m5, m1, m4 | 
|  | punpckhwd            m1, m4 | 
|  | pmaddwd              m4, m2, m1                ;-out5 | 
|  | pmaddwd              m2, m5                    ; out4 | 
|  | pmaddwd              m1, m7                    ; out2 | 
|  | pmaddwd              m5, m7                    ;-out3 | 
|  | REPX      {paddd x, m6}, m4, m2, m1, m5 | 
|  | REPX      {psrad x, 12}, m4, m2, m1, m5 | 
|  | packssdw             m1, m5                    ;low:  out2  high: -out3 | 
|  | packssdw             m2, m4                    ;low:  out4  high: -out5 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_4X8_FN flipadst, dct | 
|  | INV_TXFM_4X8_FN flipadst, adst | 
|  | INV_TXFM_4X8_FN flipadst, flipadst | 
|  | INV_TXFM_4X8_FN flipadst, identity | 
|  |  | 
|  | cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [coeffq+16*0] | 
|  | pmulhrsw             m1, m3, [coeffq+16*1] | 
|  | pmulhrsw             m2, m3, [coeffq+16*2] | 
|  | pmulhrsw             m3,     [coeffq+16*3] | 
|  |  | 
|  | .pass1: | 
|  | call m(iadst_8x4_internal_8bpc).main | 
|  |  | 
|  | punpcklwd            m4, m3, m2 | 
|  | punpckhwd            m3, m2 | 
|  | punpcklwd            m5, m1, m0 | 
|  | punpckhwd            m1, m0 | 
|  | punpckldq            m2, m3, m1                  ;low: in4 high: in5 | 
|  | punpckhdq            m3, m1                      ;low: in6 high: in7 | 
|  | punpckldq            m0, m4, m5                  ;low: in0 high: in1 | 
|  | punpckhdq            m1, m4, m5                  ;low: in2 high: in3 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | shufps               m0, m0, q1032 | 
|  | shufps               m1, m1, q1032 | 
|  | call m(iadst_4x8_internal_8bpc).main | 
|  |  | 
|  | mova                 m4, m0 | 
|  | mova                 m5, m1 | 
|  | pshufd               m0, m3, q1032 | 
|  | pshufd               m1, m2, q1032 | 
|  | pshufd               m2, m5, q1032 | 
|  | pshufd               m3, m4, q1032 | 
|  | mova                 m5, [o(pw_2048)] | 
|  | pxor                 m4, m4 | 
|  | psubw                m4, m5 | 
|  | jmp m(iadst_4x8_internal_8bpc).end | 
|  |  | 
|  | INV_TXFM_4X8_FN identity, dct | 
|  | INV_TXFM_4X8_FN identity, adst | 
|  | INV_TXFM_4X8_FN identity, flipadst | 
|  | INV_TXFM_4X8_FN identity, identity | 
|  |  | 
|  | cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [coeffq+16*0] | 
|  | pmulhrsw             m1, m3, [coeffq+16*1] | 
|  | pmulhrsw             m2, m3, [coeffq+16*2] | 
|  | pmulhrsw             m3,     [coeffq+16*3] | 
|  |  | 
|  | .pass1: | 
|  | mova                 m7, [o(pw_1697x8)] | 
|  | pmulhrsw             m4, m7, m0 | 
|  | pmulhrsw             m5, m7, m1 | 
|  | pmulhrsw             m6, m7, m2 | 
|  | pmulhrsw             m7, m3 | 
|  | paddsw               m0, m4 | 
|  | paddsw               m1, m5 | 
|  | paddsw               m2, m6 | 
|  | paddsw               m3, m7 | 
|  | jmp m(iadst_4x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass2: | 
|  | mova                 m4, [o(pw_4096)] | 
|  | jmp m(iadst_4x8_internal_8bpc).end2 | 
|  |  | 
|  |  | 
|  | %macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3] | 
|  | movq                 m%3, [dstq        ] | 
|  | movq                 m%4, [dstq+strideq] | 
|  | pxor                 m%5, m%5 | 
|  | punpcklbw            m%3, m%5                 ;extend byte to word | 
|  | punpcklbw            m%4, m%5                 ;extend byte to word | 
|  | %ifnum %1 | 
|  | paddw                m%3, m%1 | 
|  | %else | 
|  | paddw                m%3, %1 | 
|  | %endif | 
|  | %ifnum %2 | 
|  | paddw                m%4, m%2 | 
|  | %else | 
|  | paddw                m%4, %2 | 
|  | %endif | 
|  | packuswb             m%3, m%4 | 
|  | movq      [dstq        ], m%3 | 
|  | punpckhqdq           m%3, m%3 | 
|  | movq      [dstq+strideq], m%3 | 
|  | %endmacro | 
|  |  | 
|  | %macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3] | 
|  | WRITE_8X2             %1, %2, %5, %6, %7 | 
|  | lea                dstq, [dstq+strideq*2] | 
|  | WRITE_8X2             %3, %4, %5, %6, %7 | 
|  | %endmacro | 
|  |  | 
|  | %macro INV_TXFM_8X4_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 8x4, 8 | 
|  | %ifidn %1_%2, dct_dct | 
|  | pshuflw              m0, [coeffq], q0000 | 
|  | punpcklqdq           m0, m0 | 
|  | mova                 m1, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, m1 | 
|  | mova                 m2, [o(pw_2048)] | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, m2 | 
|  | mova                 m1, m0 | 
|  | mova                 m2, m0 | 
|  | mova                 m3, m0 | 
|  | TAIL_CALL m(iadst_8x4_internal_8bpc).end2 | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_8X4_FN dct, dct | 
|  | INV_TXFM_8X4_FN dct, adst | 
|  | INV_TXFM_8X4_FN dct, flipadst | 
|  | INV_TXFM_8X4_FN dct, identity | 
|  |  | 
|  | cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [coeffq+16*0] | 
|  | pmulhrsw             m1, m3, [coeffq+16*1] | 
|  | pmulhrsw             m2, m3, [coeffq+16*2] | 
|  | pmulhrsw             m3,     [coeffq+16*3] | 
|  |  | 
|  | call m(idct_4x8_internal_8bpc).main | 
|  |  | 
|  | mova                 m4, [o(deint_shuf1)] | 
|  | mova                 m5, [o(deint_shuf2)] | 
|  | pshufb               m0, m4 | 
|  | pshufb               m1, m5 | 
|  | pshufb               m2, m4 | 
|  | pshufb               m3, m5 | 
|  | punpckhdq            m4, m0, m1 | 
|  | punpckldq            m0, m1 | 
|  | punpckhdq            m5, m2, m3 | 
|  | punpckldq            m2, m3 | 
|  | punpckhqdq           m1, m0, m2                      ;in1 | 
|  | punpcklqdq           m0, m2                          ;in0 | 
|  | punpckhqdq           m3, m4, m5                      ;in3 | 
|  | punpcklqdq           m2 ,m4, m5                      ;in2 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | call .main | 
|  | jmp m(iadst_8x4_internal_8bpc).end | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova                 m6, [o(pd_2048)] | 
|  | IDCT4_1D             0, 1, 2, 3, 4, 5, 6 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_8X4_FN adst, dct | 
|  | INV_TXFM_8X4_FN adst, adst | 
|  | INV_TXFM_8X4_FN adst, flipadst | 
|  | INV_TXFM_8X4_FN adst, identity | 
|  |  | 
|  | cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [coeffq+16*0] | 
|  | pmulhrsw             m1, m3, [coeffq+16*1] | 
|  | pmulhrsw             m2, m3, [coeffq+16*2] | 
|  | pmulhrsw             m3,     [coeffq+16*3] | 
|  |  | 
|  | shufps               m0, m0, q1032 | 
|  | shufps               m1, m1, q1032 | 
|  | call m(iadst_4x8_internal_8bpc).main | 
|  |  | 
|  | punpckhwd            m4, m0, m1 | 
|  | punpcklwd            m0, m1 | 
|  | punpckhwd            m1, m2, m3 | 
|  | punpcklwd            m2, m3 | 
|  | pxor                 m5, m5 | 
|  | psubsw               m3, m5, m1 | 
|  | psubsw               m5, m4 | 
|  | punpckhdq            m4, m5, m3 | 
|  | punpckldq            m5, m3 | 
|  | punpckhdq            m3, m0, m2 | 
|  | punpckldq            m0, m2 | 
|  | punpckhwd            m1, m0, m5      ;in1 | 
|  | punpcklwd            m0, m5          ;in0 | 
|  | punpcklwd            m2, m3, m4      ;in2 | 
|  | punpckhwd            m3, m4          ;in3 | 
|  | jmp              tx2q | 
|  |  | 
|  | .pass2: | 
|  | call .main | 
|  |  | 
|  | .end: | 
|  | mova                 m4, [o(pw_2048)] | 
|  | pmulhrsw             m0, m4 | 
|  | pmulhrsw             m1, m4 | 
|  | pmulhrsw             m2, m4 | 
|  | pmulhrsw             m3, m4 | 
|  |  | 
|  | .end2: | 
|  | pxor                 m6, m6 | 
|  | mova      [coeffq+16*0], m6 | 
|  | mova      [coeffq+16*1], m6 | 
|  | mova      [coeffq+16*2], m6 | 
|  | mova      [coeffq+16*3], m6 | 
|  | .end3: | 
|  | WRITE_8X4             0, 1, 2, 3, 4, 5, 6 | 
|  | RET | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | punpckhwd            m6, m0, m2                    ;unpacked in0 in2 | 
|  | punpcklwd            m0, m2                        ;unpacked in0 in2 | 
|  | punpckhwd            m7, m1, m3                    ;unpacked in1 in3 | 
|  | punpcklwd            m1, m3                        ;unpacked in1 in3 | 
|  |  | 
|  | mova                 m2, [o(pw_3344_m3344)] | 
|  | mova                 m4, [o(pw_0_3344)] | 
|  | pmaddwd              m3, m2, m6                    ;3344 * in0 - 3344 * in2 | 
|  | pmaddwd              m5, m4, m7                    ;3344 * in3 | 
|  | pmaddwd              m2, m0 | 
|  | pmaddwd              m4, m1 | 
|  | paddd                m3, m5 | 
|  | paddd                m2, m4 | 
|  | mova                 m4, [o(pd_2048)] | 
|  | paddd                m3, m4                        ;t2 + 2048 | 
|  | paddd                m2, m4 | 
|  | psrad                m3, 12 | 
|  | psrad                m2, 12 | 
|  | packssdw             m2, m3                        ;out2 | 
|  |  | 
|  | pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2 | 
|  | pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2 | 
|  | pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3 | 
|  | pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3 | 
|  | paddd                m3, m4                        ;t0 + t3 | 
|  |  | 
|  | pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3 | 
|  | mova                 m4, [o(pd_2048)] | 
|  | paddd                m0, m4 | 
|  | paddd                m4, m3                        ;t0 + t3 + 2048 | 
|  | paddd                m5, m0                        ;t1 + t3 + 2048 | 
|  | paddd                m3, m0 | 
|  | paddd                m3, m1                        ;t0 + t1 - t3 + 2048 | 
|  |  | 
|  | psrad                m4, 12                        ;out0 | 
|  | psrad                m5, 12                        ;out1 | 
|  | psrad                m3, 12                        ;out3 | 
|  | packssdw             m0, m4, m5                    ;low: out0  high: out1 | 
|  |  | 
|  | pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2 | 
|  | pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2 | 
|  | pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3 | 
|  | pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3 | 
|  | paddd                m1, m4                        ;t0 + t3 | 
|  | pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3 | 
|  |  | 
|  | mova                 m4, [o(pd_2048)] | 
|  | paddd                m6, m4 | 
|  | paddd                m4, m1                        ;t0 + t3 + 2048 | 
|  | paddd                m5, m6                        ;t1 + t3 + 2048 | 
|  | paddd                m1, m6 | 
|  | paddd                m1, m7                        ;t0 + t1 - t3 + 2048 | 
|  |  | 
|  | psrad                m4, 12                        ;out0 | 
|  | psrad                m5, 12                        ;out1 | 
|  | psrad                m1, 12                        ;out3 | 
|  | packssdw             m3, m1                        ;out3 | 
|  | packssdw             m4, m5                        ;low: out0  high: out1 | 
|  |  | 
|  | punpckhqdq           m1, m0, m4                    ;out1 | 
|  | punpcklqdq           m0, m4                        ;out0 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_8X4_FN flipadst, dct | 
|  | INV_TXFM_8X4_FN flipadst, adst | 
|  | INV_TXFM_8X4_FN flipadst, flipadst | 
|  | INV_TXFM_8X4_FN flipadst, identity | 
|  |  | 
|  | cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [coeffq+16*0] | 
|  | pmulhrsw             m1, m3, [coeffq+16*1] | 
|  | pmulhrsw             m2, m3, [coeffq+16*2] | 
|  | pmulhrsw             m3,     [coeffq+16*3] | 
|  |  | 
|  | shufps               m0, m0, q1032 | 
|  | shufps               m1, m1, q1032 | 
|  | call m(iadst_4x8_internal_8bpc).main | 
|  |  | 
|  | punpckhwd            m5, m3, m2 | 
|  | punpcklwd            m3, m2 | 
|  | punpckhwd            m2, m1, m0 | 
|  | punpcklwd            m1, m0 | 
|  |  | 
|  | pxor                 m0, m0 | 
|  | psubsw               m4, m0, m2 | 
|  | psubsw               m0, m5 | 
|  | punpckhdq            m2, m0, m4 | 
|  | punpckldq            m0, m4 | 
|  | punpckhdq            m4, m3, m1 | 
|  | punpckldq            m3, m1 | 
|  | punpckhwd            m1, m0, m3      ;in1 | 
|  | punpcklwd            m0, m3          ;in0 | 
|  | punpckhwd            m3, m2, m4      ;in3 | 
|  | punpcklwd            m2, m4          ;in2 | 
|  | jmp                  tx2q | 
|  |  | 
|  | .pass2: | 
|  | call m(iadst_8x4_internal_8bpc).main | 
|  | mova                 m4, m0 | 
|  | mova                 m5, m1 | 
|  | mova                 m0, m3 | 
|  | mova                 m1, m2 | 
|  | mova                 m2, m5 | 
|  | mova                 m3, m4 | 
|  | jmp m(iadst_8x4_internal_8bpc).end | 
|  |  | 
|  | INV_TXFM_8X4_FN identity, dct | 
|  | INV_TXFM_8X4_FN identity, adst | 
|  | INV_TXFM_8X4_FN identity, flipadst | 
|  | INV_TXFM_8X4_FN identity, identity | 
|  |  | 
|  | cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [coeffq+16*0] | 
|  | pmulhrsw             m1, m3, [coeffq+16*1] | 
|  | pmulhrsw             m2, m3, [coeffq+16*2] | 
|  | pmulhrsw             m3,     [coeffq+16*3] | 
|  | paddsw               m0, m0 | 
|  | paddsw               m1, m1 | 
|  | paddsw               m2, m2 | 
|  | paddsw               m3, m3 | 
|  |  | 
|  | punpckhwd            m4, m0, m1 | 
|  | punpcklwd            m0, m1 | 
|  | punpckhwd            m1, m2, m3 | 
|  | punpcklwd            m2, m3 | 
|  | punpckhdq            m5, m4, m1 | 
|  | punpckldq            m4, m1 | 
|  | punpckhdq            m3, m0, m2 | 
|  | punpckldq            m0, m2 | 
|  | punpckhwd            m1, m0, m4      ;in1 | 
|  | punpcklwd            m0, m4          ;in0 | 
|  | punpcklwd            m2, m3, m5      ;in2 | 
|  | punpckhwd            m3, m5          ;in3 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | mova                 m7, [o(pw_1697x8)] | 
|  | pmulhrsw             m4, m7, m0 | 
|  | pmulhrsw             m5, m7, m1 | 
|  | pmulhrsw             m6, m7, m2 | 
|  | pmulhrsw             m7, m3 | 
|  | paddsw               m0, m4 | 
|  | paddsw               m1, m5 | 
|  | paddsw               m2, m6 | 
|  | paddsw               m3, m7 | 
|  | jmp m(iadst_8x4_internal_8bpc).end | 
|  |  | 
|  | %macro INV_TXFM_8X8_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 8x8, 8, 16*4 | 
|  | %ifidn %1_%2, dct_dct | 
|  | pshuflw              m0, [coeffq], q0000 | 
|  | punpcklwd            m0, m0 | 
|  | mova                 m1, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m1 | 
|  | mova                 m2, [o(pw_16384)] | 
|  | mov            [coeffq], eobd | 
|  | pmulhrsw             m0, m2 | 
|  | psrlw                m2, 3 | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, m2 | 
|  | .end: | 
|  | mov                 r3d, 2 | 
|  | lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] | 
|  | .loop: | 
|  | WRITE_8X4             0, 0, 0, 0, 1, 2, 3 | 
|  | lea                dstq, [dstq+strideq*2] | 
|  | dec                 r3d | 
|  | jg .loop | 
|  | jmp                tx2q | 
|  | .end3: | 
|  | RET | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 | 
|  | %if %3 | 
|  | mova                 m7, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m7, [%1+%2*0] | 
|  | pmulhrsw             m1, m7, [%1+%2*1] | 
|  | pmulhrsw             m2, m7, [%1+%2*2] | 
|  | pmulhrsw             m3, m7, [%1+%2*3] | 
|  | pmulhrsw             m4, m7, [%1+%2*4] | 
|  | pmulhrsw             m5, m7, [%1+%2*5] | 
|  | pmulhrsw             m6, m7, [%1+%2*6] | 
|  | pmulhrsw             m7, [%1+%2*7] | 
|  | %else | 
|  | mova                 m0, [%1+%2*0] | 
|  | mova                 m1, [%1+%2*1] | 
|  | mova                 m2, [%1+%2*2] | 
|  | mova                 m3, [%1+%2*3] | 
|  | mova                 m4, [%1+%2*4] | 
|  | mova                 m5, [%1+%2*5] | 
|  | mova                 m6, [%1+%2*6] | 
|  | mova                 m7, [%1+%2*7] | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | %macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 | 
|  | ITX_MULSUB_2W         %1, %4, %5, %6, %7,  799, 4017    ;t4a, t7a | 
|  | ITX_MULSUB_2W         %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a | 
|  | psubsw               m%2, m%4, m%5                      ;t6a | 
|  | paddsw               m%4, m%5                           ;t7 | 
|  | psubsw               m%5, m%1, m%3                      ;t5a | 
|  | paddsw               m%1, m%3                           ;t4 | 
|  | ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_8X8_FN dct, dct | 
|  | INV_TXFM_8X8_FN dct, adst | 
|  | INV_TXFM_8X8_FN dct, flipadst | 
|  | INV_TXFM_8X8_FN dct, identity | 
|  |  | 
|  | cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS          coeffq, 16 | 
|  |  | 
|  | .pass1: | 
|  | call .main | 
|  |  | 
|  | .pass1_end: | 
|  | mova                    m7, [o(pw_16384)] | 
|  |  | 
|  | .pass1_end1: | 
|  | REPX      {pmulhrsw x, m7}, m0, m2, m4, m6 | 
|  | mova    [rsp+gprsize+16*1], m6 | 
|  |  | 
|  | .pass1_end2: | 
|  | REPX      {pmulhrsw x, m7}, m1, m3, m5 | 
|  | pmulhrsw                m7, [rsp+gprsize+16*0] | 
|  |  | 
|  | cglobal_label .pass1_end3 | 
|  | punpcklwd               m6, m1, m5             ;10 50 11 51 12 52 13 53 | 
|  | punpckhwd               m1, m5                 ;14 54 15 55 16 56 17 57 | 
|  | punpckhwd               m5, m0, m4             ;04 44 05 45 06 46 07 47 | 
|  | punpcklwd               m0, m4                 ;00 40 01 41 02 42 03 43 | 
|  | punpckhwd               m4, m3, m7             ;34 74 35 75 36 76 37 77 | 
|  | punpcklwd               m3, m7                 ;30 70 31 71 32 72 33 73 | 
|  | punpckhwd               m7, m1, m4             ;16 36 56 76 17 37 57 77 | 
|  | punpcklwd               m1, m4                 ;14 34 54 74 15 35 55 75 | 
|  | punpckhwd               m4, m6, m3             ;12 32 52 72 13 33 53 73 | 
|  | punpcklwd               m6, m3                 ;10 30 50 70 11 31 51 71 | 
|  | mova    [rsp+gprsize+16*2], m6 | 
|  | mova                    m6, [rsp+gprsize+16*1] | 
|  | punpckhwd               m3, m2, m6             ;24 64 25 65 26 66 27 67 | 
|  | punpcklwd               m2, m6                 ;20 60 21 61 22 62 23 63 | 
|  | punpckhwd               m6, m5, m3             ;06 26 46 66 07 27 47 67 | 
|  | punpcklwd               m5, m3                 ;04 24 44 64 05 25 45 65 | 
|  | punpckhwd               m3, m0, m2             ;02 22 42 62 03 23 43 63 | 
|  | punpcklwd               m0, m2                 ;00 20 40 60 01 21 41 61 | 
|  |  | 
|  | punpckhwd               m2, m6, m7             ;07 17 27 37 47 57 67 77 | 
|  | punpcklwd               m6, m7                 ;06 16 26 36 46 56 66 76 | 
|  | mova    [rsp+gprsize+16*0], m2 | 
|  | punpcklwd               m2, m3, m4             ;02 12 22 32 42 52 62 72 | 
|  | punpckhwd               m3, m4                 ;03 13 23 33 43 53 63 73 | 
|  | punpcklwd               m4, m5, m1             ;04 14 24 34 44 54 64 74 | 
|  | punpckhwd               m5, m1                 ;05 15 25 35 45 55 65 75 | 
|  | mova                    m7, [rsp+gprsize+16*2] | 
|  | punpckhwd               m1, m0, m7             ;01 11 21 31 41 51 61 71 | 
|  | punpcklwd               m0, m7                 ;00 10 20 30 40 50 60 70 | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | jmp                   tx2q | 
|  |  | 
|  | .pass2: | 
|  | lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)] | 
|  |  | 
|  | .pass2_main: | 
|  | call .main | 
|  |  | 
|  | .end: | 
|  | mova                    m7, [o(pw_2048)] | 
|  | REPX      {pmulhrsw x, m7}, m0, m2, m4, m6 | 
|  | mova    [rsp+gprsize+16*1], m6 | 
|  |  | 
|  | .end2: | 
|  | REPX      {pmulhrsw x, m7}, m1, m3, m5 | 
|  | pmulhrsw                m7, [rsp+gprsize+16*0] | 
|  | mova    [rsp+gprsize+16*2], m5 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  |  | 
|  | .end3: | 
|  | WRITE_8X4                0, 1, 2, 3, 5, 6, 7 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | WRITE_8X4                4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 | 
|  | jmp                   tx2q | 
|  |  | 
|  | .end4: | 
|  | pxor                    m7, m7 | 
|  | REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7 | 
|  | ret | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova  [rsp+gprsize*2+16*0], m7 | 
|  | mova  [rsp+gprsize*2+16*1], m3 | 
|  | mova  [rsp+gprsize*2+16*2], m1 | 
|  | mova                    m7, [o(pd_2048)] | 
|  | IDCT4_1D                 0, 2, 4, 6, 1, 3, 7 | 
|  | mova                    m3, [rsp+gprsize*2+16*2] | 
|  | mova  [rsp+gprsize*2+16*2], m2 | 
|  | mova                    m2, [rsp+gprsize*2+16*1] | 
|  | mova  [rsp+gprsize*2+16*1], m4 | 
|  | mova                    m4, [rsp+gprsize*2+16*0] | 
|  | mova  [rsp+gprsize*2+16*0], m6 | 
|  | IDCT8_1D_ODDHALF         3, 2, 5, 4, 1, 6, 7 | 
|  | mova                    m6, [rsp+gprsize*2+16*0] | 
|  | psubsw                  m7, m0, m4                    ;out7 | 
|  | paddsw                  m0, m4                        ;out0 | 
|  | mova  [rsp+gprsize*2+16*0], m7 | 
|  | mova                    m1, [rsp+gprsize*2+16*2] | 
|  | psubsw                  m4, m6, m3                    ;out4 | 
|  | paddsw                  m3, m6                        ;out3 | 
|  | mova                    m7, [rsp+gprsize*2+16*1] | 
|  | psubsw                  m6, m1, m5                    ;out6 | 
|  | paddsw                  m1, m5                        ;out1 | 
|  | psubsw                  m5, m7, m2                    ;out5 | 
|  | paddsw                  m2, m7                        ;out2 | 
|  | ret | 
|  |  | 
|  |  | 
|  | INV_TXFM_8X8_FN adst, dct | 
|  | INV_TXFM_8X8_FN adst, adst | 
|  | INV_TXFM_8X8_FN adst, flipadst | 
|  | INV_TXFM_8X8_FN adst, identity | 
|  |  | 
|  | cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS          coeffq, 16 | 
|  |  | 
|  | .pass1: | 
|  | call .main | 
|  | call .main_pass1_end | 
|  |  | 
|  | .pass1_end: | 
|  | mova                    m7, [o(pw_16384)] | 
|  |  | 
|  | .pass1_end1: | 
|  | REPX      {pmulhrsw x, m7}, m0, m2, m4, m6 | 
|  | mova    [rsp+gprsize+16*1], m6 | 
|  | pxor                    m6, m6 | 
|  | psubw                   m6, m7 | 
|  | mova                    m7, m6 | 
|  | jmp m(idct_8x8_internal_8bpc).pass1_end2 | 
|  |  | 
|  | ALIGN function_align | 
|  | .pass2: | 
|  | lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)] | 
|  |  | 
|  | .pass2_main: | 
|  | call .main | 
|  | call .main_pass2_end | 
|  |  | 
|  | .end: | 
|  | mova                    m7, [o(pw_2048)] | 
|  | REPX      {pmulhrsw x, m7}, m0, m2, m4, m6 | 
|  | mova    [rsp+gprsize+16*1], m6 | 
|  | pxor                    m6, m6 | 
|  | psubw                   m6, m7 | 
|  | mova                    m7, m6 | 
|  | jmp m(idct_8x8_internal_8bpc).end2 | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova  [rsp+gprsize*2+16*0], m7 | 
|  | mova  [rsp+gprsize*2+16*1], m3 | 
|  | mova  [rsp+gprsize*2+16*2], m4 | 
|  | mova                    m7, [o(pd_2048)] | 
|  | ITX_MULSUB_2W            5, 2, 3, 4, 7, 1931, 3612    ;t3a, t2a | 
|  | ITX_MULSUB_2W            1, 6, 3, 4, 7, 3920, 1189    ;t7a, t6a | 
|  | paddsw                  m3, m2, m6                    ;t2 | 
|  | psubsw                  m2, m6                        ;t6 | 
|  | paddsw                  m4, m5, m1                    ;t3 | 
|  | psubsw                  m5, m1                        ;t7 | 
|  | ITX_MULSUB_2W            5, 2, 1, 6, 7, 3784, 1567    ;t6a, t7a | 
|  |  | 
|  | mova                    m6, [rsp+gprsize*2+16*2] | 
|  | mova  [rsp+gprsize*2+16*2], m5 | 
|  | mova                    m1, [rsp+gprsize*2+16*1] | 
|  | mova  [rsp+gprsize*2+16*1], m2 | 
|  | mova                    m5, [rsp+gprsize*2+16*0] | 
|  | mova  [rsp+gprsize*2+16*0], m3 | 
|  | ITX_MULSUB_2W            5, 0, 2, 3, 7,  401, 4076    ;t1a, t0a | 
|  | ITX_MULSUB_2W            1, 6, 2, 3, 7, 3166, 2598    ;t5a, t4a | 
|  | psubsw                  m2, m0, m6                    ;t4 | 
|  | paddsw                  m0, m6                        ;t0 | 
|  | paddsw                  m3, m5, m1                    ;t1 | 
|  | psubsw                  m5, m1                        ;t5 | 
|  | ITX_MULSUB_2W            2, 5, 1, 6, 7, 1567, 3784    ;t5a, t4a | 
|  |  | 
|  | mova                    m7, [rsp+gprsize*2+16*0] | 
|  | paddsw                  m1, m3, m4                    ;-out7 | 
|  | psubsw                  m3, m4                        ;t3 | 
|  | mova  [rsp+gprsize*2+16*0], m1 | 
|  | psubsw                  m4, m0, m7                    ;t2 | 
|  | paddsw                  m0, m7                        ;out0 | 
|  | mova                    m6, [rsp+gprsize*2+16*2] | 
|  | mova                    m7, [rsp+gprsize*2+16*1] | 
|  | paddsw                  m1, m5, m6                    ;-out1 | 
|  | psubsw                  m5, m6                        ;t6 | 
|  | paddsw                  m6, m2, m7                    ;out6 | 
|  | psubsw                  m2, m7                        ;t7 | 
|  | ret | 
|  | ALIGN function_align | 
|  | .main_pass1_end: | 
|  | mova  [rsp+gprsize*2+16*1], m1 | 
|  | mova  [rsp+gprsize*2+16*2], m6 | 
|  | punpckhwd               m1, m4, m3 | 
|  | punpcklwd               m4, m3 | 
|  | punpckhwd               m7, m5, m2 | 
|  | punpcklwd               m5, m2 | 
|  | mova                    m2, [o(pw_2896_2896)] | 
|  | mova                    m6, [o(pd_2048)] | 
|  | pmaddwd                 m3, m2, m7 | 
|  | pmaddwd                 m2, m5 | 
|  | paddd                   m3, m6 | 
|  | paddd                   m2, m6 | 
|  | psrad                   m3, 12 | 
|  | psrad                   m2, 12 | 
|  | packssdw                m2, m3                        ;out2 | 
|  | mova                    m3, [o(pw_2896_m2896)] | 
|  | pmaddwd                 m7, m3 | 
|  | pmaddwd                 m5, m3 | 
|  | paddd                   m7, m6 | 
|  | paddd                   m5, m6 | 
|  | psrad                   m7, 12 | 
|  | psrad                   m5, 12 | 
|  | packssdw                m5, m7                        ;-out5 | 
|  | mova                    m3, [o(pw_2896_2896)] | 
|  | pmaddwd                 m7, m3, m1 | 
|  | pmaddwd                 m3, m4 | 
|  | paddd                   m7, m6 | 
|  | paddd                   m3, m6 | 
|  | psrad                   m7, 12 | 
|  | psrad                   m3, 12 | 
|  | packssdw                m3, m7                        ;-out3 | 
|  | mova                    m7, [o(pw_2896_m2896)] | 
|  | pmaddwd                 m1, m7 | 
|  | pmaddwd                 m4, m7 | 
|  | paddd                   m1, m6 | 
|  | paddd                   m4, m6 | 
|  | psrad                   m1, 12 | 
|  | psrad                   m4, 12 | 
|  | packssdw                m4, m1                        ;-out5 | 
|  | mova                    m1, [rsp+gprsize*2+16*1] | 
|  | mova                    m6, [rsp+gprsize*2+16*2] | 
|  | ret | 
|  | ALIGN function_align | 
|  | cglobal_label .main_pass2_end | 
|  | paddsw                  m7, m4, m3                    ;t2 + t3 | 
|  | psubsw                  m4, m3                        ;t2 - t3 | 
|  | paddsw                  m3, m5, m2                    ;t6 + t7 | 
|  | psubsw                  m5, m2                        ;t6 - t7 | 
|  | mova                    m2, [o(pw_2896x8)] | 
|  | pmulhrsw                m4, m2                        ;out4 | 
|  | pmulhrsw                m5, m2                        ;-out5 | 
|  | pmulhrsw                m7, m2                        ;-out3 | 
|  | pmulhrsw                m2, m3                        ;out2 | 
|  | mova                    m3, m7 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_8X8_FN flipadst, dct | 
|  | INV_TXFM_8X8_FN flipadst, adst | 
|  | INV_TXFM_8X8_FN flipadst, flipadst | 
|  | INV_TXFM_8X8_FN flipadst, identity | 
|  |  | 
|  | cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS          coeffq, 16 | 
|  |  | 
|  | .pass1: | 
|  | call m(iadst_8x8_internal_8bpc).main | 
|  | call m(iadst_8x8_internal_8bpc).main_pass1_end | 
|  |  | 
|  | .pass1_end: | 
|  | mova                    m7, [o(pw_m16384)] | 
|  |  | 
|  | .pass1_end1: | 
|  | pmulhrsw                m1, m7 | 
|  | mova    [rsp+gprsize+16*1], m1 | 
|  | mova                    m1, m6 | 
|  | mova                    m6, m2 | 
|  | pmulhrsw                m2, m5, m7 | 
|  | mova                    m5, m6 | 
|  | mova                    m6, m4 | 
|  | pmulhrsw                m4, m3, m7 | 
|  | mova                    m3, m6 | 
|  | mova                    m6, m0 | 
|  | mova                    m0, m7 | 
|  | pxor                    m7, m7 | 
|  | psubw                   m7, m0 | 
|  | pmulhrsw                m0, [rsp+gprsize+16*0] | 
|  | REPX      {pmulhrsw x, m7}, m1, m3, m5 | 
|  | pmulhrsw                m7, m6 | 
|  | jmp m(idct_8x8_internal_8bpc).pass1_end3 | 
|  |  | 
|  | ALIGN function_align | 
|  | .pass2: | 
|  | lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)] | 
|  |  | 
|  | .pass2_main: | 
|  | call m(iadst_8x8_internal_8bpc).main | 
|  | call m(iadst_8x8_internal_8bpc).main_pass2_end | 
|  |  | 
|  | .end: | 
|  | mova                    m7, [o(pw_2048)] | 
|  | REPX      {pmulhrsw x, m7}, m0, m2, m4, m6 | 
|  | mova    [rsp+gprsize+16*2], m2 | 
|  | mova                    m2, m0 | 
|  | pxor                    m0, m0 | 
|  | psubw                   m0, m7 | 
|  | mova                    m7, m2 | 
|  | pmulhrsw                m1, m0 | 
|  | pmulhrsw                m2, m5, m0 | 
|  | mova    [rsp+gprsize+16*1], m1 | 
|  | mova                    m5, m4 | 
|  | mova                    m1, m6 | 
|  | pmulhrsw                m4, m3, m0 | 
|  | pmulhrsw                m0, [rsp+gprsize+16*0] | 
|  | mova                    m3, m5 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | jmp m(idct_8x8_internal_8bpc).end3 | 
|  |  | 
|  | INV_TXFM_8X8_FN identity, dct | 
|  | INV_TXFM_8X8_FN identity, adst | 
|  | INV_TXFM_8X8_FN identity, flipadst | 
|  | INV_TXFM_8X8_FN identity, identity | 
|  |  | 
|  | cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS          coeffq, 16 | 
|  | mova    [rsp+gprsize+16*1], m6 | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end3 | 
|  |  | 
|  | ALIGN function_align | 
|  | .pass2: | 
|  | lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)] | 
|  |  | 
|  | .end: | 
|  | pmulhrsw                m7, [o(pw_4096)] | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_4096)] | 
|  | REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 | 
|  | mova    [rsp+gprsize+16*2], m5 | 
|  | mova    [rsp+gprsize+16*1], m6 | 
|  | jmp m(idct_8x8_internal_8bpc).end3 | 
|  |  | 
|  |  | 
|  | %macro INV_TXFM_4X16_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 4x16, 8 | 
|  | %ifidn %1_%2, dct_dct | 
|  | pshuflw               m0, [coeffq], q0000 | 
|  | punpcklwd             m0, m0 | 
|  | mova                  m1, [o(pw_2896x8)] | 
|  | pmulhrsw              m0, m1 | 
|  | mov             [coeffq], eobd | 
|  | pmulhrsw              m0, [o(pw_16384)] | 
|  | pmulhrsw              m0, m1 | 
|  | pmulhrsw              m0, [o(pw_2048)] | 
|  | .end: | 
|  | WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3 | 
|  | lea                dstq, [dstq+strideq*4] | 
|  | WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3 | 
|  | lea                dstq, [dstq+strideq*4] | 
|  | WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3 | 
|  | lea                dstq, [dstq+strideq*4] | 
|  | WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3 | 
|  | RET | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_4X16_FN dct, dct | 
|  | INV_TXFM_4X16_FN dct, adst | 
|  | INV_TXFM_4X16_FN dct, flipadst | 
|  | INV_TXFM_4X16_FN dct, identity | 
|  |  | 
|  | cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | lea                  r3, [o(m(idct_4x8_internal_8bpc).pass1)] | 
|  |  | 
|  | .pass1: | 
|  | mova                 m0, [coeffq+16*1] | 
|  | mova                 m1, [coeffq+16*3] | 
|  | mova                 m2, [coeffq+16*5] | 
|  | mova                 m3, [coeffq+16*7] | 
|  | push               tx2q | 
|  | lea                tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] | 
|  | jmp                  r3 | 
|  |  | 
|  | .pass1_2: | 
|  | mova      [coeffq+16*1], m0 | 
|  | mova      [coeffq+16*3], m1 | 
|  | mova      [coeffq+16*5], m2 | 
|  | mova      [coeffq+16*7], m3 | 
|  | mova                 m0, [coeffq+16*0] | 
|  | mova                 m1, [coeffq+16*2] | 
|  | mova                 m2, [coeffq+16*4] | 
|  | mova                 m3, [coeffq+16*6] | 
|  | lea                tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)] | 
|  | jmp                  r3 | 
|  |  | 
|  | .pass1_end: | 
|  | pop                tx2q | 
|  |  | 
|  | mova                 m4, [coeffq+16*1] | 
|  | mova                 m5, [coeffq+16*3] | 
|  | mova                 m6, [coeffq+16*5] | 
|  | mova                 m7, [o(pw_16384)] | 
|  | REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 | 
|  |  | 
|  | pmulhrsw             m7, [coeffq+16*7] | 
|  | mova       [coeffq+16*7], m7 | 
|  | jmp                tx2q | 
|  |  | 
|  | .pass2: | 
|  | call m(idct_16x4_internal_8bpc).main | 
|  |  | 
|  | .end: | 
|  | mova                  m7, [o(pw_2048)] | 
|  | REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 | 
|  | pmulhrsw              m7, [coeffq+16*7] | 
|  | mova       [coeffq+16*4], m4 | 
|  |  | 
|  | .end1: | 
|  | mova       [coeffq+16*5], m5 | 
|  | mova       [coeffq+16*6], m6 | 
|  | mov                   r3, coeffq | 
|  | WRITE_4X8              0, 1, 3, 2 | 
|  |  | 
|  | mova                  m0, [r3+16*4] | 
|  | mova                  m1, [r3+16*5] | 
|  | mova                  m2, [r3+16*6] | 
|  | mova                  m3, m7 | 
|  | lea                 dstq, [dstq+strideq*4] | 
|  | WRITE_4X8              0, 1, 3, 2 | 
|  |  | 
|  | .end2: | 
|  | pxor                  m7, m7 | 
|  | REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_4X16_FN adst, dct | 
|  | INV_TXFM_4X16_FN adst, adst | 
|  | INV_TXFM_4X16_FN adst, flipadst | 
|  | INV_TXFM_4X16_FN adst, identity | 
|  |  | 
|  | cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | lea                   r3, [o(m(iadst_4x8_internal_8bpc).pass1)] | 
|  | jmp   m(idct_4x16_internal_8bpc).pass1 | 
|  |  | 
|  | .pass2: | 
|  | call m(iadst_16x4_internal_8bpc).main | 
|  | call m(iadst_16x4_internal_8bpc).main_pass2_end | 
|  |  | 
|  | punpcklqdq            m6, m5, m4                ;low: -out5  high: -out7 | 
|  | punpckhqdq            m4, m5                    ;low:  out8  high:  out10 | 
|  | punpcklqdq            m5, m7, m2                ;low:  out4  high:  out6 | 
|  | punpckhqdq            m2, m7                    ;low: -out9  high: -out11 | 
|  | mova       [coeffq+16*4], m2 | 
|  | mova       [coeffq+16*5], m6 | 
|  | mova                  m2, [coeffq+16*6] | 
|  | mova                  m6, [coeffq+16*7] | 
|  | punpckhqdq            m1, m6, m0                ;low: -out13 high: -out15 | 
|  | punpcklqdq            m0, m6                    ;low:  out0  high:  out2 | 
|  | punpckhqdq            m6, m3, m2                ;low:  out12 high:  out14 | 
|  | punpcklqdq            m2, m3                    ;low: -out1  high: -out3 | 
|  |  | 
|  | mova                  m7, [o(pw_2048)] | 
|  |  | 
|  | .end1: | 
|  | REPX    {pmulhrsw x, m7}, m0, m5, m4, m6 | 
|  | pxor                  m3, m3 | 
|  | psubw                 m3, m7 | 
|  | mova                  m7, [coeffq+16*4] | 
|  | REPX    {pmulhrsw x, m3}, m2, m7, m1 | 
|  | pmulhrsw              m3, [coeffq+16*5] | 
|  | mova       [coeffq+16*7], m5 | 
|  |  | 
|  | punpckhqdq            m5, m4, m7                ;low:  out10 high:  out11 | 
|  | punpcklqdq            m4, m7                    ;low:  out8  high:  out9 | 
|  | punpckhqdq            m7, m6, m1                ;low:  out14 high:  out15 | 
|  | punpcklqdq            m6, m1                    ;low:  out12 high:  out13 | 
|  | punpckhqdq            m1, m0, m2                ;low:  out2  high:  out3 | 
|  | punpcklqdq            m0, m2                    ;low:  out0  high:  out1 | 
|  | mova       [coeffq+16*4], m4 | 
|  | mova                  m4, [coeffq+16*7] | 
|  | punpcklqdq            m2, m4, m3                ;low:  out4  high:  out5 | 
|  | punpckhqdq            m4, m3                    ;low:  out6  high:  out7 | 
|  | mova                  m3, m4 | 
|  |  | 
|  | .end2: | 
|  | mova       [coeffq+16*5], m5 | 
|  | mova       [coeffq+16*6], m6 | 
|  | mov                   r3, coeffq | 
|  | WRITE_4X8              0, 1, 2, 3 | 
|  |  | 
|  | mova                  m0, [r3+16*4] | 
|  | mova                  m1, [r3+16*5] | 
|  | mova                  m2, [r3+16*6] | 
|  | mova                  m3, m7 | 
|  | lea                 dstq, [dstq+strideq*4] | 
|  | WRITE_4X8              0, 1, 2, 3 | 
|  |  | 
|  | .end3: | 
|  | pxor                  m7, m7 | 
|  | REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7 | 
|  | ret | 
|  |  | 
|  |  | 
|  | INV_TXFM_4X16_FN flipadst, dct | 
|  | INV_TXFM_4X16_FN flipadst, adst | 
|  | INV_TXFM_4X16_FN flipadst, flipadst | 
|  | INV_TXFM_4X16_FN flipadst, identity | 
|  |  | 
|  | cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | lea                   r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] | 
|  | jmp   m(idct_4x16_internal_8bpc).pass1 | 
|  |  | 
|  | .pass2: | 
|  | call m(iadst_16x4_internal_8bpc).main | 
|  | call m(iadst_16x4_internal_8bpc).main_pass2_end | 
|  |  | 
|  | punpckhqdq            m6, m5, m4                ;low:  out5  high:  out7 | 
|  | punpcklqdq            m4, m5                    ;low: -out8  high: -out10 | 
|  | punpckhqdq            m5, m7, m2                ;low: -out4  high: -out6 | 
|  | punpcklqdq            m2, m7                    ;low:  out9  high:  out11 | 
|  | mova       [coeffq+16*4], m2 | 
|  | mova       [coeffq+16*5], m6 | 
|  | mova                  m2, [coeffq+16*6] | 
|  | mova                  m6, [coeffq+16*7] | 
|  | punpcklqdq            m1, m6, m0                ;low:  out13 high:  out15 | 
|  | punpckhqdq            m0, m6                    ;low: -out0  high: -out2 | 
|  | punpcklqdq            m6, m3, m2                ;low: -out12 high: -out14 | 
|  | punpckhqdq            m2, m3                    ;low:  out1  high:  out3 | 
|  |  | 
|  | mova                  m7, [o(pw_m2048)] | 
|  | jmp   m(iadst_4x16_internal_8bpc).end1 | 
|  |  | 
|  |  | 
|  | INV_TXFM_4X16_FN identity, dct | 
|  | INV_TXFM_4X16_FN identity, adst | 
|  | INV_TXFM_4X16_FN identity, flipadst | 
|  | INV_TXFM_4X16_FN identity, identity | 
|  |  | 
|  | %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] | 
|  | pmulhrsw            m%2, m%3, m%1 | 
|  | %if %0 == 4 ; if downshifting by 1 | 
|  | pmulhrsw            m%2, m%4 | 
|  | %else | 
|  | paddsw              m%1, m%1 | 
|  | %endif | 
|  | paddsw              m%1, m%2 | 
|  | %endmacro | 
|  |  | 
|  | cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                  m0, [coeffq+16*1] | 
|  | mova                  m6, [o(pw_1697x8)] | 
|  | mova                  m1, [coeffq+16*3] | 
|  | mova                  m2, [coeffq+16*5] | 
|  | mova                  m3, [coeffq+16*7] | 
|  | pcmpeqw               m7, m7 | 
|  | mov                   r3, tx2q | 
|  | lea                 tx2q, [o(.pass1_2)] | 
|  | .pass1: | 
|  | pmulhrsw              m4, m6, m0 | 
|  | pmulhrsw              m5, m6, m1 | 
|  | pavgw                 m4, m0 | 
|  | pcmpeqw               m0, m7 | 
|  | pavgw                 m5, m1 | 
|  | pcmpeqw               m1, m7 | 
|  | pandn                 m0, m4 | 
|  | pmulhrsw              m4, m6, m2 | 
|  | pandn                 m1, m5 | 
|  | pmulhrsw              m5, m6, m3 | 
|  | pavgw                 m4, m2 | 
|  | pcmpeqw               m2, m7 | 
|  | pavgw                 m5, m3 | 
|  | pcmpeqw               m3, m7 | 
|  | pandn                 m2, m4 | 
|  | pandn                 m3, m5 | 
|  | jmp m(iadst_4x8_internal_8bpc).pass1_end | 
|  | .pass1_2: | 
|  | mova       [coeffq+16*1], m0 | 
|  | mova       [coeffq+16*3], m1 | 
|  | mova       [coeffq+16*5], m2 | 
|  | mova       [coeffq+16*7], m3 | 
|  | mova                  m0, [coeffq+16*0] | 
|  | mova                  m1, [coeffq+16*2] | 
|  | mova                  m2, [coeffq+16*4] | 
|  | mova                  m3, [coeffq+16*6] | 
|  | lea                 tx2q, [o(.pass1_end)] | 
|  | jmp .pass1 | 
|  | .pass1_end: | 
|  | mova                  m4, [coeffq+16*1] | 
|  | mova                  m5, [coeffq+16*3] | 
|  | mova                  m6, [coeffq+16*5] | 
|  | jmp                   r3 | 
|  | .pass2: | 
|  | mova                  m7, [o(pw_1697x16)] | 
|  | mova       [coeffq+16*6], m6 | 
|  | REPX    {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 | 
|  | mova                  m6, [coeffq+16*7] | 
|  | IDTX16                 6, 7, 7 | 
|  | mova       [coeffq+16*7], m6 | 
|  | mova                  m6, [coeffq+16*6] | 
|  | pmulhrsw              m7, m6, [o(pw_1697x16)] | 
|  | paddsw                m6, m6 | 
|  | paddsw                m6, m7 | 
|  | mova                  m7, [o(pw_2048)] | 
|  | REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 | 
|  | pmulhrsw              m7, [coeffq+16*7] | 
|  | mova       [coeffq+16*4], m4 | 
|  | jmp m(iadst_4x16_internal_8bpc).end2 | 
|  |  | 
|  |  | 
|  | %macro INV_TXFM_16X4_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 16x4, 8 | 
|  | %ifidn %1_%2, dct_dct | 
|  | movd                 m1, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m1, [coeffq] | 
|  | movd                 m2, [o(pw_16384)] | 
|  | mov            [coeffq], eobd | 
|  | mov                 r2d, 2 | 
|  | lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] | 
|  | .dconly: | 
|  | pmulhrsw             m0, m2 | 
|  | movd                 m2, [o(pw_2048)]              ;intentionally rip-relative | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, m2 | 
|  | pshuflw              m0, m0, q0000 | 
|  | punpcklwd            m0, m0 | 
|  | pxor                 m5, m5 | 
|  | .dconly_loop: | 
|  | mova                 m1, [dstq] | 
|  | mova                 m3, [dstq+strideq] | 
|  | punpckhbw            m2, m1, m5 | 
|  | punpcklbw            m1, m5 | 
|  | punpckhbw            m4, m3, m5 | 
|  | punpcklbw            m3, m5 | 
|  | paddw                m2, m0 | 
|  | paddw                m1, m0 | 
|  | paddw                m4, m0 | 
|  | paddw                m3, m0 | 
|  | packuswb             m1, m2 | 
|  | packuswb             m3, m4 | 
|  | mova             [dstq], m1 | 
|  | mova     [dstq+strideq], m3 | 
|  | lea                dstq, [dstq+strideq*2] | 
|  | dec                 r2d | 
|  | jg .dconly_loop | 
|  | jmp                tx2q | 
|  | .end: | 
|  | RET | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | %macro LOAD_7ROWS 2 ;src, stride | 
|  | mova                 m0, [%1+%2*0] | 
|  | mova                 m1, [%1+%2*1] | 
|  | mova                 m2, [%1+%2*2] | 
|  | mova                 m3, [%1+%2*3] | 
|  | mova                 m4, [%1+%2*4] | 
|  | mova                 m5, [%1+%2*5] | 
|  | mova                 m6, [%1+%2*6] | 
|  | %endmacro | 
|  |  | 
|  | %macro SAVE_7ROWS 2 ;src, stride | 
|  | mova          [%1+%2*0], m0 | 
|  | mova          [%1+%2*1], m1 | 
|  | mova          [%1+%2*2], m2 | 
|  | mova          [%1+%2*3], m3 | 
|  | mova          [%1+%2*4], m4 | 
|  | mova          [%1+%2*5], m5 | 
|  | mova          [%1+%2*6], m6 | 
|  | %endmacro | 
|  |  | 
|  | %macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3] | 
|  | punpckhwd            m%5, m%4, m%1                ;packed in13 in3 | 
|  | punpcklwd            m%1, m%4                     ;packed in1  in15 | 
|  | punpcklwd            m%4, m%3, m%2                ;packed in9  in7 | 
|  | punpckhwd            m%2, m%3                     ;packed in5  in11 | 
|  | mova                 m%7, [o(pd_2048)] | 
|  | ITX_MUL2X_PACK        %1, %6, %7,  401, 4076, 1    ;low: t8a   high: t15a | 
|  | ITX_MUL2X_PACK        %4, %6, %7, 3166, 2598, 1    ;low: t9a   high: t14a | 
|  | ITX_MUL2X_PACK        %2, %6, %7, 1931, 3612, 1    ;low: t10a  high: t13a | 
|  | ITX_MUL2X_PACK        %5, %6, %7, 3920, 1189, 1    ;low: t11a  high: t12a | 
|  | psubsw               m%6, m%1, m%4                 ;low: t9    high: t14 | 
|  | paddsw               m%1, m%4                      ;low: t8    high: t15 | 
|  | psubsw               m%4, m%5, m%2                 ;low: t10   high: t13 | 
|  | paddsw               m%5, m%2                      ;low: t11   high: t12 | 
|  | mova                 m%2, [o(deint_shuf2)] | 
|  | pshufb               m%6, m%2 | 
|  | pshufb               m%4, m%2 | 
|  | ITX_MUL2X_PACK        %6, %3, %7, 1567, 3784, 1    ;low: t9a   high: t14a | 
|  | ITX_MUL2X_PACK        %4, %3, %7, m3784, 1567, 1   ;low: t10a  high: t13a | 
|  | psubsw               m%3, m%1, m%5                 ;low: t11a  high: t12a | 
|  | paddsw               m%1, m%5                      ;low: t8a   high: t15a | 
|  | psubsw               m%5, m%6, m%4                 ;low: t10   high: t13 | 
|  | paddsw               m%6, m%4                      ;low: t9    high: t14 | 
|  | pshufb               m%3, m%2 | 
|  | pshufb               m%5, m%2 | 
|  | ITX_MUL2X_PACK        %3, %2, %7, 2896, 2896, 4    ;t12,  t11 | 
|  | ITX_MUL2X_PACK        %5, %4, %7, 2896, 2896, 4    ;t13a, t10a | 
|  | packssdw             m%2, m%4                      ;low: t11   high: t10a | 
|  | packssdw             m%3, m%5                      ;low: t12   high: t13a | 
|  | punpckhqdq           m%4, m%1, m%6                 ;low: t15a  high: t14 | 
|  | punpcklqdq           m%1, m%6                      ;low: t8a   high: t9 | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_16X4_FN dct, dct | 
|  | INV_TXFM_16X4_FN dct, adst | 
|  | INV_TXFM_16X4_FN dct, flipadst | 
|  | INV_TXFM_16X4_FN dct, identity | 
|  |  | 
|  | cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_7ROWS        coeffq, 16 | 
|  | call .main | 
|  |  | 
|  | .pass1_end: | 
|  | punpckhwd             m7, m0, m2                 ;packed out1,  out5 | 
|  | punpcklwd             m0, m2                     ;packed out0,  out4 | 
|  | punpcklwd             m2, m1, m3                 ;packed out3,  out7 | 
|  | punpckhwd             m1, m3                     ;packed out2,  out6 | 
|  | mova       [coeffq+16*6], m7 | 
|  | mova                  m7, [coeffq+16*7] | 
|  | punpckhwd             m3, m4, m6                 ;packed out9,  out13 | 
|  | punpcklwd             m4, m6                     ;packed out8,  out12 | 
|  | punpcklwd             m6, m5, m7                 ;packed out11, out15 | 
|  | punpckhwd             m5, m7                     ;packed out10, out14 | 
|  |  | 
|  | .pass1_end2: | 
|  | mova                  m7, [o(pw_16384)] | 
|  | REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 | 
|  | pmulhrsw              m7, [coeffq+16*6] | 
|  | mova       [coeffq+16*6], m7 | 
|  |  | 
|  | .pass1_end3: | 
|  | punpckhwd             m7, m3, m6                 ;packed 9, 11, 13, 15 high | 
|  | punpcklwd             m3, m6                     ;packed 9, 10, 13, 15 low | 
|  | punpckhwd             m6, m4, m5                 ;packed 8, 10, 12, 14 high | 
|  | punpcklwd             m4, m5                     ;packed 8, 10, 12, 14 low | 
|  | punpckhwd             m5, m4, m3                 ;8, 9, 10, 11, 12, 13, 14, 15(1) | 
|  | punpcklwd             m4, m3                     ;8, 9, 10, 11, 12, 13, 14, 15(0) | 
|  | punpckhwd             m3, m6, m7                 ;8, 9, 10, 11, 12, 13, 14, 15(3) | 
|  | punpcklwd             m6, m7                     ;8, 9, 10, 11, 12, 13, 14, 15(2) | 
|  | mova       [coeffq+16*7], m3 | 
|  | mova                  m3, [coeffq+16*6] | 
|  | punpckhwd             m7, m3, m2                 ;packed 1, 3, 5, 7 high | 
|  | punpcklwd             m3, m2                     ;packed 1, 3, 5, 7 low | 
|  | punpckhwd             m2, m0, m1                 ;packed 0, 2, 4, 6 high | 
|  | punpcklwd             m0, m1                     ;packed 0, 2, 4, 6 low | 
|  | punpckhwd             m1, m0, m3                 ;0, 1, 2, 3, 4, 5, 6, 7(1) | 
|  | punpcklwd             m0, m3                     ;0, 1, 2, 3, 4, 5, 6, 7(0) | 
|  | punpckhwd             m3, m2, m7                 ;0, 1, 2, 3, 4, 5, 6, 7(3) | 
|  | punpcklwd             m2, m7                     ;0, 1, 2, 3, 4, 5, 6, 7(2) | 
|  | jmp                 tx2q | 
|  |  | 
|  | .pass2: | 
|  | lea                 tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] | 
|  |  | 
|  | .pass2_end: | 
|  | mova       [coeffq+16*4], m4 | 
|  | mova       [coeffq+16*5], m5 | 
|  | mova       [coeffq+16*6], m6 | 
|  | lea                   r3, [dstq+8] | 
|  | call                tx2q | 
|  |  | 
|  | add               coeffq, 16*4 | 
|  | mova                  m0, [coeffq+16*0] | 
|  | mova                  m1, [coeffq+16*1] | 
|  | mova                  m2, [coeffq+16*2] | 
|  | mova                  m3, [coeffq+16*3] | 
|  | mov                 dstq, r3 | 
|  | jmp                 tx2q | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | punpckhqdq            m7, m0, m1                 ;low:in1  high:in3 | 
|  | punpcklqdq            m0, m1 | 
|  | punpcklqdq            m1, m2, m3 | 
|  | punpckhqdq            m3, m2                     ;low:in7  high:in5 | 
|  | mova       [coeffq+16*4], m7 | 
|  | mova       [coeffq+16*5], m3 | 
|  | mova                  m7, [coeffq+16*7] | 
|  | punpcklqdq            m2, m4, m5 | 
|  | punpckhqdq            m4, m5                     ;low:in9  high:in11 | 
|  | punpcklqdq            m3, m6, m7 | 
|  | punpckhqdq            m7, m6                     ;low:in15 high:in13 | 
|  | mova       [coeffq+16*6], m4 | 
|  | IDCT8_1D_PACKED | 
|  | mova                  m6, [coeffq+16*4] | 
|  | mova                  m4, [coeffq+16*5] | 
|  | mova                  m5, [coeffq+16*6] | 
|  | mova       [coeffq+16*4], m1 | 
|  | mova       [coeffq+16*5], m2 | 
|  | mova       [coeffq+16*6], m3 | 
|  |  | 
|  | IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 | 
|  |  | 
|  | mova                  m1, [coeffq+16*4] | 
|  | psubsw                m3, m0, m7                 ;low:out15 high:out14 | 
|  | paddsw                m0, m7                     ;low:out0  high:out1 | 
|  | psubsw                m7, m1, m5                 ;low:out12 high:out13 | 
|  | paddsw                m1, m5                     ;low:out3  high:out2 | 
|  | mova       [coeffq+16*7], m3 | 
|  | mova                  m2, [coeffq+16*5] | 
|  | mova                  m3, [coeffq+16*6] | 
|  | psubsw                m5, m2, m4                 ;low:out11 high:out10 | 
|  | paddsw                m2, m4                     ;low:out4  high:out5 | 
|  | psubsw                m4, m3, m6                 ;low:out8  high:out9 | 
|  | paddsw                m3, m6                     ;low:out7  high:out6 | 
|  | mova                  m6, m7 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_16X4_FN adst, dct | 
|  | INV_TXFM_16X4_FN adst, adst | 
|  | INV_TXFM_16X4_FN adst, flipadst | 
|  | INV_TXFM_16X4_FN adst, identity | 
|  |  | 
|  | cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_7ROWS        coeffq, 16 | 
|  | call .main | 
|  | call .main_pass1_end | 
|  |  | 
|  | punpckhwd             m6, m7, m0                 ;packed -out11, -out15 | 
|  | punpcklwd             m0, m7                     ;packed   out0,   out4 | 
|  | punpcklwd             m7, m3, m4                 ;packed  -out3,  -out7 | 
|  | punpckhwd             m4, m3                     ;packed   out8,  out12 | 
|  | mova                  m1, [coeffq+16*6] | 
|  | punpcklwd             m3, m1, m5                 ;packed  -out1,  -out5 | 
|  | punpckhwd             m5, m1                     ;packed  out10,  out14 | 
|  | mova                  m1, [coeffq+16*7] | 
|  | mova       [coeffq+16*6], m3 | 
|  | mova       [coeffq+16*7], m7 | 
|  | punpckhwd             m3, m2, m1                 ;packed  -out9,  -out13 | 
|  | punpcklwd             m1, m2                     ;packed   out2,   out6 | 
|  |  | 
|  | mova                  m7, [o(pw_16384)] | 
|  |  | 
|  | .pass1_end: | 
|  | REPX    {pmulhrsw x, m7}, m0, m1, m4, m5 | 
|  | pxor                  m2, m2 | 
|  | psubw                 m2, m7 | 
|  | mova                  m7, [coeffq+16*6] | 
|  | REPX    {pmulhrsw x, m2}, m7, m3, m6 | 
|  | pmulhrsw              m2, [coeffq+16*7] | 
|  | mova       [coeffq+16*6], m7 | 
|  | jmp   m(idct_16x4_internal_8bpc).pass1_end3 | 
|  |  | 
|  | .pass2: | 
|  | lea                 tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] | 
|  | jmp   m(idct_16x4_internal_8bpc).pass2_end | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova       [coeffq+16*6], m0 | 
|  | pshufd                m0, m1, q1032 | 
|  | pshufd                m2, m2, q1032 | 
|  | punpckhwd             m1, m6, m0                 ;packed in13,  in2 | 
|  | punpcklwd             m0, m6                     ;packed  in3, in12 | 
|  | punpckhwd             m7, m5, m2                 ;packed in11,  in4 | 
|  | punpcklwd             m2, m5                     ;packed  in5, in10 | 
|  | mova                  m6, [o(pd_2048)] | 
|  | ITX_MUL2X_PACK         1, 5, 6,  995, 3973       ;low:t2   high:t3 | 
|  | ITX_MUL2X_PACK         7, 5, 6, 1751, 3703       ;low:t4   high:t5 | 
|  | ITX_MUL2X_PACK         2, 5, 6, 3513, 2106       ;low:t10  high:t11 | 
|  | ITX_MUL2X_PACK         0, 5, 6, 3857, 1380       ;low:t12  high:t13 | 
|  | psubsw                m5, m1, m2                 ;low:t10a high:t11a | 
|  | paddsw                m1, m2                     ;low:t2a  high:t3a | 
|  | psubsw                m2, m7, m0                 ;low:t12a high:t13a | 
|  | paddsw                m7, m0                     ;low:t4a  high:t5a | 
|  | punpcklqdq            m0, m5 | 
|  | punpckhwd             m0, m5                     ;packed t10a, t11a | 
|  | punpcklqdq            m5, m2 | 
|  | punpckhwd             m2, m5                     ;packed t13a, t12a | 
|  | ITX_MUL2X_PACK         0, 5, 6, 3406, 2276       ;low:t10  high:t11 | 
|  | ITX_MUL2X_PACK         2, 5, 6, 4017,  799, 1    ;low:t12  high:t13 | 
|  | mova       [coeffq+16*4], m1 | 
|  | mova       [coeffq+16*5], m7 | 
|  | mova                  m1, [coeffq+16*6] | 
|  | mova                  m7, [coeffq+16*7] | 
|  | pshufd                m1, m1, q1032 | 
|  | pshufd                m3, m3, q1032 | 
|  | punpckhwd             m5, m7, m1                 ;packed in15,  in0 | 
|  | punpcklwd             m1, m7                     ;packed  in1, in14 | 
|  | punpckhwd             m7, m4, m3                 ;packed  in9,  in6 | 
|  | punpcklwd             m3, m4                     ;packed  in7,  in8 | 
|  | ITX_MUL2X_PACK         5, 4, 6,  201, 4091       ;low:t0    high:t1 | 
|  | ITX_MUL2X_PACK         7, 4, 6, 2440, 3290       ;low:t6    high:t7 | 
|  | ITX_MUL2X_PACK         3, 4, 6, 3035, 2751       ;low:t8    high:t9 | 
|  | ITX_MUL2X_PACK         1, 4, 6, 4052,  601       ;low:t14   high:t15 | 
|  | psubsw                m4, m5, m3                 ;low:t8a   high:t9a | 
|  | paddsw                m5, m3                     ;low:t0a   high:t1a | 
|  | psubsw                m3, m7, m1                 ;low:t14a  high:t15a | 
|  | paddsw                m7, m1                     ;low:t6a   high:t7a | 
|  | punpcklqdq            m1, m4 | 
|  | punpckhwd             m1, m4                     ;packed  t8a,  t9a | 
|  | punpcklqdq            m4, m3 | 
|  | punpckhwd             m3, m4                     ;packed t15a, t14a | 
|  | ITX_MUL2X_PACK         1, 4, 6,  799, 4017       ;low:t8    high:t9 | 
|  | ITX_MUL2X_PACK         3, 4, 6, 2276, 3406, 1    ;low:t14   high:t15 | 
|  | paddsw                m4, m1, m2                 ;low:t12a  high:t13a | 
|  | psubsw                m1, m2                     ;low:t8a   high:t9a | 
|  | psubsw                m2, m0, m3                 ;low:t14a  high:t15a | 
|  | paddsw                m0, m3                     ;low:t10a  high:t11a | 
|  | punpcklqdq            m3, m1 | 
|  | punpckhwd             m3, m1                     ;packed t12a, t13a | 
|  | punpcklqdq            m1, m2 | 
|  | punpckhwd             m2, m1                     ;packed t15a, t14a | 
|  | ITX_MUL2X_PACK         3, 1, 6, 1567, 3784       ;low:t12   high:t13 | 
|  | ITX_MUL2X_PACK         2, 1, 6, 3784, 1567, 1    ;low:t14   high:t15 | 
|  | psubsw                m1, m3, m2                 ;low:t14a  high:t15a | 
|  | paddsw                m3, m2                     ;low:out2  high:-out13 | 
|  | psubsw                m2, m4, m0                 ;low:t10   high:t11 | 
|  | paddsw                m0, m4                     ;low:-out1 high:out14 | 
|  | mova       [coeffq+16*6], m0 | 
|  | mova       [coeffq+16*7], m3 | 
|  | mova                  m0, [coeffq+16*4] | 
|  | mova                  m3, [coeffq+16*5] | 
|  | psubsw                m4, m5, m3                 ;low:t4    high:t5 | 
|  | paddsw                m5, m3                     ;low:t0    high:t1 | 
|  | psubsw                m3, m0, m7                 ;low:t6    high:t7 | 
|  | paddsw                m0, m7                     ;low:t2    high:t3 | 
|  | punpcklqdq            m7, m4 | 
|  | punpckhwd             m7, m4                     ;packed t4, t5 | 
|  | punpcklqdq            m4, m3 | 
|  | punpckhwd             m3, m4                     ;packed t7, t6 | 
|  | ITX_MUL2X_PACK         7, 4, 6, 1567, 3784       ;low:t4a   high:t5a | 
|  | ITX_MUL2X_PACK         3, 4, 6, 3784, 1567, 1    ;low:t6a   high:t7a | 
|  | psubsw                m4, m5, m0                 ;low:t2a   high:t3a | 
|  | paddsw                m0, m5                     ;low:out0  high:-out15 | 
|  | psubsw                m5, m7, m3                 ;low:t6    high:t7 | 
|  | paddsw                m3, m7                     ;low:-out3 high:out12 | 
|  | ret | 
|  | ALIGN function_align | 
|  | .main_pass1_end: | 
|  | mova                  m7, [o(deint_shuf1)] | 
|  | mova       [coeffq+16*4], m0 | 
|  | mova       [coeffq+16*5], m3 | 
|  | mova                  m0, [o(pw_2896_m2896)] | 
|  | mova                  m3, [o(pw_2896_2896)] | 
|  | pshufb                m1, m7                     ;t14a t15a | 
|  | pshufb                m2, m7                     ;t10  t11 | 
|  | pshufb                m4, m7                     ;t2a  t3a | 
|  | pshufb                m5, m7                     ;t6   t7 | 
|  | pmaddwd               m7, m0, m2 | 
|  | pmaddwd               m2, m3 | 
|  | paddd                 m7, m6 | 
|  | paddd                 m2, m6 | 
|  | psrad                 m7, 12 | 
|  | psrad                 m2, 12 | 
|  | packssdw              m2, m7                     ;low:out6  high:-out9 | 
|  | pmaddwd               m7, m0, m4 | 
|  | pmaddwd               m4, m3 | 
|  | paddd                 m7, m6 | 
|  | paddd                 m4, m6 | 
|  | psrad                 m7, 12 | 
|  | psrad                 m4, 12 | 
|  | packssdw              m4, m7                     ;low:-out7 high:out8 | 
|  | pmaddwd               m7, m3, m5 | 
|  | pmaddwd               m5, m0 | 
|  | paddd                 m7, m6 | 
|  | paddd                 m5, m6 | 
|  | psrad                 m7, 12 | 
|  | psrad                 m5, 12 | 
|  | packssdw              m7, m5                     ;low:out4  high:-out11 | 
|  | pmaddwd               m5, m3, m1 | 
|  | pmaddwd               m1, m0 | 
|  | paddd                 m5, m6 | 
|  | paddd                 m1, m6 | 
|  | psrad                 m5, 12 | 
|  | psrad                 m1, 12 | 
|  | packssdw              m5, m1                     ;low:-out5 high:out10 | 
|  | mova                  m0, [coeffq+16*4] | 
|  | mova                  m3, [coeffq+16*5] | 
|  | ret | 
|  | ALIGN function_align | 
|  | cglobal_label .main_pass2_end | 
|  | mova                  m7, [o(pw_2896x8)] | 
|  | punpckhqdq            m6, m2, m1                 ;low:t11   high:t15a | 
|  | punpcklqdq            m2, m1                     ;low:t10   high:t14a | 
|  | psubsw                m1, m2, m6 | 
|  | paddsw                m2, m6 | 
|  | punpckhqdq            m6, m4, m5                 ;low:t3a   high:t7 | 
|  | punpcklqdq            m4, m5                     ;low:t2a   high:t6 | 
|  | psubsw                m5, m4, m6 | 
|  | paddsw                m4, m6 | 
|  | pmulhrsw              m1, m7                     ;low:-out9 high:out10 | 
|  | pmulhrsw              m2, m7                     ;low:out6  high:-out5 | 
|  | pmulhrsw              m5, m7                     ;low:out8  high:-out11 | 
|  | pmulhrsw              m4, m7                     ;low:-out7 high:out4 | 
|  | punpckhqdq            m7, m4, m5                 ;low:out4  high:-out11 | 
|  | punpcklqdq            m4, m5                     ;low:-out7 high:out8 | 
|  | punpckhqdq            m5, m2, m1                 ;low:-out5 high:out10 | 
|  | punpcklqdq            m2, m1                     ;low:out6  high:-out9 | 
|  | ret | 
|  |  | 
|  |  | 
|  | INV_TXFM_16X4_FN flipadst, dct | 
|  | INV_TXFM_16X4_FN flipadst, adst | 
|  | INV_TXFM_16X4_FN flipadst, flipadst | 
|  | INV_TXFM_16X4_FN flipadst, identity | 
|  |  | 
|  | cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_7ROWS        coeffq, 16 | 
|  | call m(iadst_16x4_internal_8bpc).main | 
|  | call m(iadst_16x4_internal_8bpc).main_pass1_end | 
|  |  | 
|  | punpcklwd             m6, m7, m0                 ;packed  out11,  out15 | 
|  | punpckhwd             m0, m7                     ;packed  -out0,  -out4 | 
|  | punpckhwd             m7, m3, m4                 ;packed   out3,   out7 | 
|  | punpcklwd             m4, m3                     ;packed  -out8, -out12 | 
|  | mova                  m1, [coeffq+16*6] | 
|  | punpckhwd             m3, m1, m5                 ;packed   out1,   out5 | 
|  | punpcklwd             m5, m1                     ;packed -out10, -out14 | 
|  | mova                  m1, [coeffq+16*7] | 
|  | mova       [coeffq+16*6], m3 | 
|  | mova       [coeffq+16*7], m7 | 
|  | punpcklwd             m3, m2, m1                 ;packed   out9,  out13 | 
|  | punpckhwd             m1, m2                     ;packed  -out2,  -out6 | 
|  |  | 
|  | mova                  m7, [o(pw_m16384)] | 
|  | jmp   m(iadst_16x4_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass2: | 
|  | lea                 tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)] | 
|  | jmp   m(idct_16x4_internal_8bpc).pass2_end | 
|  |  | 
|  |  | 
|  | INV_TXFM_16X4_FN identity, dct | 
|  | INV_TXFM_16X4_FN identity, adst | 
|  | INV_TXFM_16X4_FN identity, flipadst | 
|  | INV_TXFM_16X4_FN identity, identity | 
|  |  | 
|  | cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                  m1, [coeffq+16*6] | 
|  | mova                  m0, [coeffq+16*5] | 
|  | mova                  m2, [coeffq+16*7] | 
|  | mova                  m6, [o(pw_1697x16)] | 
|  | mova                  m7, [o(pw_16384)] | 
|  | pmulhrsw              m4, m6, m1 | 
|  | pmulhrsw              m3, m6, m0 | 
|  | pmulhrsw              m5, m6, m2 | 
|  | pmulhrsw              m4, m7 | 
|  | pmulhrsw              m3, m7 | 
|  | pmulhrsw              m5, m7 | 
|  | paddsw                m1, m4 | 
|  | paddsw                m0, m3 | 
|  | paddsw                m5, m2 | 
|  | mova                  m2, [coeffq+16*2] | 
|  | mova                  m3, [coeffq+16*3] | 
|  | mova                  m4, [coeffq+16*4] | 
|  | mova       [coeffq+16*6], m1 | 
|  | mova       [coeffq+16*5], m0 | 
|  | mova       [coeffq+16*7], m5 | 
|  | pmulhrsw              m0, m6, m2 | 
|  | pmulhrsw              m1, m6, m3 | 
|  | pmulhrsw              m5, m6, m4 | 
|  | pmulhrsw              m0, m7 | 
|  | pmulhrsw              m1, m7 | 
|  | pmulhrsw              m5, m7 | 
|  | paddsw                m2, m0 | 
|  | paddsw                m3, m1 | 
|  | paddsw                m4, m5 | 
|  | mova                  m0, [coeffq+16*0] | 
|  | mova                  m1, [coeffq+16*1] | 
|  | pmulhrsw              m5, m6, m0 | 
|  | pmulhrsw              m6, m1 | 
|  | pmulhrsw              m5, m7 | 
|  | pmulhrsw              m6, m7 | 
|  | paddsw                m0, m5 | 
|  | paddsw                m1, m6 | 
|  | mova                  m6, [coeffq+16*6] | 
|  | mova                  m5, [coeffq+16*5] | 
|  | punpckhwd             m7, m0, m2                 ;packed out1,  out5 | 
|  | punpcklwd             m0, m2                     ;packed out0,  out4 | 
|  | punpckhwd             m2, m1, m3                 ;packed out3,  out7 | 
|  | punpcklwd             m1, m3                     ;packed out2,  out6 | 
|  | mova       [coeffq+16*6], m7 | 
|  | mova                  m7, [coeffq+16*7] | 
|  | punpckhwd             m3, m4, m6                 ;packed out9,  out13 | 
|  | punpcklwd             m4, m6                     ;packed out8,  out12 | 
|  | punpckhwd             m6, m5, m7                 ;packed out11, out15 | 
|  | punpcklwd             m5, m7                     ;packed out10, out14 | 
|  | jmp   m(idct_16x4_internal_8bpc).pass1_end3 | 
|  |  | 
|  | .pass2: | 
|  | lea                 tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] | 
|  | jmp   m(idct_16x4_internal_8bpc).pass2_end | 
|  |  | 
|  |  | 
|  | %macro SAVE_8ROWS 2  ;src, stride | 
|  | mova                 [%1+%2*0], m0 | 
|  | mova                 [%1+%2*1], m1 | 
|  | mova                 [%1+%2*2], m2 | 
|  | mova                 [%1+%2*3], m3 | 
|  | mova                 [%1+%2*4], m4 | 
|  | mova                 [%1+%2*5], m5 | 
|  | mova                 [%1+%2*6], m6 | 
|  | mova                 [%1+%2*7], m7 | 
|  | %endmacro | 
|  |  | 
|  | %macro INV_TXFM_8X16_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 8x16, 8, 16*16 | 
|  | %ifidn %1_%2, dct_dct | 
|  | pshuflw              m0, [coeffq], q0000 | 
|  | punpcklwd            m0, m0 | 
|  | mova                 m1, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m1 | 
|  | mova                 m2, [o(pw_16384)] | 
|  | mov            [coeffq], eobd | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, m2 | 
|  | psrlw                m2, 3              ; pw_2048 | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, m2 | 
|  | mov                 r3d, 4 | 
|  | lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] | 
|  | jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop | 
|  | .end: | 
|  | RET | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_8X16_FN dct, dct | 
|  | INV_TXFM_8X16_FN dct, adst | 
|  | INV_TXFM_8X16_FN dct, flipadst | 
|  | INV_TXFM_8X16_FN dct, identity | 
|  |  | 
|  | cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | lea                    r3, [o(m(idct_8x8_internal_8bpc).pass1)] | 
|  |  | 
|  | .pass1: | 
|  | LOAD_8ROWS    coeffq+16*1, 32, 1 | 
|  | mov   [rsp+gprsize+16*11], tx2q | 
|  | lea                  tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] | 
|  | jmp                    r3 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS    coeffq+16*1, 32 | 
|  | LOAD_8ROWS    coeffq+16*0, 32, 1 | 
|  | mov                  tx2q, [rsp+gprsize+16*11] | 
|  | jmp                    r3 | 
|  |  | 
|  | .pass2: | 
|  | lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end)] | 
|  |  | 
|  | .pass2_pre: | 
|  | mova       [coeffq+16*2 ], m1 | 
|  | mova       [coeffq+16*6 ], m3 | 
|  | mova       [coeffq+16*10], m5 | 
|  | mova       [coeffq+16*14], m7 | 
|  | mova                   m1, m2 | 
|  | mova                   m2, m4 | 
|  | mova                   m3, m6 | 
|  | mova                   m4, [coeffq+16*1 ] | 
|  | mova                   m5, [coeffq+16*5 ] | 
|  | mova                   m6, [coeffq+16*9 ] | 
|  | mova                   m7, [coeffq+16*13] | 
|  |  | 
|  | .pass2_main: | 
|  | call m(idct_8x8_internal_8bpc).main | 
|  |  | 
|  | SAVE_7ROWS   rsp+gprsize+16*3, 16 | 
|  | mova                   m0, [coeffq+16*2 ] | 
|  | mova                   m1, [coeffq+16*6 ] | 
|  | mova                   m2, [coeffq+16*10] | 
|  | mova                   m3, [coeffq+16*14] | 
|  | mova                   m4, [coeffq+16*3 ] | 
|  | mova                   m5, [coeffq+16*7 ] | 
|  | mova                   m6, [coeffq+16*11] | 
|  | mova                   m7, [coeffq+16*15] | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  |  | 
|  | mov                    r3, dstq | 
|  | lea                  dstq, [dstq+strideq*8] | 
|  | jmp  m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS   rsp+gprsize+16*3, 16 | 
|  | mova   [rsp+gprsize+16*0], m7 | 
|  | lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | mov                  dstq, r3 | 
|  | jmp  m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end1: | 
|  | pxor                   m7, m7 | 
|  | REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_8X16_FN adst, dct | 
|  | INV_TXFM_8X16_FN adst, adst | 
|  | INV_TXFM_8X16_FN adst, flipadst | 
|  | INV_TXFM_8X16_FN adst, identity | 
|  |  | 
|  | cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | lea                    r3, [o(m(iadst_8x8_internal_8bpc).pass1)] | 
|  | jmp  m(idct_8x16_internal_8bpc).pass1 | 
|  |  | 
|  | .pass2: | 
|  | lea                  tx2q, [o(m(iadst_8x16_internal_8bpc).end)] | 
|  |  | 
|  | .pass2_pre: | 
|  | mova    [rsp+gprsize+16*7], m0 | 
|  | mova    [rsp+gprsize+16*8], m1 | 
|  | mova    [rsp+gprsize+16*5], m6 | 
|  | mova    [rsp+gprsize+16*6], m7 | 
|  | mova                    m0, m2 | 
|  | mova                    m1, m3 | 
|  | mova                    m2, m4 | 
|  | mova                    m3, m5 | 
|  |  | 
|  | .pass2_main: | 
|  | mova                    m4, [coeffq+16*1 ] | 
|  | mova                    m5, [coeffq+16*3 ] | 
|  | mova                    m6, [coeffq+16*13] | 
|  | mova                    m7, [coeffq+16*15] | 
|  | mova    [rsp+gprsize+16*3], m4 | 
|  | mova    [rsp+gprsize+16*4], m5 | 
|  | mova    [rsp+gprsize+16*9], m6 | 
|  | mova    [rsp+gprsize+32*5], m7 | 
|  | mova                    m4, [coeffq+16*5 ] | 
|  | mova                    m5, [coeffq+16*7 ] | 
|  | mova                    m6, [coeffq+16*9 ] | 
|  | mova                    m7, [coeffq+16*11] | 
|  |  | 
|  | call m(iadst_16x8_internal_8bpc).main | 
|  | call m(iadst_16x8_internal_8bpc).main_pass2_end | 
|  |  | 
|  | mov                    r3, dstq | 
|  | lea                  dstq, [dstq+strideq*8] | 
|  | jmp m(iadst_8x8_internal_8bpc).end | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS   rsp+gprsize+16*3, 16 | 
|  | mova   [rsp+gprsize+16*0], m7 | 
|  | lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | mov                  dstq, r3 | 
|  | jmp  m(iadst_8x8_internal_8bpc).end | 
|  |  | 
|  |  | 
|  | INV_TXFM_8X16_FN flipadst, dct | 
|  | INV_TXFM_8X16_FN flipadst, adst | 
|  | INV_TXFM_8X16_FN flipadst, flipadst | 
|  | INV_TXFM_8X16_FN flipadst, identity | 
|  |  | 
|  | cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | lea                    r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] | 
|  | jmp  m(idct_8x16_internal_8bpc).pass1 | 
|  |  | 
|  | .pass2: | 
|  | lea                   tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] | 
|  | lea                     r3, [dstq+strideq*8] | 
|  |  | 
|  | .pass2_pre: | 
|  | mova    [rsp+gprsize+16*7], m0 | 
|  | mova    [rsp+gprsize+16*8], m1 | 
|  | mova    [rsp+gprsize+16*5], m6 | 
|  | mova    [rsp+gprsize+16*6], m7 | 
|  | mova                    m0, m2 | 
|  | mova                    m1, m3 | 
|  | mova                    m2, m4 | 
|  | mova                    m3, m5 | 
|  |  | 
|  | .pass2_main: | 
|  | mova                    m4, [coeffq+16*1 ] | 
|  | mova                    m5, [coeffq+16*3 ] | 
|  | mova                    m6, [coeffq+16*13] | 
|  | mova                    m7, [coeffq+16*15] | 
|  | mova    [rsp+gprsize+16*3], m4 | 
|  | mova    [rsp+gprsize+16*4], m5 | 
|  | mova    [rsp+gprsize+16*9], m6 | 
|  | mova    [rsp+gprsize+32*5], m7 | 
|  | mova                    m4, [coeffq+16*5 ] | 
|  | mova                    m5, [coeffq+16*7 ] | 
|  | mova                    m6, [coeffq+16*9 ] | 
|  | mova                    m7, [coeffq+16*11] | 
|  |  | 
|  | call m(iadst_16x8_internal_8bpc).main | 
|  | call m(iadst_16x8_internal_8bpc).main_pass2_end | 
|  | jmp  m(iflipadst_8x8_internal_8bpc).end | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | mov                   dstq, r3 | 
|  | jmp  m(iflipadst_8x8_internal_8bpc).end | 
|  |  | 
|  |  | 
|  | INV_TXFM_8X16_FN identity, dct | 
|  | INV_TXFM_8X16_FN identity, adst | 
|  | INV_TXFM_8X16_FN identity, flipadst | 
|  | INV_TXFM_8X16_FN identity, identity | 
|  |  | 
|  | cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS    coeffq+16*1, 32, 1 | 
|  | mov                    r3, tx2q | 
|  | lea                  tx2q, [o(.pass1_end)] | 
|  | mova   [rsp+gprsize+16*1], m6 | 
|  | jmp  m(idct_8x8_internal_8bpc).pass1_end3 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS    coeffq+16*1, 32 | 
|  | LOAD_8ROWS    coeffq+16*0, 32, 1 | 
|  | mov                  tx2q, r3 | 
|  | mova   [rsp+gprsize+16*1], m6 | 
|  | jmp  m(idct_8x8_internal_8bpc).pass1_end3 | 
|  |  | 
|  | .pass2: | 
|  | lea                  tx2q, [o(.end1)] | 
|  |  | 
|  | .end: | 
|  | mova   [rsp+gprsize+16*0], m7 | 
|  | mova   [rsp+gprsize+16*1], m6 | 
|  | mova                   m7, [o(pw_1697x16)] | 
|  | REPX     {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 | 
|  | mova                   m6, [rsp+gprsize+16*1] | 
|  | mova   [rsp+gprsize+16*2], m5 | 
|  | IDTX16                  6, 5, 7 | 
|  | mova                   m5, [rsp+gprsize+16*0] | 
|  | IDTX16                  5, 7, 7 | 
|  | mova                   m7, [o(pw_2048)] | 
|  | REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 | 
|  | pmulhrsw               m7, [rsp+gprsize+16*2] | 
|  | mova   [rsp+gprsize+16*0], m5 | 
|  | mova   [rsp+gprsize+16*1], m6 | 
|  | mova   [rsp+gprsize+16*2], m7 | 
|  | jmp  m(idct_8x8_internal_8bpc).end3 | 
|  |  | 
|  | .end1: | 
|  | LOAD_8ROWS    coeffq+16*1, 32 | 
|  | lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | lea                  dstq, [dstq+strideq*2] | 
|  | jmp .end | 
|  |  | 
|  |  | 
|  | %macro INV_TXFM_16X8_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 16x8, 8, 16*16 | 
|  | %ifidn %1_%2, dct_dct | 
|  | movd                 m1, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m1, [coeffq] | 
|  | movd                 m2, [o(pw_16384)] | 
|  | mov            [coeffq], eobd | 
|  | pmulhrsw             m0, m1 | 
|  | mov                 r2d, 4 | 
|  | lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)] | 
|  | jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly | 
|  | .end: | 
|  | RET | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_16X8_FN dct, dct | 
|  | INV_TXFM_16X8_FN dct, adst | 
|  | INV_TXFM_16X8_FN dct, flipadst | 
|  | INV_TXFM_16X8_FN dct, identity | 
|  |  | 
|  | cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS    coeffq+16*0, 32, 1 | 
|  | call m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS   rsp+gprsize+16*3, 16 | 
|  |  | 
|  | LOAD_8ROWS    coeffq+16*1, 32, 1 | 
|  | call  .main | 
|  | mov                    r3, tx2q | 
|  | lea                  tx2q, [o(.pass1_end)] | 
|  | jmp  m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS    coeffq+16*1, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*3, 16 | 
|  | mova   [rsp+gprsize+16*0], m7 | 
|  | mov                  tx2q, r3 | 
|  | jmp  m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass2: | 
|  | lea                  tx2q, [o(.end)] | 
|  | lea                    r3, [dstq+8] | 
|  | jmp  m(idct_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS    coeffq+16*1, 32 | 
|  | lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | mov                  dstq, r3 | 
|  | jmp  m(idct_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova [rsp+gprsize*2+16*1], m2 | 
|  | mova [rsp+gprsize*2+16*2], m6 | 
|  | mova [rsp+gprsize*2+32*5], m5 | 
|  |  | 
|  | mova                   m6, [o(pd_2048)] | 
|  | ITX_MULSUB_2W           0, 7, 2, 5, 6,  401, 4076   ;t8a, t15a | 
|  | ITX_MULSUB_2W           4, 3, 2, 5, 6, 3166, 2598   ;t9a, t14a | 
|  | psubsw                 m2, m0, m4                   ;t9 | 
|  | paddsw                 m0, m4                       ;t8 | 
|  | psubsw                 m4, m7, m3                   ;t14 | 
|  | paddsw                 m7, m3                       ;t15 | 
|  | ITX_MULSUB_2W           4, 2, 3, 5, 6, 1567, 3784   ;t9a, t14a | 
|  | mova                   m3, [rsp+gprsize*2+16*1] | 
|  | mova                   m5, [rsp+gprsize*2+32*5] | 
|  | mova [rsp+gprsize*2+16*1], m2 | 
|  | mova [rsp+gprsize*2+32*5], m4 | 
|  | mova                   m2, [rsp+gprsize*2+16*2] | 
|  | mova [rsp+gprsize*2+16*2], m7 | 
|  | ITX_MULSUB_2W           3, 5, 7, 4, 6, 1931, 3612   ;t10a, t13a | 
|  | ITX_MULSUB_2W           2, 1, 7, 4, 6, 3920, 1189   ;t11a, t12a | 
|  | psubsw                 m4, m2, m3                   ;t10 | 
|  | paddsw                 m2, m3                       ;t11 | 
|  | psubsw                 m3, m1, m5                   ;t13 | 
|  | paddsw                 m1, m5                       ;t12 | 
|  | ITX_MULSUB_2W           3, 4, 7, 5, 6, m3784, 1567  ;t10a, t13a | 
|  | mova                   m7, [rsp+gprsize*2+32*5] | 
|  | psubsw                 m6, m0, m2                   ;t11a | 
|  | paddsw                 m0, m2                       ;t8a | 
|  | paddsw                 m2, m7, m3                   ;t9 | 
|  | psubsw                 m7, m3                       ;t10 | 
|  | mova                   m5, [rsp+gprsize*2+16*0] | 
|  | psubsw                 m3, m5, m0                   ;out8 | 
|  | paddsw                 m0, m5                       ;out7 | 
|  | mova [rsp+gprsize*2+32*5], m0 | 
|  | mova                   m5, [rsp+gprsize*2+16*9] | 
|  | psubsw                 m0, m5, m2                   ;out9 | 
|  | paddsw                 m2, m5                       ;out6 | 
|  | mova [rsp+gprsize*2+16*0], m0 | 
|  | mova [rsp+gprsize*2+16*9], m2 | 
|  | mova                   m0, [rsp+gprsize*2+16*1] | 
|  | mova                   m2, [rsp+gprsize*2+16*2] | 
|  | mova [rsp+gprsize*2+16*1], m3 | 
|  | psubsw                 m5, m0, m4                   ;t13 | 
|  | paddsw                 m0, m4                       ;t14 | 
|  | mova                   m3, [o(pd_2048)] | 
|  | psubsw                 m4, m2, m1                   ;t12a | 
|  | paddsw                 m1, m2                       ;t15a | 
|  | mova [rsp+gprsize*2+16*2], m1 | 
|  | ITX_MULSUB_2W           5, 7, 1, 2, 3, 2896, 2896   ;t10a, t13a | 
|  | ITX_MULSUB_2W           4, 6, 1, 2, 3, 2896, 2896   ;t11,  t12 | 
|  | mova                   m3, [rsp+gprsize*2+16*8] | 
|  | psubsw                 m2, m3, m5                   ;out10 | 
|  | paddsw                 m3, m5                       ;out5 | 
|  | mova                   m5, [rsp+gprsize*2+16*7] | 
|  | mova [rsp+gprsize*2+16*8], m3 | 
|  | psubsw                 m3, m5, m4                   ;out11 | 
|  | paddsw                 m5, m4                       ;out4 | 
|  | mova                   m4, [rsp+gprsize*2+16*6] | 
|  | mova [rsp+gprsize*2+16*7], m5 | 
|  | paddsw                 m5, m4, m6                   ;out3 | 
|  | psubsw                 m4, m6                       ;out12 | 
|  | mova                   m6, [rsp+gprsize*2+16*5] | 
|  | mova [rsp+gprsize*2+16*6], m5 | 
|  | psubsw                 m5, m6, m7                   ;out13 | 
|  | paddsw                 m6, m7                       ;out2 | 
|  | mova                   m7, [rsp+gprsize*2+16*4] | 
|  | mova [rsp+gprsize*2+16*5], m6 | 
|  | psubsw                 m6, m7, m0                   ;out14 | 
|  | paddsw                 m7, m0                       ;out1 | 
|  | mova                   m1, [rsp+gprsize*2+16*2] | 
|  | mova                   m0, [rsp+gprsize*2+16*3] | 
|  | mova [rsp+gprsize*2+16*4], m7 | 
|  | psubsw                 m7, m0, m1                   ;out15 | 
|  | paddsw                 m0, m1                       ;out0 | 
|  | mova [rsp+gprsize*2+16*3], m0 | 
|  | mova                   m1, [rsp+gprsize*2+16*0] | 
|  | mova                   m0, [rsp+gprsize*2+16*1] | 
|  | mova [rsp+gprsize*2+16*0], m7 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_16X8_FN adst, dct | 
|  | INV_TXFM_16X8_FN adst, adst | 
|  | INV_TXFM_16X8_FN adst, flipadst | 
|  | INV_TXFM_16X8_FN adst, identity | 
|  |  | 
|  | cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                    m7, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m7, [coeffq+16*0 ] | 
|  | pmulhrsw                m1, m7, [coeffq+16*1 ] | 
|  | pmulhrsw                m2, m7, [coeffq+16*14] | 
|  | pmulhrsw                m3, m7, [coeffq+16*15] | 
|  | mova    [rsp+gprsize+16*7], m0 | 
|  | mova    [rsp+gprsize+16*8], m1 | 
|  | mova    [rsp+gprsize+16*9], m2 | 
|  | mova    [rsp+gprsize+32*5], m3 | 
|  | pmulhrsw                m0, m7, [coeffq+16*6 ] | 
|  | pmulhrsw                m1, m7, [coeffq+16*7 ] | 
|  | pmulhrsw                m2, m7, [coeffq+16*8 ] | 
|  | pmulhrsw                m3, m7, [coeffq+16*9 ] | 
|  | mova    [rsp+gprsize+16*3], m2 | 
|  | mova    [rsp+gprsize+16*4], m3 | 
|  | mova    [rsp+gprsize+16*5], m0 | 
|  | mova    [rsp+gprsize+16*6], m1 | 
|  | pmulhrsw                m0, m7, [coeffq+16*2 ] | 
|  | pmulhrsw                m1, m7, [coeffq+16*3 ] | 
|  | pmulhrsw                m2, m7, [coeffq+16*4 ] | 
|  | pmulhrsw                m3, m7, [coeffq+16*5 ] | 
|  | pmulhrsw                m4, m7, [coeffq+16*10] | 
|  | pmulhrsw                m5, m7, [coeffq+16*11] | 
|  | pmulhrsw                m6, m7, [coeffq+16*12] | 
|  | pmulhrsw                m7,     [coeffq+16*13] | 
|  |  | 
|  | call .main | 
|  | call .main_pass1_end | 
|  | mov                    r3, tx2q | 
|  | lea                  tx2q, [o(.pass1_end)] | 
|  | jmp m(iadst_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS    coeffq+16*1, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*3, 16 | 
|  | mova   [rsp+gprsize+16*0], m7 | 
|  | mov                  tx2q, r3 | 
|  | jmp m(iadst_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass2: | 
|  | lea                  tx2q, [o(.end)] | 
|  | lea                    r3, [dstq+8] | 
|  | jmp m(iadst_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS    coeffq+16*1, 32 | 
|  | lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | mov                  dstq, r3 | 
|  | jmp m(iadst_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova  [rsp+gprsize*2+16*0], m1 | 
|  | mova  [rsp+gprsize*2+16*1], m2 | 
|  | mova  [rsp+gprsize*2+16*2], m6 | 
|  |  | 
|  | mova                    m6, [o(pd_2048)] | 
|  | ITX_MULSUB_2W            7, 0, 1, 2, 6,  995, 3973   ;t3,  t2 | 
|  | ITX_MULSUB_2W            3, 4, 1, 2, 6, 3513, 2106   ;t11, t10 | 
|  | psubsw                  m1, m0, m4                   ;t10a | 
|  | paddsw                  m0, m4                       ;t2a | 
|  | psubsw                  m4, m7, m3                   ;t11a | 
|  | paddsw                  m3, m7                       ;t3a | 
|  | ITX_MULSUB_2W            1, 4, 7, 2, 6, 3406, 2276   ;t11, t10 | 
|  | mova                    m2, [rsp+gprsize*2+16*0]     ;in3 | 
|  | mova                    m7, [rsp+gprsize*2+16*1]     ;in4 | 
|  | mova  [rsp+gprsize*2+16*0], m1                       ;t11 | 
|  | mova  [rsp+gprsize*2+16*1], m4                       ;t10 | 
|  | mova                    m1, [rsp+gprsize*2+16*2]     ;in12 | 
|  | mova  [rsp+gprsize*2+16*2], m0                       ;t2a | 
|  | ITX_MULSUB_2W            5, 7, 0, 4, 6, 1751, 3703   ;t5,  t4 | 
|  | ITX_MULSUB_2W            2, 1, 0, 4, 6, 3857, 1380   ;t13, t12 | 
|  | psubsw                  m0, m7, m1                   ;t12a | 
|  | paddsw                  m1, m7                       ;t4a | 
|  | psubsw                  m4, m5, m2                   ;t13a | 
|  | paddsw                  m5, m2                       ;t5a | 
|  | ITX_MULSUB_2W            4, 0, 7, 2, 6, 4017,  799   ;t12, t13 | 
|  | mova                    m2, [rsp+gprsize*2+16*8]     ;in1 | 
|  | mova                    m7, [rsp+gprsize*2+16*9]     ;in14 | 
|  | mova  [rsp+gprsize*2+16*8], m4                       ;t12 | 
|  | mova  [rsp+gprsize*2+16*9], m0                       ;t13 | 
|  | mova                    m4, [rsp+gprsize*2+16*4]     ;in9 | 
|  | mova                    m0, [rsp+gprsize*2+16*5]     ;in6 | 
|  | mova  [rsp+gprsize*2+16*4], m1                       ;t4a | 
|  | mova  [rsp+gprsize*2+16*5], m5                       ;t5a | 
|  | ITX_MULSUB_2W            2, 7, 1, 5, 6, 4052,  601   ;t15, t14 | 
|  | ITX_MULSUB_2W            4, 0, 1, 5, 6, 2440, 3290   ;t7,  t6 | 
|  | psubsw                  m1, m0, m7                   ;t14a | 
|  | paddsw                  m0, m7                       ;t6a | 
|  | psubsw                  m5, m4, m2                   ;t15a | 
|  | paddsw                  m4, m2                       ;t7a | 
|  | ITX_MULSUB_2W            5, 1, 7, 2, 6, 2276, 3406   ;t14, t15 | 
|  | mova                    m2, [rsp+gprsize*2+16*2]     ;t2a | 
|  | mova  [rsp+gprsize*2+16*2], m5                       ;t14 | 
|  | psubsw                  m7, m2, m0                   ;t6 | 
|  | paddsw                  m2, m0                       ;t2 | 
|  | psubsw                  m0, m3, m4                   ;t7 | 
|  | paddsw                  m3, m4                       ;t3 | 
|  | ITX_MULSUB_2W            0, 7, 4, 5, 6, 3784, 1567   ;t6a, t7a | 
|  | mova                    m4, [rsp+gprsize*2+16*7]     ;in0 | 
|  | mova                    m5, [rsp+gprsize*2+32*5]     ;in15 | 
|  | mova  [rsp+gprsize*2+16*7], m3                       ;t3 | 
|  | mova  [rsp+gprsize*2+32*5], m1                       ;t15 | 
|  | mova                    m1, [rsp+gprsize*2+16*6]     ;in7 | 
|  | mova                    m3, [rsp+gprsize*2+16*3]     ;in8 | 
|  | mova  [rsp+gprsize*2+16*6], m7                       ;t7a | 
|  | mova  [rsp+gprsize*2+16*3], m0                       ;t6a | 
|  | ITX_MULSUB_2W            5, 4, 0, 7, 6,  201, 4091   ;t1,  t0 | 
|  | ITX_MULSUB_2W            1, 3, 0, 7, 6, 3035, 2751   ;t9,  t8 | 
|  | psubsw                  m0, m4, m3                   ;t8a | 
|  | paddsw                  m4, m3                       ;t0a | 
|  | psubsw                  m3, m5, m1                   ;t9a | 
|  | paddsw                  m5, m1                       ;t1a | 
|  | ITX_MULSUB_2W            0, 3, 1, 7, 6,  799, 4017   ;t9,  t8 | 
|  | mova                    m1, [rsp+gprsize*2+16*4]     ;t4a | 
|  | mova                    m7, [rsp+gprsize*2+16*5]     ;t5a | 
|  | mova  [rsp+gprsize*2+16*4], m3                       ;t8 | 
|  | mova  [rsp+gprsize*2+16*5], m0                       ;t9 | 
|  | psubsw                  m0, m4, m1                   ;t4 | 
|  | paddsw                  m4, m1                       ;t0 | 
|  | psubsw                  m3, m5, m7                   ;t5 | 
|  | paddsw                  m5, m7                       ;t1 | 
|  | ITX_MULSUB_2W            0, 3, 1, 7, 6, 1567, 3784   ;t5a, t4a | 
|  | mova                    m7, [rsp+gprsize*2+16*3]     ;t6a | 
|  | psubsw                  m1, m4, m2                   ;t2a | 
|  | paddsw                  m4, m2                       ;out0 | 
|  | mova  [rsp+gprsize*2+16*3], m4                       ;out0 | 
|  | mova                    m4, [rsp+gprsize*2+16*6]     ;t7a | 
|  | psubsw                  m2, m3, m7                   ;t6 | 
|  | paddsw                  m3, m7                       ;-out3 | 
|  | mova  [rsp+gprsize*2+16*6], m3                       ;-out3 | 
|  | psubsw                  m3, m0, m4                   ;t7 | 
|  | paddsw                  m0, m4                       ;out12 | 
|  | mova [rsp+gprsize*2+16*12], m3 | 
|  | mova                    m3, [rsp+gprsize*2+16*7]     ;t3 | 
|  | mova [rsp+gprsize*2+16* 7], m2                       ;out4 | 
|  | psubsw                  m2, m5, m3                   ;t3a | 
|  | paddsw                  m5, m3                       ;-out15 | 
|  | mova [rsp+gprsize*2+16*11], m2 | 
|  | mova                    m2, [rsp+gprsize*2+32*5]     ;t15 | 
|  | mova [rsp+gprsize*2+16*10], m1                       ;-out7 | 
|  | mova                    m1, [rsp+gprsize*2+16*0]     ;t11 | 
|  | mova [rsp+gprsize*2+16*0 ], m5                       ;-out15 | 
|  | mova                    m3, [rsp+gprsize*2+16*1]     ;t10 | 
|  | mova [rsp+gprsize*2+16*1 ], m4                       ;-out11 | 
|  | mova                    m4, [rsp+gprsize*2+16*2]     ;t14 | 
|  | mova [rsp+gprsize*2+16*2 ], m0                       ;out12 | 
|  | psubsw                  m0, m3, m4                   ;t14a | 
|  | paddsw                  m3, m4                       ;t10a | 
|  | psubsw                  m5, m1, m2                   ;t15a | 
|  | paddsw                  m1, m2                       ;t11a | 
|  | ITX_MULSUB_2W            5, 0, 2, 4, 6, 3784, 1567   ;t14, t15 | 
|  | mova                    m2, [rsp+gprsize*2+16*4]     ;t8 | 
|  | mova                    m4, [rsp+gprsize*2+16*5]     ;t9 | 
|  | mova  [rsp+gprsize*2+16*4], m3                       ;t10a | 
|  | mova  [rsp+gprsize*2+16*5], m1                       ;t11a | 
|  | mova                    m3, [rsp+gprsize*2+16*8]     ;t12 | 
|  | mova                    m1, [rsp+gprsize*2+16*9]     ;t13 | 
|  | mova  [rsp+gprsize*2+16*8], m5                       ;t14 | 
|  | mova  [rsp+gprsize*2+16*9], m0                       ;t15 | 
|  | psubsw                  m5, m2, m3                   ;t12a | 
|  | paddsw                  m2, m3                       ;t8a | 
|  | psubsw                  m0, m4, m1                   ;t13a | 
|  | paddsw                  m4, m1                       ;t9a | 
|  | ITX_MULSUB_2W            5, 0, 1, 3, 6, 1567, 3784   ;t13, t12 | 
|  | mova                    m6, [rsp+gprsize*2+16*4]     ;t10a | 
|  | mova                    m1, [rsp+gprsize*2+16*5]     ;t11a | 
|  | psubsw                  m3, m2, m6                   ;t10 | 
|  | paddsw                  m2, m6                       ;-out1 | 
|  | paddsw                  m6, m4, m1                   ;out14 | 
|  | psubsw                  m4, m1                       ;t11 | 
|  | mova [rsp+gprsize*2+16*14], m4 | 
|  | mova [rsp+gprsize*2+16* 4], m2                       ;-out1 | 
|  | mova                    m4, [rsp+gprsize*2+16*8]     ;t14 | 
|  | mova                    m2, [rsp+gprsize*2+16*9]     ;t15 | 
|  | mova [rsp+gprsize*2+16* 9], m3                       ;out6 | 
|  | psubsw                  m3, m0, m4                   ;t14a | 
|  | paddsw                  m0, m4                       ;out2 | 
|  | psubsw                  m4, m5, m2                   ;t15a | 
|  | paddsw                  m5, m2                       ;-out13 | 
|  | mova [rsp+gprsize*2+16* 5], m0                       ;out2 | 
|  | ret | 
|  | ALIGN function_align | 
|  | .main_pass1_end: | 
|  | mova                    m0, [rsp+gprsize*2+16*14] | 
|  | mova [rsp+gprsize*2+16*14], m5 | 
|  | mova [rsp+gprsize*2+16*15], m6 | 
|  | mova                    m5, [o(pw_2896_2896)] | 
|  | mova                    m6, [o(pw_2896_m2896)] | 
|  | mova                    m7, [o(pd_2048)] | 
|  | punpcklwd               m2, m3, m4 | 
|  | punpckhwd               m3, m4 | 
|  | pmaddwd                 m4, m5, m2 | 
|  | pmaddwd                 m2, m6 | 
|  | pmaddwd                 m1, m5, m3 | 
|  | pmaddwd                 m3, m6 | 
|  | REPX         {paddd x, m7}, m4, m2, m1, m3 | 
|  | REPX         {psrad x, 12}, m4, m1, m2, m3 | 
|  | packssdw                m4, m1                       ;-out5 | 
|  | packssdw                m2, m3                       ;out10 | 
|  | mova [rsp+gprsize*2+16* 8], m4 | 
|  | mova                    m3, [rsp+gprsize*2+16* 9] | 
|  | punpcklwd               m1, m3, m0 | 
|  | punpckhwd               m3, m0 | 
|  | pmaddwd                 m0, m5, m1 | 
|  | pmaddwd                 m1, m6 | 
|  | pmaddwd                 m4, m5, m3 | 
|  | pmaddwd                 m3, m6 | 
|  | REPX         {paddd x, m7}, m0, m1, m4, m3 | 
|  | REPX         {psrad x, 12}, m0, m4, m1, m3 | 
|  | packssdw                m0, m4                       ;out6 | 
|  | packssdw                m1, m3                       ;-out9 | 
|  | mova [rsp+gprsize*2+16* 9], m0 | 
|  | mova                    m0, [rsp+gprsize*2+16* 7] | 
|  | mova                    m4, [rsp+gprsize*2+16*12] | 
|  | punpcklwd               m3, m0, m4 | 
|  | punpckhwd               m0, m4 | 
|  | pmaddwd                 m4, m5, m3 | 
|  | pmaddwd                 m3, m6 | 
|  | pmaddwd                 m5, m0 | 
|  | pmaddwd                 m0, m6 | 
|  | REPX         {paddd x, m7}, m4, m3, m5, m0 | 
|  | REPX         {psrad x, 12}, m4, m5, m3, m0 | 
|  | packssdw                m4, m5                       ;out4 | 
|  | packssdw                m3, m0                       ;-out11 | 
|  | mova [rsp+gprsize*2+16* 7], m4 | 
|  | mova                    m4, [rsp+gprsize*2+16*10] | 
|  | mova                    m5, [rsp+gprsize*2+16*11] | 
|  | punpcklwd               m0, m4, m5 | 
|  | punpckhwd               m4, m5 | 
|  | pmaddwd                 m5, m0, [o(pw_2896_2896)] | 
|  | pmaddwd                 m0, m6 | 
|  | pmaddwd                 m6, m4 | 
|  | pmaddwd                 m4, [o(pw_2896_2896)] | 
|  | REPX         {paddd x, m7}, m5, m0, m6, m4 | 
|  | REPX         {psrad x, 12}, m0, m6, m5, m4 | 
|  | packssdw                m0, m6                       ;out8 | 
|  | packssdw                m5, m4                       ;-out7 | 
|  | mova [rsp+gprsize*2+16*10], m5 | 
|  | mova                    m4, [rsp+gprsize*2+16* 2]    ;out12 | 
|  | mova                    m5, [rsp+gprsize*2+16*14]    ;-out13 | 
|  | mova                    m6, [rsp+gprsize*2+16*15]    ;out14 | 
|  | ret | 
|  | ALIGN function_align | 
|  | cglobal_label .main_pass2_end | 
|  | mova                    m7, [o(pw_2896x8)] | 
|  | mova                    m1, [rsp+gprsize*2+16* 9] | 
|  | mova                    m2, [rsp+gprsize*2+16*14] | 
|  | paddsw                  m0, m1, m2 | 
|  | psubsw                  m1, m2 | 
|  | pmulhrsw                m0, m7                       ;out6 | 
|  | pmulhrsw                m1, m7                       ;-out9 | 
|  | mova [rsp+gprsize*2+16* 9], m0 | 
|  | psubsw                  m2, m3, m4 | 
|  | paddsw                  m3, m4 | 
|  | pmulhrsw                m2, m7                       ;out10 | 
|  | pmulhrsw                m3, m7                       ;-out5 | 
|  | mova [rsp+gprsize*2+16* 8], m3 | 
|  | mova                    m3, [rsp+gprsize*2+16* 7] | 
|  | mova                    m4, [rsp+gprsize*2+16*12] | 
|  | paddsw                  m0, m3, m4 | 
|  | psubsw                  m3, m4 | 
|  | pmulhrsw                m0, m7                       ;out4 | 
|  | pmulhrsw                m3, m7                       ;-out11 | 
|  | mova [rsp+gprsize*2+16* 7], m0 | 
|  | mova                    m0, [rsp+gprsize*2+16*10] | 
|  | paddsw                  m4, m0, [rsp+gprsize*2+16*11] | 
|  | psubsw                  m0, [rsp+gprsize*2+16*11] | 
|  | pmulhrsw                m4, m7                       ;-out7 | 
|  | pmulhrsw                m0, m7                       ;out8 | 
|  | mova [rsp+gprsize*2+16*10], m4 | 
|  | mova                    m4, [rsp+gprsize*2+16*2 ]    ;out12 | 
|  | ret | 
|  |  | 
|  | INV_TXFM_16X8_FN flipadst, dct | 
|  | INV_TXFM_16X8_FN flipadst, adst | 
|  | INV_TXFM_16X8_FN flipadst, flipadst | 
|  | INV_TXFM_16X8_FN flipadst, identity | 
|  |  | 
|  | cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mova                    m7, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m7, [coeffq+16*0 ] | 
|  | pmulhrsw                m1, m7, [coeffq+16*1 ] | 
|  | pmulhrsw                m2, m7, [coeffq+16*14] | 
|  | pmulhrsw                m3, m7, [coeffq+16*15] | 
|  | mova    [rsp+gprsize+16*7], m0 | 
|  | mova    [rsp+gprsize+16*8], m1 | 
|  | mova    [rsp+gprsize+16*9], m2 | 
|  | mova    [rsp+gprsize+32*5], m3 | 
|  | pmulhrsw                m0, m7, [coeffq+16*6 ] | 
|  | pmulhrsw                m1, m7, [coeffq+16*7 ] | 
|  | pmulhrsw                m2, m7, [coeffq+16*8 ] | 
|  | pmulhrsw                m3, m7, [coeffq+16*9 ] | 
|  | mova    [rsp+gprsize+16*3], m2 | 
|  | mova    [rsp+gprsize+16*4], m3 | 
|  | mova    [rsp+gprsize+16*5], m0 | 
|  | mova    [rsp+gprsize+16*6], m1 | 
|  | pmulhrsw                m0, m7, [coeffq+16*2 ] | 
|  | pmulhrsw                m1, m7, [coeffq+16*3 ] | 
|  | pmulhrsw                m2, m7, [coeffq+16*4 ] | 
|  | pmulhrsw                m3, m7, [coeffq+16*5 ] | 
|  | pmulhrsw                m4, m7, [coeffq+16*10] | 
|  | pmulhrsw                m5, m7, [coeffq+16*11] | 
|  | pmulhrsw                m6, m7, [coeffq+16*12] | 
|  | pmulhrsw                m7,     [coeffq+16*13] | 
|  |  | 
|  | call m(iadst_16x8_internal_8bpc).main | 
|  | call m(iadst_16x8_internal_8bpc).main_pass1_end | 
|  |  | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS     coeffq+16*0, 32 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mov                     r3, tx2q | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | jmp m(iflipadst_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS     coeffq+16*1, 32 | 
|  | LOAD_8ROWS     coeffq+16*0, 32 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mov                   tx2q, r3 | 
|  | jmp m(iflipadst_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass2: | 
|  | lea                   tx2q, [o(.end)] | 
|  | lea                     r3, [dstq+8] | 
|  | jmp m(iflipadst_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS     coeffq+16*1, 32 | 
|  | lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | mov                   dstq, r3 | 
|  | jmp m(iflipadst_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  |  | 
|  | INV_TXFM_16X8_FN identity, dct | 
|  | INV_TXFM_16X8_FN identity, adst | 
|  | INV_TXFM_16X8_FN identity, flipadst | 
|  | INV_TXFM_16X8_FN identity, identity | 
|  |  | 
|  | cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | add                coeffq, 16*16 | 
|  | mova                   m4, [coeffq-16*7] | 
|  | mova                   m5, [coeffq-16*5] | 
|  | mova                   m6, [coeffq-16*3] | 
|  | mova                   m7, [coeffq-16*1] | 
|  | mov                    r3, tx2q | 
|  | lea                  tx2q, [o(.pass1_end)] | 
|  |  | 
|  | .pass1: | 
|  | mova                   m0, [o(pw_2896x8)] | 
|  | mova                   m2, [o(pw_1697x16)] | 
|  | mova                   m3, [o(pw_16384)] | 
|  | sub                coeffq, 8*16 | 
|  | REPX     {pmulhrsw x, m0}, m4, m5, m6, m7 | 
|  | pmulhrsw               m1, m2, m4 | 
|  | pmulhrsw               m1, m3 | 
|  | paddsw                 m1, m4 ; 1 | 
|  | pmulhrsw               m4, m2, m5 | 
|  | pmulhrsw               m4, m3 | 
|  | paddsw                 m4, m5 ; 3 | 
|  | pmulhrsw               m5, m2, m6 | 
|  | pmulhrsw               m5, m3 | 
|  | paddsw                 m5, m6 ; 5 | 
|  | pmulhrsw               m6, m2, m7 | 
|  | pmulhrsw               m6, m3 | 
|  | paddsw                 m7, m6 ; 7 | 
|  | pmulhrsw               m6, m0, [coeffq+16*6] | 
|  | mova   [rsp+gprsize+16*0], m4 | 
|  | pmulhrsw               m4, m2, m6 | 
|  | pmulhrsw               m4, m3 | 
|  | paddsw                 m6, m4 ; 6 | 
|  | pmulhrsw               m4, m0, [coeffq+16*4] | 
|  | mova   [rsp+gprsize+16*1], m6 | 
|  | pmulhrsw               m6, m2, m4 | 
|  | pmulhrsw               m6, m3 | 
|  | paddsw                 m4, m6 ; 4 | 
|  | pmulhrsw               m6, m0, [coeffq+16*2] | 
|  | pmulhrsw               m0,     [coeffq+16*0] | 
|  | pmulhrsw               m2, m6 | 
|  | pmulhrsw               m2, m3 | 
|  | paddsw                 m2, m6 ; 2 | 
|  | pmulhrsw               m6, m0, [o(pw_1697x16)] | 
|  | pmulhrsw               m6, m3 | 
|  | mova                   m3, [rsp+gprsize+16*0] | 
|  | paddsw                 m0, m6 | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end3 | 
|  |  | 
|  | .pass1_end: | 
|  | mova        [coeffq+16*1], m4 | 
|  | mova        [coeffq+16*3], m5 | 
|  | mova        [coeffq+16*5], m6 | 
|  | mova        [coeffq+16*7], m7 | 
|  | mova                   m4, [coeffq-16*7] | 
|  | mova                   m5, [coeffq-16*5] | 
|  | mova                   m6, [coeffq-16*3] | 
|  | mova                   m7, [coeffq-16*1] | 
|  | mova        [coeffq-16*7], m0 | 
|  | mova        [coeffq-16*5], m1 | 
|  | mova        [coeffq-16*3], m2 | 
|  | mova        [coeffq-16*1], m3 | 
|  | mov                  tx2q, r3 | 
|  | jmp .pass1 | 
|  |  | 
|  | .pass2: | 
|  | lea                  tx2q, [o(.end)] | 
|  | lea                    r3, [dstq+8] | 
|  | jmp  m(iidentity_8x8_internal_8bpc).end | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS    coeffq+16*1, 32 | 
|  | lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | mov                  dstq, r3 | 
|  | jmp  m(iidentity_8x8_internal_8bpc).end | 
|  |  | 
|  |  | 
|  | %macro INV_TXFM_16X16_FN 2 ; type1, type2 | 
|  | INV_TXFM_FN          %1, %2, 16x16, 8, 16*16 | 
|  | %ifidn %1_%2, dct_dct | 
|  | movd                   m1, [o(pw_2896x8)] | 
|  | pmulhrsw               m0, m1, [coeffq] | 
|  | movd                   m2, [o(pw_8192)] | 
|  | mov              [coeffq], eobd | 
|  | mov                   r2d, 8 | 
|  | lea                  tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)] | 
|  | jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly | 
|  | .end: | 
|  | RET | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_16X16_FN dct, dct | 
|  | INV_TXFM_16X16_FN dct, adst | 
|  | INV_TXFM_16X16_FN dct, flipadst | 
|  | INV_TXFM_16X16_FN dct, identity | 
|  |  | 
|  | cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS     coeffq+16*1, 64 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+16*3, 64 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mov                     r3, tx2q | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | mova                    m7, [o(pw_8192)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS    coeffq+16*17, 32 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | mova                    m7, [o(pw_8192)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+16*1, 32 | 
|  | LOAD_8ROWS     coeffq+16*0, 64 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+16*2, 64 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | mova                    m7, [o(pw_8192)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS    coeffq+16*16, 32 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mov                   tx2q, r3 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass2: | 
|  | lea                   tx2q, [o(.end)] | 
|  | jmp  m(idct_8x16_internal_8bpc).pass2_pre | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.end1)] | 
|  | mov                   dstq, r3 | 
|  | lea                     r3, [dstq+8] | 
|  | jmp   m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end1: | 
|  | pxor                    m7, m7 | 
|  | REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 | 
|  |  | 
|  | add                 coeffq, 32*8 | 
|  | mov                   dstq, r3 | 
|  |  | 
|  | mova                    m0, [coeffq+16*0 ] | 
|  | mova                    m1, [coeffq+16*4 ] | 
|  | mova                    m2, [coeffq+16*8 ] | 
|  | mova                    m3, [coeffq+16*12] | 
|  | mova                    m4, [coeffq+16*1 ] | 
|  | mova                    m5, [coeffq+16*5 ] | 
|  | mova                    m6, [coeffq+16*9 ] | 
|  | mova                    m7, [coeffq+16*13] | 
|  | lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end)] | 
|  | jmp  m(idct_8x16_internal_8bpc).pass2_main | 
|  |  | 
|  |  | 
|  | %macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 | 
|  | mova                    m0, [coeffq+16*1 ] | 
|  | mova                    m1, [coeffq+16*3 ] | 
|  | mova                    m2, [coeffq+16*29] | 
|  | mova                    m3, [coeffq+16*31] | 
|  | mova    [rsp+gprsize+16*7], m0 | 
|  | mova    [rsp+gprsize+16*8], m1 | 
|  | mova    [rsp+gprsize+16*9], m2 | 
|  | mova    [rsp+gprsize+32*5], m3 | 
|  | mova                    m0, [coeffq+16*13] | 
|  | mova                    m1, [coeffq+16*15] | 
|  | mova                    m2, [coeffq+16*17] | 
|  | mova                    m3, [coeffq+16*19] | 
|  | mova    [rsp+gprsize+16*3], m2 | 
|  | mova    [rsp+gprsize+16*4], m3 | 
|  | mova    [rsp+gprsize+16*5], m0 | 
|  | mova    [rsp+gprsize+16*6], m1 | 
|  | mova                    m0, [coeffq+16*5 ] | 
|  | mova                    m1, [coeffq+16*7 ] | 
|  | mova                    m2, [coeffq+16*9 ] | 
|  | mova                    m3, [coeffq+16*11] | 
|  | mova                    m4, [coeffq+16*21] | 
|  | mova                    m5, [coeffq+16*23] | 
|  | mova                    m6, [coeffq+16*25] | 
|  | mova                    m7, [coeffq+16*27] | 
|  | %endmacro | 
|  |  | 
|  | %macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 | 
|  | mova                    m0, [coeffq+16*0 ] | 
|  | mova                    m1, [coeffq+16*2 ] | 
|  | mova                    m2, [coeffq+16*28] | 
|  | mova                    m3, [coeffq+16*30] | 
|  | mova    [rsp+gprsize+16*7], m0 | 
|  | mova    [rsp+gprsize+16*8], m1 | 
|  | mova    [rsp+gprsize+16*9], m2 | 
|  | mova    [rsp+gprsize+32*5], m3 | 
|  | mova                    m0, [coeffq+16*12] | 
|  | mova                    m1, [coeffq+16*14] | 
|  | mova                    m2, [coeffq+16*16] | 
|  | mova                    m3, [coeffq+16*18] | 
|  | mova    [rsp+gprsize+16*3], m2 | 
|  | mova    [rsp+gprsize+16*4], m3 | 
|  | mova    [rsp+gprsize+16*5], m0 | 
|  | mova    [rsp+gprsize+16*6], m1 | 
|  | mova                    m0, [coeffq+16*4 ] | 
|  | mova                    m1, [coeffq+16*6 ] | 
|  | mova                    m2, [coeffq+16*8 ] | 
|  | mova                    m3, [coeffq+16*10] | 
|  | mova                    m4, [coeffq+16*20] | 
|  | mova                    m5, [coeffq+16*22] | 
|  | mova                    m6, [coeffq+16*24] | 
|  | mova                    m7, [coeffq+16*26] | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_16X16_FN adst, dct | 
|  | INV_TXFM_16X16_FN adst, adst | 
|  | INV_TXFM_16X16_FN adst, flipadst | 
|  |  | 
|  | cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | ITX_16X16_ADST_LOAD_ODD_COEFS | 
|  | call m(iadst_16x8_internal_8bpc).main | 
|  | call m(iadst_16x8_internal_8bpc).main_pass1_end | 
|  |  | 
|  | mov                     r3, tx2q | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | mova                    m7, [o(pw_8192)] | 
|  | jmp  m(iadst_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS    coeffq+16*17, 32 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | mova                    m7, [o(pw_8192)] | 
|  | jmp  m(iadst_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+16*1, 32 | 
|  | ITX_16X16_ADST_LOAD_EVEN_COEFS | 
|  | call m(iadst_16x8_internal_8bpc).main | 
|  | call m(iadst_16x8_internal_8bpc).main_pass1_end | 
|  |  | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | mova                    m7, [o(pw_8192)] | 
|  | jmp  m(iadst_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS    coeffq+16*16, 32 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mov                   tx2q, r3 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | jmp  m(iadst_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass2: | 
|  | lea                   tx2q, [o(.end)] | 
|  | jmp m(iadst_8x16_internal_8bpc).pass2_pre | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.end1)] | 
|  | mov                   dstq, r3 | 
|  | lea                     r3, [dstq+8] | 
|  | jmp  m(iadst_8x8_internal_8bpc).end | 
|  |  | 
|  | .end1: | 
|  | pxor                    m7, m7 | 
|  | REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 | 
|  |  | 
|  | add                 coeffq, 32*8 | 
|  | mov                   dstq, r3 | 
|  |  | 
|  | mova                    m4, [coeffq+16*0 ] | 
|  | mova                    m5, [coeffq+16*2 ] | 
|  | mova                    m0, [coeffq+16*4 ] | 
|  | mova                    m1, [coeffq+16*6 ] | 
|  | mova                    m2, [coeffq+16*8 ] | 
|  | mova                    m3, [coeffq+16*10] | 
|  | mova                    m6, [coeffq+16*12] | 
|  | mova                    m7, [coeffq+16*14] | 
|  | mova    [rsp+gprsize+16*7], m4 | 
|  | mova    [rsp+gprsize+16*8], m5 | 
|  | mova    [rsp+gprsize+16*5], m6 | 
|  | mova    [rsp+gprsize+16*6], m7 | 
|  | lea                   tx2q, [o(m(iadst_8x16_internal_8bpc).end)] | 
|  | jmp m(iadst_8x16_internal_8bpc).pass2_main | 
|  |  | 
|  |  | 
|  | INV_TXFM_16X16_FN flipadst, dct | 
|  | INV_TXFM_16X16_FN flipadst, adst | 
|  | INV_TXFM_16X16_FN flipadst, flipadst | 
|  |  | 
|  | cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | ITX_16X16_ADST_LOAD_ODD_COEFS | 
|  | call m(iadst_16x8_internal_8bpc).main | 
|  | call m(iadst_16x8_internal_8bpc).main_pass1_end | 
|  |  | 
|  | mov                     r3, tx2q | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | mova                    m7, [o(pw_m8192)] | 
|  | jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS     coeffq+16*1, 32 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | mova                    m7, [o(pw_m8192)] | 
|  | jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS    coeffq+16*17, 32 | 
|  | ITX_16X16_ADST_LOAD_EVEN_COEFS | 
|  | call m(iadst_16x8_internal_8bpc).main | 
|  | call m(iadst_16x8_internal_8bpc).main_pass1_end | 
|  |  | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS     coeffq+16*0, 32 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | mova                    m7, [o(pw_m8192)] | 
|  | jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS    coeffq+16*16, 32 | 
|  | LOAD_8ROWS    coeffq+16* 0, 32 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mov                   tx2q, r3 | 
|  | mova                    m7, [o(pw_m8192)] | 
|  | jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass2: | 
|  | lea                   tx2q, [o(.end)] | 
|  | lea                     r3, [dstq+8] | 
|  | jmp m(iflipadst_8x16_internal_8bpc).pass2_pre | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.end1)] | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | jmp  m(iflipadst_8x8_internal_8bpc).end | 
|  |  | 
|  | .end1: | 
|  | pxor                    m7, m7 | 
|  | REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 | 
|  |  | 
|  | add                 coeffq, 32*8 | 
|  |  | 
|  | mova                    m4, [coeffq+16*0 ] | 
|  | mova                    m5, [coeffq+16*2 ] | 
|  | mova                    m0, [coeffq+16*4 ] | 
|  | mova                    m1, [coeffq+16*6 ] | 
|  | mova                    m2, [coeffq+16*8 ] | 
|  | mova                    m3, [coeffq+16*10] | 
|  | mova                    m6, [coeffq+16*12] | 
|  | mova                    m7, [coeffq+16*14] | 
|  | mova    [rsp+gprsize+16*7], m4 | 
|  | mova    [rsp+gprsize+16*8], m5 | 
|  | mova    [rsp+gprsize+16*5], m6 | 
|  | mova    [rsp+gprsize+16*6], m7 | 
|  |  | 
|  | lea                   tx2q, [o(.end2)] | 
|  | mov                   dstq, r3 | 
|  | jmp m(iflipadst_8x16_internal_8bpc).pass2_main | 
|  |  | 
|  | .end2: | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | jmp  m(iflipadst_8x8_internal_8bpc).end | 
|  |  | 
|  |  | 
|  | %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 | 
|  | pmulhrsw            m%2, m%3, m%1 | 
|  | psraw               m%2, 1 | 
|  | pavgw               m%1, m%2 | 
|  | %endmacro | 
|  |  | 
|  | INV_TXFM_16X16_FN identity, dct | 
|  | INV_TXFM_16X16_FN identity, identity | 
|  |  | 
|  | cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | add                 coeffq, 16*17 | 
|  | mov                     r3, tx2q | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  |  | 
|  | .pass1: | 
|  | mova                    m6, [o(pw_1697x16)] | 
|  | mova                    m7, [coeffq+32*6] | 
|  | mova                    m0, [coeffq+32*0] | 
|  | mova                    m1, [coeffq+32*1] | 
|  | mova                    m2, [coeffq+32*2] | 
|  | mova                    m3, [coeffq+32*3] | 
|  | mova                    m4, [coeffq+32*4] | 
|  | REPX     {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4 | 
|  | mova                    m5, [coeffq+32*5] | 
|  | mova    [rsp+gprsize+16*1], m7 | 
|  | IDTX16B                  5, 7, 6 | 
|  | mova                    m7, [coeffq+32*7] | 
|  | IDTX16B                  7, 6, 6 | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end3 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS          coeffq, 32 | 
|  | sub                 coeffq, 16 | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp .pass1 | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS          coeffq, 32 | 
|  | sub                 coeffq, 15*16 | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | jmp .pass1 | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS          coeffq, 32 | 
|  | sub                 coeffq, 16 | 
|  | mov                   tx2q, r3 | 
|  | jmp .pass1 | 
|  |  | 
|  | .pass2: | 
|  | lea                     r3, [dstq+8] | 
|  | lea                   tx2q, [o(.end1)] | 
|  |  | 
|  | .end: | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova    [rsp+gprsize+16*1], m4 | 
|  | mova                    m7, [o(pw_1697x16)] | 
|  | REPX      {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 | 
|  | mova                    m4, [o(pw_2048)] | 
|  | pmulhrsw                m5, m4 | 
|  | pmulhrsw                m6, m4 | 
|  | mova    [rsp+gprsize+16*2], m5 | 
|  | mova                    m5, [rsp+gprsize+16*1] | 
|  | mova    [rsp+gprsize+16*1], m6 | 
|  | IDTX16                   5, 6, 7 | 
|  | mova                    m6, [rsp+gprsize+16*0] | 
|  | IDTX16                   6, 7, 7 | 
|  | REPX      {pmulhrsw x, m4}, m0, m1, m2, m3, m6 | 
|  | pmulhrsw                m4, m5 | 
|  | mova    [rsp+gprsize+16*0], m6 | 
|  | jmp   m(idct_8x8_internal_8bpc).end3 | 
|  |  | 
|  | .end1: | 
|  | LOAD_8ROWS     coeffq+16*1, 32 | 
|  | lea                   tx2q, [o(.end2)] | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | jmp .end | 
|  |  | 
|  | .end2: | 
|  | pxor                    m7, m7 | 
|  | REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 | 
|  |  | 
|  | add                 coeffq, 32*8 | 
|  | LOAD_8ROWS          coeffq, 32 | 
|  | lea                   tx2q, [o(.end3)] | 
|  | mov                   dstq, r3 | 
|  | jmp .end | 
|  |  | 
|  | .end3: | 
|  | LOAD_8ROWS     coeffq+16*1, 32 | 
|  | lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)] | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | jmp .end | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  | call  m(idct_8x32_internal_8bpc) | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                 m1, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m1, [coeffq] | 
|  | movd                 m2, [o(pw_8192)] | 
|  | mov            [coeffq], eobd | 
|  | pmulhrsw             m0, m2 | 
|  | psrlw                m2, 2            ;pw_2048 | 
|  | pmulhrsw             m0, m1 | 
|  | pmulhrsw             m0, m2 | 
|  | pshuflw              m0, m0, q0000 | 
|  | punpcklwd            m0, m0 | 
|  | mov                 r3d, 8 | 
|  | lea                tx2q, [o(.end)] | 
|  | jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop | 
|  |  | 
|  | .end: | 
|  | RET | 
|  |  | 
|  |  | 
|  |  | 
|  | cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | cmp                   eobd, 106 | 
|  | jle .fast | 
|  |  | 
|  | LOAD_8ROWS     coeffq+16*3, 64 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1: | 
|  | mova   [rsp+gprsize+16*9 ], m0                        ;in24 | 
|  | mova   [rsp+gprsize+16*10], m4                        ;in28 | 
|  | mova   [rsp+gprsize+16*17], m2                        ;in26 | 
|  | mova   [rsp+gprsize+16*18], m6                        ;in30 | 
|  | mova   [rsp+gprsize+16*31], m1                        ;in25 | 
|  | mova   [rsp+gprsize+16*30], m3                        ;in27 | 
|  | mova   [rsp+gprsize+16*27], m5                        ;in29 | 
|  | mova   [rsp+gprsize+16*34], m7                        ;in31 | 
|  | LOAD_8ROWS     coeffq+16*2, 64 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_1: | 
|  | mova   [rsp+gprsize+16*7 ], m0                        ;in16 | 
|  | mova   [rsp+gprsize+16*8 ], m4                        ;in20 | 
|  | mova   [rsp+gprsize+16*15], m2                        ;in18 | 
|  | mova   [rsp+gprsize+16*16], m6                        ;in22 | 
|  | mova   [rsp+gprsize+16*33], m1                        ;in17 | 
|  | mova   [rsp+gprsize+16*28], m3                        ;in19 | 
|  | mova   [rsp+gprsize+16*29], m5                        ;in21 | 
|  | mova   [rsp+gprsize+16*32], m7                        ;in23 | 
|  |  | 
|  | .fast: | 
|  | LOAD_8ROWS     coeffq+16*1, 64 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end: | 
|  | mova   [rsp+gprsize+16*5 ], m0                        ;in8 | 
|  | mova   [rsp+gprsize+16*6 ], m4                        ;in12 | 
|  | mova   [rsp+gprsize+16*13], m2                        ;in10 | 
|  | mova   [rsp+gprsize+16*14], m6                        ;in14 | 
|  | mova   [rsp+gprsize+16*21], m1                        ;in9 | 
|  | mova   [rsp+gprsize+16*24], m3                        ;in11 | 
|  | mova   [rsp+gprsize+16*25], m5                        ;in13 | 
|  | mova   [rsp+gprsize+16*20], m7                        ;in15 | 
|  | LOAD_8ROWS     coeffq+16*0, 64 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end1: | 
|  | mova   [rsp+gprsize+16*11], m2                        ;in2 | 
|  | mova   [rsp+gprsize+16*12], m6                        ;in6 | 
|  | mova   [rsp+gprsize+16*19], m1                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m3                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m5                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m7                        ;in7 | 
|  | mova                    m1, m4                        ;in4 | 
|  | mova                    m2, [rsp+gprsize+16*5 ]       ;in8 | 
|  | mova                    m3, [rsp+gprsize+16*6 ]       ;in12 | 
|  |  | 
|  | cmp                   eobd, 106 | 
|  | jg .full | 
|  |  | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS   rsp+gprsize+16*3 , 16 | 
|  | mova                    m0, [rsp+gprsize+16*11] | 
|  | mova                    m1, [rsp+gprsize+16*12] | 
|  | mova                    m2, [rsp+gprsize+16*13] | 
|  | mova                    m3, [rsp+gprsize+16*14] | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | call .main_fast | 
|  | jmp  .pass2 | 
|  |  | 
|  | .full: | 
|  | mova                    m4, [rsp+gprsize+16*7 ]       ;in16 | 
|  | mova                    m5, [rsp+gprsize+16*8 ]       ;in20 | 
|  | mova                    m6, [rsp+gprsize+16*9 ]       ;in24 | 
|  | mova                    m7, [rsp+gprsize+16*10]       ;in28 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS   rsp+gprsize+16*3 , 16 | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  | call .main | 
|  |  | 
|  | .pass2: | 
|  | lea                     r3, [o(.end6)] | 
|  |  | 
|  | .end: | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | lea                   tx2q, [o(.end2)] | 
|  |  | 
|  | .end1: | 
|  | pxor                    m7, m7 | 
|  | REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \ | 
|  | 8,  9,  10, 11, 12, 13, 14, 15, \ | 
|  | 16, 17, 18, 19, 20, 21, 22, 23, \ | 
|  | 24, 25, 26, 27, 28, 29, 30, 31 | 
|  |  | 
|  | jmp                   tx2q | 
|  |  | 
|  | .end2: | 
|  | lea                   tx2q, [o(.end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end3: | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | lea                   tx2q, [o(.end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end4: | 
|  | LOAD_8ROWS   rsp+gprsize+16*19, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | lea                   tx2q, [o(.end5)] | 
|  | jmp   m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end5: | 
|  | LOAD_8ROWS   rsp+gprsize+16*27, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | mov                   tx2q, r3 | 
|  | jmp   m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end6: | 
|  | ret | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main_veryfast | 
|  | mova                    m0, [rsp+gprsize*2+16*19]     ;in1 | 
|  | pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t30,t31 | 
|  | pmulhrsw                m0, [o(pw_201x8)]             ;t16,t17 | 
|  | mova                    m7, [o(pd_2048)] | 
|  | mova [rsp+gprsize*2+16*19], m0                        ;t16 | 
|  | mova [rsp+gprsize*2+16*34], m3                        ;t31 | 
|  | ITX_MULSUB_2W            3, 0, 1, 2, 7,  799, 4017    ;t17a, t30a | 
|  | mova [rsp+gprsize*2+16*20], m3                        ;t17a | 
|  | mova [rsp+gprsize*2+16*33], m0                        ;t30a | 
|  | mova                    m1, [rsp+gprsize*2+16*22]     ;in7 | 
|  | pmulhrsw                m2, m1, [o(pw_3857x8)]        ;t28,t29 | 
|  | pmulhrsw                m1, [o(pw_m1380x8)]           ;t18,t19 | 
|  | mova [rsp+gprsize*2+16*22], m1                        ;t19 | 
|  | mova [rsp+gprsize*2+16*31], m2                        ;t28 | 
|  | ITX_MULSUB_2W            2, 1, 0, 3, 7, m4017, 799    ;t18a, t29a | 
|  | mova [rsp+gprsize*2+16*21], m2                        ;t18a | 
|  | mova [rsp+gprsize*2+16*32], m1                        ;t29a | 
|  | mova                    m0, [rsp+gprsize*2+16*23]     ;in5 | 
|  | pmulhrsw                m3, m0, [o(pw_3973x8)]        ;t26, t27 | 
|  | pmulhrsw                m0, [o(pw_995x8)]             ;t20, t21 | 
|  | mova [rsp+gprsize*2+16*23], m0                        ;t20 | 
|  | mova [rsp+gprsize*2+16*30], m3                        ;t27 | 
|  | ITX_MULSUB_2W            3, 0, 1, 2, 7, 3406, 2276    ;t21a, t26a | 
|  | mova [rsp+gprsize*2+16*24], m3                        ;t21a | 
|  | mova [rsp+gprsize*2+16*29], m0                        ;t26a | 
|  | mova                    m2, [rsp+gprsize*2+16*26]     ;in3 | 
|  | pxor                    m0, m0 | 
|  | mova                    m3, m0 | 
|  | pmulhrsw                m1, m2, [o(pw_4052x8)] | 
|  | pmulhrsw                m2, [o(pw_m601x8)] | 
|  | jmp .main2 | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main_fast ;bottom half is zero | 
|  | mova                    m0, [rsp+gprsize*2+16*19]     ;in1 | 
|  | mova                    m1, [rsp+gprsize*2+16*20]     ;in15 | 
|  | pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t31a | 
|  | pmulhrsw                m0, [o(pw_201x8)]             ;t16a | 
|  | pmulhrsw                m2, m1, [o(pw_3035x8)]        ;t30a | 
|  | pmulhrsw                m1, [o(pw_m2751x8)]           ;t17a | 
|  | mova                    m7, [o(pd_2048)] | 
|  | psubsw                  m4, m0, m1                    ;t17 | 
|  | paddsw                  m0, m1                        ;t16 | 
|  | psubsw                  m5, m3, m2                    ;t30 | 
|  | paddsw                  m3, m2                        ;t31 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a | 
|  | mova [rsp+gprsize*2+16*19], m0                        ;t16 | 
|  | mova [rsp+gprsize*2+16*20], m5                        ;t17a | 
|  | mova [rsp+gprsize*2+16*33], m4                        ;t30a | 
|  | mova [rsp+gprsize*2+16*34], m3                        ;t31 | 
|  | mova                    m0, [rsp+gprsize*2+16*21]     ;in9 | 
|  | mova                    m1, [rsp+gprsize*2+16*22]     ;in7 | 
|  | pmulhrsw                m3, m0, [o(pw_3703x8)] | 
|  | pmulhrsw                m0, [o(pw_1751x8)] | 
|  | pmulhrsw                m2, m1, [o(pw_3857x8)] | 
|  | pmulhrsw                m1, [o(pw_m1380x8)] | 
|  | psubsw                  m4, m1, m0                    ;t18 | 
|  | paddsw                  m0, m1                        ;t19 | 
|  | psubsw                  m5, m2, m3                    ;t29 | 
|  | paddsw                  m3, m2                        ;t28 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a | 
|  | mova [rsp+gprsize*2+16*21], m5                        ;t18a | 
|  | mova [rsp+gprsize*2+16*22], m0                        ;t19 | 
|  | mova [rsp+gprsize*2+16*31], m3                        ;t28 | 
|  | mova [rsp+gprsize*2+16*32], m4                        ;t29a | 
|  | mova                    m0, [rsp+gprsize*2+16*23]     ;in5 | 
|  | mova                    m1, [rsp+gprsize*2+16*24]     ;in11 | 
|  | pmulhrsw                m3, m0, [o(pw_3973x8)] | 
|  | pmulhrsw                m0, [o(pw_995x8)] | 
|  | pmulhrsw                m2, m1, [o(pw_3513x8)] | 
|  | pmulhrsw                m1, [o(pw_m2106x8)] | 
|  | psubsw                  m4, m0, m1                    ;t21 | 
|  | paddsw                  m0, m1                        ;t20 | 
|  | psubsw                  m5, m3, m2                    ;t26 | 
|  | paddsw                  m3, m2                        ;t27 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a | 
|  | mova [rsp+gprsize*2+16*23], m0                        ;t20 | 
|  | mova [rsp+gprsize*2+16*24], m5                        ;t21a | 
|  | mova [rsp+gprsize*2+16*29], m4                        ;t26a | 
|  | mova [rsp+gprsize*2+16*30], m3                        ;t27 | 
|  | mova                    m0, [rsp+gprsize*2+16*25]     ;in13 | 
|  | mova                    m2, [rsp+gprsize*2+16*26]     ;in3 | 
|  | pmulhrsw                m3, m0, [o(pw_3290x8)] | 
|  | pmulhrsw                m0, [o(pw_2440x8)] | 
|  | pmulhrsw                m1, m2, [o(pw_4052x8)] | 
|  | pmulhrsw                m2, [o(pw_m601x8)] | 
|  | jmp .main2 | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova                    m7, [o(pd_2048)] | 
|  | mova                    m0, [rsp+gprsize*2+16*19]     ;in1 | 
|  | mova                    m1, [rsp+gprsize*2+16*20]     ;in15 | 
|  | mova                    m2, [rsp+gprsize*2+16*33]     ;in17 | 
|  | mova                    m3, [rsp+gprsize*2+16*34]     ;in31 | 
|  | ITX_MULSUB_2W            0, 3, 4, 5, 7,  201, 4091    ;t16a, t31a | 
|  | ITX_MULSUB_2W            2, 1, 4, 5, 7, 3035, 2751    ;t17a, t30a | 
|  | psubsw                  m4, m0, m2                    ;t17 | 
|  | paddsw                  m0, m2                        ;t16 | 
|  | psubsw                  m5, m3, m1                    ;t30 | 
|  | paddsw                  m3, m1                        ;t31 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a | 
|  | mova [rsp+gprsize*2+16*19], m0                        ;t16 | 
|  | mova [rsp+gprsize*2+16*20], m5                        ;t17a | 
|  | mova [rsp+gprsize*2+16*33], m4                        ;t30a | 
|  | mova [rsp+gprsize*2+16*34], m3                        ;t31 | 
|  | mova                    m0, [rsp+gprsize*2+16*21]     ;in9 | 
|  | mova                    m1, [rsp+gprsize*2+16*22]     ;in7 | 
|  | mova                    m2, [rsp+gprsize*2+16*31]     ;in25 | 
|  | mova                    m3, [rsp+gprsize*2+16*32]     ;in23 | 
|  | ITX_MULSUB_2W            0, 3, 4, 5, 7, 1751, 3703    ;t18a, t29a | 
|  | ITX_MULSUB_2W            2, 1, 4, 5, 7, 3857, 1380    ;t19a, t28a | 
|  | psubsw                  m4, m2, m0                    ;t18 | 
|  | paddsw                  m0, m2                        ;t19 | 
|  | psubsw                  m5, m1, m3                    ;t29 | 
|  | paddsw                  m3, m1                        ;t28 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a | 
|  | mova [rsp+gprsize*2+16*21], m5                        ;t18a | 
|  | mova [rsp+gprsize*2+16*22], m0                        ;t19 | 
|  | mova [rsp+gprsize*2+16*31], m3                        ;t28 | 
|  | mova [rsp+gprsize*2+16*32], m4                        ;t29a | 
|  | mova                    m0, [rsp+gprsize*2+16*23]     ;in5 | 
|  | mova                    m1, [rsp+gprsize*2+16*24]     ;in11 | 
|  | mova                    m2, [rsp+gprsize*2+16*29]     ;in21 | 
|  | mova                    m3, [rsp+gprsize*2+16*30]     ;in27 | 
|  | ITX_MULSUB_2W            0, 3, 4, 5, 7,  995, 3973    ;t20a, t27a | 
|  | ITX_MULSUB_2W            2, 1, 4, 5, 7, 3513, 2106    ;t21a, t26a | 
|  | psubsw                  m4, m0, m2                    ;t21 | 
|  | paddsw                  m0, m2                        ;t20 | 
|  | psubsw                  m5, m3, m1                    ;t26 | 
|  | paddsw                  m3, m1                        ;t27 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a | 
|  | mova [rsp+gprsize*2+16*23], m0                        ;t20 | 
|  | mova [rsp+gprsize*2+16*24], m5                        ;t21a | 
|  | mova [rsp+gprsize*2+16*29], m4                        ;t26a | 
|  | mova [rsp+gprsize*2+16*30], m3                        ;t27 | 
|  | mova                    m0, [rsp+gprsize*2+16*25]     ;in13 | 
|  | mova                    m1, [rsp+gprsize*2+16*26]     ;in3 | 
|  | mova                    m2, [rsp+gprsize*2+16*27]     ;in29 | 
|  | mova                    m3, [rsp+gprsize*2+16*28]     ;in19 | 
|  | ITX_MULSUB_2W            0, 3, 4, 5, 7, 2440, 3290    ;t22a, t25a | 
|  | ITX_MULSUB_2W            2, 1, 4, 5, 7, 4052,  601    ;t23a, t24a | 
|  |  | 
|  | .main2: | 
|  | psubsw                  m4, m2, m0                    ;t22 | 
|  | paddsw                  m0, m2                        ;t23 | 
|  | psubsw                  m5, m1, m3                    ;t25 | 
|  | paddsw                  m3, m1                        ;t24 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, m2276, 3406   ;t22a, t25a | 
|  | mova                    m2, [rsp+gprsize*2+16*24]     ;t21a | 
|  | psubsw                  m1, m5, m2                    ;t21 | 
|  | paddsw                  m5, m2                        ;t22 | 
|  | mova [rsp+gprsize*2+16*25], m5                        ;t22 | 
|  | mova                    m2, [rsp+gprsize*2+16*29]     ;t26a | 
|  | psubsw                  m5, m4, m2                    ;t26 | 
|  | paddsw                  m4, m2                        ;t25 | 
|  | mova [rsp+gprsize*2+16*28], m4                        ;t25 | 
|  | ITX_MULSUB_2W            5, 1, 2, 4, 7, m3784, 1567   ;t21a, t26a | 
|  | mova [rsp+gprsize*2+16*24], m5                        ;t21a | 
|  | mova [rsp+gprsize*2+16*29], m1                        ;t26a | 
|  |  | 
|  | mova                    m1, [rsp+gprsize*2+16*23]     ;t20 | 
|  | mova                    m5, [rsp+gprsize*2+16*30]     ;t27 | 
|  | psubsw                  m2, m0, m1                    ;t20a | 
|  | paddsw                  m0, m1                        ;t23a | 
|  | psubsw                  m6, m3, m5                    ;t27a | 
|  | paddsw                  m3, m5                        ;t24a | 
|  | ITX_MULSUB_2W            6, 2, 1, 5, 7, m3784, 1567   ;t20, t27 | 
|  | mova [rsp+gprsize*2+16*26], m0                        ;t23a | 
|  | mova [rsp+gprsize*2+16*27], m3                        ;t24a | 
|  | mova [rsp+gprsize*2+16*30], m2                        ;t27 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*20]     ;t17a | 
|  | mova                    m1, [rsp+gprsize*2+16*21]     ;t18a | 
|  | mova                    m2, [rsp+gprsize*2+16*32]     ;t29a | 
|  | mova                    m3, [rsp+gprsize*2+16*33]     ;t30a | 
|  | psubsw                  m4, m0, m1                    ;t18 | 
|  | paddsw                  m0, m1                        ;t17 | 
|  | psubsw                  m5, m3, m2                    ;t29 | 
|  | paddsw                  m3, m2                        ;t30 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t18a, t29a | 
|  | mova [rsp+gprsize*2+16*20], m0                        ;t17 | 
|  | mova [rsp+gprsize*2+16*21], m5                        ;t18a | 
|  | mova [rsp+gprsize*2+16*32], m4                        ;t29a | 
|  | mova [rsp+gprsize*2+16*33], m3                        ;t30 | 
|  | mova                    m0, [rsp+gprsize*2+16*19]     ;t16 | 
|  | mova                    m1, [rsp+gprsize*2+16*22]     ;t19 | 
|  | mova                    m2, [rsp+gprsize*2+16*31]     ;t28 | 
|  | mova                    m3, [rsp+gprsize*2+16*34]     ;t31 | 
|  | psubsw                  m4, m0, m1                    ;t19a | 
|  | paddsw                  m0, m1                        ;t16a | 
|  | psubsw                  m5, m3, m2                    ;t28a | 
|  | paddsw                  m3, m2                        ;t31a | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28 | 
|  | mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12 | 
|  | psubsw                  m1, m5, m6                    ;t20a | 
|  | paddsw                  m5, m6                        ;t19a | 
|  | psubsw                  m6, m2, m5                    ;out19 | 
|  | paddsw                  m2, m5                        ;out12 | 
|  | mova                    m5, [rsp+gprsize*2+16*30]     ;t27 | 
|  | mova [rsp+gprsize*2+16*22], m6                        ;out19 | 
|  | mova [rsp+gprsize*2+16*15], m2                        ;out12 | 
|  | psubsw                  m6, m4, m5                    ;t27a | 
|  | paddsw                  m4, m5                        ;t28a | 
|  | ITX_MULSUB_2W            6, 1, 2, 5, 7, 2896, 2896    ;t20, t27 | 
|  | mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3 | 
|  | psubsw                  m5, m2, m4                    ;out28 | 
|  | paddsw                  m2, m4                        ;out3 | 
|  | mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11 | 
|  | mova [rsp+gprsize*2+16*31], m5                        ;out28 | 
|  | mova [rsp+gprsize*2+16*6 ], m2                        ;out3 | 
|  | psubsw                  m5, m4, m6                    ;out20 | 
|  | paddsw                  m4, m6                        ;out11 | 
|  | mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4 | 
|  | mova [rsp+gprsize*2+16*23], m5                        ;out20 | 
|  | mova [rsp+gprsize*2+16*14], m4                        ;out11 | 
|  | psubsw                  m5, m2, m1                    ;out27 | 
|  | paddsw                  m2, m1                        ;out4 | 
|  | mova                    m1, [rsp+gprsize*2+16*26]     ;t23a | 
|  | mova                    m4, [rsp+gprsize*2+16*27]     ;t24a | 
|  | mova [rsp+gprsize*2+16*30], m5                        ;out27 | 
|  | mova [rsp+gprsize*2+16*7 ], m2                        ;out4 | 
|  | psubsw                  m5, m0, m1                    ;t23 | 
|  | paddsw                  m0, m1                        ;t16 | 
|  | psubsw                  m2, m3, m4                    ;t24 | 
|  | paddsw                  m3, m4                        ;t31 | 
|  | ITX_MULSUB_2W            2, 5, 4, 6, 7, 2896, 2896    ;t23a, t24a | 
|  | mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15 | 
|  | psubsw                  m4, m6, m0                    ;out16 | 
|  | paddsw                  m6, m0                        ;out15 | 
|  | mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0 | 
|  | mova                    m1, [rsp+gprsize*2+16*11]     ;tmp8 | 
|  | mova [rsp+gprsize*2+16*18], m6                        ;out15 | 
|  | mova [rsp+gprsize*2+16*19], m4                        ;out16 | 
|  | psubsw                  m6, m0, m3                    ;out31 | 
|  | paddsw                  m0, m3                        ;out0 | 
|  | psubsw                  m4, m1, m2                    ;out23 | 
|  | paddsw                  m1, m2                        ;out8 | 
|  | mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7 | 
|  | mova [rsp+gprsize*2+16*34], m6                        ;out31 | 
|  | mova [rsp+gprsize*2+16*11], m1                        ;out8 | 
|  | mova [rsp+gprsize*2+16*26], m4                        ;out23 | 
|  | paddsw                  m6, m3, m5                    ;out7 | 
|  | psubsw                  m3, m5                        ;out24 | 
|  | mova                    m1, [rsp+gprsize*2+16*20]     ;t17 | 
|  | mova                    m5, [rsp+gprsize*2+16*25]     ;t22 | 
|  | mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14 | 
|  | mova [rsp+gprsize*2+16*27], m3                        ;out24 | 
|  | psubsw                  m4, m1, m5                    ;t22a | 
|  | paddsw                  m1, m5                        ;t17a | 
|  | psubsw                  m3, m2, m1                    ;out17 | 
|  | paddsw                  m2, m1                        ;out14 | 
|  | mova                    m5, [rsp+gprsize*2+16*28]     ;t25 | 
|  | mova                    m1, [rsp+gprsize*2+16*33]     ;t30 | 
|  | mova [rsp+gprsize*2+16*17], m2                        ;out14 | 
|  | mova [rsp+gprsize*2+16*20], m3                        ;out17 | 
|  | psubsw                  m2, m1, m5                    ;t25a | 
|  | paddsw                  m1, m5                        ;t30a | 
|  | ITX_MULSUB_2W            2, 4, 3, 5, 7, 2896, 2896    ;t22, t25 | 
|  | mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1 | 
|  | psubsw                  m3, m5, m1                    ;out30 | 
|  | paddsw                  m5, m1                        ;out1 | 
|  | mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9 | 
|  | mova [rsp+gprsize*2+16*33], m3                        ;out30 | 
|  | mova [rsp+gprsize*2+16*4 ], m5                        ;out1 | 
|  | psubsw                  m3, m1, m2                    ;out22 | 
|  | paddsw                  m1, m2                        ;out9 | 
|  | mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6 | 
|  | mova [rsp+gprsize*2+16*25], m3                        ;out22 | 
|  | mova [rsp+gprsize*2+16*12], m1                        ;out9 | 
|  | psubsw                  m3, m5, m4                    ;out25 | 
|  | paddsw                  m5, m4                        ;out6 | 
|  | mova                    m4, [rsp+gprsize*2+16*21]     ;t18a | 
|  | mova                    m1, [rsp+gprsize*2+16*24]     ;t21a | 
|  | mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13 | 
|  | mova [rsp+gprsize*2+16*28], m3                        ;out25 | 
|  | mova [rsp+gprsize*2+16*9 ], m5                        ;out6 | 
|  | paddsw                  m3, m4, m1                    ;t18 | 
|  | psubsw                  m4, m1                        ;t21 | 
|  | psubsw                  m5, m2, m3                    ;out18 | 
|  | paddsw                  m2, m3                        ;out13 | 
|  | mova                    m1, [rsp+gprsize*2+16*29]     ;t26a | 
|  | mova                    m3, [rsp+gprsize*2+16*32]     ;t29a | 
|  | mova [rsp+gprsize*2+16*21], m5                        ;out18 | 
|  | mova [rsp+gprsize*2+16*16], m2                        ;out13 | 
|  | psubsw                  m5, m3, m1                    ;t26 | 
|  | paddsw                  m3, m1                        ;t29 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, 2896, 2896    ;t21a, t26a | 
|  | mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2 | 
|  | psubsw                  m1, m2, m3                    ;out29 | 
|  | paddsw                  m2, m3                        ;out2 | 
|  | mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10 | 
|  | mova [rsp+gprsize*2+16*32], m1                        ;out29 | 
|  | psubsw                  m7, m3, m5                    ;out21 | 
|  | paddsw                  m3, m5                        ;out10 | 
|  | mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5 | 
|  | mova [rsp+gprsize*2+16*24], m7                        ;out21 | 
|  | mova [rsp+gprsize*2+16*13], m3                        ;out10 | 
|  | psubsw                  m1, m5, m4                    ;out26 | 
|  | paddsw                  m5, m4                        ;out5 | 
|  | mova                    m7, m6                        ;out7 | 
|  | mova                    m3, [rsp+gprsize*2+16*6 ]     ;out3 | 
|  | mova                    m4, [rsp+gprsize*2+16*7 ]     ;out4 | 
|  | mova [rsp+gprsize*2+16*29], m1                        ;out26 | 
|  | mova                    m6, [rsp+gprsize*2+16*9 ]     ;out6 | 
|  | mova                    m1, [rsp+gprsize*2+16*4 ]     ;out1 | 
|  | ret | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  | call  m(idct_32x8_internal_8bpc) | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_8192)] | 
|  | mov               [coeffq], eobd | 
|  | mov                    r3d, 8 | 
|  | lea                   tx2q, [o(.end)] | 
|  |  | 
|  | .body: | 
|  | pmulhrsw                m0, m2 | 
|  | movd                    m2, [o(pw_2048)]  ;intentionally rip-relative | 
|  | pmulhrsw                m0, m1 | 
|  | pmulhrsw                m0, m2 | 
|  | pshuflw                 m0, m0, q0000 | 
|  | punpcklwd               m0, m0 | 
|  | pxor                    m5, m5 | 
|  |  | 
|  | .loop: | 
|  | mova                    m1, [dstq+16*0] | 
|  | mova                    m3, [dstq+16*1] | 
|  | punpckhbw               m2, m1, m5 | 
|  | punpcklbw               m1, m5 | 
|  | punpckhbw               m4, m3, m5 | 
|  | punpcklbw               m3, m5 | 
|  | paddw                   m2, m0 | 
|  | paddw                   m1, m0 | 
|  | paddw                   m4, m0 | 
|  | paddw                   m3, m0 | 
|  | packuswb                m1, m2 | 
|  | packuswb                m3, m4 | 
|  | mova           [dstq+16*0], m1 | 
|  | mova           [dstq+16*1], m3 | 
|  | add                   dstq, strideq | 
|  | dec                    r3d | 
|  | jg .loop | 
|  | jmp                   tx2q | 
|  |  | 
|  | .end: | 
|  | RET | 
|  |  | 
|  |  | 
|  | cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS     coeffq+16*0, 64 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+16*2, 64 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+16*1, 32 | 
|  | mova   [rsp+gprsize+16*19], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*21], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*24], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*25], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*20], m7                        ;in15 | 
|  |  | 
|  | cmp                   eobd, 106 | 
|  | jg  .full | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  | jmp .pass2 | 
|  |  | 
|  | .full: | 
|  | LOAD_8ROWS    coeffq+16*17, 32 | 
|  | mova   [rsp+gprsize+16*33], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*28], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*29], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*32], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*31], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*30], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*27], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*34], m7                        ;in31 | 
|  | call m(idct_8x32_internal_8bpc).main | 
|  |  | 
|  | .pass2: | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | lea                   tx2q, [o(.end)] | 
|  | jmp  m(idct_8x32_internal_8bpc).end1 | 
|  |  | 
|  | .end: | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .end1: | 
|  | lea                     r3, [dstq+8] | 
|  | lea                   tx2q, [o(.end2)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  | .end2: | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .end3: | 
|  | mov                   dstq, r3 | 
|  | add                     r3, 8 | 
|  | lea                   tx2q, [o(.end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  | .end4: | 
|  | LOAD_8ROWS   rsp+gprsize+16*19, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.end5)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .end5: | 
|  | mov                   dstq, r3 | 
|  | add                     r3, 8 | 
|  | lea                   tx2q, [o(.end6)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  | .end6: | 
|  | LOAD_8ROWS   rsp+gprsize+16*27, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.end7)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .end7: | 
|  | mov                   dstq, r3 | 
|  | lea                   tx2q, [o(.end8)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass2_main | 
|  |  | 
|  | .end8: | 
|  | ret | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 | 
|  | mov                    r5d, 4 | 
|  | mov                   tx2d, 2 | 
|  | cmp                   eobd, 107 | 
|  | cmovns                tx2d, r5d | 
|  | mov                    r3d, tx2d | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)] | 
|  | .loop: | 
|  | LOAD_8ROWS     coeffq+16*0, 64 | 
|  | paddsw                  m6, [o(pw_5)] | 
|  | mova            [rsp+16*1], m6 | 
|  | mova                    m6, [o(pw_5)] | 
|  | REPX        {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 | 
|  | call  m(idct_8x8_internal_8bpc).pass1_end3 | 
|  | REPX        {psraw  x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 | 
|  | mova            [rsp+16*2], m5 | 
|  | mova            [rsp+16*1], m6 | 
|  | mova            [rsp+16*0], m7 | 
|  | call  m(idct_8x8_internal_8bpc).end3 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | pxor                    m7, m7 | 
|  | REPX   {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 | 
|  | add                 coeffq, 16 | 
|  | dec                    r3d | 
|  | jg .loop | 
|  | RET | 
|  |  | 
|  | cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 | 
|  | mov                    r5d, 4 | 
|  | mov                   tx2d, 2 | 
|  | cmp                   eobd, 107 | 
|  | cmovns                tx2d, r5d | 
|  | mov                    r3d, tx2d | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  |  | 
|  | .loop: | 
|  | LOAD_8ROWS     coeffq+16*0, 16 | 
|  | pmulhrsw                m6, [o(pw_4096)] | 
|  | mova            [rsp+16*1], m6 | 
|  | mova                    m6, [o(pw_4096)] | 
|  | REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 | 
|  | lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)] | 
|  | call  m(idct_8x8_internal_8bpc).pass1_end3 | 
|  |  | 
|  | mov             [rsp+16*3], dstq | 
|  | mova            [rsp+16*2], m5 | 
|  | mova            [rsp+16*1], m6 | 
|  | mova            [rsp+16*0], m7 | 
|  | lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)] | 
|  | call  m(idct_8x8_internal_8bpc).end3 | 
|  |  | 
|  | add                 coeffq, 16*8 | 
|  | mov                   dstq, [rsp+16*3] | 
|  | lea                   dstq, [dstq+8] | 
|  | dec                    r3d | 
|  | jg .loop | 
|  | jnc .loop | 
|  | RET | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  | call  m(idct_16x32_internal_8bpc) | 
|  | .end: | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_16384)] | 
|  | mov               [coeffq], eobd | 
|  | pmulhrsw                m0, m1 | 
|  | mov                    r2d, 16 | 
|  | lea                   tx2q, [o(.end)] | 
|  | jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly | 
|  |  | 
|  |  | 
|  | cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | LOAD_8ROWS     coeffq+16*1, 128, 1 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+16*5, 128, 1 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end1: | 
|  | mova        [coeffq+16*1 ], m0                        ;in8 | 
|  | mova        [coeffq+16*5 ], m4                        ;in12 | 
|  | mova   [rsp+gprsize+16*13], m2                        ;in10 | 
|  | mova   [rsp+gprsize+16*14], m6                        ;in14 | 
|  | mova   [rsp+gprsize+16*21], m1                        ;in9 | 
|  | mova   [rsp+gprsize+16*24], m3                        ;in11 | 
|  | mova   [rsp+gprsize+16*25], m5                        ;in13 | 
|  | mova   [rsp+gprsize+16*20], m7                        ;in15 | 
|  | LOAD_8ROWS     coeffq+16*0, 128, 1 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+16*4, 128, 1 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end3: | 
|  | mova   [rsp+gprsize+16*11], m2                        ;in2 | 
|  | mova   [rsp+gprsize+16*12], m6                        ;in6 | 
|  | mova   [rsp+gprsize+16*19], m1                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m3                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m5                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m7                        ;in7 | 
|  |  | 
|  | cmp                   eobd, 150 | 
|  | jg .full | 
|  |  | 
|  | mova                    m1, m4                        ;in4 | 
|  | mova                    m2, [coeffq+16*1 ]            ;in8 | 
|  | mova                    m3, [coeffq+16*5 ]            ;in12 | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | mova                    m0, [rsp+gprsize+16*11]       ;in2 | 
|  | mova                    m1, [rsp+gprsize+16*12]       ;in6 | 
|  | mova                    m2, [rsp+gprsize+16*13]       ;in10 | 
|  | mova                    m3, [rsp+gprsize+16*14]       ;in14 | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  | jmp  .pass2 | 
|  |  | 
|  | .full: | 
|  | mova        [coeffq+16*0 ], m0                        ;in0 | 
|  | mova        [coeffq+16*4 ], m4                        ;in4 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+16*2, 128, 1 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+16*6, 128, 1 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | lea                   tx2q, [o(.pass1_end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end4: | 
|  | SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end5)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end5: | 
|  | mova        [coeffq+16*2 ], m0                        ;in16 | 
|  | mova        [coeffq+16*6 ], m4                        ;in20 | 
|  | mova   [rsp+gprsize+16*15], m2                        ;in18 | 
|  | mova   [rsp+gprsize+16*16], m6                        ;in22 | 
|  | mova   [rsp+gprsize+16*33], m1                        ;in17 | 
|  | mova   [rsp+gprsize+16*28], m3                        ;in19 | 
|  | mova   [rsp+gprsize+16*29], m5                        ;in21 | 
|  | mova   [rsp+gprsize+16*32], m7                        ;in23 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+16*3, 128, 1 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+16*7, 128, 1 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | lea                   tx2q, [o(.pass1_end6)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end6: | 
|  | SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end7)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end7: | 
|  | mova   [rsp+gprsize+16*17], m2                        ;in26 | 
|  | mova   [rsp+gprsize+16*18], m6                        ;in30 | 
|  | mova   [rsp+gprsize+16*31], m1                        ;in25 | 
|  | mova   [rsp+gprsize+16*30], m3                        ;in27 | 
|  | mova   [rsp+gprsize+16*27], m5                        ;in29 | 
|  | mova   [rsp+gprsize+16*34], m7                        ;in31 | 
|  |  | 
|  | mova                    m6, m0                        ;in24 | 
|  | mova                    m7, m4                        ;in28 | 
|  | mova                    m0, [coeffq+16*0 ]            ;in0 | 
|  | mova                    m1, [coeffq+16*4 ]            ;in4 | 
|  | mova                    m2, [coeffq+16*1 ]            ;in8 | 
|  | mova                    m3, [coeffq+16*5 ]            ;in12 | 
|  | mova                    m4, [coeffq+16*2 ]            ;in16 | 
|  | mova                    m5, [coeffq+16*6 ]            ;in20 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS   rsp+gprsize+16*3 , 16 | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main | 
|  |  | 
|  | .pass2: | 
|  | mov  [rsp+gprsize*1+16*35], eobd | 
|  | lea                     r3, [dstq+8] | 
|  | mov  [rsp+gprsize*2+16*35], r3 | 
|  | lea                     r3, [o(.end)] | 
|  | jmp  m(idct_8x32_internal_8bpc).end | 
|  |  | 
|  | .end: | 
|  | mov                   dstq, [rsp+gprsize*2+16*35] | 
|  | mov                   eobd, [rsp+gprsize*1+16*35] | 
|  | add                 coeffq, 16*32 | 
|  |  | 
|  | mova                    m0, [coeffq+16*4 ]            ;in1 | 
|  | mova                    m1, [coeffq+16*12]            ;in3 | 
|  | mova                    m2, [coeffq+16*20]            ;in5 | 
|  | mova                    m3, [coeffq+16*28]            ;in7 | 
|  | mova                    m4, [coeffq+16*5 ]            ;in9 | 
|  | mova                    m5, [coeffq+16*13]            ;in11 | 
|  | mova                    m6, [coeffq+16*21]            ;in13 | 
|  | mova                    m7, [coeffq+16*29]            ;in15 | 
|  |  | 
|  | mova   [rsp+gprsize+16*19], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*21], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*24], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*25], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*20], m7                        ;in15 | 
|  |  | 
|  | mova                    m0, [coeffq+16*0 ]            ;in0 | 
|  | mova                    m1, [coeffq+16*16]            ;in4 | 
|  | mova                    m2, [coeffq+16*1 ]            ;in8 | 
|  | mova                    m3, [coeffq+16*17]            ;in12 | 
|  |  | 
|  | cmp                   eobd, 150 | 
|  | jg .full1 | 
|  |  | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*8 ]            ;in2 | 
|  | mova                    m1, [coeffq+16*24]            ;in6 | 
|  | mova                    m2, [coeffq+16*9 ]            ;in10 | 
|  | mova                    m3, [coeffq+16*25]            ;in14 | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  | jmp m(idct_8x32_internal_8bpc).pass2 | 
|  |  | 
|  | .full1: | 
|  | mova                    m4, [coeffq+16*2 ]            ;in16 | 
|  | mova                    m5, [coeffq+16*18]            ;in20 | 
|  | mova                    m6, [coeffq+16*3 ]            ;in24 | 
|  | mova                    m7, [coeffq+16*19]            ;in26 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*8 ]            ;in2 | 
|  | mova                    m1, [coeffq+16*24]            ;in6 | 
|  | mova                    m2, [coeffq+16*9 ]            ;in10 | 
|  | mova                    m3, [coeffq+16*25]            ;in14 | 
|  | mova                    m4, [coeffq+16*10]            ;in18 | 
|  | mova                    m5, [coeffq+16*26]            ;in22 | 
|  | mova                    m6, [coeffq+16*11]            ;in26 | 
|  | mova                    m7, [coeffq+16*27]            ;in30 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*6 ]            ;in17 | 
|  | mova                    m1, [coeffq+16*14]            ;in19 | 
|  | mova                    m2, [coeffq+16*22]            ;in21 | 
|  | mova                    m3, [coeffq+16*30]            ;in23 | 
|  | mova                    m4, [coeffq+16*7 ]            ;in25 | 
|  | mova                    m5, [coeffq+16*15]            ;in27 | 
|  | mova                    m6, [coeffq+16*23]            ;in29 | 
|  | mova                    m7, [coeffq+16*31]            ;in31 | 
|  |  | 
|  | mova   [rsp+gprsize+16*33], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*28], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*29], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*32], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*31], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*30], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*27], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*34], m7                        ;in31 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main | 
|  | jmp m(idct_8x32_internal_8bpc).pass2 | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  |  | 
|  | call m(idct_32x16_internal_8bpc) | 
|  | call m(idct_8x16_internal_8bpc).pass2 | 
|  |  | 
|  | add                 coeffq, 16*16 | 
|  | lea                   dstq, [r3+8] | 
|  | LOAD_8ROWS       rsp+16*11, 16 | 
|  | mova            [rsp+16*0], m7 | 
|  | lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)] | 
|  | call  m(idct_8x8_internal_8bpc).pass1_end | 
|  | call m(idct_8x16_internal_8bpc).pass2 | 
|  |  | 
|  | add                 coeffq, 16*16 | 
|  | lea                   dstq, [r3+8] | 
|  | LOAD_8ROWS       rsp+16*19, 16 | 
|  | mova            [rsp+16*0], m7 | 
|  | lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)] | 
|  | call  m(idct_8x8_internal_8bpc).pass1_end | 
|  | call m(idct_8x16_internal_8bpc).pass2 | 
|  |  | 
|  | add                 coeffq, 16*16 | 
|  | lea                   dstq, [r3+8] | 
|  | LOAD_8ROWS       rsp+16*27, 16 | 
|  | mova            [rsp+16*0], m7 | 
|  | lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)] | 
|  | call  m(idct_8x8_internal_8bpc).pass1_end | 
|  | call m(idct_8x16_internal_8bpc).pass2 | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_16384)] | 
|  | mov               [coeffq], eobd | 
|  | pmulhrsw                m0, m1 | 
|  | mov                    r3d, 16 | 
|  | lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] | 
|  | jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body | 
|  |  | 
|  |  | 
|  | cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | add                 coeffq, 16 | 
|  | lea                     r3, [o(.pass1_end1)] | 
|  | .pass1: | 
|  | LOAD_8ROWS     coeffq+16*0, 128, 1 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+16*4, 128, 1 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+16*2, 64, 1 | 
|  | mova   [rsp+gprsize+16*19], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*21], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*24], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*25], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*20], m7                        ;in15 | 
|  |  | 
|  | LOAD_8ROWS    coeffq+16*34, 64, 1 | 
|  | mova   [rsp+gprsize+16*33], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*28], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*29], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*32], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*31], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*30], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*27], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*34], m7                        ;in31 | 
|  | call m(idct_8x32_internal_8bpc).main | 
|  |  | 
|  | .pass1_end: | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | mov                   tx2q, r3 | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+16*0, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS    coeffq+16*16, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*19, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | lea                   tx2q, [o(.pass1_end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end3: | 
|  | SAVE_8ROWS    coeffq+16*32, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*27, 16 | 
|  | mova   [rsp+gprsize+16*0 ], m7 | 
|  | lea                   tx2q, [o(.pass1_end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end4: | 
|  | SAVE_8ROWS    coeffq+16*48, 32 | 
|  |  | 
|  | sub                 coeffq, 16 | 
|  | lea                     r3, [o(.end)] | 
|  | jmp .pass1 | 
|  |  | 
|  | .end: | 
|  | ret | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 | 
|  | mov                    r4d, eobd | 
|  | cmp                   eobd, 43                ;if (eob > 43) | 
|  | sbb                    r3d, r3d               ;  iteration_count++ | 
|  | cmp                    r4d, 150               ;if (eob > 150) | 
|  | sbb                    r3d, 0                 ;  iteration_count++ | 
|  | cmp                    r4d, 278               ;if (eob > 278) | 
|  | sbb                    r3d, -4                ;  iteration_count++ | 
|  |  | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | lea                     r4, [dstq+8] | 
|  | mov             [rsp+16*3], r4 | 
|  | mov     [rsp+gprsize+16*3], r3d | 
|  | mov   [rsp+gprsize*2+16*3], coeffq | 
|  |  | 
|  | .loop: | 
|  | LOAD_8ROWS          coeffq, 64, 1 | 
|  | mova            [rsp+16*1], m6 | 
|  | pxor                    m6, m6 | 
|  | REPX   {mova [coeffq+64*x], m6}, 0,  1,  2,  3,  4,  5,  6,  7 | 
|  | lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)] | 
|  | call  m(idct_8x8_internal_8bpc).pass1_end3 | 
|  | mova            [rsp+16*0], m2 | 
|  | mova            [rsp+16*1], m3 | 
|  | mova            [rsp+16*2], m4 | 
|  | mova                    m3, [o(pw_1697x16)] | 
|  | mova                    m4, [o(pw_16384)] | 
|  | REPX   {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 | 
|  | mova                    m2, [o(pw_8192)] | 
|  | REPX      {pmulhrsw x, m2}, m5, m6, m7, m0, m1 | 
|  | mova                    m2, [rsp+16*0] | 
|  | mova            [rsp+16*0], m7 | 
|  | IDTX16                   2, 7, 3, 4 | 
|  | mova                    m7, [rsp+16*2] | 
|  | mova            [rsp+16*2], m5 | 
|  | IDTX16                   7, 5, 3, 4 | 
|  | mova                    m5, [rsp+16*1] | 
|  | mova            [rsp+16*1], m6 | 
|  | pmulhrsw                m3, m5 | 
|  | pmulhrsw                m3, m4 | 
|  | psrlw                   m4, 1 ; pw_8192 | 
|  | paddsw                  m3, m5 | 
|  | pmulhrsw                m2, m4 | 
|  | pmulhrsw                m3, m4 | 
|  | pmulhrsw                m4, m7 | 
|  | call  m(idct_8x8_internal_8bpc).end3 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | add                 coeffq, 16 | 
|  | dec                    r3d | 
|  | jg .loop | 
|  | mov                 coeffq, [rsp+gprsize*2+16*3] | 
|  | add                 coeffq, 64*8 | 
|  | mov                    r3d, [rsp+gprsize+16*3] | 
|  | xor                   dstq, dstq | 
|  | mov     [rsp+gprsize+16*3], dstq | 
|  | mov                   dstq, [rsp+16*3] | 
|  | test                   r3d, r3d | 
|  | jnz .loop | 
|  | RET | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 | 
|  | mov                    r4d, 12                ;0100b | 
|  | mov                    r5d, 136               ;1000 1000b | 
|  | cmp                   eobd, 44                ;if (eob > 43) | 
|  | cmovns                 r4d, r5d               ;  iteration_count+2 | 
|  | cmp                   eobd, 151               ;if (eob > 150) | 
|  | mov                    r3d, 34952             ;1000 1000 1000 1000b | 
|  | cmovs                  r3d, r4d               ;  iteration_count += 4 | 
|  |  | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | lea                     r4, [dstq+8] | 
|  | mov             [rsp+16*3], r4 | 
|  |  | 
|  | .loop: | 
|  | LOAD_8ROWS          coeffq, 32, 1 | 
|  | REPX         {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 | 
|  | mova            [rsp+16*1], m6 | 
|  | lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)] | 
|  | call  m(idct_8x8_internal_8bpc).pass1_end3 | 
|  | mova            [rsp+16*1], m5 | 
|  | mova            [rsp+16*2], m6 | 
|  | mova                    m6, [o(pw_1697x16)] | 
|  | REPX      {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 | 
|  | pmulhrsw                m7, [o(pw_2048)] | 
|  | mova                    m5, [rsp+16*1] | 
|  | mova            [rsp+16*0], m7 | 
|  | IDTX16                   5, 7, 6 | 
|  | mova                    m7, [rsp+16*2] | 
|  | IDTX16                   7, 6, 6 | 
|  | mova                    m6, [o(pw_2048)] | 
|  | REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 | 
|  | mova            [rsp+16*2], m5 | 
|  | mova            [rsp+16*1], m7 | 
|  | call  m(idct_8x8_internal_8bpc).end3 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | pxor                    m7, m7 | 
|  | REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 | 
|  |  | 
|  | .loop_end: | 
|  | add                 coeffq, 16 | 
|  | shr                    r3d, 2 | 
|  | jz .ret | 
|  | test                   r3d, 2 | 
|  | jnz .loop | 
|  | mov                    r4d, r3d | 
|  | and                    r4d, 1 | 
|  | lea                 coeffq, [coeffq+r4*8+32*7] | 
|  | mov                   dstq, [rsp+16*3] | 
|  | lea                     r4, [dstq+8] | 
|  | mov             [rsp+16*3], r4 | 
|  | jmp .loop | 
|  |  | 
|  | .ret: | 
|  | RET | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  |  | 
|  | call m(idct_32x32_internal_8bpc) | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_8192)] | 
|  | mov               [coeffq], eobd | 
|  | mov                    r3d, 32 | 
|  | lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] | 
|  | jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body | 
|  |  | 
|  |  | 
|  | cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mov                    r4d, 2 | 
|  | sub                   eobd, 136 | 
|  | mov  [rsp+gprsize*1+16*35], eobd | 
|  | mov                    r3d, 4 | 
|  | cmovs                  r3d, r4d | 
|  |  | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  |  | 
|  | mov  [rsp+gprsize*2+16*35], coeffq | 
|  |  | 
|  | .pass1_loop: | 
|  | LOAD_8ROWS     coeffq+64*1, 64*2 | 
|  | mova   [rsp+gprsize+16*19], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*21], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*24], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*25], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*20], m7                        ;in15 | 
|  |  | 
|  | mov                   tx2d, [rsp+gprsize*1+16*35] | 
|  | test                  tx2d, tx2d | 
|  | jl .fast | 
|  |  | 
|  | .full: | 
|  | LOAD_8ROWS     coeffq+64*0, 64*4 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+64*2, 64*4 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | LOAD_8ROWS    coeffq+64*17, 64*2 | 
|  | mova   [rsp+gprsize+16*33], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*28], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*29], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*32], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*31], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*30], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*27], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*34], m7                        ;in31 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main | 
|  | jmp .pass1_end | 
|  |  | 
|  | .fast: | 
|  | mova                    m0, [coeffq+256*0] | 
|  | mova                    m1, [coeffq+256*1] | 
|  | mova                    m2, [coeffq+256*2] | 
|  | mova                    m3, [coeffq+256*3] | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  |  | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | mova                    m0, [coeffq+128*1] | 
|  | mova                    m1, [coeffq+128*3] | 
|  | mova                    m2, [coeffq+128*5] | 
|  | mova                    m3, [coeffq+128*7] | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  |  | 
|  | .pass1_end: | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+64*0, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS     coeffq+64*8, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*19, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end3: | 
|  | SAVE_8ROWS    coeffq+64*16, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*27, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end4: | 
|  | SAVE_8ROWS    coeffq+64*24, 64 | 
|  |  | 
|  | add                 coeffq, 16 | 
|  | dec                    r3d | 
|  | jg .pass1_loop | 
|  |  | 
|  |  | 
|  | .pass2: | 
|  | mov                 coeffq, [rsp+gprsize*2+16*35] | 
|  | mov                    r3d, 4 | 
|  | lea                   tx2q, [o(.pass2_end)] | 
|  |  | 
|  | .pass2_loop: | 
|  | mov  [rsp+gprsize*3+16*35], r3d | 
|  | lea                     r3, [dstq+8] | 
|  | mov  [rsp+gprsize*2+16*35], r3 | 
|  |  | 
|  | mova                    m0, [coeffq+16*4 ] | 
|  | mova                    m1, [coeffq+16*12] | 
|  | mova                    m2, [coeffq+16*20] | 
|  | mova                    m3, [coeffq+16*28] | 
|  | mova                    m4, [coeffq+16*5 ] | 
|  | mova                    m5, [coeffq+16*13] | 
|  | mova                    m6, [coeffq+16*21] | 
|  | mova                    m7, [coeffq+16*29] | 
|  | mova   [rsp+gprsize+16*19], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*21], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*24], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*25], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*20], m7                        ;in15 | 
|  |  | 
|  | mov                   eobd, [rsp+gprsize*1+16*35] | 
|  | test                  eobd, eobd | 
|  | jl .fast1 | 
|  |  | 
|  | .full1: | 
|  | mova                    m0, [coeffq+16*0 ] | 
|  | mova                    m1, [coeffq+16*16] | 
|  | mova                    m2, [coeffq+16*1 ] | 
|  | mova                    m3, [coeffq+16*17] | 
|  | mova                    m4, [coeffq+16*2 ] | 
|  | mova                    m5, [coeffq+16*18] | 
|  | mova                    m6, [coeffq+16*3 ] | 
|  | mova                    m7, [coeffq+16*19] | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*8 ] | 
|  | mova                    m1, [coeffq+16*24] | 
|  | mova                    m2, [coeffq+16*9 ] | 
|  | mova                    m3, [coeffq+16*25] | 
|  | mova                    m4, [coeffq+16*10] | 
|  | mova                    m5, [coeffq+16*26] | 
|  | mova                    m6, [coeffq+16*11] | 
|  | mova                    m7, [coeffq+16*27] | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*6 ] | 
|  | mova                    m1, [coeffq+16*14] | 
|  | mova                    m2, [coeffq+16*22] | 
|  | mova                    m3, [coeffq+16*30] | 
|  | mova                    m4, [coeffq+16*7 ] | 
|  | mova                    m5, [coeffq+16*15] | 
|  | mova                    m6, [coeffq+16*23] | 
|  | mova                    m7, [coeffq+16*31] | 
|  | mova   [rsp+gprsize+16*33], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*28], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*29], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*32], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*31], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*30], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*27], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*34], m7                        ;in31 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main | 
|  | jmp                   tx2q | 
|  |  | 
|  | .fast1: | 
|  | mova                    m0, [coeffq+16*0 ] | 
|  | mova                    m1, [coeffq+16*16] | 
|  | mova                    m2, [coeffq+16*1 ] | 
|  | mova                    m3, [coeffq+16*17] | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*8 ] | 
|  | mova                    m1, [coeffq+16*24] | 
|  | mova                    m2, [coeffq+16*9 ] | 
|  | mova                    m3, [coeffq+16*25] | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  | jmp                   tx2q | 
|  |  | 
|  | .pass2_end: | 
|  | lea                     r3, [o(.pass2_end1)] | 
|  | jmp  m(idct_8x32_internal_8bpc).end | 
|  |  | 
|  | .pass2_end1: | 
|  | lea                   tx2q, [o(.pass2_end)] | 
|  | add                 coeffq, 16*32 | 
|  | mov                   dstq, [rsp+gprsize*2+16*35] | 
|  | mov                    r3d, [rsp+gprsize*3+16*35] | 
|  | dec                    r3d | 
|  | jg .pass2_loop | 
|  |  | 
|  | ret | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 | 
|  | mov                    r4d, 2 | 
|  | cmp                   eobd, 136 | 
|  | mov                    r3d, 4 | 
|  | cmovs                  r3d, r4d | 
|  |  | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  |  | 
|  | lea                     r4, [dstq+8] | 
|  | mov   [rsp+gprsize*0+16*3], r4 | 
|  | mov   [rsp+gprsize*1+16*3], r3d | 
|  | mov   [rsp+gprsize*2+16*3], r3d | 
|  | mov   [rsp+gprsize*3+16*3], coeffq | 
|  |  | 
|  | .loop: | 
|  | LOAD_8ROWS          coeffq, 64 | 
|  | mova            [rsp+16*1], m6 | 
|  | lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)] | 
|  | call  m(idct_8x8_internal_8bpc).pass1_end3 | 
|  | pmulhrsw                m7, [o(pw_8192)] | 
|  | mova            [rsp+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 | 
|  | mova            [rsp+16*1], m6 | 
|  | mova            [rsp+16*2], m5 | 
|  | call  m(idct_8x8_internal_8bpc).end3 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  |  | 
|  | pxor                    m7, m7 | 
|  | REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7 | 
|  |  | 
|  | add                 coeffq, 16 | 
|  | dec                    r3d | 
|  | jg .loop | 
|  |  | 
|  | mov                    r4d, [rsp+gprsize*2+16*3] | 
|  | dec                    r4d | 
|  | jle .ret | 
|  |  | 
|  | mov                   dstq, [rsp+gprsize*0+16*3] | 
|  | mov                 coeffq, [rsp+gprsize*3+16*3] | 
|  | mov   [rsp+gprsize*2+16*3], r4 | 
|  | lea                     r3, [dstq+8] | 
|  | add                 coeffq, 64*8 | 
|  | mov   [rsp+gprsize*0+16*3], r3 | 
|  | mov                    r3d, [rsp+gprsize*1+16*3] | 
|  | mov   [rsp+gprsize*3+16*3], coeffq | 
|  | jmp .loop | 
|  |  | 
|  | .ret: | 
|  | RET | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  | call m(idct_16x64_internal_8bpc) | 
|  | .end: | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_8192)] | 
|  | mov               [coeffq], eobd | 
|  | mov                    r2d, 32 | 
|  | lea                   tx2q, [o(.end)] | 
|  | jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly | 
|  |  | 
|  |  | 
|  | cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mov                    r4d, 2 | 
|  | sub                   eobd, 151 | 
|  | mov  [rsp+gprsize*1+16*67], eobd | 
|  | mov                    r3d, 4 | 
|  | cmovs                  r3d, r4d | 
|  |  | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  |  | 
|  | mov  [rsp+gprsize*2+16*67], coeffq | 
|  |  | 
|  | .pass1_loop: | 
|  | LOAD_8ROWS     coeffq+64*0, 64*2 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+64*1, 64*2 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS     coeffq+64*8, 64 | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+64*0, 64 | 
|  |  | 
|  | add                 coeffq, 16 | 
|  | dec                    r3d | 
|  | jg .pass1_loop | 
|  |  | 
|  | mov                 coeffq, [rsp+gprsize*2+16*67] | 
|  | mov                    r3d, 2 | 
|  | lea                     r4, [dstq+8] | 
|  | mov  [rsp+gprsize*2+16*67], r4 | 
|  | lea                     r4, [o(.end1)] | 
|  |  | 
|  | .pass2_loop: | 
|  | mov  [rsp+gprsize*3+16*67], r3d | 
|  | mov                   eobd, [rsp+gprsize*1+16*67] | 
|  |  | 
|  | mova                    m0, [coeffq+16*4 ]            ;in1 | 
|  | mova                    m1, [coeffq+16*12]            ;in3 | 
|  | mova                    m2, [coeffq+16*20]            ;in5 | 
|  | mova                    m3, [coeffq+16*28]            ;in7 | 
|  | mova                    m4, [coeffq+16*5 ]            ;in9 | 
|  | mova                    m5, [coeffq+16*13]            ;in11 | 
|  | mova                    m6, [coeffq+16*21]            ;in13 | 
|  | mova                    m7, [coeffq+16*29]            ;in15 | 
|  | mova   [rsp+gprsize+16*35], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*49], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*43], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*41], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*39], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*45], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*47], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*37], m7                        ;in15 | 
|  |  | 
|  | pxor                    m4, m4 | 
|  | mova                    m0, [coeffq+16*0] | 
|  | mova                    m1, [coeffq+16*1] | 
|  |  | 
|  | test                  eobd, eobd | 
|  | jl .fast | 
|  |  | 
|  | .full: | 
|  | mova                    m2, [coeffq+16*2] | 
|  | mova                    m3, [coeffq+16*3] | 
|  |  | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | pxor                    m4, m4 | 
|  | mova                    m0, [coeffq+16*16] | 
|  | mova                    m1, [coeffq+16*17] | 
|  | mova                    m2, [coeffq+16*18] | 
|  | mova                    m3, [coeffq+16*19] | 
|  |  | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*8 ] | 
|  | mova                    m1, [coeffq+16*24] | 
|  | mova                    m2, [coeffq+16*9 ] | 
|  | mova                    m3, [coeffq+16*25] | 
|  | mova                    m4, [coeffq+16*10] | 
|  | mova                    m5, [coeffq+16*26] | 
|  | mova                    m6, [coeffq+16*11] | 
|  | mova                    m7, [coeffq+16*27] | 
|  | mova   [rsp+gprsize+16*19], m0 | 
|  | mova   [rsp+gprsize+16*26], m1 | 
|  | mova   [rsp+gprsize+16*23], m2 | 
|  | mova   [rsp+gprsize+16*22], m3 | 
|  | mova   [rsp+gprsize+16*21], m4 | 
|  | mova   [rsp+gprsize+16*24], m5 | 
|  | mova   [rsp+gprsize+16*25], m6 | 
|  | mova   [rsp+gprsize+16*20], m7 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  | SAVE_8ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*6 ]            ;in17 | 
|  | mova                    m1, [coeffq+16*14]            ;in19 | 
|  | mova                    m2, [coeffq+16*22]            ;in21 | 
|  | mova                    m3, [coeffq+16*30]            ;in23 | 
|  | mova                    m4, [coeffq+16*7 ]            ;in25 | 
|  | mova                    m5, [coeffq+16*15]            ;in27 | 
|  | mova                    m6, [coeffq+16*23]            ;in29 | 
|  | mova                    m7, [coeffq+16*31]            ;in31 | 
|  | mova   [rsp+gprsize+16*63], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*53], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*55], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*61], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*59], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*57], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*51], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*65], m7                        ;in31 | 
|  |  | 
|  | call .main | 
|  | jmp  .end | 
|  |  | 
|  | .fast: | 
|  | REPX          {mova x, m4}, m2, m3, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | pxor                    m4, m4 | 
|  | mova                    m0, [coeffq+16*16] | 
|  | mova                    m1, [coeffq+16*17] | 
|  |  | 
|  | REPX          {mova x, m4}, m2, m3, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | mova                    m0, [coeffq+16*8 ] | 
|  | mova                    m1, [coeffq+16*24] | 
|  | mova                    m2, [coeffq+16*9 ] | 
|  | mova                    m3, [coeffq+16*25] | 
|  | mova   [rsp+gprsize+16*19], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m3                        ;in7 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_veryfast | 
|  | SAVE_8ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | call .main_fast | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS   rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mov                     r3, r4 | 
|  | jmp  m(idct_8x32_internal_8bpc).end2 | 
|  |  | 
|  | .end1: | 
|  | LOAD_8ROWS   rsp+gprsize+16*35, 16 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | lea                     r3, [rsp+16*32+gprsize] | 
|  | call .write | 
|  | mov                   dstq, [rsp+gprsize*2+16*67] | 
|  | mov                    r3d, [rsp+gprsize*3+16*67] | 
|  | lea                     r4, [dstq+8] | 
|  | mov  [rsp+gprsize*2+16*67], r4 | 
|  | lea                     r4, [o(.end1)] | 
|  |  | 
|  | dec                    r3d | 
|  | jg .pass2_loop | 
|  | ret | 
|  | .write: | 
|  | mova             [r3+16*0], m7 | 
|  | mov                     r4, -16*32 | 
|  | pxor                    m7, m7 | 
|  | sub                 coeffq, r4 | 
|  | .zero_loop: | 
|  | mova      [coeffq+r4+16*0], m7 | 
|  | mova      [coeffq+r4+16*1], m7 | 
|  | add                     r4, 16*2 | 
|  | jl .zero_loop | 
|  | call .write_main2 | 
|  | LOAD_8ROWS        r3+16*11, 16 | 
|  | call .write_main | 
|  | LOAD_8ROWS        r3+16*19, 16 | 
|  | call .write_main | 
|  | LOAD_8ROWS        r3+16*27, 16 | 
|  | .write_main: | 
|  | mova             [r3+16*0], m7 | 
|  | .write_main2: | 
|  | mova                    m7, [o(pw_2048)] | 
|  | REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 | 
|  | pmulhrsw                m7, [r3+16*0] | 
|  | mova             [r3+16*2], m5 | 
|  | mova             [r3+16*1], m6 | 
|  | mova             [r3+16*0], m7 | 
|  | WRITE_8X4                0, 1, 2, 3, 5, 6, 7 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | WRITE_8X4                4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | ret | 
|  |  | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main_fast | 
|  | mova                    m0, [rsp+gprsize*2+16*35]     ;in1 | 
|  | pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t62,t63 | 
|  | pmulhrsw                m0, [o(pw_101x8)]             ;t32,t33 | 
|  | mova                    m7, [o(pd_2048)] | 
|  | mova [rsp+gprsize*2+16*35], m0                        ;t32 | 
|  | mova [rsp+gprsize*2+16*66], m3                        ;t63 | 
|  | ITX_MULSUB_2W            3, 0, 1, 2, 7,  401, 4076    ;t33a, t62a | 
|  | mova [rsp+gprsize*2+16*36], m3                        ;t33a | 
|  | mova [rsp+gprsize*2+16*65], m0                        ;t62a | 
|  |  | 
|  | mova                    m1, [rsp+gprsize*2+16*37]     ;in15 | 
|  | pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60,t61 | 
|  | pmulhrsw                m1, [o(pw_m1474x8)]           ;t34,t35 | 
|  | mova [rsp+gprsize*2+16*38], m1                        ;t35 | 
|  | mova [rsp+gprsize*2+16*63], m2                        ;t60 | 
|  | ITX_MULSUB_2W            2, 1, 0, 3, 7, m4076, 401    ;t34a, t61a | 
|  | mova [rsp+gprsize*2+16*37], m2                        ;t34a | 
|  | mova [rsp+gprsize*2+16*64], m1                        ;t61a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*39]     ;in9 | 
|  | pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t58,t59 | 
|  | pmulhrsw                m0, [o(pw_897x8)]             ;t36,t37 | 
|  | mova [rsp+gprsize*2+16*39], m0                        ;t36 | 
|  | mova [rsp+gprsize*2+16*62], m3                        ;t59 | 
|  | ITX_MULSUB_2W            3, 0, 1, 2, 7, 3166, 2598    ;t37a, t58a | 
|  | mova [rsp+gprsize*2+16*40], m3                        ;t37a | 
|  | mova [rsp+gprsize*2+16*61], m0                        ;t58a | 
|  |  | 
|  | mova                    m1, [rsp+gprsize*2+16*41]     ;in7 | 
|  | pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56,t57 | 
|  | pmulhrsw                m1, [o(pw_m700x8)]            ;t38,t39 | 
|  | mova [rsp+gprsize*2+16*42], m1                        ;t39 | 
|  | mova [rsp+gprsize*2+16*59], m2                        ;t56 | 
|  | ITX_MULSUB_2W            2, 1, 0, 3, 7, m2598, 3166   ;t38a, t57a | 
|  | mova [rsp+gprsize*2+16*41], m2                        ;t38a | 
|  | mova [rsp+gprsize*2+16*60], m1                        ;t57a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*43]     ;in5 | 
|  | pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t54,t55 | 
|  | pmulhrsw                m0, [o(pw_501x8)]             ;t40,t41 | 
|  | mova [rsp+gprsize*2+16*43], m0                        ;t40 | 
|  | mova [rsp+gprsize*2+16*58], m3                        ;t55 | 
|  | ITX_MULSUB_2W            3, 0, 1, 2, 7, 1931, 3612    ;t41a, t54a | 
|  | mova [rsp+gprsize*2+16*44], m3                        ;t41a | 
|  | mova [rsp+gprsize*2+16*57], m0                        ;t54a | 
|  |  | 
|  | mova                    m1, [rsp+gprsize*2+16*45]     ;in11 | 
|  | pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52,t53 | 
|  | pmulhrsw                m1, [o(pw_m1092x8)]           ;t42,t43 | 
|  | mova [rsp+gprsize*2+16*46], m1                        ;t43 | 
|  | mova [rsp+gprsize*2+16*55], m2                        ;t52 | 
|  | ITX_MULSUB_2W            2, 1, 0, 3, 7, m3612, 1931   ;t42a, t53a | 
|  | mova [rsp+gprsize*2+16*45], m2                        ;t42a | 
|  | mova [rsp+gprsize*2+16*56], m1                        ;t53a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*47]     ;in13 | 
|  | pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t50,t51 | 
|  | pmulhrsw                m0, [o(pw_1285x8)]            ;t44,t45 | 
|  | mova                    m6, m0 | 
|  | mova [rsp+gprsize*2+16*54], m3                        ;t51 | 
|  | ITX_MULSUB_2W            3, 0, 1, 2, 7, 3920, 1189    ;t45a, t50a | 
|  | mova [rsp+gprsize*2+16*48], m3                        ;t45a | 
|  | mova [rsp+gprsize*2+16*53], m0                        ;t50a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*49]     ;in3 | 
|  | pmulhrsw                m3, m0, [o(pw_4085x8)]        ;t48,t49 | 
|  | pmulhrsw                m0, [o(pw_m301x8)]            ;t46,t47 | 
|  | mova                    m4, m3 | 
|  | mova                    m5, m0 | 
|  |  | 
|  | jmp .main2 | 
|  |  | 
|  | ALIGN function_align | 
|  | cglobal_label .main | 
|  | mova                    m0, [rsp+gprsize*2+16*35]     ;in1 | 
|  | mova                    m1, [rsp+gprsize*2+16*65]     ;in31 | 
|  | pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t63a | 
|  | pmulhrsw                m0, [o(pw_101x8)]             ;t32a | 
|  | pmulhrsw                m2, m1, [o(pw_2967x8)]        ;t62a | 
|  | pmulhrsw                m1, [o(pw_m2824x8)]           ;t33a | 
|  | mova                    m7, [o(pd_2048)] | 
|  | psubsw                  m4, m0, m1                    ;t33 | 
|  | paddsw                  m0, m1                        ;t32 | 
|  | psubsw                  m5, m3, m2                    ;t62 | 
|  | paddsw                  m3, m2                        ;t63 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7,  401, 4076    ;t33a, t62a | 
|  | mova [rsp+gprsize*2+16*35], m0                        ;t32 | 
|  | mova [rsp+gprsize*2+16*36], m5                        ;t33a | 
|  | mova [rsp+gprsize*2+16*65], m4                        ;t62a | 
|  | mova [rsp+gprsize*2+16*66], m3                        ;t63 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*63]     ;in17 | 
|  | mova                    m1, [rsp+gprsize*2+16*37]     ;in15 | 
|  | pmulhrsw                m3, m0, [o(pw_3745x8)]        ;t61a | 
|  | pmulhrsw                m0, [o(pw_1660x8)]            ;t34a | 
|  | pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60a | 
|  | pmulhrsw                m1, [o(pw_m1474x8)]           ;t35a | 
|  | psubsw                  m4, m1, m0                    ;t34 | 
|  | paddsw                  m0, m1                        ;t35 | 
|  | psubsw                  m5, m2, m3                    ;t61 | 
|  | paddsw                  m3, m2                        ;t60 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, m4076, 401    ;t34a, t61a | 
|  | mova [rsp+gprsize*2+16*37], m5                        ;t34a | 
|  | mova [rsp+gprsize*2+16*38], m0                        ;t35 | 
|  | mova [rsp+gprsize*2+16*63], m3                        ;t60 | 
|  | mova [rsp+gprsize*2+16*64], m4                        ;t61a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*39]     ;in9 | 
|  | mova                    m1, [rsp+gprsize*2+16*61]     ;in23 | 
|  | pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t59a | 
|  | pmulhrsw                m0, [o(pw_897x8)]             ;t36a | 
|  | pmulhrsw                m2, m1, [o(pw_3461x8)]        ;t58a | 
|  | pmulhrsw                m1, [o(pw_m2191x8)]           ;t37a | 
|  | psubsw                  m4, m0, m1                    ;t37 | 
|  | paddsw                  m0, m1                        ;t36 | 
|  | psubsw                  m5, m3, m2                    ;t58 | 
|  | paddsw                  m3, m2                        ;t59 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, 3166, 2598    ;t37a, t58a | 
|  | mova [rsp+gprsize*2+16*39], m0                        ;t36 | 
|  | mova [rsp+gprsize*2+16*40], m5                        ;t37a | 
|  | mova [rsp+gprsize*2+16*61], m4                        ;t58a | 
|  | mova [rsp+gprsize*2+16*62], m3                        ;t59 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*59]     ;in25 | 
|  | mova                    m1, [rsp+gprsize*2+16*41]     ;in7 | 
|  | pmulhrsw                m3, m0, [o(pw_3349x8)]        ;t57a | 
|  | pmulhrsw                m0, [o(pw_2359x8)]            ;t38a | 
|  | pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56a | 
|  | pmulhrsw                m1, [o(pw_m700x8)]            ;t39a | 
|  | psubsw                  m4, m1, m0                    ;t38 | 
|  | paddsw                  m0, m1                        ;t39 | 
|  | psubsw                  m5, m2, m3                    ;t57 | 
|  | paddsw                  m3, m2                        ;t56 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, m2598, 3166   ;t38a, t57a | 
|  | mova [rsp+gprsize*2+16*41], m5                        ;t38a | 
|  | mova [rsp+gprsize*2+16*42], m0                        ;t39 | 
|  | mova [rsp+gprsize*2+16*59], m3                        ;t56 | 
|  | mova [rsp+gprsize*2+16*60], m4                        ;t57a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*43]     ;in5 | 
|  | mova                    m1, [rsp+gprsize*2+16*57]     ;in27 | 
|  | pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t55a | 
|  | pmulhrsw                m0, [o(pw_501x8)]             ;t40a | 
|  | pmulhrsw                m2, m1, [o(pw_3229x8)]        ;t54a | 
|  | pmulhrsw                m1, [o(pw_m2520x8)]           ;t41a | 
|  | psubsw                  m4, m0, m1                    ;t41 | 
|  | paddsw                  m0, m1                        ;t40 | 
|  | psubsw                  m5, m3, m2                    ;t54 | 
|  | paddsw                  m3, m2                        ;t55 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, 1931, 3612    ;t41a, t54a | 
|  | mova [rsp+gprsize*2+16*43], m0                        ;t40 | 
|  | mova [rsp+gprsize*2+16*44], m5                        ;t41a | 
|  | mova [rsp+gprsize*2+16*57], m4                        ;t54a | 
|  | mova [rsp+gprsize*2+16*58], m3                        ;t55 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*55]     ;in21 | 
|  | mova                    m1, [rsp+gprsize*2+16*45]     ;in11 | 
|  | pmulhrsw                m3, m0, [o(pw_3564x8)]        ;t53a | 
|  | pmulhrsw                m0, [o(pw_2019x8)]            ;t42a | 
|  | pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52a | 
|  | pmulhrsw                m1, [o(pw_m1092x8)]           ;t43a | 
|  | psubsw                  m4, m1, m0                    ;t42 | 
|  | paddsw                  m0, m1                        ;t43 | 
|  | psubsw                  m5, m2, m3                    ;t53 | 
|  | paddsw                  m3, m2                        ;t52 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, m3612, 1931   ;t42a, t53a | 
|  | mova [rsp+gprsize*2+16*45], m5                        ;t42a | 
|  | mova [rsp+gprsize*2+16*46], m0                        ;t43 | 
|  | mova [rsp+gprsize*2+16*55], m3                        ;t52 | 
|  | mova [rsp+gprsize*2+16*56], m4                        ;t53a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*47]     ;in13 | 
|  | mova                    m1, [rsp+gprsize*2+16*53]     ;in19 | 
|  | pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t51a | 
|  | pmulhrsw                m0, [o(pw_1285x8)]            ;t44a | 
|  | pmulhrsw                m2, m1, [o(pw_3659x8)]        ;t50a | 
|  | pmulhrsw                m1, [o(pw_m1842x8)]           ;t45a | 
|  | psubsw                  m4, m0, m1                    ;t45 | 
|  | paddsw                  m0, m1                        ;t44 | 
|  | psubsw                  m5, m3, m2                    ;t50 | 
|  | paddsw                  m3, m2                        ;t51 | 
|  | ITX_MULSUB_2W            5, 4, 1, 2, 7, 3920, 1189    ;t45a, t50a | 
|  | mova                    m6, m0 | 
|  | mova [rsp+gprsize*2+16*48], m5                        ;t45a | 
|  | mova [rsp+gprsize*2+16*53], m4                        ;t50a | 
|  | mova [rsp+gprsize*2+16*54], m3                        ;t51 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*51]     ;in29 | 
|  | mova                    m1, [rsp+gprsize*2+16*49]     ;in3 | 
|  | pmulhrsw                m3, m0, [o(pw_3102x8)]        ;t49a | 
|  | pmulhrsw                m0, [o(pw_2675x8)]            ;t46a | 
|  | pmulhrsw                m2, m1, [o(pw_4085x8)]        ;t48a | 
|  | pmulhrsw                m1, [o(pw_m301x8)]            ;t47a | 
|  | psubsw                  m5, m1, m0                    ;t46 | 
|  | paddsw                  m0, m1                        ;t47 | 
|  | psubsw                  m4, m2, m3                    ;t49 | 
|  | paddsw                  m3, m2                        ;t48 | 
|  |  | 
|  | ALIGN function_align | 
|  | .main2: | 
|  | ITX_MULSUB_2W            4, 5, 1, 2, 7, m1189, 3920   ;t46a, t49a | 
|  | mova                    m1, [rsp+gprsize*2+16*54]     ;t51 | 
|  | psubsw                  m2, m0, m6                    ;t44a | 
|  | paddsw                  m0, m6                        ;t47a | 
|  | psubsw                  m6, m3, m1                    ;t51a | 
|  | paddsw                  m3, m1                        ;t48a | 
|  | mova [rsp+gprsize*2+16*50], m0                        ;t47a | 
|  | mova [rsp+gprsize*2+16*51], m3                        ;t48a | 
|  | ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t44, t51 | 
|  | mova [rsp+gprsize*2+16*47], m6                        ;t44 | 
|  | mova [rsp+gprsize*2+16*54], m2                        ;t51 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*48]     ;t45a | 
|  | mova                    m3, [rsp+gprsize*2+16*53]     ;t50a | 
|  | psubsw                  m2, m4, m0                    ;t45 | 
|  | paddsw                  m4, m0                        ;t46 | 
|  | psubsw                  m6, m5, m3                    ;t50 | 
|  | paddsw                  m5, m3                        ;t49 | 
|  | ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t45a, t50a | 
|  | mova [rsp+gprsize*2+16*48], m6                        ;t45a | 
|  | mova [rsp+gprsize*2+16*49], m4                        ;t46 | 
|  | mova [rsp+gprsize*2+16*52], m5                        ;t49 | 
|  | mova [rsp+gprsize*2+16*53], m2                        ;t50a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*43]     ;t40 | 
|  | mova                    m2, [rsp+gprsize*2+16*46]     ;t43 | 
|  | mova                    m3, [rsp+gprsize*2+16*55]     ;t52 | 
|  | mova                    m1, [rsp+gprsize*2+16*58]     ;t55 | 
|  | psubsw                  m4, m0, m2                    ;t43a | 
|  | paddsw                  m0, m2                        ;t40a | 
|  | psubsw                  m5, m1, m3                    ;t52a | 
|  | paddsw                  m1, m3                        ;t55a | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t43, t52 | 
|  | mova [rsp+gprsize*2+16*43], m0                        ;t40a | 
|  | mova [rsp+gprsize*2+16*46], m5                        ;t43 | 
|  | mova [rsp+gprsize*2+16*55], m4                        ;t52 | 
|  | mova [rsp+gprsize*2+16*58], m1                        ;t55a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*44]     ;t41a | 
|  | mova                    m2, [rsp+gprsize*2+16*45]     ;t42a | 
|  | mova                    m3, [rsp+gprsize*2+16*56]     ;t53a | 
|  | mova                    m1, [rsp+gprsize*2+16*57]     ;t54a | 
|  | psubsw                  m4, m0, m2                    ;t42 | 
|  | paddsw                  m0, m2                        ;t41 | 
|  | psubsw                  m5, m1, m3                    ;t53 | 
|  | paddsw                  m1, m3                        ;t54 | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t42a, t53a | 
|  | mova [rsp+gprsize*2+16*44], m0                        ;t41 | 
|  | mova [rsp+gprsize*2+16*45], m5                        ;t42a | 
|  | mova [rsp+gprsize*2+16*56], m4                        ;t53a | 
|  | mova [rsp+gprsize*2+16*57], m1                        ;t54 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*41]     ;t38a | 
|  | mova                    m2, [rsp+gprsize*2+16*40]     ;t37a | 
|  | mova                    m3, [rsp+gprsize*2+16*61]     ;t58a | 
|  | mova                    m1, [rsp+gprsize*2+16*60]     ;t57a | 
|  | psubsw                  m4, m0, m2                    ;t37 | 
|  | paddsw                  m0, m2                        ;t38 | 
|  | psubsw                  m5, m1, m3                    ;t58 | 
|  | paddsw                  m1, m3                        ;t57 | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t37a, t58a | 
|  | mova [rsp+gprsize*2+16*41], m0                        ;t38 | 
|  | mova [rsp+gprsize*2+16*40], m5                        ;t37a | 
|  | mova [rsp+gprsize*2+16*61], m4                        ;t58a | 
|  | mova [rsp+gprsize*2+16*60], m1                        ;t57 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*42]     ;t39 | 
|  | mova                    m2, [rsp+gprsize*2+16*39]     ;t36 | 
|  | mova                    m3, [rsp+gprsize*2+16*62]     ;t59 | 
|  | mova                    m1, [rsp+gprsize*2+16*59]     ;t56 | 
|  | psubsw                  m4, m0, m2                    ;t36a | 
|  | paddsw                  m0, m2                        ;t39a | 
|  | psubsw                  m5, m1, m3                    ;t59a | 
|  | paddsw                  m1, m3                        ;t56a | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t36, t59 | 
|  | mova [rsp+gprsize*2+16*42], m0                        ;t39a | 
|  | mova [rsp+gprsize*2+16*39], m5                        ;t36 | 
|  | mova [rsp+gprsize*2+16*62], m4                        ;t59 | 
|  | mova [rsp+gprsize*2+16*59], m1                        ;t56a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*35]     ;t32 | 
|  | mova                    m2, [rsp+gprsize*2+16*38]     ;t35 | 
|  | mova                    m3, [rsp+gprsize*2+16*63]     ;t60 | 
|  | mova                    m1, [rsp+gprsize*2+16*66]     ;t63 | 
|  | psubsw                  m4, m0, m2                    ;t35a | 
|  | paddsw                  m0, m2                        ;t32a | 
|  | psubsw                  m5, m1, m3                    ;t60a | 
|  | paddsw                  m1, m3                        ;t63a | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t35, t60 | 
|  | mova [rsp+gprsize*2+16*35], m0                        ;t32a | 
|  | mova [rsp+gprsize*2+16*38], m5                        ;t35 | 
|  | mova [rsp+gprsize*2+16*63], m4                        ;t60 | 
|  | mova [rsp+gprsize*2+16*66], m1                        ;t63a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*36]     ;t33a | 
|  | mova                    m2, [rsp+gprsize*2+16*37]     ;t34a | 
|  | mova                    m3, [rsp+gprsize*2+16*64]     ;t61a | 
|  | mova                    m1, [rsp+gprsize*2+16*65]     ;t62a | 
|  | psubsw                  m4, m0, m2                    ;t34 | 
|  | paddsw                  m0, m2                        ;t33 | 
|  | psubsw                  m5, m1, m3                    ;t61 | 
|  | paddsw                  m1, m3                        ;t62 | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t34a, t61a | 
|  |  | 
|  | mova                    m2, [rsp+gprsize*2+16*41]     ;t38 | 
|  | mova                    m3, [rsp+gprsize*2+16*60]     ;t57 | 
|  | psubsw                  m6, m0, m2                    ;t38a | 
|  | paddsw                  m0, m2                        ;t33a | 
|  | psubsw                  m2, m1, m3                    ;t57a | 
|  | paddsw                  m1, m3                        ;t62a | 
|  | mova [rsp+gprsize*2+16*36], m0                        ;t33a | 
|  | mova [rsp+gprsize*2+16*65], m1                        ;t62a | 
|  | ITX_MULSUB_2W            2, 6, 0, 3, 7, 1567, 3784    ;t38, t57 | 
|  | mova [rsp+gprsize*2+16*41], m2                        ;t38 | 
|  | mova [rsp+gprsize*2+16*60], m6                        ;t57 | 
|  |  | 
|  | mova                    m2, [rsp+gprsize*2+16*40]     ;t37 | 
|  | mova                    m3, [rsp+gprsize*2+16*61]     ;t58 | 
|  | psubsw                  m0, m5, m2                    ;t37 | 
|  | paddsw                  m5, m2                        ;t34 | 
|  | psubsw                  m1, m4, m3                    ;t58 | 
|  | paddsw                  m4, m3                        ;t61 | 
|  | ITX_MULSUB_2W            1, 0, 2, 3, 7, 1567, 3784    ;t37a, t58a | 
|  | mova [rsp+gprsize*2+16*37], m5                        ;t34 | 
|  | mova [rsp+gprsize*2+16*64], m4                        ;t61 | 
|  | mova [rsp+gprsize*2+16*40], m1                        ;t37a | 
|  | mova [rsp+gprsize*2+16*61], m0                        ;t58a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*38]     ;t35 | 
|  | mova                    m2, [rsp+gprsize*2+16*39]     ;t36 | 
|  | mova                    m3, [rsp+gprsize*2+16*62]     ;t59 | 
|  | mova                    m1, [rsp+gprsize*2+16*63]     ;t60 | 
|  | psubsw                  m4, m0, m2                    ;t36a | 
|  | paddsw                  m0, m2                        ;t35a | 
|  | psubsw                  m5, m1, m3                    ;t59a | 
|  | paddsw                  m1, m3                        ;t60a | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t36, t59 | 
|  | mova [rsp+gprsize*2+16*38], m0                        ;t35a | 
|  | mova [rsp+gprsize*2+16*39], m5                        ;t36 | 
|  | mova [rsp+gprsize*2+16*62], m4                        ;t59 | 
|  | mova [rsp+gprsize*2+16*63], m1                        ;t60a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*35]     ;t32a | 
|  | mova                    m2, [rsp+gprsize*2+16*42]     ;t39a | 
|  | mova                    m3, [rsp+gprsize*2+16*59]     ;t56a | 
|  | mova                    m1, [rsp+gprsize*2+16*66]     ;t63a | 
|  | psubsw                  m4, m0, m2                    ;t39 | 
|  | paddsw                  m0, m2                        ;t32 | 
|  | psubsw                  m5, m1, m3                    ;t56 | 
|  | paddsw                  m1, m3                        ;t63 | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t39a, t56a | 
|  | mova [rsp+gprsize*2+16*35], m0                        ;t32 | 
|  | mova [rsp+gprsize*2+16*42], m5                        ;t39a | 
|  | mova [rsp+gprsize*2+16*59], m4                        ;t56a | 
|  | mova [rsp+gprsize*2+16*66], m1                        ;t63 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*50]     ;t47a | 
|  | mova                    m2, [rsp+gprsize*2+16*43]     ;t40a | 
|  | mova                    m3, [rsp+gprsize*2+16*58]     ;t55a | 
|  | mova                    m1, [rsp+gprsize*2+16*51]     ;t48a | 
|  | psubsw                  m4, m0, m2                    ;t40 | 
|  | paddsw                  m0, m2                        ;t47 | 
|  | psubsw                  m5, m1, m3                    ;t55 | 
|  | paddsw                  m1, m3                        ;t48 | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t40a, t55a | 
|  | mova [rsp+gprsize*2+16*50], m0                        ;t47 | 
|  | mova [rsp+gprsize*2+16*43], m5                        ;t40a | 
|  | mova [rsp+gprsize*2+16*58], m4                        ;t55a | 
|  | mova [rsp+gprsize*2+16*51], m1                        ;t48 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*49]     ;t46 | 
|  | mova                    m2, [rsp+gprsize*2+16*44]     ;t41 | 
|  | mova                    m3, [rsp+gprsize*2+16*57]     ;t54 | 
|  | mova                    m1, [rsp+gprsize*2+16*52]     ;t49 | 
|  | psubsw                  m4, m0, m2                    ;t41a | 
|  | paddsw                  m0, m2                        ;t46a | 
|  | psubsw                  m5, m1, m3                    ;t54a | 
|  | paddsw                  m1, m3                        ;t49a | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t41, t54 | 
|  | mova [rsp+gprsize*2+16*49], m0                        ;t46a | 
|  | mova [rsp+gprsize*2+16*44], m5                        ;t41 | 
|  | mova [rsp+gprsize*2+16*57], m4                        ;t54 | 
|  | mova [rsp+gprsize*2+16*52], m1                        ;t49a | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*48]     ;t45a | 
|  | mova                    m2, [rsp+gprsize*2+16*45]     ;t42a | 
|  | mova                    m3, [rsp+gprsize*2+16*56]     ;t53a | 
|  | mova                    m1, [rsp+gprsize*2+16*53]     ;t50a | 
|  | psubsw                  m4, m0, m2                    ;t42 | 
|  | paddsw                  m0, m2                        ;t45 | 
|  | psubsw                  m5, m1, m3                    ;t53 | 
|  | paddsw                  m1, m3                        ;t50 | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t42a, t53a | 
|  | mova [rsp+gprsize*2+16*48], m0                        ;t45 | 
|  | mova [rsp+gprsize*2+16*45], m5                        ;t42a | 
|  | mova [rsp+gprsize*2+16*56], m4                        ;t53a | 
|  | mova [rsp+gprsize*2+16*53], m1                        ;t50 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*47]     ;t44 | 
|  | mova                    m2, [rsp+gprsize*2+16*46]     ;t43 | 
|  | mova                    m3, [rsp+gprsize*2+16*55]     ;t52 | 
|  | mova                    m1, [rsp+gprsize*2+16*54]     ;t51 | 
|  | psubsw                  m4, m0, m2                    ;t43a | 
|  | paddsw                  m0, m2                        ;t44a | 
|  | psubsw                  m5, m1, m3                    ;t52a | 
|  | paddsw                  m1, m3                        ;t51a | 
|  | ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t43, t52 | 
|  |  | 
|  | mova                    m2, [rsp+gprsize*2+16*38]     ;t35a | 
|  | mova                    m3, [rsp+gprsize*2+16*31]     ;tmp[28] | 
|  | psubsw                  m6, m2, m0                    ;t44 | 
|  | paddsw                  m2, m0                        ;t35 | 
|  | psubsw                  m0, m3, m2                    ;out35 | 
|  | paddsw                  m2, m3                        ;out28 | 
|  | mova                    m3, [rsp+gprsize*2+16*63]     ;t60a | 
|  | mova [rsp+gprsize*2+16*38], m0                        ;out35 | 
|  | mova [rsp+gprsize*2+16*31], m2                        ;out28 | 
|  | psubsw                  m0, m3, m1                    ;t51 | 
|  | paddsw                  m3, m1                        ;t60 | 
|  | ITX_MULSUB_2W            0, 6, 1, 2, 7, 2896, 2896    ;t44a, t51a | 
|  | mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3] | 
|  | psubsw                  m1, m2, m3                    ;out60 | 
|  | paddsw                  m2, m3                        ;out3 | 
|  | mova                    m3, [rsp+gprsize*2+16*22]     ;tmp[19] | 
|  | mova [rsp+gprsize*2+16*63], m1                        ;out60 | 
|  | mova [rsp+gprsize*2+16*6 ], m2                        ;out3 | 
|  | psubsw                  m1, m3, m0                    ;out44 | 
|  | paddsw                  m3, m0                        ;out19 | 
|  | mova                    m2, [rsp+gprsize*2+16*15]     ;tmp[12] | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*39]     ;t36 | 
|  | mova [rsp+gprsize*2+16*47], m1                        ;out44 | 
|  | mova [rsp+gprsize*2+16*22], m3                        ;out19 | 
|  | mova                    m1, [rsp+gprsize*2+16*62]     ;t59 | 
|  | psubsw                  m3, m2, m6                    ;out51 | 
|  | paddsw                  m2, m6                        ;out12 | 
|  | mova [rsp+gprsize*2+16*54], m3                        ;out51 | 
|  | mova [rsp+gprsize*2+16*15], m2                        ;out12 | 
|  | psubsw                  m2, m0, m5                    ;t43a | 
|  | paddsw                  m0, m5                        ;t36a | 
|  | mova                    m5, [rsp+gprsize*2+16*30]     ;tmp[27] | 
|  | psubsw                  m3, m1, m4                    ;t52a | 
|  | paddsw                  m1, m4                        ;t59a | 
|  | ITX_MULSUB_2W            3, 2, 4, 6, 7, 2896, 2896    ;t43, t52 | 
|  | mova                    m4, [rsp+gprsize*2+16*7 ]     ;tmp[4 ] | 
|  | psubsw                  m6, m5, m0                    ;out36 | 
|  | paddsw                  m5, m0                        ;out27 | 
|  | psubsw                  m0, m4, m1                    ;out59 | 
|  | paddsw                  m4, m1                        ;out4 | 
|  | mova [rsp+gprsize*2+16*39], m6                        ;out36 | 
|  | mova [rsp+gprsize*2+16*30], m5                        ;out27 | 
|  | mova [rsp+gprsize*2+16*62], m0                        ;out59 | 
|  | mova [rsp+gprsize*2+16*7 ], m4                        ;out4 | 
|  | mova                    m0, [rsp+gprsize*2+16*23]     ;tmp[20] | 
|  | mova                    m5, [rsp+gprsize*2+16*14]     ;tmp[11] | 
|  | psubsw                  m4, m0, m3                    ;out43 | 
|  | paddsw                  m0, m3                        ;out20 | 
|  | psubsw                  m6, m5, m2                    ;out52 | 
|  | paddsw                  m5, m2                        ;out11 | 
|  | mova [rsp+gprsize*2+16*46], m4                        ;out43 | 
|  | mova [rsp+gprsize*2+16*23], m0                        ;out20 | 
|  | mova [rsp+gprsize*2+16*55], m6                        ;out52 | 
|  | mova [rsp+gprsize*2+16*14], m5                        ;out11 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*40]     ;t37a | 
|  | mova                    m5, [rsp+gprsize*2+16*45]     ;t42a | 
|  | mova                    m3, [rsp+gprsize*2+16*56]     ;t53a | 
|  | mova                    m1, [rsp+gprsize*2+16*61]     ;t58a | 
|  | mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26] | 
|  | psubsw                  m4, m0, m5                    ;t42 | 
|  | paddsw                  m0, m5                        ;t37 | 
|  | psubsw                  m5, m1, m3                    ;t53 | 
|  | paddsw                  m1, m3                        ;t58 | 
|  | ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t43, t52 | 
|  | mova                    m3, [rsp+gprsize*2+16*8 ]     ;tmp[5 ] | 
|  | psubsw                  m6, m2, m0                    ;out37 | 
|  | paddsw                  m2, m0                        ;out26 | 
|  | psubsw                  m0, m3, m1                    ;out58 | 
|  | paddsw                  m3, m1                        ;out5 | 
|  | mova [rsp+gprsize*2+16*40], m6                        ;out37 | 
|  | mova [rsp+gprsize*2+16*29], m2                        ;out26 | 
|  | mova [rsp+gprsize*2+16*61], m0                        ;out58 | 
|  | mova [rsp+gprsize*2+16*8 ], m3                        ;out5 | 
|  | mova                    m0, [rsp+gprsize*2+16*24]     ;tmp[21] | 
|  | mova                    m1, [rsp+gprsize*2+16*13]     ;tmp[10] | 
|  | psubsw                  m2, m0, m5                    ;out42 | 
|  | paddsw                  m0, m5                        ;out21 | 
|  | psubsw                  m3, m1, m4                    ;out53 | 
|  | paddsw                  m1, m4                        ;out10 | 
|  | mova [rsp+gprsize*2+16*45], m2                        ;out42 | 
|  | mova [rsp+gprsize*2+16*24], m0                        ;out21 | 
|  | mova [rsp+gprsize*2+16*56], m3                        ;out53 | 
|  | mova [rsp+gprsize*2+16*13], m1                        ;out10 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*41]     ;t38 | 
|  | mova                    m5, [rsp+gprsize*2+16*44]     ;t41 | 
|  | mova                    m3, [rsp+gprsize*2+16*57]     ;t54 | 
|  | mova                    m1, [rsp+gprsize*2+16*60]     ;t57 | 
|  | mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25] | 
|  | psubsw                  m4, m0, m5                    ;t41a | 
|  | paddsw                  m0, m5                        ;t38a | 
|  | psubsw                  m5, m1, m3                    ;t54a | 
|  | paddsw                  m1, m3                        ;t57a | 
|  | ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t41a, t54a | 
|  | mova                    m3, [rsp+gprsize*2+16*9 ]     ;tmp[6 ] | 
|  | psubsw                  m6, m2, m0                    ;out38 | 
|  | paddsw                  m2, m0                        ;out25 | 
|  | psubsw                  m0, m3, m1                    ;out57 | 
|  | paddsw                  m3, m1                        ;out6 | 
|  | mova [rsp+gprsize*2+16*41], m6                        ;out38 | 
|  | mova [rsp+gprsize*2+16*28], m2                        ;out25 | 
|  | mova [rsp+gprsize*2+16*60], m0                        ;out57 | 
|  | mova [rsp+gprsize*2+16*9 ], m3                        ;out6 | 
|  | mova                    m0, [rsp+gprsize*2+16*25]     ;tmp[22] | 
|  | mova                    m1, [rsp+gprsize*2+16*12]     ;tmp[9 ] | 
|  | psubsw                  m2, m0, m5                    ;out41 | 
|  | paddsw                  m0, m5                        ;out22 | 
|  | psubsw                  m3, m1, m4                    ;out54 | 
|  | paddsw                  m1, m4                        ;out9 | 
|  | mova [rsp+gprsize*2+16*44], m2                        ;out41 | 
|  | mova [rsp+gprsize*2+16*25], m0                        ;out22 | 
|  | mova [rsp+gprsize*2+16*57], m3                        ;out54 | 
|  | mova [rsp+gprsize*2+16*12], m1                        ;out9 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*42]     ;t39a | 
|  | mova                    m5, [rsp+gprsize*2+16*43]     ;t40a | 
|  | mova                    m3, [rsp+gprsize*2+16*58]     ;t55a | 
|  | mova                    m1, [rsp+gprsize*2+16*59]     ;t56a | 
|  | mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24] | 
|  | psubsw                  m4, m0, m5                    ;t40 | 
|  | paddsw                  m0, m5                        ;t39 | 
|  | psubsw                  m5, m1, m3                    ;t55 | 
|  | paddsw                  m1, m3                        ;t56 | 
|  | ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t40a, t55a | 
|  | mova                    m3, [rsp+gprsize*2+16*10]     ;tmp[7 ] | 
|  | psubsw                  m6, m2, m0                    ;out39 | 
|  | paddsw                  m2, m0                        ;out24 | 
|  | psubsw                  m0, m3, m1                    ;out56 | 
|  | paddsw                  m3, m1                        ;out7 | 
|  | mova [rsp+gprsize*2+16*42], m6                        ;out39 | 
|  | mova [rsp+gprsize*2+16*27], m2                        ;out24 | 
|  | mova [rsp+gprsize*2+16*59], m0                        ;out56 | 
|  | mova [rsp+gprsize*2+16*10], m3                        ;out7 | 
|  | mova                    m0, [rsp+gprsize*2+16*26]     ;tmp[23] | 
|  | mova                    m1, [rsp+gprsize*2+16*11]     ;tmp[8 ] | 
|  | psubsw                  m2, m0, m5                    ;out40 | 
|  | paddsw                  m0, m5                        ;out23 | 
|  | psubsw                  m3, m1, m4                    ;out55 | 
|  | paddsw                  m1, m4                        ;out8 | 
|  | mova [rsp+gprsize*2+16*43], m2                        ;out40 | 
|  | mova [rsp+gprsize*2+16*26], m0                        ;out23 | 
|  | mova [rsp+gprsize*2+16*58], m3                        ;out55 | 
|  | mova [rsp+gprsize*2+16*11], m1                        ;out8 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*37]     ;t34 | 
|  | mova                    m5, [rsp+gprsize*2+16*48]     ;t45 | 
|  | mova                    m3, [rsp+gprsize*2+16*53]     ;t50 | 
|  | mova                    m1, [rsp+gprsize*2+16*64]     ;t61 | 
|  | mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29] | 
|  | psubsw                  m4, m0, m5                    ;t45a | 
|  | paddsw                  m0, m5                        ;t34a | 
|  | psubsw                  m5, m1, m3                    ;t50a | 
|  | paddsw                  m1, m3                        ;t61a | 
|  | ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50 | 
|  | mova                    m3, [rsp+gprsize*2+16*5 ]     ;tmp[2 ] | 
|  | psubsw                  m6, m2, m0                    ;out34 | 
|  | paddsw                  m2, m0                        ;out29 | 
|  | psubsw                  m0, m3, m1                    ;out61 | 
|  | paddsw                  m3, m1                        ;out2 | 
|  | mova [rsp+gprsize*2+16*37], m6                        ;out34 | 
|  | mova [rsp+gprsize*2+16*32], m2                        ;out29 | 
|  | mova [rsp+gprsize*2+16*64], m0                        ;out61 | 
|  | mova [rsp+gprsize*2+16*5 ], m3                        ;out2 | 
|  | mova                    m0, [rsp+gprsize*2+16*21]     ;tmp[18] | 
|  | mova                    m1, [rsp+gprsize*2+16*16]     ;tmp[13] | 
|  | psubsw                  m2, m0, m5                    ;out45 | 
|  | paddsw                  m0, m5                        ;out18 | 
|  | psubsw                  m3, m1, m4                    ;out50 | 
|  | paddsw                  m1, m4                        ;out13 | 
|  | mova [rsp+gprsize*2+16*48], m2                        ;out45 | 
|  | mova [rsp+gprsize*2+16*21], m0                        ;out18 | 
|  | mova [rsp+gprsize*2+16*53], m3                        ;out50 | 
|  | mova [rsp+gprsize*2+16*16], m1                        ;out13 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*36]     ;t33a | 
|  | mova                    m5, [rsp+gprsize*2+16*49]     ;t46a | 
|  | mova                    m3, [rsp+gprsize*2+16*52]     ;t49a | 
|  | mova                    m1, [rsp+gprsize*2+16*65]     ;t62a | 
|  | mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30] | 
|  | psubsw                  m4, m0, m5                    ;t46 | 
|  | paddsw                  m0, m5                        ;t33 | 
|  | psubsw                  m5, m1, m3                    ;t49 | 
|  | paddsw                  m1, m3                        ;t62 | 
|  | ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50 | 
|  | mova                    m3, [rsp+gprsize*2+16*4 ]     ;tmp[1 ] | 
|  | psubsw                  m6, m2, m0                    ;out33 | 
|  | paddsw                  m2, m0                        ;out30 | 
|  | psubsw                  m0, m3, m1                    ;out62 | 
|  | paddsw                  m3, m1                        ;out1 | 
|  | mova [rsp+gprsize*2+16*36], m6                        ;out33 | 
|  | mova [rsp+gprsize*2+16*33], m2                        ;out30 | 
|  | mova [rsp+gprsize*2+16*65], m0                        ;out62 | 
|  | mova [rsp+gprsize*2+16*4 ], m3                        ;out1 | 
|  | mova                    m0, [rsp+gprsize*2+16*20]     ;tmp[17] | 
|  | mova                    m1, [rsp+gprsize*2+16*17]     ;tmp[14] | 
|  | psubsw                  m2, m0, m5                    ;out46 | 
|  | paddsw                  m0, m5                        ;out17 | 
|  | psubsw                  m3, m1, m4                    ;out49 | 
|  | paddsw                  m1, m4                        ;out14 | 
|  | mova [rsp+gprsize*2+16*49], m2                        ;out46 | 
|  | mova [rsp+gprsize*2+16*20], m0                        ;out17 | 
|  | mova [rsp+gprsize*2+16*52], m3                        ;out49 | 
|  | mova [rsp+gprsize*2+16*17], m1                        ;out14 | 
|  |  | 
|  | mova                    m0, [rsp+gprsize*2+16*35]     ;t32 | 
|  | mova                    m5, [rsp+gprsize*2+16*50]     ;t47 | 
|  | mova                    m3, [rsp+gprsize*2+16*51]     ;t48 | 
|  | mova                    m1, [rsp+gprsize*2+16*66]     ;t63 | 
|  | mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31] | 
|  | psubsw                  m4, m0, m5                    ;t47a | 
|  | paddsw                  m0, m5                        ;t32a | 
|  | psubsw                  m5, m1, m3                    ;t48a | 
|  | paddsw                  m1, m3                        ;t63a | 
|  | ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t47, t48 | 
|  | mova                    m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ] | 
|  | psubsw                  m6, m2, m0                    ;out32 | 
|  | paddsw                  m2, m0                        ;out31 | 
|  | psubsw                  m0, m3, m1                    ;out63 | 
|  | paddsw                  m3, m1                        ;out0 | 
|  | mova [rsp+gprsize*2+16*35], m6                        ;out32 | 
|  | mova [rsp+gprsize*2+16*34], m2                        ;out31 | 
|  | mova [rsp+gprsize*2+16*66], m0                        ;out63 | 
|  | mova [rsp+gprsize*2+16*3 ], m3                        ;out0 | 
|  | mova                    m0, [rsp+gprsize*2+16*19]     ;tmp[16] | 
|  | mova                    m1, [rsp+gprsize*2+16*18]     ;tmp[15] | 
|  | psubsw                  m2, m0, m5                    ;out47 | 
|  | paddsw                  m0, m5                        ;out16 | 
|  | psubsw                  m3, m1, m4                    ;out48 | 
|  | paddsw                  m1, m4                        ;out15 | 
|  | mova [rsp+gprsize*2+16*50], m2                        ;out47 | 
|  | mova [rsp+gprsize*2+16*19], m0                        ;out16 | 
|  | mova [rsp+gprsize*2+16*51], m3                        ;out48 | 
|  | mova [rsp+gprsize*2+16*18], m1                        ;out15 | 
|  | ret | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  |  | 
|  | call m(idct_64x16_internal_8bpc) | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_8192)] | 
|  | mov               [coeffq], eobd | 
|  | mov                    r3d, 16 | 
|  | lea                   tx2q, [o(.end)] | 
|  |  | 
|  | .body: | 
|  | pmulhrsw                m0, m2 | 
|  | movd                    m2, [o(pw_2048)]  ;intentionally rip-relative | 
|  | pmulhrsw                m0, m1 | 
|  | pmulhrsw                m0, m2 | 
|  | pshuflw                 m0, m0, q0000 | 
|  | punpcklwd               m0, m0 | 
|  | pxor                    m7, m7 | 
|  |  | 
|  | .loop: | 
|  | mova                    m1, [dstq+16*0] | 
|  | mova                    m3, [dstq+16*1] | 
|  | mova                    m5, [dstq+16*2] | 
|  | mova                    m6, [dstq+16*3] | 
|  | punpckhbw               m2, m1, m7 | 
|  | punpcklbw               m1, m7 | 
|  | punpckhbw               m4, m3, m7 | 
|  | punpcklbw               m3, m7 | 
|  | paddw                   m2, m0 | 
|  | paddw                   m1, m0 | 
|  | paddw                   m4, m0 | 
|  | paddw                   m3, m0 | 
|  | packuswb                m1, m2 | 
|  | packuswb                m3, m4 | 
|  | punpckhbw               m2, m5, m7 | 
|  | punpcklbw               m5, m7 | 
|  | punpckhbw               m4, m6, m7 | 
|  | punpcklbw               m6, m7 | 
|  | paddw                   m2, m0 | 
|  | paddw                   m5, m0 | 
|  | paddw                   m4, m0 | 
|  | paddw                   m6, m0 | 
|  | packuswb                m5, m2 | 
|  | packuswb                m6, m4 | 
|  | mova           [dstq+16*0], m1 | 
|  | mova           [dstq+16*1], m3 | 
|  | mova           [dstq+16*2], m5 | 
|  | mova           [dstq+16*3], m6 | 
|  | add                   dstq, strideq | 
|  | dec                    r3d | 
|  | jg .loop | 
|  | jmp                   tx2q | 
|  |  | 
|  | .end: | 
|  | RET | 
|  |  | 
|  |  | 
|  | %macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 | 
|  |  | 
|  | %if %3 | 
|  | mova                 m3, [o(pw_2896x8)] | 
|  | pmulhrsw             m0, m3, [%1+%2*0] | 
|  | pmulhrsw             m1, m3, [%1+%2*1] | 
|  | pmulhrsw             m2, m3, [%1+%2*2] | 
|  | pmulhrsw             m3, [%1+%2*3] | 
|  | %else | 
|  | mova                 m0, [%1+%2*0] | 
|  | mova                 m1, [%1+%2*1] | 
|  | mova                 m2, [%1+%2*2] | 
|  | mova                 m3, [%1+%2*3] | 
|  | %endif | 
|  | %endmacro | 
|  |  | 
|  | %macro LOAD_4ROWS_H 2 ;src, stride | 
|  | mova                 m4, [%1+%2*0] | 
|  | mova                 m5, [%1+%2*1] | 
|  | mova                 m6, [%1+%2*2] | 
|  | mova                 m7, [%1+%2*3] | 
|  | %endmacro | 
|  |  | 
|  | cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mov                    r3d, 2 | 
|  | mov  [rsp+gprsize*2+16*67], dstq | 
|  | lea                   dstq, [rsp+gprsize+16*68] | 
|  |  | 
|  | .pass1_loop: | 
|  | LOAD_4ROWS     coeffq+32*0, 32*8 | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | pxor                    m4, m4 | 
|  | LOAD_4ROWS     coeffq+32*4, 32*8 | 
|  |  | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+32*2, 32*4 | 
|  | mova   [rsp+gprsize+16*19], m0 | 
|  | mova   [rsp+gprsize+16*26], m1 | 
|  | mova   [rsp+gprsize+16*23], m2 | 
|  | mova   [rsp+gprsize+16*22], m3 | 
|  | mova   [rsp+gprsize+16*21], m4 | 
|  | mova   [rsp+gprsize+16*24], m5 | 
|  | mova   [rsp+gprsize+16*25], m6 | 
|  | mova   [rsp+gprsize+16*20], m7 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  | SAVE_8ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+32*1, 32*2 | 
|  | mova   [rsp+gprsize+16*35], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*49], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*43], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*41], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*39], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*45], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*47], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*37], m7                        ;in15 | 
|  |  | 
|  | LOAD_8ROWS    coeffq+32*17, 32*2 | 
|  | mova   [rsp+gprsize+16*63], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*53], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*55], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*61], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*59], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*57], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*51], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*65], m7                        ;in31 | 
|  |  | 
|  | call m(idct_16x64_internal_8bpc).main | 
|  |  | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS     coeffq+32*0, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+32*8, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*19, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS    coeffq+32*16, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*27, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end3: | 
|  | SAVE_8ROWS    coeffq+32*24, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*35, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end4: | 
|  | SAVE_8ROWS       dstq+32*0, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*43, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end5)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end5: | 
|  | SAVE_8ROWS       dstq+32*8, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*51, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end6)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end6: | 
|  | SAVE_8ROWS      dstq+32*16, 32 | 
|  | LOAD_8ROWS   rsp+gprsize+16*59, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end7)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end7: | 
|  | SAVE_8ROWS      dstq+32*24, 32 | 
|  |  | 
|  | add                 coeffq, 16 | 
|  | add                   dstq, 16 | 
|  | dec                    r3d | 
|  | jg .pass1_loop | 
|  |  | 
|  | .pass2: | 
|  | mov                   dstq, [rsp+gprsize*2+16*67] | 
|  | sub                 coeffq, 32 | 
|  | mov                    r3d, 4 | 
|  |  | 
|  | .pass2_loop: | 
|  | mov  [rsp+gprsize*1+16*67], r3d | 
|  |  | 
|  | LOAD_4ROWS     coeffq+16*0, 32*2 | 
|  | LOAD_4ROWS_H   coeffq+16*1, 32*2 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_4ROWS     coeffq+16*2, 32*2 | 
|  | LOAD_4ROWS_H   coeffq+16*3, 32*2 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  |  | 
|  | mov                    r3, dstq | 
|  | lea                  tx2q, [o(.end)] | 
|  | lea                  dstq, [dstq+strideq*8] | 
|  | jmp  m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end: | 
|  | LOAD_8ROWS   rsp+gprsize+16*3, 16 | 
|  | mova   [rsp+gprsize+16*0], m7 | 
|  | lea                  tx2q, [o(.end1)] | 
|  | mov                  dstq, r3 | 
|  | jmp  m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end1: | 
|  | pxor                   m7, m7 | 
|  | REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 | 
|  |  | 
|  | add                 coeffq, 16*16 | 
|  | mov                    r3d, [rsp+gprsize*1+16*67] | 
|  | mov                   dstq, [rsp+gprsize*2+16*67] | 
|  | add                   dstq, 8 | 
|  | mov  [rsp+gprsize*2+16*67], dstq | 
|  | dec                    r3d | 
|  | jg .pass2_loop | 
|  |  | 
|  | mov                    r3d, 4 | 
|  | lea                 coeffq, [rsp+gprsize+16*68] | 
|  | .pass2_loop2: | 
|  | mov  [rsp+gprsize*1+16*67], r3d | 
|  |  | 
|  | LOAD_4ROWS     coeffq+16*0, 32*2 | 
|  | LOAD_4ROWS_H   coeffq+16*1, 32*2 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_4ROWS     coeffq+16*2, 32*2 | 
|  | LOAD_4ROWS_H   coeffq+16*3, 32*2 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  |  | 
|  | mov                    r3, dstq | 
|  | lea                  tx2q, [o(.end2)] | 
|  | lea                  dstq, [dstq+strideq*8] | 
|  | jmp  m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end2: | 
|  | LOAD_8ROWS   rsp+gprsize+16*3, 16 | 
|  | mova   [rsp+gprsize+16*0], m7 | 
|  | lea                  tx2q, [o(.end3)] | 
|  | mov                  dstq, r3 | 
|  | jmp  m(idct_8x8_internal_8bpc).end | 
|  |  | 
|  | .end3: | 
|  |  | 
|  | add                 coeffq, 16*16 | 
|  | mov                    r3d, [rsp+gprsize*1+16*67] | 
|  | mov                   dstq, [rsp+gprsize*2+16*67] | 
|  | add                   dstq, 8 | 
|  | mov  [rsp+gprsize*2+16*67], dstq | 
|  | dec                    r3d | 
|  | jg .pass2_loop2 | 
|  | ret | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  | call m(idct_32x64_internal_8bpc) | 
|  | .end: | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_16384)] | 
|  | mov               [coeffq], eobd | 
|  | pmulhrsw                m0, m1 | 
|  | mov                    r3d, 64 | 
|  | lea                   tx2q, [o(.end)] | 
|  | jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body | 
|  |  | 
|  |  | 
|  | cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mov                    r4d, 2 | 
|  | sub                   eobd, 136 | 
|  | mov  [rsp+gprsize*1+16*67], eobd | 
|  | mov                    r3d, 4 | 
|  | cmovs                  r3d, r4d | 
|  |  | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  |  | 
|  | mov  [rsp+gprsize*2+16*67], coeffq | 
|  |  | 
|  | .pass1_loop: | 
|  | LOAD_8ROWS     coeffq+64*1, 64*2, 1 | 
|  | mova   [rsp+gprsize+16*19], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*26], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*23], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*22], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*21], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*24], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*25], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*20], m7                        ;in15 | 
|  |  | 
|  | mov                   tx2d, [rsp+gprsize*1+16*67] | 
|  | test                  tx2d, tx2d | 
|  | jl .fast | 
|  |  | 
|  | .full: | 
|  | LOAD_8ROWS     coeffq+64*0, 64*4, 1 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_8ROWS     coeffq+64*2, 64*4, 1 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | LOAD_8ROWS    coeffq+64*17, 64*2, 1 | 
|  | mova   [rsp+gprsize+16*33], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*28], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*29], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*32], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*31], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*30], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*27], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*34], m7                        ;in31 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main | 
|  | jmp .pass1_end | 
|  |  | 
|  | .fast: | 
|  | LOAD_4ROWS          coeffq, 256, 1 | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  |  | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  | LOAD_4ROWS    coeffq+128*1, 256, 1 | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  |  | 
|  | .pass1_end: | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+64*0, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS     coeffq+64*8, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*19, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end3: | 
|  | SAVE_8ROWS    coeffq+64*16, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*27, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end4: | 
|  | SAVE_8ROWS    coeffq+64*24, 64 | 
|  |  | 
|  | add                 coeffq, 16 | 
|  | dec                    r3d | 
|  | jg .pass1_loop | 
|  |  | 
|  | .pass2: | 
|  | mov                 coeffq, [rsp+gprsize*2+16*67] | 
|  | mov                    r3d, 4 | 
|  | lea                     r4, [dstq+8] | 
|  | mov  [rsp+gprsize*2+16*67], r4 | 
|  | lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)] | 
|  | jmp m(idct_16x64_internal_8bpc).pass2_loop | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  | call m(idct_64x32_internal_8bpc) | 
|  | .end: | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_16384)] | 
|  | pmulhrsw                m0, m1 | 
|  | mov               [coeffq], eobd | 
|  | mov                    r3d, 32 | 
|  | lea                   tx2q, [o(.end)] | 
|  | jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body | 
|  |  | 
|  |  | 
|  | cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mov                    r4d, 2 | 
|  | sub                   eobd, 136 | 
|  | mov  [rsp+gprsize*1+16*67], eobd | 
|  | mov                    r3d, 4 | 
|  | cmovs                  r3d, r4d | 
|  |  | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  |  | 
|  | mov  [rsp+gprsize*2+16*67], coeffq | 
|  | mov  [rsp+gprsize*3+16*67], dstq | 
|  | lea                   dstq, [rsp+gprsize+16*69] | 
|  | mov  [rsp+gprsize*4+16*67], dstq | 
|  |  | 
|  | .pass1_loop: | 
|  | LOAD_4ROWS     coeffq+64*0, 64*8, 1 | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | pxor                    m4, m4 | 
|  | LOAD_4ROWS     coeffq+64*4, 64*8, 1 | 
|  |  | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+64*2, 64*4, 1 | 
|  | mova   [rsp+gprsize+16*19], m0 | 
|  | mova   [rsp+gprsize+16*26], m1 | 
|  | mova   [rsp+gprsize+16*23], m2 | 
|  | mova   [rsp+gprsize+16*22], m3 | 
|  | mova   [rsp+gprsize+16*21], m4 | 
|  | mova   [rsp+gprsize+16*24], m5 | 
|  | mova   [rsp+gprsize+16*25], m6 | 
|  | mova   [rsp+gprsize+16*20], m7 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  | SAVE_8ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+64*1, 64*2, 1 | 
|  | mova   [rsp+gprsize+16*35], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*49], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*43], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*41], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*39], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*45], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*47], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*37], m7                        ;in15 | 
|  |  | 
|  | LOAD_8ROWS    coeffq+64*17, 64*2, 1 | 
|  | mova   [rsp+gprsize+16*63], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*53], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*55], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*61], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*59], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*57], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*51], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*65], m7                        ;in31 | 
|  |  | 
|  | call m(idct_16x64_internal_8bpc).main | 
|  |  | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS     coeffq+64*0, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+64*8, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*19, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS    coeffq+64*16, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*27, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end3: | 
|  | SAVE_8ROWS    coeffq+64*24, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*35, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end4: | 
|  | SAVE_8ROWS       dstq+64*0, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*43, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end5)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end5: | 
|  | SAVE_8ROWS       dstq+64*8, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*51, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end6)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end6: | 
|  | SAVE_8ROWS      dstq+64*16, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*59, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                   tx2q, [o(.pass1_end7)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end | 
|  |  | 
|  | .pass1_end7: | 
|  | SAVE_8ROWS      dstq+64*24, 64 | 
|  |  | 
|  | add                 coeffq, 16 | 
|  | add                   dstq, 16 | 
|  | dec                    r3d | 
|  | jg .pass1_loop | 
|  |  | 
|  | .pass2: | 
|  | mov                 coeffq, [rsp+gprsize*4+16*67] | 
|  | mov                   dstq, [rsp+gprsize*3+16*67] | 
|  | mov                   eobd, [rsp+gprsize*1+16*67] | 
|  | lea                   dstq, [dstq+32] | 
|  | mov  [rsp+gprsize*1+16*35], eobd | 
|  | lea                   tx2q, [o(.pass2_end)] | 
|  | mov                    r3d, 4 | 
|  | jmp m(idct_32x32_internal_8bpc).pass2_loop | 
|  |  | 
|  | .pass2_end: | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | lea                     r3, [o(.pass2_end1)] | 
|  | jmp  m(idct_8x32_internal_8bpc).end2 | 
|  |  | 
|  | .pass2_end1: | 
|  | lea                   tx2q, [o(.pass2_end)] | 
|  | add                 coeffq, 16*32 | 
|  | mov                   dstq, [rsp+gprsize*2+16*35] | 
|  | mov                    r3d, [rsp+gprsize*3+16*35] | 
|  | dec                    r3d | 
|  | jg m(idct_32x32_internal_8bpc).pass2_loop | 
|  |  | 
|  | .pass2_end2: | 
|  | mov                   dstq, [rsp+gprsize*3+16*67] | 
|  | mov                 coeffq, [rsp+gprsize*2+16*67] | 
|  | lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] | 
|  | mov                    r3d, 4 | 
|  | jmp m(idct_32x32_internal_8bpc).pass2_loop | 
|  |  | 
|  |  | 
|  | cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  | test                  eobd, eobd | 
|  | jz .dconly | 
|  |  | 
|  | call m(idct_64x64_internal_8bpc) | 
|  | RET | 
|  |  | 
|  | .dconly: | 
|  | movd                    m1, [o(pw_2896x8)] | 
|  | pmulhrsw                m0, m1, [coeffq] | 
|  | movd                    m2, [o(pw_8192)] | 
|  | mov               [coeffq], eobd | 
|  | mov                    r3d, 64 | 
|  | lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] | 
|  | jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body | 
|  |  | 
|  | cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 | 
|  | mov                    r5d, 4 | 
|  | mov                    r4d, 2 | 
|  | sub                   eobd, 136 | 
|  | cmovns                 r4d, r5d | 
|  |  | 
|  | %if ARCH_X86_32 | 
|  | LEA                     r5, $$ | 
|  | %endif | 
|  |  | 
|  | mov  [rsp+gprsize*1+16*67], eobd | 
|  | mov                    r3d, r4d | 
|  | mov  [rsp+gprsize*4+16*67], coeffq | 
|  | mov  [rsp+gprsize*3+16*67], dstq | 
|  | lea                   dstq, [rsp+gprsize+16*69] | 
|  | mov  [rsp+gprsize*2+16*67], dstq | 
|  |  | 
|  | .pass1_loop: | 
|  | LOAD_4ROWS     coeffq+64*0, 64*8 | 
|  | pxor                    m4, m4 | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call  m(idct_8x8_internal_8bpc).main | 
|  | SAVE_7ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | pxor                    m4, m4 | 
|  | LOAD_4ROWS     coeffq+64*4, 64*8 | 
|  |  | 
|  | REPX          {mova x, m4}, m5, m6, m7 | 
|  | call m(idct_16x8_internal_8bpc).main | 
|  | mova                    m7, [rsp+gprsize+16*0] | 
|  | SAVE_8ROWS   rsp+gprsize+16*11, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+64*2, 64*4 | 
|  | mova   [rsp+gprsize+16*19], m0 | 
|  | mova   [rsp+gprsize+16*26], m1 | 
|  | mova   [rsp+gprsize+16*23], m2 | 
|  | mova   [rsp+gprsize+16*22], m3 | 
|  | mova   [rsp+gprsize+16*21], m4 | 
|  | mova   [rsp+gprsize+16*24], m5 | 
|  | mova   [rsp+gprsize+16*25], m6 | 
|  | mova   [rsp+gprsize+16*20], m7 | 
|  |  | 
|  | call m(idct_8x32_internal_8bpc).main_fast | 
|  | SAVE_8ROWS    rsp+gprsize+16*3, 16 | 
|  |  | 
|  | LOAD_8ROWS     coeffq+64*1, 64*2 | 
|  | mova   [rsp+gprsize+16*35], m0                        ;in1 | 
|  | mova   [rsp+gprsize+16*49], m1                        ;in3 | 
|  | mova   [rsp+gprsize+16*43], m2                        ;in5 | 
|  | mova   [rsp+gprsize+16*41], m3                        ;in7 | 
|  | mova   [rsp+gprsize+16*39], m4                        ;in9 | 
|  | mova   [rsp+gprsize+16*45], m5                        ;in11 | 
|  | mova   [rsp+gprsize+16*47], m6                        ;in13 | 
|  | mova   [rsp+gprsize+16*37], m7                        ;in15 | 
|  |  | 
|  | LOAD_8ROWS    coeffq+64*17, 64*2 | 
|  | mova   [rsp+gprsize+16*63], m0                        ;in17 | 
|  | mova   [rsp+gprsize+16*53], m1                        ;in19 | 
|  | mova   [rsp+gprsize+16*55], m2                        ;in21 | 
|  | mova   [rsp+gprsize+16*61], m3                        ;in23 | 
|  | mova   [rsp+gprsize+16*59], m4                        ;in25 | 
|  | mova   [rsp+gprsize+16*57], m5                        ;in27 | 
|  | mova   [rsp+gprsize+16*51], m6                        ;in29 | 
|  | mova   [rsp+gprsize+16*65], m7                        ;in31 | 
|  |  | 
|  | call m(idct_16x64_internal_8bpc).main | 
|  |  | 
|  | LOAD_8ROWS    rsp+gprsize+16*3, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end: | 
|  | SAVE_8ROWS     coeffq+64*0, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*11, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end1)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end1: | 
|  | SAVE_8ROWS     coeffq+64*8, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*19, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end2)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end2: | 
|  | SAVE_8ROWS    coeffq+64*16, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*27, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end3)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end3: | 
|  | SAVE_8ROWS    coeffq+64*24, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*35, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end4)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end4: | 
|  | SAVE_8ROWS       dstq+64*0, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*43, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end5)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end5: | 
|  | SAVE_8ROWS       dstq+64*8, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*51, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end6)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end6: | 
|  | SAVE_8ROWS      dstq+64*16, 64 | 
|  | LOAD_8ROWS   rsp+gprsize+16*59, 16 | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | mova                    m7, [o(pw_8192)] | 
|  | lea                   tx2q, [o(.pass1_end7)] | 
|  | jmp   m(idct_8x8_internal_8bpc).pass1_end1 | 
|  |  | 
|  | .pass1_end7: | 
|  | SAVE_8ROWS      dstq+64*24, 64 | 
|  |  | 
|  | add                 coeffq, 16 | 
|  | add                   dstq, 16 | 
|  | dec                    r3d | 
|  | jg .pass1_loop | 
|  |  | 
|  | .pass2: | 
|  | mov                   dstq, [rsp+gprsize*3+16*67] | 
|  | mov                 coeffq, [rsp+gprsize*2+16*67] | 
|  | lea                   dstq, [dstq+32] | 
|  | mov                    r3d, 4 | 
|  | lea                     r4, [dstq+8] | 
|  | mov  [rsp+gprsize*2+16*67], r4 | 
|  | lea                     r4, [o(.pass2_end)] | 
|  | jmp m(idct_16x64_internal_8bpc).pass2_loop | 
|  |  | 
|  | .pass2_end: | 
|  | LOAD_8ROWS   rsp+gprsize+16*35, 16 | 
|  | lea                   dstq, [dstq+strideq*2] | 
|  | lea                     r3, [rsp+16*32+gprsize] | 
|  | mova    [rsp+gprsize+16*0], m7 | 
|  | call m(idct_16x64_internal_8bpc).write | 
|  | mov                   dstq, [rsp+gprsize*2+16*67] | 
|  | mov                    r3d, [rsp+gprsize*3+16*67] | 
|  | lea                     r4, [dstq+8] | 
|  | mov  [rsp+gprsize*2+16*67], r4 | 
|  | lea                     r4, [o(.pass2_end)] | 
|  |  | 
|  | dec                    r3d | 
|  | jg  m(idct_16x64_internal_8bpc).pass2_loop | 
|  |  | 
|  | .pass2_end2: | 
|  | mov                 coeffq, [rsp+gprsize*4+16*67] | 
|  | mov                   dstq, [rsp+gprsize*2+16*67] | 
|  | mov                    r3d, 4 | 
|  | sub                   dstq, 72 | 
|  | lea                     r4, [dstq+8] | 
|  | mov  [rsp+gprsize*2+16*67], r4 | 
|  | lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)] | 
|  | jmp m(idct_16x64_internal_8bpc).pass2_loop |