| ; Copyright © 2018, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "ext/x86/x86inc.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 32 |
| |
| ; Note: The order of (at least some of) those constants matters! |
| |
| iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 |
| iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424 |
| iadst4_dconly1a: dw 10568, 19856, 26752, 30424 |
| iadst4_dconly1b: dw 30424, 26752, 19856, 10568 |
| |
| deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 |
| |
| %macro COEF_PAIR 2 |
| pw_%1_%2: dw %1, %2 |
| pw_m%2_%1: dw -%2, %1 |
| %endmacro |
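| ; e.g. "COEF_PAIR 1567, 3784" expands to: |
| ;   pw_1567_3784:  dw  1567, 3784 |
| ;   pw_m3784_1567: dw -3784, 1567 |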
| |
| ; ADST-only |
| pw_3803_1321: dw 3803, 1321 |
| pw_m1321_2482: dw -1321, 2482 |
| pw_2482_3344: dw 2482, 3344 |
| pw_m3344_3344: dw -3344, 3344 |
| pw_m3803_3344: dw -3803, 3344 |
| pw_m3803_m6688: dw -3803, -6688 |
| pw_2896_m2896: dw 2896, -2896 |
| |
| pw_5: times 2 dw 5 |
| pw_2048: times 2 dw 2048 |
| pw_4096: times 2 dw 4096 |
| pw_8192: times 2 dw 8192 |
| pw_16384: times 2 dw 16384 |
| pw_1697x16: times 2 dw 1697*16 |
| pw_1697x8: times 2 dw 1697*8 |
| pw_2896x8: times 2 dw 2896*8 |
| pw_5793x4: times 2 dw 5793*4 |
| |
| pd_2048: dd 2048 |
| |
| COEF_PAIR 2896, 2896 |
| COEF_PAIR 1567, 3784 |
| COEF_PAIR 3784, 1567 |
| COEF_PAIR 201, 4091 |
| COEF_PAIR 995, 3973 |
| COEF_PAIR 1751, 3703 |
| COEF_PAIR 2440, 3290 |
| COEF_PAIR 3035, 2751 |
| COEF_PAIR 3513, 2106 |
| COEF_PAIR 3857, 1380 |
| COEF_PAIR 4052, 601 |
| COEF_PAIR 401, 4076 |
| COEF_PAIR 1931, 3612 |
| COEF_PAIR 3166, 2598 |
| COEF_PAIR 3920, 1189 |
| COEF_PAIR 799, 4017 |
| COEF_PAIR 3406, 2276 |
| pw_m799_m4017: dw -799, -4017 |
| pw_m1567_m3784: dw -1567, -3784 |
| pw_m3406_m2276: dw -3406, -2276 |
| pw_m401_m4076: dw -401, -4076 |
| pw_m3166_m2598: dw -3166, -2598 |
| pw_m1931_m3612: dw -1931, -3612 |
| pw_m3920_m1189: dw -3920, -1189 |
| COEF_PAIR 2276, 3406 |
| COEF_PAIR 4017, 799 |
| |
| %macro COEF_X8 1-* |
| %rep %0 |
| dw %1*8, %1*8 |
| %rotate 1 |
| %endrep |
| %endmacro |
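| ; e.g. "COEF_X8 3703" expands to "dw 3703*8, 3703*8": one broadcastable |
| ; dword holding the coefficient pre-scaled by 8 in both word halves. |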
| |
| pw_3703x8: COEF_X8 3703 |
| pw_1751x8: COEF_X8 1751 |
| pw_m1380x8: COEF_X8 -1380 |
| pw_3857x8: COEF_X8 3857 |
| pw_3973x8: COEF_X8 3973 |
| pw_995x8: COEF_X8 995 |
| pw_m2106x8: COEF_X8 -2106 |
| pw_3513x8: COEF_X8 3513 |
| pw_3290x8: COEF_X8 3290 |
| pw_2440x8: COEF_X8 2440 |
| pw_m601x8: COEF_X8 -601 |
| pw_4052x8: COEF_X8 4052 |
| |
| idct64_mul: COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 |
| COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 |
| COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 |
| COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 |
| |
| pw_201_4091x8: dw 201*8, 4091*8 |
| pw_m601_4052x8: dw -601*8, 4052*8 |
| pw_995_3973x8: dw 995*8, 3973*8 |
| pw_m1380_3857x8: dw -1380*8, 3857*8 |
| pw_1751_3703x8: dw 1751*8, 3703*8 |
| pw_m2106_3513x8: dw -2106*8, 3513*8 |
| pw_2440_3290x8: dw 2440*8, 3290*8 |
| pw_m2751_3035x8: dw -2751*8, 3035*8 |
| |
| %define o_idct64_offset idct64_mul - (o_base) - 8 |
| |
| SECTION .text |
| |
| ; Code size reduction trickery: Instead of using rip-relative loads with |
| ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a |
| ; single rip-relative lea and then address things relative to it with |
| ; 1-byte offsets as long as the data is within +-128 bytes of the base pointer. |
| %define o_base iadst4_dconly2a + 128 |
| %define o(x) (rax - (o_base) + (x)) |
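| ; e.g. [o(pw_2048)] assembles to [rax-(o_base)+pw_2048], which fits in |
| ; base+disp8 form after the "lea rax, [o_base]" in INV_TXFM_FN, as long |
| ; as the constant lies within -128..+127 bytes of o_base. |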
| |
| %macro REPX 2-* |
| %xdefine %%f(x) %1 |
| %rep %0 - 1 |
| %rotate 1 |
| %%f(%1) |
| %endrep |
| %endmacro |
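| ; e.g. "REPX {psrad x, 12}, m1, m2, m0, m5" expands to: |
| ;   psrad m1, 12 |
| ;   psrad m2, 12 |
| ;   psrad m0, 12 |
| ;   psrad m5, 12 |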
| |
| %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) |
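| ; e.g. with private_prefix=dav1d and SUFFIX=_avx2 (set by INIT_XMM/INIT_YMM |
| ; avx2), m(idct_4x4_internal) refers to mangle(dav1d_idct_4x4_internal_avx2) |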
| |
| ; flags: 1 = swap, 2 = interleave, 4 = coef_regs |
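| ; For each packed input word pair {a, b} this computes (with flags=0): |
| ;   dst.lo = (a * coef1 + b * coef2 + rnd) >> 12 |
| ;   dst.hi = (b * coef1 - a * coef2 + rnd) >> 12 |
| ; (swap exchanges which result lands in each half; interleave leaves the |
| ; two results word-interleaved instead of packed into separate halves) |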
| %macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags |
| %if %7 & 4 |
| pmaddwd m%2, m%5, m%1 |
| pmaddwd m%1, m%6 |
| %else |
| %if %7 & 1 |
| vpbroadcastd m%2, [o(pw_%5_%6)] |
| vpbroadcastd m%3, [o(pw_m%6_%5)] |
| %else |
| vpbroadcastd m%2, [o(pw_m%6_%5)] |
| vpbroadcastd m%3, [o(pw_%5_%6)] |
| %endif |
| pmaddwd m%2, m%1 |
| pmaddwd m%1, m%3 |
| %endif |
| paddd m%2, m%4 |
| paddd m%1, m%4 |
| %if %7 & 2 |
| pslld m%2, 4 |
| psrld m%1, 12 |
| pblendw m%1, m%2, 0xaa |
| %else |
| psrad m%2, 12 |
| psrad m%1, 12 |
| packssdw m%1, m%2 |
| %endif |
| %endmacro |
| |
| ; flags: 1 = swap, 2 = interleave, 4 = coef_regs |
| %macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags |
| %if %10 & 1 |
| vpbroadcastd m%3, [o(pw_%8_%9)] |
| vpbroadcastd m%4, [o(pw_m%9_%8)] |
| vpbroadcastd xm%2, [o(pw_%6_%7)] |
| vpblendd m%2, m%2, m%3, 0xf0 |
| vpbroadcastd xm%3, [o(pw_m%7_%6)] |
| %else |
| vpbroadcastd m%3, [o(pw_m%9_%8)] |
| vpbroadcastd m%4, [o(pw_%8_%9)] |
| vpbroadcastd xm%2, [o(pw_m%7_%6)] |
| vpblendd m%2, m%2, m%3, 0xf0 |
| vpbroadcastd xm%3, [o(pw_%6_%7)] |
| %endif |
| vpblendd m%3, m%3, m%4, 0xf0 |
| ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) |
| %endmacro |
| |
| ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 |
| ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 |
| %macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 |
| punpckhwd m%3, m%2, m%1 |
| punpcklwd m%2, m%1 |
| %if %7 < 32 |
| pmaddwd m%1, m%7, m%2 |
| pmaddwd m%4, m%7, m%3 |
| %else |
| vpbroadcastd m%1, [o(pw_m%7_%6)] |
| pmaddwd m%4, m%3, m%1 |
| pmaddwd m%1, m%2 |
| %endif |
| paddd m%4, m%5 |
| paddd m%1, m%5 |
| psrad m%4, 12 |
| psrad m%1, 12 |
| packssdw m%1, m%4 |
| %if %7 < 32 |
| pmaddwd m%3, m%6 |
| pmaddwd m%2, m%6 |
| %else |
| vpbroadcastd m%4, [o(pw_%6_%7)] |
| pmaddwd m%3, m%4 |
| pmaddwd m%2, m%4 |
| %endif |
| paddd m%3, m%5 |
| paddd m%2, m%5 |
| psrad m%3, 12 |
| psrad m%2, 12 |
| %if %0 == 8 |
| packssdw m%8, m%2, m%3 |
| %else |
| packssdw m%2, m%3 |
| %endif |
| %endmacro |
| |
| %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 |
| ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 |
| ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 |
| psubsw m%3, m%1, m%2 |
| paddsw m%2, m%1 |
| paddsw m%1, m%4, m%5 |
| psubsw m%4, m%5 |
| %endmacro |
| |
| %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 |
| ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a |
| ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a |
| ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 |
| paddsw m%9, m%2, m%6 ; t4 |
| psubsw m%2, m%6 ; t5a |
| paddsw m%10, m%8, m%4 ; t7 |
| psubsw m%8, m%4 ; t6a |
| ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 |
| ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 |
| psubsw m%6, m%1, m%3 ; dct4 out2 |
| paddsw m%3, m%1 ; dct4 out1 |
| paddsw m%1, m%5, m%7 ; dct4 out0 |
| psubsw m%5, m%7 ; dct4 out3 |
| psubsw m%7, m%3, m%2 ; out6 |
| paddsw m%2, m%3 ; out1 |
| paddsw m%3, m%6, m%8 ; out2 |
| psubsw m%6, m%8 ; out5 |
| psubsw m%8, m%1, m%10 ; out7 |
| paddsw m%1, m%10 ; out0 |
| paddsw m%4, m%5, m%9 ; out3 |
| psubsw m%5, m%9 ; out4 |
| %endmacro |
| |
| ; in1 = %1, in3 = %2, in5 = %3, in7 = %4 |
| ; in9 = %5, in11 = %6, in13 = %7, in15 = %8 |
| %macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 |
| ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a |
| ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a |
| ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a |
| ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a |
| psubsw m%9, m%2, m%6 ; t13 |
| paddsw m%6, m%2 ; t12 |
| psubsw m%2, m%8, m%4 ; t14 |
| paddsw m%8, m%4 ; t15 |
| psubsw m%4, m%7, m%3 ; t10 |
| paddsw m%3, m%7 ; t11 |
| psubsw m%7, m%1, m%5 ; t9 |
| paddsw m%1, m%5 ; t8 |
| ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a |
| ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a |
| psubsw m%5, m%1, m%3 ; t11a |
| paddsw m%1, m%3 ; t8a |
| psubsw m%3, m%7, m%4 ; t13 |
| paddsw m%7, m%4 ; t14 |
| psubsw m%4, m%8, m%6 ; t12a |
| paddsw m%8, m%6 ; t15a |
| psubsw m%6, m%2, m%9 ; t10 |
| paddsw m%2, m%9 ; t9 |
| ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a |
| ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 |
| %endmacro |
| |
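| ; Execute the given instruction/macro with 128-bit XMM registers, then |
| ; switch back to 256-bit YMM registers. |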
| %macro WRAP_XMM 1+ |
| INIT_XMM cpuname |
| %1 |
| INIT_YMM cpuname |
| %endmacro |
| |
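| ; Add the four 4-pixel output rows packed in m0/m1 to destination rows |
| ; %1-%4 (rounding via pw_%5 first unless %5 is 0) and store with clipping. |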
| %macro ITX4_END 4-5 2048 ; row[1-4], rnd |
| %if %5 |
| vpbroadcastd m2, [o(pw_%5)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| %endif |
| lea r2, [dstq+strideq*2] |
| %assign %%i 1 |
| %rep 4 |
| %if %1 & 2 |
| CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) |
| %else |
| CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) |
| %endif |
| %assign %%i %%i + 1 |
| %rotate 1 |
| %endrep |
| movd m2, [%%row_adr1] |
| pinsrd m2, [%%row_adr2], 1 |
| movd m3, [%%row_adr3] |
| pinsrd m3, [%%row_adr4], 1 |
| pmovzxbw m2, m2 |
| pmovzxbw m3, m3 |
| paddw m0, m2 |
| paddw m1, m3 |
| packuswb m0, m1 |
| movd [%%row_adr1], m0 |
| pextrd [%%row_adr2], m0, 1 |
| pextrd [%%row_adr3], m0, 2 |
| pextrd [%%row_adr4], m0, 3 |
| ret |
| %endmacro |
| |
| %macro IWHT4_1D_PACKED 0 |
| punpckhqdq m3, m0, m1 ; in1 in3 |
| punpcklqdq m0, m1 ; in0 in2 |
| psubw m2, m0, m3 |
| paddw m0, m3 |
| punpckhqdq m2, m2 ; t2 t2 |
| punpcklqdq m0, m0 ; t0 t0 |
| psubw m1, m0, m2 |
| psraw m1, 1 |
| psubw m1, m3 ; t1 t3 |
| psubw m0, m1 ; ____ out0 |
| paddw m2, m1 ; out3 ____ |
| %endmacro |
| |
| INIT_XMM avx2 |
| cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| pxor m2, m2 |
| mova [cq+16*0], m2 |
| mova [cq+16*1], m2 |
| psraw m0, 2 |
| psraw m1, 2 |
| IWHT4_1D_PACKED |
| punpckhwd m0, m1 |
| punpcklwd m3, m1, m2 |
| punpckhdq m1, m0, m3 |
| punpckldq m0, m3 |
| IWHT4_1D_PACKED |
| vpblendd m0, m0, m2, 0x03 |
| ITX4_END 3, 0, 2, 1, 0 |
| |
| %macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size |
| cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2 |
| %undef cmp |
| %define %%p1 m(i%1_%4_internal) |
| lea rax, [o_base] |
| ; If we're not taking the fast path, jump to the 1st txfm function, which |
| ; in turn performs an indirect jump to the 2nd txfm function via tx2q. |
| lea tx2q, [m(i%2_%4_internal).pass2] |
| %if %3 > 0 |
| cmp eobd, %3 |
| jg %%p1 |
| %elif %3 == 0 |
| test eobd, eobd |
| jnz %%p1 |
| %else |
| ; jump to the 1st txfm function unless it's located directly after this |
| times ((%%end - %%p1) >> 31) & 1 jmp %%p1 |
| ALIGN function_align |
| %%end: |
| %endif |
| %endmacro |
| |
| %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 4x4 |
| %ifidn %1_%2, dct_identity |
| vpbroadcastd m0, [o(pw_2896x8)] |
| pmulhrsw m0, [cq] |
| vpbroadcastd m1, [o(pw_1697x8)] |
| pmulhrsw m1, m0 |
| paddw m0, m1 |
| punpcklwd m0, m0 |
| punpckhdq m1, m0, m0 |
| punpckldq m0, m0 |
| jmp m(iadst_4x4_internal).end |
| %elifidn %1_%2, identity_dct |
| mova m0, [cq+16*0] |
| packusdw m0, [cq+16*1] |
| vpbroadcastd m1, [o(pw_1697x8)] |
| vpbroadcastd m2, [o(pw_2896x8)] |
| packusdw m0, m0 |
| pmulhrsw m1, m0 |
| paddw m0, m1 |
| pmulhrsw m0, m2 |
| mova m1, m0 |
| jmp m(iadst_4x4_internal).end |
| %elif %3 >= 0 |
| vpbroadcastw m0, [cq] |
| %ifidn %1, dct |
| vpbroadcastd m1, [o(pw_2896x8)] |
| pmulhrsw m0, m1 |
| %elifidn %1, adst |
| movddup m1, [o(iadst4_dconly1a)] |
| pmulhrsw m0, m1 |
| %elifidn %1, flipadst |
| movddup m1, [o(iadst4_dconly1b)] |
| pmulhrsw m0, m1 |
| %endif |
| mov [cq], eobd ; 0 |
| %ifidn %2, dct |
| %ifnidn %1, dct |
| vpbroadcastd m1, [o(pw_2896x8)] |
| %endif |
| pmulhrsw m0, m1 |
| mova m1, m0 |
| jmp m(iadst_4x4_internal).end2 |
| %else ; adst / flipadst |
| pmulhrsw m1, m0, [o(iadst4_dconly2b)] |
| pmulhrsw m0, [o(iadst4_dconly2a)] |
| jmp m(i%2_4x4_internal).end2 |
| %endif |
| %endif |
| %endmacro |
| |
| %macro IDCT4_1D_PACKED 0 |
| vpbroadcastd m4, [o(pd_2048)] |
| punpckhwd m2, m1, m0 |
| punpcklwd m1, m0 |
| ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 |
| ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 |
| paddsw m0, m1, m2 ; out0 out1 |
| psubsw m1, m2 ; out3 out2 |
| %endmacro |
| |
| %macro IADST4_1D_PACKED 0 |
| punpcklwd m2, m1, m0 |
| punpckhwd m3, m1, m0 |
| vpbroadcastd m5, [o(pw_m3344_3344)] |
| vpbroadcastd m0, [o(pw_3803_1321)] |
| vpbroadcastd m4, [o(pw_m1321_2482)] |
| pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 |
| psrld m5, 16 |
| pmaddwd m0, m2 |
| pmaddwd m2, m4 |
| pmaddwd m5, m3 ; 3344*in0 |
| paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 |
| vpbroadcastd m4, [o(pw_2482_3344)] |
| vpbroadcastd m5, [o(pw_m3803_3344)] |
| pmaddwd m4, m3 |
| pmaddwd m5, m3 |
| paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 |
| vpbroadcastd m0, [o(pw_m3803_m6688)] |
| pmaddwd m3, m0 |
| vpbroadcastd m0, [o(pd_2048)] |
| paddd m2, m0 |
| paddd m1, m0 |
| paddd m0, m4 |
| paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 |
| paddd m2, m4 |
| paddd m2, m3 |
| REPX {psrad x, 12}, m1, m2, m0, m5 |
| packssdw m0, m5 ; out0 out1 |
| packssdw m1, m2 ; out2 out3 |
| %endmacro |
| |
| INV_TXFM_4X4_FN dct, dct, 0 |
| INV_TXFM_4X4_FN dct, adst, 0 |
| INV_TXFM_4X4_FN dct, flipadst, 0 |
| INV_TXFM_4X4_FN dct, identity, 3 |
| |
| cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| IDCT4_1D_PACKED |
| mova m2, [o(deint_shuf)] |
| shufps m3, m0, m1, q1331 |
| shufps m0, m0, m1, q0220 |
| pshufb m0, m2 |
| pshufb m1, m3, m2 |
| jmp tx2q |
| .pass2: |
| IDCT4_1D_PACKED |
| pxor m2, m2 |
| mova [cq+16*0], m2 |
| mova [cq+16*1], m2 |
| ITX4_END 0, 1, 3, 2 |
| |
| INV_TXFM_4X4_FN adst, dct, 0 |
| INV_TXFM_4X4_FN adst, adst, 0 |
| INV_TXFM_4X4_FN adst, flipadst, 0 |
| INV_TXFM_4X4_FN adst, identity |
| |
| cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| call .main |
| punpckhwd m3, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| jmp tx2q |
| .pass2: |
| call .main |
| .end: |
| pxor m2, m2 |
| mova [cq+16*0], m2 |
| mova [cq+16*1], m2 |
| .end2: |
| ITX4_END 0, 1, 2, 3 |
| ALIGN function_align |
| .main: |
| IADST4_1D_PACKED |
| ret |
| |
| INV_TXFM_4X4_FN flipadst, dct, 0 |
| INV_TXFM_4X4_FN flipadst, adst, 0 |
| INV_TXFM_4X4_FN flipadst, flipadst, 0 |
| INV_TXFM_4X4_FN flipadst, identity |
| |
| cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| call m(iadst_4x4_internal).main |
| punpcklwd m2, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| jmp tx2q |
| .pass2: |
| call m(iadst_4x4_internal).main |
| .end: |
| pxor m2, m2 |
| mova [cq+16*0], m2 |
| mova [cq+16*1], m2 |
| .end2: |
| ITX4_END 3, 2, 1, 0 |
| |
| INV_TXFM_4X4_FN identity, dct, 3 |
| INV_TXFM_4X4_FN identity, adst |
| INV_TXFM_4X4_FN identity, flipadst |
| INV_TXFM_4X4_FN identity, identity |
| |
| cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| vpbroadcastd m3, [o(pw_1697x8)] |
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| paddw m0, m2 |
| paddw m1, m3 |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m3, [o(pw_1697x8)] |
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| paddw m0, m2 |
| paddw m1, m3 |
| jmp m(iadst_4x4_internal).end |
| |
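| ; Add two registers of packed residuals to eight 4-pixel rows (four |
| ; starting at dstq, four starting at r2) and store the clipped results. |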
| %macro WRITE_4X8 2 ; coefs[1-2] |
| movd xm4, [dstq+strideq*0] |
| pinsrd xm4, [dstq+strideq*1], 1 |
| movd xm5, [dstq+strideq*2] |
| pinsrd xm5, [dstq+r3 ], 1 |
| pinsrd xm4, [r2 +strideq*0], 2 |
| pinsrd xm4, [r2 +strideq*1], 3 |
| pinsrd xm5, [r2 +strideq*2], 2 |
| pinsrd xm5, [r2 +r3 ], 3 |
| pmovzxbw m4, xm4 |
| pmovzxbw m5, xm5 |
| paddw m4, m%1 |
| paddw m5, m%2 |
| packuswb m4, m5 |
| vextracti128 xm5, m4, 1 |
| movd [dstq+strideq*0], xm4 |
| pextrd [dstq+strideq*1], xm4, 1 |
| pextrd [dstq+strideq*2], xm4, 2 |
| pextrd [dstq+r3 ], xm4, 3 |
| movd [r2 +strideq*0], xm5 |
| pextrd [r2 +strideq*1], xm5, 1 |
| pextrd [r2 +strideq*2], xm5, 2 |
| pextrd [r2 +r3 ], xm5, 3 |
| %endmacro |
| |
| %macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 4x8 |
| %if %3 >= 0 |
| %ifidn %1_%2, dct_identity |
| vpbroadcastd xm0, [o(pw_2896x8)] |
| pmulhrsw xm1, xm0, [cq] |
| vpbroadcastd xm2, [o(pw_4096)] |
| pmulhrsw xm1, xm0 |
| pmulhrsw xm1, xm2 |
| vpermq m1, m1, q1100 |
| punpcklwd m1, m1 |
| punpckldq m0, m1, m1 |
| punpckhdq m1, m1 |
| jmp m(iadst_4x8_internal).end3 |
| %elifidn %1_%2, identity_dct |
| movd xm0, [cq+16*0] |
| punpcklwd xm0, [cq+16*1] |
| movd xm1, [cq+16*2] |
| punpcklwd xm1, [cq+16*3] |
| vpbroadcastd xm2, [o(pw_2896x8)] |
| vpbroadcastd xm3, [o(pw_1697x8)] |
| vpbroadcastd xm4, [o(pw_2048)] |
| punpckldq xm0, xm1 |
| pmulhrsw xm0, xm2 |
| pmulhrsw xm3, xm0 |
| paddw xm0, xm3 |
| pmulhrsw xm0, xm2 |
| pmulhrsw xm0, xm4 |
| vpbroadcastq m0, xm0 |
| mova m1, m0 |
| jmp m(iadst_4x8_internal).end3 |
| %elifidn %1_%2, dct_dct |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_2048)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| mova m1, m0 |
| jmp m(iadst_4x8_internal).end4 |
| %else ; adst_dct / flipadst_dct |
| vpbroadcastw xm0, [cq] |
| vpbroadcastd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, [o(iadst4_dconly1a)] |
| vpbroadcastd xm2, [o(pw_2048)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| %ifidn %1, adst |
| vpbroadcastq m0, xm0 |
| %else ; flipadst |
| vpermq m0, m0, q1111 |
| %endif |
| mova m1, m0 |
| jmp m(iadst_4x8_internal).end4 |
| %endif |
| %endif |
| %endmacro |
| |
| %macro IDCT8_1D_PACKED 0 |
| vpbroadcastd m6, [o(pd_2048)] |
| punpckhwd m5, m3, m0 ; in7 in1 |
| punpckhwd m4, m1, m2 ; in3 in5 |
| punpcklwd m3, m1 ; in6 in2 |
| punpcklwd m2, m0 ; in4 in0 |
| ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a |
| ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a |
| ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 |
| psubsw m0, m5, m4 ; t5a t6a (interleaved) |
| paddsw m4, m5 ; t4 t7 (interleaved) |
| ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 |
| vpbroadcastd m1, [o(pw_m2896_2896)] |
| ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 |
| %if mmsize > 16 |
| vbroadcasti128 m1, [o(deint_shuf)] |
| pshufb m4, m1 |
| %else |
| pshufb m4, [o(deint_shuf)] |
| %endif |
| psubsw m1, m2, m3 ; tmp3 tmp2 |
| paddsw m3, m2 ; tmp0 tmp1 |
| shufps m2, m4, m0, q1032 ; t7 t6 |
| vpblendd m4, m0, 0xcc ; t4 t5 |
| paddsw m0, m3, m2 ; out0 out1 |
| psubsw m3, m2 ; out7 out6 |
| psubsw m2, m1, m4 ; out4 out5 |
| paddsw m1, m4 ; out3 out2 |
| %endmacro |
| |
| %macro IADST8_1D_PACKED 1 ; pass |
| vpbroadcastd m6, [o(pd_2048)] |
| punpckhwd m0, m4, m3 ; 0 7 |
| punpckhwd m1, m5, m2 ; 2 5 |
| punpcklwd m2, m5 ; 4 3 |
| punpcklwd m3, m4 ; 6 1 |
| %if %1 == 1 |
| ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a |
| ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a |
| ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a |
| ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a |
| psubsw m4, m0, m2 ; t5 t4 |
| paddsw m0, m2 ; t1 t0 |
| psubsw m5, m1, m3 ; t6 t7 |
| paddsw m1, m3 ; t2 t3 |
| ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a |
| ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a |
| %if mmsize > 16 |
| vbroadcasti128 m2, [o(deint_shuf)] |
| %else |
| mova m2, [o(deint_shuf)] |
| %endif |
| pshuflw m1, m1, q2301 |
| pshufhw m1, m1, q2301 |
| psubsw m3, m0, m1 ; t3 t2 |
| paddsw m0, m1 ; -out7 out0 |
| psubsw m1, m4, m5 ; t7 t6 |
| paddsw m4, m5 ; out6 -out1 |
| pshufb m0, m2 |
| pshufb m4, m2 |
| vpbroadcastd m5, [o(pw_m2896_2896)] |
| pmaddwd m2, m5, m3 |
| pmaddwd m5, m1 |
| paddd m2, m6 |
| paddd m5, m6 |
| psrad m2, 12 |
| psrad m5, 12 |
| packssdw m2, m5 ; out4 -out5 |
| vpbroadcastd m5, [o(pw_2896_2896)] |
| pmaddwd m3, m5 |
| pmaddwd m1, m5 |
| paddd m3, m6 |
| paddd m1, m6 |
| psrad m3, 12 |
| psrad m1, 12 |
| packssdw m1, m3 ; out2 -out3 |
| punpcklqdq m3, m4, m0 ; out6 -out7 |
| punpckhqdq m0, m4 ; out0 -out1 |
| %else |
| ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a |
| ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a |
| ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a |
| ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a |
| psubsw m4, m0, m2 ; t4 t5 |
| paddsw m0, m2 ; t0 t1 |
| psubsw m5, m1, m3 ; t6 t7 |
| paddsw m1, m3 ; t2 t3 |
| shufps m2, m5, m4, q1032 |
| punpckhwd m4, m2 |
| punpcklwd m5, m2 |
| ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a |
| ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a |
| psubsw m2, m0, m1 ; t2 t3 |
| paddsw m0, m1 ; out0 -out7 |
| psubsw m1, m4, m5 ; t7 t6 |
| paddsw m4, m5 ; out6 -out1 |
| vpbroadcastd m5, [o(pw_2896x8)] |
| vpblendd m3, m0, m4, 0x33 ; out6 -out7 |
| vpblendd m0, m0, m4, 0xcc ; out0 -out1 |
| shufps m4, m2, m1, q1032 ; t3 t7 |
| vpblendd m1, m2, m1, 0xcc ; t2 t6 |
| psubsw m2, m1, m4 ; t2-t3 t6-t7 |
| paddsw m1, m4 ; t2+t3 t6+t7 |
| pmulhrsw m2, m5 ; out4 -out5 |
| pshufd m1, m1, q1032 |
| pmulhrsw m1, m5 ; out2 -out3 |
| %endif |
| %endmacro |
| |
| INIT_YMM avx2 |
| INV_TXFM_4X8_FN dct, dct, 0 |
| INV_TXFM_4X8_FN dct, identity, 7 |
| INV_TXFM_4X8_FN dct, adst |
| INV_TXFM_4X8_FN dct, flipadst |
| |
| cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q3120 |
| vpermq m1, [cq+32*1], q3120 |
| vpbroadcastd m2, [o(pw_2896x8)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| IDCT4_1D_PACKED |
| vbroadcasti128 m2, [o(deint_shuf)] |
| shufps m3, m0, m1, q1331 |
| shufps m0, m0, m1, q0220 |
| pshufb m0, m2 |
| pshufb m1, m3, m2 |
| jmp tx2q |
| .pass2: |
| vextracti128 xm2, m0, 1 |
| vextracti128 xm3, m1, 1 |
| call .main |
| vpbroadcastd m4, [o(pw_2048)] |
| vinserti128 m0, m0, xm2, 1 |
| vinserti128 m1, m1, xm3, 1 |
| pshufd m1, m1, q1032 |
| jmp m(iadst_4x8_internal).end2 |
| ALIGN function_align |
| .main: |
| WRAP_XMM IDCT8_1D_PACKED |
| ret |
| |
| INV_TXFM_4X8_FN adst, dct, 0 |
| INV_TXFM_4X8_FN adst, adst |
| INV_TXFM_4X8_FN adst, flipadst |
| INV_TXFM_4X8_FN adst, identity |
| |
| cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q3120 |
| vpermq m1, [cq+32*1], q3120 |
| vpbroadcastd m2, [o(pw_2896x8)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| call m(iadst_8x4_internal).main |
| punpckhwd m3, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| jmp tx2q |
| .pass2: |
| vextracti128 xm2, m0, 1 |
| vextracti128 xm3, m1, 1 |
| pshufd xm4, xm0, q1032 |
| pshufd xm5, xm1, q1032 |
| call .main_pass2 |
| vpbroadcastd m4, [o(pw_2048)] |
| vinserti128 m0, m0, xm2, 1 |
| vinserti128 m1, m1, xm3, 1 |
| pxor m5, m5 |
| psubw m5, m4 |
| .end: |
| vpblendd m4, m4, m5, 0xcc |
| .end2: |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| WIN64_RESTORE_XMM |
| .end3: |
| pxor m2, m2 |
| mova [cq+32*0], m2 |
| mova [cq+32*1], m2 |
| .end4: |
| lea r2, [dstq+strideq*4] |
| lea r3, [strideq*3] |
| WRITE_4X8 0, 1 |
| RET |
| ALIGN function_align |
| .main_pass1: |
| WRAP_XMM IADST8_1D_PACKED 1 |
| ret |
| ALIGN function_align |
| .main_pass2: |
| WRAP_XMM IADST8_1D_PACKED 2 |
| ret |
| |
| INV_TXFM_4X8_FN flipadst, dct, 0 |
| INV_TXFM_4X8_FN flipadst, adst |
| INV_TXFM_4X8_FN flipadst, flipadst |
| INV_TXFM_4X8_FN flipadst, identity |
| |
| cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q3120 |
| vpermq m1, [cq+32*1], q3120 |
| vpbroadcastd m2, [o(pw_2896x8)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| call m(iadst_8x4_internal).main |
| punpcklwd m3, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m1, m3 |
| punpckhwd m1, m3 |
| jmp tx2q |
| .pass2: |
| vextracti128 xm2, m0, 1 |
| vextracti128 xm3, m1, 1 |
| pshufd xm4, xm0, q1032 |
| pshufd xm5, xm1, q1032 |
| call m(iadst_4x8_internal).main_pass2 |
| vpbroadcastd m5, [o(pw_2048)] |
| vinserti128 m3, m3, xm1, 1 |
| vinserti128 m2, m2, xm0, 1 |
| pxor m4, m4 |
| psubw m4, m5 |
| pshufd m0, m3, q1032 |
| pshufd m1, m2, q1032 |
| jmp m(iadst_4x8_internal).end |
| |
| INV_TXFM_4X8_FN identity, dct, 3 |
| INV_TXFM_4X8_FN identity, adst |
| INV_TXFM_4X8_FN identity, flipadst |
| INV_TXFM_4X8_FN identity, identity |
| |
| cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpermq m2, [cq+32*0], q3120 |
| vpermq m0, [cq+32*1], q3120 |
| vpbroadcastd m3, [o(pw_2896x8)] |
| vpbroadcastd m4, [o(pw_1697x8)] |
| punpcklwd m1, m2, m0 |
| punpckhwd m2, m0 |
| pmulhrsw m1, m3 |
| pmulhrsw m2, m3 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| pmulhrsw m2, m4, m0 |
| pmulhrsw m4, m1 |
| paddw m0, m2 |
| paddw m1, m4 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m4, [o(pw_4096)] |
| jmp m(iadst_4x8_internal).end2 |
| |
| %macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 4x16 |
| %if %3 >= 0 |
| %ifidn %1_%2, dct_identity |
| vpbroadcastd m0, [o(pw_2896x8)] |
| pmulhrsw m0, [cq] |
| vpbroadcastd m1, [o(pw_16384)] |
| vpbroadcastd m2, [o(pw_1697x16)] |
| vpbroadcastd m3, [o(pw_2048)] |
| pmulhrsw m0, m1 |
| pmulhrsw m2, m0 |
| paddw m0, m0 |
| paddw m0, m2 |
| pmulhrsw m3, m0 |
| punpcklwd m1, m3, m3 |
| punpckhwd m3, m3 |
| punpckldq m0, m1, m1 |
| punpckhdq m1, m1 |
| punpckldq m2, m3, m3 |
| punpckhdq m3, m3 |
| jmp m(iadst_4x16_internal).end3 |
| %elifidn %1_%2, identity_dct |
| movd xm0, [cq+32*0] |
| punpcklwd xm0, [cq+32*1] |
| movd xm1, [cq+32*2] |
| punpcklwd xm1, [cq+32*3] |
| vpbroadcastd xm2, [o(pw_1697x8)] |
| vpbroadcastd xm3, [o(pw_16384)] |
| vpbroadcastd xm4, [o(pw_2896x8)] |
| punpckldq xm0, xm1 |
| pmulhrsw xm2, xm0 |
| paddw xm0, xm2 |
| pmulhrsw xm0, xm3 |
| psrlw xm3, 3 ; pw_2048 |
| pmulhrsw xm0, xm4 |
| pmulhrsw xm0, xm3 |
| vpbroadcastq m0, xm0 |
| mova m1, m0 |
| mova m2, m0 |
| mova m3, m0 |
| jmp m(iadst_4x16_internal).end3 |
| %elifidn %1_%2, dct_dct |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_16384)] |
| movd xm3, [o(pw_2048)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm2 |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm3 |
| vpbroadcastw m0, xm0 |
| mova m1, m0 |
| mova m2, m0 |
| mova m3, m0 |
| jmp m(iadst_4x16_internal).end4 |
| %else ; adst_dct / flipadst_dct |
| vpbroadcastw xm0, [cq] |
| pmulhrsw xm0, [o(iadst4_dconly1a)] |
| vpbroadcastd xm1, [o(pw_16384)] |
| vpbroadcastd xm2, [o(pw_2896x8)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| psrlw xm1, 3 ; pw_2048 |
| pmulhrsw xm0, xm2 |
| pmulhrsw xm0, xm1 |
| %ifidn %1, adst |
| vpbroadcastq m0, xm0 |
| %else ; flipadst |
| vpermq m0, m0, q1111 |
| %endif |
| mova m1, m0 |
| mova m2, m0 |
| mova m3, m0 |
| jmp m(iadst_4x16_internal).end4 |
| %endif |
| %endif |
| %endmacro |
| |
| %macro IDCT16_1D_PACKED 0 |
| vpbroadcastd m10, [o(pd_2048)] |
| .main2: |
| punpckhwd m8, m7, m0 ; dct16 in15 in1 |
| punpcklwd m9, m4, m0 ; dct4 in2 in0 |
| punpckhwd m0, m3, m4 ; dct16 in7 in9 |
| punpcklwd m7, m1 ; dct8 in7 in1 |
| punpckhwd m1, m6 ; dct16 in3 in13 |
| punpcklwd m3, m5 ; dct8 in3 in5 |
| punpckhwd m5, m2 ; dct16 in11 in5 |
| punpcklwd m6, m2 ; dct4 in3 in1 |
| ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a |
| ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a |
| ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a |
| ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a |
| ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a |
| ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a |
| ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 |
| psubsw m2, m8, m0 ; t9 t14 |
| paddsw m8, m0 ; t8 t15 |
| psubsw m0, m1, m5 ; t10 t13 |
| paddsw m1, m5 ; t11 t12 |
| vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 |
| ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a |
| vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 |
| ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a |
| psubsw m4, m8, m1 ; t11a t12a |
| paddsw m8, m1 ; t8a t15a |
| psubsw m1, m7, m3 ; t5a t6a |
| paddsw m7, m3 ; t4 t7 |
| paddsw m3, m2, m0 ; t9 t14 |
| psubsw m2, m0 ; t10 t13 |
| %if mmsize > 16 |
| vbroadcasti128 m0, [o(deint_shuf)] |
| %else |
| mova m0, [o(deint_shuf)] |
| %endif |
| pshufb m8, m0 |
| pshufb m7, m0 |
| pshufb m3, m0 |
| ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 |
| vpbroadcastd m0, [o(pw_m2896_2896)] |
| ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 |
| vpbroadcastd m5, [o(pw_2896_2896)] |
| ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 |
| vpbroadcastd m0, [o(pw_m2896_2896)] |
| ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a |
| punpckhqdq m0, m8, m3 ; t15a t14 |
| punpcklqdq m8, m3 ; t8a t9 |
| shufps m5, m4, m2, q1032 ; t12 t13a |
| vpblendd m4, m2, 0xcc ; t11 t10a |
| shufps m2, m7, m1, q1032 ; t7 t6 |
| vpblendd m7, m1, 0xcc ; t4 t5 |
| psubsw m1, m9, m6 ; dct4 out3 out2 |
| paddsw m9, m6 ; dct4 out0 out1 |
| psubsw m3, m9, m2 ; dct8 out7 out6 |
| paddsw m9, m2 ; dct8 out0 out1 |
| psubsw m2, m1, m7 ; dct8 out4 out5 |
| paddsw m1, m7 ; dct8 out3 out2 |
| psubsw m7, m9, m0 ; out15 out14 |
| paddsw m0, m9 ; out0 out1 |
| psubsw m6, m1, m5 ; out12 out13 |
| paddsw m1, m5 ; out3 out2 |
| psubsw m5, m2, m4 ; out11 out10 |
| paddsw m2, m4 ; out4 out5 |
| psubsw m4, m3, m8 ; out8 out9 |
| paddsw m3, m8 ; out7 out6 |
| %endmacro |
| |
| INV_TXFM_4X16_FN dct, dct, 0 |
| INV_TXFM_4X16_FN dct, identity, 15 |
| INV_TXFM_4X16_FN dct, adst |
| INV_TXFM_4X16_FN dct, flipadst |
| |
| cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 |
| mova m0, [cq+32*0] |
| mova m1, [cq+32*1] |
| mova m2, [cq+32*2] |
| mova m3, [cq+32*3] |
| call m(idct_16x4_internal).main |
| vpbroadcastd m5, [o(pw_16384)] |
| punpckhwd m4, m2, m3 |
| punpcklwd m2, m3 |
| punpckhwd m3, m0, m1 |
| punpcklwd m0, m1 |
| REPX {pmulhrsw x, m5}, m0, m4, m2, m3 |
| punpckhdq m1, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m3, m4 |
| punpckhdq m3, m4 |
| jmp tx2q |
| .pass2: |
| vextracti128 xm4, m0, 1 |
| vextracti128 xm5, m1, 1 |
| vextracti128 xm6, m2, 1 |
| vextracti128 xm7, m3, 1 |
| call .main |
| vinserti128 m0, m0, xm4, 1 |
| vinserti128 m1, m1, xm5, 1 |
| vpbroadcastd m5, [o(pw_2048)] |
| vinserti128 m2, m2, xm6, 1 |
| vinserti128 m3, m3, xm7, 1 |
| pshufd m1, m1, q1032 |
| pshufd m3, m3, q1032 |
| jmp m(iadst_4x16_internal).end2 |
| ALIGN function_align |
| .main: |
| WRAP_XMM IDCT16_1D_PACKED |
| ret |
| |
| INV_TXFM_4X16_FN adst, dct, 0 |
| INV_TXFM_4X16_FN adst, adst |
| INV_TXFM_4X16_FN adst, flipadst |
| INV_TXFM_4X16_FN adst, identity |
| |
| cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 |
| mova m0, [cq+32*0] |
| mova m1, [cq+32*1] |
| mova m2, [cq+32*2] |
| mova m3, [cq+32*3] |
| call m(iadst_16x4_internal).main |
| vpbroadcastd m5, [o(pw_16384)] |
| punpckhwd m4, m2, m3 |
| punpcklwd m2, m3 |
| punpckhwd m3, m0, m1 |
| punpcklwd m0, m1 |
| REPX {pmulhrsw x, m5}, m4, m2, m3, m0 |
| punpckhdq m1, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m3, m4 |
| punpckhdq m3, m4 |
| jmp tx2q |
| .pass2: |
| call .main |
| vpbroadcastd m5, [o(pw_2896x8)] |
| paddsw m1, m2, m4 |
| psubsw m2, m4 |
| pmulhrsw m1, m5 ; -out7 out4 out6 -out5 |
| pmulhrsw m2, m5 ; out8 -out11 -out9 out10 |
| vpbroadcastd m5, [o(pw_2048)] |
| pshufd m1, m1, q1032 |
| vpblendd m4, m1, m0, 0x33 |
| vpblendd m0, m0, m2, 0x33 |
| vpblendd m2, m2, m3, 0x33 |
| vpblendd m3, m3, m1, 0x33 |
| vpermq m0, m0, q2031 |
| vpermq m1, m2, q1302 |
| vpermq m2, m3, q3120 |
| vpermq m3, m4, q0213 |
| psubw m6, m7, m5 |
| .end: |
| vpblendd m5, m5, m6, 0xcc |
| .end2: |
| REPX {pmulhrsw x, m5}, m0, m1, m2, m3 |
| WIN64_RESTORE_XMM |
| .end3: |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| mova [cq+32*2], m4 |
| mova [cq+32*3], m4 |
| .end4: |
| lea r2, [dstq+strideq*8] |
| lea r3, [strideq*3] |
| WRITE_4X8 0, 1 |
| lea dstq, [dstq+strideq*4] |
| lea r2, [r2 +strideq*4] |
| WRITE_4X8 2, 3 |
| RET |
| ALIGN function_align |
| .main: |
| vpblendd m4, m1, m0, 0xcc |
| vpblendd m1, m1, m0, 0x33 |
| vpblendd m5, m2, m3, 0xcc |
| vpblendd m2, m2, m3, 0x33 |
| vperm2i128 m3, m5, m2, 0x31 |
| vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 |
| vperm2i128 m4, m1, m4, 0x31 |
| vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 |
| pshufd m3, m3, q1032 ; in12 in15 in13 in14 |
| pshufd m2, m4, q1032 ; in11 in8 in9 in10 |
| .main2: |
| vpbroadcastd m8, [o(pd_2048)] |
| pxor m7, m7 |
| punpckhwd m4, m3, m0 ; in12 in3 in14 in1 |
| punpcklwd m0, m3 ; in0 in15 in2 in13 |
| punpckhwd m3, m2, m1 ; in8 in7 in10 in5 |
| punpcklwd m1, m2 ; in4 in11 in6 in9 |
| ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 |
| ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 |
| ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 |
| ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 |
| psubsw m2, m0, m3 ; t9a t8a t11a t10a |
| paddsw m0, m3 ; t1a t0a t3a t2a |
| psubsw m3, m1, m4 ; t13a t12a t15a t14a |
| paddsw m1, m4 ; t5a t4a t7a t6a |
| ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 |
| psubw m6, m7, m5 |
| ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 |
| vpbroadcastd m6, [o(pw_m3784_1567)] |
| vpbroadcastd m5, [o(pw_1567_3784)] |
| psubsw m4, m0, m1 ; t5 t4 t7 t6 |
| paddsw m0, m1 ; t1 t0 t3 t2 |
| psubsw m1, m2, m3 ; t13a t12a t15a t14a |
| paddsw m2, m3 ; t9a t8a t11a t10a |
| psubw m3, m7, m6 ; pw_3784_m1567 |
| vpblendd m6, m6, m3, 0xf0 |
| ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a |
| ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 |
| vbroadcasti128 m5, [o(deint_shuf)] |
| pshufb m0, m5 |
| pshufb m2, m5 |
| vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a |
| vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a |
| vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 |
| vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13 |
| pshufd m2, m2, q1032 ; t6a t7a t14 t15 |
| psubsw m1, m0, m3 ; t3a t2a t11 t10 |
| paddsw m0, m3 ; -out15 out0 out14 -out1 |
| paddsw m3, m4, m2 ; -out3 out12 out2 -out13 |
| psubsw m4, m2 ; t6 t7 t14a t15a |
| shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a |
| vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a |
| ret |
| ALIGN function_align |
| .main_pass1_end: |
| vpbroadcastd m5, [o(pw_m2896_2896)] |
| vpbroadcastd m6, [o(pw_2896_2896)] |
| punpcklwd m1, m4, m2 |
| punpckhwd m4, m2 |
| pmaddwd m2, m5, m4 |
| pmaddwd m4, m6 |
| pmaddwd m5, m1 |
| pmaddwd m1, m6 |
| REPX {paddd x, m8}, m5, m1, m2, m4 |
| REPX {psrad x, 12}, m5, m2, m1, m4 |
| packssdw m2, m5 ; -out11 out8 out10 -out9 |
| packssdw m1, m4 ; -out7 out4 out6 -out5 |
| ret |
| |
| INV_TXFM_4X16_FN flipadst, dct, 0 |
| INV_TXFM_4X16_FN flipadst, adst |
| INV_TXFM_4X16_FN flipadst, flipadst |
| INV_TXFM_4X16_FN flipadst, identity |
| |
| cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 |
| mova m0, [cq+32*0] |
| mova m1, [cq+32*1] |
| mova m2, [cq+32*2] |
| mova m3, [cq+32*3] |
| call m(iadst_16x4_internal).main |
| vpbroadcastd m5, [o(pw_16384)] |
| punpcklwd m4, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m3, m2 |
| punpckhwd m3, m2 |
| REPX {pmulhrsw x, m5}, m4, m1, m0, m3 |
| punpckldq m2, m3, m1 |
| punpckhdq m3, m1 |
| punpckhdq m1, m0, m4 |
| punpckldq m0, m4 |
| jmp tx2q |
| .pass2: |
| call m(iadst_4x16_internal).main |
| vpbroadcastd m5, [o(pw_2896x8)] |
| paddsw m1, m2, m4 |
| psubsw m2, m4 |
| pmulhrsw m1, m5 ; -out7 out4 out6 -out5 |
| pmulhrsw m2, m5 ; out8 -out11 -out9 out10 |
| vpbroadcastd m6, [o(pw_2048)] |
| pshufd m1, m1, q1032 |
| vpblendd m4, m0, m2, 0x33 |
| vpblendd m0, m0, m1, 0xcc |
| vpblendd m1, m1, m3, 0xcc |
| vpblendd m2, m2, m3, 0x33 |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q0213 |
| vpermq m2, m2, q2031 |
| vpermq m3, m4, q1302 |
| psubw m5, m7, m6 |
| jmp m(iadst_4x16_internal).end |
| |
| INV_TXFM_4X16_FN identity, dct, 3 |
| INV_TXFM_4X16_FN identity, adst |
| INV_TXFM_4X16_FN identity, flipadst |
| INV_TXFM_4X16_FN identity, identity |
| |
| cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 |
| mova m3, [cq+32*0] |
| mova m2, [cq+32*1] |
| mova m4, [cq+32*2] |
| mova m0, [cq+32*3] |
| vpbroadcastd m5, [o(pw_1697x8)] |
| punpcklwd m1, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m2, m4, m0 |
| punpckhwd m4, m0 |
| pmulhrsw m0, m5, m1 |
| pmulhrsw m6, m5, m2 |
| pmulhrsw m7, m5, m3 |
| pmulhrsw m5, m4 |
| paddw m1, m0 |
| paddw m2, m6 |
| paddw m3, m7 |
| paddw m4, m5 |
| vpbroadcastd m5, [o(pw_16384)] |
| punpckldq m0, m1, m2 |
| punpckhdq m1, m2 |
| punpckldq m2, m3, m4 |
| punpckhdq m3, m4 |
| REPX {pmulhrsw x, m5}, m0, m1, m2, m3 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m8, [o(pw_1697x16)] |
| vpbroadcastd m5, [o(pw_2048)] |
| pmulhrsw m4, m8, m0 |
| pmulhrsw m6, m8, m1 |
| pmulhrsw m7, m8, m2 |
| pmulhrsw m8, m3 |
| REPX {paddw x, x}, m0, m1, m2, m3 |
| paddw m0, m4 |
| paddw m1, m6 |
| paddw m2, m7 |
| paddw m3, m8 |
| jmp m(iadst_4x16_internal).end2 |
| |
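| ; Add two registers of packed residuals to four 8-pixel rows at dstq |
| ; (row offsets 0, %5, %6, %7) and store the clipped results. |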
| %macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] |
| movq xm%3, [dstq ] |
| movhps xm%3, [dstq+%5] |
| movq xm%4, [dstq+%6] |
| movhps xm%4, [dstq+%7] |
| pmovzxbw m%3, xm%3 |
| pmovzxbw m%4, xm%4 |
| %ifnum %1 |
| paddw m%3, m%1 |
| %else |
| paddw m%3, %1 |
| %endif |
| %ifnum %2 |
| paddw m%4, m%2 |
| %else |
| paddw m%4, %2 |
| %endif |
| packuswb m%3, m%4 |
| vextracti128 xm%4, m%3, 1 |
| movq [dstq ], xm%3 |
| movhps [dstq+%6], xm%3 |
| movq [dstq+%5], xm%4 |
| movhps [dstq+%7], xm%4 |
| %endmacro |
| |
| %macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 8x4 |
| %if %3 >= 0 |
| %ifidn %1_%2, dct_identity |
| vpbroadcastd xm0, [o(pw_2896x8)] |
| pmulhrsw xm1, xm0, [cq] |
| vpbroadcastd xm2, [o(pw_1697x8)] |
| vpbroadcastd xm3, [o(pw_2048)] |
| pmulhrsw xm1, xm0 |
| pmulhrsw xm2, xm1 |
| paddw xm1, xm2 |
| pmulhrsw xm1, xm3 |
| punpcklwd xm1, xm1 |
| punpckldq xm0, xm1, xm1 |
| punpckhdq xm1, xm1 |
| vpermq m0, m0, q1100 |
| vpermq m1, m1, q1100 |
| %elifidn %1_%2, identity_dct |
| mova xm0, [cq+16*0] |
| packusdw xm0, [cq+16*1] |
| mova xm1, [cq+16*2] |
| packusdw xm1, [cq+16*3] |
| vpbroadcastd xm2, [o(pw_2896x8)] |
| vpbroadcastd xm3, [o(pw_2048)] |
| packusdw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| paddw xm0, xm0 |
| pmulhrsw xm0, xm2 |
| pmulhrsw xm0, xm3 |
| vinserti128 m0, m0, xm0, 1 |
| mova m1, m0 |
| %else |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| pmulhrsw xm0, xm1 |
| %ifidn %2, dct |
| movd xm2, [o(pw_2048)] |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| mova m1, m0 |
| %else ; adst / flipadst |
| vpbroadcastw m0, xm0 |
| pmulhrsw m0, [o(iadst4_dconly2a)] |
| vpbroadcastd m1, [o(pw_2048)] |
| pmulhrsw m1, m0 |
| %ifidn %2, adst |
| vpermq m0, m1, q1100 |
| vpermq m1, m1, q3322 |
| %else ; flipadst |
| vpermq m0, m1, q2233 |
| vpermq m1, m1, q0011 |
| %endif |
| %endif |
| %endif |
| jmp m(iadst_8x4_internal).end3 |
| %endif |
| %endmacro |
| |
| INV_TXFM_8X4_FN dct, dct, 0 |
| INV_TXFM_8X4_FN dct, adst, 0 |
| INV_TXFM_8X4_FN dct, flipadst, 0 |
| INV_TXFM_8X4_FN dct, identity, 3 |
| |
| cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpbroadcastd xm3, [o(pw_2896x8)] |
| pmulhrsw xm0, xm3, [cq+16*0] |
| pmulhrsw xm1, xm3, [cq+16*1] |
| pmulhrsw xm2, xm3, [cq+16*2] |
| pmulhrsw xm3, [cq+16*3] |
| call m(idct_4x8_internal).main |
| vbroadcasti128 m4, [o(deint_shuf)] |
| vinserti128 m3, m1, xm3, 1 |
| vinserti128 m1, m0, xm2, 1 |
| shufps m0, m1, m3, q0220 |
| shufps m1, m1, m3, q1331 |
| pshufb m0, m4 |
| pshufb m1, m4 |
| jmp tx2q |
| .pass2: |
| IDCT4_1D_PACKED |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| jmp m(iadst_8x4_internal).end2 |
| |
| INV_TXFM_8X4_FN adst, dct |
| INV_TXFM_8X4_FN adst, adst |
| INV_TXFM_8X4_FN adst, flipadst |
| INV_TXFM_8X4_FN adst, identity |
| |
| cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpbroadcastd xm0, [o(pw_2896x8)] |
| pshufd xm4, [cq+16*0], q1032 |
| pmulhrsw xm3, xm0, [cq+16*3] |
| pshufd xm5, [cq+16*1], q1032 |
| pmulhrsw xm2, xm0, [cq+16*2] |
| pmulhrsw xm4, xm0 |
| pmulhrsw xm5, xm0 |
| call m(iadst_4x8_internal).main_pass1 |
| vinserti128 m0, m0, xm2, 1 |
| vinserti128 m1, m1, xm3, 1 |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| pxor m3, m3 |
| psubw m3, m2 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| jmp tx2q |
| .pass2: |
| call .main |
| .end: |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q3120 |
| .end2: |
| vpbroadcastd m2, [o(pw_2048)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| WIN64_RESTORE_XMM |
| .end3: |
| pxor m2, m2 |
| mova [cq+32*0], m2 |
| mova [cq+32*1], m2 |
| lea r3, [strideq*3] |
| WRITE_8X4 0, 1, 4, 5 |
| RET |
| ALIGN function_align |
| .main: |
| IADST4_1D_PACKED |
| ret |
| |
| INV_TXFM_8X4_FN flipadst, dct |
| INV_TXFM_8X4_FN flipadst, adst |
| INV_TXFM_8X4_FN flipadst, flipadst |
| INV_TXFM_8X4_FN flipadst, identity |
| |
| cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpbroadcastd xm0, [o(pw_2896x8)] |
| pshufd xm4, [cq+16*0], q1032 |
| pmulhrsw xm3, xm0, [cq+16*3] |
| pshufd xm5, [cq+16*1], q1032 |
| pmulhrsw xm2, xm0, [cq+16*2] |
| pmulhrsw xm4, xm0 |
| pmulhrsw xm5, xm0 |
| call m(iadst_4x8_internal).main_pass1 |
| vinserti128 m3, m3, xm1, 1 |
| vinserti128 m2, m2, xm0, 1 |
| punpckhwd m1, m3, m2 |
| punpcklwd m3, m2 |
| pxor m0, m0 |
| psubw m0, m1 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| jmp tx2q |
| .pass2: |
| call m(iadst_8x4_internal).main |
| mova m2, m1 |
| vpermq m1, m0, q2031 |
| vpermq m0, m2, q2031 |
| jmp m(iadst_8x4_internal).end2 |
| |
| INV_TXFM_8X4_FN identity, dct, 7 |
| INV_TXFM_8X4_FN identity, adst |
| INV_TXFM_8X4_FN identity, flipadst |
| INV_TXFM_8X4_FN identity, identity |
| |
| cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| mova xm2, [cq+16*0] |
| mova xm0, [cq+16*1] |
| vinserti128 m2, m2, [cq+16*2], 1 |
| vinserti128 m0, m0, [cq+16*3], 1 |
| vpbroadcastd m3, [o(pw_2896x8)] |
| punpcklwd m1, m2, m0 |
| punpckhwd m2, m0 |
| pmulhrsw m1, m3 |
| pmulhrsw m2, m3 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| paddw m0, m0 |
| paddw m1, m1 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m3, [o(pw_1697x8)] |
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| paddw m0, m2 |
| paddw m1, m3 |
| jmp m(iadst_8x4_internal).end |
| |
| %macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 8x8 |
| %ifidn %1_%2, dct_identity |
| vpbroadcastd xm0, [o(pw_2896x8)] |
| pmulhrsw xm0, [cq] |
| vpbroadcastd xm1, [o(pw_16384)] |
| pmulhrsw xm0, xm1 |
| psrlw xm1, 2 ; pw_4096 |
| pmulhrsw xm0, xm1 |
| pshufb xm0, [o(deint_shuf)] |
| vpermq m3, m0, q1100 |
| punpcklwd m3, m3 |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| jmp m(iadst_8x8_internal).end4 |
| %elif %3 >= 0 |
| %ifidn %1, dct |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_16384)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm2 |
| psrlw xm2, 3 ; pw_2048 |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| .end: |
| mov r2d, 2 |
| .end2: |
| lea r3, [strideq*3] |
| .loop: |
| WRITE_8X4 0, 0, 1, 2 |
| lea dstq, [dstq+strideq*4] |
| dec r2d |
| jg .loop |
| RET |
| %else ; identity |
| mova m0, [cq+32*0] |
| punpcklwd m0, [cq+32*1] |
| mova m1, [cq+32*2] |
| punpcklwd m1, [cq+32*3] |
| vpbroadcastd m2, [o(pw_2896x8)] |
| vpbroadcastd m3, [o(pw_2048)] |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| mova [cq+32*2], m4 |
| mova [cq+32*3], m4 |
| punpckldq m0, m1 |
| vpermq m1, m0, q3232 |
| vpermq m0, m0, q1010 |
| punpcklwd m0, m1 |
| pmulhrsw m0, m2 |
| pmulhrsw m0, m3 |
| jmp m(inv_txfm_add_dct_dct_8x8).end |
| %endif |
| %endif |
| %endmacro |
| |
| INV_TXFM_8X8_FN dct, dct, 0 |
| INV_TXFM_8X8_FN dct, identity, 7 |
| INV_TXFM_8X8_FN dct, adst |
| INV_TXFM_8X8_FN dct, flipadst |
| |
| cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q3120 ; 0 1 |
| vpermq m3, [cq+32*3], q3120 ; 6 7 |
| vpermq m2, [cq+32*2], q3120 ; 4 5 |
| vpermq m1, [cq+32*1], q3120 ; 2 3 |
| call .main |
| shufps m4, m0, m1, q0220 |
| shufps m5, m0, m1, q1331 |
| shufps m1, m2, m3, q0220 |
| shufps m3, m2, m3, q1331 |
| vbroadcasti128 m0, [o(deint_shuf)] |
| vpbroadcastd m2, [o(pw_16384)] |
| REPX {pshufb x, m0}, m4, m5, m1, m3 |
| REPX {pmulhrsw x, m2}, m4, m5, m1, m3 |
| vinserti128 m0, m4, xm1, 1 |
| vperm2i128 m2, m4, m1, 0x31 |
| vinserti128 m1, m5, xm3, 1 |
| vperm2i128 m3, m5, m3, 0x31 |
| jmp tx2q |
| .pass2: |
| call .main |
| vpbroadcastd m4, [o(pw_2048)] |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| vpermq m2, m2, q3120 |
| vpermq m3, m3, q2031 |
| jmp m(iadst_8x8_internal).end2 |
| ALIGN function_align |
| .main: |
| IDCT8_1D_PACKED |
| ret |
| |
| INV_TXFM_8X8_FN adst, dct |
| INV_TXFM_8X8_FN adst, adst |
| INV_TXFM_8X8_FN adst, flipadst |
| INV_TXFM_8X8_FN adst, identity |
| |
| cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpermq m4, [cq+32*0], q1302 ; 1 0 |
| vpermq m3, [cq+32*3], q3120 ; 6 7 |
| vpermq m5, [cq+32*1], q1302 ; 3 2 |
| vpermq m2, [cq+32*2], q3120 ; 4 5 |
| call .main_pass1 |
| vpbroadcastd m5, [o(pw_16384)] |
| punpcklwd m4, m0, m1 |
| punpckhwd m0, m1 |
| punpcklwd m1, m2, m3 |
| punpckhwd m2, m3 |
| pxor m3, m3 |
| psubw m3, m5 ; negate odd elements during rounding |
| pmulhrsw m4, m5 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m5 |
| pmulhrsw m2, m3 |
| punpcklwd m3, m4, m0 |
| punpckhwd m4, m0 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| vperm2i128 m2, m3, m0, 0x31 |
| vinserti128 m0, m3, xm0, 1 |
| vperm2i128 m3, m4, m1, 0x31 |
| vinserti128 m1, m4, xm1, 1 |
| jmp tx2q |
| .pass2: |
| pshufd m4, m0, q1032 |
| pshufd m5, m1, q1032 |
| call .main_pass2 |
| vpbroadcastd m5, [o(pw_2048)] |
| vpbroadcastd xm4, [o(pw_4096)] |
| psubw m4, m5 ; lower half = 2048, upper half = -2048 |
| .end: |
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3 |
| .end2: |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| .end3: |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m4 |
| WIN64_RESTORE_XMM |
| .end4: |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| mova [cq+32*2], m4 |
| mova [cq+32*3], m4 |
| lea r3, [strideq*3] |
| WRITE_8X4 0, 1, 4, 5 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 2, 3, 4, 5 |
| RET |
| ALIGN function_align |
| .main_pass1: |
| IADST8_1D_PACKED 1 |
| ret |
| ALIGN function_align |
| .main_pass2: |
| IADST8_1D_PACKED 2 |
| ret |
| |
| INV_TXFM_8X8_FN flipadst, dct |
| INV_TXFM_8X8_FN flipadst, adst |
| INV_TXFM_8X8_FN flipadst, flipadst |
| INV_TXFM_8X8_FN flipadst, identity |
| |
| cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| vpermq m4, [cq+32*0], q1302 ; 1 0 |
| vpermq m3, [cq+32*3], q3120 ; 6 7 |
| vpermq m5, [cq+32*1], q1302 ; 3 2 |
| vpermq m2, [cq+32*2], q3120 ; 4 5 |
| call m(iadst_8x8_internal).main_pass1 |
| vpbroadcastd m5, [o(pw_16384)] |
| punpckhwd m4, m3, m2 |
| punpcklwd m3, m2 |
| punpckhwd m2, m1, m0 |
| punpcklwd m1, m0 |
| pxor m0, m0 |
| psubw m0, m5 |
| pmulhrsw m4, m0 |
| pmulhrsw m3, m5 |
| pmulhrsw m2, m0 |
| pmulhrsw m1, m5 |
| punpckhwd m0, m4, m3 |
| punpcklwd m4, m3 |
| punpckhwd m3, m2, m1 |
| punpcklwd m2, m1 |
| vinserti128 m1, m0, xm3, 1 |
| vperm2i128 m3, m0, m3, 0x31 |
| vinserti128 m0, m4, xm2, 1 |
| vperm2i128 m2, m4, m2, 0x31 |
| jmp tx2q |
| .pass2: |
| pshufd m4, m0, q1032 |
| pshufd m5, m1, q1032 |
| call m(iadst_8x8_internal).main_pass2 |
| vpbroadcastd m4, [o(pw_2048)] |
| vpbroadcastd xm5, [o(pw_4096)] |
| psubw m4, m5 ; lower half = -2048, upper half = 2048 |
| vpermq m5, m3, q2031 |
| vpermq m3, m0, q2031 |
| vpermq m0, m2, q2031 |
| vpermq m2, m1, q2031 |
| pmulhrsw m1, m0, m4 |
| pmulhrsw m0, m5, m4 |
| jmp m(iadst_8x8_internal).end3 |
| |
| INV_TXFM_8X8_FN identity, dct, 7 |
| INV_TXFM_8X8_FN identity, adst |
| INV_TXFM_8X8_FN identity, flipadst |
| INV_TXFM_8X8_FN identity, identity |
| |
| cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 |
| mova xm3, [cq+16*0] |
| mova xm2, [cq+16*1] |
| vinserti128 m3, m3, [cq+16*4], 1 |
| vinserti128 m2, m2, [cq+16*5], 1 |
| mova xm4, [cq+16*2] |
| mova xm0, [cq+16*3] |
| vinserti128 m4, m4, [cq+16*6], 1 |
| vinserti128 m0, m0, [cq+16*7], 1 |
| punpcklwd m1, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m2, m4, m0 |
| punpckhwd m4, m0 |
| punpckldq m0, m1, m2 |
| punpckhdq m1, m2 |
| punpckldq m2, m3, m4 |
| punpckhdq m3, m4 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m4, [o(pw_4096)] |
| jmp m(iadst_8x8_internal).end |
| |
| %macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 8x16 |
| %ifidn %1_%2, dct_dct |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_16384)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| psrlw xm2, 3 ; pw_2048 |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| mov r2d, 4 |
| jmp m(inv_txfm_add_dct_dct_8x8).end2 |
| %elifidn %1_%2, dct_identity |
| WIN64_SPILL_XMM 13 |
| vpbroadcastd m0, [o(pw_2896x8)] |
| pmulhrsw m7, m0, [cq] |
| vpbroadcastd m1, [o(pw_16384)] |
| vpbroadcastd m2, [o(pw_1697x16)] |
| pxor m3, m3 |
| mova [cq], m3 |
| pmulhrsw m7, m0 |
| pmulhrsw m7, m1 |
| psrlw m1, 3 ; pw_2048 |
| pmulhrsw m2, m7 |
| paddw m7, m7 |
| paddw m7, m2 |
| pmulhrsw m7, m1 |
| punpcklwd m5, m7, m7 |
| punpckhwd m7, m7 |
| punpcklwd m4, m5, m5 |
| punpckhwd m5, m5 |
| punpcklwd m6, m7, m7 |
| punpckhwd m7, m7 |
| vpermq m0, m4, q1100 |
| vpermq m1, m5, q1100 |
| vpermq m2, m6, q1100 |
| vpermq m3, m7, q1100 |
| vpermq m4, m4, q3322 |
| vpermq m5, m5, q3322 |
| vpermq m6, m6, q3322 |
| vpermq m7, m7, q3322 |
| jmp m(idct_8x16_internal).end4 |
| %elifidn %1_%2, identity_dct |
| movd xm0, [cq+32*0] |
| punpcklwd xm0, [cq+32*1] |
| movd xm2, [cq+32*2] |
| punpcklwd xm2, [cq+32*3] |
| add cq, 32*4 |
| movd xm1, [cq+32*0] |
| punpcklwd xm1, [cq+32*1] |
| movd xm3, [cq+32*2] |
| punpcklwd xm3, [cq+32*3] |
| vpbroadcastd xm4, [o(pw_2896x8)] |
| vpbroadcastd xm5, [o(pw_2048)] |
| xor eax, eax |
| mov [cq-32*4], eax |
| mov [cq-32*3], eax |
| mov [cq-32*2], eax |
| mov [cq-32*1], eax |
| punpckldq xm0, xm2 |
| punpckldq xm1, xm3 |
| punpcklqdq xm0, xm1 |
| pmulhrsw xm0, xm4 |
| pmulhrsw xm0, xm4 |
| pmulhrsw xm0, xm5 |
| mov [cq+32*0], eax |
| mov [cq+32*1], eax |
| mov [cq+32*2], eax |
| mov [cq+32*3], eax |
| vinserti128 m0, m0, xm0, 1 |
| mov r2d, 4 |
| jmp m(inv_txfm_add_dct_dct_8x8).end2 |
| %endif |
| %endmacro |
| |
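| ; Load all 16 rows of coefficients into m0-m7 (two rows per register), |
| ; pre-scaled by 2896/4096 (~1/sqrt(2)) for the rectangular transforms. |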
| %macro ITX_8X16_LOAD_COEFS 0 |
| vpbroadcastd m4, [o(pw_2896x8)] |
| pmulhrsw m0, m4, [cq+32*0] |
| add cq, 32*4 |
| pmulhrsw m7, m4, [cq+32*3] |
| pmulhrsw m1, m4, [cq-32*3] |
| pmulhrsw m6, m4, [cq+32*2] |
| pmulhrsw m2, m4, [cq-32*2] |
| pmulhrsw m5, m4, [cq+32*1] |
| pmulhrsw m3, m4, [cq-32*1] |
| pmulhrsw m4, [cq+32*0] |
| %endmacro |
| |
| INV_TXFM_8X16_FN dct, dct, 0 |
| INV_TXFM_8X16_FN dct, identity, 15 |
| INV_TXFM_8X16_FN dct, adst |
| INV_TXFM_8X16_FN dct, flipadst |
| |
| cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 |
| ITX_8X16_LOAD_COEFS |
| call m(idct_16x8_internal).main |
| vpbroadcastd m10, [o(pw_16384)] |
| .pass1_end: |
| vperm2i128 m9, m3, m7, 0x31 |
| vinserti128 m3, m3, xm7, 1 |
| vperm2i128 m8, m2, m6, 0x31 |
| vinserti128 m2, m2, xm6, 1 |
| vperm2i128 m6, m1, m5, 0x31 |
| vinserti128 m1, m1, xm5, 1 |
| vperm2i128 m5, m0, m4, 0x31 |
| vinserti128 m0, m0, xm4, 1 |
| punpckhwd m4, m2, m3 |
| punpcklwd m2, m3 |
| punpckhwd m3, m0, m1 |
| punpcklwd m0, m1 |
| .pass1_end2: |
| punpckhwd m7, m5, m6 |
| punpcklwd m5, m6 |
| punpcklwd m6, m8, m9 |
| punpckhwd m8, m9 |
| REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 |
| punpckhdq m1, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m3, m4 |
| punpckhdq m3, m4 |
| punpckldq m4, m5, m6 |
| punpckhdq m5, m6 |
| punpckldq m6, m7, m8 |
| punpckhdq m7, m8 |
| jmp tx2q |
| .pass2: |
| call .main |
| REPX {vpermq x, x, q3120}, m0, m2, m4, m6 |
| REPX {vpermq x, x, q2031}, m1, m3, m5, m7 |
| .end: |
| vpbroadcastd m8, [o(pw_2048)] |
| .end2: |
| REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| .end3: |
| pxor m8, m8 |
| REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 |
| .end4: |
| lea r3, [strideq*3] |
| WRITE_8X4 0, 1, 8, 9 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 2, 3, 0, 1 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 4, 5, 0, 1 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 6, 7, 0, 1 |
| RET |
| ALIGN function_align |
| .main: |
| IDCT16_1D_PACKED |
| ret |
| |
| INV_TXFM_8X16_FN adst, dct |
| INV_TXFM_8X16_FN adst, adst |
| INV_TXFM_8X16_FN adst, flipadst |
| INV_TXFM_8X16_FN adst, identity |
| |
| cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 |
| ITX_8X16_LOAD_COEFS |
| call m(iadst_16x8_internal).main |
| call m(iadst_16x8_internal).main_pass1_end |
| vpbroadcastd m10, [o(pw_16384)] |
| pslld m9, m10, 17 |
| psubw m10, m9 ; 16384, -16384 |
| jmp m(idct_8x16_internal).pass1_end |
| ALIGN function_align |
| .pass2: |
| call .main |
| call .main_pass2_end |
| vpbroadcastd m9, [o(pw_2048)] |
| vpbroadcastd xm8, [o(pw_4096)] |
| psubw m8, m9 |
| REPX {vpermq x, x, q2031}, m0, m1, m2, m3 |
| REPX {vpermq x, x, q3120}, m4, m5, m6, m7 |
| jmp m(idct_8x16_internal).end2 |
| ALIGN function_align |
| .main: |
| REPX {pshufd x, x, q1032}, m7, m1, m5, m3 |
| .main2: |
| vpbroadcastd m10, [o(pd_2048)] |
| punpckhwd m8, m7, m0 ; in14 in1 |
| punpcklwd m0, m7 ; in0 in15 |
| punpcklwd m7, m6, m1 ; in12 in3 |
| punpckhwd m1, m6 ; in2 in13 |
| punpckhwd m6, m5, m2 ; in10 in5 |
| punpcklwd m2, m5 ; in4 in11 |
| punpcklwd m5, m4, m3 ; in8 in7 |
| punpckhwd m3, m4 ; in6 in9 |
| ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 |
| ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 |
| ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 |
| ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 |
| ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 |
| ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 |
| ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 |
| ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 |
| psubsw m4, m0, m5 ; t9a t8a |
| paddsw m0, m5 ; t1a t0a |
| psubsw m5, m1, m6 ; t11a t10a |
| paddsw m1, m6 ; t3a t2a |
| psubsw m6, m2, m7 ; t13a t12a |
| paddsw m2, m7 ; t5a t4a |
| psubsw m7, m3, m8 ; t15a t14a |
| paddsw m3, m8 ; t7a t6a |
| vpbroadcastd m11, [o(pw_m4017_799)] |
| vpbroadcastd m12, [o(pw_799_4017)] |
| pxor m9, m9 |
| ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 |
| psubw m8, m9, m11 ; pw_4017_m799 |
| ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 |
| vpbroadcastd m11, [o(pw_m2276_3406)] |
| vpbroadcastd m12, [o(pw_3406_2276)] |
| ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 |
| psubw m8, m9, m11 ; pw_2276_m3406 |
| ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 |
| psubsw m8, m1, m3 ; t7 t6 |
| paddsw m1, m3 ; t3 t2 |
| psubsw m3, m0, m2 ; t5 t4 |
| paddsw m0, m2 ; t1 t0 |
| psubsw m2, m5, m7 ; t14a t15a |
| paddsw m7, m5 ; t10a t11a |
| psubsw m5, m4, m6 ; t12a t13a |
| paddsw m4, m6 ; t8a t9a |
| vpbroadcastd m11, [o(pw_m3784_1567)] |
| vpbroadcastd m12, [o(pw_1567_3784)] |
| ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a |
| psubw m6, m9, m11 ; pw_3784_m1567 |
| ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a |
| vpbroadcastd m11, [o(pw_m1567_3784)] |
| vpbroadcastd m12, [o(pw_3784_1567)] |
| ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 |
| psubw m6, m9, m11 ; pw_1567_m3784 |
| ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 |
| vbroadcasti128 m12, [o(deint_shuf)] |
| paddsw m6, m4, m7 ; -out1 out14 |
| psubsw m4, m7 ; t10 t11 |
| psubsw m11, m3, m8 ; t7 t6 |
| paddsw m8, m3 ; out12 -out3 |
| psubsw m3, m0, m1 ; t3a t2a |
| paddsw m0, m1 ; -out15 out0 |
| paddsw m1, m2, m5 ; -out13 out2 |
| psubsw m5, m2 ; t15a t14a |
| pshufb m0, m12 |
| pshufb m6, m12 |
| pshufb m8, m12 |
| pshufb m1, m12 |
| shufps m7, m6, m0, q1032 ; out14 -out15 |
| vpblendd m0, m6, 0x33 ; -out1 out0 |
| punpcklqdq m6, m8, m1 ; out12 -out13 |
| punpckhqdq m1, m8, m1 ; -out3 out2 |
| ret |
| ALIGN function_align |
| .main_pass1_end: |
| vpbroadcastd m8, [o(pw_m2896_2896)] |
| vpbroadcastd m12, [o(pw_2896_2896)] |
| pmaddwd m9, m8, m11 ; -out11 |
| pmaddwd m2, m12, m5 ; -out5 |
| pmaddwd m5, m8 ; out10 |
| pmaddwd m11, m12 ; out4 |
| REPX {paddd x, m10}, m9, m5, m2, m11 |
| REPX {psrad x, 12 }, m9, m5, m2, m11 |
| packssdw m5, m9 ; out10 -out11 |
| packssdw m2, m11 ; -out5 out4 |
| pmaddwd m11, m8, m3 ; out8 |
| vpbroadcastd m8, [o(pw_2896_m2896)] |
| pmaddwd m3, m12 ; -out7 |
| pmaddwd m8, m4 ; -out9 |
| pmaddwd m4, m12 ; out6 |
| REPX {paddd x, m10}, m11, m3, m8, m4 |
| REPX {psrad x, 12 }, m11, m3, m8, m4 |
| packssdw m3, m4 ; -out7 out6 |
| packssdw m4, m11, m8 ; out8 -out9 |
| vpbroadcastd m10, [o(pw_16384)] |
| pxor m9, m9 |
| ret |
| ALIGN function_align |
| .main_pass2_end: |
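| ; Pass 2 output is clipped to pixels afterwards, so saturating 16-bit |
| ; math here gives the same result as the 32-bit intermediates used in |
| ; .main_pass1_end. |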
| vpbroadcastd m8, [o(pw_2896x8)] |
| pshufb m2, m11, m12 |
| pshufb m5, m12 |
| pshufb m3, m12 |
| pshufb m4, m12 |
| punpcklqdq m11, m5, m2 ; t15a t7 |
| punpckhqdq m5, m2 ; t14a t6 |
| shufps m2, m3, m4, q1032 ; t2a t10 |
| vpblendd m3, m4, 0xcc ; t3a t11 |
| psubsw m4, m2, m3 ; out8 -out9 |
| paddsw m3, m2 ; -out7 out6 |
| paddsw m2, m5, m11 ; -out5 out4 |
| psubsw m5, m11 ; out10 -out11 |
| REPX {pmulhrsw x, m8}, m2, m3, m4, m5 |
| ret |
| |
| INV_TXFM_8X16_FN flipadst, dct |
| INV_TXFM_8X16_FN flipadst, adst |
| INV_TXFM_8X16_FN flipadst, flipadst |
| INV_TXFM_8X16_FN flipadst, identity |
| |
| cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 |
| ITX_8X16_LOAD_COEFS |
| call m(iadst_16x8_internal).main |
| call m(iadst_16x8_internal).main_pass1_end |
| vpbroadcastd m9, [o(pw_16384)] |
| pslld m10, m9, 17 |
| psubw m10, m9 ; -16384, 16384 |
| vperm2i128 m9, m4, m0, 0x31 |
| vinserti128 m0, m4, xm0, 1 |
| vperm2i128 m8, m5, m1, 0x31 |
| vinserti128 m4, m5, xm1, 1 |
| vperm2i128 m5, m7, m3, 0x31 |
| vinserti128 m3, m7, xm3, 1 |
| vinserti128 m1, m6, xm2, 1 |
| vperm2i128 m6, m6, m2, 0x31 |
| punpcklwd m2, m4, m0 |
| punpckhwd m4, m0 |
| punpcklwd m0, m3, m1 |
| punpckhwd m3, m1 |
| jmp m(idct_8x16_internal).pass1_end2 |
| .pass2: |
| call m(iadst_8x16_internal).main |
| call m(iadst_8x16_internal).main_pass2_end |
| vpbroadcastd m8, [o(pw_2048)] |
| vpbroadcastd xm9, [o(pw_4096)] |
| psubw m8, m9 |
| vpermq m9, m0, q3120 |
| vpermq m0, m7, q2031 |
| vpermq m7, m1, q3120 |
| vpermq m1, m6, q2031 |
| vpermq m6, m2, q3120 |
| vpermq m2, m5, q2031 |
| vpermq m5, m3, q3120 |
| vpermq m3, m4, q2031 |
| pmulhrsw m0, m8 |
| pmulhrsw m1, m8 |
| pmulhrsw m2, m8 |
| pmulhrsw m3, m8 |
| pmulhrsw m4, m5, m8 |
| pmulhrsw m5, m6, m8 |
| pmulhrsw m6, m7, m8 |
| pmulhrsw m7, m9, m8 |
| jmp m(idct_8x16_internal).end3 |
| |
| INV_TXFM_8X16_FN identity, dct, 7 |
| INV_TXFM_8X16_FN identity, adst |
| INV_TXFM_8X16_FN identity, flipadst |
| INV_TXFM_8X16_FN identity, identity |
| |
| %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] |
| pmulhrsw m%2, m%3, m%1 |
| %if %0 == 4 ; if we're going to downshift by 1, doing so here eliminates the paddw |
| pmulhrsw m%2, m%4 |
| %else |
| paddw m%1, m%1 |
| %endif |
| paddw m%1, m%2 |
| %endmacro |
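| ; Worked out, with pmulhrsw computing (a*b + 2^14) >> 15: the 3-argument |
| ; form gives 2*x + ((x*1697*16 + 2^14) >> 15) ~= x*(2 + 1697/2048), i.e. |
| ; the 2*sqrt(2) scale of the 16-point identity transform. The 4-argument |
| ; form instead halves the product with an extra pmulhrsw by pw_16384, |
| ; producing the same result already downshifted by 1. |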
| |
| cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 |
| mova xm3, [cq+16*0] |
| mova xm2, [cq+16*2] |
| add cq, 16*8 |
| vinserti128 m3, m3, [cq+16*0], 1 |
| vinserti128 m2, m2, [cq+16*2], 1 |
| vpbroadcastd m9, [o(pw_2896x8)] |
| mova xm4, [cq-16*4] |
| mova xm5, [cq-16*2] |
| vinserti128 m4, m4, [cq+16*4], 1 |
| vinserti128 m5, m5, [cq+16*6], 1 |
| mova xm7, [cq-16*7] |
| mova xm6, [cq-16*5] |
| vinserti128 m7, m7, [cq+16*1], 1 |
| vinserti128 m6, m6, [cq+16*3], 1 |
| mova xm8, [cq-16*3] |
| mova xm0, [cq-16*1] |
| vinserti128 m8, m8, [cq+16*5], 1 |
| vinserti128 m0, m0, [cq+16*7], 1 |
| punpcklwd m1, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m2, m4, m5 |
| punpckhwd m4, m5 |
| punpcklwd m5, m7, m6 |
| punpckhwd m7, m6 |
| punpcklwd m6, m8, m0 |
| punpckhwd m8, m0 |
| REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8 |
| punpckldq m0, m1, m2 |
| punpckhdq m1, m2 |
| punpckldq m2, m3, m4 |
| punpckhdq m3, m4 |
| punpckldq m4, m5, m6 |
| punpckhdq m5, m6 |
| punpckldq m6, m7, m8 |
| punpckhdq m7, m8 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m8, [o(pw_1697x16)] |
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 |
| jmp m(idct_8x16_internal).end |
| |
| %macro WRITE_16X2 6 ; coefs[1-2] (reg# or mem), tmp[1-2], offset[1-2] |
| pmovzxbw m%3, [dstq+%5] |
| %ifnum %1 |
| paddw m%3, m%1 |
| %else |
| paddw m%3, %1 |
| %endif |
| pmovzxbw m%4, [dstq+%6] |
| %ifnum %2 |
| paddw m%4, m%2 |
| %else |
| paddw m%4, %2 |
| %endif |
| packuswb m%3, m%4 |
| vpermq m%3, m%3, q3120 |
| mova [dstq+%5], xm%3 |
| vextracti128 [dstq+%6], m%3, 1 |
| %endmacro |
| |
| %macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 16x4 |
| %if %3 >= 0 |
| %ifidn %1_%2, dct_identity |
| vpbroadcastd xm3, [o(pw_2896x8)] |
| pmulhrsw xm3, [cq] |
| vpbroadcastd xm0, [o(pw_16384)] |
| vpbroadcastd xm1, [o(pw_1697x8)] |
| pmulhrsw xm3, xm0 |
| psrlw xm0, 3 ; pw_2048 |
| pmulhrsw xm1, xm3 |
| paddw xm3, xm1 |
| pmulhrsw xm3, xm0 |
| punpcklwd xm3, xm3 |
| punpckldq xm1, xm3, xm3 |
| punpckhdq xm3, xm3 |
| vpbroadcastq m0, xm1 |
| vpermq m1, m1, q1111 |
| vpbroadcastq m2, xm3 |
| vpermq m3, m3, q1111 |
| jmp m(iadst_16x4_internal).end2 |
| %elifidn %1_%2, identity_dct |
| mova xm0, [cq+16*0] |
| mova xm2, [cq+16*1] |
| vinserti128 m0, m0, [cq+16*4], 1 |
| vinserti128 m2, m2, [cq+16*5], 1 |
| mova xm1, [cq+16*2] |
| mova xm3, [cq+16*3] |
| vinserti128 m1, m1, [cq+16*6], 1 |
| vinserti128 m3, m3, [cq+16*7], 1 |
| vpbroadcastd m4, [o(pw_1697x16)] |
| vpbroadcastd m5, [o(pw_16384)] |
| packusdw m0, m2 |
| packusdw m1, m3 |
| packusdw m0, m1 |
| vpbroadcastd m1, [o(pw_2896x8)] |
| pmulhrsw m4, m0 |
| pmulhrsw m4, m5 |
| paddw m0, m4 |
| psrlw m5, 3 ; pw_2048 |
| pmulhrsw m0, m1 |
| pmulhrsw m0, m5 |
| mov r3d, 2 |
| .end: |
| pxor m3, m3 |
| .end_loop: |
| mova [cq+32*0], m3 |
| mova [cq+32*1], m3 |
| add cq, 32*2 |
| WRITE_16X2 0, 0, 1, 2, strideq*0, strideq*1 |
| lea dstq, [dstq+strideq*2] |
| dec r3d |
| jg .end_loop |
| RET |
| %else |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| %ifidn %2, dct |
| movd xm2, [o(pw_16384)] |
| mov [cq], eobd |
| mov r2d, 2 |
| .dconly: |
| pmulhrsw xm0, xm2 |
| movd xm2, [pw_2048] ; intentionally rip-relative |
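| ; (rip-relative because .dconly is also entered by jmp from the 16x8 and |
| ;  16x16 dc-only paths; the 16x16 functions load rax with a different |
| ;  o_base, so an o() load would resolve to the wrong address there) |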
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| pxor m3, m3 |
| .dconly_loop: |
| mova xm1, [dstq] |
| vinserti128 m1, m1, [dstq+strideq], 1 |
| punpckhbw m2, m1, m3 |
| punpcklbw m1, m3 |
| paddw m2, m0 |
| paddw m1, m0 |
| packuswb m1, m2 |
| mova [dstq], xm1 |
| vextracti128 [dstq+strideq], m1, 1 |
| lea dstq, [dstq+strideq*2] |
| dec r2d |
| jg .dconly_loop |
| RET |
| %else ; adst / flipadst |
| movd xm2, [o(pw_16384)] |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| pmulhrsw m0, [o(iadst4_dconly2a)] |
| vpbroadcastd m3, [o(pw_2048)] |
| mov [cq], eobd |
| pmulhrsw m3, m0 |
| %ifidn %2, adst |
| vpbroadcastq m0, xm3 |
| vpermq m1, m3, q1111 |
| vpermq m2, m3, q2222 |
| vpermq m3, m3, q3333 |
| %else ; flipadst |
| vpermq m0, m3, q3333 |
| vpermq m1, m3, q2222 |
| vpermq m2, m3, q1111 |
| vpbroadcastq m3, xm3 |
| %endif |
| jmp m(iadst_16x4_internal).end3 |
| %endif |
| %endif |
| %endif |
| %endmacro |
| |
| INV_TXFM_16X4_FN dct, dct, 0 |
| INV_TXFM_16X4_FN dct, adst, 0 |
| INV_TXFM_16X4_FN dct, flipadst, 0 |
| INV_TXFM_16X4_FN dct, identity, 3 |
| |
| cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 |
| mova xm0, [cq+16*0] |
| mova xm1, [cq+16*1] |
| mova xm2, [cq+16*2] |
| mova xm3, [cq+16*3] |
| mova xm4, [cq+16*4] |
| mova xm5, [cq+16*5] |
| mova xm6, [cq+16*6] |
| mova xm7, [cq+16*7] |
| call m(idct_4x16_internal).main |
| vinserti128 m6, m2, xm6, 1 |
| vinserti128 m2, m0, xm4, 1 |
| vinserti128 m0, m1, xm5, 1 |
| vinserti128 m1, m3, xm7, 1 |
| punpcklwd m3, m2, m6 |
| punpckhwd m2, m6 |
| vpbroadcastd m6, [o(pw_16384)] |
| punpckhwd m4, m0, m1 |
| punpcklwd m0, m1 |
| mova m1, m6 |
| jmp m(iadst_16x4_internal).pass1_end |
| .pass2: |
| call .main |
| jmp m(iadst_16x4_internal).end |
| ALIGN function_align |
| .main: |
| vpbroadcastd m6, [o(pd_2048)] |
| IDCT4_1D 0, 1, 2, 3, 4, 5, 6 |
| ret |
| |
| INV_TXFM_16X4_FN adst, dct |
| INV_TXFM_16X4_FN adst, adst |
| INV_TXFM_16X4_FN adst, flipadst |
| INV_TXFM_16X4_FN adst, identity |
| |
| cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q1230 |
| vpermq m3, [cq+32*3], q2103 |
| vpermq m1, [cq+32*1], q1230 |
| vpermq m2, [cq+32*2], q2103 |
| call m(iadst_4x16_internal).main2 |
| call m(iadst_4x16_internal).main_pass1_end |
| punpcklwd m4, m3, m1 |
| punpcklwd m5, m2, m0 |
| punpckhwd m0, m1 |
| punpckhwd m2, m3 |
| vpbroadcastd m1, [o(pw_16384)] |
| vinserti128 m3, m0, xm2, 1 |
| vperm2i128 m2, m0, m2, 0x31 |
| vinserti128 m0, m4, xm5, 1 |
| vperm2i128 m4, m4, m5, 0x31 |
| psubw m6, m7, m1 |
| .pass1_end: |
| pmulhrsw m3, m1 |
| pmulhrsw m2, m6 |
| pmulhrsw m4, m1 |
| pmulhrsw m0, m6 |
| punpcklwd m1, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m2, m4, m0 |
| punpckhwd m4, m0 |
| punpckldq m0, m1, m2 |
| punpckhdq m1, m2 |
| punpckldq m2, m3, m4 |
| punpckhdq m3, m4 |
| jmp tx2q |
| .pass2: |
| call .main |
| .end: |
| vpbroadcastd m4, [o(pw_2048)] |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| WIN64_RESTORE_XMM |
| .end2: |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| mova [cq+32*2], m4 |
| mova [cq+32*3], m4 |
| .end3: |
| WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 |
| lea dstq, [dstq+strideq*2] |
| WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 |
| RET |
| ALIGN function_align |
| .main: |
| vpbroadcastd m6, [o(pw_m3344_3344)] |
| vpbroadcastd m7, [o(pw_3803_1321)] |
| vpbroadcastd m8, [o(pw_m1321_2482)] |
| vpbroadcastd m9, [o(pw_2482_3344)] |
| punpcklwd m4, m2, m0 ; in2 in0 l |
| punpckhwd m2, m0 ; in2 in0 h |
| psrld m5, m6, 16 |
| pmaddwd m10, m6, m4 ; t2:02 l |
| pmaddwd m6, m2 ; t2:02 h |
| pmaddwd m0, m7, m4 ; t0:02 l |
| pmaddwd m7, m2 ; t0:02 h |
| pmaddwd m4, m8 ; t1:02 l |
| pmaddwd m8, m2 ; t1:02 h |
| punpckhwd m2, m3, m1 ; in3 in1 h |
| punpcklwd m3, m1 ; in3 in1 l |
| pmaddwd m1, m5, m2 ; t2:3 h |
| pmaddwd m5, m3 ; t2:3 l |
| paddd m6, m1 |
| vpbroadcastd m1, [o(pd_2048)] |
| paddd m10, m5 |
| pmaddwd m5, m9, m3 |
| pmaddwd m9, m2 |
| paddd m0, m1 |
| paddd m7, m1 |
| paddd m0, m5 ; t0 + t3 + 2048 l |
| paddd m7, m9 ; t0 + t3 + 2048 h |
| vpbroadcastd m9, [o(pw_m3803_3344)] |
| pmaddwd m5, m9, m2 |
| pmaddwd m9, m3 |
| paddd m10, m1 ; t2 + 2048 l |
| paddd m6, m1 ; t2 + 2048 h |
| paddd m5, m1 ; t1:13 + 2048 h |
| paddd m1, m9 ; t1:13 + 2048 l |
| vpbroadcastd m9, [o(pw_m3803_m6688)] |
| pmaddwd m2, m9 |
| pmaddwd m3, m9 |
| paddd m5, m8 ; t1 + t3 + 2048 h |
| paddd m1, m4 ; t1 + t3 + 2048 l |
| paddd m8, m7 |
| paddd m4, m0 |
| paddd m2, m8 ; t0 + t1 - t3 + 2048 h |
| paddd m3, m4 ; t0 + t1 - t3 + 2048 l |
| REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3 |
| packssdw m0, m7 |
| packssdw m1, m5 |
| packssdw m3, m2 |
| packssdw m2, m10, m6 |
| ret |
| |
| INV_TXFM_16X4_FN flipadst, dct |
| INV_TXFM_16X4_FN flipadst, adst |
| INV_TXFM_16X4_FN flipadst, flipadst |
| INV_TXFM_16X4_FN flipadst, identity |
| |
| cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q1230 |
| vpermq m3, [cq+32*3], q2103 |
| vpermq m1, [cq+32*1], q1230 |
| vpermq m2, [cq+32*2], q2103 |
| call m(iadst_4x16_internal).main2 |
| call m(iadst_4x16_internal).main_pass1_end |
| punpckhwd m4, m3, m2 |
| punpckhwd m5, m1, m0 |
| punpcklwd m0, m2 |
| punpcklwd m1, m3 |
| vpbroadcastd m6, [o(pw_16384)] |
| vinserti128 m3, m0, xm1, 1 |
| vperm2i128 m2, m0, m1, 0x31 |
| vinserti128 m0, m4, xm5, 1 |
| vperm2i128 m4, m4, m5, 0x31 |
| psubw m1, m7, m6 |
| jmp m(iadst_16x4_internal).pass1_end |
| ALIGN function_align |
| .pass2: |
| call m(iadst_16x4_internal).main |
| vpbroadcastd m4, [o(pw_2048)] |
| REPX {pmulhrsw x, m4}, m3, m2, m1, m0 |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| mova [cq+32*2], m4 |
| mova [cq+32*3], m4 |
| WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 |
| lea dstq, [dstq+strideq*2] |
| WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 |
| RET |
| |
| INV_TXFM_16X4_FN identity, dct, 15 |
| INV_TXFM_16X4_FN identity, adst |
| INV_TXFM_16X4_FN identity, flipadst |
| INV_TXFM_16X4_FN identity, identity |
| |
| cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 |
| mova xm2, [cq+16*0] |
| mova xm4, [cq+16*1] |
| vinserti128 m2, m2, [cq+16*4], 1 |
| vinserti128 m4, m4, [cq+16*5], 1 |
| mova xm0, [cq+16*2] |
| mova xm1, [cq+16*3] |
| vinserti128 m0, m0, [cq+16*6], 1 |
| vinserti128 m1, m1, [cq+16*7], 1 |
| vpbroadcastd m7, [o(pw_1697x16)] |
| vpbroadcastd m8, [o(pw_16384)] |
| punpcklwd m3, m2, m4 |
| punpckhwd m2, m4 |
| punpcklwd m4, m0, m1 |
| punpckhwd m0, m1 |
| punpcklwd m1, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m2, m4, m0 |
| punpckhwd m4, m0 |
| pmulhrsw m0, m7, m1 |
| pmulhrsw m5, m7, m2 |
| pmulhrsw m6, m7, m3 |
| pmulhrsw m7, m4 |
| REPX {pmulhrsw x, m8}, m0, m5, m6, m7 |
| paddw m1, m0 |
| paddw m2, m5 |
| paddw m3, m6 |
| paddw m4, m7 |
| punpcklqdq m0, m1, m2 |
| punpckhqdq m1, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m7, [o(pw_1697x8)] |
| pmulhrsw m4, m7, m0 |
| pmulhrsw m5, m7, m1 |
| pmulhrsw m6, m7, m2 |
| pmulhrsw m7, m3 |
| paddw m0, m4 |
| paddw m1, m5 |
| paddw m2, m6 |
| paddw m3, m7 |
| jmp m(iadst_16x4_internal).end |
| |
| %macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 16x8 |
| %ifidn %1_%2, dct_dct |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_16384)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| mov r2d, 4 |
| jmp m(inv_txfm_add_dct_dct_16x4).dconly |
| %elifidn %1_%2, dct_identity |
| WIN64_SPILL_XMM 13 |
| vbroadcasti128 m7, [cq] |
| vpbroadcastd m0, [o(pw_2896x8)] |
| vpbroadcastd m1, [o(pw_16384)] |
| pxor xm2, xm2 |
| mova [cq], xm2 |
| pmulhrsw m7, m0 |
| pmulhrsw m7, m0 |
| pmulhrsw m7, m1 |
| psrlw m1, 2 ; pw_4096 |
| pmulhrsw m7, m1 |
| punpcklwd m3, m7, m7 |
| punpckhwd m7, m7 |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| pshufd m4, m7, q0000 |
| pshufd m5, m7, q1111 |
| pshufd m6, m7, q2222 |
| pshufd m7, m7, q3333 |
| lea r3, [strideq*3] |
| WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r3 |
| jmp m(idct_16x8_internal).end4 |
| %elifidn %1_%2, identity_dct |
| mova m0, [cq+32*0] |
| packusdw m0, [cq+32*1] |
| mova m2, [cq+32*2] |
| packusdw m2, [cq+32*3] |
| mova m1, [cq+32*4] |
| packusdw m1, [cq+32*5] |
| mova m3, [cq+32*6] |
| packusdw m3, [cq+32*7] |
| vpbroadcastd m4, [o(pw_2896x8)] |
| vpbroadcastd m5, [o(pw_1697x16)] |
| packusdw m0, m2 |
| packusdw m1, m3 |
| vpbroadcastd m2, [o(pw_16384)] |
| packusdw m0, m1 |
| vpermq m1, m0, q3322 |
| vpermq m0, m0, q1100 |
| punpcklwd m0, m1 |
| pmulhrsw m0, m4 |
| pmulhrsw m5, m0 |
| pmulhrsw m5, m2 |
| paddw m0, m5 |
| psrlw m2, 3 ; pw_2048 |
| pmulhrsw m0, m4 |
| pmulhrsw m0, m2 |
| mov r3d, 4 |
| jmp m(inv_txfm_add_identity_dct_16x4).end |
| %endif |
| %endmacro |
| |
| %macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd |
| vpbroadcastd m8, [o(pw_2896x8)] |
| vpermq m0, [cq+32*0], q3120 |
| add cq, 32*4 |
| vpermq m7, [cq+32*3], q%1 |
| vpermq m1, [cq-32*3], q%1 |
| vpermq m6, [cq+32*2], q3120 |
| vpermq m2, [cq-32*2], q3120 |
| vpermq m5, [cq+32*1], q%1 |
| vpermq m3, [cq-32*1], q%1 |
| vpermq m4, [cq+32*0], q3120 |
| REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 |
| %endmacro |
| |
| INV_TXFM_16X8_FN dct, dct, 0 |
| INV_TXFM_16X8_FN dct, identity, 7 |
| INV_TXFM_16X8_FN dct, adst |
| INV_TXFM_16X8_FN dct, flipadst |
| |
| cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 |
| ITX_16X8_LOAD_COEFS 3120 |
| call m(idct_8x16_internal).main |
| vpbroadcastd m10, [o(pw_16384)] |
| punpckhwd m8, m0, m2 |
| punpcklwd m0, m2 |
| punpckhwd m2, m1, m3 |
| punpcklwd m1, m3 |
| punpcklwd m9, m4, m6 |
| punpckhwd m4, m6 |
| punpcklwd m6, m5, m7 |
| punpckhwd m5, m7 |
| REPX {pmulhrsw x, m10}, m8, m1, m4, m6 |
| .pass1_end: |
| REPX {pmulhrsw x, m10}, m0, m2, m9, m5 |
| punpckhwd m3, m0, m8 |
| punpcklwd m0, m8 |
| punpckhwd m8, m2, m1 |
| punpcklwd m2, m1 |
| punpcklwd m7, m9, m4 |
| punpckhwd m9, m4 |
| punpcklwd m4, m5, m6 |
| punpckhwd m5, m6 |
| punpckhdq m1, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m3, m8 |
| punpckhdq m3, m8 |
| punpckldq m6, m7, m4 |
| punpckhdq m7, m4 |
| punpckldq m8, m9, m5 |
| punpckhdq m9, m5 |
| vperm2i128 m4, m0, m6, 0x31 |
| vinserti128 m0, m0, xm6, 1 |
| vperm2i128 m5, m1, m7, 0x31 |
| vinserti128 m1, m1, xm7, 1 |
| vperm2i128 m6, m2, m8, 0x31 |
| vinserti128 m2, m2, xm8, 1 |
| vperm2i128 m7, m3, m9, 0x31 |
| vinserti128 m3, m3, xm9, 1 |
| jmp tx2q |
| .pass2: |
| call .main |
| vpbroadcastd m8, [o(pw_2048)] |
| .end: |
| REPX {pmulhrsw x, m8}, m0, m2, m4, m6 |
| .end2: |
| REPX {pmulhrsw x, m8}, m1, m3, m5, m7 |
| lea r3, [strideq*3] |
| WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r3 |
| .end3: |
| pxor m0, m0 |
| REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 |
| .end4: |
| lea dstq, [dstq+strideq*4] |
| WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 0, 1, strideq*2, r3 |
| RET |
| ALIGN function_align |
| .main: |
| vpbroadcastd m10, [o(pd_2048)] |
| .main2: |
| IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 |
| ret |
| |
| INV_TXFM_16X8_FN adst, dct |
| INV_TXFM_16X8_FN adst, adst |
| INV_TXFM_16X8_FN adst, flipadst |
| INV_TXFM_16X8_FN adst, identity |
| |
| cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 |
| ITX_16X8_LOAD_COEFS 1302 |
| call m(iadst_8x16_internal).main2 |
| call m(iadst_8x16_internal).main_pass1_end |
| psubw m11, m9, m10 |
| punpcklwd m8, m0, m2 |
| punpckhwd m0, m2 |
| punpckhwd m2, m1, m3 |
| punpcklwd m1, m3 |
| punpcklwd m9, m4, m6 |
| punpckhwd m4, m6 |
| punpckhwd m6, m5, m7 |
| punpcklwd m5, m7 |
| REPX {pmulhrsw x, m11}, m8, m1, m4, m6 |
| jmp m(idct_16x8_internal).pass1_end |
| ALIGN function_align |
| .pass2: |
| call .main |
| call .main_pass2_end |
| pxor m8, m8 |
| psubw m8, m9 |
| REPX {pmulhrsw x, m9}, m0, m2, m4, m6 |
| jmp m(idct_16x8_internal).end2 |
| ALIGN function_align |
| .main: |
| vpbroadcastd m10, [o(pd_2048)] |
| ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a |
| ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a |
| ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a |
| ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a |
| psubsw m8, m2, m6 ; t6 |
| paddsw m2, m6 ; t2 |
| psubsw m6, m0, m4 ; t4 |
| paddsw m0, m4 ; t0 |
| psubsw m4, m5, m1 ; t7 |
| paddsw m5, m1 ; t3 |
| psubsw m1, m7, m3 ; t5 |
| paddsw m7, m3 ; t1 |
| ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a |
| ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a |
| psubsw m9, m6, m8 ; t7 |
| paddsw m6, m8 ; out6 |
| psubsw m3, m7, m5 ; t3 |
| paddsw m7, m5 ; -out7 |
| psubsw m5, m0, m2 ; t2 |
| paddsw m0, m2 ; out0 |
| psubsw m2, m1, m4 ; t6 |
| paddsw m1, m4 ; -out1 |
| ret |
| ALIGN function_align |
| .main_pass1_end: |
| vpbroadcastd m11, [o(pw_m2896_2896)] |
| vpbroadcastd m12, [o(pw_2896_2896)] |
| punpckhwd m4, m3, m5 |
| punpcklwd m3, m5 |
| pmaddwd m5, m11, m4 |
| pmaddwd m4, m12 |
| pmaddwd m8, m11, m3 |
| pmaddwd m3, m12 |
| REPX {paddd x, m10}, m5, m4, m8, m3 |
| REPX {psrad x, 12 }, m5, m8, m4, m3 |
| packssdw m3, m4 ; -out3 |
| packssdw m4, m8, m5 ; out4 |
| punpcklwd m5, m9, m2 |
| punpckhwd m9, m2 |
| pmaddwd m2, m12, m5 |
| pmaddwd m5, m11 |
| pmaddwd m12, m9 |
| pmaddwd m11, m9 |
| REPX {paddd x, m10}, m2, m5, m12, m11 |
| REPX {psrad x, 12 }, m2, m12, m5, m11 |
| packssdw m2, m12 ; out2 |
| packssdw m5, m11 ; -out5 |
| ret |
| ALIGN function_align |
| .main_pass2_end: |
| vpbroadcastd m8, [o(pw_2896x8)] |
| psubsw m4, m5, m3 |
| paddsw m3, m5 |
| psubsw m5, m2, m9 |
| paddsw m2, m9 |
| pmulhrsw m2, m8 ; out2 |
| pmulhrsw m3, m8 ; -out3 |
| pmulhrsw m4, m8 ; out4 |
| pmulhrsw m5, m8 ; -out5 |
| vpbroadcastd m9, [o(pw_2048)] |
| ret |
| |
| INV_TXFM_16X8_FN flipadst, dct |
| INV_TXFM_16X8_FN flipadst, adst |
| INV_TXFM_16X8_FN flipadst, flipadst |
| INV_TXFM_16X8_FN flipadst, identity |
| |
| cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 |
| ITX_16X8_LOAD_COEFS 1302 |
| call m(iadst_8x16_internal).main2 |
| call m(iadst_8x16_internal).main_pass1_end |
| psubw m9, m10 |
| punpcklwd m8, m6, m4 |
| punpckhwd m6, m4 |
| punpcklwd m4, m7, m5 |
| punpckhwd m7, m5 |
| punpckhwd m5, m3, m1 |
| punpcklwd m3, m1 |
| punpckhwd m1, m2, m0 |
| punpcklwd m2, m0 |
| REPX {pmulhrsw x, m10}, m8, m4, m5, m1 |
| REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 |
| punpcklwd m0, m7, m4 |
| punpckhwd m7, m4 |
| punpckhwd m4, m6, m8 |
| punpcklwd m6, m8 |
| punpckhwd m8, m3, m5 |
| punpcklwd m3, m5 |
| punpcklwd m5, m2, m1 |
| punpckhwd m2, m1 |
| punpckhdq m1, m0, m6 |
| punpckldq m0, m6 |
| punpckldq m6, m7, m4 |
| punpckhdq m7, m4 |
| punpckhdq m4, m3, m5 |
| punpckldq m3, m5 |
| punpckldq m5, m8, m2 |
| punpckhdq m8, m2 |
| vinserti128 m2, m6, xm5, 1 |
| vperm2i128 m6, m6, m5, 0x31 |
| vperm2i128 m5, m1, m4, 0x31 |
| vinserti128 m1, m1, xm4, 1 |
| vperm2i128 m4, m0, m3, 0x31 |
| vinserti128 m0, m0, xm3, 1 |
| vinserti128 m3, m7, xm8, 1 |
| vperm2i128 m7, m7, m8, 0x31 |
| jmp tx2q |
| .pass2: |
| call m(iadst_16x8_internal).main |
| call m(iadst_16x8_internal).main_pass2_end |
| pxor m8, m8 |
| psubw m8, m9 |
| pmulhrsw m10, m7, m8 |
| pmulhrsw m7, m0, m9 |
| pmulhrsw m0, m6, m9 |
| pmulhrsw m6, m1, m8 |
| pmulhrsw m1, m5, m8 |
| pmulhrsw m5, m2, m9 |
| pmulhrsw m2, m4, m9 |
| pmulhrsw m4, m3, m8 |
| lea r3, [strideq*3] |
| WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 |
| WRITE_16X2 1, 2, 0, 1, strideq*2, r3 |
| jmp m(idct_16x8_internal).end3 |
| |
| INV_TXFM_16X8_FN identity, dct, 15 |
| INV_TXFM_16X8_FN identity, adst |
| INV_TXFM_16X8_FN identity, flipadst |
| INV_TXFM_16X8_FN identity, identity |
| |
| cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 |
| mova xm7, [cq+16*0] |
| mova xm2, [cq+16*1] |
| add cq, 16*8 |
| vpbroadcastd m3, [o(pw_2896x8)] |
| vinserti128 m7, m7, [cq+16*0], 1 |
| vinserti128 m2, m2, [cq+16*1], 1 |
| mova xm6, [cq-16*6] |
| mova xm4, [cq-16*5] |
| vinserti128 m6, m6, [cq+16*2], 1 |
| vinserti128 m4, m4, [cq+16*3], 1 |
| mova xm8, [cq-16*4] |
| mova xm5, [cq-16*3] |
| vinserti128 m8, m8, [cq+16*4], 1 |
| vinserti128 m5, m5, [cq+16*5], 1 |
| mova xm0, [cq-16*2] |
| mova xm1, [cq-16*1] |
| vinserti128 m0, m0, [cq+16*6], 1 |
| vinserti128 m1, m1, [cq+16*7], 1 |
| vpbroadcastd m10, [o(pw_1697x16)] |
| vpbroadcastd m11, [o(pw_16384)] |
| REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 |
| punpcklwd m3, m7, m2 |
| punpckhwd m7, m2 |
| punpcklwd m2, m6, m4 |
| punpckhwd m6, m4 |
| punpcklwd m4, m8, m5 |
| punpckhwd m8, m5 |
| punpcklwd m5, m0, m1 |
| punpckhwd m0, m1 |
| punpckldq m1, m3, m2 |
| punpckhdq m3, m2 |
| punpckldq m2, m4, m5 |
| punpckhdq m4, m5 |
| punpckldq m5, m7, m6 |
| punpckhdq m7, m6 |
| punpckldq m6, m8, m0 |
| punpckhdq m8, m0 |
| REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 |
| punpcklqdq m0, m1, m2 |
| punpckhqdq m1, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| punpcklqdq m4, m5, m6 |
| punpckhqdq m5, m6 |
| punpcklqdq m6, m7, m8 |
| punpckhqdq m7, m8 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m8, [o(pw_4096)] |
| jmp m(idct_16x8_internal).end |
| |
| %define o_base pw_5 + 128 |
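| ; (the +128 bias presumably keeps constants near the base addressable |
| ;  with a signed 8-bit displacement from rax, shrinking code size) |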
| |
| %macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh |
| INV_TXFM_FN %1, %2, %3, 16x16 |
| %ifidn %1_%2, dct_dct |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_8192)] |
| mov [cq], eobd |
| mov r2d, 8 |
| jmp m(inv_txfm_add_dct_dct_16x4).dconly |
| %elifidn %1_%2, dct_identity |
| WIN64_SPILL_XMM 7 |
| vpbroadcastd m3, [o(pw_2896x8)] |
| pmulhrsw m3, [cq] |
| vpbroadcastd m0, [o(pw_8192)] |
| vpbroadcastd m1, [o(pw_5793x4)] |
| vpbroadcastw m4, [o(deint_shuf)] ; pb_0_1 |
| pcmpeqb m5, m5 |
| pxor m6, m6 |
| mova [cq], m6 |
| paddb m5, m5 ; pb_m2 |
| pmulhrsw m3, m0 |
| psrlw m0, 2 ; pw_2048 |
| psllw m3, 2 |
| pmulhrsw m3, m1 |
| pmulhrsw m3, m0 |
| mov r3d, 8 |
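| ; each iteration broadcasts one word of m3 per 128-bit lane via pshufb |
| ; and adds it to a pair of rows 8 lines apart; subtracting pb_m2 advances |
| ; the shuffle index by 2 bytes so consecutive passes select consecutive |
| ; words |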
| .loop: |
| mova xm1, [dstq] |
| vinserti128 m1, m1, [dstq+strideq*8], 1 |
| pshufb m0, m3, m4 |
| psubb m4, m5 ; += 2 |
| punpckhbw m2, m1, m6 |
| punpcklbw m1, m6 |
| paddw m2, m0 |
| paddw m1, m0 |
| packuswb m1, m2 |
| mova [dstq], xm1 |
| vextracti128 [dstq+strideq*8], m1, 1 |
| add dstq, strideq |
| dec r3d |
| jg .loop |
| RET |
| %elifidn %1_%2, identity_dct |
| movd xm0, [cq+32*0 ] |
| movd xm2, [cq+32*1 ] |
| movd xm1, [cq+32*2 ] |
| movd xm3, [cq+32*3 ] |
| vinserti128 m0, m0, [cq+32*8 ], 1 |
| vinserti128 m2, m2, [cq+32*9 ], 1 |
| vinserti128 m1, m1, [cq+32*10], 1 |
| vinserti128 m3, m3, [cq+32*11], 1 |
| punpcklwd m0, m2 |
| punpcklwd m1, m3 |
| punpckldq m0, m1 |
| movd xm1, [cq+32*4 ] |
| movd xm3, [cq+32*5 ] |
| movd xm2, [cq+32*6 ] |
| movd xm4, [cq+32*7 ] |
| vinserti128 m1, m1, [cq+32*12], 1 |
| vinserti128 m3, m3, [cq+32*13], 1 |
| vinserti128 m2, m2, [cq+32*14], 1 |
| vinserti128 m4, m4, [cq+32*15], 1 |
| punpcklwd m1, m3 |
| vpbroadcastd m3, [o(pw_1697x16)] |
| punpcklwd m2, m4 |
| vpbroadcastd m4, [o(pw_8192)] |
| punpckldq m1, m2 |
| vpbroadcastd m2, [o(pw_2896x8)] |
| punpcklqdq m0, m1 |
| pmulhrsw m3, m0 |
| paddw m0, m0 |
| paddw m0, m3 |
| pmulhrsw m0, m4 |
| psrlw m4, 2 ; pw_2048 |
| pmulhrsw m0, m2 |
| pmulhrsw m0, m4 |
| mov r3d, 8 |
| jmp m(inv_txfm_add_identity_dct_16x4).end |
| %endif |
| %endmacro |
| |
| %macro ITX_16X16_LOAD_COEFS 0 |
| mova m0, [cq+32*0] |
| mova m1, [cq+32*1] |
| mova m2, [cq+32*2] |
| mova m3, [cq+32*3] |
| add cq, 32*8 |
| mova m4, [cq-32*4] |
| mova m5, [cq-32*3] |
| mova m6, [cq-32*2] |
| mova m7, [cq-32*1] |
| mova m8, [cq+32*0] |
| mova m9, [cq+32*1] |
| mova m10, [cq+32*2] |
| mova m11, [cq+32*3] |
| mova m12, [cq+32*4] |
| mova m13, [cq+32*5] |
| mova m14, [cq+32*6] |
| mova m15, [cq+32*7] |
| mova [rsp], m15 |
| %endmacro |
| |
| INV_TXFM_16X16_FN dct, dct, 0 |
| INV_TXFM_16X16_FN dct, identity, 15 |
| INV_TXFM_16X16_FN dct, adst |
| INV_TXFM_16X16_FN dct, flipadst |
| |
| cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 |
| ITX_16X16_LOAD_COEFS |
| call .main |
| .pass1_end: |
| vpbroadcastd m1, [o(pw_8192)] |
| REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 |
| vextracti128 [rsp+16*5], m8, 1 |
| mova [rsp+16*1], xm8 |
| .pass1_end2: |
| vextracti128 [rsp+16*4], m0, 1 |
| mova [rsp+16*0], xm0 |
| REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 |
| pmulhrsw m1, [rsp+32*1] |
| vperm2i128 m8, m1, m9, 0x31 |
| vinserti128 m1, m1, xm9, 1 |
| vperm2i128 m9, m2, m10, 0x31 |
| vinserti128 m2, m2, xm10, 1 |
| vperm2i128 m10, m3, m11, 0x31 |
| vinserti128 m3, m3, xm11, 1 |
| vperm2i128 m11, m4, m12, 0x31 |
| vinserti128 m4, m4, xm12, 1 |
| vperm2i128 m12, m5, m13, 0x31 |
| vinserti128 m5, m5, xm13, 1 |
| vperm2i128 m13, m6, m14, 0x31 |
| vinserti128 m6, m6, xm14, 1 |
| vperm2i128 m14, m7, m15, 0x31 |
| vinserti128 m7, m7, xm15, 1 |
| mova m15, [rsp+32*2] |
| .pass1_end3: |
| punpcklwd m0, m9, m10 |
| punpckhwd m9, m10 |
| punpcklwd m10, m15, m8 |
| punpckhwd m15, m8 |
| punpckhwd m8, m11, m12 |
| punpcklwd m11, m12 |
| punpckhwd m12, m13, m14 |
| punpcklwd m13, m14 |
| punpckhdq m14, m11, m13 |
| punpckldq m11, m13 |
| punpckldq m13, m15, m9 |
| punpckhdq m15, m9 |
| punpckldq m9, m10, m0 |
| punpckhdq m10, m0 |
| punpckhdq m0, m8, m12 |
| punpckldq m8, m12 |
| punpcklqdq m12, m13, m8 |
| punpckhqdq m13, m8 |
| punpcklqdq m8, m9, m11 |
| punpckhqdq m9, m11 |
| punpckhqdq m11, m10, m14 |
| punpcklqdq m10, m14 |
| punpcklqdq m14, m15, m0 |
| punpckhqdq m15, m0 |
| mova m0, [rsp] |
| mova [rsp], m15 |
| punpckhwd m15, m4, m5 |
| punpcklwd m4, m5 |
| punpckhwd m5, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m6, m7 |
| punpcklwd m6, m7 |
| punpckhwd m7, m2, m3 |
| punpcklwd m2, m3 |
| punpckhdq m3, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m4, m6 |
| punpckhdq m4, m6 |
| punpckhdq m6, m5, m7 |
| punpckldq m5, m7 |
| punpckldq m7, m15, m1 |
| punpckhdq m15, m1 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| punpcklqdq m4, m5, m7 |
| punpckhqdq m5, m7 |
| punpckhqdq m7, m6, m15 |
| punpcklqdq m6, m15 |
| jmp tx2q |
| .pass2: |
| call .main |
| .end: |
| vpbroadcastd m1, [o(pw_2048)] |
| REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 |
| mova [rsp], m6 |
| .end2: |
| REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 |
| pmulhrsw m1, [rsp+32*1] |
| lea r3, [strideq*3] |
| WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r3 |
| lea dstq, [dstq+strideq*4] |
| WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 |
| .end3: |
| pxor m2, m2 |
| REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 |
| lea dstq, [dstq+strideq*4] |
| WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 10, 11, 0, 1, strideq*2, r3 |
| REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 |
| lea dstq, [dstq+strideq*4] |
| WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 14, 15, 0, 1, strideq*2, r3 |
| RET |
| ALIGN function_align |
| .main: |
| vpbroadcastd m15, [o(pd_2048)] |
| mova [rsp+gprsize+32*1], m1 |
| mova [rsp+gprsize+32*2], m9 |
| IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 |
| mova m1, [rsp+gprsize+32*2] ; in9 |
| mova [rsp+gprsize+32*2], m14 ; tmp7 |
| mova m9, [rsp+gprsize+32*1] ; in1 |
| mova [rsp+gprsize+32*1], m10 ; tmp5 |
| mova m14, [rsp+gprsize+32*0] ; in15 |
| mova [rsp+gprsize+32*0], m6 ; tmp3 |
| IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 |
| mova m6, [rsp+gprsize+32*1] ; tmp5 |
| psubsw m15, m0, m14 ; out15 |
| paddsw m0, m14 ; out0 |
| psubsw m14, m2, m13 ; out14 |
| paddsw m2, m13 ; out1 |
| mova [rsp+gprsize+32*1], m2 |
| psubsw m13, m4, m11 ; out13 |
| paddsw m2, m4, m11 ; out2 |
| psubsw m11, m8, m7 ; out11 |
| paddsw m4, m8, m7 ; out4 |
| mova m7, [rsp+gprsize+32*2] ; tmp7 |
| psubsw m10, m6, m5 ; out10 |
| paddsw m5, m6 ; out5 |
| psubsw m8, m7, m9 ; out8 |
| paddsw m7, m9 ; out7 |
| psubsw m9, m12, m3 ; out9 |
| paddsw m6, m12, m3 ; out6 |
| mova m3, [rsp+gprsize+32*0] ; tmp3 |
| psubsw m12, m3, m1 ; out12 |
| paddsw m3, m1 ; out3 |
| ret |
| |
| INV_TXFM_16X16_FN adst, dct |
| INV_TXFM_16X16_FN adst, adst |
| INV_TXFM_16X16_FN adst, flipadst |
| |
| cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 |
| ITX_16X16_LOAD_COEFS |
| call .main |
| call .main_pass1_end |
| pmulhrsw m0, m1, [cq+32*0] |
| pmulhrsw m2, m1, [cq+32*1] |
| REPX {pmulhrsw x, m1}, m4, m6, m8, m10 |
| pmulhrsw m12, m1, [cq+32*2] |
| pmulhrsw m14, m1, [cq+32*3] |
| vextracti128 [rsp+16*5], m8, 1 |
| mova [rsp+16*1], xm8 |
| pxor m8, m8 |
| psubw m1, m8, m1 |
| jmp m(idct_16x16_internal).pass1_end2 |
| ALIGN function_align |
| .pass2: |
| call .main |
| call .main_pass2_end |
| REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 |
| mova [rsp+32*0], m6 |
| pxor m6, m6 |
| psubw m1, m6, m1 |
| jmp m(idct_16x16_internal).end2 |
| ALIGN function_align |
| .main: |
| vpbroadcastd m15, [o(pd_2048)] |
| mova [rsp+gprsize+32*1], m0 |
| mova [rsp+gprsize+32*2], m4 |
| ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2 |
| ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6 |
| ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10 |
| ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14 |
| psubsw m0, m2, m10 ; t10a |
| paddsw m2, m10 ; t2a |
| psubsw m10, m13, m5 ; t11a |
| paddsw m13, m5 ; t3a |
| psubsw m5, m6, m14 ; t14a |
| paddsw m6, m14 ; t6a |
| psubsw m14, m9, m1 ; t15a |
| paddsw m9, m1 ; t7a |
| ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10 |
| ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15 |
| psubsw m1, m10, m14 ; t14a |
| paddsw m10, m14 ; t10a |
| psubsw m14, m0, m5 ; t15a |
| paddsw m0, m5 ; t11a |
| psubsw m5, m2, m6 ; t6 |
| paddsw m2, m6 ; t2 |
| psubsw m6, m13, m9 ; t7 |
| paddsw m13, m9 ; t3 |
| ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a |
| ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15 |
| mova m9, [rsp+gprsize+32*0] ; in15 |
| mova [rsp+gprsize+32*0], m10 ; t10a |
| mova m4, [rsp+gprsize+32*1] ; in0 |
| mova [rsp+gprsize+32*1], m6 ; t6a |
| mova m6, [rsp+gprsize+32*2] ; in4 |
| mova [rsp+gprsize+32*2], m2 ; t2 |
| ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0 |
| ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4 |
| ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8 |
| ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12 |
| psubsw m10, m4, m8 ; t8a |
| paddsw m8, m4 ; t0a |
| psubsw m4, m9, m7 ; t9a |
| paddsw m9, m7 ; t1a |
| psubsw m7, m6, m12 ; t12a |
| paddsw m6, m12 ; t4a |
| psubsw m12, m11, m3 ; t13a |
| paddsw m11, m3 ; t5a |
| ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8 |
| ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13 |
| psubsw m3, m9, m11 ; t5 |
| paddsw m9, m11 ; t1 |
| psubsw m11, m4, m12 ; t12a |
| paddsw m4, m12 ; t8a |
| paddsw m12, m8, m6 ; t0 |
| psubsw m8, m6 ; t4 |
| paddsw m6, m10, m7 ; t9a |
| psubsw m10, m7 ; t13a |
| ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a |
| ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12 |
| mova m7, [rsp+gprsize+32*0] ; t10a |
| mova m2, [rsp+gprsize+32*1] ; t6a |
| paddsw m15, m9, m13 ; -out15 |
| psubsw m9, m13 ; t3a |
| paddsw m13, m11, m1 ; -out13 |
| psubsw m11, m1 ; t15a |
| psubsw m1, m4, m7 ; t10 |
| paddsw m7, m4 ; -out1 |
| psubsw m4, m3, m2 ; t6 |
| paddsw m3, m2 ; -out3 |
| paddsw m2, m10, m14 ; out2 |
| psubsw m10, m14 ; t14a |
| paddsw m14, m6, m0 ; out14 |
| psubsw m6, m0 ; t11 |
| mova m0, [rsp+gprsize+32*2] ; t2 |
| mova [rsp+gprsize+32*1], m7 |
| psubsw m7, m12, m0 ; t2a |
| paddsw m0, m12 ; out0 |
| paddsw m12, m8, m5 ; out12 |
| psubsw m8, m5 ; t7 |
| ret |
| ALIGN function_align |
| .main_pass1_end: |
| mova [cq+32*0], m0 |
| mova [cq+32*1], m2 |
| mova [cq+32*2], m12 |
| mova [cq+32*3], m14 |
| vpbroadcastd m14, [pw_m2896_2896] |
| vpbroadcastd m12, [pw_2896_2896] |
| vpbroadcastd m2, [pd_2048] |
| punpcklwd m5, m11, m10 |
| punpckhwd m11, m10 |
| pmaddwd m10, m14, m5 |
| pmaddwd m0, m14, m11 |
| pmaddwd m5, m12 |
| pmaddwd m11, m12 |
| REPX {paddd x, m2}, m10, m0, m5, m11 |
| REPX {psrad x, 12}, m10, m0, m5, m11 |
| packssdw m10, m0 ; out10 |
| packssdw m5, m11 ; -out5 |
| punpcklwd m11, m8, m4 |
| punpckhwd m8, m4 |
| pmaddwd m4, m12, m11 |
| pmaddwd m0, m12, m8 |
| pmaddwd m11, m14 |
| pmaddwd m8, m14 |
| REPX {paddd x, m2}, m4, m0, m11, m8 |
| REPX {psrad x, 12}, m4, m0, m11, m8 |
| packssdw m4, m0 ; out4 |
| packssdw m11, m8 ; -out11 |
| punpcklwd m8, m9, m7 |
| punpckhwd m9, m7 |
| pmaddwd m7, m12, m8 |
| pmaddwd m0, m12, m9 |
| pmaddwd m8, m14 |
| pmaddwd m9, m14 |
| REPX {paddd x, m2}, m7, m0, m8, m9 |
| REPX {psrad x, 12}, m7, m0, m8, m9 |
| packssdw m7, m0 ; -out7 |
| packssdw m8, m9 ; out8 |
| punpckhwd m0, m6, m1 |
| punpcklwd m6, m1 |
| pmaddwd m1, m14, m0 |
| pmaddwd m9, m14, m6 |
| pmaddwd m0, m12 |
| pmaddwd m6, m12 |
| REPX {paddd x, m2}, m1, m9, m0, m6 |
| REPX {psrad x, 12}, m1, m9, m0, m6 |
| packssdw m9, m1 ; -out9 |
| packssdw m6, m0 ; out6 |
| vpbroadcastd m1, [o(pw_8192)] |
| ret |
| ALIGN function_align |
| .main_pass2_end: |
| ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to |
| ; 16-bit here will produce the same result as using 32-bit intermediates. |
| paddsw m5, m10, m11 ; -out5 |
| psubsw m10, m11 ; out10 |
| psubsw m11, m4, m8 ; -out11 |
| paddsw m4, m8 ; out4 |
| psubsw m8, m7, m9 ; out8 |
| paddsw m7, m9 ; -out7 |
| psubsw m9, m1, m6 ; -out9 |
| paddsw m6, m1 ; out6 |
| vpbroadcastd m1, [o(pw_2896x8)] |
| REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 |
| vpbroadcastd m1, [o(pw_2048)] |
| ret |
| |
| INV_TXFM_16X16_FN flipadst, dct |
| INV_TXFM_16X16_FN flipadst, adst |
| INV_TXFM_16X16_FN flipadst, flipadst |
| |
| cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 |
| ITX_16X16_LOAD_COEFS |
| call m(iadst_16x16_internal).main |
| call m(iadst_16x16_internal).main_pass1_end |
| pmulhrsw m6, m1 |
| pmulhrsw m2, m1, m8 |
| mova [rsp+32*2], m6 |
| pmulhrsw m6, m1, m4 |
| pmulhrsw m4, m1, m10 |
| pmulhrsw m8, m1, [cq+32*3] |
| pmulhrsw m10, m1, [cq+32*2] |
| pmulhrsw m12, m1, [cq+32*1] |
| pmulhrsw m14, m1, [cq+32*0] |
| pxor m0, m0 |
| psubw m0, m1 |
| REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 |
| pmulhrsw m1, m0, m9 |
| pmulhrsw m9, m0, m13 |
| pmulhrsw m0, [rsp+32*1] |
| mova [rsp+16*0], xm15 |
| mova [rsp+16*1], xm7 |
| vperm2i128 m15, m15, m7, 0x31 |
| vinserti128 m7, m2, xm14, 1 |
| vperm2i128 m14, m2, m14, 0x31 |
| vinserti128 m2, m9, xm5, 1 |
| vperm2i128 m9, m9, m5, 0x31 |
| vinserti128 m5, m4, xm12, 1 |
| vperm2i128 m12, m4, m12, 0x31 |
| vinserti128 m4, m11, xm3, 1 |
| vperm2i128 m11, m11, m3, 0x31 |
| vinserti128 m3, m10, xm6, 1 |
| vperm2i128 m10, m10, m6, 0x31 |
| vinserti128 m6, m1, xm0, 1 |
| vperm2i128 m13, m1, m0, 0x31 |
| vinserti128 m1, m8, [rsp+32*2], 1 |
| vperm2i128 m8, m8, [rsp+32*2], 0x31 |
| jmp m(idct_16x16_internal).pass1_end3 |
| .pass2: |
| call m(iadst_16x16_internal).main |
| call m(iadst_16x16_internal).main_pass2_end |
| pmulhrsw m0, m1 |
| pmulhrsw m8, m1 |
| mova [rsp+32*0], m0 |
| mova [rsp+32*2], m8 |
| pxor m0, m0 |
| psubw m0, m1 |
| pmulhrsw m8, m0, m7 |
| pmulhrsw m7, m0, m9 |
| pmulhrsw m9, m1, m6 |
| pmulhrsw m6, m1, m10 |
| pmulhrsw m10, m0, m5 |
| pmulhrsw m5, m0, m11 |
| pmulhrsw m11, m1, m4 |
| pmulhrsw m4, m1, m12 |
| pmulhrsw m12, m0, m3 |
| pmulhrsw m3, m0, m13 |
| pmulhrsw m13, m1, m2 |
| pmulhrsw m1, m14 |
| pmulhrsw m14, m0, [rsp+32*1] |
| pmulhrsw m0, m15 |
| lea r3, [strideq*3] |
| WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 |
| mova m15, [rsp+32*0] |
| WRITE_16X2 3, 4, 0, 1, strideq*2, r3 |
| lea dstq, [dstq+strideq*4] |
| WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 |
| jmp m(idct_16x16_internal).end3 |
| |
| INV_TXFM_16X16_FN identity, dct, 15 |
| INV_TXFM_16X16_FN identity, identity |
| |
| cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 |
| vpbroadcastd m7, [o(pw_1697x16)] |
| mova xm0, [cq+16* 0] |
| vinserti128 m0, [cq+16*16], 1 |
| mova xm15, [cq+16* 1] |
| vinserti128 m15, [cq+16*17], 1 |
| mova xm1, [cq+16* 2] |
| vinserti128 m1, [cq+16*18], 1 |
| mova xm8, [cq+16* 3] |
| vinserti128 m8, [cq+16*19], 1 |
| mova xm2, [cq+16* 4] |
| vinserti128 m2, [cq+16*20], 1 |
| mova xm9, [cq+16* 5] |
| vinserti128 m9, [cq+16*21], 1 |
| mova xm3, [cq+16* 6] |
| vinserti128 m3, [cq+16*22], 1 |
| mova xm10, [cq+16* 7] |
| add cq, 16*16 |
| vinserti128 m10, [cq+16* 7], 1 |
| mova xm4, [cq-16* 8] |
| vinserti128 m4, [cq+16* 8], 1 |
| mova xm11, [cq-16* 7] |
| vinserti128 m11, [cq+16* 9], 1 |
| mova xm5, [cq-16* 6] |
| vinserti128 m5, [cq+16*10], 1 |
| mova xm12, [cq-16* 5] |
| vinserti128 m12, [cq+16*11], 1 |
| mova xm13, [cq-16* 3] |
| vinserti128 m13, [cq+16*13], 1 |
| mova xm14, [cq-16* 1] |
| vinserti128 m14, [cq+16*15], 1 |
| REPX {IDTX16 x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ |
| 10, 4, 11, 5, 12, 13, 14 |
| mova xm6, [cq-16* 4] |
| vinserti128 m6, [cq+16*12], 1 |
| mova [rsp], m1 |
| IDTX16 6, 1, 7 |
| mova xm1, [cq-16* 2] |
| vinserti128 m1, [cq+16*14], 1 |
| pmulhrsw m7, m1 |
| paddw m1, m1 |
| paddw m7, m1 |
| vpbroadcastd m1, [o(pw_8192)] |
| REPX {pmulhrsw x, m1}, m0, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| pmulhrsw m1, [rsp] |
| mova [rsp], m0 |
| jmp m(idct_16x16_internal).pass1_end3 |
| ALIGN function_align |
| .pass2: |
| vpbroadcastd m15, [o(pw_1697x16)] |
| mova [rsp+32*1], m0 |
| REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ |
| 8, 9, 10, 11, 12, 13, 14 |
| mova m0, [rsp+32*1] |
| mova [rsp+32*1], m1 |
| IDTX16 0, 1, 15 |
| mova m1, [rsp+32*0] |
| pmulhrsw m15, m1 |
| paddw m1, m1 |
| paddw m15, m1 |
| jmp m(idct_16x16_internal).end |
| |
| %define o_base iadst4_dconly2a + 128 |
| |
| %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 |
| %if %3 |
| vpbroadcastd m15, [o(pw_2896x8)] |
| pmulhrsw m0, m15, [%1+%2*0] |
| pmulhrsw m1, m15, [%1+%2*1] |
| pmulhrsw m2, m15, [%1+%2*2] |
| pmulhrsw m3, m15, [%1+%2*3] |
| pmulhrsw m4, m15, [%1+%2*4] |
| pmulhrsw m5, m15, [%1+%2*5] |
| pmulhrsw m6, m15, [%1+%2*6] |
| pmulhrsw m7, m15, [%1+%2*7] |
| %else |
| mova m0, [%1+%2*0] |
| mova m1, [%1+%2*1] |
| mova m2, [%1+%2*2] |
| mova m3, [%1+%2*3] |
| mova m4, [%1+%2*4] |
| mova m5, [%1+%2*5] |
| mova m6, [%1+%2*6] |
| mova m7, [%1+%2*7] |
| %endif |
| %endmacro |
| |
| %macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 |
| %if %3 |
| %if %3 == 1 |
| vpbroadcastd m15, [o(pw_2896x8)] |
| %endif |
| pmulhrsw m8, m15, [%1+%2*0] |
| pmulhrsw m9, m15, [%1+%2*1] |
| pmulhrsw m10, m15, [%1+%2*2] |
| pmulhrsw m11, m15, [%1+%2*3] |
| pmulhrsw m12, m15, [%1+%2*4] |
| pmulhrsw m13, m15, [%1+%2*5] |
| pmulhrsw m14, m15, [%1+%2*6] |
| pmulhrsw m15, [%1+%2*7] |
| %else |
| mova m8, [%1+%2*0] |
| mova m9, [%1+%2*1] |
| mova m10, [%1+%2*2] |
| mova m11, [%1+%2*3] |
| mova m12, [%1+%2*4] |
| mova m13, [%1+%2*5] |
| mova m14, [%1+%2*6] |
| mova m15, [%1+%2*7] |
| %endif |
| %endmacro |
| |
| %macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] |
| vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8] |
| punpcklwd m%1, m%2, m%2 |
| pmulhrsw m%1, m%3 |
| vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8] |
| punpckhwd m%2, m%2 |
| pmulhrsw m%2, m%3 |
| %endmacro |
| |
| cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob |
| %undef cmp |
| cmp eobd, 106 |
| jle .fast |
| LOAD_8ROWS cq+32*1, 32*2 |
| call m(idct_16x8_internal).main |
| vperm2i128 m11, m0, m4, 0x31 |
| vinserti128 m0, m0, xm4, 1 |
| vperm2i128 m4, m1, m5, 0x31 |
| vinserti128 m1, m1, xm5, 1 |
| vperm2i128 m5, m2, m6, 0x31 |
| vinserti128 m2, m2, xm6, 1 |
| vperm2i128 m6, m3, m7, 0x31 |
| vinserti128 m3, m3, xm7, 1 |
| pxor m7, m7 |
| REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 |
| punpckhwd m7, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| punpcklwd m3, m11, m4 |
| punpckhwd m11, m4 |
| punpckhwd m4, m5, m6 |
| punpcklwd m5, m6 |
| punpckhdq m6, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m3, m5 |
| punpckhdq m3, m5 |
| punpckhdq m5, m11, m4 |
| punpckldq m11, m4 |
| punpckldq m4, m7, m1 |
| punpckhdq m7, m1 |
| punpckhqdq m12, m6, m0 |
| punpcklqdq m0, m6 ; out4 |
| punpckhqdq m13, m7, m4 |
| punpcklqdq m4, m7 ; out5 |
| punpckhqdq m14, m3, m2 |
| punpcklqdq m2, m3 ; out6 |
| punpckhqdq m15, m5, m11 |
| punpcklqdq m11, m5 ; out7 |
| mova [rsp+32*0], m0 |
| mova [rsp+32*1], m4 |
| mova [rsp+32*2], m2 |
| .fast: |
| LOAD_8ROWS cq+32*0, 32*2 |
| call m(idct_16x8_internal).main |
| vperm2i128 m8, m0, m4, 0x31 |
| vinserti128 m0, m0, xm4, 1 |
| vperm2i128 m4, m1, m5, 0x31 |
| vinserti128 m1, m1, xm5, 1 |
| vperm2i128 m5, m2, m6, 0x31 |
| vinserti128 m2, m2, xm6, 1 |
| vperm2i128 m6, m3, m7, 0x31 |
| vinserti128 m3, m3, xm7, 1 |
| vpbroadcastd m9, [o(pw_8192)] |
| pxor m7, m7 |
| REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 |
| punpckhwd m7, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| punpckhwd m3, m8, m4 |
| punpcklwd m8, m4 |
| punpckhwd m4, m5, m6 |
| punpcklwd m5, m6 |
| punpckhdq m6, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m8, m5 |
| punpckhdq m8, m5 |
| punpckhdq m5, m3, m4 |
| punpckldq m3, m4 |
| punpckhdq m4, m7, m1 |
| punpckldq m7, m1 |
| punpcklqdq m1, m7, m4 |
| punpckhqdq m7, m4 ; out9 |
| punpckhqdq m4, m2, m8 ; out10 |
| punpcklqdq m2, m8 |
| punpckhqdq m8, m3, m5 |
| punpcklqdq m3, m5 |
| punpckhqdq m5, m0, m6 ; out8 |
| punpcklqdq m0, m6 |
| REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 |
| cmp eobd, 106 |
| jg .full |
| mova [rsp+32*0], m5 |
| mova [rsp+32*1], m7 |
| mova [rsp+32*2], m4 |
| pmulhrsw m11, m9, m8 |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| call .main_fast |
| jmp .pass2 |
| .dconly: |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_8192)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm2 |
| psrlw xm2, 2 ; pw_2048 |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| mov r2d, 8 |
| jmp m(inv_txfm_add_dct_dct_8x8).end2 |
| .full: |
| REPX {pmulhrsw x, m9}, m12, m13, m14, m15 |
| pmulhrsw m6, m9, [rsp+32*2] |
| mova [rsp+32*2], m4 |
| pmulhrsw m4, m9, [rsp+32*0] |
| mova [rsp+32*0], m5 |
| pmulhrsw m5, m9, [rsp+32*1] |
| mova [rsp+32*1], m7 |
| pmulhrsw m7, m9, m11 |
| pmulhrsw m11, m9, m8 |
| call .main |
| .pass2: |
| vpbroadcastd m12, [o(pw_2048)] |
| REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m13, m14, m15 |
| pmulhrsw m12, [rsp] |
| REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 |
| REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 |
| mova [rsp+32*0], m4 |
| mova [rsp+32*1], m6 |
| lea r3, [strideq*3] |
| WRITE_8X4 0, 1, 4, 6 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 2, 3, 4, 6 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 [rsp+32*0], 5, 4, 6 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 [rsp+32*1], 7, 4, 6 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 8, 9, 4, 6 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 10, 11, 4, 6 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 12, 13, 4, 6 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 14, 15, 4, 6 |
| RET |
| ALIGN function_align |
| .main_fast: ; bottom half is zero |
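| ; with the bottom half of the coefficients zero, one element of each |
| ; input pair vanishes, so the first butterfly stage reduces to scaling |
| ; the remaining input by its coefficient pair, done via ITX_UNPACK_MULHRSW |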
| call m(idct_8x16_internal).main |
| mova m8, [rsp+gprsize+0*32] |
| mova [rsp+gprsize+0*32], m0 |
| mova m9, [rsp+gprsize+1*32] |
| mova [rsp+gprsize+1*32], m1 |
| mova m0, [rsp+gprsize+2*32] |
| mova [rsp+gprsize+2*32], m6 |
| lea r5, [rax-(o_base)+pw_201_4091x8] |
| ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a |
| ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a |
| ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a |
| ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a |
| jmp .main2 |
| ALIGN function_align |
| .main: |
| call m(idct_8x16_internal).main |
| mova m8, [rsp+gprsize+0*32] |
| mova [rsp+gprsize+0*32], m0 |
| mova m9, [rsp+gprsize+1*32] |
| mova [rsp+gprsize+1*32], m1 |
| mova m0, [rsp+gprsize+2*32] |
| mova [rsp+gprsize+2*32], m6 |
| punpcklwd m1, m15, m8 ; in31 in1 |
| punpckhwd m8, m15 ; in3 in29 |
| punpcklwd m15, m14, m9 ; in27 in5 |
| punpckhwd m9, m14 ; in7 in25 |
| punpcklwd m14, m13, m0 ; in23 in9 |
| punpckhwd m0, m13 ; in11 in21 |
| punpcklwd m13, m12, m11 ; in19 in13 |
| punpckhwd m11, m12 ; in15 in17 |
| ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a |
| ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a |
| ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a |
| ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a |
| ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a |
| ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a |
| ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a |
| ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a |
| .main2: |
| psubsw m6, m1, m11 ; t17 t30 |
| paddsw m1, m11 ; t16 t31 |
| psubsw m11, m9, m14 ; t18 t29 |
| paddsw m9, m14 ; t19 t28 |
| psubsw m14, m15, m0 ; t21 t26 |
| paddsw m15, m0 ; t20 t27 |
| psubsw m0, m8, m13 ; t22 t25 |
| paddsw m8, m13 ; t23 t24 |
| ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a |
| ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a |
| ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a |
| ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a |
| psubsw m13, m1, m9 ; t19a t28a |
| paddsw m1, m9 ; t16a t31a |
| psubsw m9, m8, m15 ; t20a t27a |
| paddsw m8, m15 ; t23a t24a |
| psubsw m15, m6, m11 ; t18 t29 |
| paddsw m6, m11 ; t17 t30 |
| psubsw m11, m0, m14 ; t21 t26 |
| paddsw m0, m14 ; t22 t25 |
| ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a |
| ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28 |
| ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27 |
| ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a |
| vbroadcasti128 m12, [o(deint_shuf)] |
| psubsw m14, m1, m8 ; t23 t24 |
| paddsw m1, m8 ; t16 t31 |
| psubsw m8, m6, m0 ; t22a t25a |
| paddsw m6, m0 ; t17a t30a |
| psubsw m0, m15, m11 ; t21 t26 |
| paddsw m15, m11 ; t18 t29 |
| psubsw m11, m13, m9 ; t20a t27a |
| paddsw m13, m9 ; t19a t28a |
| REPX {pshufb x, m12}, m1, m6, m15, m13 |
| ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a |
| vpbroadcastd m9, [o(pw_m2896_2896)] |
| ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 |
| vpbroadcastd m12, [o(pw_2896_2896)] |
| ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a |
| vpbroadcastd m12, [o(pw_2896_2896)] |
| ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 |
| shufps m9, m14, m8, q1032 ; t23a t22 |
| vpblendd m14, m8, 0xcc ; t24a t25 |
| shufps m8, m11, m0, q1032 ; t20 t21a |
| vpblendd m11, m0, 0xcc ; t27 t26a |
| punpcklqdq m0, m1, m6 ; t16 t17a |
| punpckhqdq m1, m6 ; t31 t30a |
| psubsw m10, m5, m8 ; out20 out21 |
| paddsw m5, m8 ; out11 out10 |
| psubsw m6, m3, m14 ; out24 out25 |
| paddsw m3, m14 ; out7 out6 |
| psubsw m8, m7, m0 ; out16 out17 |
| paddsw m7, m0 ; out15 out14 |
| mova m0, [rsp+gprsize+0*32] |
| punpcklqdq m12, m13, m15 ; t19a t18 |
| punpckhqdq m13, m15 ; t28a t29 |
| psubsw m15, m0, m1 ; out31 out30 |
| paddsw m0, m1 ; out0 out1 |
| mova m1, [rsp+gprsize+1*32] |
| mova [rsp+gprsize+0*32], m6 |
| mova m6, [rsp+gprsize+2*32] |
| psubsw m14, m1, m13 ; out28 out29 |
| paddsw m1, m13 ; out3 out2 |
| psubsw m13, m2, m11 ; out27 out26 |
| paddsw m2, m11 ; out4 out5 |
| psubsw m11, m4, m9 ; out23 out22 |
| paddsw m4, m9 ; out8 out9 |
| psubsw m9, m6, m12 ; out19 out18 |
| paddsw m6, m12 ; out12 out13 |
| ret |
| |
| %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] |
| vbroadcasti128 m%1, [cq+16*%3] |
| vbroadcasti128 m%2, [cq+16*%4] |
| shufpd m%1, m%1, m%2, 0x0c |
| %endmacro |
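| ; vbroadcasti128 duplicates a 16-byte row into both lanes, and shufpd |
| ; with imm 0x0c then interleaves the two broadcasts qword-wise: row %3 |
| ; lands in the even qwords and row %4 in the odd ones, the packed pair |
| ; layout expected by the 8x32 main transform |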
| |
| cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_8192)] |
| mov [cq], eobd |
| mov r2d, 8 |
| .dconly: |
| pmulhrsw xm0, xm2 |
| movd xm2, [pw_2048] ; intentionally rip-relative |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| pxor m3, m3 |
| .dconly_loop: |
| mova m1, [dstq] |
| punpckhbw m2, m1, m3 |
| punpcklbw m1, m3 |
| paddw m2, m0 |
| paddw m1, m0 |
| packuswb m1, m2 |
| mova [dstq], m1 |
| add dstq, strideq |
| dec r2d |
| jg .dconly_loop |
| RET |
| .normal: |
| PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob |
| %undef cmp |
| LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 |
| LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 |
| LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6 |
| LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 |
| pxor m8, m8 |
| REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 |
| add cq, 16*16 |
| LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 |
| LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 |
| LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 |
| LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 |
| REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 |
| mova [rsp+32*0], m4 |
| mova [rsp+32*1], m5 |
| mova [rsp+32*2], m6 |
| cmp eobd, 106 |
| jg .full |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| call m(inv_txfm_add_dct_dct_8x32).main_fast |
| jmp .pass2 |
| .full: |
| LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 |
| LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 |
| LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 |
| LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 |
| REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 |
| add cq, 16*8 |
| LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 |
| LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 |
| LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 |
| LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 |
| pxor m8, m8 |
| REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 |
| call m(inv_txfm_add_dct_dct_8x32).main |
| .pass2: |
| vpbroadcastd m12, [o(pw_8192)] |
| REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 |
| mova [rsp+32*1], m9 |
| mova [rsp+32*2], m10 |
| punpckhwd m9, m0, m2 |
| punpcklwd m0, m2 |
| punpckhwd m2, m1, m3 |
| punpcklwd m1, m3 |
| punpcklwd m10, m4, m6 |
| punpckhwd m4, m6 |
| punpcklwd m6, m5, m7 |
| punpckhwd m5, m7 |
| punpckhwd m3, m0, m9 |
| punpcklwd m0, m9 |
| punpckhwd m9, m2, m1 |
| punpcklwd m2, m1 |
| punpcklwd m7, m10, m4 |
| punpckhwd m10, m4 |
| punpcklwd m4, m5, m6 |
| punpckhwd m5, m6 |
| punpckhdq m1, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m3, m9 |
| punpckhdq m3, m9 |
| punpckldq m6, m7, m4 |
| punpckhdq m7, m4 |
| punpckldq m9, m10, m5 |
| punpckhdq m10, m5 |
| REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 |
| pmulhrsw m12, [rsp+32*0] |
| mova [rsp+32*0], m8 |
| vperm2i128 m4, m0, m6, 0x31 |
| vinserti128 m0, m0, xm6, 1 |
| vperm2i128 m5, m1, m7, 0x31 |
| vinserti128 m1, m1, xm7, 1 |
| vperm2i128 m6, m2, m9, 0x31 |
| vinserti128 m2, m2, xm9, 1 |
| vperm2i128 m7, m3, m10, 0x31 |
| vinserti128 m3, m3, xm10, 1 |
| call m(idct_16x8_internal).main |
| vpbroadcastd m8, [o(pw_2048)] |
| REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| lea r2, [strideq*3] |
| WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r2 |
| lea r3, [dstq+strideq*4] |
| %define dstq r3 |
| WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 0, 1, strideq*2, r2 |
| mova m0, [rsp+32*0] |
| mova m1, [rsp+32*1] |
| mova m2, [rsp+32*2] |
| punpckhwd m7, m0, m2 |
| punpcklwd m0, m2 |
| punpckhwd m2, m1, m11 |
| punpcklwd m1, m11 |
| punpckhwd m4, m12, m14 |
| punpcklwd m12, m14 |
| punpckhwd m5, m13, m15 |
| punpcklwd m13, m15 |
| punpckhwd m3, m0, m7 |
| punpcklwd m0, m7 |
| punpckhwd m9, m2, m1 |
| punpcklwd m2, m1 |
| punpcklwd m7, m12, m4 |
| punpckhwd m12, m4 |
| punpcklwd m4, m5, m13 |
| punpckhwd m5, m13 |
| punpckhdq m1, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m3, m9 |
| punpckhdq m3, m9 |
| punpckldq m6, m7, m4 |
| punpckhdq m7, m4 |
| punpckldq m9, m12, m5 |
| punpckhdq m12, m5 |
| vperm2i128 m4, m0, m6, 0x31 |
| vinserti128 m0, m0, xm6, 1 |
| vperm2i128 m5, m1, m7, 0x31 |
| vinserti128 m1, m1, xm7, 1 |
| vperm2i128 m6, m2, m9, 0x31 |
| vinserti128 m2, m2, xm9, 1 |
| vperm2i128 m7, m3, m12, 0x31 |
| vinserti128 m3, m3, xm12, 1 |
| call m(idct_16x8_internal).main2 |
| vpbroadcastd m8, [o(pw_2048)] |
| REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| add r0, 16 |
| add r3, 16 |
| %define dstq r0 |
| WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r2 |
| %define dstq r3 |
| WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 0, 1, strideq*2, r2 |
| RET |
| |
| cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob |
| vpbroadcastd m9, [pw_5] |
| lea r4, [strideq*3] |
| sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) |
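| ; adding 0x80000000 at the bottom of the loop sets the carry flag only |
| ; once eobd has gone negative, so jnc falls through after one iteration |
| ; when eob < 107 and after two iterations otherwise |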
| .loop: |
| mova xm0, [cq+16* 0] |
| mova xm1, [cq+16* 4] |
| vinserti128 m0, m0, [cq+16* 1], 1 |
| vinserti128 m1, m1, [cq+16* 5], 1 |
| pxor m8, m8 |
| mova [cq+32*0], m8 |
| mova [cq+32*2], m8 |
| add cq, 16*16 |
| mova xm2, [cq-16* 8] |
| mova xm3, [cq-16* 4] |
| vinserti128 m2, m2, [cq-16* 7], 1 |
| vinserti128 m3, m3, [cq-16* 3], 1 |
| mova xm4, [cq+16* 0] |
| mova xm5, [cq+16* 4] |
| vinserti128 m4, m4, [cq+16* 1], 1 |
| vinserti128 m5, m5, [cq+16* 5], 1 |
| mova xm6, [cq+16* 8] |
| mova xm7, [cq+16*12] |
| vinserti128 m6, m6, [cq+16* 9], 1 |
| vinserti128 m7, m7, [cq+16*13], 1 |
| REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 |
| REPX {paddw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call .transpose8x8 |
| REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4 |
| add dstq, strideq |
| WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4 |
| add dstq, strideq |
| WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4 |
| add dstq, strideq |
| WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4 |
| add dstq, strideq |
| sub cq, 16*16-32 |
| lea dstq, [dstq+r4*4] |
| add eobd, 0x80000000 |
| jnc .loop |
| RET |
| ALIGN function_align |
| .transpose8x8: |
| punpckhwd m8, m4, m5 |
| punpcklwd m4, m5 |
| punpckhwd m5, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m6, m7 |
| punpcklwd m6, m7 |
| punpckhwd m7, m2, m3 |
| punpcklwd m2, m3 |
| punpckhdq m3, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m4, m6 |
| punpckhdq m4, m6 |
| punpckhdq m6, m5, m7 |
| punpckldq m5, m7 |
| punpckldq m7, m8, m1 |
| punpckhdq m8, m1 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| punpcklqdq m4, m5, m7 |
| punpckhqdq m5, m7 |
| punpckhqdq m7, m6, m8 |
| punpcklqdq m6, m8 |
| ret |
| |
| cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob |
| add cq, 16*8 |
| vpbroadcastd m9, [pw_4096] |
| lea r4, [strideq*3] |
| lea r5, [dstq+strideq*4] |
| sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) |
| .loop: |
| mova xm0, [cq-16*8] |
| mova xm1, [cq-16*7] |
| vinserti128 m0, m0, [cq+16*0], 1 |
| vinserti128 m1, m1, [cq+16*1], 1 |
| mova xm2, [cq-16*6] |
| mova xm3, [cq-16*5] |
| vinserti128 m2, m2, [cq+16*2], 1 |
| vinserti128 m3, m3, [cq+16*3], 1 |
| mova xm4, [cq-16*4] |
| mova xm5, [cq-16*3] |
| vinserti128 m4, m4, [cq+16*4], 1 |
| vinserti128 m5, m5, [cq+16*5], 1 |
| mova xm6, [cq-16*2] |
| mova xm7, [cq-16*1] |
| vinserti128 m6, m6, [cq+16*6], 1 |
| vinserti128 m7, m7, [cq+16*7], 1 |
| pxor m8, m8 |
| REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 |
| WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r4 |
| %define dstq r5 |
| WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 0, 1, strideq*2, r4 |
| add cq, 16*16 |
| add r0, 16 |
| add r5, 16 |
| add eobd, 0x80000000 |
| jnc .loop |
| RET |
| |
| %define o_base pw_5 + 128 |
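; Constants are addressed as [o(name)] relative to rax, which each entry
; point loads with o_base; rebasing like this presumably keeps most
; displacements within the signed 8-bit range, shrinking code size.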
| |
| %macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs |
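; is_rect2: scale each coefficient by 2896/4096 (~1/sqrt(2)) on load via
; pmulhrsw with pw_2896x8 -- pmulhrsw(x, c) = (x*c + 0x4000) >> 15, so a
; constant of 2896*8 yields x*2896/4096 -- the extra normalization applied
; to rectangular (2:1 aspect) transform sizes.
; zero_coefs: clear the source rows after loading them.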
| %if %3 |
| vpbroadcastd m15, [o(pw_2896x8)] |
| pmulhrsw m0, m15, [%1+%2* 0] |
| pmulhrsw m1, m15, [%1+%2* 1] |
| pmulhrsw m2, m15, [%1+%2* 2] |
| pmulhrsw m3, m15, [%1+%2* 3] |
| pmulhrsw m4, m15, [%1+%2* 4] |
| pmulhrsw m5, m15, [%1+%2* 5] |
| pmulhrsw m6, m15, [%1+%2* 6] |
| pmulhrsw m7, m15, [%1+%2* 7] |
| pmulhrsw m8, m15, [%1+%2* 8] |
| pmulhrsw m9, m15, [%1+%2* 9] |
| pmulhrsw m10, m15, [%1+%2*10] |
| pmulhrsw m11, m15, [%1+%2*11] |
| pmulhrsw m12, m15, [%1+%2*12] |
| pmulhrsw m13, m15, [%1+%2*13] |
| pmulhrsw m14, m15, [%1+%2*14] |
| pmulhrsw m15, [%1+%2*15] |
| %else |
| mova m0, [%1+%2* 0] |
| mova m1, [%1+%2* 1] |
| mova m2, [%1+%2* 2] |
| mova m3, [%1+%2* 3] |
| mova m4, [%1+%2* 4] |
| mova m5, [%1+%2* 5] |
| mova m6, [%1+%2* 6] |
| mova m7, [%1+%2* 7] |
| mova m8, [%1+%2* 8] |
| mova m9, [%1+%2* 9] |
| mova m10, [%1+%2*10] |
| mova m11, [%1+%2*11] |
| mova m12, [%1+%2*12] |
| mova m13, [%1+%2*13] |
| mova m14, [%1+%2*14] |
| mova m15, [%1+%2*15] |
| %endif |
| mova [rsp], m15 |
| %if %4 |
| pxor m15, m15 |
| REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ |
| 8, 9, 10, 11, 12, 13, 14, 15 |
| %endif |
| %endmacro |
| |
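; Final idct32 butterfly for one pair of output rows in pass 2: sum/sub
; the two halves, round with pmulhrsw, add to the pixels at dstq+offset1
; (top half, walking down) and r2+offset2 (bottom half, walking up), then
; pack and store.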
| %macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] |
| mova m%4, [%2] |
| paddsw m%3, m%1, m%4 |
| psubsw m%1, m%4 |
| pmovzxbw m%4, [dstq+%6] |
| pmulhrsw m%3, m%5 |
| pmulhrsw m%1, m%5 |
| paddw m%3, m%4 |
| pmovzxbw m%4, [r2+%7] |
| paddw m%1, m%4 |
| packuswb m%3, m%1 |
| vpermq m%3, m%3, q3120 |
| mova [dstq+%6], xm%3 |
| vextracti128 [r2+%7], m%3, 1 |
| %endmacro |
| |
| cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ |
| base, tmp3 |
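    ; the full PROLOGUE (extra registers plus the 32*35-byte scratch
    ; buffer) is deferred until the eob == 0 early-out above has been
    ; handled, so the DC-only path pays no stack setup cost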
| %undef cmp |
| LOAD_16ROWS cq, 64, 1 |
| call m(idct_16x16_internal).main |
| lea tmp1q, [rsp+32*7] |
| lea tmp2q, [tmp1q+32*8] |
| lea tmp3q, [tmp1q+32*16] |
| mova m1, [rsp+32*1] |
| mova [rsp+32*0], m6 |
| mova [rsp+32*1], m7 |
| vpbroadcastd m7, [o(pw_16384)] |
| call .transpose_2x8x8_round |
| mova m15, [rsp+32*0] |
| mova [tmp3q-32*4+ 0], xm0 |
| vextracti128 [tmp3q+32*0+ 0], m0, 1 |
| mova [tmp3q-32*3+ 0], xm2 |
| vextracti128 [tmp3q+32*1+ 0], m2, 1 |
| mova [tmp3q-32*2+ 0], xm4 |
| vextracti128 [tmp3q+32*2+ 0], m4, 1 |
| mova [tmp3q-32*1+ 0], xm6 |
| vextracti128 [tmp3q+32*3+ 0], m6, 1 |
| mova [tmp3q-32*4+16], xm8 |
| vextracti128 [tmp3q+32*0+16], m8, 1 |
| mova [tmp3q-32*3+16], xm10 |
| vextracti128 [tmp3q+32*1+16], m10, 1 |
| mova [tmp3q-32*2+16], xm12 |
| vextracti128 [tmp3q+32*2+16], m12, 1 |
| mova [tmp3q-32*1+16], xm14 |
| vextracti128 [tmp3q+32*3+16], m14, 1 |
| cmp eobd, 150 |
| jg .full |
| vinserti128 m0, m1, xm9, 1 |
| vperm2i128 m4, m1, m9, 0x31 |
| vinserti128 m2, m5, xm13, 1 |
| vperm2i128 m6, m5, m13, 0x31 |
| vinserti128 m1, m3, xm11, 1 |
| vperm2i128 m5, m3, m11, 0x31 |
| vinserti128 m3, m7, xm15, 1 |
| vperm2i128 m7, m7, m15, 0x31 |
| call .main_oddhalf_fast |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 |
| jmp .idct16 |
| .dconly: |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_16384)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| mov r2d, 16 |
| jmp m(inv_txfm_add_dct_dct_16x4).dconly |
| .full: |
| mova [tmp1q-32*4], m1 |
| mova [tmp1q-32*3], m3 |
| mova [tmp1q-32*2], m5 |
| mova [tmp1q-32*1], m7 |
| mova [tmp1q+32*0], m9 |
| mova [tmp1q+32*1], m11 |
| mova [tmp1q+32*2], m13 |
| mova [tmp1q+32*3], m15 |
| LOAD_16ROWS cq+32, 64, 1 |
| call m(idct_16x16_internal).main |
| lea r2, [tmp3q+32*8] |
| mova m1, [rsp+32*1] |
| mova [rsp+32*0], m6 |
| mova [rsp+32*1], m7 |
| vpbroadcastd m7, [o(pw_16384)] |
| call .transpose_2x8x8_round |
| mova m15, [rsp+32*0] |
| mova [r2-32*4+ 0], xm0 |
| vextracti128 [r2+32*0+ 0], m0, 1 |
| mova [r2-32*3+ 0], xm2 |
| vextracti128 [r2+32*1+ 0], m2, 1 |
| mova [r2-32*2+ 0], xm4 |
| vextracti128 [r2+32*2+ 0], m4, 1 |
| mova [r2-32*1+ 0], xm6 |
| vextracti128 [r2+32*3+ 0], m6, 1 |
| mova [r2-32*4+16], xm8 |
| vextracti128 [r2+32*0+16], m8, 1 |
| mova [r2-32*3+16], xm10 |
| vextracti128 [r2+32*1+16], m10, 1 |
| mova [r2-32*2+16], xm12 |
| vextracti128 [r2+32*2+16], m12, 1 |
| mova [r2-32*1+16], xm14 |
| vextracti128 [r2+32*3+16], m14, 1 |
| vinserti128 m8, m1, xm9, 1 |
| vperm2i128 m12, m1, m9, 0x31 |
| mova xm0, [tmp1q-32*4] |
| mova xm1, [tmp1q-32*3] |
| vinserti128 m0, m0, [tmp1q+32*0], 1 |
| vinserti128 m1, m1, [tmp1q+32*1], 1 |
| vinserti128 m10, m5, xm13, 1 |
| vperm2i128 m14, m5, m13, 0x31 |
| mova xm4, [tmp1q-32*4+16] |
| mova xm5, [tmp1q-32*3+16] |
| vinserti128 m4, m4, [tmp1q+32*0+16], 1 |
| vinserti128 m5, m5, [tmp1q+32*1+16], 1 |
| vinserti128 m9, m3, xm11, 1 |
| vperm2i128 m13, m3, m11, 0x31 |
| mova xm2, [tmp1q-32*2] |
| mova xm3, [tmp1q-32*1] |
| vinserti128 m2, m2, [tmp1q+32*2], 1 |
| vinserti128 m3, m3, [tmp1q+32*3], 1 |
| vinserti128 m11, m7, xm15, 1 |
| vperm2i128 m15, m7, m15, 0x31 |
| mova xm6, [tmp1q-32*2+16] |
| mova xm7, [tmp1q-32*1+16] |
| vinserti128 m6, m6, [tmp1q+32*2+16], 1 |
| vinserti128 m7, m7, [tmp1q+32*3+16], 1 |
| call .main_oddhalf |
| LOAD_8ROWS_H r2-32*4, 32 |
| .idct16: |
| LOAD_8ROWS tmp3q-32*4, 32 |
| mova [rsp], m15 |
| call m(idct_16x16_internal).main |
| imul r2, strideq, 19 |
| lea r3, [strideq*3] |
| add r2, dstq |
| call .pass2_end |
| RET |
| ALIGN function_align |
| .main_oddhalf_fast: ; lower half is zero |
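    ; with the second half of the odd inputs all zero, each first-stage
    ; ITX_MULSUB_2W butterfly collapses into two plain pmulhrsw multiplies
    ; by the pre-scaled pw_*x8 constants (coefficient * 8), instead of the
    ; full sequences used by .main_oddhalf below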
| mova [rsp+gprsize+32*1], m7 |
| pxor m7, m7 |
| mova [rsp+gprsize+32*0], m7 |
| mova [rsp+gprsize+32*2], m7 |
| vpbroadcastd m11, [o(pw_3703x8)] |
| vpbroadcastd m7, [o(pw_1751x8)] |
| vpbroadcastd m12, [o(pw_m1380x8)] |
| vpbroadcastd m8, [o(pw_3857x8)] |
| vpbroadcastd m13, [o(pw_3973x8)] |
| vpbroadcastd m15, [o(pw_995x8)] |
| pmulhrsw m11, m4 ; t29a |
| pmulhrsw m4, m7 ; t18a |
| pmulhrsw m12, m3 ; t19a |
| pmulhrsw m3, m8 ; t28a |
| pmulhrsw m13, m2 ; t27a |
| pmulhrsw m2, m15 ; t20a |
| vpbroadcastd m10, [o(pw_m2106x8)] |
| vpbroadcastd m7, [o(pw_3513x8)] |
| vpbroadcastd m9, [o(pw_3290x8)] |
| vpbroadcastd m8, [o(pw_2440x8)] |
| vpbroadcastd m14, [o(pw_m601x8)] |
| vpbroadcastd m15, [o(pw_4052x8)] |
| pmulhrsw m10, m5 ; t21a |
| pmulhrsw m5, m7 ; t26a |
| pmulhrsw m9, m6 ; t25a |
| pmulhrsw m6, m8 ; t22a |
| pmulhrsw m14, m1 ; t23a |
| pmulhrsw m1, m15 ; t24a |
| vpbroadcastd m15, [o(pd_2048)] |
| jmp .main2 |
| ALIGN function_align |
| .main_oddhalf: |
| mova [rsp+gprsize+32*0], m15 |
| mova [rsp+gprsize+32*1], m7 |
| mova [rsp+gprsize+32*2], m8 |
| vpbroadcastd m15, [o(pd_2048)] |
| ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a |
| ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a |
| ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a |
| ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a |
| ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a |
| ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a |
| .main2: |
| psubsw m7, m12, m4 ; t18 |
| paddsw m12, m4 ; t19 |
| psubsw m4, m2, m10 ; t21 |
| paddsw m2, m10 ; t20 |
| psubsw m10, m14, m6 ; t22 |
| paddsw m14, m6 ; t23 |
| psubsw m6, m1, m9 ; t25 |
| paddsw m1, m9 ; t24 |
| psubsw m9, m13, m5 ; t26 |
| paddsw m13, m5 ; t27 |
| psubsw m5, m3, m11 ; t29 |
| paddsw m3, m11 ; t28 |
| ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a |
| ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a |
| ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a |
| psubsw m8, m14, m2 ; t20a |
| paddsw m14, m2 ; t23a |
| psubsw m2, m1, m13 ; t27a |
| paddsw m1, m13 ; t24a |
| psubsw m13, m6, m9 ; t21 |
| paddsw m6, m9 ; t22 |
| psubsw m9, m10, m4 ; t26 |
| paddsw m10, m4 ; t25 |
| ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27 |
| ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a |
| mova m4, [rsp+gprsize+32*0] ; in31 |
| mova [rsp+gprsize+32*0], m6 ; t22 |
| mova m6, [rsp+gprsize+32*1] ; in15 |
| mova [rsp+gprsize+32*1], m14 ; t23a |
| mova m14, [rsp+gprsize+32*2] ; in17 |
| mova [rsp+gprsize+32*2], m1 ; t24a |
| ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a |
| ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a |
| psubsw m1, m0, m14 ; t17 |
| paddsw m0, m14 ; t16 |
| psubsw m14, m4, m6 ; t30 |
| paddsw m4, m6 ; t31 |
| ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a |
| psubsw m6, m0, m12 ; t19a |
| paddsw m0, m12 ; t16a |
| psubsw m12, m4, m3 ; t28a |
| paddsw m4, m3 ; t31a |
| psubsw m3, m14, m5 ; t18 |
| paddsw m14, m5 ; t17 |
| psubsw m5, m1, m7 ; t29 |
| paddsw m1, m7 ; t30 |
| ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a |
| ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 |
| psubsw m7, m1, m10 ; t25a |
| paddsw m1, m10 ; t30a |
| psubsw m10, m5, m9 ; t21 |
| paddsw m5, m9 ; t18 |
| psubsw m9, m12, m2 ; t20a |
| paddsw m12, m2 ; t19a |
| psubsw m2, m3, m13 ; t26 |
| paddsw m3, m13 ; t29 |
| psubsw m13, m6, m8 ; t27a |
| paddsw m6, m8 ; t28a |
| mova [tmp1q-32*2], m5 |
| mova [tmp1q-32*1], m12 |
| mova [tmp2q+32*0], m6 |
| mova [tmp2q+32*1], m3 |
| mova [tmp2q+32*2], m1 |
| mova m5, [rsp+gprsize+32*0] ; t22 |
| mova m6, [rsp+gprsize+32*1] ; t23 |
| mova m3, [rsp+gprsize+32*2] ; t24a |
| psubsw m1, m14, m5 ; t22a |
| paddsw m14, m5 ; t17a |
| psubsw m5, m0, m6 ; t23 |
| paddsw m0, m6 ; t16 |
| psubsw m6, m4, m3 ; t24 |
| paddsw m4, m3 ; t31 |
| vpbroadcastd m8, [o(pw_m2896_2896)] |
| vpbroadcastd m3, [o(pw_2896_2896)] |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m14 |
| mova [tmp2q+32*3], m4 |
| ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 |
| ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a |
| ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 |
| ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a |
| mova [tmp1q+32*0], m13 |
| mova [tmp1q+32*1], m2 |
| mova [tmp1q+32*2], m7 |
| mova [tmp1q+32*3], m6 |
| mova [tmp2q-32*4], m5 |
| mova [tmp2q-32*3], m1 |
| mova [tmp2q-32*2], m10 |
| mova [tmp2q-32*1], m9 |
| ret |
| ALIGN function_align |
| .transpose_2x8x8_round: |
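    ; transposes two 8x8 word blocks (m8-m15, then m0-m7) while applying
    ; the pmulhrsw rounding constant the caller broadcast into m7; the
    ; original rows 6-7 are expected spilled at [rsp+gprsize+32*0/1]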
| punpckhwd m6, m12, m13 |
| punpcklwd m12, m13 |
| punpckhwd m13, m8, m9 |
| punpcklwd m8, m9 |
| punpckhwd m9, m14, m15 |
| punpcklwd m14, m15 |
| punpckhwd m15, m10, m11 |
| punpcklwd m10, m11 |
| REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 |
| punpckhdq m11, m8, m10 |
| punpckldq m8, m10 |
| punpckldq m10, m12, m14 |
| punpckhdq m12, m14 |
| punpckhdq m14, m13, m15 |
| punpckldq m13, m15 |
| punpckldq m15, m6, m9 |
| punpckhdq m6, m9 |
| punpckhqdq m9, m8, m10 |
| punpcklqdq m8, m10 |
| punpcklqdq m10, m11, m12 |
| punpckhqdq m11, m12 |
| punpcklqdq m12, m13, m15 |
| punpckhqdq m13, m15 |
| punpckhqdq m15, m14, m6 |
| punpcklqdq m14, m6 |
| pmulhrsw m6, m7, [rsp+gprsize+32*0] |
| REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 |
| pmulhrsw m7, [rsp+gprsize+32*1] |
| mova [rsp+gprsize+32*0], m15 |
| punpckhwd m15, m4, m5 |
| punpcklwd m4, m5 |
| punpckhwd m5, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m6, m7 |
| punpcklwd m6, m7 |
| punpckhwd m7, m2, m3 |
| punpcklwd m2, m3 |
| punpckhdq m3, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m4, m6 |
| punpckhdq m4, m6 |
| punpckhdq m6, m5, m7 |
| punpckldq m5, m7 |
| punpckldq m7, m15, m1 |
| punpckhdq m15, m1 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| punpcklqdq m4, m5, m7 |
| punpckhqdq m5, m7 |
| punpckhqdq m7, m6, m15 |
| punpcklqdq m6, m15 |
| ret |
| ALIGN function_align |
| .pass2_end: |
| mova [rsp+gprsize+32*0], m7 |
| mova [rsp+gprsize+32*2], m15 |
| vpbroadcastd m15, [o(pw_2048)] |
| IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 |
| IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 |
| IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 |
| IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 |
| add dstq, strideq |
| sub r2, strideq |
| mova m1, [rsp+gprsize+32*1] |
| IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 |
| IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 |
| IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 |
| IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 |
| add dstq, strideq |
| sub r2, strideq |
| IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 |
| IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 |
| IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 |
| IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 |
| add dstq, strideq |
| sub r2, strideq |
| mova m7, [rsp+gprsize+32*0] |
| mova m1, [rsp+gprsize+32*2] |
| IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 |
| IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 |
| IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 |
| IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 |
| ret |
| |
; Perform the final pass-1 sumsub (butterfly) step plus the YMM lane
; shuffling needed before pass 2
| %macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2] |
| mova m%3, [tmp2q+32*( 3-%1)] |
| psubsw m%4, m%1, m%3 |
| paddsw m%1, m%3 |
| mova m%3, [tmp1q+32*(11-%2)] |
| mova [tmp1q+32*(11-%2)+16], xm%4 |
| vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 |
| paddsw m%4, m%2, m%3 |
| psubsw m%2, m%3 |
| mova [tmp1q+32*(11-%2)], xm%2 |
| vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 |
| vperm2i128 m%2, m%1, m%4, 0x31 |
| vinserti128 m%1, m%1, xm%4, 1 |
| %endmacro |
| |
| cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_16384)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| mov r2d, 16 |
| jmp m(inv_txfm_add_dct_dct_32x8).dconly |
| .normal: |
| PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 |
| vpbroadcastd m15, [o(pw_2896x8)] |
| pmulhrsw m0, m15, [cq+32* 1] |
| pmulhrsw m1, m15, [cq+32* 3] |
| pmulhrsw m2, m15, [cq+32* 5] |
| pmulhrsw m3, m15, [cq+32* 7] |
| pmulhrsw m4, m15, [cq+32* 9] |
| pmulhrsw m5, m15, [cq+32*11] |
| pmulhrsw m6, m15, [cq+32*13] |
| pmulhrsw m7, m15, [cq+32*15] |
| pmulhrsw m8, m15, [cq+32*17] |
| pmulhrsw m9, m15, [cq+32*19] |
| pmulhrsw m10, m15, [cq+32*21] |
| pmulhrsw m11, m15, [cq+32*23] |
| pmulhrsw m12, m15, [cq+32*25] |
| pmulhrsw m13, m15, [cq+32*27] |
| pmulhrsw m14, m15, [cq+32*29] |
| pmulhrsw m15, [cq+32*31] |
| lea tmp1q, [rsp+32*7] |
| lea tmp2q, [tmp1q+32*8] |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf |
| LOAD_16ROWS cq+32*0, 32*2, 1, 0 |
| pxor m15, m15 |
| mov r3d, 8 |
| .zero_loop: |
| mova [cq+32*0], m15 |
| mova [cq+32*1], m15 |
| mova [cq+32*2], m15 |
| mova [cq+32*3], m15 |
| add cq, 32*4 |
| dec r3d |
| jg .zero_loop |
| call m(idct_16x16_internal).main |
| call .pass1_end |
| lea r2, [strideq*3] |
| mov r3, dstq |
| .pass2: |
| vpbroadcastd m7, [o(pw_16384)] |
| call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round |
| call m(idct_16x16_internal).main |
| mova [rsp+32*2], m15 |
| vpbroadcastd m15, [o(pw_2048)] |
| REPX {pmulhrsw x, m15}, m2, m3, m0 |
| WRITE_16X2 2, 3, 1, 2, strideq*2, r2 |
| pmulhrsw m1, m15, [rsp+32*1] |
| WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 |
| lea dstq, [dstq+strideq*4] |
| REPX {pmulhrsw x, m15}, m4, m5, m6, m7 |
| WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 2, 3, strideq*2, r2 |
| lea dstq, [dstq+strideq*4] |
| REPX {pmulhrsw x, m15}, m8, m9, m10, m11 |
| WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 |
| WRITE_16X2 10, 11, 2, 3, strideq*2, r2 |
| lea dstq, [dstq+strideq*4] |
| REPX {pmulhrsw x, m15}, m11, m12, m13, m14 |
| pmulhrsw m15, [rsp+32*2] |
| WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 |
| WRITE_16X2 14, 15, 2, 3, strideq*2, r2 |
| test r3, r3 |
| jnz .right_half |
| RET |
| .right_half: |
| LOAD_8ROWS tmp1q-32*4, 32 |
| LOAD_8ROWS_H tmp2q-32*4, 32 |
| lea dstq, [r3+16] |
| xor r3d, r3d |
| mova [rsp+32*0], m6 |
| mova [rsp+32*1], m7 |
| jmp .pass2 |
| ALIGN function_align |
| .pass1_end: |
| mova [rsp+gprsize+32*0], m9 |
| IDCT32_PASS1_END 0, 8, 1, 9 |
| IDCT32_PASS1_END 2, 10, 1, 9 |
| IDCT32_PASS1_END 3, 11, 1, 9 |
| IDCT32_PASS1_END 4, 12, 1, 9 |
| IDCT32_PASS1_END 5, 13, 1, 9 |
| IDCT32_PASS1_END 6, 14, 1, 9 |
| IDCT32_PASS1_END 7, 15, 1, 9 |
| mova m1, [rsp+gprsize+32*1] |
| mova m9, [rsp+gprsize+32*0] |
| mova [rsp+gprsize+32*0], m6 |
| mova [rsp+gprsize+32*1], m7 |
| IDCT32_PASS1_END 1, 9, 6, 7 |
| ret |
| |
| cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob |
| %undef cmp |
| lea rax, [o_base] |
| vpbroadcastd m9, [o(pw_2896x8)] |
| vpbroadcastd m10, [o(pw_5793x4)] |
| vpbroadcastd m11, [o(pw_5)] |
| cmp eobd, 43 ; if (eob > 43) |
| setg r4b ; iteration_count++ |
| cmp eobd, 150 ; if (eob > 150) |
| setg al ; iteration_count++ |
| add eobd, -279 ; if (eob > 278) |
| adc r4b, al ; iteration_count++ |
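    ; r4b now holds (eob > 43) + (eob > 150) + (eob > 278); the carry from
    ; "add eobd, -279" supplies the third term. The loop below runs
    ; r4b + 1 times in total (dec r4b / jge .loop).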
| lea r3, [strideq*3] |
| mov rax, cq |
| .loop: |
| mova xm0, [cq+64* 0] |
| mova xm1, [cq+64* 1] |
| vinserti128 m0, m0, [cq+64* 8], 1 |
| vinserti128 m1, m1, [cq+64* 9], 1 |
| mova xm2, [cq+64* 2] |
| mova xm3, [cq+64* 3] |
| vinserti128 m2, m2, [cq+64*10], 1 |
| vinserti128 m3, m3, [cq+64*11], 1 |
| mova xm4, [cq+64* 4] |
| mova xm5, [cq+64* 5] |
| vinserti128 m4, m4, [cq+64*12], 1 |
| vinserti128 m5, m5, [cq+64*13], 1 |
| mova xm6, [cq+64* 6] |
| mova xm7, [cq+64* 7] |
| vinserti128 m6, m6, [cq+64*14], 1 |
| vinserti128 m7, m7, [cq+64*15], 1 |
| REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {paddw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r3 |
| lea dstq, [dstq+strideq*4] |
| WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 0, 1, strideq*2, r3 |
| lea dstq, [dstq+strideq*4] |
| add cq, 16 |
| dec r4b |
| jge .loop |
| sub cq, 32 |
| pxor m0, m0 |
| mov r0d, 8 |
| cmp cq, rax |
| jg .zero_loop |
| .zero_loop_half: |
| mova [rax+64*0], m0 |
| mova [rax+64*1], m0 |
| mova [rax+64*2], m0 |
| mova [rax+64*3], m0 |
| add rax, 64*4 |
| sub r0d, 2 |
| jg .zero_loop_half |
| RET |
| .zero_loop: |
| mova [rax+32*0], m0 |
| mova [rax+32*1], m0 |
| mova [rax+32*2], m0 |
| mova [rax+32*3], m0 |
| add rax, 32*4 |
| dec r0d |
| jg .zero_loop |
| RET |
| |
| cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob |
| %undef cmp |
| lea rax, [o_base] |
| vpbroadcastd m9, [o(pw_2896x8)] |
| vpbroadcastd m10, [o(pw_1697x8)] |
| vpbroadcastd m11, [o(pw_2048)] |
| cmp eobd, 35 ; if (eob > 35) |
| setg r4b ; iteration_count++ |
| cmp eobd, 150 ; if (eob > 150) |
| setg r3b ; iteration_count += 2 |
| lea r4d, [r4+r3*2] |
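    ; r4d = (eob > 35) + 2*(eob > 150), so .loop below executes 1, 2 or 4
    ; times in total (dec r4b / jl .ret exits after r4d + 1 passes)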
| lea r3, [strideq*3] |
| mov r5, dstq |
| mov rax, cq |
| .loop: |
| mova xm0, [cq+32* 0] |
| mova xm1, [cq+32* 1] |
| vinserti128 m0, m0, [cq+32* 8], 1 |
| vinserti128 m1, m1, [cq+32* 9], 1 |
| mova xm2, [cq+32* 2] |
| mova xm3, [cq+32* 3] |
| vinserti128 m2, m2, [cq+32*10], 1 |
| vinserti128 m3, m3, [cq+32*11], 1 |
| mova xm4, [cq+32* 4] |
| mova xm5, [cq+32* 5] |
| vinserti128 m4, m4, [cq+32*12], 1 |
| vinserti128 m5, m5, [cq+32*13], 1 |
| mova xm6, [cq+32* 6] |
| mova xm7, [cq+32* 7] |
| vinserti128 m6, m6, [cq+32*14], 1 |
| vinserti128 m7, m7, [cq+32*15], 1 |
| REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| pmulhrsw m8, m10, m0 |
| paddw m0, m8 |
| pmulhrsw m8, m10, m1 |
| paddw m1, m8 |
| pmulhrsw m8, m10, m2 |
| paddw m2, m8 |
| pmulhrsw m8, m10, m3 |
| paddw m3, m8 |
| pmulhrsw m8, m10, m4 |
| paddw m4, m8 |
| pmulhrsw m8, m10, m5 |
| paddw m5, m8 |
| pmulhrsw m8, m10, m6 |
| paddw m6, m8 |
| pmulhrsw m8, m10, m7 |
| paddw m7, m8 |
| REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r3 |
| lea dstq, [dstq+strideq*4] |
| WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 0, 1, strideq*2, r3 |
| lea dstq, [dstq+strideq*4] |
| add cq, 16 |
| dec r4b |
| jl .ret |
| test r4b, 1 |
| jz .loop |
| add cq, 32*15 |
| lea dstq, [r5+16] |
| jmp .loop |
| .ret: |
| sub cq, 32 |
| pxor m0, m0 |
| mov r0d, 4 |
| mov r1d, 8 |
| cmp cq, rax |
| cmovg r0d, r1d |
| .zero_loop: |
| mova [rax+32*0], m0 |
| mova [rax+32*1], m0 |
| mova [rax+32*2], m0 |
| mova [rax+32*3], m0 |
| add rax, 32*4 |
| dec r0d |
| jg .zero_loop |
| RET |
| |
| cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_8192)] |
| mov [cq], eobd |
| mov r2d, 32 |
| jmp m(inv_txfm_add_dct_dct_32x8).dconly |
| .normal: |
| PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ |
| base, tmp3, tmp4 |
| %undef cmp |
| lea tmp1q, [rsp+32*7] |
| lea tmp2q, [tmp1q+32*8] |
| sub eobd, 136 |
| mov tmp4d, eobd |
| .pass1_loop: |
| LOAD_8ROWS cq+64*1, 64*2 |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 |
| test tmp4d, tmp4d |
| jl .fast |
| LOAD_8ROWS_H cq+64*17, 64*2 |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf |
| LOAD_8ROWS_H cq+64*16, 64*2 |
| pxor m0, m0 |
| REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ |
| 24, 25, 26, 27, 28, 29, 30, 31 |
| mova [rsp], m15 |
| jmp .idct16 |
| .fast: |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| .idct16: |
| LOAD_8ROWS cq+64*0, 64*2 |
| pxor m15, m15 |
| REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 |
| call m(idct_16x16_internal).main |
| call m(inv_txfm_add_dct_dct_32x16).pass1_end |
| vpbroadcastd m7, [o(pw_8192)] |
| call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round |
| lea tmp3q, [tmp1q+32*32] |
| mova m15, [rsp] |
| mova [tmp3q-32*4], m0 |
| mova [tmp3q-32*3], m2 |
| mova [tmp3q-32*2], m4 |
| mova [tmp3q-32*1], m6 |
| mova [tmp3q+32*0], m8 |
| mova [tmp3q+32*1], m10 |
| mova [tmp3q+32*2], m12 |
| mova [tmp3q+32*3], m14 |
| add tmp3q, 32*8 |
| mova [tmp3q-32*4], m1 |
| mova [tmp3q-32*3], m3 |
| mova [tmp3q-32*2], m5 |
| mova [tmp3q-32*1], m7 |
| mova [tmp3q+32*0], m9 |
| mova [tmp3q+32*1], m11 |
| mova [tmp3q+32*2], m13 |
| mova [tmp3q+32*3], m15 |
| vpbroadcastd m9, [o(pw_8192)] |
| pmulhrsw m0, m9, [tmp1q-32*4] |
| pmulhrsw m1, m9, [tmp1q-32*3] |
| pmulhrsw m2, m9, [tmp1q-32*2] |
| pmulhrsw m3, m9, [tmp1q-32*1] |
| pmulhrsw m4, m9, [tmp1q+32*0] |
| pmulhrsw m5, m9, [tmp1q+32*1] |
| pmulhrsw m6, m9, [tmp1q+32*2] |
| pmulhrsw m7, m9, [tmp1q+32*3] |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| mova [tmp1q-32*4], m0 |
| pmulhrsw m0, m9, [tmp2q-32*4] |
| mova [tmp2q-32*4], m1 |
| pmulhrsw m1, m9, [tmp2q-32*3] |
| mova [tmp1q-32*3], m2 |
| pmulhrsw m2, m9, [tmp2q-32*2] |
| mova [tmp2q-32*3], m3 |
| pmulhrsw m3, m9, [tmp2q-32*1] |
| mova [tmp1q-32*2], m4 |
| pmulhrsw m4, m9, [tmp2q+32*0] |
| mova [tmp2q-32*2], m5 |
| pmulhrsw m5, m9, [tmp2q+32*1] |
| mova [tmp1q-32*1], m6 |
| pmulhrsw m6, m9, [tmp2q+32*2] |
| mova [tmp2q-32*1], m7 |
| pmulhrsw m7, m9, [tmp2q+32*3] |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| mova [tmp1q+32*0], m0 |
| mova [tmp2q+32*0], m1 |
| mova [tmp1q+32*1], m2 |
| mova [tmp2q+32*1], m3 |
| mova [tmp1q+32*2], m4 |
| mova [tmp2q+32*2], m5 |
| mova [tmp1q+32*3], m6 |
| mova [tmp2q+32*3], m7 |
| add cq, 32 |
| add tmp1q, 32*16 |
| add tmp2q, 32*16 |
| add eobd, 0x80000000 |
| jnc .pass1_loop |
| add tmp1q, 32*24 |
| imul r2, strideq, 19 |
| lea r3, [strideq*3] |
| add r2, dstq |
| test tmp4d, tmp4d |
| jge .pass2_loop |
| add tmp1q, 32*16 |
| add tmp2q, 32*16 |
| add tmp3q, 32*16 |
| .pass2_loop: |
| LOAD_8ROWS tmp2q-32*4, 32 |
| test tmp4d, tmp4d |
| jl .fast2 |
| LOAD_8ROWS_H tmp3q-32*4, 32 |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf |
| sub tmp3q, 32*8 |
| LOAD_8ROWS_H tmp3q-32*4, 32 |
| sub tmp3q, 32*16 |
| jmp .pass2_loop_end |
| .fast2: |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| sub tmp3q, 32*24 |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 |
| .pass2_loop_end: |
| LOAD_8ROWS tmp3q-32*4, 32 |
| mova [rsp], m15 |
| call m(idct_16x16_internal).main |
| call m(inv_txfm_add_dct_dct_16x32).pass2_end |
| lea tmp3q, [tmp1q-32*32] |
| cmp tmp2q, tmp3q |
| jl .ret |
| sub tmp2q, 32*32 |
| sub dstq, r3 |
| lea r2, [r2+r3+16] |
| add dstq, 16 |
| jmp .pass2_loop |
| .ret: |
| RET |
| |
| cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m9, [pw_8192] |
| sub eobd, 136 ; if (eob < 136) |
| shr eobd, 30 ; topleft 16x16 only |
| lea eobd, [eobq*2-8] |
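    ; eobd ends up at -8 (all four 16x16 quadrants, 8 loop passes) or -2
    ; (top-left 16x16 only, 2 passes); "inc eobd / jz" terminates the
    ; loop, and the "test eobd, 3" branch switches to the second
    ; coefficient half (and the right destination half) every 4th pass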
| lea r4, [strideq*3] |
| mov r5, dstq |
| lea rax, [cq+32] |
| .loop: |
| mova xm0, [cq+64* 0] |
| mova xm1, [cq+64* 1] |
| vinserti128 m0, m0, [cq+64* 8], 1 |
| vinserti128 m1, m1, [cq+64* 9], 1 |
| mova xm2, [cq+64* 2] |
| mova xm3, [cq+64* 3] |
| vinserti128 m2, m2, [cq+64*10], 1 |
| vinserti128 m3, m3, [cq+64*11], 1 |
| mova xm4, [cq+64* 4] |
| mova xm5, [cq+64* 5] |
| vinserti128 m4, m4, [cq+64*12], 1 |
| vinserti128 m5, m5, [cq+64*13], 1 |
| mova xm6, [cq+64* 6] |
| mova xm7, [cq+64* 7] |
| vinserti128 m6, m6, [cq+64*14], 1 |
| vinserti128 m7, m7, [cq+64*15], 1 |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 |
| WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 |
| WRITE_16X2 2, 3, 0, 1, strideq*2, r4 |
| lea dstq, [dstq+strideq*4] |
| WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 0, 1, strideq*2, r4 |
| lea dstq, [dstq+strideq*4] |
| add cq, 16 |
| inc eobd |
| jz .ret |
| test eobd, 3 |
| jnz .loop |
| add cq, 64*15 |
| lea dstq, [r5+16] |
| jmp .loop |
| .ret: |
| pxor m0, m0 |
| mov r0d, 16 |
| cmp cq, rax |
| jne .zero_loop |
| .zero_loop_topleft: |
| mova [rax-32*1], m0 |
| mova [rax+32*1], m0 |
| mova [rax+32*3], m0 |
| mova [rax+32*5], m0 |
| add rax, 64*4 |
| sub r0d, 4 |
| jg .zero_loop_topleft |
| RET |
| .zero_loop: |
| mova [rax-32*1], m0 |
| mova [rax+32*0], m0 |
| mova [rax+32*1], m0 |
| mova [rax+32*2], m0 |
| add rax, 32*4 |
| dec r0d |
| jg .zero_loop |
| RET |
| |
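; idct64 final butterflies: combines an idct16 output, an idct32 output
; and the idct64 odd quarter into out0+n / out31-n / out32+n / out63-n.
; Pass 1 stores the results back to the scratch buffer; pass 2 rounds
; them and accumulates into the destination rows addressed via dstq/r2.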
| %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) |
| %if %1 & 1 |
| mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n |
| mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n |
| %else |
| mova m%5, [tmp1q-32*(45-%1)] |
| mova m%4, [tmp2q-32*(20+%1)] |
| %endif |
| psubsw m%6, m%5, m%4 ; idct32 out31-n |
| paddsw m%5, m%4 ; idct32 out 0+n |
| psubsw m%4, m%6, m%3 ; out32+n |
| paddsw m%6, m%3 ; out31-n |
| psubsw m%3, m%5, m%2 ; out63-n |
| paddsw m%5, m%2 ; out 0+n |
| %if %0 == 6 ; pass 1 |
| %if %1 & 1 |
| mova [tmp2q-32*(19-%1)], m%4 |
| mova [tmp1q-32*(14+%1)], m%6 |
| mova [tmp1q+32*(18-%1)], m%3 |
| mova [tmp2q-32*(51-%1)], m%5 |
| %else |
| mova [tmp1q-32*(13-%1)], m%4 |
| mova [tmp2q-32*(20+%1)], m%6 |
| mova [tmp2q+32*(12-%1)], m%3 |
| mova [tmp1q-32*(45-%1)], m%5 |
| %endif |
| %else ; pass 2 |
| REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 |
| %if %1 & 1 |
| %define %%d0 r2 |
| %define %%d1 dstq |
| %else |
| %define %%d0 dstq |
| %define %%d1 r2 |
| %endif |
| pmovzxbw m%2, [%%d0+%9 ] |
| paddw m%2, m%4 |
| pmovzxbw m%4, [%%d1+%8 ] |
| paddw m%4, m%6 |
| pmovzxbw m%6, [%%d1+%10] |
| paddw m%3, m%6 |
| pmovzxbw m%6, [%%d0+%7 ] |
| paddw m%5, m%6 |
| packuswb m%2, m%4 |
| packuswb m%3, m%5 |
| vpermq m%2, m%2, q3120 |
| vpermq m%3, m%3, q3120 |
| mova [%%d0+%9 ], xm%2 |
| vextracti128 [%%d1+%8 ], m%2, 1 |
| mova [%%d1+%10], xm%3 |
| vextracti128 [%%d0+%7 ], m%3, 1 |
| %endif |
| %endmacro |
| |
| cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_8192)] |
| mov [cq], eobd |
| mov r2d, 32 |
| jmp m(inv_txfm_add_dct_dct_16x4).dconly |
| .normal: |
| PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 |
| %undef cmp |
| lea tmp1q, [rsp+32*23] |
| lea tmp2q, [tmp1q+32*24] |
| sub eobd, 151 |
| mov r7d, eobd |
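    ; the sign of eobd (set when eob < 151) serves two purposes: it limits
    ; .pass1_loop to a single iteration via the 0x80000000/jnc counter,
    ; and its copy in r7d selects the zero-upper-half fast paths in pass 2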
| .pass1_loop: |
| LOAD_16ROWS cq, 64 |
| call m(idct_16x16_internal).main |
| mova m1, [rsp+32*1] |
| mova [rsp+32*0], m6 |
| mova [rsp+32*1], m7 |
| vpbroadcastd m7, [o(pw_8192)] |
| call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round |
| mova m15, [rsp+32*0] |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m2 |
| mova [tmp1q-32*2], m4 |
| mova [tmp1q-32*1], m6 |
| mova [tmp1q+32*0], m8 |
| mova [tmp1q+32*1], m10 |
| mova [tmp1q+32*2], m12 |
| mova [tmp1q+32*3], m14 |
| mova [tmp2q-32*4], m1 |
| mova [tmp2q-32*3], m3 |
| mova [tmp2q-32*2], m5 |
| mova [tmp2q-32*1], m7 |
| mova [tmp2q+32*0], m9 |
| mova [tmp2q+32*1], m11 |
| mova [tmp2q+32*2], m13 |
| mova [tmp2q+32*3], m15 |
| add cq, 32 |
| add tmp1q, 32*8 |
| add tmp2q, 32*8 |
| add eobd, 0x80000000 |
| jnc .pass1_loop |
| lea r2, [rsp+32*23] |
| mova xm0, [r2-32*4+ 0] |
| mova xm1, [r2-32*2+ 0] |
| vinserti128 m0, m0, [r2+32*0+ 0], 1 |
| vinserti128 m1, m1, [r2+32*2+ 0], 1 |
| mova xm2, [r2-32*4+16] |
| mova xm3, [r2-32*2+16] |
| vinserti128 m2, m2, [r2+32*0+16], 1 |
| vinserti128 m3, m3, [r2+32*2+16], 1 |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 |
| test r7d, r7d |
| jl .fast |
| lea r3, [r2+32*8] |
| mova xm4, [r3-32*4+ 0] |
| mova xm5, [r3-32*2+ 0] |
| vinserti128 m4, m4, [r3+32*0+ 0], 1 |
| vinserti128 m5, m5, [r3+32*2+ 0], 1 |
| mova xm6, [r3-32*4+16] |
| mova xm7, [r3-32*2+16] |
| vinserti128 m6, m6, [r3+32*0+16], 1 |
| vinserti128 m7, m7, [r3+32*2+16], 1 |
| .fast: |
| mova [rsp], m8 |
| lea tmp1q, [rsp+32*7] |
| call m(idct_16x16_internal).main |
| mova m1, [rsp+32*1] |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m1 |
| mova [tmp1q-32*2], m2 |
| mova [tmp1q-32*1], m3 |
| mova [tmp1q+32*0], m4 |
| mova [tmp1q+32*1], m5 |
| mova [tmp1q+32*2], m6 |
| mova [tmp1q+32*3], m7 |
| add tmp1q, 32*8 |
| mova [tmp1q-32*4], m8 |
| mova [tmp1q-32*3], m9 |
| mova [tmp1q-32*2], m10 |
| mova [tmp1q-32*1], m11 |
| mova [tmp1q+32*0], m12 |
| mova [tmp1q+32*1], m13 |
| mova [tmp1q+32*2], m14 |
| mova [tmp1q+32*3], m15 |
| mova xm0, [r2-32*3+ 0] |
| mova xm1, [r2-32*1+ 0] |
| vinserti128 m0, m0, [r2+32*1+ 0], 1 |
| vinserti128 m1, m1, [r2+32*3+ 0], 1 |
| mova xm2, [r2-32*3+16] |
| mova xm3, [r2-32*1+16] |
| vinserti128 m2, m2, [r2+32*1+16], 1 |
| vinserti128 m3, m3, [r2+32*3+16], 1 |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| test r7d, r7d |
| jl .fast2 |
| mova xm4, [r3-32*3+ 0] |
| mova xm5, [r3-32*1+ 0] |
| vinserti128 m4, m4, [r3+32*1+ 0], 1 |
| vinserti128 m5, m5, [r3+32*3+ 0], 1 |
| mova xm6, [r3-32*3+16] |
| mova xm7, [r3-32*1+16] |
| vinserti128 m6, m6, [r3+32*1+16], 1 |
| vinserti128 m7, m7, [r3+32*3+16], 1 |
| .fast2: |
| add tmp1q, 32*8 |
| lea tmp2q, [tmp1q+32*8] |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| add r2, 32*24 |
| vpbroadcastd m15, [o(pd_2048)] |
| add tmp1q, 32*16 |
| add tmp2q, 32*32 |
| mova xm0, [r2-32*4+ 0] |
| mova xm3, [r2-32*1+16] |
| vinserti128 m0, m0, [r2+32*0+ 0], 1 |
| vinserti128 m3, m3, [r2+32*3+16], 1 |
| mova xm4, [r2-32*4+16] |
| mova xm7, [r2-32*1+ 0] |
| vinserti128 m4, m4, [r2+32*0+16], 1 |
| vinserti128 m7, m7, [r2+32*3+ 0], 1 |
| pxor m1, m1 |
| REPX {mova x, m1}, m2, m5, m6 |
| test r7d, r7d |
| jl .fast3 |
| add r3, 32*24 |
| mova xm1, [r3-32*1+16] |
| mova xm2, [r3-32*4+ 0] |
| vinserti128 m1, m1, [r3+32*3+16], 1 |
| vinserti128 m2, m2, [r3+32*0+ 0], 1 |
| mova xm5, [r3-32*1+ 0] |
| mova xm6, [r3-32*4+16] |
| vinserti128 m5, m5, [r3+32*3+ 0], 1 |
| vinserti128 m6, m6, [r3+32*0+16], 1 |
| .fast3: |
| add rax, o_idct64_offset |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| add rax, 8 |
| add tmp1q, 32*8 |
| sub tmp2q, 32*8 |
| mova xm0, [r2-32*2+ 0] |
| mova xm3, [r2-32*3+16] |
| vinserti128 m0, m0, [r2+32*2+ 0], 1 |
| vinserti128 m3, m3, [r2+32*1+16], 1 |
| mova xm4, [r2-32*2+16] |
| mova xm7, [r2-32*3+ 0] |
| vinserti128 m4, m4, [r2+32*2+16], 1 |
| vinserti128 m7, m7, [r2+32*1+ 0], 1 |
| pxor m1, m1 |
| REPX {mova x, m1}, m2, m5, m6 |
| test r7d, r7d |
| jl .fast4 |
| mova xm1, [r3-32*3+16] |
| mova xm2, [r3-32*2+ 0] |
| vinserti128 m1, m1, [r3+32*1+16], 1 |
| vinserti128 m2, m2, [r3+32*2+ 0], 1 |
| mova xm5, [r3-32*3+ 0] |
| mova xm6, [r3-32*2+16] |
| vinserti128 m5, m5, [r3+32*1+ 0], 1 |
| vinserti128 m6, m6, [r3+32*2+16], 1 |
| .fast4: |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 |
| RET |
| ALIGN function_align |
| %define o_base idct64_mul - 8 |
| .main_part1: |
| ; idct64 steps 1-5: |
| ; in1/31/17/15/ 9/23/25/ 7 -> |
| ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a |
| ; in5/27/21/11/13/19/29/ 3 -> |
| ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a |
| vpbroadcastd m11, [o(idct64_mul+4* 0)] |
| vpbroadcastd m13, [o(idct64_mul+4* 1)] |
| vpbroadcastd m10, [o(idct64_mul+4* 4)] |
| vpbroadcastd m12, [o(idct64_mul+4* 5)] |
| pmulhrsw m11, m0 ; t63a |
| pmulhrsw m0, m13 ; t32a |
| pmulhrsw m10, m1 ; t62a |
| pmulhrsw m1, m12 ; t33a |
| vpbroadcastd m9, [o(idct64_mul+4* 8)] |
| vpbroadcastd m13, [o(idct64_mul+4* 9)] |
| vpbroadcastd m8, [o(idct64_mul+4*12)] |
| vpbroadcastd m12, [o(idct64_mul+4*13)] |
| pmulhrsw m9, m2 ; t61a |
| pmulhrsw m2, m13 ; t34a |
| pmulhrsw m8, m3 ; t60a |
| pmulhrsw m3, m12 ; t35a |
| psubsw m12, m0, m1 ; t33 |
| paddsw m0, m1 ; t32 |
| psubsw m1, m3, m2 ; t34 |
| paddsw m3, m2 ; t35 |
| psubsw m2, m8, m9 ; t61 |
| paddsw m8, m9 ; t60 |
| psubsw m9, m11, m10 ; t62 |
| paddsw m11, m10 ; t63 |
| ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a |
| vpbroadcastd m14, [o(pw_401_4076)] |
| ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a |
| psubsw m10, m0, m3 ; t35a |
| paddsw m0, m3 ; t32a |
| psubsw m3, m11, m8 ; t60a |
| paddsw m11, m8 ; t63a |
| psubsw m8, m9, m2 ; t34 |
| paddsw m9, m2 ; t33 |
| psubsw m2, m12, m1 ; t61 |
| paddsw m12, m1 ; t62 |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m9 |
| mova [tmp2q+32*2], m12 |
| mova [tmp2q+32*3], m11 |
| vpbroadcastd m13, [o(pw_m4017_799)] |
| vpbroadcastd m14, [o(pw_799_4017)] |
| ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a |
| ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60 |
| mova [tmp1q-32*2], m2 |
| mova [tmp1q-32*1], m3 |
| mova [tmp2q+32*0], m10 |
| mova [tmp2q+32*1], m8 |
| vpbroadcastd m3, [o(idct64_mul+4*16)] |
| vpbroadcastd m11, [o(idct64_mul+4*17)] |
| vpbroadcastd m2, [o(idct64_mul+4*20)] |
| vpbroadcastd m10, [o(idct64_mul+4*21)] |
| vpbroadcastd m1, [o(idct64_mul+4*24)] |
| vpbroadcastd m9, [o(idct64_mul+4*25)] |
| vpbroadcastd m0, [o(idct64_mul+4*28)] |
| vpbroadcastd m8, [o(idct64_mul+4*29)] |
| pmulhrsw m3, m4 ; t59a |
| pmulhrsw m4, m11 ; t36a |
| pmulhrsw m2, m5 ; t58a |
| pmulhrsw m5, m10 ; t37a |
| pmulhrsw m1, m6 ; t57a |
| pmulhrsw m6, m9 ; t38a |
| pmulhrsw m0, m7 ; t56a |
| pmulhrsw m7, m8 ; t39a |
| psubsw m8, m4, m5 ; t37 |
| paddsw m4, m5 ; t36 |
| psubsw m5, m7, m6 ; t38 |
| paddsw m7, m6 ; t39 |
| psubsw m6, m0, m1 ; t57 |
| paddsw m0, m1 ; t56 |
| psubsw m1, m3, m2 ; t58 |
| paddsw m3, m2 ; t59 |
| ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a |
| vpbroadcastd m10, [o(pw_3166_2598)] |
| ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a |
| psubsw m2, m7, m4 ; t36a |
| paddsw m7, m4 ; t39a |
| psubsw m4, m0, m3 ; t59a |
| paddsw m0, m3 ; t56a |
| psubsw m3, m6, m1 ; t37 |
| paddsw m6, m1 ; t38 |
| psubsw m1, m5, m8 ; t58 |
| paddsw m5, m8 ; t57 |
| mova [tmp1q+32*2], m6 |
| mova [tmp1q+32*3], m7 |
| mova [tmp2q-32*4], m0 |
| mova [tmp2q-32*3], m5 |
| vpbroadcastd m6, [o(pw_m799_m4017)] |
| vpbroadcastd m7, [o(pw_m4017_799)] |
| ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59 |
| ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a |
| mova [tmp1q+32*0], m4 |
| mova [tmp1q+32*1], m1 |
| mova [tmp2q-32*2], m3 |
| mova [tmp2q-32*1], m2 |
| ret |
| %define o_base pw_5 + 128 |
| .main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub |
| sub rax, o_idct64_offset + 8 |
| vpbroadcastd m11, [o(pw_1567_3784)] |
| vpbroadcastd m12, [o(pw_m3784_1567)] |
| vpbroadcastd m13, [o(pw_2896_2896)] |
| vpbroadcastd m14, [o(pw_m2896_2896)] |
| .main_part2_pass1_loop: |
| call .main_part2_internal |
| IDCT64_PART2_END 0, 7, 0, 6, 9, 10 |
| IDCT64_PART2_END 7, 8, 5, 0, 6, 7 |
| IDCT64_PART2_END 8, 2, 1, 0, 6, 7 |
| IDCT64_PART2_END 15, 3, 4, 0, 6, 7 |
| cmp tmp1q, tmp2q |
| jne .main_part2_pass1_loop |
| ret |
| .main_part2_internal: |
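    ; one 8-row slice of idct64 steps 6-7: tmp1q and tmp2q advance toward
    ; each other (+32 / -32) per call, which both walks the t32..t63
    ; scratch rows and gives the pass loops their termination test
    ; (cmp tmp1q, tmp2q)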
| mova m0, [tmp1q-32*12] ; t32a |
| mova m6, [tmp2q-32*13] ; t39a |
| mova m1, [tmp1q-32* 4] ; t40a |
| mova m5, [tmp2q+32* 3] ; t55a |
| add tmp1q, 32 |
| sub tmp2q, 32 |
| mova m2, [tmp1q+32* 3] ; t48a |
| mova m4, [tmp2q-32* 4] ; t47a |
| mova m3, [tmp1q+32*11] ; t56a |
| mova m7, [tmp2q+32*12] ; t63a |
| psubsw m8, m0, m6 ; t39 |
| paddsw m0, m6 ; t32 |
| psubsw m6, m4, m1 ; t40 |
| paddsw m4, m1 ; t47 |
| psubsw m1, m2, m5 ; t55 |
| paddsw m2, m5 ; t48 |
| psubsw m5, m7, m3 ; t56 |
| paddsw m7, m3 ; t63 |
| ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a |
| vpbroadcastd m9, [o(pw_m1567_m3784)] |
| ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a |
| psubsw m3, m0, m4 ; t47a |
| paddsw m0, m4 ; t32a |
| psubsw m4, m7, m2 ; t48a |
| paddsw m7, m2 ; t63a |
| psubsw m2, m5, m1 ; t40 |
| paddsw m5, m1 ; t39 |
| psubsw m1, m8, m6 ; t55 |
| paddsw m8, m6 ; t56 |
| ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48 |
| ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a |
| ret |
| .main_part2_pass2: |
| sub rax, o_idct64_offset + 8 |
| vpbroadcastd m11, [o(pw_1567_3784)] |
| vpbroadcastd m12, [o(pw_m3784_1567)] |
| vpbroadcastd m13, [o(pw_2896_2896)] |
| lea r9, [strideq*5] ; stride*5 |
| lea r3, [r9+strideq*1] ; stride*6 |
| lea r7, [r9+strideq*2] ; stride*7 |
| lea r8, [r3+strideq*2] ; stride*8 |
| lea r2, [dstq+r7] |
| .main_part2_pass2_loop: |
| vpbroadcastd m14, [o(pw_m2896_2896)] |
| call .main_part2_internal |
| vpbroadcastd m14, [o(pw_2048)] |
| IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 |
| IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 |
| IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 |
| IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 |
| add dstq, strideq |
| sub r2, strideq |
| cmp tmp1q, tmp2q |
| jne .main_part2_pass2_loop |
| ret |
| |
| cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_8192)] |
| mov [cq], eobd |
| mov r2d, 16 |
| .dconly: |
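    ; DC-only path: with eob == 0 only c[0] is nonzero, so the whole block
    ; reduces to a single rounded constant. The callers pre-scale the DC
    ; value and pick the row count in r2d; here it is rounded, broadcast,
    ; and added to all 64 pixels of each row.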
| pmulhrsw xm0, xm2 |
| movd xm2, [o(pw_2048)] |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| pxor m1, m1 |
| .dconly_loop: |
| mova m2, [dstq+32*0] |
| mova m3, [dstq+32*1] |
| punpckhbw m4, m2, m1 |
| punpcklbw m2, m1 |
| punpckhbw m5, m3, m1 |
| punpcklbw m3, m1 |
| paddw m4, m0 |
| paddw m2, m0 |
| paddw m5, m0 |
| paddw m3, m0 |
| packuswb m2, m4 |
| packuswb m3, m5 |
| mova [dstq+32*0], m2 |
| mova [dstq+32*1], m3 |
| add dstq, strideq |
| dec r2d |
| jg .dconly_loop |
| RET |
| .normal: |
| PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 |
| LOAD_8ROWS cq+32*0, 32*4 |
| pxor m8, m8 |
| REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| lea tmp1q, [rsp+32*7] |
| call m(idct_16x16_internal).main |
| mova m1, [rsp+32*1] |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m1 |
| mova [tmp1q-32*2], m2 |
| mova [tmp1q-32*1], m3 |
| mova [tmp1q+32*0], m4 |
| mova [tmp1q+32*1], m5 |
| mova [tmp1q+32*2], m6 |
| mova [tmp1q+32*3], m7 |
| add tmp1q, 32*8 |
| mova [tmp1q-32*4], m8 |
| mova [tmp1q-32*3], m9 |
| mova [tmp1q-32*2], m10 |
| mova [tmp1q-32*1], m11 |
| mova [tmp1q+32*0], m12 |
| mova [tmp1q+32*1], m13 |
| mova [tmp1q+32*2], m14 |
| mova [tmp1q+32*3], m15 |
| LOAD_8ROWS cq+32*2, 32*4 |
| pxor m8, m8 |
| REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 |
| add tmp1q, 32*8 |
| lea tmp2q, [tmp1q+32*8] |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| vpbroadcastd m15, [o(pd_2048)] |
| add tmp1q, 32*16 |
| add tmp2q, 32*32 |
| mova m0, [cq+32* 1] |
| mova m1, [cq+32*31] |
| mova m2, [cq+32*17] |
| mova m3, [cq+32*15] |
| mova m4, [cq+32* 9] |
| mova m5, [cq+32*23] |
| mova m6, [cq+32*25] |
| mova m7, [cq+32* 7] |
| pxor m8, m8 |
| REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 |
| add rax, o_idct64_offset |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| add rax, 8 |
| add tmp1q, 32*8 |
| sub tmp2q, 32*8 |
| mova m0, [cq+32* 5] |
| mova m1, [cq+32*27] |
| mova m2, [cq+32*21] |
| mova m3, [cq+32*11] |
| mova m4, [cq+32*13] |
| mova m5, [cq+32*19] |
| mova m6, [cq+32*29] |
| mova m7, [cq+32* 3] |
| pxor m8, m8 |
| REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 |
| sub tmp1q, 32*36 |
| lea r2, [strideq*3] |
| mov tmp2d, 4 |
| .pass2_loop: |
| lea r3, [tmp1q-32*8] |
| mova xm0, [r3 -32*4] |
| mova xm1, [r3 -32*3] |
| vinserti128 m0, m0, [tmp1q-32*4], 1 |
| vinserti128 m1, m1, [tmp1q-32*3], 1 |
| mova xm2, [r3 -32*2] |
| mova xm3, [r3 -32*1] |
| vinserti128 m2, m2, [tmp1q-32*2], 1 |
| vinserti128 m3, m3, [tmp1q-32*1], 1 |
| mova xm4, [r3 +32*0] |
| mova xm5, [r3 +32*1] |
| vinserti128 m4, m4, [tmp1q+32*0], 1 |
| vinserti128 m5, m5, [tmp1q+32*1], 1 |
| mova xm6, [r3 +32*2] |
| mova xm7, [r3 +32*3] |
| vinserti128 m6, m6, [tmp1q+32*2], 1 |
| vinserti128 m7, m7, [tmp1q+32*3], 1 |
| mova xm8, [r3 -32*4+16] |
| mova xm9, [r3 -32*3+16] |
| vinserti128 m8, m8, [tmp1q-32*4+16], 1 |
| vinserti128 m9, m9, [tmp1q-32*3+16], 1 |
| mova xm10, [r3 -32*2+16] |
| mova xm11, [r3 -32*1+16] |
| vinserti128 m10, m10, [tmp1q-32*2+16], 1 |
| vinserti128 m11, m11, [tmp1q-32*1+16], 1 |
| mova xm12, [r3 +32*0+16] |
| mova xm13, [r3 +32*1+16] |
| vinserti128 m12, m12, [tmp1q+32*0+16], 1 |
| vinserti128 m13, m13, [tmp1q+32*1+16], 1 |
| mova xm14, [r3 +32*2+16] |
| mova xm15, [r3 +32*3+16] |
| vinserti128 m14, m14, [tmp1q+32*2+16], 1 |
| vinserti128 m15, m15, [tmp1q+32*3+16], 1 |
| mova [rsp+32*0], m6 |
| mova [rsp+32*1], m7 |
| vpbroadcastd m7, [o(pw_8192)] |
| call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round |
| call m(idct_16x16_internal).main |
| mova [rsp+32*0], m15 |
| vpbroadcastd m15, [o(pw_2048)] |
| REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 |
| WRITE_16X2 2, 3, 1, 2, strideq*2, r2 |
| pmulhrsw m1, m15, [rsp+32*1] |
| WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 |
| lea r3, [dstq+strideq*4] |
| %define dstq r3 |
| WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 |
| WRITE_16X2 6, 7, 2, 3, strideq*2, r2 |
| REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 |
| lea r3, [r3+strideq*4] |
| WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 |
| WRITE_16X2 10, 11, 2, 3, strideq*2, r2 |
| pmulhrsw m15, [rsp+32*0] |
| lea r3, [r3+strideq*4] |
| WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 |
| WRITE_16X2 14, 15, 2, 3, strideq*2, r2 |
| add tmp1q, 32*16 |
| add r0, 16 |
| dec tmp2d |
| jg .pass2_loop |
| RET |
| |
| cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_16384)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| mov r2d, 64 |
| jmp m(inv_txfm_add_dct_dct_32x8).dconly |
| .normal: |
| PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 |
| lea tmp1q, [rsp+32*7] |
| lea r10d, [eobq-136] |
| sar r10d, 31 |
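    ; r10d = (eob < 136) ? -1 : 0; pass 1 adds 0x80000000 per column pass
    ; (one pass instead of two when negative), and pass 2 checks r10b to
    ; skip the all-zero bottom-half loads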
| .pass1_loop: |
| lea tmp2q, [tmp1q+32*16] |
| LOAD_8ROWS cq+64*1, 64*2, 1 |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 |
| test r10b, r10b |
| jnz .fast |
| LOAD_8ROWS_H cq+64*17, 64*2, 2 |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf |
| LOAD_8ROWS_H cq+64*16, 64*2, 1 |
| mova [rsp], m15 |
| pxor m15, m15 |
| REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ |
| 24, 25, 26, 27, 28, 29, 30, 31 |
| jmp .idct16 |
| .fast: |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| .idct16: |
| LOAD_8ROWS cq+64*0, 64*2, 1 |
| pxor m15, m15 |
| REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 |
| call m(idct_16x16_internal).main |
| call m(inv_txfm_add_dct_dct_32x16).pass1_end |
| vpbroadcastd m7, [o(pw_16384)] |
| call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round |
| lea r3, [tmp1q+32*48] |
| mova m15, [rsp] |
| mova [r3-32*4], m0 |
| mova [r3-32*3], m2 |
| mova [r3-32*2], m4 |
| mova [r3-32*1], m6 |
| mova [r3+32*0], m8 |
| mova [r3+32*1], m10 |
| mova [r3+32*2], m12 |
| mova [r3+32*3], m14 |
| add r3, 32*24 |
| mova [r3-32*4], m1 |
| mova [r3-32*3], m3 |
| mova [r3-32*2], m5 |
| mova [r3-32*1], m7 |
| mova [r3+32*0], m9 |
| mova [r3+32*1], m11 |
| mova [r3+32*2], m13 |
| mova [r3+32*3], m15 |
| vpbroadcastd m9, [o(pw_16384)] |
| pmulhrsw m0, m9, [tmp1q-32*4] |
| pmulhrsw m1, m9, [tmp1q-32*3] |
| pmulhrsw m2, m9, [tmp1q-32*2] |
| pmulhrsw m3, m9, [tmp1q-32*1] |
| pmulhrsw m4, m9, [tmp1q+32*0] |
| pmulhrsw m5, m9, [tmp1q+32*1] |
| pmulhrsw m6, m9, [tmp1q+32*2] |
| pmulhrsw m7, m9, [tmp1q+32*3] |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| mova [tmp1q-32*4], m0 |
| pmulhrsw m0, m9, [tmp2q-32*4] |
| mova [tmp2q-32*4], m1 |
| pmulhrsw m1, m9, [tmp2q-32*3] |
| mova [tmp1q-32*3], m2 |
| pmulhrsw m2, m9, [tmp2q-32*2] |
| mova [tmp2q-32*3], m3 |
| pmulhrsw m3, m9, [tmp2q-32*1] |
| mova [tmp1q-32*2], m4 |
| pmulhrsw m4, m9, [tmp2q+32*0] |
| mova [tmp2q-32*2], m5 |
| pmulhrsw m5, m9, [tmp2q+32*1] |
| mova [tmp1q-32*1], m6 |
| pmulhrsw m6, m9, [tmp2q+32*2] |
| mova [tmp2q-32*1], m7 |
| pmulhrsw m7, m9, [tmp2q+32*3] |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| mova [tmp1q+32*0], m0 |
| mova [tmp2q+32*0], m1 |
| mova [tmp1q+32*1], m2 |
| mova [tmp2q+32*1], m3 |
| mova [tmp1q+32*2], m4 |
| mova [tmp2q+32*2], m5 |
| mova [tmp1q+32*3], m6 |
| mova [tmp2q+32*3], m7 |
| add cq, 32 |
| add tmp1q, 32*8 |
| add r10d, 0x80000000 |
| jnc .pass1_loop |
| lea r2, [rsp+32*55] |
| lea r7, [r2+32*24] |
| .pass2_loop: |
| lea r3, [r2+32*8] |
| lea r8, [r7+32*8] |
| mova m0, [r2-32*4] |
| mova m1, [r2-32*2] |
| mova m2, [r2+32*0] |
| mova m3, [r2+32*2] |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 |
| test r10b, r10b |
| jnz .fast2 |
| mova m4, [r3-32*4] |
| mova m5, [r3-32*2] |
| mova m6, [r3+32*0] |
| mova m7, [r3+32*2] |
| .fast2: |
| mova [rsp], m8 |
| lea tmp1q, [rsp+32*39] |
| call m(idct_16x16_internal).main |
| mova m1, [rsp+32*1] |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m1 |
| mova [tmp1q-32*2], m2 |
| mova [tmp1q-32*1], m3 |
| mova [tmp1q+32*0], m4 |
| mova [tmp1q+32*1], m5 |
| mova [tmp1q+32*2], m6 |
| mova [tmp1q+32*3], m7 |
| add tmp1q, 32*8 |
| mova [tmp1q-32*4], m8 |
| mova [tmp1q-32*3], m9 |
| mova [tmp1q-32*2], m10 |
| mova [tmp1q-32*1], m11 |
| mova [tmp1q+32*0], m12 |
| mova [tmp1q+32*1], m13 |
| mova [tmp1q+32*2], m14 |
| mova [tmp1q+32*3], m15 |
| mova m0, [r2-32*3] |
| mova m1, [r2-32*1] |
| mova m2, [r2+32*1] |
| mova m3, [r2+32*3] |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| test r10b, r10b |
| jnz .fast3 |
| mova m4, [r3-32*3] |
| mova m5, [r3-32*1] |
| mova m6, [r3+32*1] |
| mova m7, [r3+32*3] |
| .fast3: |
| add tmp1q, 32*8 |
| lea tmp2q, [tmp1q+32*8] |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| vpbroadcastd m15, [o(pd_2048)] |
| add tmp1q, 32*16 |
| add tmp2q, 32*32 |
| mova m0, [r7-32*4] |
| mova m3, [r7+32*3] |
| mova m4, [r7+32*0] |
| mova m7, [r7-32*1] |
| pxor m1, m1 |
| REPX {mova x, m1}, m2, m5, m6 |
| test r10b, r10b |
| jnz .fast4 |
| mova m1, [r8+32*3] |
| mova m2, [r8-32*4] |
| mova m5, [r8-32*1] |
| mova m6, [r8+32*0] |
| .fast4: |
| add rax, o_idct64_offset |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| add rax, 8 |
| add tmp1q, 32*8 |
| sub tmp2q, 32*8 |
| mova m0, [r7-32*2] |
| mova m3, [r7+32*1] |
| mova m4, [r7+32*2] |
| mova m7, [r7-32*3] |
| pxor m1, m1 |
| REPX {mova x, m1}, m2, m5, m6 |
| test r10b, r10b |
| jnz .fast5 |
| mova m1, [r8+32*1] |
| mova m2, [r8-32*2] |
| mova m5, [r8-32*3] |
| mova m6, [r8+32*2] |
| .fast5: |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 |
| add r10d, 0x80000000 |
| jc .ret |
| lea r2, [rsp+32*7] |
| lea r7, [r2+32*16] |
| sub dstq, r8 |
| lea dstq, [dstq+strideq*4+16] |
| jmp .pass2_loop |
| .ret: |
| RET |
| |
| cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_16384)] |
| mov [cq], eobd |
| pmulhrsw xm0, xm1 |
| mov r2d, 32 |
| jmp m(inv_txfm_add_dct_dct_64x16).dconly |
| .normal: |
| PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ |
| base, tmp3, tmp4 |
| lea tmp1q, [rsp+32*7] |
| lea tmp4d, [eobq-136] |
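    ; only bit 31 is consumed by the 0x80000000/jnc pass-1 counter and
    ; only the low byte by "mov tmp4b, 4", so bit 30 of eob-136 survives
    ; into pass 2, where "test tmp4d, 0x40000000" recovers the eob < 136
    ; fast-path flag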
| .pass1_loop: |
| LOAD_8ROWS cq+64*0, 64*4, 1 |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| call m(idct_16x16_internal).main |
| mova m1, [rsp+32*1] |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m1 |
| mova [tmp1q-32*2], m2 |
| mova [tmp1q-32*1], m3 |
| mova [tmp1q+32*0], m4 |
| mova [tmp1q+32*1], m5 |
| mova [tmp1q+32*2], m6 |
| mova [tmp1q+32*3], m7 |
| add tmp1q, 32*8 |
| mova [tmp1q-32*4], m8 |
| mova [tmp1q-32*3], m9 |
| mova [tmp1q-32*2], m10 |
| mova [tmp1q-32*1], m11 |
| mova [tmp1q+32*0], m12 |
| mova [tmp1q+32*1], m13 |
| mova [tmp1q+32*2], m14 |
| mova [tmp1q+32*3], m15 |
| LOAD_8ROWS cq+64*2, 64*4, 1 |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 |
| add tmp1q, 32*8 |
| lea tmp2q, [tmp1q+32*8] |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| vpbroadcastd m15, [o(pd_2048)] |
| add tmp1q, 32*16 |
| add tmp2q, 32*32 |
| vpbroadcastd m7, [o(pw_2896x8)] |
| pmulhrsw m0, m7, [cq+64* 1] |
| pmulhrsw m1, m7, [cq+64*31] |
| pmulhrsw m2, m7, [cq+64*17] |
| pmulhrsw m3, m7, [cq+64*15] |
| pmulhrsw m4, m7, [cq+64* 9] |
| pmulhrsw m5, m7, [cq+64*23] |
| pmulhrsw m6, m7, [cq+64*25] |
| pmulhrsw m7, [cq+64* 7] |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 |
| add rax, o_idct64_offset |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] |
| add rax, 8 |
| add tmp1q, 32*8 |
| sub tmp2q, 32*8 |
| pmulhrsw m0, m7, [cq+64* 5] |
| pmulhrsw m1, m7, [cq+64*27] |
| pmulhrsw m2, m7, [cq+64*21] |
| pmulhrsw m3, m7, [cq+64*11] |
| pmulhrsw m4, m7, [cq+64*13] |
| pmulhrsw m5, m7, [cq+64*19] |
| pmulhrsw m6, m7, [cq+64*29] |
| pmulhrsw m7, [cq+64* 3] |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 |
| sub tmp1q, 32*44 |
| vpbroadcastd m10, [o(pw_16384)] |
| call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave |
| add cq, 32 |
| add tmp4d, 0x80000000 |
| jnc .pass1_loop |
| lea tmp1q, [rsp+32*15] |
| imul r2, strideq, 19 |
| lea r3, [strideq*3] |
| add r2, dstq |
| mov tmp4b, 4 |
| .pass2_loop: |
| lea tmp2q, [tmp1q+32*64] |
| LOAD_8ROWS tmp1q-32*4, 32 |
| test tmp4d, 0x40000000 |
| jnz .fast |
| LOAD_8ROWS_H tmp2q-32*4, 32 |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf |
| lea tmp3q, [tmp2q-32*8] |
| LOAD_8ROWS_H tmp3q-32*4, 32 |
| mova [rsp], m15 |
| jmp .idct16 |
| .fast: |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| .idct16: |
| lea tmp3q, [tmp1q-32*8] |
| LOAD_8ROWS tmp3q-32*4, 32 |
| call m(idct_16x16_internal).main |
| call m(inv_txfm_add_dct_dct_16x32).pass2_end |
| add tmp1q, 32*16 |
| sub dstq, r3 |
| lea r2, [r2+r3+16] |
| add dstq, 16 |
| dec tmp4b |
| jg .pass2_loop |
| RET |
| ALIGN function_align |
| .transpose_round_interleave: |
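    ; rounds (pmulhrsw by m10) and transposes four pairs of 8-row halves,
    ; interleaving the 128-bit lanes of the tmp1q and tmp2q rows so that
    ; pass 2 can consume contiguous 16-pixel-wide column strips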
| mov tmp3d, 4 |
| .loop: |
| lea tmp2q, [tmp1q+32*8] |
| mova xm0, [tmp1q-32*4] |
| mova xm1, [tmp1q-32*3] |
| vinserti128 m0, m0, [tmp2q-32*4], 1 |
| vinserti128 m1, m1, [tmp2q-32*3], 1 |
| mova xm2, [tmp1q-32*2] |
| mova xm3, [tmp1q-32*1] |
| vinserti128 m2, m2, [tmp2q-32*2], 1 |
| vinserti128 m3, m3, [tmp2q-32*1], 1 |
| mova xm4, [tmp1q+32*0] |
| mova xm5, [tmp1q+32*1] |
| vinserti128 m4, m4, [tmp2q+32*0], 1 |
| vinserti128 m5, m5, [tmp2q+32*1], 1 |
| mova xm6, [tmp1q+32*2] |
| mova xm7, [tmp1q+32*3] |
| vinserti128 m6, m6, [tmp2q+32*2], 1 |
| vinserti128 m7, m7, [tmp2q+32*3], 1 |
| REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| mova xm8, [tmp1q-32*4+16] |
| mova xm9, [tmp1q-32*3+16] |
| vinserti128 m8, m8, [tmp2q-32*4+16], 1 |
| vinserti128 m9, m9, [tmp2q-32*3+16], 1 |
| mova [tmp1q-32*4], m0 |
| mova [tmp2q-32*4], m1 |
| mova [tmp1q-32*3], m2 |
| mova [tmp2q-32*3], m3 |
| mova xm2, [tmp1q-32*2+16] |
| mova xm3, [tmp1q-32*1+16] |
| vinserti128 m2, m2, [tmp2q-32*2+16], 1 |
| vinserti128 m3, m3, [tmp2q-32*1+16], 1 |
| mova [tmp1q-32*2], m4 |
| mova [tmp2q-32*2], m5 |
| mova [tmp1q-32*1], m6 |
| mova [tmp2q-32*1], m7 |
| mova xm4, [tmp1q+32*0+16] |
| mova xm5, [tmp1q+32*1+16] |
| vinserti128 m4, m4, [tmp2q+32*0+16], 1 |
| vinserti128 m5, m5, [tmp2q+32*1+16], 1 |
| mova xm6, [tmp1q+32*2+16] |
| mova xm7, [tmp1q+32*3+16] |
| vinserti128 m6, m6, [tmp2q+32*2+16], 1 |
| vinserti128 m7, m7, [tmp2q+32*3+16], 1 |
| pmulhrsw m0, m8, m10 |
| pmulhrsw m1, m9, m10 |
| REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 |
| call m(inv_txfm_add_identity_identity_8x32).transpose8x8 |
| mova [tmp1q+32*0], m0 |
| mova [tmp2q+32*0], m1 |
| mova [tmp1q+32*1], m2 |
| mova [tmp2q+32*1], m3 |
| mova [tmp1q+32*2], m4 |
| mova [tmp2q+32*2], m5 |
| mova [tmp1q+32*3], m6 |
| mova [tmp2q+32*3], m7 |
| add tmp1q, 32*16 |
| dec tmp3d |
| jg .loop |
| ret |
| |
| cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob |
| lea rax, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_8192)] |
| mov [cq], eobd |
| mov r2d, 64 |
| jmp m(inv_txfm_add_dct_dct_64x16).dconly |
| .normal: |
| PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 |
| lea tmp1q, [rsp+32*71] |
| lea r10d, [eobq-136] |
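    ; same eob-136 flag as in inv_txfm_add_dct_dct_64x32: bit 31 drives
    ; the pass-1 loop counter, bit 30 is tested in pass 2 for fast paths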
| .pass1_loop: |
| LOAD_8ROWS cq+64*0, 64*4 |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| call m(idct_16x16_internal).main |
| mova m1, [rsp+32*1] |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m1 |
| mova [tmp1q-32*2], m2 |
| mova [tmp1q-32*1], m3 |
| mova [tmp1q+32*0], m4 |
| mova [tmp1q+32*1], m5 |
| mova [tmp1q+32*2], m6 |
| mova [tmp1q+32*3], m7 |
| add tmp1q, 32*8 |
| mova [tmp1q-32*4], m8 |
| mova [tmp1q-32*3], m9 |
| mova [tmp1q-32*2], m10 |
| mova [tmp1q-32*1], m11 |
| mova [tmp1q+32*0], m12 |
| mova [tmp1q+32*1], m13 |
| mova [tmp1q+32*2], m14 |
| mova [tmp1q+32*3], m15 |
| LOAD_8ROWS cq+64*2, 64*4 |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 |
| add tmp1q, 32*8 |
| lea tmp2q, [tmp1q+32*8] |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| vpbroadcastd m15, [o(pd_2048)] |
| add tmp1q, 32*16 |
| add tmp2q, 32*32 |
| mova m0, [cq+64* 1] |
| mova m1, [cq+64*31] |
| mova m2, [cq+64*17] |
| mova m3, [cq+64*15] |
| mova m4, [cq+64* 9] |
| mova m5, [cq+64*23] |
| mova m6, [cq+64*25] |
| mova m7, [cq+64* 7] |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 |
| add rax, o_idct64_offset |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| add rax, 8 |
| add tmp1q, 32*8 |
| sub tmp2q, 32*8 |
| mova m0, [cq+64* 5] |
| mova m1, [cq+64*27] |
| mova m2, [cq+64*21] |
| mova m3, [cq+64*11] |
| mova m4, [cq+64*13] |
| mova m5, [cq+64*19] |
| mova m6, [cq+64*29] |
| mova m7, [cq+64* 3] |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 |
| sub tmp1q, 32*44 |
| vpbroadcastd m10, [o(pw_8192)] |
| call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave |
| add cq, 32 |
| add r10d, 0x80000000 |
| jnc .pass1_loop |
| lea tmp1q, [rsp+32*7] |
| mov r10b, 4 |
| .pass2_loop: |
| lea r2, [tmp1q+32*64] |
| mova m0, [r2-32*4] |
| mova m1, [r2-32*2] |
| mova m2, [r2+32*0] |
| mova m3, [r2+32*2] |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m4 |
| test r10d, 0x40000000 |
| jnz .fast |
| lea r3, [r2+32*64] |
| mova m4, [r3-32*4] |
| mova m5, [r3-32*2] |
| mova m6, [r3+32*0] |
| mova m7, [r3+32*2] |
| .fast: |
| call m(idct_16x16_internal).main |
| mova m1, [rsp+32*1] |
| mova [tmp1q-32*4], m0 |
| mova [tmp1q-32*3], m1 |
| mova [tmp1q-32*2], m2 |
| mova [tmp1q-32*1], m3 |
| mova [tmp1q+32*0], m4 |
| mova [tmp1q+32*1], m5 |
| mova [tmp1q+32*2], m6 |
| mova [tmp1q+32*3], m7 |
| add tmp1q, 32*8 |
| mova [tmp1q-32*4], m8 |
| mova [tmp1q-32*3], m9 |
| mova [tmp1q-32*2], m10 |
| mova [tmp1q-32*1], m11 |
| mova [tmp1q+32*0], m12 |
| mova [tmp1q+32*1], m13 |
| mova [tmp1q+32*2], m14 |
| mova [tmp1q+32*3], m15 |
| mova m0, [r2-32*3] |
| mova m1, [r2-32*1] |
| mova m2, [r2+32*1] |
| mova m3, [r2+32*3] |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| test r10d, 0x40000000 |
| jnz .fast2 |
| mova m4, [r3-32*3] |
| mova m5, [r3-32*1] |
| mova m6, [r3+32*1] |
| mova m7, [r3+32*3] |
| .fast2: |
| add tmp1q, 32*8 |
| lea tmp2q, [tmp1q+32*8] |
| call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast |
| vpbroadcastd m15, [o(pd_2048)] |
| add r2, 32*8 |
| add r3, 32*8 |
| add tmp1q, 32*16 |
| add tmp2q, 32*32 |
| mova m0, [r2-32*4] ; 1 |
| mova m3, [r2+32*3] ; 15 |
| mova m4, [r2+32*0] ; 9 |
| mova m7, [r2-32*1] ; 7 |
| pxor m1, m1 |
| REPX {mova x, m1}, m2, m5, m6 |
| test r10d, 0x40000000 |
| jnz .fast3 |
| mova m1, [r3+32*3] ; 31 |
| mova m2, [r3-32*4] ; 17 |
| mova m5, [r3-32*1] ; 23 |
| mova m6, [r3+32*0] ; 25 |
| .fast3: |
| add rax, o_idct64_offset |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| add rax, 8 |
| add tmp1q, 32*8 |
| sub tmp2q, 32*8 |
| mova m0, [r2-32*2] ; 5 |
| mova m3, [r2+32*1] ; 11 |
| mova m4, [r2+32*2] ; 13 |
| mova m7, [r2-32*3] ; 3 |
| pxor m1, m1 |
| REPX {mova x, m1}, m2, m5, m6 |
| test r10d, 0x40000000 |
| jnz .fast4 |
| mova m1, [r3+32*1] ; 27 |
| mova m2, [r3-32*2] ; 21 |
| mova m5, [r3-32*3] ; 19 |
| mova m6, [r3+32*2] ; 29 |
| .fast4: |
| call m(inv_txfm_add_dct_dct_16x64).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 |
| sub tmp1q, 32*28 |
| sub dstq, r8 |
| lea dstq, [dstq+strideq*4+16] |
| dec r10b |
| jg .pass2_loop |
| RET |
| |
| %endif ; ARCH_X86_64 |