| ; Copyright © 2021, VideoLAN and dav1d authors |
| ; Copyright © 2021, Two Orioles, LLC |
| ; Copyright © 2021, Matthias Dressel |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 32 |
| itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 |
| dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 |
| idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 |
| idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5 |
| iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 |
| idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 |
| iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 |
| pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 |
| idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 |
| idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 |
| |
| %macro COEF_PAIR 2-3 0 |
| pd_%1_%2: dd %1, %1, %2, %2 |
| %define pd_%1 (pd_%1_%2 + 4*0) |
| %define pd_%2 (pd_%1_%2 + 4*2) |
| %if %3 |
| dd -%2, -%2 |
| %define pd_%2_m%2 pd_%2 |
| %endif |
| %endmacro |
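; For example, COEF_PAIR 2896, 1567, 1 expands to
;   pd_2896_1567: dd 2896, 2896, 1567, 1567
;                 dd -1567, -1567
; with pd_2896 and pd_1567 aliasing the two halves, and pd_1567_m1567
; (== pd_1567) covering {1567, 1567, -1567, -1567} for vbroadcasti128.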
| |
| COEF_PAIR 201, 995 |
| COEF_PAIR 401, 1931 |
| COEF_PAIR 799, 3406 |
| COEF_PAIR 1380, 601 |
| COEF_PAIR 1751, 2440 |
| COEF_PAIR 2598, 1189 |
| COEF_PAIR 2751, 2106 |
| COEF_PAIR 2896, 1567, 1 |
| COEF_PAIR 2896, 3784, 1 |
| COEF_PAIR 3035, 3513 |
| COEF_PAIR 3166, 3920 |
| COEF_PAIR 3703, 3290 |
| COEF_PAIR 3857, 4052 |
| COEF_PAIR 4017, 2276 |
| COEF_PAIR 4076, 3612 |
| COEF_PAIR 4091, 3973 |
| |
| pd_8: dd 8 |
| pd_m601: dd -601 |
| pd_m1189: dd -1189 |
| pd_m1380: dd -1380 |
| pd_m2106: dd -2106 |
| pd_m2598: dd -2598 |
| pd_m2751: dd -2751 |
| pd_m3344: dd -3344 |
| pd_1024: dd 1024 |
| pd_1321: dd 1321 |
| pd_1448: dd 1448 |
| pd_1697: dd 1697 |
| pd_2482: dd 2482 |
| pd_3072: dd 3072 ; 1024 + 2048 |
| pd_3803: dd 3803 |
| pd_5119: dd 5119 ; 1024 + 4096 - 1 |
| pd_5120: dd 5120 ; 1024 + 4096 |
| pd_5793: dd 5793 |
| pd_6144: dd 6144 ; 2048 + 4096 |
| pd_17408: dd 17408 ; 1024 + 16384 |
| |
| pixel_10bpc_max: times 2 dw 0x03ff |
| pixel_12bpc_max: times 2 dw 0x0fff |
| dconly_10bpc: times 2 dw 0x7c00 |
| dconly_12bpc: times 2 dw 0x7000 |
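; dconly_Nbpc = 0x7fff - pixel_Nbpc_max: biasing by this value makes
; paddsw saturate at bias + pixel_max and lets psubusw clamp at zero, so
; the dconly loops clip to [0, pixel_max] without pminsw/pmaxsw:
;   10bpc: 0x7fff - 0x03ff = 0x7c00,  12bpc: 0x7fff - 0x0fff = 0x7000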
| clip_18b_min: dd -0x20000 |
| clip_18b_max: dd 0x1ffff |
| clip_20b_min: dd -0x80000 |
| clip_20b_max: dd 0x7ffff |
| |
| idct64_mul_16bpc: |
| dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 |
| dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 |
| dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 |
| dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 |
| |
| cextern deint_shuf |
| cextern idct64_mul |
| cextern pw_1697x8 |
| cextern pw_1697x16 |
| cextern pw_1567_3784 |
| cextern pw_m1567_m3784 |
| cextern pw_m3784_1567 |
| cextern pw_2896_2896 |
| cextern pw_m2896_2896 |
| cextern pw_5 |
| cextern pw_2048 |
| cextern pw_4096 |
| cextern pw_8192 |
| cextern pw_16384 |
| cextern pw_2896x8 |
| cextern pd_2048 |
| |
| cextern idct_4x8_internal_8bpc_avx2.main |
| cextern idct_4x16_internal_8bpc_avx2.main |
| cextern idct_8x8_internal_8bpc_avx2.main |
| cextern idct_8x16_internal_8bpc_avx2.main |
| cextern idct_16x4_internal_8bpc_avx2.main |
| cextern idct_16x8_internal_8bpc_avx2.main |
| cextern idct_16x16_internal_8bpc_avx2.main |
| cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main |
| cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast |
| cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf |
| cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast |
| cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1 |
| cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal |
| |
| cextern iadst_4x4_internal_8bpc_avx2.main |
| cextern iadst_4x8_internal_8bpc_avx2.main_pass2 |
| cextern iadst_4x16_internal_8bpc_avx2.main2 |
| cextern iadst_8x4_internal_8bpc_avx2.main |
| cextern iadst_8x8_internal_8bpc_avx2.main_pass2 |
| cextern iadst_8x16_internal_8bpc_avx2.main |
| cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end |
| cextern iadst_16x4_internal_8bpc_avx2.main |
| cextern iadst_16x8_internal_8bpc_avx2.main |
| cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end |
| cextern iadst_16x16_internal_8bpc_avx2.main |
| cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end |
| |
| SECTION .text |
| |
| %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) |
| |
| %macro WRAP_XMM 1+ |
| INIT_XMM cpuname |
| %1 |
| INIT_YMM cpuname |
| %endmacro |
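; Temporarily switch to the XMM register set so that a macro written
; against full-width (YMM) registers emits its 128-bit form instead.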
| |
| %macro IWHT4_1D_PACKED 0 |
| ; m0 = in0 in2, m1 = in1 in3 |
| psubd m2, m0, m1 ; t2 |
| paddd xm0, xm1 ; t0 |
| vpermq m2, m2, q3322 |
| vpermq m0, m0, q1100 |
| vpermq m1, m1, q3120 |
| psubd m3, m0, m2 |
| psrad m3, 1 |
| psubd m3, m1 ; t1 t3 |
| psubd m0, m3 ; ____ out0 |
| paddd m2, m3 ; out3 ____ |
| %endmacro |
| |
| INIT_YMM avx2 |
| cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax |
| mova xm0, [cq+16*0] |
| vinserti128 m0, [cq+16*2], 1 |
| mova xm1, [cq+16*1] |
| vinserti128 m1, [cq+16*3], 1 |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| lea r6, [dstq+strideq*2] |
| psrad m0, 2 |
| psrad m1, 2 |
| IWHT4_1D_PACKED |
| punpckhdq m0, m3 |
| punpckldq m3, m2 |
| punpckhqdq m1, m0, m3 |
| punpcklqdq m0, m3 |
| IWHT4_1D_PACKED |
| vpblendd m0, m2, 0x33 |
| packssdw m0, m3 |
| vextracti128 xm2, m0, 1 |
| punpckhdq xm1, xm0, xm2 ; out2 out1 |
| punpckldq xm0, xm2 ; out3 out0 |
| movq xm2, [r6 +strideq*1] |
| movhps xm2, [dstq+strideq*0] |
| movq xm3, [r6 +strideq*0] |
| movhps xm3, [dstq+strideq*1] |
%ifidn bdmaxd, bdmaxm ; bdmax is already in a gpr
| movd xm5, bdmaxd |
| vpbroadcastw xm5, xm5 |
| %else ; win64: load from stack |
| vpbroadcastw xm5, bdmaxm |
| %endif |
| paddsw xm0, xm2 |
| paddsw xm1, xm3 |
| pmaxsw xm0, xm4 |
| pmaxsw xm1, xm4 |
| pminsw xm0, xm5 |
| pminsw xm1, xm5 |
| movhps [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm1 |
| movq [r6 +strideq*0], xm1 |
| movq [r6 +strideq*1], xm0 |
| RET |
| |
| ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 |
| ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 |
| ; flags: 1 = packed, 2 = inv_dst2 |
| ; skip round/shift if rnd is not a number |
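; In other words, a rotation butterfly in Q12 fixed point:
;   [dst1]   [coef1 -coef2] [src1]
;   [dst2] = [coef2  coef1] [src2]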
| %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags |
| %if %8 < 32 |
| pmulld m%4, m%1, m%8 |
| pmulld m%3, m%2, m%8 |
| %else |
| %if %9 & 1 |
| vbroadcasti128 m%3, [pd_%8] |
| %else |
| vpbroadcastd m%3, [pd_%8] |
| %endif |
| pmulld m%4, m%1, m%3 |
| pmulld m%3, m%2 |
| %endif |
| %if %7 < 32 |
| pmulld m%1, m%7 |
| pmulld m%2, m%7 |
| %else |
| %if %9 & 1 |
| vbroadcasti128 m%5, [pd_%7] |
| %else |
| vpbroadcastd m%5, [pd_%7] |
| %endif |
| pmulld m%1, m%5 |
| pmulld m%2, m%5 |
| %endif |
| %if %9 & 2 |
| psubd m%4, m%6, m%4 |
| psubd m%2, m%4, m%2 |
| %else |
| %ifnum %6 |
| paddd m%4, m%6 |
| %endif |
| paddd m%2, m%4 |
| %endif |
| %ifnum %6 |
| paddd m%1, m%6 |
| %endif |
| psubd m%1, m%3 |
| %ifnum %6 |
| psrad m%2, 12 |
| psrad m%1, 12 |
| %endif |
| %endmacro |
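; A coef argument below 32 names a register instead of a constant, so a
; coefficient already broadcast by a previous invocation can be reused
; (see e.g. the "10" coef arguments in the iadst16 main loop below).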
| |
| %macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth |
| cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2 |
| %define %%p1 m(i%1_%4_internal_%5bpc) |
; If we're not taking the fast path, jump to the 1st txfm function,
; which in turn performs an indirect jump to the 2nd txfm function.
| lea tx2q, [m(i%2_%4_internal_%5bpc).pass2] |
| %ifidn %1_%2, dct_dct |
| test eobd, eobd |
| jnz %%p1 |
| %else |
| %if %3 |
| add eobd, %3 |
| %endif |
| ; jump to the 1st txfm function unless it's located directly after this |
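; ((%%end - %%p1) >> 31) & 1 is 1 only when %%p1 resolves to a higher
; address than %%end, so 'times' emits the jmp exactly when falling
; through would not reach the function.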
| times ((%%end - %%p1) >> 31) & 1 jmp %%p1 |
| ALIGN function_align |
| %%end: |
| %endif |
| %endmacro |
| |
| %macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth |
| INV_TXFM_FN %1, %2, 0, 4x4, %3 |
| %ifidn %1_%2, dct_dct |
| vpbroadcastd xm2, [dconly_%3bpc] |
| %if %3 = 10 |
| .dconly: |
| imul r6d, [cq], 181 |
| mov [cq], eobd ; 0 |
| or r3d, 4 |
| .dconly2: |
| add r6d, 128 |
| sar r6d, 8 |
| .dconly3: |
| imul r6d, 181 |
| add r6d, 2176 |
| sar r6d, 12 |
| movd xm0, r6d |
| paddsw xm0, xm2 |
| vpbroadcastw xm0, xm0 |
| .dconly_loop: |
| movq xm1, [dstq+strideq*0] |
| movhps xm1, [dstq+strideq*1] |
paddsw xm1, xm0 ; += dc + bias, saturating at bias + pixel_max (0x7fff)
psubusw xm1, xm2 ; -= bias, unsigned saturation clamps at 0
| movq [dstq+strideq*0], xm1 |
| movhps [dstq+strideq*1], xm1 |
| lea dstq, [dstq+strideq*2] |
| sub r3d, 2 |
| jg .dconly_loop |
| WRAP_XMM RET |
| %else |
| jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly |
| %endif |
| %endif |
| %endmacro |
| |
| %macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd |
| ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1 |
| punpckhqdq m%3, m%2, m%1 ; t3 t2 |
| punpcklqdq m%2, m%1 ; t0 t1 |
| paddd m%1, m%2, m%3 ; out0 out1 |
| psubd m%2, m%3 ; out3 out2 |
| %endmacro |
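; The packed layout keeps two elements per qword, so the single
; ITX_MULSUB_2D above performs both 4-point idct rotations at once: the
; vbroadcasti128 coefficient pairs apply 2896/2896 (t0/t1) in one qword
; half and 1567/3784 (t2/t3) in the other.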
| |
| %macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd |
| vpbroadcastd m%5, [pw_m3784_1567] |
| punpckhwd m%3, m%2, m%1 |
| vpbroadcastd m%4, [pw_1567_3784] |
| punpcklwd m%2, m%1 |
| vpbroadcastd m%1, [pw_m2896_2896] |
| pmaddwd m%5, m%3 |
| pmaddwd m%3, m%4 |
| vpbroadcastd m%4, [pw_2896_2896] |
| pmaddwd m%1, m%2 |
| pmaddwd m%2, m%4 |
| REPX {paddd x, m%6}, m%5, m%3, m%1, m%2 |
| REPX {psrad x, 12 }, m%5, m%3, m%1, m%2 |
| packssdw m%3, m%5 ; t3 t2 |
| packssdw m%2, m%1 ; t0 t1 |
| paddsw m%1, m%2, m%3 ; out0 out1 |
| psubsw m%2, m%3 ; out3 out2 |
| %endmacro |
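; Word-domain variant: punpck{l,h}wd interleaves the two inputs so that
; pmaddwd with packed coefficient pairs (e.g. pw_1567_3784) computes
; src1*coef1 +/- src2*coef2 in a single instruction per output.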
| |
| INV_TXFM_4X4_FN dct, dct |
| INV_TXFM_4X4_FN dct, identity |
| INV_TXFM_4X4_FN dct, adst |
| INV_TXFM_4X4_FN dct, flipadst |
| |
| cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 |
| call .main |
| vbroadcasti128 m2, [idct4_shuf] |
| packssdw m0, m1 |
| pshufb m0, m2 |
| jmp tx2q |
| .pass2: |
| vextracti128 xm1, m0, 1 |
| WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 |
| packssdw xm5, xm5 ; pw_2048 |
| pmulhrsw xm0, xm5 |
| pmulhrsw xm1, xm5 |
| movq xm2, [dstq+strideq*0] |
| movhps xm2, [dstq+strideq*1] |
| lea r6, [dstq+strideq*2] |
| movq xm3, [r6 +strideq*1] |
| movhps xm3, [r6 +strideq*0] |
| vpbroadcastd xm5, [pixel_10bpc_max] |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| paddw xm0, xm2 |
| paddw xm1, xm3 |
| pmaxsw xm0, xm4 |
| pmaxsw xm1, xm4 |
| pminsw xm0, xm5 |
| pminsw xm1, xm5 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movhps [r6 +strideq*0], xm1 |
| movq [r6 +strideq*1], xm1 |
| RET |
| ALIGN function_align |
| .main: |
| vpermq m0, [cq+32*0], q3120 |
| vpermq m1, [cq+32*1], q3120 |
| vpbroadcastd m5, [pd_2048] |
| .main2: |
| IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 |
| ret |
| |
| INV_TXFM_4X4_FN adst, dct |
| INV_TXFM_4X4_FN adst, adst |
| INV_TXFM_4X4_FN adst, flipadst |
| INV_TXFM_4X4_FN adst, identity |
| |
| %macro IADST4_1D 0 |
| vpbroadcastd m5, [pd_1321] |
| vpbroadcastd m7, [pd_2482] |
| pmulld m4, m0, m5 ; 1321*in0 |
| pmulld m6, m3, m7 ; 2482*in3 |
| paddd m4, m6 ; 1321*in0 + 2482*in3 |
| pmulld m6, m0, m7 ; 2482*in0 |
| paddd m0, m3 ; in0 + in3 |
| paddd m7, m5 ; pd_3803 |
| pmulld m5, m2 ; 1321*in2 |
| pmulld m3, m7 ; 3803*in3 |
| pmulld m7, m2 ; 3803*in2 |
| psubd m2, m0 ; in2 - in0 - in3 |
| vpbroadcastd m0, [pd_m3344] |
| pmulld m1, m0 ; -t3 |
| pmulld m2, m0 ; out2 (unrounded) |
| psubd m6, m5 ; 2482*in0 - 1321*in2 |
| paddd m4, m7 ; t0 |
| psubd m6, m3 ; t1 |
| paddd m3, m4, m6 |
| psubd m4, m1 ; out0 (unrounded) |
| psubd m6, m1 ; out1 (unrounded) |
| paddd m3, m1 ; out3 (unrounded) |
| %endmacro |
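; AV1 iadst4 with the four Q12 sinpi constants 1321, 2482, 3344, 3803:
; 3803 is built on the fly as 1321 + 2482, and multiplying by -3344
; (pd_m3344) folds the negation of t3 into the constant so that no
; separate sign flip is needed for out0/out1/out3.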
| |
| cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 |
| call .main |
| vinserti128 m0, m4, xm6, 1 |
| vinserti128 m1, m2, xm3, 1 |
| .pass1_end: |
| vpbroadcastd m5, [pd_2048] |
| mova m2, [itx4_shuf] |
| paddd m0, m5 |
| paddd m1, m5 |
| psrad m0, 12 |
| psrad m1, 12 |
| packssdw m0, m1 |
| vpermd m0, m2, m0 |
| psrld m2, 4 |
| pshufb m0, m2 |
| %if WIN64 |
| movaps xmm6, [rsp+ 8] |
| movaps xmm7, [rsp+24] |
| %endif |
| jmp tx2q |
| .pass2: |
| lea r6, [deint_shuf+128] |
| vextracti128 xm1, m0, 1 |
| call m(iadst_4x4_internal_8bpc).main |
| .end: |
| vpbroadcastd xm4, [pw_2048] |
| movq xm2, [dstq+strideq*0] |
| movhps xm2, [dstq+strideq*1] |
| lea r6, [dstq+strideq*2] |
| movq xm3, [r6 +strideq*0] |
| movhps xm3, [r6 +strideq*1] |
| vpbroadcastd xm5, [pixel_10bpc_max] |
| pmulhrsw xm0, xm4 |
| pmulhrsw xm1, xm4 |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| paddw xm0, xm2 |
| paddw xm1, xm3 |
| pmaxsw xm0, xm4 |
| pmaxsw xm1, xm4 |
| pminsw xm0, xm5 |
| pminsw xm1, xm5 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movq [r6 +strideq*0], xm1 |
| movhps [r6 +strideq*1], xm1 |
| RET |
| ALIGN function_align |
| .main: |
| mova xm0, [cq+16*0] |
| mova xm1, [cq+16*1] |
| mova xm2, [cq+16*2] |
| mova xm3, [cq+16*3] |
| %if WIN64 |
| movaps [rsp+16], xmm6 |
| movaps [rsp+32], xmm7 |
| %endif |
| .main2: |
| WRAP_XMM IADST4_1D |
| ret |
| |
| INV_TXFM_4X4_FN flipadst, dct |
| INV_TXFM_4X4_FN flipadst, adst |
| INV_TXFM_4X4_FN flipadst, flipadst |
| INV_TXFM_4X4_FN flipadst, identity |
| |
| cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 |
| call m(iadst_4x4_internal_10bpc).main |
| vinserti128 m0, m3, xm2, 1 |
| vinserti128 m1, m6, xm4, 1 |
| jmp m(iadst_4x4_internal_10bpc).pass1_end |
| .pass2: |
| lea r6, [deint_shuf+128] |
| vextracti128 xm1, m0, 1 |
| call m(iadst_4x4_internal_8bpc).main |
| vpbroadcastd xm4, [pw_2048] |
| movq xm3, [dstq+strideq*1] |
| movhps xm3, [dstq+strideq*0] |
| lea r6, [dstq+strideq*2] |
| movq xm2, [r6 +strideq*1] |
| movhps xm2, [r6 +strideq*0] |
| vpbroadcastd xm5, [pixel_10bpc_max] |
| pmulhrsw xm0, xm4 |
| pmulhrsw xm1, xm4 |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| paddw xm0, xm2 |
| paddw xm1, xm3 |
| pmaxsw xm0, xm4 |
| pmaxsw xm1, xm4 |
| pminsw xm0, xm5 |
| pminsw xm1, xm5 |
| movhps [dstq+strideq*0], xm1 |
| movq [dstq+strideq*1], xm1 |
| movhps [r6 +strideq*0], xm0 |
| movq [r6 +strideq*1], xm0 |
| RET |
| |
| INV_TXFM_4X4_FN identity, dct |
| INV_TXFM_4X4_FN identity, adst |
| INV_TXFM_4X4_FN identity, flipadst |
| INV_TXFM_4X4_FN identity, identity |
| |
| cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 |
vpbroadcastd m1, [pd_5793] ; sqrt(2) in Q12
| pmulld m0, m1, [cq+32*0] |
| pmulld m1, [cq+32*1] |
| vpbroadcastd m5, [pd_2048] |
| mova m3, [itx4_shuf] |
| paddd m0, m5 |
| paddd m1, m5 |
| psrad m0, 12 |
| psrad m1, 12 |
| packssdw m0, m1 |
| vpermd m0, m3, m0 |
| psrld m3, 4 |
| pshufb m0, m3 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m1, [pw_1697x8] |
| movq xm2, [dstq+strideq*0] |
| movhps xm2, [dstq+strideq*1] |
| lea r6, [dstq+strideq*2] |
| pmulhrsw m1, m0 |
| paddsw m0, m1 |
| movq xm3, [r6 +strideq*0] |
| movhps xm3, [r6 +strideq*1] |
| vpbroadcastd xm4, [pixel_10bpc_max] |
| packssdw m5, m5 ; pw_2048 |
| pmulhrsw m0, m5 |
| pxor m5, m5 |
| mova [cq+32*0], m5 |
| mova [cq+32*1], m5 |
| vextracti128 xm1, m0, 1 |
| paddw xm0, xm2 |
| paddw xm1, xm3 |
| pmaxsw xm0, xm5 |
| pmaxsw xm1, xm5 |
| pminsw xm0, xm4 |
| pminsw xm1, xm4 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movq [r6 +strideq*0], xm1 |
| movhps [r6 +strideq*1], xm1 |
| RET |
| |
| INV_TXFM_4X4_FN dct, dct, 12 |
| INV_TXFM_4X4_FN dct, identity, 12 |
| INV_TXFM_4X4_FN dct, adst, 12 |
| INV_TXFM_4X4_FN dct, flipadst, 12 |
| |
| cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 |
| call m(idct_4x4_internal_10bpc).main |
| mova m3, [idct4_12_shuf] |
| mova m4, [idct4_12_shuf2] |
| vpermd m2, m4, m1 |
| vpermd m1, m3, m0 |
| jmp m(iadst_4x4_internal_12bpc).pass1_end2 |
| .pass2: |
| vpbroadcastd m5, [pd_2048] |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q3120 |
| call m(idct_4x4_internal_10bpc).main2 |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| jmp m(iadst_4x4_internal_12bpc).end |
| |
| INV_TXFM_4X4_FN adst, dct, 12 |
| INV_TXFM_4X4_FN adst, adst, 12 |
| INV_TXFM_4X4_FN adst, flipadst, 12 |
| INV_TXFM_4X4_FN adst, identity, 12 |
| |
| cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 |
| call m(iadst_4x4_internal_10bpc).main |
| vinserti128 m1, m4, xm6, 1 |
| vinserti128 m2, xm3, 1 |
| .pass1_end: |
| mova m3, [itx4_shuf] |
| vpbroadcastd m5, [pd_1024] |
| psrad m1, 1 |
| psrad m2, 1 |
| vpermd m1, m3, m1 |
| vpermd m2, m3, m2 |
| paddd m1, m5 |
| paddd m2, m5 |
| psrad m1, 11 |
| psrad m2, 11 |
| .pass1_end2: |
| vpbroadcastd m3, [clip_18b_min] |
| vpbroadcastd m4, [clip_18b_max] |
| punpcklqdq m0, m1, m2 |
| punpckhqdq m1, m2 |
| pmaxsd m0, m3 |
| pmaxsd m1, m3 |
| pminsd m0, m4 |
| pminsd m1, m4 |
| jmp tx2q |
| .pass2: |
| call .main_pass2 |
| vinserti128 m0, m4, xm6, 1 |
| vinserti128 m1, m2, xm3, 1 |
| .pass2_end: |
| vpbroadcastd m5, [pd_2048] |
| paddd m0, m5 |
| paddd m1, m5 |
| psrad m0, 12 |
| psrad m1, 12 |
| .end: |
| %if WIN64 |
| WIN64_RESTORE_XMM_INTERNAL |
| %assign xmm_regs_used 6 |
| %endif |
| .end2: |
| vpbroadcastd m4, [pw_16384] |
| movq xm2, [dstq+strideq*0] |
| movq xm3, [dstq+strideq*1] |
| lea r6, [dstq+strideq*2] |
| movhps xm2, [r6 +strideq*0] ; dst0 dst2 |
| movhps xm3, [r6 +strideq*1] ; dst1 dst3 |
| vpbroadcastd m5, [pixel_12bpc_max] |
| vinserti128 m2, xm3, 1 |
| psrad m0, 3 |
| psrad m1, 3 |
| packssdw m0, m1 ; t0 t2 t1 t3 |
| pmulhrsw m0, m4 |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| paddw m0, m2 ; out0 out2 out1 out3 |
| pmaxsw m0, m4 |
| pminsw m0, m5 |
| vextracti128 xm1, m0, 1 ; out1 out3 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [r6 +strideq*0], xm0 |
| movhps [r6 +strideq*1], xm1 |
| RET |
| .main_pass2: |
| vextracti128 xm3, m1, 1 |
| mova xm2, xm1 |
| vextracti128 xm1, m0, 1 |
| jmp m(iadst_4x4_internal_10bpc).main2 |
| |
| INV_TXFM_4X4_FN flipadst, dct, 12 |
| INV_TXFM_4X4_FN flipadst, adst, 12 |
| INV_TXFM_4X4_FN flipadst, flipadst, 12 |
| INV_TXFM_4X4_FN flipadst, identity, 12 |
| |
| cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 |
| call m(iadst_4x4_internal_10bpc).main |
| vinserti128 m1, m3, xm2, 1 |
| vinserti128 m2, m6, xm4, 1 |
| jmp m(iadst_4x4_internal_12bpc).pass1_end |
| .pass2: |
| call m(iadst_4x4_internal_12bpc).main_pass2 |
| vinserti128 m0, m3, xm2, 1 |
| vinserti128 m1, m6, xm4, 1 |
| jmp m(iadst_4x4_internal_12bpc).pass2_end |
| |
| INV_TXFM_4X4_FN identity, dct, 12 |
| INV_TXFM_4X4_FN identity, adst, 12 |
| INV_TXFM_4X4_FN identity, flipadst, 12 |
| INV_TXFM_4X4_FN identity, identity, 12 |
| |
| cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 |
| mova m2, [itx4_shuf] |
| vpbroadcastd m3, [pd_1697] |
| vpermd m0, m2, [cq+32*0] |
| vpermd m2, m2, [cq+32*1] |
| vpbroadcastd m5, [pd_2048] |
| pmulld m1, m3, m0 |
| pmulld m3, m2 |
| paddd m1, m5 |
| paddd m3, m5 |
| psrad m1, 12 |
| psrad m3, 12 |
| paddd m1, m0 |
| paddd m2, m3 |
| jmp m(iadst_4x4_internal_12bpc).pass1_end2 |
| .pass2: |
| ; m0 = in0 in1 |
| ; m1 = in2 in3 |
| vpbroadcastd m3, [pd_5793] |
| vpbroadcastd m5, [pd_2048] |
| pmulld m0, m3 |
| pmulld m1, m3 |
| paddd m0, m5 ; 2048 |
| paddd m1, m5 |
| psrad m0, 12 |
| psrad m1, 12 |
| jmp m(iadst_4x4_internal_12bpc).end |
| |
| %macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth |
| INV_TXFM_FN %1, %2, 0, 4x8, %3 |
| %ifidn %1_%2, dct_dct |
| vpbroadcastd xm2, [dconly_%3bpc] |
| %if %3 = 10 |
| .dconly: |
| imul r6d, [cq], 181 |
| mov [cq], eobd ; 0 |
| or r3d, 8 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2 |
| %else |
| jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly |
| %endif |
| %endif |
| %endmacro |
| |
| %macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd |
| ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3 |
| vpbroadcastd m%5, [pd_2896] |
| pmulld m%1, m%5 |
| pmulld m%3, m%5 |
| paddd m%1, m%8 |
| paddd m%5, m%1, m%3 |
| psubd m%1, m%3 |
| psrad m%5, 12 ; t0 |
| psrad m%1, 12 ; t1 |
| psubd m%3, m%1, m%2 |
| paddd m%2, m%1 |
| paddd m%1, m%5, m%4 |
| psubd m%4, m%5, m%4 |
| %endmacro |
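; rnd is added to m%1 alone before the t0/t1 sum and difference, so a
; single paddd rounds both halves of the 2896 butterfly.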
| |
| INV_TXFM_4X8_FN dct, dct |
| INV_TXFM_4X8_FN dct, identity |
| INV_TXFM_4X8_FN dct, adst |
| INV_TXFM_4X8_FN dct, flipadst |
| |
| cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 |
| .pass1: |
| vpbroadcastd m3, [pd_2896] |
| pmulld m0, m3, [cq+32*0] |
| pmulld m1, m3, [cq+32*1] |
| pmulld m2, m3, [cq+32*2] |
| pmulld m3, m3, [cq+32*3] |
| vpbroadcastd m7, [pd_2048] |
| REPX {paddd x, m7}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 |
| jmp tx2q |
| .pass2: |
| packssdw m0, m2 |
| packssdw m1, m3 |
| lea r6, [deint_shuf+128] |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| punpckhdq m1, m0, m2 ; 2 3 |
| punpckldq m0, m2 ; 0 1 |
| vextracti128 xm2, m0, 1 ; 4 5 |
| vextracti128 xm3, m1, 1 ; 6 7 |
| call m(idct_4x8_internal_8bpc).main |
| vpbroadcastd xm4, [pw_2048] |
| REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 |
| lea r3, [strideq*3] |
| lea r6, [dstq+strideq*4] |
| movq xm4, [dstq+strideq*0] |
| movhps xm4, [dstq+strideq*1] |
| movq xm5, [dstq+r3 ] |
| movhps xm5, [dstq+strideq*2] |
| movq xm6, [r6 +strideq*0] |
| movhps xm6, [r6 +strideq*1] |
| movq xm7, [r6 +r3 ] |
| movhps xm7, [r6 +strideq*2] |
| paddw xm0, xm4 ; 0 1 |
| paddw xm1, xm5 ; 3 2 |
| paddw xm2, xm6 ; 4 5 |
| paddw xm3, xm7 ; 7 6 |
| vpbroadcastd xm5, [pixel_10bpc_max] |
| pxor m4, m4 |
| REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 |
| REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 |
| REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movhps [dstq+strideq*2], xm1 |
| movq [dstq+r3 ], xm1 |
| movq [r6 +strideq*0], xm2 |
| movhps [r6 +strideq*1], xm2 |
| movhps [r6 +strideq*2], xm3 |
| movq [r6 +r3 ], xm3 |
| RET |
| |
| INV_TXFM_4X8_FN adst, dct |
| INV_TXFM_4X8_FN adst, adst |
| INV_TXFM_4X8_FN adst, flipadst |
| INV_TXFM_4X8_FN adst, identity |
| |
| cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 |
| call m(iadst_8x4_internal_10bpc).main |
| vpbroadcastd m5, [pd_2048] |
| paddd m0, m5, m4 |
| paddd m1, m5, m6 |
| paddd m2, m5 |
| paddd m3, m5 |
| .pass1_end: |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| jmp tx2q |
| .pass2: |
| call .pass2_main |
| mova xm4, [pw_2048_m2048] |
| REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 |
| .end: |
| lea r3, [strideq*3] |
| lea r6, [dstq+strideq*4] |
| movq xm4, [dstq+strideq*0] |
| movhps xm4, [dstq+strideq*1] |
| movq xm5, [dstq+strideq*2] |
| movhps xm5, [dstq+r3 ] |
| movq xm6, [r6 +strideq*0] |
| movhps xm6, [r6 +strideq*1] |
| movq xm7, [r6 +strideq*2] |
| movhps xm7, [r6 +r3 ] |
| paddw xm0, xm4 ; 0 1 |
| paddw xm1, xm5 ; 2 3 |
| paddw xm2, xm6 ; 4 5 |
| paddw xm3, xm7 ; 6 7 |
| vpbroadcastd xm5, [pixel_10bpc_max] |
| pxor m4, m4 |
| REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 |
| REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 |
| REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movq [dstq+strideq*2], xm1 |
| movhps [dstq+r3 ], xm1 |
| movq [r6 +strideq*0], xm2 |
| movhps [r6 +strideq*1], xm2 |
| movq [r6 +strideq*2], xm3 |
| movhps [r6 +r3 ], xm3 |
| RET |
| ALIGN function_align |
| .pass2_main: |
| packssdw m0, m2 |
| packssdw m1, m3 |
| lea r6, [deint_shuf+128] |
| punpcklwd m4, m0, m1 |
| punpckhwd m0, m1 |
| punpckhdq m5, m4, m0 |
| punpckldq m4, m0 |
| vextracti128 xm2, m4, 1 ; 4 5 |
| vextracti128 xm3, m5, 1 ; 6 7 |
| pshufd xm4, xm4, q1032 ; 1 0 |
| pshufd xm5, xm5, q1032 ; 3 2 |
| jmp m(iadst_4x8_internal_8bpc).main_pass2 |
| ALIGN function_align |
| .main: |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| .main2: |
| vbroadcasti128 m0, [cq+16*0] |
| vbroadcasti128 m2, [cq+16*2] |
| vbroadcasti128 m3, [cq+16*5] |
| vbroadcasti128 m1, [cq+16*7] |
| vpbroadcastd m6, [pd_2896] |
| shufpd m0, m2, 0x0c ; 0 2 |
| shufpd m1, m3, 0x0c ; 7 5 |
| vbroadcasti128 m2, [cq+16*4] |
| vbroadcasti128 m4, [cq+16*6] |
| vbroadcasti128 m5, [cq+16*1] |
| vbroadcasti128 m3, [cq+16*3] |
| vpbroadcastd m7, [pd_2048] |
| shufpd m2, m4, 0x0c ; 4 6 |
| shufpd m3, m5, 0x0c ; 3 1 |
| REPX {pmulld x, m6}, m0, m1, m2, m3 |
| REPX {paddd x, m7}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| .main3: |
| ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1 |
| ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1 |
| psubd m4, m0, m2 ; t4 t6 |
| paddd m0, m2 ; t0 t2 |
| psubd m2, m1, m3 ; t5 t7 |
| paddd m1, m3 ; t1 t3 |
| REPX {pmaxsd x, m8}, m4, m2, m0, m1 |
| REPX {pminsd x, m9}, m4, m2, m0, m1 |
| pxor m5, m5 |
| psubd m5, m4 |
| vpblendd m4, m2, 0xcc ; t4 t7 |
| vpblendd m2, m5, 0xcc ; t5 -t6 |
| ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784 |
| vpbroadcastd m5, [pd_2896] |
| vbroadcasti128 m6, [pw_2048_m2048] ; + + - - |
| punpckhqdq m3, m0, m1 |
| punpcklqdq m0, m1 |
| psubd m1, m0, m3 ; t2 t3 |
| paddd m0, m3 ; out0 -out7 |
| punpckhqdq m3, m4, m2 ; t7a t6a |
| punpcklqdq m4, m2 ; t5a t4a |
| psubd m2, m4, m3 ; t7 t6 |
| paddd m4, m3 ; out6 -out1 |
| REPX {pmaxsd x, m8}, m1, m2 |
| REPX {pminsd x, m9}, m1, m2 |
| vpblendd m3, m1, m2, 0xcc |
| shufpd m1, m2, 0x05 |
| pmulld m3, m5 |
| pmulld m5, m1 |
| psignd m0, m6 ; out0 out7 |
| psignd m4, m6 ; out6 out1 |
| paddd m3, m7 |
| psubd m2, m3, m5 |
| paddd m5, m3 |
| psrad m2, 12 ; out4 -out5 |
| psrad m5, 12 ; -out3 out2 |
| ret |
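; Note the reuse of pw_2048_m2048 as a dword sign mask above: its 16
; bytes read as two positive and two negative dwords ("+ + - -"), so
; psignd negates exactly the outputs that carry a minus sign in the
; comments.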
| |
| INV_TXFM_4X8_FN flipadst, dct |
| INV_TXFM_4X8_FN flipadst, adst |
| INV_TXFM_4X8_FN flipadst, flipadst |
| INV_TXFM_4X8_FN flipadst, identity |
| |
| cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 |
| call m(iadst_8x4_internal_10bpc).main |
| vpbroadcastd m5, [pd_2048] |
| paddd m0, m5, m3 |
| paddd m1, m5, m2 |
| paddd m2, m5, m6 |
| paddd m3, m5, m4 |
| jmp m(iadst_4x8_internal_10bpc).pass1_end |
| .pass2: |
| call m(iadst_4x8_internal_10bpc).pass2_main |
| mova xm4, [pw_2048_m2048] |
| REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 |
| lea r3, [strideq*3] |
| lea r6, [dstq+strideq*4] |
| movq xm4, [dstq+strideq*1] |
| movhps xm4, [dstq+strideq*0] |
| movq xm5, [dstq+r3 ] |
| movhps xm5, [dstq+strideq*2] |
| movq xm6, [r6 +strideq*1] |
| movhps xm6, [r6 +strideq*0] |
| movq xm7, [r6 +r3 ] |
| movhps xm7, [r6 +strideq*2] |
| paddw xm3, xm4 ; 1 0 |
| paddw xm2, xm5 ; 3 2 |
| paddw xm1, xm6 ; 5 4 |
| paddw xm0, xm7 ; 7 6 |
| vpbroadcastd xm5, [pixel_10bpc_max] |
| pxor m4, m4 |
| REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 |
| REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 |
| REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 |
| movhps [dstq+strideq*0], xm3 |
| movq [dstq+strideq*1], xm3 |
| movhps [dstq+strideq*2], xm2 |
| movq [dstq+r3 ], xm2 |
| movhps [r6 +strideq*0], xm1 |
| movq [r6 +strideq*1], xm1 |
| movhps [r6 +strideq*2], xm0 |
| movq [r6 +r3 ], xm0 |
| RET |
| |
| INV_TXFM_4X8_FN identity, dct |
| INV_TXFM_4X8_FN identity, adst |
| INV_TXFM_4X8_FN identity, flipadst |
| INV_TXFM_4X8_FN identity, identity |
| |
| cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 |
| .pass1: |
| vpbroadcastd m3, [pd_2896] |
| pmulld m0, m3, [cq+32*0] |
| pmulld m1, m3, [cq+32*1] |
| pmulld m2, m3, [cq+32*2] |
| pmulld m3, [cq+32*3] |
| vpbroadcastd m5, [pd_2048] |
| vpbroadcastd m4, [pd_5793] |
| REPX {paddd x, m5}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| REPX {pmulld x, m4}, m0, m1, m2, m3 |
| REPX {paddd x, m5}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m6, [pixel_10bpc_max] |
| call .pass2_end |
| RET |
| ALIGN function_align |
| .pass2_end: |
| vpbroadcastd m4, [pw_4096] |
| packssdw m0, m2 |
| packssdw m1, m3 |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| pmulhrsw m2, m4 |
| pmulhrsw m0, m4 |
| punpckhdq m1, m0, m2 ; 2 3 6 7 |
| punpckldq m0, m2 ; 0 1 4 5 |
| lea r3, [strideq*3] |
| lea r6, [dstq+strideq*4] |
| movq xm2, [dstq+strideq*0] |
| movhps xm2, [dstq+strideq*1] |
| vpbroadcastq m4, [r6 +strideq*0] |
| vpbroadcastq m5, [r6 +strideq*1] |
| movq xm3, [dstq+strideq*2] |
| movhps xm3, [dstq+r3 ] |
| vpblendd m2, m4, 0x30 |
| vpblendd m2, m5, 0xc0 |
| vpbroadcastq m4, [r6 +strideq*2] |
| vpbroadcastq m5, [r6 +r3 ] |
| vpblendd m3, m4, 0x30 |
| vpblendd m3, m5, 0xc0 |
| pxor m4, m4 |
| REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 |
| paddw m0, m2 ; out0 out1 out4 out5 |
| paddw m1, m3 ; out2 out3 out6 out7 |
| pmaxsw m0, m4 |
| pmaxsw m1, m4 |
| pminsw m0, m6 |
| pminsw m1, m6 |
| vextracti128 xm2, m0, 1 ; out4 out5 |
| vextracti128 xm3, m1, 1 ; out6 out7 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movq [dstq+strideq*2], xm1 |
| movhps [dstq+r3 ], xm1 |
| movq [r6 +strideq*0], xm2 |
| movhps [r6 +strideq*1], xm2 |
| movq [r6 +strideq*2], xm3 |
| movhps [r6 +r3 ], xm3 |
| ret |
| |
| INV_TXFM_4X8_FN dct, dct, 12 |
| INV_TXFM_4X8_FN dct, identity, 12 |
| INV_TXFM_4X8_FN dct, adst, 12 |
| INV_TXFM_4X8_FN dct, flipadst, 12 |
| |
| cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| jmp m(idct_4x8_internal_10bpc).pass1 |
| .pass2: |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| REPX {pmaxsd x, m8}, m0, m1, m2, m3 |
| REPX {pminsd x, m9}, m0, m1, m2, m3 |
| ; transpose & interleave |
| pshufd m0, m0, q1320 |
| pshufd m1, m1, q1320 |
| pshufd m2, m2, q1320 |
| pshufd m3, m3, q1320 |
| punpckldq m4, m0, m1 |
| punpckhdq m0, m1 |
| punpckldq m5, m2, m3 |
| punpckhdq m2, m3 |
| vpermq m0, m0, q3102 |
| vpermq m2, m2, q3102 |
| vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved) |
| vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved) |
| vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) |
| vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) |
| vpbroadcastd m7, [pd_2048] |
| call m(idct_8x4_internal_10bpc).main |
| psubd m3, m0, m4 ; out7 out6 |
| paddd m0, m4 ; out0 out1 |
| paddd m1, m2, m5 ; out3 out2 |
| psubd m2, m5 ; out4 out5 |
| pshufd m1, m1, q1032 |
| pshufd m3, m3, q1032 |
| jmp m(iadst_4x8_internal_12bpc).end |
| |
| INV_TXFM_4X8_FN adst, dct, 12 |
| INV_TXFM_4X8_FN adst, adst, 12 |
| INV_TXFM_4X8_FN adst, flipadst, 12 |
| INV_TXFM_4X8_FN adst, identity, 12 |
| |
| cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| call m(iadst_8x4_internal_10bpc).main |
| psrad m0, m4, 1 |
| psrad m1, m6, 1 |
| psrad m2, 1 |
| psrad m3, 1 |
| .pass1_end: |
| vpbroadcastd m5, [pd_1024] |
| REPX {paddd x, m5}, m0, m1, m2, m3 |
| REPX {psrad x, 11}, m0, m1, m2, m3 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| REPX {pmaxsd x, m8}, m0, m1, m2, m3 |
| REPX {pminsd x, m9}, m0, m1, m2, m3 |
| call .pass2_main |
| vpblendd m3, m0, m4, 0x33 ; out6 out7 |
| vpblendd m0, m4, 0xcc ; out0 out1 |
| pshufd m1, m5, q1032 |
| psignd m2, m6 ; out4 out5 |
| psignd m1, m6 ; out2 out3 |
| .end: |
| vpbroadcastd m4, [pw_16384] |
| REPX {psrad x, 3}, m0, m1, m2, m3 |
| packssdw m0, m2 ; 0 1 4 5 (interleaved) |
| packssdw m1, m3 ; 2 3 6 7 (interleaved) |
| mova m2, [iadst8_12_shuf] |
| vpermd m0, m2, m0 ; 0 1 4 5 |
| vpermd m1, m2, m1 ; 2 3 6 7 |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| lea r3, [strideq*3] |
| lea r6, [dstq+strideq*4] |
| movq xm4, [dstq+strideq*0] |
| movhps xm4, [dstq+strideq*1] |
| movq xm5, [dstq+strideq*2] |
| movhps xm5, [dstq+r3 ] |
| movq xm6, [r6 +strideq*0] |
| movhps xm6, [r6 +strideq*1] |
| vinserti128 m4, xm6, 1 |
| movq xm7, [r6 +strideq*2] |
| movhps xm7, [r6 +r3 ] |
| vinserti128 m5, xm7, 1 |
| paddw m0, m4 ; 0 1 4 5 |
| paddw m1, m5 ; 2 3 6 7 |
| vpbroadcastd m5, [pixel_12bpc_max] |
| pxor m4, m4 |
| REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 |
| REPX {pmaxsw x, m4}, m0, m1 |
| REPX {pminsw x, m5}, m0, m1 |
| vextracti128 xm2, m0, 1 ; out4 out5 |
| vextracti128 xm3, m1, 1 ; out6 out7 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movq [dstq+strideq*2], xm1 |
| movhps [dstq+r3 ], xm1 |
| movq [r6 +strideq*0], xm2 |
| movhps [r6 +strideq*1], xm2 |
| movq [r6 +strideq*2], xm3 |
| movhps [r6 +r3 ], xm3 |
| RET |
| ALIGN function_align |
| .pass2_main: |
| ; transpose & interleave |
| pshufd m0, m0, q1320 |
| pshufd m1, m1, q1320 |
| pshufd m2, m2, q1320 |
| pshufd m3, m3, q1320 |
| punpckldq m4, m0, m1 |
| punpckhdq m0, m1 |
| punpckldq m5, m2, m3 |
| punpckhdq m2, m3 |
| vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved) |
| vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved) |
| vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) |
| vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) |
| vpbroadcastd m7, [pd_2048] |
| jmp m(iadst_4x8_internal_10bpc).main3 |
| |
| INV_TXFM_4X8_FN flipadst, dct, 12 |
| INV_TXFM_4X8_FN flipadst, adst, 12 |
| INV_TXFM_4X8_FN flipadst, flipadst, 12 |
| INV_TXFM_4X8_FN flipadst, identity, 12 |
| |
| cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| call m(iadst_8x4_internal_10bpc).main |
| psrad m0, m3, 1 |
| psrad m1, m2, 1 |
| psrad m2, m6, 1 |
| psrad m3, m4, 1 |
| jmp m(iadst_4x8_internal_12bpc).pass1_end |
| .pass2: |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| REPX {pmaxsd x, m8}, m0, m1, m2, m3 |
| REPX {pminsd x, m9}, m0, m1, m2, m3 |
| call m(iadst_4x8_internal_12bpc).pass2_main |
| shufpd m3, m4, m0, 0x05 ; out1 out0 |
| shufpd m0, m4, 0x05 ; out7 out6 |
| psignd m2, m6 |
| pshufd m6, m6, q1032 |
| pshufd m1, m2, q1032 ; out5 out4 |
| psignd m2, m5, m6 ; out3 out2 |
| jmp m(iadst_4x8_internal_12bpc).end |
| |
| INV_TXFM_4X8_FN identity, dct, 12 |
| INV_TXFM_4X8_FN identity, adst, 12 |
| INV_TXFM_4X8_FN identity, flipadst, 12 |
| INV_TXFM_4X8_FN identity, identity, 12 |
| |
| cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| jmp m(iidentity_4x8_internal_10bpc).pass1 |
| .pass2: |
| ; m0 = in0 in1 |
| ; m1 = in2 in3 |
| ; m2 = in4 in5 |
| ; m3 = in6 in7 |
| vpbroadcastd m6, [pixel_12bpc_max] |
| call m(iidentity_4x8_internal_10bpc).pass2_end |
| RET |
| |
| %macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth |
| INV_TXFM_FN %1, %2, 0, 4x16, %3 |
| %ifidn %1_%2, dct_dct |
| imul r6d, [cq], 181 |
| vpbroadcastd xm2, [dconly_%3bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 16 |
| add r6d, 384 |
| sar r6d, 9 |
| jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 |
| %endif |
| %endmacro |
| |
| INV_TXFM_4X16_FN dct, dct |
| INV_TXFM_4X16_FN dct, identity |
| INV_TXFM_4X16_FN dct, adst |
| INV_TXFM_4X16_FN dct, flipadst |
| |
| cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 |
| .pass1: |
| vpbroadcastd m10, [pd_3072] |
| mova m1, [cq+32*2] |
| mova m3, [cq+32*6] |
| mova m5, [cq+32*3] |
| mova m7, [cq+32*7] |
| call .pass1_main |
| pmulld m0, m6, [cq+32*0] |
| pmulld m2, m6, [cq+32*4] |
| pmulld m4, m6, [cq+32*1] |
| pmulld m6, [cq+32*5] |
| call .pass1_main2 |
| REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 |
| jmp tx2q |
| .pass2: |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| packssdw m3, m7 |
| lea r6, [deint_shuf+128] |
| punpcklwd m4, m2, m3 |
| punpckhwd m2, m3 |
| punpckhwd m5, m0, m1 |
| punpcklwd m0, m1 |
| punpckhdq m1, m0, m4 ; 2 3 |
| punpckldq m0, m4 ; 0 1 |
| punpckldq m4, m5, m2 ; 8 9 |
| punpckhdq m5, m2 ; a b |
| vextracti128 xm2, m0, 1 ; 4 5 |
| vextracti128 xm3, m1, 1 ; 6 7 |
| vextracti128 xm6, m4, 1 ; c d |
| vextracti128 xm7, m5, 1 ; e f |
| call m(idct_4x16_internal_8bpc).main |
| vpbroadcastd m9, [pw_2048] |
vinserti128 m0, xm1, 1 ; 0 1 3 2
| vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 |
| vinserti128 m2, m4, xm5, 1 ; 8 9 b a |
| vinserti128 m3, m6, xm7, 1 ; c d f e |
| vpbroadcastd m8, [pixel_10bpc_max] |
| call .pass2_end |
| RET |
| ALIGN function_align |
| .pass1_main: |
| vpbroadcastd m4, [pd_3784] |
| vpbroadcastd m8, [pd_1567] |
| vpbroadcastd m9, [pd_2048] |
| vpbroadcastd m6, [pd_1448] |
| ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l |
| ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h |
| ret |
| ALIGN function_align |
| .pass1_main2: |
| paddd m0, m10 |
| paddd m4, m10 |
| paddd m8, m0, m2 |
| psubd m0, m2 |
| paddd m9, m4, m6 |
| psubd m4, m6 |
| REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h |
| psubd m2, m0, m1 |
| paddd m1, m0 |
| psubd m6, m4, m5 |
| paddd m5, m4 |
| paddd m0, m8, m3 |
| psubd m3, m8, m3 |
| paddd m4, m9, m7 |
| psubd m7, m9, m7 |
| ret |
| ALIGN function_align |
| .pass2_end: |
| lea r6, [strideq*3] |
| pxor m7, m7 |
| pmulhrsw m0, m9 |
| call .write_4x4 |
| pmulhrsw m0, m1, m9 |
| call .write_4x4 |
| pmulhrsw m0, m2, m9 |
| call .write_4x4 |
| pmulhrsw m0, m3, m9 |
| call .write_4x4 |
| ret |
| ALIGN function_align |
| .write_4x4: |
| movq xm4, [dstq+strideq*0] |
| movhps xm4, [dstq+strideq*1] |
| vpbroadcastq m5, [dstq+strideq*2] |
| vpbroadcastq m6, [dstq+r6 ] |
| mova [cq+32*0], m7 |
| mova [cq+32*1], m7 |
| add cq, 32*2 |
| vpblendd m4, m5, 0xc0 |
| vpblendd m4, m6, 0x30 |
| paddw m4, m0 |
| pmaxsw m4, m7 |
| pminsw m4, m8 |
| vextracti128 xm5, m4, 1 |
| movq [dstq+strideq*0], xm4 |
| movhps [dstq+strideq*1], xm4 |
| movhps [dstq+strideq*2], xm5 |
| movq [dstq+r6 ], xm5 |
| lea dstq, [dstq+strideq*4] |
| ret |
| |
| INV_TXFM_4X16_FN adst, dct |
| INV_TXFM_4X16_FN adst, adst |
| INV_TXFM_4X16_FN adst, flipadst |
| INV_TXFM_4X16_FN adst, identity |
| |
| cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 |
| call m(iadst_16x4_internal_10bpc).main |
| vpbroadcastd m6, [pd_6144] |
| call m(iadst_16x4_internal_10bpc).main_end |
| psrad m0, m4, 13 |
| psrad m1, m5, 13 |
| psrad m2, 13 |
| psrad m3, 13 |
| psrad m4, m8, 13 |
| psrad m5, m9, 13 |
| psrad m6, 13 |
| psrad m7, 13 |
| jmp tx2q |
| .pass2: |
| call .pass2_main |
| vpbroadcastd m5, [pw_2048] |
| vpbroadcastd m8, [pixel_10bpc_max] |
| lea r6, [strideq*3] |
| vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 |
| pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 |
| vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 |
| pxor m7, m7 |
| psubw m9, m7, m5 |
| vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 |
| pmulhrsw m0, m4, m9 |
| call .write_4x4 |
| pmulhrsw m0, m1, m9 |
| call .write_4x4 |
| pmulhrsw m0, m2, m9 |
| call .write_4x4 |
| pmulhrsw m0, m3, m9 |
| call .write_4x4 |
| RET |
| ALIGN function_align |
| .write_4x4: |
| movq xm4, [dstq+r6 ] |
| movhps xm4, [dstq+strideq*0] |
| vpbroadcastq m5, [dstq+strideq*1] |
| vpbroadcastq m6, [dstq+strideq*2] |
| mova [cq+32*0], m7 |
| mova [cq+32*1], m7 |
| add cq, 32*2 |
| vpblendd m4, m5, 0xc0 |
| vpblendd m4, m6, 0x30 |
| paddw m4, m0 |
| pmaxsw m4, m7 |
| pminsw m4, m8 |
| vextracti128 xm5, m4, 1 |
| movhps [dstq+strideq*0], xm4 |
| movhps [dstq+strideq*1], xm5 |
| movq [dstq+strideq*2], xm5 |
| movq [dstq+r6 ], xm4 |
| lea dstq, [dstq+strideq*4] |
| ret |
| ALIGN function_align |
| .pass2_main: |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| packssdw m3, m7 |
| lea r6, [deint_shuf+128] |
| punpcklwd m4, m2, m3 |
| punpckhwd m2, m3 |
| punpckhwd m5, m0, m1 |
| punpcklwd m0, m1 |
| punpckhdq m1, m0, m4 |
| punpckldq m0, m4 |
| punpckldq m4, m5, m2 |
| punpckhdq m5, m2 |
| vpblendd m3, m0, m1, 0x33 |
| vpblendd m0, m1, 0xcc |
| shufpd m2, m5, m4, 0x05 |
| shufpd m4, m5, 0x05 |
| vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 |
| vinserti128 m0, xm3, 1 ; 0 3 2 1 |
vperm2i128 m3, m2, m4, 0x31 ; c f e d
| vinserti128 m2, xm4, 1 ; b 8 9 a |
| call m(iadst_4x16_internal_8bpc).main2 |
| vpbroadcastd m5, [pw_2896x8] |
| paddsw m1, m2, m4 |
| psubsw m2, m4 |
| pmulhrsw m1, m5 ; -out7 out4 out6 -out5 |
| pmulhrsw m2, m5 ; out8 -out11 -out9 out10 |
| ret |
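; pmulhrsw by pw_2896x8 is x*2896/4096, i.e. the final 1/sqrt(2) scaling
; of the last adst16 stage applied in the 16-bit domain.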
| ALIGN function_align |
| .main: |
| vbroadcasti128 m0, [cq+16* 0] |
| vbroadcasti128 m4, [cq+16* 2] |
| vbroadcasti128 m1, [cq+16*15] |
| vbroadcasti128 m5, [cq+16*13] |
| vbroadcasti128 m2, [cq+16* 4] |
| vbroadcasti128 m6, [cq+16* 6] |
| vbroadcasti128 m3, [cq+16*11] |
| vbroadcasti128 m7, [cq+16* 9] |
| shufpd m0, m4, 0x0c ; 0 2 |
| shufpd m1, m5, 0x0c ; 15 13 |
| shufpd m2, m6, 0x0c ; 4 6 |
| shufpd m3, m7, 0x0c ; 11 9 |
| vbroadcasti128 m4, [cq+16* 8] |
| vbroadcasti128 m6, [cq+16*10] |
| vbroadcasti128 m5, [cq+16* 7] |
| vbroadcasti128 m7, [cq+16* 5] |
| shufpd m4, m6, 0x0c ; 8 10 |
| shufpd m5, m7, 0x0c ; 7 5 |
| vbroadcasti128 m6, [cq+16*12] |
| vbroadcasti128 m7, [cq+16*14] |
| shufpd m6, m7, 0x0c ; 12 14 |
| vbroadcasti128 m7, [cq+16* 3] |
| vbroadcasti128 m8, [cq+16* 1] |
| shufpd m7, m8, 0x0c ; 3 1 |
| .main2: |
; expects: m12 = clip_min, m13 = clip_max
| vpbroadcastd m11, [pd_2048] |
| ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 |
| ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 |
| ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 |
| ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 |
| psubd m8, m0, m4 ; t8a t10a |
| paddd m0, m4 ; t0a t2a |
| psubd m4, m1, m5 ; t9a t11a |
| paddd m1, m5 ; t1a t3a |
| psubd m5, m2, m6 ; t12a t14a |
| paddd m2, m6 ; t4a t6a |
| psubd m6, m3, m7 ; t13a t15a |
| paddd m3, m7 ; t5a t7a |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8 |
| ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1 |
| ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 |
| psubd m7, m0, m2 ; t4 t6 |
| paddd m0, m2 ; t0 t2 |
| psubd m2, m1, m3 ; t5 t7 |
| paddd m1, m3 ; t1 t3 |
| psubd m3, m4, m6 ; t12a t14a |
| paddd m4, m6 ; t8a t10a |
| psubd m6, m8, m5 ; t13a t15a |
| paddd m8, m5 ; t9a t11a |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8 |
| punpcklqdq m5, m3, m7 ; t12a t4 |
| punpckhqdq m3, m7 ; t14a t6 |
| punpckhqdq m7, m6, m2 ; t15a t7 |
| punpcklqdq m6, m2 ; t13a t5 |
| ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567 |
| ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10 |
| vpbroadcastd m10, [pd_2896] |
| vbroadcasti128 m9, [pw_2048_m2048] ; + + - - |
| punpckhqdq m2, m4, m0 ; t10a t2 |
| punpcklqdq m4, m0 ; t8a t0 |
| punpckhqdq m0, m8, m1 ; t11a t3 |
| punpcklqdq m8, m1 ; t9a t1 |
| paddd m1, m6, m7 ; out2 -out3 |
| psubd m6, m7 ; t14a t6 |
| paddd m7, m5, m3 ; -out13 out12 |
| psubd m5, m3 ; t15a t7 |
| psubd m3, m8, m0 ; t11 t3a |
| paddd m8, m0 ; out14 -out15 |
| paddd m0, m4, m2 ; -out1 out0 |
| psubd m4, m2 ; t10 t2a |
| REPX {pmaxsd x, m12}, m6, m5, m3, m4 |
| REPX {pminsd x, m13}, m6, m5, m3, m4 |
| REPX {pmulld x, m10}, m6, m5, m3, m4 |
| paddd m6, m11 |
| paddd m4, m11 |
| paddd m2, m6, m5 ; -out5 out4 |
| psubd m6, m5 ; out10 -out11 |
| psubd m5, m4, m3 ; -out9 out8 |
| paddd m3, m4 ; out6 -out7 |
| REPX {psrad x, 12}, m2, m3, m5, m6 |
| REPX {psignd x, m9}, m1, m8, m3, m6 |
| pshufd m9, m9, q1032 |
| REPX {psignd x, m9}, m0, m7, m2, m5 |
| ret |
| |
| INV_TXFM_4X16_FN flipadst, dct |
| INV_TXFM_4X16_FN flipadst, adst |
| INV_TXFM_4X16_FN flipadst, flipadst |
| INV_TXFM_4X16_FN flipadst, identity |
| |
| cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 |
| .pass1: |
| call m(iadst_16x4_internal_10bpc).main |
| vpbroadcastd m6, [pd_6144] |
| call m(iadst_16x4_internal_10bpc).main_end |
| psrad m0, m3, 13 |
| psrad m1, m2, 13 |
| psrad m2, m5, 13 |
| psrad m3, m4, 13 |
| psrad m4, m7, 13 |
| psrad m5, m6, 13 |
| psrad m6, m9, 13 |
| psrad m7, m8, 13 |
| jmp tx2q |
| .pass2: |
| call m(iadst_4x16_internal_10bpc).pass2_main |
| vpbroadcastd m5, [pw_2048] |
| vpbroadcastd m8, [pixel_10bpc_max] |
| lea r6, [strideq*3] |
| vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 |
| pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 |
| vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 |
| pxor m7, m7 |
| psubw m9, m7, m5 |
| vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 |
| pmulhrsw m0, m4, m9 |
| call .write_4x4 |
| pmulhrsw m0, m2, m9 |
| call .write_4x4 |
| pmulhrsw m0, m1, m9 |
| call .write_4x4 |
| pmulhrsw m0, m3, m9 |
| call .write_4x4 |
| RET |
| ALIGN function_align |
| .write_4x4: |
| movq xm4, [dstq+strideq*0] |
| movhps xm4, [dstq+r6 ] |
| vpbroadcastq m5, [dstq+strideq*1] |
| vpbroadcastq m6, [dstq+strideq*2] |
| mova [cq+32*0], m7 |
| mova [cq+32*1], m7 |
| add cq, 32*2 |
| vpblendd m4, m5, 0x30 |
| vpblendd m4, m6, 0xc0 |
| paddw m4, m0 |
| pmaxsw m4, m7 |
| pminsw m4, m8 |
| vextracti128 xm5, m4, 1 |
| movq [dstq+strideq*0], xm4 |
| movq [dstq+strideq*1], xm5 |
| movhps [dstq+strideq*2], xm5 |
| movhps [dstq+r6 ], xm4 |
| lea dstq, [dstq+strideq*4] |
| ret |
| |
| INV_TXFM_4X16_FN identity, dct |
| INV_TXFM_4X16_FN identity, adst |
| INV_TXFM_4X16_FN identity, flipadst |
| INV_TXFM_4X16_FN identity, identity |
| |
| cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 |
| vpbroadcastd m7, [pd_5793] |
| pmulld m0, m7, [cq+32*0] |
| pmulld m4, m7, [cq+32*1] |
| pmulld m1, m7, [cq+32*2] |
| pmulld m5, m7, [cq+32*3] |
| pmulld m2, m7, [cq+32*4] |
| pmulld m6, m7, [cq+32*5] |
| pmulld m3, m7, [cq+32*6] |
| pmulld m7, [cq+32*7] |
| vpbroadcastd m8, [pd_6144] |
| REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 |
| REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 |
| jmp tx2q |
| .pass2: |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| packssdw m3, m7 |
vpbroadcastd m7, [pw_1697x16] ; identity16 scale: 2*x + x*1697/2048 = 2*sqrt(2)*x
| vpbroadcastd m8, [pw_2048] |
| pmulhrsw m4, m7, m0 |
| pmulhrsw m5, m7, m1 |
| pmulhrsw m6, m7, m2 |
| pmulhrsw m7, m3 |
| REPX {paddsw x, x}, m0, m1, m2, m3 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| paddsw m2, m6 |
| paddsw m3, m7 |
| vpbroadcastd m4, [pixel_10bpc_max] |
| call .pass2_end |
| RET |
| ALIGN function_align |
| .pass2_end: |
| punpckhwd m7, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| lea r6, [strideq*5] |
| pxor m3, m3 |
| punpckhdq m5, m0, m2 ; 2 3 6 7 |
| punpckldq m0, m2 ; 0 1 4 5 |
| punpckldq m6, m7, m1 ; 8 9 c d |
| punpckhdq m7, m1 ; a b e f |
| pmulhrsw m0, m8 |
| call .write_2x4x2 |
| pmulhrsw m0, m5, m8 |
| call .write_2x4x2 |
| pmulhrsw m0, m6, m8 |
| lea dstq, [dstq+strideq*4] |
| call .write_2x4x2 |
| pmulhrsw m0, m7, m8 |
| call .write_2x4x2 |
| ret |
| ALIGN function_align |
| .write_2x4x2: |
| movq xm1, [dstq+strideq*0] |
| movhps xm1, [dstq+strideq*1] |
| vpbroadcastq m2, [dstq+strideq*4] |
| vpblendd m1, m2, 0x30 |
| vpbroadcastq m2, [dstq+r6 ] |
| vpblendd m1, m2, 0xc0 |
| mova [cq+32*0], m3 |
| mova [cq+32*1], m3 |
| add cq, 32*2 |
| paddw m1, m0 |
| pmaxsw m1, m3 |
| pminsw m1, m4 |
| vextracti128 xm2, m1, 1 |
| movq [dstq+strideq*0], xm1 |
| movhps [dstq+strideq*1], xm1 |
| movq [dstq+strideq*4], xm2 |
| movhps [dstq+r6 ], xm2 |
| lea dstq, [dstq+strideq*2] |
| ret |
| |
| INV_TXFM_4X16_FN dct, dct, 12 |
| INV_TXFM_4X16_FN dct, identity, 12 |
| INV_TXFM_4X16_FN dct, adst, 12 |
| INV_TXFM_4X16_FN dct, flipadst, 12 |
| |
| cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| jmp m(idct_4x16_internal_10bpc).pass1 |
| .pass2: |
| punpckldq m8, m0, m1 |
| punpckhdq m0, m1 |
| punpckldq m9, m2, m3 |
| punpckhdq m2, m3 |
| punpckldq m1, m4, m5 |
| punpckhdq m4, m5 |
| punpckldq m3, m6, m7 |
| punpckhdq m6, m7 |
| punpcklqdq m5, m0, m2 ; 2 6 |
| punpckhqdq m12, m0, m2 ; 3 7 |
| punpcklqdq m0, m8, m9 ; 0 4 |
| punpckhqdq m10, m8, m9 ; 1 5 |
| punpcklqdq m2, m1, m3 ; 8 12 |
| punpckhqdq m13, m1, m3 ; 9 13 |
| punpcklqdq m9, m4, m6 ; 10 14 |
| punpckhqdq m4, m6 ; 11 15 |
| vperm2i128 m1, m5, m9, 0x20 ; 2 10 |
| vperm2i128 m3, m9, m5, 0x31 ; 14 6 |
| vpermq m11, m4, q1302 ; 15 11 |
| ; interleave |
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10 |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13 |
| REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13 |
| call m(idct_16x4_internal_10bpc).pass1_main |
| vpermq m6, m12, q1302 ; 7 3 |
| vpermq m5, m13, q3120 ; 9 13 |
| call m(idct_16x4_internal_10bpc).pass1_main2 |
| call m(idct_16x4_internal_10bpc).pass1_main3 |
| REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m1, m2, m3 |
| packssdw m2, m4, m5 |
| packssdw m3, m6, m7 |
| mova m4, [idct16_12_shuf] |
| REPX {vpermd x, m4, x}, m0, m1, m2, m3 |
| vpbroadcastd m9, [pw_16384] |
| vpbroadcastd m8, [pixel_12bpc_max] |
| call m(idct_4x16_internal_10bpc).pass2_end |
| RET |
| |
| INV_TXFM_4X16_FN adst, dct, 12 |
| INV_TXFM_4X16_FN adst, adst, 12 |
| INV_TXFM_4X16_FN adst, flipadst, 12 |
| INV_TXFM_4X16_FN adst, identity, 12 |
| |
| cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| call .main_pass1 |
| psrad m0, m4, 12 |
| psrad m1, m5, 12 |
| psrad m2, 12 |
| psrad m3, 12 |
| psrad m4, m8, 12 |
| psrad m5, m9, 12 |
| psrad m6, 12 |
| psrad m7, 12 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call .transpose_16x4 |
| call m(iadst_4x16_internal_10bpc).main2 |
| pshufd m4, m5, q1032 |
| psrad m5, m6, 3 |
| pshufd m6, m7, q1032 |
| psrad m7, m8, 3 |
| REPX {pshufd x, x, q1032}, m0, m2 |
| REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6 |
| .pass2_end: |
| packssdw m0, m1 |
| packssdw m1, m2, m3 |
| packssdw m2, m4, m5 |
| packssdw m3, m6, m7 |
| mova m4, [iadst16_12_shuf] |
| REPX {vpermd x, m4, x}, m0, m1, m2, m3 |
| vpbroadcastd m9, [pw_16384] |
| vpbroadcastd m8, [pixel_12bpc_max] |
| lea r6, [strideq*3] |
| pxor m7, m7 |
| pmulhrsw m0, m9 |
| call m(iadst_4x16_internal_10bpc).write_4x4 |
| pmulhrsw m0, m9, m1 |
| call m(iadst_4x16_internal_10bpc).write_4x4 |
| pmulhrsw m0, m9, m2 |
| call m(iadst_4x16_internal_10bpc).write_4x4 |
| pmulhrsw m0, m9, m3 |
| call m(iadst_4x16_internal_10bpc).write_4x4 |
| RET |
| ALIGN function_align |
| .transpose_16x4: |
| ; transpose & interleave |
| punpckldq m8, m0, m1 |
| punpckhdq m0, m1 |
| punpckldq m9, m2, m3 |
| punpckhdq m2, m3 |
| punpckldq m1, m4, m5 |
| punpckhdq m4, m5 |
| punpckldq m3, m6, m7 |
| punpckhdq m6, m7 |
| punpcklqdq m10, m8, m0 |
| punpckhqdq m0, m8 |
| punpcklqdq m11, m9, m2 |
| punpckhqdq m2, m9 |
| punpcklqdq m8, m1, m4 |
| punpckhqdq m4, m1 |
| punpcklqdq m9, m3, m6 |
| punpckhqdq m6, m3 |
| vperm2i128 m5, m0, m2, 0x31 ; 7 5 |
| vperm2i128 m7, m0, m2, 0x20 ; 3 1 |
| vperm2i128 m0, m10, m11, 0x20 ; 0 2 |
| vperm2i128 m2, m10, m11, 0x31 ; 4 6 |
| vperm2i128 m1, m4, m6, 0x31 ; 15 13 |
| vperm2i128 m3, m4, m6, 0x20 ; 11 9 |
| vperm2i128 m4, m8, m9, 0x20 ; 8 10 |
| vperm2i128 m6, m8, m9, 0x31 ; 12 14 |
| ret |
| ALIGN function_align |
| .main_pass1: |
| call m(iadst_16x4_internal_10bpc).main |
| vpbroadcastd m6, [pd_3072] |
| paddd m10, m4, m5 |
| psubd m4, m3 |
| psubd m5, m3 |
| paddd m3, m10 |
| psubd m8, m7, m1 |
| paddd m7, m9 |
| psubd m9, m1 |
| paddd m7, m1 |
| REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 |
| REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 |
| paddd m6, m0 |
| ret |
| |
| INV_TXFM_4X16_FN flipadst, dct, 12 |
| INV_TXFM_4X16_FN flipadst, adst, 12 |
| INV_TXFM_4X16_FN flipadst, flipadst, 12 |
| INV_TXFM_4X16_FN flipadst, identity, 12 |
| |
| cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| call m(iadst_4x16_internal_12bpc).main_pass1 |
| psrad m0, m3, 12 |
| psrad m1, m2, 12 |
| psrad m2, m5, 12 |
| psrad m3, m4, 12 |
| psrad m4, m7, 12 |
| psrad m5, m6, 12 |
| psrad m6, m9, 12 |
| psrad m7, m8, 12 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(iadst_4x16_internal_12bpc).transpose_16x4 |
| call m(iadst_4x16_internal_10bpc).main2 |
| pshufd m4, m3, q1032 |
| psrad m3, m5, 3 |
| psrad m5, m2, 3 |
| pshufd m2, m6, q1032 |
| pshufd m6, m1, q1032 |
| psrad m1, m7, 3 |
| psrad m7, m0, 3 |
| pshufd m0, m8, q1032 |
| REPX {psrad x, 3}, m0, m2, m4, m6 |
| jmp m(iadst_4x16_internal_12bpc).pass2_end |
| |
| INV_TXFM_4X16_FN identity, dct, 12 |
| INV_TXFM_4X16_FN identity, adst, 12 |
| INV_TXFM_4X16_FN identity, flipadst, 12 |
| INV_TXFM_4X16_FN identity, identity, 12 |
| |
| cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [pd_1697] |
| mova m0, [cq+32*0] |
| mova m4, [cq+32*1] |
| mova m1, [cq+32*2] |
| mova m5, [cq+32*3] |
| vpbroadcastd m9, [pd_6144] |
| pmulld m2, m8, m0 |
| pmulld m6, m8, m4 |
| pmulld m3, m8, m1 |
| pmulld m7, m8, m5 |
| mova m10, [cq+32*4] |
| mova m11, [cq+32*5] |
| mova m12, [cq+32*6] |
| mova m13, [cq+32*7] |
| REPX {paddd x, m9}, m2, m6, m3, m7 |
| REPX {psrad x, 12}, m2, m6, m3, m7 |
| paddd m0, m2 |
| pmulld m2, m8, m10 |
| paddd m4, m6 |
| pmulld m6, m8, m11 |
| paddd m1, m3 |
| pmulld m3, m8, m12 |
| paddd m5, m7 |
| pmulld m7, m8, m13 |
| REPX {psrad x, 1 }, m0, m4, m1, m5 |
| REPX {paddd x, m9}, m2, m6, m3, m7 |
| REPX {psrad x, 12}, m2, m6, m3, m7 |
| paddd m2, m10 |
| paddd m6, m11 |
| paddd m3, m12 |
| paddd m7, m13 |
| REPX {psrad x, 1 }, m2, m6, m3, m7 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| vpbroadcastd m8, [pd_5793] |
| vpbroadcastd m9, [pd_1024] |
| REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| packssdw m3, m7 |
| vpbroadcastd m8, [pw_16384] |
| vpbroadcastd m4, [pixel_12bpc_max] |
| call m(iidentity_4x16_internal_10bpc).pass2_end |
| RET |
| |
| %macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth |
| INV_TXFM_FN %1, %2, 0, 8x4, %3 |
| %ifidn %1_%2, dct_dct |
| vpbroadcastd m2, [dconly_%3bpc] |
| %if %3 = 10 |
| .dconly: |
| imul r6d, [cq], 181 |
| mov [cq], eobd ; 0 |
| or r3d, 4 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| add r6d, 128 |
| sar r6d, 8 |
| jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 |
| %else |
| jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly |
| %endif |
| %endif |
| %endmacro |
| |
| INV_TXFM_8X4_FN dct, dct |
| INV_TXFM_8X4_FN dct, identity |
| INV_TXFM_8X4_FN dct, adst |
| INV_TXFM_8X4_FN dct, flipadst |
| |
| cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| .pass1: |
| vbroadcasti128 m1, [cq+16*1] |
| vbroadcasti128 m0, [cq+16*5] |
| vbroadcasti128 m2, [cq+16*3] |
| vbroadcasti128 m3, [cq+16*7] |
| vpbroadcastd m6, [pd_2896] |
| shufpd m1, m0, 0x0c ; 1 5 |
| shufpd m3, m2, 0x0c ; 7 3 |
| vbroadcasti128 m0, [cq+16*0] |
| vbroadcasti128 m4, [cq+16*2] |
| vbroadcasti128 m2, [cq+16*4] |
| vbroadcasti128 m5, [cq+16*6] |
| vpbroadcastd m7, [pd_2048] |
| shufpd m0, m4, 0x0c ; 0 2 |
| shufpd m2, m5, 0x0c ; 4 6 |
| REPX {pmulld x, m6}, m1, m3, m0, m2 |
| REPX {paddd x, m7}, m1, m3, m0, m2 |
| REPX {psrad x, 12}, m1, m3, m0, m2 |
| call .main |
| psubd m3, m0, m4 ; out7 out6 (interleaved) |
| paddd m0, m4 ; out0 out1 (interleaved) |
| paddd m1, m2, m5 ; out3 out2 (interleaved) |
| psubd m2, m5 ; out4 out5 (interleaved) |
| pshufd m1, m1, q1032 |
| pshufd m3, m3, q1032 |
| jmp tx2q |
| .pass2: |
| vbroadcasti128 m4, [deint_shuf] |
| packssdw m0, m1 |
| packssdw m2, m3 |
| vperm2i128 m1, m0, m2, 0x31 |
| vinserti128 m0, xm2, 1 |
| pshufb m0, m4 |
| pshufb m1, m4 |
| IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 |
| vpermq m0, m0, q3120 ; out0 out1 |
| vpermq m2, m1, q2031 ; out2 out3 |
| jmp m(iadst_8x4_internal_10bpc).end |
| ALIGN function_align |
| .main: |
| ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1 |
| IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7 |
| vpbroadcastd m6, [pd_2896] |
| punpcklqdq m4, m1, m3 ; t4a t7a |
| punpckhqdq m1, m3 ; t5a t6a |
| psubd m3, m4, m1 ; t5a t6a |
| paddd m4, m1 ; t4 t7 |
| REPX {pmaxsd x, m8}, m3, m4, m0, m2 |
| REPX {pminsd x, m9}, m3, m4, m0, m2 |
| pmulld m3, m6 |
| pshufd m1, m3, q1032 |
| paddd m3, m7 |
| psubd m5, m3, m1 |
| paddd m1, m3 |
| psrad m5, 12 |
| psrad m1, 12 |
| vpblendd m5, m4, 0x33 ; t4 t5 |
| punpckhqdq m4, m1 ; t7 t6 |
| ret |
| |
| INV_TXFM_8X4_FN adst, dct |
| INV_TXFM_8X4_FN adst, adst |
| INV_TXFM_8X4_FN adst, flipadst |
| INV_TXFM_8X4_FN adst, identity |
| |
| cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| call m(iadst_4x8_internal_10bpc).main |
| vpblendd m3, m0, m4, 0x33 ; out6 out7 |
| vpblendd m0, m4, 0xcc ; out0 out1 |
| pshufd m1, m5, q1032 |
| psignd m2, m6 ; out4 out5 |
| psignd m1, m6 ; out2 out3 |
| jmp tx2q |
| .pass2: |
| call .pass2_main |
| vpermq m0, m0, q3120 ; out0 out1 |
| vpermq m2, m1, q3120 ; out2 out3 |
| .end: |
| vpbroadcastd m1, [pw_2048] |
| pmulhrsw m0, m1 |
| pmulhrsw m1, m2 |
| vpbroadcastd m5, [pixel_10bpc_max] |
| .end2: |
| mova xm2, [dstq+strideq*0] |
| vinserti128 m2, [dstq+strideq*1], 1 |
| lea r6, [dstq+strideq*2] |
| mova xm3, [r6 +strideq*0] |
| vinserti128 m3, [r6 +strideq*1], 1 |
| pxor m4, m4 |
| REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m4 |
| pmaxsw m1, m4 |
| pminsw m0, m5 |
| pminsw m1, m5 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| mova [r6 +strideq*0], xm1 |
| vextracti128 [r6 +strideq*1], m1, 1 |
| RET |
| ALIGN function_align |
| .pass2_main: |
| vbroadcasti128 m4, [deint_shuf] |
| packssdw m0, m1 |
| packssdw m2, m3 |
| lea r6, [deint_shuf+128] |
| vperm2i128 m1, m0, m2, 0x31 |
| vinserti128 m0, xm2, 1 |
| pshufb m0, m4 |
| pshufb m1, m4 |
| jmp m(iadst_8x4_internal_8bpc).main |
| ALIGN function_align |
| .main: |
| vpbroadcastd m1, [pd_2896] |
| pmulld m0, m1, [cq+32*0] |
| pmulld m3, m1, [cq+32*3] |
| pmulld m2, m1, [cq+32*2] |
| pmulld m1, [cq+32*1] |
| vpbroadcastd m4, [pd_2048] |
| REPX {paddd x, m4}, m0, m3, m2, m1 |
| REPX {psrad x, 12}, m0, m3, m2, m1 |
| .main2: |
| IADST4_1D |
| ret |
| |
| INV_TXFM_8X4_FN flipadst, dct |
| INV_TXFM_8X4_FN flipadst, adst |
| INV_TXFM_8X4_FN flipadst, flipadst |
| INV_TXFM_8X4_FN flipadst, identity |
| |
| cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2 |
| call m(iadst_4x8_internal_10bpc).main |
| shufpd m3, m4, m0, 0x05 |
| shufpd m0, m4, 0x05 |
| psignd m2, m6 |
| pshufd m6, m6, q1032 |
| pshufd m1, m2, q1032 |
| psignd m2, m5, m6 |
| jmp tx2q |
| .pass2: |
| call m(iadst_8x4_internal_10bpc).pass2_main |
| vpermq m2, m0, q2031 |
| vpermq m0, m1, q2031 |
| jmp m(iadst_8x4_internal_10bpc).end |
| |
| INV_TXFM_8X4_FN identity, dct |
| INV_TXFM_8X4_FN identity, adst |
| INV_TXFM_8X4_FN identity, flipadst |
| INV_TXFM_8X4_FN identity, identity |
| |
| cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| .pass1: |
| vpbroadcastd m4, [pd_2896] |
| vpermq m0, [cq+32*0], q3120 |
| vpermq m1, [cq+32*1], q3120 |
| vpermq m2, [cq+32*2], q3120 |
| vpermq m3, [cq+32*3], q3120 |
| vpbroadcastd m7, [pd_2048] |
| REPX {pmulld x, m4}, m0, m1, m2, m3 |
| REPX {paddd x, m7}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| REPX {paddd x, x }, m0, m1, m2, m3 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m5, [pixel_10bpc_max] |
| vpbroadcastd m4, [pw_1697x8] |
| packssdw m0, m1 |
| packssdw m2, m3 |
| pmulhrsw m1, m4, m0 |
| pmulhrsw m4, m2 |
| paddsw m0, m1 |
| paddsw m2, m4 |
| packssdw m7, m7 ; pw_2048 |
| .pass2_end: |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| lea r6, [dstq+strideq*2] |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| pmulhrsw m2, m7 |
| pmulhrsw m0, m7 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| mova xm2, [dstq+strideq*0] |
| vinserti128 m2, [r6 +strideq*0], 1 |
| mova xm3, [dstq+strideq*1] |
| vinserti128 m3, [r6 +strideq*1], 1 |
| pxor m4, m4 |
| REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m4 |
| pmaxsw m1, m4 |
| pminsw m0, m5 |
| pminsw m1, m5 |
| mova [dstq+strideq*0], xm0 |
| mova [dstq+strideq*1], xm1 |
| vextracti128 [r6 +strideq*0], m0, 1 |
| vextracti128 [r6 +strideq*1], m1, 1 |
| RET |
| |
| INV_TXFM_8X4_FN dct, dct, 12 |
| INV_TXFM_8X4_FN dct, identity, 12 |
| INV_TXFM_8X4_FN dct, adst, 12 |
| INV_TXFM_8X4_FN dct, flipadst, 12 |
| |
| cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [clip_20b_min] |
| vpbroadcastd m9, [clip_20b_max] |
| jmp m(idct_8x4_internal_10bpc).pass1 |
| .pass2: |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| REPX {pmaxsd x, m8}, m0, m1, m2, m3 |
| REPX {pminsd x, m9}, m0, m1, m2, m3 |
| call m(iadst_8x4_internal_12bpc).transpose_4x8 |
| IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 |
| jmp m(iadst_8x4_internal_12bpc).end |
| |
| INV_TXFM_8X4_FN adst, dct, 12 |
| INV_TXFM_8X4_FN adst, adst, 12 |
| INV_TXFM_8X4_FN adst, flipadst, 12 |
| INV_TXFM_8X4_FN adst, identity, 12 |
| |
| cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [clip_20b_min] |
| vpbroadcastd m9, [clip_20b_max] |
| call m(iadst_4x8_internal_10bpc).main2 |
| vpblendd m3, m0, m4, 0x33 ; out6 out7 |
| vpblendd m0, m4, 0xcc ; out0 out1 |
| pshufd m1, m5, q1032 |
| psignd m2, m6 ; out4 out5 |
| psignd m1, m6 ; out2 out3 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| REPX {pmaxsd x, m8}, m0, m1, m2, m3 |
| REPX {pminsd x, m9}, m0, m1, m2, m3 |
| call .pass2_main |
| vpbroadcastd m5, [pd_2048] |
| paddd m0, m5, m4 |
| paddd m1, m5, m6 |
| paddd m2, m5 |
| paddd m3, m5 |
| .pass2_end: |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| .end: |
| vpbroadcastd m4, [pw_16384] |
| REPX {psrad x, 3}, m0, m1, m2, m3 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m2, m4 |
| vpermq m0, m0, q3120 ; out0 out1 |
| vpermq m1, m1, q3120 ; out2 out3 |
| vpbroadcastd m5, [pixel_12bpc_max] |
| jmp m(iadst_8x4_internal_10bpc).end2 |
| ALIGN function_align |
| .pass2_main: |
| call .transpose_4x8 |
| jmp m(iadst_8x4_internal_10bpc).main2 |
| ALIGN function_align |
| .transpose_4x8: |
| ; deinterleave |
| pshufd m0, m0, q3120 |
| pshufd m1, m1, q3120 |
| pshufd m2, m2, q3120 |
| pshufd m3, m3, q3120 |
| ; transpose |
| punpcklqdq m4, m0, m1 |
| punpckhqdq m0, m1 |
| punpcklqdq m5, m2, m3 |
| punpckhqdq m2, m3 |
| vperm2i128 m1, m0, m2, 0x20 ; out1 |
| vperm2i128 m3, m0, m2, 0x31 ; out3 |
| vperm2i128 m2, m4, m5, 0x31 ; out2 |
| vperm2i128 m0, m4, m5, 0x20 ; out0 |
| ret |
| |
| INV_TXFM_8X4_FN flipadst, dct, 12 |
| INV_TXFM_8X4_FN flipadst, adst, 12 |
| INV_TXFM_8X4_FN flipadst, flipadst, 12 |
| INV_TXFM_8X4_FN flipadst, identity, 12 |
| |
| cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [clip_20b_min] |
| vpbroadcastd m9, [clip_20b_max] |
| call m(iadst_4x8_internal_10bpc).main2 |
| shufpd m3, m4, m0, 0x05 |
| shufpd m0, m4, 0x05 |
| psignd m2, m6 |
| pshufd m6, m6, q1032 |
| pshufd m1, m2, q1032 |
| psignd m2, m5, m6 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| REPX {pmaxsd x, m8}, m0, m1, m2, m3 |
| REPX {pminsd x, m9}, m0, m1, m2, m3 |
| call m(iadst_8x4_internal_12bpc).pass2_main |
| vpbroadcastd m5, [pd_2048] |
| paddd m0, m5, m3 |
| paddd m1, m5, m2 |
| paddd m3, m5, m4 |
| paddd m2, m5, m6 |
| jmp m(iadst_8x4_internal_12bpc).pass2_end |
| |
| INV_TXFM_8X4_FN identity, dct, 12 |
| INV_TXFM_8X4_FN identity, adst, 12 |
| INV_TXFM_8X4_FN identity, flipadst, 12 |
| INV_TXFM_8X4_FN identity, identity, 12 |
| |
| cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 |
| jmp m(iidentity_8x4_internal_10bpc).pass1 |
| .pass2: |
| ; m0 = in0 in1 (interleaved) |
| ; m1 = in2 in3 (interleaved) |
| ; m2 = in4 in5 (interleaved) |
| ; m3 = in6 in7 (interleaved) |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| REPX {pmaxsd x, m8}, m0, m1, m2, m3 |
| REPX {pminsd x, m9}, m0, m1, m2, m3 |
| vpbroadcastd m4, [pd_5793] |
| REPX {pmulld x, m4}, m0, m1, m2, m3 |
| REPX {paddd x, m7}, m0, m1, m2, m3 |
| REPX {psrad x, 15}, m0, m1, m2, m3 |
| vpbroadcastd m5, [pixel_12bpc_max] |
| vpbroadcastd m7, [pw_16384] |
| packssdw m0, m1 |
| packssdw m2, m3 |
| jmp m(iidentity_8x4_internal_10bpc).pass2_end |
| |
| %macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth |
| INV_TXFM_FN %1, %2, 0, 8x8, %3 |
| %ifidn %1_%2, dct_dct |
| vpbroadcastd m2, [dconly_%3bpc] |
| %if %3 = 10 |
| .dconly: |
| imul r6d, [cq], 181 |
| mov [cq], eobd ; 0 |
| or r3d, 8 |
| .dconly2: |
| add r6d, 384 |
| sar r6d, 9 |
| .dconly3: |
| imul r6d, 181 |
| add r6d, 2176 |
| sar r6d, 12 |
| movd xm0, r6d |
| paddsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
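| ; m2 = 0x7fff - pixel_max (0x7c00 for 10bpc, 0x7000 for 12bpc): |
| ; paddsw saturates dc+pixel at 0x7fff (pixel_max after unbiasing) and |
| ; psubusw removes the bias while clamping negative results to zero, |
| ; giving a branchless [0, pixel_max] clip |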
| .dconly_loop: |
| mova xm1, [dstq+strideq*0] |
| vinserti128 m1, [dstq+strideq*1], 1 |
| paddsw m1, m0 |
| psubusw m1, m2 |
| mova [dstq+strideq*0], xm1 |
| vextracti128 [dstq+strideq*1], m1, 1 |
| lea dstq, [dstq+strideq*2] |
| sub r3d, 2 |
| jg .dconly_loop |
| RET |
| %else |
| jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly |
| %endif |
| %endif |
| %endmacro |
| |
| %macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2] |
| ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a |
| ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a |
| ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a |
| ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a |
| psubd m%9, m%3, m%7 ; t6 |
| paddd m%3, m%7 ; t2 |
| psubd m%7, m%1, m%5 ; t4 |
| paddd m%1, m%5 ; t0 |
| psubd m%5, m%6, m%2 ; t7 |
| paddd m%6, m%2 ; t3 |
| psubd m%2, m%8, m%4 ; t5 |
| paddd m%8, m%4 ; t1 |
| REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 |
| REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 |
| ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a |
| ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a |
| psubd m%10, m%7, m%9 ; t7 |
| paddd m%7, m%9 ; out6 |
| vpbroadcastd m%9, [pd_1448] |
| psubd m%4, m%8, m%6 ; t3 |
| paddd m%8, m%6 ; -out7 |
| psubd m%6, m%1, m%3 ; t2 |
| paddd m%1, m%3 ; out0 |
| psubd m%3, m%2, m%5 ; t6 |
| paddd m%2, m%5 ; -out1 |
| REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 |
| REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 |
| REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 |
| psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 |
| paddd m%4, m%6 ; (t2 + t3) * 1448 |
| psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 |
| paddd m%3, m%10 ; (t6 + t7) * 1448 |
| %endmacro |
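| ; out2/out3 and out4/out5 are returned as (t2 +- t3)*1448 and |
| ; (t6 +- t7)*1448: 1448 = 2896/2 folds the final 1/sqrt(2) scale and |
| ; the pass downshift together; the caller adds the rounding constant |
| ; and shifts (pd_3072 with >>12 for 10bpc, pd_17408 with >>15 for |
| ; 12bpc - see the .main_end variants) |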
| |
| INV_TXFM_8X8_FN dct, dct |
| INV_TXFM_8X8_FN dct, identity |
| INV_TXFM_8X8_FN dct, adst |
| INV_TXFM_8X8_FN dct, flipadst |
| |
| cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| mova m0, [cq+32*0] |
| mova m1, [cq+32*1] |
| mova m2, [cq+32*2] |
| mova m3, [cq+32*3] |
| mova m4, [cq+32*4] |
| mova m5, [cq+32*5] |
| mova m6, [cq+32*6] |
| mova m7, [cq+32*7] |
| vpbroadcastd m11, [pd_2048] |
| call .main |
| call .round_shift1 |
| jmp tx2q |
| .pass2: |
| call .transpose_8x8_packed |
| call m(idct_8x8_internal_8bpc).main |
| vpbroadcastd m12, [pw_2048] |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| vpermq m2, m2, q3120 |
| vpermq m3, m3, q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call .write_8x4_start |
| pmulhrsw m0, m2, m12 |
| pmulhrsw m1, m3, m12 |
| call .write_8x4 |
| RET |
| ALIGN function_align |
| .write_8x4_start: |
| vpbroadcastd m11, [pixel_10bpc_max] |
| lea r6, [strideq*3] |
| pxor m10, m10 |
| .write_8x4: |
| mova xm8, [dstq+strideq*0] |
| vinserti128 m8, [dstq+strideq*1], 1 |
| mova xm9, [dstq+strideq*2] |
| vinserti128 m9, [dstq+r6 ], 1 |
| mova [cq+32*0], m10 |
| mova [cq+32*1], m10 |
| mova [cq+32*2], m10 |
| mova [cq+32*3], m10 |
| add cq, 32*4 |
| paddw m0, m8 |
| paddw m1, m9 |
| pmaxsw m0, m10 |
| pmaxsw m1, m10 |
| pminsw m0, m11 |
| pminsw m1, m11 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| mova [dstq+strideq*2], xm1 |
| vextracti128 [dstq+r6 ], m1, 1 |
| lea dstq, [dstq+strideq*4] |
| ret |
| ALIGN function_align |
| .transpose_8x8_packed: |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| packssdw m3, m7 |
| lea r6, [deint_shuf+128] |
| punpckhwd m4, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| punpckhdq m3, m0, m2 |
| punpckldq m0, m2 |
| punpckhdq m2, m4, m1 |
| punpckldq m4, m1 |
| vinserti128 m1, m3, xm2, 1 |
| vperm2i128 m3, m2, 0x31 |
| vperm2i128 m2, m0, m4, 0x31 |
| vinserti128 m0, xm4, 1 |
| ret |
| ALIGN function_align |
| .main_rect2: |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| .main: |
| ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a |
| ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a |
| ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 |
| paddd m8, m1, m5 ; t4 |
| psubd m1, m5 ; t5a |
| paddd m9, m7, m3 ; t7 |
| psubd m7, m3 ; t6a |
| vpbroadcastd m3, [pd_2896] |
| REPX {pmaxsd x, m12}, m1, m8, m7, m9 |
| REPX {pminsd x, m13}, m1, m8, m7, m9 |
| REPX {pmulld x, m3 }, m0, m4, m7, m1 |
| paddd m0, m11 |
| paddd m7, m11 |
| psubd m5, m0, m4 |
| paddd m0, m4 |
| psubd m4, m7, m1 |
| paddd m7, m1 |
| REPX {psrad x, 12 }, m5, m0, m4, m7 |
| psubd m3, m0, m6 ; dct4 out3 |
| paddd m0, m6 ; dct4 out0 |
| paddd m6, m5, m2 ; dct4 out1 |
| psubd m5, m2 ; dct4 out2 |
| REPX {pmaxsd x, m12}, m0, m6, m5, m3 |
| REPX {pminsd x, m13}, m0, m6, m5, m3 |
| ret |
| ALIGN function_align |
| .round_shift1: |
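| ; all-ones from pcmpeqd is -1, so the psubd adds a +1 rounding bias |
| ; that propagates to all eight outputs ahead of the final >>1 |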
| pcmpeqd m1, m1 |
| REPX {psubd x, m1}, m0, m6, m5, m3 |
| paddd m1, m6, m7 ; out1 |
| psubd m6, m7 ; out6 |
| psubd m7, m0, m9 ; out7 |
| paddd m0, m9 ; out0 |
| paddd m2, m5, m4 ; out2 |
| psubd m5, m4 ; out5 |
| psubd m4, m3, m8 ; out4 |
| paddd m3, m8 ; out3 |
| REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| ret |
| |
| INV_TXFM_8X8_FN adst, dct |
| INV_TXFM_8X8_FN adst, adst |
| INV_TXFM_8X8_FN adst, flipadst |
| INV_TXFM_8X8_FN adst, identity |
| |
| cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| call .main |
| call .main_end |
| jmp tx2q |
| .pass2: |
| call m(idct_8x8_internal_10bpc).transpose_8x8_packed |
| pshufd m4, m0, q1032 |
| pshufd m5, m1, q1032 |
| call m(iadst_8x8_internal_8bpc).main_pass2 |
| vpbroadcastd m5, [pw_2048] |
| vpbroadcastd xm12, [pw_4096] |
| psubw m12, m5 |
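| ; xm12 loads pw_4096 into the low 128 bits only, so the psubw leaves |
| ; +2048 in the low half and -2048 in the high half of m12: a single |
| ; pmulhrsw then rounds, downshifts and sign-corrects the negated |
| ; adst outputs in one pass |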
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4_start |
| pmulhrsw m0, m2, m12 |
| pmulhrsw m1, m3, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| RET |
| ALIGN function_align |
| .main: |
| mova m0, [cq+32*0] |
| mova m7, [cq+32*7] |
| mova m1, [cq+32*1] |
| mova m6, [cq+32*6] |
| mova m2, [cq+32*2] |
| mova m5, [cq+32*5] |
| mova m3, [cq+32*3] |
| mova m4, [cq+32*4] |
| vpbroadcastd m11, [pd_2048] |
| .main2: |
| IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 |
| psrld m8, 10 ; pd_1 |
| vpbroadcastd m9, [pd_3072] |
| ret |
| ALIGN function_align |
| .main_end: |
| paddd m0, m8 |
| psubd m1, m8, m1 |
| paddd m6, m8 |
| psubd m7, m8, m7 |
| REPX {psrad x, 1 }, m0, m1, m6, m7 |
| ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12 |
| ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12 |
| psubd m8, m9, m8 ; pd_3071 |
| paddd m2, m9 |
| psubd m3, m8, m3 |
| paddd m4, m9 |
| psubd m5, m8, m5 |
| REPX {psrad x, 12}, m2, m3, m4, m5 |
| ret |
| |
| INV_TXFM_8X8_FN flipadst, dct |
| INV_TXFM_8X8_FN flipadst, adst |
| INV_TXFM_8X8_FN flipadst, flipadst |
| INV_TXFM_8X8_FN flipadst, identity |
| |
| cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| call m(iadst_8x8_internal_10bpc).main |
| call .main_end |
| jmp tx2q |
| .pass2: |
| call m(idct_8x8_internal_10bpc).transpose_8x8_packed |
| pshufd m4, m0, q1032 |
| pshufd m5, m1, q1032 |
| call m(iadst_8x8_internal_8bpc).main_pass2 |
| vpbroadcastd m12, [pw_2048] |
| vpbroadcastd xm5, [pw_4096] |
| psubw m12, m5 |
| vpermq m8, m3, q2031 |
| vpermq m9, m2, q2031 |
| vpermq m2, m1, q2031 |
| vpermq m3, m0, q2031 |
| pmulhrsw m0, m8, m12 |
| pmulhrsw m1, m9, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4_start |
| pmulhrsw m0, m2, m12 |
| pmulhrsw m1, m3, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| RET |
| ALIGN function_align |
| .main_end: |
| paddd m10, m8, m0 |
| psubd m0, m8, m7 |
| psubd m7, m8, m1 |
| paddd m1, m8, m6 |
| psrad m0, 1 |
| psrad m1, 1 |
| psrad m6, m7, 1 |
| psrad m7, m10, 1 |
| psubd m8, m9, m8 ; pd_3071 |
| psubd m10, m8, m5 |
| paddd m5, m9, m2 |
| psubd m2, m8, m3 |
| paddd m3, m9, m4 |
| psrad m4, m2, 12 |
| psrad m2, m10, 12 |
| psrad m3, 12 |
| psrad m5, 12 |
| ret |
| |
| INV_TXFM_8X8_FN identity, dct |
| INV_TXFM_8X8_FN identity, adst |
| INV_TXFM_8X8_FN identity, flipadst |
| INV_TXFM_8X8_FN identity, identity |
| |
| cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| .pass1: |
| mova m0, [cq+32*0] |
| mova m1, [cq+32*1] |
| mova m2, [cq+32*2] |
| mova m3, [cq+32*3] |
| mova m4, [cq+32*4] |
| mova m5, [cq+32*5] |
| mova m6, [cq+32*6] |
| mova m7, [cq+32*7] |
| jmp tx2q |
| .pass2: |
| packssdw m3, m7 |
| vpbroadcastd m7, [pixel_10bpc_max] |
| .pass2_main: |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| vpbroadcastd m12, [pw_4096] |
| punpckhwd m4, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| punpckhdq m3, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m4, m1 |
| punpckhdq m4, m1 |
| punpckhqdq m1, m0, m2 ; 1 5 |
| punpcklqdq m0, m2 ; 0 4 |
| punpcklqdq m2, m3, m4 ; 2 6 |
| punpckhqdq m3, m4 ; 3 7 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call .write_2x8x2_start |
| pmulhrsw m0, m2, m12 |
| pmulhrsw m1, m3, m12 |
| call .write_2x8x2_zero |
| RET |
| .write_2x8x2_start: |
| lea r6, [strideq*5] |
| pxor m6, m6 |
| .write_2x8x2_zero: |
| mova [cq+32*0], m6 |
| mova [cq+32*1], m6 |
| mova [cq+32*2], m6 |
| mova [cq+32*3], m6 |
| add cq, 32*4 |
| .write_2x8x2: |
| mova xm4, [dstq+strideq*0] |
| vinserti128 m4, [dstq+strideq*4], 1 |
| mova xm5, [dstq+strideq*1] |
| vinserti128 m5, [dstq+r6 ], 1 |
| paddw m0, m4 |
| paddw m1, m5 |
| pmaxsw m0, m6 |
| pmaxsw m1, m6 |
| pminsw m0, m7 |
| pminsw m1, m7 |
| mova [dstq+strideq*0], xm0 |
| mova [dstq+strideq*1], xm1 |
| vextracti128 [dstq+strideq*4], m0, 1 |
| vextracti128 [dstq+r6 ], m1, 1 |
| lea dstq, [dstq+strideq*2] |
| ret |
| |
| %macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4] |
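| ; full 8x8 transpose of dwords in three stages: 32-bit interleave |
| ; (punpck?dq), 64-bit interleave (punpck?qdq), then 128-bit lane |
| ; swaps (vperm2i128) - 24 shuffles total |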
| punpckldq m%9, m%1, m%2 ; aibj emfn |
| punpckhdq m%1, m%2 ; ckdl gohp |
| punpckldq m%10, m%3, m%4 ; qyrz uCvD |
| punpckhdq m%3, m%4 ; sAtB wExF |
| punpckldq m%11, m%5, m%6 ; GOHP KSLT |
| punpckhdq m%5, m%6 ; IQJR MUNV |
| punpckldq m%12, m%7, m%8 ; WeXf aibj |
| punpckhdq m%7, m%8 ; YgZh ckdl |
| punpcklqdq m%2, m%9, m%10 ; aiqy emuC |
| punpckhqdq m%9, m%10 ; bjrz fnvD |
| punpcklqdq m%4, m%1, m%3 ; cksA gowE |
| punpckhqdq m%10, m%1, m%3 ; dltB hpxF |
| punpcklqdq m%6, m%11, m%12 ; GOWe KSai |
| punpckhqdq m%11, m%12 ; HPXf LTbj |
| punpcklqdq m%8, m%5, m%7 ; IQYg MUck |
| punpckhqdq m%12, m%5, m%7 ; JRZh NVdl |
| vperm2i128 m%1, m%2, m%6, 0x20 ; out0 |
| vperm2i128 m%5, m%2, m%6, 0x31 ; out4 |
| vperm2i128 m%2, m%9, m%11, 0x20 ; out1 |
| vperm2i128 m%6, m%9, m%11, 0x31 ; out5 |
| vperm2i128 m%3, m%4, m%8, 0x20 ; out2 |
| vperm2i128 m%7, m%4, m%8, 0x31 ; out6 |
| vperm2i128 m%4, m%10, m%12, 0x20 ; out3 |
| vperm2i128 m%8, m%10, m%12, 0x31 ; out7 |
| %endmacro |
| |
| INV_TXFM_8X8_FN dct, dct, 12 |
| INV_TXFM_8X8_FN dct, identity, 12 |
| INV_TXFM_8X8_FN dct, adst, 12 |
| INV_TXFM_8X8_FN dct, flipadst, 12 |
| |
| cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(idct_8x8_internal_10bpc).pass1 |
| .pass2: |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call .transpose_8x8 |
| vpbroadcastd m11, [pd_2048] |
| call m(idct_8x8_internal_10bpc).main |
| call .round_shift4 |
| jmp m(iadst_8x8_internal_12bpc).pass2_end |
| ALIGN function_align |
| .write_8x4_start: |
| vpbroadcastd m11, [pixel_12bpc_max] |
| lea r6, [strideq*3] |
| pxor m10, m10 |
| ret |
| ALIGN function_align |
| .transpose_8x8: |
| TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 |
| ret |
| ALIGN function_align |
| .round_shift4: |
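| ; same output butterfly as .round_shift1, but biased by pd_8 for the |
| ; rounded >>4 used by the 12bpc path |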
| vpbroadcastd m1, [pd_8] |
| REPX {paddd x, m1}, m0, m6, m5, m3 |
| paddd m1, m6, m7 ; out1 |
| psubd m6, m7 ; out6 |
| psubd m7, m0, m9 ; out7 |
| paddd m0, m9 ; out0 |
| paddd m2, m5, m4 ; out2 |
| psubd m5, m4 ; out5 |
| psubd m4, m3, m8 ; out4 |
| paddd m3, m8 ; out3 |
| REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 |
| ret |
| |
| INV_TXFM_8X8_FN adst, dct, 12 |
| INV_TXFM_8X8_FN adst, adst, 12 |
| INV_TXFM_8X8_FN adst, flipadst, 12 |
| INV_TXFM_8X8_FN adst, identity, 12 |
| |
| cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(iadst_8x8_internal_10bpc).pass1 |
| .pass2: |
| call .pass2_main |
| .pass2_end: |
| packssdw m0, m1 |
| packssdw m1, m2, m3 |
| REPX {vpermq x, x, q3120}, m0, m1 |
| call m(idct_8x8_internal_12bpc).write_8x4_start |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| packssdw m0, m4, m5 |
| packssdw m1, m6, m7 |
| REPX {vpermq x, x, q3120}, m0, m1 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| RET |
| ALIGN function_align |
| .pass2_main: |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_8x8_internal_12bpc).transpose_8x8 |
| vpbroadcastd m11, [pd_2048] |
| .pass2_main2: |
| call m(iadst_8x8_internal_10bpc).main2 |
| pslld m9, m8, 3 ; pd_8 |
| paddd m0, m9 |
| psubd m1, m9, m1 ; 8+x |
| paddd m6, m9 |
| psubd m7, m9, m7 |
| REPX {psrad x, 4}, m0, m1, m6, m7 |
| vpbroadcastd m9, [pd_17408] |
| psubd m8, m9, m8 ; 17407 |
| paddd m2, m9 |
| psubd m3, m8, m3 |
| paddd m4, m9 |
| psubd m5, m8, m5 |
| REPX {psrad x, 15}, m2, m3, m4, m5 |
| ret |
| |
| INV_TXFM_8X8_FN flipadst, dct, 12 |
| INV_TXFM_8X8_FN flipadst, adst, 12 |
| INV_TXFM_8X8_FN flipadst, flipadst, 12 |
| INV_TXFM_8X8_FN flipadst, identity, 12 |
| |
| cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(iflipadst_8x8_internal_10bpc).pass1 |
| .pass2: |
| call m(iadst_8x8_internal_12bpc).pass2_main |
| packssdw m7, m7, m6 |
| packssdw m6, m1, m0 |
| packssdw m1, m5, m4 |
| vpermq m0, m7, q3120 |
| vpermq m1, m1, q3120 |
| call m(idct_8x8_internal_12bpc).write_8x4_start |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| packssdw m0, m3, m2 |
| vpermq m0, m0, q3120 |
| vpermq m1, m6, q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| RET |
| |
| INV_TXFM_8X8_FN identity, dct, 12 |
| INV_TXFM_8X8_FN identity, adst, 12 |
| INV_TXFM_8X8_FN identity, flipadst, 12 |
| INV_TXFM_8X8_FN identity, identity, 12 |
| |
| cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| jmp m(iidentity_8x8_internal_10bpc).pass1 |
| .pass2: |
| packssdw m3, m7 |
| vpbroadcastd m7, [pixel_12bpc_max] |
| jmp m(iidentity_8x8_internal_10bpc).pass2_main |
| |
| %macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth |
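| ; %3 is an eob bias (35 when the second pass is identity); presumably |
| ; INV_TXFM_FN adds it to eobd so the cmp-against-43 fast-path split in |
| ; the internal functions accounts for identity's different eob scan |
| ; positions |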
| INV_TXFM_FN %1, %2, %3, 8x16, %4 |
| %ifidn %1_%2, dct_dct |
| imul r6d, [cq], 181 |
| vpbroadcastd m2, [dconly_%4bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 16 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 |
| %endif |
| %endmacro |
| |
| INV_TXFM_8X16_FN dct, dct |
| INV_TXFM_8X16_FN dct, identity, 35 |
| INV_TXFM_8X16_FN dct, adst |
| INV_TXFM_8X16_FN dct, flipadst |
| |
| cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 |
| %undef cmp |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| vpbroadcastd m14, [pd_2896] |
| vpbroadcastd m11, [pd_2048] |
| cmp eobd, 43 |
| jl .fast |
| add cq, 32 |
| call .pass1_main |
| sub cq, 32 |
| mova [cq+32* 1], m0 |
| mova [cq+32* 3], m1 |
| mova [cq+32* 5], m2 |
| mova [cq+32* 7], m3 |
| mova [cq+32* 9], m4 |
| mova [cq+32*11], m5 |
| mova [cq+32*13], m6 |
| mova m15, m7 |
| call .pass1_main |
| mova m8, [cq+32* 1] |
| mova m9, [cq+32* 3] |
| mova m10, [cq+32* 5] |
| mova m11, [cq+32* 7] |
| mova m12, [cq+32* 9] |
| mova m13, [cq+32*11] |
| mova m14, [cq+32*13] |
| jmp tx2q |
| .fast: |
| call .pass1_main |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 |
| jmp tx2q |
| .pass2: |
| call .transpose |
| call m(idct_8x16_internal_8bpc).main |
| vpbroadcastd m12, [pw_2048] |
| REPX {vpermq x, x, q3120}, m0, m2, m4, m6 |
| REPX {vpermq x, x, q2031}, m1, m3, m5, m7 |
| .end: |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4_start |
| pmulhrsw m0, m2, m12 |
| pmulhrsw m1, m3, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| pmulhrsw m0, m4, m12 |
| pmulhrsw m1, m5, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| pmulhrsw m0, m6, m12 |
| pmulhrsw m1, m7, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| RET |
| ALIGN function_align |
| .transpose: |
| packssdw m0, m8 |
| packssdw m1, m9 |
| packssdw m2, m10 |
| packssdw m3, m11 |
| packssdw m4, m12 |
| packssdw m5, m13 |
| packssdw m6, m14 |
| packssdw m7, m15 |
| lea r6, [deint_shuf+128] |
| punpckhwd m8, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| punpcklwd m3, m4, m5 |
| punpckhwd m4, m5 |
| punpckhwd m5, m6, m7 |
| punpcklwd m6, m7 |
| punpckhdq m7, m3, m6 |
| punpckldq m3, m6 |
| punpckhdq m6, m4, m5 |
| punpckldq m4, m5 |
| punpckhdq m5, m8, m1 |
| punpckldq m8, m1 |
| punpckhdq m1, m0, m2 |
| punpckldq m0, m2 |
| vperm2i128 m2, m0, m3, 0x31 |
| vinserti128 m0, xm3, 1 |
| vperm2i128 m3, m1, m7, 0x31 |
| vinserti128 m1, xm7, 1 |
| vperm2i128 m7, m5, m6, 0x31 |
| vinserti128 m5, xm6, 1 |
| vperm2i128 m6, m8, m4, 0x31 |
| vinserti128 m4, m8, xm4, 1 |
| ret |
| ALIGN function_align |
| .pass1_main: |
| pmulld m0, m14, [cq+32* 0] |
| pmulld m1, m14, [cq+32* 2] |
| pmulld m2, m14, [cq+32* 4] |
| pmulld m3, m14, [cq+32* 6] |
| pmulld m4, m14, [cq+32* 8] |
| pmulld m5, m14, [cq+32*10] |
| pmulld m6, m14, [cq+32*12] |
| pmulld m7, m14, [cq+32*14] |
| call m(idct_8x8_internal_10bpc).main_rect2 |
| jmp m(idct_8x8_internal_10bpc).round_shift1 |
| ALIGN function_align |
| .main_evenhalf: |
| paddd m1, m6, m7 ; idct8 out1 |
| psubd m6, m7 ; idct8 out6 |
| psubd m7, m0, m9 ; idct8 out7 |
| paddd m0, m9 ; idct8 out0 |
| paddd m2, m5, m4 ; idct8 out2 |
| psubd m5, m4 ; idct8 out5 |
| psubd m4, m3, m8 ; idct8 out4 |
| paddd m3, m8 ; idct8 out3 |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| ret |
| .main_oddhalf_fast_rect2: |
| REPX {paddd x, m11}, m0, m1, m2, m3 |
| REPX {psrad x, 12 }, m0, m1, m2, m3 |
| .main_oddhalf_fast: ; lower half zero |
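| ; rows 9-15 are zero, so the four ITX_MULSUB_2D rotations of |
| ; .main_oddhalf each lose one operand and reduce to plain pmulld by |
| ; the broadcast cos/sin constants (signs folded into the pd_m* values) |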
| vpbroadcastd m7, [pd_4076] |
| vpbroadcastd m8, [pd_401] |
| vpbroadcastd m6, [pd_m1189] |
| vpbroadcastd m9, [pd_3920] |
| vpbroadcastd m5, [pd_3612] |
| vpbroadcastd m10, [pd_1931] |
| vpbroadcastd m4, [pd_m2598] |
| vpbroadcastd m15, [pd_3166] |
| pmulld m7, m0 |
| pmulld m0, m8 |
| pmulld m6, m1 |
| pmulld m1, m9 |
| pmulld m5, m2 |
| pmulld m2, m10 |
| pmulld m4, m3 |
| pmulld m3, m15 |
| jmp .main_oddhalf_fast2 |
| .main_oddhalf_rect2: |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| .main_oddhalf: |
| ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a |
| ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a |
| ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a |
| ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a |
| .main_oddhalf_fast2: |
| REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 |
| REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 |
| psubd m8, m0, m4 ; t9 |
| paddd m0, m4 ; t8 |
| psubd m4, m6, m2 ; t10 |
| paddd m2, m6 ; t11 |
| psubd m6, m1, m5 ; t13 |
| paddd m5, m1 ; t12 |
| psubd m1, m7, m3 ; t14 |
| paddd m7, m3 ; t15 |
| REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 |
| REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 |
| vpbroadcastd m15, [pd_3784] |
| vpbroadcastd m10, [pd_1567] |
| ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 |
| ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2 |
| psubd m3, m1, m4 ; t10 |
| paddd m1, m4 ; t9 |
| psubd m4, m0, m2 ; t11a |
| paddd m0, m2 ; t8a |
| psubd m2, m8, m6 ; t13 |
| paddd m6, m8 ; t14 |
| psubd m8, m7, m5 ; t12a |
| paddd m7, m5 ; t15a |
| REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 |
| REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 |
| REPX {pmulld x, m14}, m2, m8, m3, m4 |
| paddd m2, m11 |
| paddd m8, m11 |
| paddd m5, m2, m3 ; t13a |
| psubd m2, m3 ; t10a |
| psubd m3, m8, m4 ; t11 |
| paddd m4, m8 ; t12 |
| REPX {psrad x, 12}, m5, m2, m3, m4 |
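| ; stash t8a-t15a in descending order around r6 so the caller's |
| ; pass1_rotations can butterfly them against the even-half outputs |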
| mova [r6-32*4], m7 |
| mova [r6-32*3], m6 |
| mova [r6-32*2], m5 |
| mova [r6-32*1], m4 |
| mova [r6+32*0], m3 |
| mova [r6+32*1], m2 |
| mova [r6+32*2], m1 |
| mova [r6+32*3], m0 |
| ret |
| |
| INV_TXFM_8X16_FN adst, dct |
| INV_TXFM_8X16_FN adst, adst |
| INV_TXFM_8X16_FN adst, flipadst |
| INV_TXFM_8X16_FN adst, identity, 35 |
| |
| cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 |
| %undef cmp |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| vpbroadcastd m14, [pd_2896] |
| vpbroadcastd m11, [pd_2048] |
| cmp eobd, 43 |
| jl .fast |
| add cq, 32 |
| call .pass1_main |
| call m(iadst_8x8_internal_10bpc).main_end |
| sub cq, 32 |
| mova [cq+32* 1], m0 |
| mova [cq+32* 3], m1 |
| mova [cq+32* 5], m2 |
| mova [cq+32* 7], m3 |
| mova [cq+32* 9], m4 |
| mova [cq+32*11], m5 |
| mova [cq+32*13], m6 |
| mova m15, m7 |
| call .pass1_main |
| call m(iadst_8x8_internal_10bpc).main_end |
| mova m8, [cq+32* 1] |
| mova m9, [cq+32* 3] |
| mova m10, [cq+32* 5] |
| mova m11, [cq+32* 7] |
| mova m12, [cq+32* 9] |
| mova m13, [cq+32*11] |
| mova m14, [cq+32*13] |
| jmp tx2q |
| .fast: |
| call .pass1_main |
| call m(iadst_8x8_internal_10bpc).main_end |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 |
| jmp tx2q |
| .pass2: |
| call m(idct_8x16_internal_10bpc).transpose |
| call m(iadst_8x16_internal_8bpc).main |
| call m(iadst_8x16_internal_8bpc).main_pass2_end |
| vpbroadcastd m8, [pw_2048] |
| vpbroadcastd xm12, [pw_4096] |
| REPX {vpermq x, x, q2031}, m0, m1, m2, m3 |
| REPX {vpermq x, x, q3120}, m4, m5, m6, m7 |
| psubw m12, m8 |
| jmp m(idct_8x16_internal_10bpc).end |
| ALIGN function_align |
| .pass1_main: |
| pmulld m0, m14, [cq+32* 0] |
| pmulld m7, m14, [cq+32*14] |
| pmulld m1, m14, [cq+32* 2] |
| pmulld m6, m14, [cq+32*12] |
| pmulld m2, m14, [cq+32* 4] |
| pmulld m5, m14, [cq+32*10] |
| pmulld m3, m14, [cq+32* 6] |
| pmulld m4, m14, [cq+32* 8] |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| jmp m(iadst_8x8_internal_10bpc).main2 |
| |
| INV_TXFM_8X16_FN flipadst, dct |
| INV_TXFM_8X16_FN flipadst, adst |
| INV_TXFM_8X16_FN flipadst, flipadst |
| INV_TXFM_8X16_FN flipadst, identity, 35 |
| |
| cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 |
| %undef cmp |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| vpbroadcastd m14, [pd_2896] |
| vpbroadcastd m11, [pd_2048] |
| cmp eobd, 43 |
| jl .fast |
| add cq, 32 |
| call m(iadst_8x16_internal_10bpc).pass1_main |
| call m(iflipadst_8x8_internal_10bpc).main_end |
| sub cq, 32 |
| mova [cq+32* 1], m0 |
| mova [cq+32* 3], m1 |
| mova [cq+32* 5], m2 |
| mova [cq+32* 7], m3 |
| mova [cq+32* 9], m4 |
| mova [cq+32*11], m5 |
| mova [cq+32*13], m6 |
| mova m15, m7 |
| call m(iadst_8x16_internal_10bpc).pass1_main |
| call m(iflipadst_8x8_internal_10bpc).main_end |
| mova m8, [cq+32* 1] |
| mova m9, [cq+32* 3] |
| mova m10, [cq+32* 5] |
| mova m11, [cq+32* 7] |
| mova m12, [cq+32* 9] |
| mova m13, [cq+32*11] |
| mova m14, [cq+32*13] |
| jmp tx2q |
| .fast: |
| call m(iadst_8x16_internal_10bpc).pass1_main |
| call m(iflipadst_8x8_internal_10bpc).main_end |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 |
| jmp tx2q |
| .pass2: |
| call m(idct_8x16_internal_10bpc).transpose |
| call m(iadst_8x16_internal_8bpc).main |
| call m(iadst_8x16_internal_8bpc).main_pass2_end |
| vpbroadcastd m12, [pw_2048] |
| vpbroadcastd xm13, [pw_4096] |
| mova m11, m0 |
| vpermq m0, m7, q2031 |
| mova m10, m1 |
| vpermq m1, m6, q2031 |
| mova m9, m2 |
| vpermq m2, m5, q2031 |
| mova m8, m3 |
| vpermq m3, m4, q2031 |
| vpermq m4, m8, q3120 |
| vpermq m5, m9, q3120 |
| vpermq m6, m10, q3120 |
| vpermq m7, m11, q3120 |
| psubw m12, m13 |
| jmp m(idct_8x16_internal_10bpc).end |
| |
| INV_TXFM_8X16_FN identity, dct |
| INV_TXFM_8X16_FN identity, adst |
| INV_TXFM_8X16_FN identity, flipadst |
| INV_TXFM_8X16_FN identity, identity |
| |
| %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] |
| pmulhrsw m%2, m%3, m%1 |
| %if %0 == 4 ; if downshifting by 1 |
| pmulhrsw m%2, m%4 |
| %else |
| paddsw m%1, m%1 |
| %endif |
| paddsw m%1, m%2 |
| %endmacro |
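| ; identity16 gain is 2*sqrt(2) = 2 + 1697/2048: pmulhrsw by |
| ; pw_1697x16 supplies the fractional x*1697/2048 term and paddsw the |
| ; integer 2*x; the optional 4th operand (pw_16384) halves the product |
| ; for variants that fold in a >>1 |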
| |
| cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 |
| .pass1: |
| vpbroadcastd m15, [pd_2896] |
| pmulld m0, m15, [cq+32* 0] |
| pmulld m8, m15, [cq+32* 1] |
| pmulld m1, m15, [cq+32* 2] |
| pmulld m9, m15, [cq+32* 3] |
| pmulld m2, m15, [cq+32* 4] |
| pmulld m10, m15, [cq+32* 5] |
| pmulld m3, m15, [cq+32* 6] |
| pmulld m11, m15, [cq+32* 7] |
| pmulld m4, m15, [cq+32* 8] |
| pmulld m12, m15, [cq+32* 9] |
| pmulld m5, m15, [cq+32*10] |
| pmulld m13, m15, [cq+32*11] |
| pmulld m6, m15, [cq+32*12] |
| pmulld m14, m15, [cq+32*13] |
| pmulld m7, m15, [cq+32*14] |
| pmulld m15, [cq+32*15] |
| mova [cq], m7 |
| vpbroadcastd m7, [pd_2048] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| paddd m7, [cq] |
| REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| jmp tx2q |
| .pass2: |
| packssdw m0, m8 |
| packssdw m1, m9 |
| packssdw m2, m10 |
| packssdw m3, m11 |
| packssdw m4, m12 |
| packssdw m5, m13 |
| packssdw m6, m14 |
| packssdw m13, m7, m15 |
| vpbroadcastd m8, [pw_1697x16] |
| REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13 |
| vpbroadcastd m7, [pixel_10bpc_max] |
| vpbroadcastd m12, [pw_2048] |
| call .pass2_end |
| RET |
| ALIGN function_align |
| .pass2_end: |
| punpckhwd m9, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m6, m13 |
| punpcklwd m6, m13 |
| punpckhwd m13, m4, m5 |
| punpcklwd m4, m5 |
| punpcklwd m5, m2, m3 |
| punpckhwd m2, m3 |
| punpckhdq m3, m0, m5 |
| punpckldq m0, m5 |
| punpckhdq m11, m9, m2 |
| punpckldq m9, m2 |
| punpckldq m2, m4, m6 |
| punpckhdq m4, m6 |
| punpckldq m6, m13, m1 |
| punpckhdq m13, m1 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| punpcklqdq m8, m9, m6 |
| punpckhqdq m9, m6 |
| punpcklqdq m10, m11, m13 |
| punpckhqdq m11, m13 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(iidentity_8x8_internal_10bpc).write_2x8x2_start |
| pmulhrsw m0, m12, m2 |
| pmulhrsw m1, m12, m3 |
| call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero |
| pmulhrsw m0, m12, m8 |
| pmulhrsw m1, m12, m9 |
| lea dstq, [dstq+strideq*4] |
| call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero |
| pmulhrsw m0, m12, m10 |
| pmulhrsw m1, m12, m11 |
| call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero |
| ret |
| |
| INV_TXFM_8X16_FN dct, dct, 0, 12 |
| INV_TXFM_8X16_FN dct, identity, 35, 12 |
| INV_TXFM_8X16_FN dct, adst, 0, 12 |
| INV_TXFM_8X16_FN dct, flipadst, 0, 12 |
| |
| cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(idct_8x16_internal_10bpc).pass1 |
| .pass2: |
| lea r6, [rsp+32*4] |
| call .transpose |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| mova [cq+32* 8], m0 |
| mova [cq+32*10], m2 |
| mova [cq+32*12], m4 |
| mova [cq+32*14], m6 |
| pmaxsd m0, m12, [cq+32* 1] |
| pmaxsd m4, m12, m1 |
| pmaxsd m1, m12, [cq+32* 3] |
| pmaxsd m2, m12, [cq+32* 5] |
| pmaxsd m6, m12, m5 |
| pmaxsd m5, m12, m3 |
| pmaxsd m3, m12, [cq+32* 7] |
| pmaxsd m7, m12 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m14, [pd_2896] |
| call m(idct_8x16_internal_10bpc).main_oddhalf |
| pmaxsd m0, m12, [cq+32* 0] |
| pmaxsd m1, m12, [cq+32* 2] |
| pmaxsd m2, m12, [cq+32* 4] |
| pmaxsd m3, m12, [cq+32* 6] |
| pmaxsd m4, m12, [cq+32* 8] |
| pmaxsd m5, m12, [cq+32*10] |
| pmaxsd m6, m12, [cq+32*12] |
| pmaxsd m7, m12, [cq+32*14] |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| vpbroadcastd m11, [pd_8] |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_16x8_internal_10bpc).pass1_rotations |
| REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| .end: |
| packssdw m0, m1 |
| packssdw m1, m2, m3 |
| packssdw m2, m4, m5 |
| packssdw m3, m6, m7 |
| packssdw m4, m8, m9 |
| packssdw m5, m10, m11 |
| packssdw m6, m12, m13 |
| packssdw m7, m14, m15 |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q3120 |
| call m(idct_8x8_internal_12bpc).write_8x4_start |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m2, q3120 |
| vpermq m1, m3, q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m4, q3120 |
| vpermq m1, m5, q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m6, q3120 |
| vpermq m1, m7, q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| RET |
| ALIGN function_align |
| .transpose: |
| mova [cq+32* 8], m8 |
| mova [cq+32* 9], m9 |
| mova [cq+32*10], m10 |
| mova [cq+32*11], m11 |
| call m(idct_8x8_internal_12bpc).transpose_8x8 |
| mova [cq+32* 0], m0 |
| mova [cq+32* 1], m1 |
| mova [cq+32* 2], m2 |
| mova [cq+32* 3], m3 |
| mova [cq+32* 4], m4 |
| mova [cq+32* 5], m5 |
| mova [cq+32* 6], m6 |
| mova [cq+32* 7], m7 |
| mova m0, [cq+32* 8] |
| mova m1, [cq+32* 9] |
| mova m2, [cq+32*10] |
| mova m3, [cq+32*11] |
| mova m4, m12 |
| mova m5, m13 |
| mova m6, m14 |
| mova m7, m15 |
| jmp m(idct_8x8_internal_12bpc).transpose_8x8 |
| |
| INV_TXFM_8X16_FN adst, dct, 0, 12 |
| INV_TXFM_8X16_FN adst, adst, 0, 12 |
| INV_TXFM_8X16_FN adst, flipadst, 0, 12 |
| INV_TXFM_8X16_FN adst, identity, 35, 12 |
| |
| cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(iadst_8x16_internal_10bpc).pass1 |
| .pass2: |
| lea r6, [rsp+32*4] |
| call .pass2_main |
| call m(iadst_16x8_internal_10bpc).pass1_rotations |
| .pass2_end: |
| REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 |
| REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 |
| jmp m(idct_8x16_internal_12bpc).end |
| ALIGN function_align |
| .pass2_main: |
| call m(idct_8x16_internal_12bpc).transpose |
| vpbroadcastd m13, [clip_18b_min] |
| vpbroadcastd m14, [clip_18b_max] |
| mova [cq+32* 8], m0 |
| mova [cq+32*11], m3 |
| mova [cq+32*12], m4 |
| mova [cq+32*15], m7 |
| pmaxsd m0, m13, [cq+32* 2] ; 2 |
| pmaxsd m3, m13, m1 ; 9 |
| pmaxsd m1, m13, m5 ; 13 |
| pmaxsd m4, m13, m2 ; 10 |
| pmaxsd m2, m13, [cq+32* 6] ; 6 |
| pmaxsd m5, m13, [cq+32* 5] ; 5 |
| pmaxsd m6, m13, m6 ; 14 |
| pmaxsd m7, m13, [cq+32* 1] ; 1 |
| REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 |
| vpbroadcastd m12, [pd_2048] |
| vpbroadcastd m15, [pd_2896] |
| call m(iadst_16x8_internal_10bpc).main_part1 |
| pmaxsd m0, m13, [cq+32* 0] ; 0 |
| pmaxsd m1, m13, [cq+32*15] ; 15 |
| pmaxsd m2, m13, [cq+32* 4] ; 4 |
| pmaxsd m3, m13, [cq+32*11] ; 11 |
| pmaxsd m4, m13, [cq+32* 8] ; 8 |
| pmaxsd m5, m13, [cq+32* 7] ; 7 |
| pmaxsd m6, m13, [cq+32*12] ; 12 |
| pmaxsd m7, m13, [cq+32* 3] ; 3 |
| REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(iadst_16x8_internal_10bpc).main_part2 |
| vpbroadcastd m14, [pd_17408] |
| psrld m15, 11 ; pd_1 |
| psubd m13, m14, m15 ; pd_17407 |
| pslld m15, 3 ; pd_8 |
| ret |
| |
| INV_TXFM_8X16_FN flipadst, dct, 0, 12 |
| INV_TXFM_8X16_FN flipadst, adst, 0, 12 |
| INV_TXFM_8X16_FN flipadst, flipadst, 0, 12 |
| INV_TXFM_8X16_FN flipadst, identity, 35, 12 |
| |
| cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(iflipadst_8x16_internal_10bpc).pass1 |
| .pass2: |
| lea r6, [rsp+32*4] |
| call m(iadst_8x16_internal_12bpc).pass2_main |
| call m(iflipadst_16x8_internal_10bpc).pass1_rotations |
| jmp m(iadst_8x16_internal_12bpc).pass2_end |
| |
| INV_TXFM_8X16_FN identity, dct, 0, 12 |
| INV_TXFM_8X16_FN identity, adst, 0, 12 |
| INV_TXFM_8X16_FN identity, flipadst, 0, 12 |
| INV_TXFM_8X16_FN identity, identity, 0, 12 |
| |
| cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| jmp m(iidentity_8x16_internal_10bpc).pass1 |
| .pass2: |
| call .pass2_main |
| packssdw m0, m8 |
| packssdw m1, m9 |
| packssdw m2, m10 |
| packssdw m3, m11 |
| packssdw m4, m12 |
| packssdw m5, m13 |
| packssdw m6, m14 |
| packssdw m13, m7, m15 |
| vpbroadcastd m7, [pixel_12bpc_max] |
| vpbroadcastd m12, [pw_16384] |
| call m(iidentity_8x16_internal_10bpc).pass2_end |
| RET |
| ALIGN function_align |
| .pass2_main: |
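| ; all 16 rows plus a broadcast constant exceed the 16 ymm registers, |
| ; so one register at a time is parked in [cq] while REPX applies each |
| ; step to the remaining 15 |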
| mova [cq], m7 |
| vpbroadcastd m7, [clip_18b_min] |
| REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| pmaxsd m7, [cq] |
| mova [cq], m15 |
| vpbroadcastd m15, [clip_18b_max] |
| REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14 |
| pminsd m15, [cq] |
| mova [cq], m7 |
| vpbroadcastd m7, [pd_5793] |
| REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| pmulld m7, [cq] |
| mova [cq], m15 |
| vpbroadcastd m15, [pd_1024] |
| REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14 |
| paddd m15, [cq] |
| REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| ret |
| |
| %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth |
| INV_TXFM_FN %1, %2, 0, 16x4, %3 |
| %ifidn %1_%2, dct_dct |
| vpbroadcastd m3, [dconly_%3bpc] |
| %if %3 = 10 |
| .dconly: |
| imul r6d, [cq], 181 |
| mov [cq], eobd ; 0 |
| or r3d, 4 |
| .dconly2: |
| add r6d, 384 |
| sar r6d, 9 |
| .dconly3: |
| imul r6d, 181 |
| add r6d, 2176 |
| sar r6d, 12 |
| movd xm0, r6d |
| paddsw xm0, xm3 |
| vpbroadcastw m0, xm0 |
| .dconly_loop: |
| paddsw m1, m0, [dstq+strideq*0] |
| paddsw m2, m0, [dstq+strideq*1] |
| psubusw m1, m3 |
| psubusw m2, m3 |
| mova [dstq+strideq*0], m1 |
| mova [dstq+strideq*1], m2 |
| lea dstq, [dstq+strideq*2] |
| sub r3d, 2 |
| jg .dconly_loop |
| RET |
| %else |
| jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly |
| %endif |
| %endif |
| %endmacro |
| |
| INV_TXFM_16X4_FN dct, dct |
| INV_TXFM_16X4_FN dct, identity |
| INV_TXFM_16X4_FN dct, adst |
| INV_TXFM_16X4_FN dct, flipadst |
| |
| cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [clip_18b_min] |
| vpbroadcastd m9, [clip_18b_max] |
| .pass1: |
| vbroadcasti128 m0, [cq+16* 0] |
| vbroadcasti128 m4, [cq+16* 4] |
| vbroadcasti128 m1, [cq+16* 2] |
| vbroadcasti128 m7, [cq+16* 6] |
| vbroadcasti128 m5, [cq+16*10] |
| vbroadcasti128 m2, [cq+16* 8] |
| vbroadcasti128 m6, [cq+16*12] |
| vbroadcasti128 m3, [cq+16*14] |
| shufpd m0, m4, 0x0c ; 0 4 |
| shufpd m1, m5, 0x0c ; 2 10 |
| shufpd m2, m6, 0x0c ; 8 12 |
| shufpd m3, m7, 0x0c ; 14 6 |
| call .pass1_main |
| vbroadcasti128 m10, [cq+16* 1] |
| vbroadcasti128 m4, [cq+16* 5] |
| vbroadcasti128 m11, [cq+16*15] |
| vbroadcasti128 m5, [cq+16*11] |
| shufpd m10, m4, 0x0c ; 1 5 |
| shufpd m11, m5, 0x0c ; 15 11 |
| vbroadcasti128 m5, [cq+16* 9] |
| vbroadcasti128 m4, [cq+16*13] |
| shufpd m5, m4, 0x0c ; 9 13 |
| vbroadcasti128 m6, [cq+16* 7] |
| vbroadcasti128 m4, [cq+16* 3] |
| shufpd m6, m4, 0x0c ; 7 3 |
| call .pass1_main2 |
| pcmpeqd m4, m4 |
| REPX {psubd x, m4}, m0, m1, m2, m3 |
| call .pass1_main3 |
| REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| jmp tx2q |
| .pass2: |
| call .transpose_4x16_packed |
| lea r6, [deint_shuf+128] |
| call m(idct_16x4_internal_8bpc).main |
| .end: |
| vpbroadcastd m4, [pw_2048] |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| vpbroadcastd m5, [pixel_10bpc_max] |
| .end2: |
| paddw m0, [dstq+strideq*0] |
| paddw m1, [dstq+strideq*1] |
| .end3: |
| lea r6, [dstq+strideq*2] |
| paddw m2, [r6 +strideq*0] |
| paddw m3, [r6 +strideq*1] |
| pxor m4, m4 |
| REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 |
| REPX {pmaxsw x, m4}, m0, m1, m2, m3 |
| REPX {pminsw x, m5}, m0, m1, m2, m3 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [r6 +strideq*0], m2 |
| mova [r6 +strideq*1], m3 |
| RET |
| ALIGN function_align |
| .pass1_main: |
| vpbroadcastd m7, [pd_2048] |
| call m(idct_8x4_internal_10bpc).main |
| psubd m3, m0, m4 ; idct8 out7 out6 |
| paddd m0, m4 ; idct8 out0 out1 |
| paddd m1, m2, m5 ; idct8 out3 out2 |
| psubd m2, m5 ; idct8 out4 out5 |
| ret |
| ALIGN function_align |
| .pass1_main2: |
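| ; rows are processed in interleaved pairs; pd_3784_m3784 holds |
| ; {3784, 3784, -3784, -3784} per lane, so a single psignd (and later |
| ; pmulld) applies the opposite butterfly signs the two halves need |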
| ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 |
| ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 |
| vbroadcasti128 m12, [pd_3784_m3784] |
| psubd m4, m10, m5 |
| paddd m10, m5 ; t8 t11 |
| psignd m4, m12 ; t9 t10 |
| psubd m5, m11, m6 |
| paddd m11, m6 ; t15 t12 |
| psignd m5, m12 ; t14 t13 |
| vpbroadcastd m6, [pd_1567] |
| vpbroadcastd m13, [pd_3784] |
| REPX {pmaxsd x, m8}, m5, m4 |
| REPX {pminsd x, m9}, m5, m4 |
| pmulld m12, m5 |
| pmulld m5, m6 |
| vbroadcasti128 m6, [pd_1567_m1567] |
| pmulld m13, m4 |
| pmulld m4, m6 |
| REPX {pmaxsd x, m8}, m10, m11, m0, m1 |
| REPX {pminsd x, m9}, m10, m11, m0, m1 |
| paddd m12, m7 |
| paddd m5, m7 |
| paddd m4, m12 |
| psubd m5, m13 |
| psrad m4, 12 ; t14a t10a |
| psrad m5, 12 ; t9a t13a |
| vpbroadcastd m12, [pd_2896] |
| punpckhqdq m6, m11, m5 |
| punpcklqdq m11, m4 |
| punpckhqdq m4, m10, m4 |
| punpcklqdq m10, m5 |
| psubd m5, m11, m6 ; t12a t13 |
| paddd m11, m6 ; t15a t14 |
| psubd m6, m10, m4 ; t11a t10 |
| paddd m10, m4 ; t8a t9 |
| REPX {pmaxsd x, m8}, m5, m6 |
| REPX {pminsd x, m9}, m5, m6 |
| pmulld m5, m12 |
| pmulld m6, m12 |
| REPX {pmaxsd x, m8}, m2, m3, m11, m10 |
| REPX {pminsd x, m9}, m2, m3, m11, m10 |
| ret |
| ALIGN function_align |
| .pass1_main3: |
| paddd m5, m7 |
| psubd m4, m5, m6 |
| paddd m5, m6 |
| psrad m4, 12 ; t11 t10a |
| psrad m5, 12 ; t12 t13a |
| psubd m7, m0, m11 ; out15 out14 |
| paddd m0, m11 ; out0 out1 |
| psubd m6, m1, m5 ; out12 out13 |
| paddd m1, m5 ; out3 out2 |
| psubd m5, m2, m4 ; out11 out10 |
| paddd m2, m4 ; out4 out5 |
| psubd m4, m3, m10 ; out8 out9 |
| paddd m3, m10 ; out7 out6 |
| REPX {pshufd x, x, q1032}, m1, m3, m5, m7 |
| ret |
| ALIGN function_align |
| .transpose_4x16_packed: |
| vbroadcasti128 m8, [deint_shuf] |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| REPX {pshufb x, m8}, m0, m2, m4, m6 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpckhqdq m2, m4, m6 |
| punpcklqdq m4, m6 |
| vperm2i128 m3, m1, m2, 0x31 |
| vinserti128 m1, xm2, 1 |
| vperm2i128 m2, m0, m4, 0x31 |
| vinserti128 m0, xm4, 1 |
| ret |
| |
| INV_TXFM_16X4_FN adst, dct |
| INV_TXFM_16X4_FN adst, adst |
| INV_TXFM_16X4_FN adst, flipadst |
| INV_TXFM_16X4_FN adst, identity |
| |
| cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| call m(iadst_4x16_internal_10bpc).main |
| psrad m11, 11 ; pd_1 |
| REPX {paddd x, m11}, m0, m1, m2, m3 |
| paddd m4, m5, m11 |
| paddd m5, m6, m11 |
| paddd m6, m7, m11 |
| paddd m7, m8, m11 |
| .pass1_end: |
| REPX {pshufd x, x, q1032}, m0, m2, m4, m6 |
| REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 |
| jmp tx2q |
| .pass2: |
| call m(idct_16x4_internal_10bpc).transpose_4x16_packed |
| lea r6, [deint_shuf+128] |
| call m(iadst_16x4_internal_8bpc).main |
| jmp m(idct_16x4_internal_10bpc).end |
| ALIGN function_align |
| .main: |
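| ; av1 adst4, with each input row split across two registers (16 cols): |
| ; t0 = 1321*in0 + 3803*in2 + 2482*in3 |
| ; t1 = 2482*in0 - 1321*in2 - 3803*in3 |
| ; t2 = 3344*(in0 - in2 + in3) |
| ; t3 = 3344*in1 (kept negated below) |
| ; out0 = t0+t3, out1 = t1+t3, out2 = t2, out3 = t0+t1-t3 |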
| vpbroadcastd m6, [pd_1321] |
| mova m0, [cq+32*0] |
| mova m1, [cq+32*1] |
| vpbroadcastd m7, [pd_2482] |
| mova m2, [cq+32*6] |
| mova m3, [cq+32*7] |
| pmulld m4, m0, m6 |
| pmulld m5, m1, m6 ; 1321*in0 |
| pmulld m9, m2, m7 |
| pmulld m8, m3, m7 ; 2482*in3 |
| paddd m4, m9 |
| paddd m8, m5 ; 1321*in0 + 2482*in3 |
| pmulld m5, m0, m7 |
| pmulld m9, m1, m7 ; 2482*in0 |
| paddd m0, m2 |
| paddd m1, m3 ; in0 + in3 |
| paddd m7, m6 ; pd_3803 |
| pmulld m2, m7 |
| pmulld m3, m7 ; 3803*in3 |
| psubd m5, m2 |
| psubd m9, m3 ; 2482*in0 - 3803*in3 |
| mova m2, [cq+32*4] |
| pmulld m10, m7, m2 |
| pmulld m3, m6, m2 |
| psubd m2, m0 |
| mova m0, [cq+32*5] |
| pmulld m7, m0 ; 3803*in2 |
| pmulld m6, m0 ; 1321*in2 |
| psubd m0, m1 ; in2 - in0 - in3 |
| vpbroadcastd m1, [pd_m3344] |
| paddd m4, m10 |
| paddd m7, m8 ; t0 |
| psubd m5, m3 |
| psubd m9, m6 ; t1 |
| pmulld m2, m1 |
| pmulld m0, m1 ; t2 |
| pmulld m3, m1, [cq+32*2] |
| pmulld m1, [cq+32*3] ; -t3 |
| ret |
| ALIGN function_align |
| .main_end: |
| ; expects: m6 = rnd |
| paddd m5, m6 |
| paddd m9, m6 |
| paddd m10, m4, m5 |
| paddd m4, m6 |
| paddd m8, m7, m6 |
| paddd m7, m9 |
| psubd m4, m3 ; out0 (unshifted) |
| psubd m5, m3 ; out1 (unshifted) |
| paddd m2, m6 ; out2 (unshifted) |
| paddd m3, m10 ; out3 (unshifted) |
| psubd m8, m1 ; out4 (unshifted) |
| psubd m9, m1 ; out5 (unshifted) |
| paddd m6, m0 ; out6 (unshifted) |
| paddd m7, m1 ; out7 (unshifted) |
| ret |
| |
| INV_TXFM_16X4_FN flipadst, dct |
| INV_TXFM_16X4_FN flipadst, adst |
| INV_TXFM_16X4_FN flipadst, flipadst |
| INV_TXFM_16X4_FN flipadst, identity |
| |
| cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| call m(iadst_4x16_internal_10bpc).main |
| psrad m11, 11 ; pd_1 |
| paddd m4, m3, m11 |
| paddd m3, m5, m11 |
| paddd m5, m2, m11 |
| paddd m2, m6, m11 |
| paddd m6, m1, m11 |
| paddd m1, m7, m11 |
| paddd m7, m0, m11 |
| paddd m0, m8, m11 |
| jmp m(iadst_16x4_internal_10bpc).pass1_end |
| .pass2: |
| call m(idct_16x4_internal_10bpc).transpose_4x16_packed |
| lea r6, [deint_shuf+128] |
| call m(iadst_16x4_internal_8bpc).main |
| vpbroadcastd m4, [pw_2048] |
| pmulhrsw m5, m3, m4 |
| pmulhrsw m6, m2, m4 |
| pmulhrsw m2, m1, m4 |
| pmulhrsw m3, m0, m4 |
| paddw m0, m5, [dstq+strideq*0] |
| paddw m1, m6, [dstq+strideq*1] |
| vpbroadcastd m5, [pixel_10bpc_max] |
| jmp m(idct_16x4_internal_10bpc).end3 |
| |
| INV_TXFM_16X4_FN identity, dct |
| INV_TXFM_16X4_FN identity, adst |
| INV_TXFM_16X4_FN identity, flipadst |
| INV_TXFM_16X4_FN identity, identity |
| |
| cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [pd_5793] |
| vpermq m0, [cq+32*0], q3120 ; 0 1 |
| vpermq m1, [cq+32*1], q3120 ; 2 3 |
| vpermq m2, [cq+32*2], q3120 ; 4 5 |
| vpermq m3, [cq+32*3], q3120 ; 6 7 |
| vpermq m4, [cq+32*4], q3120 ; 8 9 |
| vpermq m5, [cq+32*5], q3120 ; a b |
| vpermq m6, [cq+32*6], q3120 ; c d |
| vpermq m7, [cq+32*7], q3120 ; e f |
| vpbroadcastd m9, [pd_3072] |
| REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| jmp tx2q |
| .pass2: |
| call m(idct_16x4_internal_10bpc).transpose_4x16_packed |
| vpbroadcastd m7, [pw_1697x8] |
| pmulhrsw m4, m7, m0 |
| pmulhrsw m5, m7, m1 |
| pmulhrsw m6, m7, m2 |
| pmulhrsw m7, m3 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| paddsw m2, m6 |
| paddsw m3, m7 |
| jmp m(idct_16x4_internal_10bpc).end |
| |
| INV_TXFM_16X4_FN dct, dct, 12 |
| INV_TXFM_16X4_FN dct, identity, 12 |
| INV_TXFM_16X4_FN dct, adst, 12 |
| INV_TXFM_16X4_FN dct, flipadst, 12 |
| |
| cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [clip_20b_min] |
| vpbroadcastd m9, [clip_20b_max] |
| jmp m(idct_16x4_internal_10bpc).pass1 |
| .pass2: |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| ; deinterleave |
| REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 |
| ; transpose |
| punpcklqdq m8, m0, m1 |
| punpckhqdq m0, m1 |
| punpcklqdq m9, m2, m3 |
| punpckhqdq m2, m3 |
| punpcklqdq m10, m4, m5 |
| punpckhqdq m4, m5 |
| punpcklqdq m11, m6, m7 |
| punpckhqdq m6, m7 |
| vperm2i128 m3, m0, m2, 0x31 ; out6 |
| vperm2i128 m1, m0, m2, 0x20 ; out2 |
| vperm2i128 m7, m4, m6, 0x31 ; out7 |
| vperm2i128 m5, m4, m6, 0x20 ; out3 |
| vperm2i128 m13, m10, m11, 0x31 ; out5 |
| vperm2i128 m12, m10, m11, 0x20 ; out1 |
| vperm2i128 m11, m8, m9, 0x31 ; out4 |
| vperm2i128 m10, m8, m9, 0x20 ; out0 |
| call m(idct_4x16_internal_10bpc).pass1_main |
| pmulld m0, m6, m10 |
| pmulld m2, m6, m11 |
| pmulld m4, m6, m12 |
| pmulld m6, m13 |
| vpbroadcastd m10, [pd_17408] |
| call m(idct_4x16_internal_10bpc).pass1_main2 |
| REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| packssdw m3, m7 |
| vpbroadcastd m5, [pixel_12bpc_max] |
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3 |
| jmp m(idct_16x4_internal_10bpc).end2 |
| |
| INV_TXFM_16X4_FN adst, dct, 12 |
| INV_TXFM_16X4_FN adst, adst, 12 |
| INV_TXFM_16X4_FN adst, flipadst, 12 |
| INV_TXFM_16X4_FN adst, identity, 12 |
| |
| cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(iadst_16x4_internal_10bpc).pass1 |
| .pass2: |
| call .pass2_main |
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3 |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| jmp m(idct_16x4_internal_10bpc).end2 |
| ALIGN function_align |
| .pass2_main: |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7 |
| pmaxsd m8, m4, m12 |
| pmaxsd m9, m5, m12 |
| REPX {pminsd x, m13}, m0, m1, m2, m3 |
| call m(iadst_8x4_internal_12bpc).transpose_4x8 |
| mova [cq+32*0], m0 |
| mova [cq+32*2], m1 |
| mova [cq+32*4], m2 |
| mova [cq+32*6], m3 |
| pminsd m0, m8, m13 |
| pminsd m1, m9, m13 |
| pminsd m2, m6, m13 |
| pminsd m3, m7, m13 |
| call m(iadst_8x4_internal_12bpc).transpose_4x8 |
| mova [cq+32*1], m0 |
| mova [cq+32*3], m1 |
| mova [cq+32*5], m2 |
| mova [cq+32*7], m3 |
| call m(iadst_16x4_internal_10bpc).main |
| vpbroadcastd m6, [pd_2048] |
| call m(iadst_16x4_internal_10bpc).main_end |
| psrad m0, m4, 15 |
| psrad m1, m5, 15 |
| psrad m2, 15 |
| psrad m3, 15 |
| psrad m4, m8, 15 |
| psrad m5, m9, 15 |
| psrad m6, 15 |
| psrad m7, 15 |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| packssdw m3, m7 |
| vpbroadcastd m4, [pw_16384] |
| vpbroadcastd m5, [pixel_12bpc_max] |
| ret |
| |
| INV_TXFM_16X4_FN flipadst, dct, 12 |
| INV_TXFM_16X4_FN flipadst, adst, 12 |
| INV_TXFM_16X4_FN flipadst, flipadst, 12 |
| INV_TXFM_16X4_FN flipadst, identity, 12 |
| |
| cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(iflipadst_16x4_internal_10bpc).pass1 |
| .pass2: |
| call m(iadst_16x4_internal_12bpc).pass2_main |
| vpermq m7, m0, q3120 |
| vpermq m6, m1, q3120 |
| vpermq m1, m2, q3120 |
| vpermq m0, m3, q3120 |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| pmulhrsw m2, m6, m4 |
| pmulhrsw m3, m7, m4 |
| jmp m(idct_16x4_internal_10bpc).end2 |
| |
| INV_TXFM_16X4_FN identity, dct, 12 |
| INV_TXFM_16X4_FN identity, adst, 12 |
| INV_TXFM_16X4_FN identity, flipadst, 12 |
| INV_TXFM_16X4_FN identity, identity, 12 |
| |
| cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 |
| vpbroadcastd m8, [pd_1697] |
| vpermq m0, [cq+32*0], q3120 ; 0 1 |
| vpermq m1, [cq+32*1], q3120 ; 2 3 |
| vpermq m2, [cq+32*2], q3120 ; 4 5 |
| vpermq m3, [cq+32*3], q3120 ; 6 7 |
| vpbroadcastd m9, [pd_3072] |
| pmulld m4, m8, m0 |
| pmulld m5, m8, m1 |
| pmulld m6, m8, m2 |
| pmulld m7, m8, m3 |
| vpermq m10, [cq+32*4], q3120 ; 8 9 |
| vpermq m11, [cq+32*5], q3120 ; a b |
| vpermq m12, [cq+32*6], q3120 ; c d |
| vpermq m13, [cq+32*7], q3120 ; e f |
| REPX {paddd x, m9}, m4, m5, m6, m7 |
| REPX {psrad x, 12}, m4, m5, m6, m7 |
| paddd m0, m4 |
| pmulld m4, m8, m10 |
| paddd m1, m5 |
| pmulld m5, m8, m11 |
| paddd m2, m6 |
| pmulld m6, m8, m12 |
| paddd m3, m7 |
| pmulld m7, m8, m13 |
| REPX {paddd x, m9}, m4, m5, m6, m7 |
| REPX {psrad x, 12}, m4, m5, m6, m7 |
| paddd m4, m10 |
| paddd m5, m11 |
| paddd m6, m12 |
| paddd m7, m13 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| vpbroadcastd m8, [pd_5793] |
| vpbroadcastd m9, [pd_2048] |
| REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_16x4_internal_10bpc).transpose_4x16_packed |
| vpbroadcastd m4, [pw_16384] |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| vpbroadcastd m5, [pixel_12bpc_max] |
| jmp m(idct_16x4_internal_10bpc).end2 |
| |
| %macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth |
| INV_TXFM_FN %1, %2, 0, 16x8, %3 |
| %ifidn %1_%2, dct_dct |
| imul r6d, [cq], 181 ; 181/128 ~ sqrt(2) |
| vpbroadcastd m3, [dconly_%3bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 8 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 |
| %endif |
| %endmacro |
| |
| INV_TXFM_16X8_FN dct, dct |
| INV_TXFM_16X8_FN dct, identity |
| INV_TXFM_16X8_FN dct, adst |
| INV_TXFM_16X8_FN dct, flipadst |
| |
| cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| vpbroadcastd m14, [pd_2896] |
| pmulld m0, m14, [cq+32* 1] |
| pmulld m1, m14, [cq+32* 3] |
| pmulld m2, m14, [cq+32* 5] |
| pmulld m3, m14, [cq+32* 7] |
| pmulld m4, m14, [cq+32* 9] |
| pmulld m5, m14, [cq+32*11] |
| pmulld m6, m14, [cq+32*13] |
| pmulld m7, m14, [cq+32*15] |
| vpbroadcastd m11, [pd_2048] |
| lea r6, [rsp+32*4] |
| call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 |
| pmulld m0, m14, [cq+32* 0] |
| pmulld m1, m14, [cq+32* 2] |
| pmulld m2, m14, [cq+32* 4] |
| pmulld m3, m14, [cq+32* 6] |
| pmulld m4, m14, [cq+32* 8] |
| pmulld m5, m14, [cq+32*10] |
| pmulld m6, m14, [cq+32*12] |
| pmulld m7, m14, [cq+32*14] |
| call m(idct_8x8_internal_10bpc).main_rect2 |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| psrld m11, 11 ; pd_1 |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call .pass1_rotations |
| REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| jmp tx2q |
| .pass2: |
| call .transpose |
| call m(idct_16x8_internal_8bpc).main |
| vpbroadcastd m10, [pw_2048] |
| .end: |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| pmulhrsw m2, m10 |
| pmulhrsw m3, m10 |
| call .write_16x4_start |
| .end2: |
| pmulhrsw m0, m4, m10 |
| pmulhrsw m1, m5, m10 |
| pmulhrsw m2, m6, m10 |
| pmulhrsw m3, m7, m10 |
| call .write_16x4_zero |
| RET |
| ALIGN function_align |
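| ; final idct16 butterflies: even half in m0-m7, odd half spilled at |
| ; [r6-32*4..r6+32*3]; sums give out0-7, differences give out8-15 |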
| .pass1_rotations: |
| mova m14, [r6-32*4] |
| mova m13, [r6-32*3] |
| mova m12, [r6-32*2] |
| mova m11, [r6-32*1] |
| mova m10, [r6+32*0] |
| mova m9, [r6+32*1] |
| mova m8, [r6+32*2] |
| psubd m15, m0, m14 ; out15 |
| paddd m0, m14 ; out0 |
| psubd m14, m1, m13 ; out14 |
| paddd m1, m13 ; out1 |
| psubd m13, m2, m12 ; out13 |
| paddd m2, m12 ; out2 |
| psubd m12, m3, m11 ; out12 |
| paddd m3, m11 ; out3 |
| psubd m11, m4, m10 ; out11 |
| paddd m4, m10 ; out4 |
| psubd m10, m5, m9 ; out10 |
| paddd m5, m9 ; out5 |
| psubd m9, m6, m8 ; out9 |
| paddd m6, m8 ; out6 |
| psubd m8, m7, [r6+32*3] ; out8 |
| paddd m7, [r6+32*3] ; out7 |
| ret |
| ALIGN function_align |
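| ; pass-2 transpose: packs the 16 dword rows into 8 word registers, |
| ; interleaves at word/dword/qword granularity, then fixes up the |
| ; 128-bit lanes with vperm2i128/vinserti128 |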
| .transpose: |
| lea r6, [deint_shuf+128] |
| .transpose2: |
| packssdw m0, m8 |
| packssdw m1, m9 |
| packssdw m2, m10 |
| packssdw m3, m11 |
| packssdw m4, m12 |
| packssdw m5, m13 |
| packssdw m6, m14 |
| packssdw m7, m15 |
| .transpose3: |
| punpckhwd m8, m0, m1 |
| punpcklwd m0, m1 |
| punpcklwd m1, m2, m3 |
| punpckhwd m2, m3 |
| punpckhwd m3, m4, m5 |
| punpcklwd m4, m5 |
| punpckhwd m5, m6, m7 |
| punpcklwd m6, m7 |
| punpckhdq m7, m4, m6 |
| punpckldq m4, m6 |
| punpckldq m6, m8, m2 |
| punpckhdq m8, m2 |
| punpckhdq m2, m0, m1 |
| punpckldq m0, m1 |
| punpckhdq m1, m3, m5 |
| punpckldq m3, m5 |
| punpcklqdq m5, m6, m3 |
| punpckhqdq m6, m3 |
| punpckhqdq m3, m2, m7 |
| punpcklqdq m2, m7 |
| punpcklqdq m7, m8, m1 |
| punpckhqdq m8, m1 |
| punpckhqdq m1, m0, m4 |
| punpcklqdq m0, m4 |
| vperm2i128 m4, m0, m5, 0x31 |
| vinserti128 m0, xm5, 1 |
| vperm2i128 m5, m1, m6, 0x31 |
| vinserti128 m1, xm6, 1 |
| vperm2i128 m6, m2, m7, 0x31 |
| vinserti128 m2, xm7, 1 |
| vperm2i128 m7, m3, m8, 0x31 |
| vinserti128 m3, xm8, 1 |
| ret |
| ALIGN function_align |
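| ; writes 4 rows of 16 pixels: m8 = zero (also used to clear the |
| ; consumed coefficients), m9 = pixel max; residuals are added to dst |
| ; and clamped to [0, max] |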
| .write_16x4_start: |
| vpbroadcastd m9, [pixel_10bpc_max] |
| lea r3, [strideq*3] |
| pxor m8, m8 |
| .write_16x4_zero: |
| REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7 |
| add cq, 32*8 |
| .write_16x4: |
| paddw m0, [dstq+strideq*0] |
| paddw m1, [dstq+strideq*1] |
| paddw m2, [dstq+strideq*2] |
| paddw m3, [dstq+r3 ] |
| REPX {pmaxsw x, m8}, m0, m1, m2, m3 |
| REPX {pminsw x, m9}, m0, m1, m2, m3 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+r3 ], m3 |
| lea dstq, [dstq+strideq*4] |
| ret |
| |
| INV_TXFM_16X8_FN adst, dct |
| INV_TXFM_16X8_FN adst, adst |
| INV_TXFM_16X8_FN adst, flipadst |
| INV_TXFM_16X8_FN adst, identity |
| |
| cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m13, [clip_18b_min] |
| vpbroadcastd m14, [clip_18b_max] |
| .pass1: |
| lea r6, [rsp+32*4] |
| call .main |
| vpbroadcastd m14, [pd_3072] |
| psrld m15, 11 ; pd_1 |
| psubd m13, m14, m15 ; pd_3071 |
| call .pass1_rotations |
| .pass1_end: |
| REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 |
| REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 |
| jmp tx2q |
| .pass2: |
| call m(idct_16x8_internal_10bpc).transpose |
| call m(iadst_16x8_internal_8bpc).main |
| call m(iadst_16x8_internal_8bpc).main_pass2_end |
| vpbroadcastd m10, [pw_2048] |
| pxor m11, m11 |
| psubw m11, m10 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m11 |
| pmulhrsw m2, m10 |
| pmulhrsw m3, m11 |
| call m(idct_16x8_internal_10bpc).write_16x4_start |
| pmulhrsw m0, m4, m10 |
| pmulhrsw m1, m5, m11 |
| pmulhrsw m2, m6, m10 |
| pmulhrsw m3, m7, m11 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| RET |
| ALIGN function_align |
| .pass1_rotations: |
| paddd m0, m15 |
| psubd m1, m15, m1 |
| paddd m2, m15 |
| psubd m3, m15, m3 |
| paddd m4, m14 |
| psubd m5, m13, m5 |
| paddd m6, m14 |
| psubd m7, m13, m7 |
| paddd m8, m14, m9 |
| psubd m9, m13, m10 |
| paddd m10, m14, m11 |
| psubd m11, m13, m12 |
| paddd m12, m15, [r6-32*1] |
| psubd m13, m15, [r6-32*2] |
| paddd m14, m15, [r6-32*3] |
| psubd m15, [r6-32*4] |
| ret |
| ALIGN function_align |
| .main: |
| ; expects: m13 = clip_min, m14 = clip_max |
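| ; loads the adst16 input permutation in two batches, prescaling each |
| ; coefficient by 2896 with 12-bit rounding (rect2 scale, 2896/4096 ~ |
| ; 1/sqrt(2)), and runs them through .main_part1/.main_part2 |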
| vpbroadcastd m15, [pd_2896] |
| pmulld m0, m15, [cq+32* 2] |
| pmulld m1, m15, [cq+32*13] |
| pmulld m2, m15, [cq+32* 6] |
| pmulld m3, m15, [cq+32* 9] |
| pmulld m4, m15, [cq+32*10] |
| pmulld m5, m15, [cq+32* 5] |
| pmulld m6, m15, [cq+32*14] |
| pmulld m7, m15, [cq+32* 1] |
| vpbroadcastd m12, [pd_2048] |
| REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| call .main_part1 |
| pmulld m0, m15, [cq+32* 0] |
| pmulld m1, m15, [cq+32*15] |
| pmulld m2, m15, [cq+32* 4] |
| pmulld m3, m15, [cq+32*11] |
| pmulld m4, m15, [cq+32* 8] |
| pmulld m5, m15, [cq+32* 7] |
| pmulld m6, m15, [cq+32*12] |
| pmulld m7, m15, [cq+32* 3] |
| REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| .main_part2: |
| ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091 |
| ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703 |
| ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751 |
| ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380 |
| psubd m8, m0, m4 ; t8a |
| paddd m0, m4 ; t0a |
| psubd m4, m1, m5 ; t9a |
| paddd m1, m5 ; t1a |
| psubd m5, m2, m6 ; t12a |
| paddd m2, m6 ; t4a |
| psubd m6, m3, m7 ; t13a |
| paddd m7, m3 ; t5a |
| REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 |
| REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 |
| vpbroadcastd m11, [pd_4017] |
| vpbroadcastd m10, [pd_799] |
| ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 |
| ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 |
| psubd m3, m0, m2 ; t4 |
| paddd m0, m2 ; t0 |
| psubd m2, m1, m7 ; t5 |
| paddd m1, m7 ; t1 |
| psubd m7, m4, m6 ; t12a |
| paddd m4, m6 ; t8a |
| psubd m6, m8, m5 ; t13a |
| paddd m5, m8 ; t9a |
| REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 |
| REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5 |
| vpbroadcastd m11, [pd_3784] |
| vpbroadcastd m10, [pd_1567] |
| ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11 |
| ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11 |
| pminsd m10, m14, [r6-32*4] ; t2 |
| pminsd m8, m14, [r6-32*3] ; t3 |
| psubd m9, m0, m10 ; t2a |
| paddd m0, m10 ; out0 |
| psubd m10, m1, m8 ; t3a |
| paddd m1, m8 ; -out15 |
| pmaxsd m9, m13 |
| pmaxsd m10, m13 |
| pminsd m9, m14 |
| pminsd m10, m14 |
| mova [r6-32*4], m1 |
| mova m11, [r6-32*1] ; t7a |
| mova m1, [r6-32*2] ; t6a |
| psubd m8, m3, m11 ; t7 |
| paddd m11, m3 ; out12 |
| paddd m3, m2, m1 ; -out3 |
| psubd m2, m1 ; t6 |
| pmaxsd m8, m13 |
| pmaxsd m2, m13 |
| pminsd m8, m14 |
| pminsd m2, m14 |
| mova [r6-32*1], m11 |
| mova [r6-32*3], m2 |
| mova m1, [r6+32*3] ; t15 |
| mova m2, [r6+32*2] ; t14 |
| paddd m12, m7, m1 ; -out13 |
| psubd m7, m1 ; t15a |
| psubd m11, m6, m2 ; t14a |
| paddd m2, m6 ; out2 |
| pmaxsd m7, m13 |
| pmaxsd m11, m13 |
| pminsd m7, m14 |
| pminsd m11, m14 |
| mova [r6-32*2], m12 |
| pminsd m1, m14, [r6+32*0] ; t10a |
| pminsd m12, m14, [r6+32*1] ; t11a |
| psubd m6, m4, m1 ; t10 |
| paddd m1, m4 ; -out1 |
| psubd m4, m5, m12 ; t11 |
| paddd m5, m12 ; out14 |
| vpbroadcastd m12, [pd_1448] |
| pmaxsd m6, m13 |
| pmaxsd m4, m13 |
| pminsd m6, m14 |
| pminsd m4, m14 |
| REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 |
| pmulld m12, [r6-32*3] ; t6 |
| mova [r6-32*3], m5 |
| paddd m5, m11, m7 ; -out5 (unshifted) |
| psubd m11, m7 ; out10 (unshifted) |
| paddd m7, m9, m10 ; -out7 (unshifted) |
| psubd m9, m10 ; out8 (unshifted) |
| psubd m10, m6, m4 ; -out9 (unshifted) |
| paddd m6, m4 ; out6 (unshifted) |
| paddd m4, m12, m8 ; out4 (unshifted) |
| psubd m12, m8 ; -out11 (unshifted) |
| ret |
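| ; first half of the adst16 network; results are spilled to |
| ; [r6-32*4..r6+32*3] for .main_part2 to combine |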
| .main_part1: |
| ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973 |
| ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290 |
| ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106 |
| ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601 |
| psubd m8, m0, m4 ; t10a |
| paddd m0, m4 ; t2a |
| psubd m4, m1, m5 ; t11a |
| paddd m1, m5 ; t3a |
| psubd m5, m2, m6 ; t14a |
| paddd m2, m6 ; t6a |
| psubd m6, m3, m7 ; t15a |
| paddd m7, m3 ; t7a |
| REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 |
| REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 |
| vpbroadcastd m11, [pd_2276] |
| vpbroadcastd m10, [pd_3406] |
| ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 |
| ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 |
| psubd m3, m0, m2 ; t6 |
| paddd m0, m2 ; t2 |
| psubd m2, m1, m7 ; t7 |
| paddd m1, m7 ; t3 |
| psubd m7, m4, m6 ; t14a |
| paddd m4, m6 ; t10a |
| psubd m6, m8, m5 ; t15a |
| paddd m5, m8 ; t11a |
| REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 |
| REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later |
| vpbroadcastd m11, [pd_1567] |
| vpbroadcastd m10, [pd_3784] |
| ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11 |
| ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11 |
| mova [r6-32*4], m0 |
| mova [r6-32*3], m1 |
| mova [r6+32*0], m4 |
| mova [r6+32*1], m5 |
| mova [r6-32*2], m2 |
| mova [r6-32*1], m3 |
| mova [r6+32*2], m6 |
| mova [r6+32*3], m7 |
| ret |
| |
| INV_TXFM_16X8_FN flipadst, dct |
| INV_TXFM_16X8_FN flipadst, adst |
| INV_TXFM_16X8_FN flipadst, flipadst |
| INV_TXFM_16X8_FN flipadst, identity |
| |
| cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m13, [clip_18b_min] |
| vpbroadcastd m14, [clip_18b_max] |
| .pass1: |
| lea r6, [rsp+32*4] |
| call m(iadst_16x8_internal_10bpc).main |
| vpbroadcastd m14, [pd_3072] |
| psrld m15, 11 ; pd_1 |
| psubd m13, m14, m15 ; pd_3071 |
| call .pass1_rotations |
| jmp m(iadst_16x8_internal_10bpc).pass1_end |
| .pass2: |
| call m(idct_16x8_internal_10bpc).transpose |
| call m(iadst_16x8_internal_8bpc).main |
| call m(iadst_16x8_internal_8bpc).main_pass2_end |
| vpbroadcastd m10, [pw_2048] |
| pxor m11, m11 |
| psubw m11, m10 |
| mova m12, m0 |
| pmulhrsw m0, m7, m11 |
| mova m7, m1 |
| pmulhrsw m1, m6, m10 |
| mova m6, m2 |
| pmulhrsw m2, m5, m11 |
| mova m5, m3 |
| pmulhrsw m3, m4, m10 |
| call m(idct_16x8_internal_10bpc).write_16x4_start |
| pmulhrsw m0, m5, m11 |
| pmulhrsw m1, m6, m10 |
| pmulhrsw m2, m7, m11 |
| pmulhrsw m3, m12, m10 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| RET |
| ALIGN function_align |
| .pass1_rotations: |
| psubd m8, m13, m7 |
| paddd m7, m14, m9 |
| paddd m9, m14, m6 |
| psubd m6, m13, m10 |
| psubd m10, m13, m5 |
| paddd m5, m14, m11 |
| paddd m11, m14, m4 |
| psubd m4, m13, m12 |
| psubd m12, m15, m3 |
| paddd m3, m15, [r6-32*1] |
| paddd m13, m15, m2 |
| psubd m2, m15, [r6-32*2] |
| psubd m14, m15, m1 |
| mova m1, m15 |
| paddd m15, m0 |
| psubd m0, m1, [r6-32*4] |
| paddd m1, [r6-32*3] |
| ret |
| |
| INV_TXFM_16X8_FN identity, dct |
| INV_TXFM_16X8_FN identity, adst |
| INV_TXFM_16X8_FN identity, flipadst |
| INV_TXFM_16X8_FN identity, identity |
| |
| cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| .pass1: |
| vpbroadcastd m15, [pd_2896] |
| pmulld m0, m15, [cq+32* 0] |
| pmulld m1, m15, [cq+32* 1] |
| pmulld m2, m15, [cq+32* 2] |
| pmulld m3, m15, [cq+32* 3] |
| pmulld m4, m15, [cq+32* 4] |
| pmulld m5, m15, [cq+32* 5] |
| pmulld m6, m15, [cq+32* 6] |
| pmulld m7, m15, [cq+32* 7] |
| pmulld m8, m15, [cq+32* 8] |
| pmulld m9, m15, [cq+32* 9] |
| pmulld m10, m15, [cq+32*10] |
| pmulld m11, m15, [cq+32*11] |
| pmulld m12, m15, [cq+32*12] |
| pmulld m13, m15, [cq+32*13] |
| pmulld m14, m15, [cq+32*14] |
| pmulld m15, [cq+32*15] |
| mova [rsp], m7 |
| vpbroadcastd m7, [pd_2048] |
| REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| paddd m7, [rsp] |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| mova [rsp], m15 |
| vpbroadcastd m15, [pd_5793] |
| REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14 |
| pmulld m15, [rsp] |
| mova [rsp], m7 |
| vpbroadcastd m7, [pd_3072] |
| REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| paddd m7, [rsp] |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| jmp tx2q |
| .pass2: |
| call m(idct_16x8_internal_10bpc).transpose |
| vpbroadcastd m10, [pw_4096] |
| jmp m(idct_16x8_internal_10bpc).end |
| |
| INV_TXFM_16X8_FN dct, dct, 12 |
| INV_TXFM_16X8_FN dct, identity, 12 |
| INV_TXFM_16X8_FN dct, adst, 12 |
| INV_TXFM_16X8_FN dct, flipadst, 12 |
| |
| cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(idct_16x8_internal_10bpc).pass1 |
| .pass2: |
| call .pass2_main |
| RET |
| ALIGN function_align |
| .pass2_main: |
| call m(idct_8x16_internal_12bpc).transpose |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| vpbroadcastd m11, [pd_2048] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x8_internal_12bpc).round_shift4 |
| mova [cq+32* 8], m0 |
| mova [cq+32* 9], m1 |
| mova [cq+32*10], m2 |
| mova [cq+32*11], m3 |
| mova [cq+32*12], m4 |
| mova [cq+32*13], m5 |
| mova [cq+32*14], m6 |
| mova [cq+32*15], m7 |
| pmaxsd m0, m12, [cq+32*0] |
| pmaxsd m1, m12, [cq+32*1] |
| pmaxsd m2, m12, [cq+32*2] |
| pmaxsd m3, m12, [cq+32*3] |
| pmaxsd m4, m12, [cq+32*4] |
| pmaxsd m5, m12, [cq+32*5] |
| pmaxsd m6, m12, [cq+32*6] |
| pmaxsd m7, m12, [cq+32*7] |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x8_internal_12bpc).round_shift4 |
| .end: |
| packssdw m0, [cq+32* 8] |
| packssdw m1, [cq+32* 9] |
| packssdw m2, [cq+32*10] |
| packssdw m3, [cq+32*11] |
| packssdw m4, [cq+32*12] |
| packssdw m5, [cq+32*13] |
| packssdw m6, [cq+32*14] |
| packssdw m7, [cq+32*15] |
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3 |
| call .write_16x4_start |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| vpermq m0, m4, q3120 |
| vpermq m1, m5, q3120 |
| vpermq m2, m6, q3120 |
| vpermq m3, m7, q3120 |
| jmp m(idct_16x8_internal_10bpc).write_16x4_zero |
| ALIGN function_align |
| .write_16x4_start: |
| vpbroadcastd m9, [pixel_12bpc_max] |
| lea r3, [strideq*3] |
| pxor m8, m8 |
| ret |
| |
| INV_TXFM_16X8_FN adst, dct, 12 |
| INV_TXFM_16X8_FN adst, adst, 12 |
| INV_TXFM_16X8_FN adst, flipadst, 12 |
| INV_TXFM_16X8_FN adst, identity, 12 |
| |
| cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m13, [clip_20b_min] |
| vpbroadcastd m14, [clip_20b_max] |
| jmp m(iadst_16x8_internal_10bpc).pass1 |
| .pass2: |
| call .pass2_main |
| call m(idct_16x8_internal_12bpc).end |
| RET |
| ALIGN function_align |
| .pass2_main: |
| call m(idct_8x16_internal_12bpc).transpose |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| vpbroadcastd m11, [pd_2048] |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(iadst_8x8_internal_12bpc).pass2_main2 |
| mova [cq+32* 8], m0 |
| mova [cq+32* 9], m1 |
| mova [cq+32*10], m2 |
| mova [cq+32*11], m3 |
| mova [cq+32*12], m4 |
| mova [cq+32*13], m5 |
| mova [cq+32*14], m6 |
| mova [cq+32*15], m7 |
| pmaxsd m0, m12, [cq+32*0] |
| pmaxsd m1, m12, [cq+32*1] |
| pmaxsd m2, m12, [cq+32*2] |
| pmaxsd m3, m12, [cq+32*3] |
| pmaxsd m4, m12, [cq+32*4] |
| pmaxsd m5, m12, [cq+32*5] |
| pmaxsd m6, m12, [cq+32*6] |
| pmaxsd m7, m12, [cq+32*7] |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(iadst_8x8_internal_12bpc).pass2_main2 |
| ret |
| |
| INV_TXFM_16X8_FN flipadst, dct, 12 |
| INV_TXFM_16X8_FN flipadst, adst, 12 |
| INV_TXFM_16X8_FN flipadst, flipadst, 12 |
| INV_TXFM_16X8_FN flipadst, identity, 12 |
| |
| cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| vpbroadcastd m13, [clip_20b_min] |
| vpbroadcastd m14, [clip_20b_max] |
| jmp m(iflipadst_16x8_internal_10bpc).pass1 |
| .pass2: |
| call m(iadst_16x8_internal_12bpc).pass2_main |
| packssdw m13, m0, [cq+32* 8] |
| packssdw m12, m1, [cq+32* 9] |
| packssdw m11, m2, [cq+32*10] |
| packssdw m10, m3, [cq+32*11] |
| packssdw m3, m4, [cq+32*12] |
| packssdw m2, m5, [cq+32*13] |
| packssdw m1, m6, [cq+32*14] |
| packssdw m0, m7, [cq+32*15] |
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3 |
| call m(idct_16x8_internal_12bpc).write_16x4_start |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| vpermq m0, m10, q3120 |
| vpermq m1, m11, q3120 |
| vpermq m2, m12, q3120 |
| vpermq m3, m13, q3120 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| RET |
| |
| INV_TXFM_16X8_FN identity, dct, 12 |
| INV_TXFM_16X8_FN identity, adst, 12 |
| INV_TXFM_16X8_FN identity, flipadst, 12 |
| INV_TXFM_16X8_FN identity, identity, 12 |
| |
| cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 |
| jmp m(iidentity_16x8_internal_10bpc).pass1 |
| .pass2: |
| call m(idct_16x8_internal_10bpc).transpose2 |
| vpbroadcastd m10, [pw_4096] |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| pmulhrsw m2, m10 |
| pmulhrsw m3, m10 |
| call m(idct_16x8_internal_12bpc).write_16x4_start |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| jmp m(idct_16x8_internal_10bpc).end2 |
| |
| %macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth |
| INV_TXFM_FN %1, %2, %3, 16x16, %4 |
| %ifidn %1_%2, dct_dct |
| imul r6d, [cq], 181 |
| vpbroadcastd m3, [dconly_%4bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 16 |
| add r6d, 640 |
| sar r6d, 10 |
| jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 |
| %endif |
| %endmacro |
| |
| INV_TXFM_16X16_FN dct, dct |
| INV_TXFM_16X16_FN dct, identity, 28 |
| INV_TXFM_16X16_FN dct, adst |
| INV_TXFM_16X16_FN dct, flipadst |
| |
| cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| .pass1: |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m14, [pd_2896] |
| lea r6, [rsp+32*4] |
| sub eobd, 36 |
| jl .fast |
| add cq, 32 |
| call .main |
| sub cq, 32 |
| mova m10, [r6-32*4] |
| mova m9, [r6-32*3] |
| mova m8, [r6-32*2] |
| psubd m15, m0, m10 ; out15 |
| paddd m0, m10 ; out0 |
| psubd m10, m1, m9 ; out14 |
| paddd m1, m9 ; out1 |
| psubd m9, m2, m8 ; out13 |
| paddd m2, m8 ; out2 |
| REPX {psrad x, 2}, m0, m1, m2 |
| mova [r6-32*4], m0 |
| mova [r6-32*3], m1 |
| mova [r6-32*2], m2 |
| mova m2, [r6-32*1] |
| mova m1, [r6+32*0] |
| mova m0, [r6+32*1] |
| REPX {psrad x, 2}, m9, m10, m15 |
| psubd m8, m3, m2 ; out12 |
| paddd m3, m2 ; out3 |
| psubd m2, m4, m1 ; out11 |
| paddd m4, m1 ; out4 |
| psubd m1, m5, m0 ; out10 |
| paddd m5, m0 ; out5 |
| REPX {psrad x, 2}, m3, m4, m5 |
| mova [r6-32*1], m3 |
| mova [r6+32*0], m4 |
| mova [r6+32*1], m5 |
| mova m4, [r6+32*2] |
| mova m3, [r6+32*3] |
| REPX {psrad x, 2}, m1, m2, m8 |
| psubd m5, m6, m4 ; out9 |
| paddd m6, m4 ; out6 |
| psubd m4, m7, m3 ; out8 |
| paddd m7, m3 ; out7 |
| REPX {psrad x, 2}, m6, m7, m4, m5 |
| mova [r6+32*2], m6 |
| mova [r6+32*3], m7 |
| add r6, 32*8 |
| mova [r6-32*4], m4 |
| mova [r6-32*3], m5 |
| mova [r6-32*2], m1 |
| mova [r6-32*1], m2 |
| mova [r6+32*0], m8 |
| mova [r6+32*1], m9 |
| mova [r6+32*2], m10 |
| mova [r6+32*3], m15 |
| .fast: |
| add r6, 32*8 |
| call .main |
| mova m14, [r6-32*4] |
| mova m13, [r6-32*3] |
| mova m12, [r6-32*2] |
| mova m11, [r6-32*1] |
| mova m10, [r6+32*0] |
| mova m9, [r6+32*1] |
| mova m8, [r6+32*2] |
| psubd m15, m0, m14 ; out15 |
| paddd m0, m14 ; out0 |
| psubd m14, m1, m13 ; out14 |
| paddd m1, m13 ; out1 |
| psubd m13, m2, m12 ; out13 |
| paddd m2, m12 ; out2 |
| psubd m12, m3, m11 ; out12 |
| paddd m3, m11 ; out3 |
| psubd m11, m4, m10 ; out11 |
| paddd m4, m10 ; out4 |
| psubd m10, m5, m9 ; out10 |
| paddd m5, m9 ; out5 |
| psubd m9, m6, m8 ; out9 |
| paddd m6, m8 ; out6 |
| psubd m8, m7, [r6+32*3] ; out8 |
| paddd m7, [r6+32*3] ; out7 |
| sub r6, 32*8 |
| REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| jmp tx2q |
| .pass2: |
| call .transpose |
| lea r6, [pw_5+128] |
| mova [rsp], m15 |
| call m(idct_16x16_internal_8bpc).main |
| mova m1, [rsp+32*1] |
| .end: |
| call .write_16x16 |
| RET |
| ALIGN function_align |
| .write_16x16: |
| mova [rsp+gprsize+32*0], m8 |
| mova [rsp+gprsize+32*1], m9 |
| mova [rsp+gprsize+32*2], m12 |
| vpbroadcastd m12, [pw_2048] |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| pmulhrsw m2, m12 |
| pmulhrsw m3, m12 |
| call m(idct_16x8_internal_10bpc).write_16x4_start |
| .write_16x16_2: |
| pmulhrsw m0, m12, m4 |
| pmulhrsw m1, m12, m5 |
| pmulhrsw m2, m12, m6 |
| pmulhrsw m3, m12, m7 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| pmulhrsw m0, m12, [rsp+gprsize+32*0] |
| pmulhrsw m1, m12, [rsp+gprsize+32*1] |
| pmulhrsw m2, m12, m10 |
| pmulhrsw m3, m12, m11 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| pmulhrsw m0, m12, [rsp+gprsize+32*2] |
| pmulhrsw m1, m12, m13 |
| pmulhrsw m2, m12, m14 |
| pmulhrsw m3, m12, m15 |
| jmp m(idct_16x8_internal_10bpc).write_16x4_zero |
| ALIGN function_align |
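| ; 16x16 transpose over the registers plus the r6 spill area; a |
| ; negative eobd takes the fast path, where the second half was never |
| ; computed and m8-m15 are zeroed instead |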
| .transpose: |
| test eobd, eobd |
| jl .transpose_fast |
| packssdw m8, [r6-32*4] |
| packssdw m9, [r6-32*3] |
| packssdw m10, [r6-32*2] |
| packssdw m11, [r6-32*1] |
| packssdw m12, [r6+32*0] |
| packssdw m13, [r6+32*1] |
| packssdw m14, [r6+32*2] |
| packssdw m15, [r6+32*3] |
| sub r6, 32*8 |
| packssdw m0, [r6-32*4] |
| packssdw m1, [r6-32*3] |
| packssdw m2, [r6-32*2] |
| packssdw m3, [r6-32*1] |
| packssdw m4, [r6+32*0] |
| packssdw m5, [r6+32*1] |
| packssdw m6, [r6+32*2] |
| packssdw m7, [r6+32*3] |
| mova [r6], m8 |
| punpckhwd m8, m0, m1 |
| punpcklwd m0, m1 |
| punpcklwd m1, m2, m3 |
| punpckhwd m2, m3 |
| punpckhwd m3, m6, m7 |
| punpcklwd m6, m7 |
| punpcklwd m7, m4, m5 |
| punpckhwd m4, m5 |
| punpckldq m5, m8, m2 |
| punpckhdq m8, m2 |
| punpckhdq m2, m0, m1 |
| punpckldq m0, m1 |
| punpckhdq m1, m7, m6 |
| punpckldq m7, m6 |
| punpckhdq m6, m4, m3 |
| punpckldq m4, m3 |
| punpckhqdq m3, m2, m1 |
| punpcklqdq m2, m1 |
| punpckhqdq m1, m0, m7 |
| punpcklqdq m0, m7 |
| punpcklqdq m7, m8, m6 |
| punpckhqdq m8, m6 |
| punpckhqdq m6, m5, m4 |
| punpcklqdq m5, m4 |
| mova m4, [r6] |
| mova [r6], m8 |
| punpcklwd m8, m4, m9 |
| punpckhwd m4, m9 |
| punpcklwd m9, m10, m11 |
| punpckhwd m10, m11 |
| punpckhwd m11, m14, m15 |
| punpcklwd m14, m15 |
| punpckhwd m15, m12, m13 |
| punpcklwd m12, m13 |
| punpckldq m13, m4, m10 |
| punpckhdq m4, m10 |
| punpckhdq m10, m8, m9 |
| punpckldq m8, m9 |
| punpckhdq m9, m12, m14 |
| punpckldq m12, m14 |
| punpckhdq m14, m15, m11 |
| punpckldq m15, m11 |
| punpckhqdq m11, m10, m9 |
| punpcklqdq m10, m9 |
| punpckhqdq m9, m8, m12 |
| punpcklqdq m8, m12 |
| punpcklqdq m12, m13, m15 |
| punpckhqdq m13, m15 |
| punpckhqdq m15, m4, m14 |
| punpcklqdq m14, m4, m14 |
| vperm2i128 m4, m0, m8, 0x31 |
| vinserti128 m0, xm8, 1 |
| vinserti128 m8, m5, xm12, 1 |
| vperm2i128 m12, m5, 0x13 |
| vperm2i128 m5, m1, m9, 0x31 |
| vinserti128 m1, xm9, 1 |
| vinserti128 m9, m6, xm13, 1 |
| vperm2i128 m13, m6, 0x13 |
| vperm2i128 m6, m2, m10, 0x31 |
| vinserti128 m2, xm10, 1 |
| vinserti128 m10, m7, xm14, 1 |
| vperm2i128 m14, m7, 0x13 |
| vperm2i128 m7, m3, m11, 0x31 |
| vinserti128 m3, xm11, 1 |
| mova xm11, [r6] |
| vinserti128 m11, xm15, 1 |
| vinserti128 m15, [r6+16], 0 |
| ret |
| .transpose_fast: |
| call m(idct_16x8_internal_10bpc).transpose2 |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 |
| ret |
| ALIGN function_align |
| .main: |
| mova m0, [cq+64* 1] |
| mova m1, [cq+64* 3] |
| mova m2, [cq+64* 5] |
| mova m3, [cq+64* 7] |
| mova m4, [cq+64* 9] |
| mova m5, [cq+64*11] |
| mova m6, [cq+64*13] |
| mova m7, [cq+64*15] |
| call m(idct_8x16_internal_10bpc).main_oddhalf |
| mova m0, [cq+64* 0] |
| mova m1, [cq+64* 2] |
| mova m2, [cq+64* 4] |
| mova m3, [cq+64* 6] |
| mova m4, [cq+64* 8] |
| mova m5, [cq+64*10] |
| mova m6, [cq+64*12] |
| mova m7, [cq+64*14] |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| psrld m10, m11, 10 ; pd_2 |
| REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 |
| ret |
| |
| INV_TXFM_16X16_FN adst, dct |
| INV_TXFM_16X16_FN adst, adst |
| INV_TXFM_16X16_FN adst, flipadst |
| |
| cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 |
| vpbroadcastd m13, [clip_18b_min] |
| vpbroadcastd m14, [clip_18b_max] |
| .pass1: |
| vpbroadcastd m15, [pd_2896] |
| lea r6, [rsp+32*4] |
| sub eobd, 36 |
| jl .fast |
| add cq, 32 |
| call .main |
| sub cq, 32 |
| vpbroadcastd m8, [pd_5120] |
| paddd m4, m8 |
| paddd m6, m8 |
| paddd m9, m8 |
| paddd m11, m8 |
| vpbroadcastd m8, [pd_5119] |
| psubd m5, m8, m5 |
| psubd m7, m8, m7 |
| psubd m10, m8, m10 |
| psubd m12, m8, m12 |
| REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 |
| mova [r6+32*0], m4 |
| mova [r6+32*1], m5 |
| mova [r6+32*2], m6 |
| mova [r6+32*3], m7 |
| psrld m4, m15, 10 ; pd_2 |
| paddd m0, m4 |
| psubd m1, m4, m1 |
| paddd m2, m4 |
| psubd m3, m4, m3 |
| psubd m7, m4, [r6-32*4] |
| paddd m6, m4, [r6-32*3] |
| psubd m5, m4, [r6-32*2] |
| paddd m4, [r6-32*1] |
| REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| mova [r6-32*4], m0 |
| mova [r6-32*3], m1 |
| mova [r6-32*2], m2 |
| mova [r6-32*1], m3 |
| add r6, 32*8 |
| mova [r6-32*4], m9 |
| mova [r6-32*3], m10 |
| mova [r6-32*2], m11 |
| mova [r6-32*1], m12 |
| mova [r6+32*0], m4 |
| mova [r6+32*1], m5 |
| mova [r6+32*2], m6 |
| mova [r6+32*3], m7 |
| .fast: |
| add r6, 32*8 |
| call .main |
| vpbroadcastd m14, [pd_5120] |
| vpbroadcastd m13, [pd_5119] |
| psrld m15, 10 ; pd_2 |
| paddd m0, m15 |
| psubd m1, m15, m1 |
| paddd m2, m15 |
| psubd m3, m15, m3 |
| paddd m4, m14 |
| psubd m5, m13, m5 |
| paddd m6, m14 |
| psubd m7, m13, m7 |
| paddd m8, m14, m9 |
| psubd m9, m13, m10 |
| paddd m10, m14, m11 |
| psubd m11, m13, m12 |
| paddd m12, m15, [r6-32*1] |
| psubd m13, m15, [r6-32*2] |
| paddd m14, m15, [r6-32*3] |
| psubd m15, [r6-32*4] |
| .pass1_end: |
| REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 |
| REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 |
| sub r6, 32*8 |
| jmp tx2q |
| .pass2: |
| call m(idct_16x16_internal_10bpc).transpose |
| lea r6, [pw_5+128] |
| mova [rsp], m15 |
| call m(iadst_16x16_internal_8bpc).main |
| call m(iadst_16x16_internal_8bpc).main_pass2_end |
| mova [rsp+32*0], m8 |
| mova [rsp+32*2], m12 |
| mova [rsp+32*3], m13 |
| vpbroadcastd m12, [pw_2048] |
| pxor m13, m13 |
| psubw m13, m12 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m13, [rsp+32*1] |
| mova [rsp+32*1], m9 |
| pmulhrsw m2, m12 |
| pmulhrsw m3, m13 |
| call m(idct_16x8_internal_10bpc).write_16x4_start |
| pmulhrsw m0, m12, m4 |
| pmulhrsw m1, m13, m5 |
| pmulhrsw m2, m12, m6 |
| pmulhrsw m3, m13, m7 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| pmulhrsw m0, m12, [rsp+32*0] |
| pmulhrsw m1, m13, [rsp+32*1] |
| pmulhrsw m2, m12, m10 |
| pmulhrsw m3, m13, m11 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| pmulhrsw m0, m12, [rsp+32*2] |
| pmulhrsw m1, m13, [rsp+32*3] |
| pmulhrsw m2, m12, m14 |
| pmulhrsw m3, m13, m15 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| RET |
| ALIGN function_align |
| .main: |
| mova m0, [cq+64* 2] |
| mova m1, [cq+64*13] |
| mova m2, [cq+64* 6] |
| mova m3, [cq+64* 9] |
| mova m4, [cq+64*10] |
| mova m5, [cq+64* 5] |
| mova m6, [cq+64*14] |
| mova m7, [cq+64* 1] |
| vpbroadcastd m12, [pd_2048] |
| call m(iadst_16x8_internal_10bpc).main_part1 |
| mova m0, [cq+64* 0] |
| mova m1, [cq+64*15] |
| mova m2, [cq+64* 4] |
| mova m3, [cq+64*11] |
| mova m4, [cq+64* 8] |
| mova m5, [cq+64* 7] |
| mova m6, [cq+64*12] |
| mova m7, [cq+64* 3] |
| jmp m(iadst_16x8_internal_10bpc).main_part2 |
| |
| INV_TXFM_16X16_FN flipadst, dct |
| INV_TXFM_16X16_FN flipadst, adst |
| INV_TXFM_16X16_FN flipadst, flipadst |
| |
| cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 |
| vpbroadcastd m13, [clip_18b_min] |
| vpbroadcastd m14, [clip_18b_max] |
| .pass1: |
| vpbroadcastd m15, [pd_2896] |
| lea r6, [rsp+32*4] |
| sub eobd, 36 |
| jl .fast |
| add cq, 32 |
| call m(iadst_16x16_internal_10bpc).main |
| sub cq, 32 |
| vpbroadcastd m8, [pd_5120] |
| paddd m11, m8 |
| paddd m9, m8 |
| paddd m6, m8 |
| paddd m4, m8 |
| vpbroadcastd m8, [pd_5119] |
| psubd m12, m8, m12 |
| psubd m10, m8, m10 |
| psubd m7, m8, m7 |
| psubd m5, m8, m5 |
| REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 |
| mova [r6+32*0], m12 |
| mova [r6+32*1], m11 |
| mova [r6+32*2], m10 |
| mova [r6+32*3], m9 |
| psrld m9, m15, 10 ; pd_2 |
| psubd m3, m9, m3 |
| paddd m2, m9 |
| psubd m1, m9, m1 |
| paddd m0, m9 |
| psubd m12, m9, [r6-32*4] |
| paddd m11, m9, [r6-32*3] |
| psubd m10, m9, [r6-32*2] |
| paddd m9, [r6-32*1] |
| REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 |
| mova [r6-32*4], m12 |
| mova [r6-32*3], m11 |
| mova [r6-32*2], m10 |
| mova [r6-32*1], m9 |
| add r6, 32*8 |
| mova [r6-32*4], m7 |
| mova [r6-32*3], m6 |
| mova [r6-32*2], m5 |
| mova [r6-32*1], m4 |
| mova [r6+32*0], m3 |
| mova [r6+32*1], m2 |
| mova [r6+32*2], m1 |
| mova [r6+32*3], m0 |
| .fast: |
| add r6, 32*8 |
| call m(iadst_16x16_internal_10bpc).main |
| vpbroadcastd m14, [pd_5120] |
| vpbroadcastd m13, [pd_5119] |
| psrld m15, 10 ; pd_2 |
| psubd m8, m13, m7 |
| paddd m7, m14, m9 |
| paddd m9, m14, m6 |
| psubd m6, m13, m10 |
| psubd m10, m13, m5 |
| paddd m5, m14, m11 |
| paddd m11, m14, m4 |
| psubd m4, m13, m12 |
| psubd m12, m15, m3 |
| paddd m3, m15, [r6-32*1] |
| paddd m13, m15, m2 |
| psubd m2, m15, [r6-32*2] |
| psubd m14, m15, m1 |
| mova m1, m15 |
| paddd m15, m0 |
| psubd m0, m1, [r6-32*4] |
| paddd m1, [r6-32*3] |
| jmp m(iadst_16x16_internal_10bpc).pass1_end |
| .pass2: |
| call m(idct_16x16_internal_10bpc).transpose |
| lea r6, [pw_5+128] |
| mova [rsp], m15 |
| call m(iadst_16x16_internal_8bpc).main |
| call m(iadst_16x16_internal_8bpc).main_pass2_end |
| mova [rsp+32*3], m3 |
| mova [rsp+32*2], m2 |
| mova [rsp+32*0], m0 |
| mova m2, m13 |
| mova m3, m12 |
| vpbroadcastd m12, [pw_2048] |
| pxor m13, m13 |
| psubw m13, m12 |
| pmulhrsw m0, m13, m15 |
| pmulhrsw m1, m12, m14 |
| pmulhrsw m2, m13 |
| pmulhrsw m3, m12 |
| mova m14, m8 |
| mova m15, m9 |
| call m(idct_16x8_internal_10bpc).write_16x4_start |
| pmulhrsw m0, m13, m11 |
| pmulhrsw m1, m12, m10 |
| pmulhrsw m2, m13, m15 |
| pmulhrsw m3, m12, m14 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| pmulhrsw m0, m13, m7 |
| pmulhrsw m1, m12, m6 |
| pmulhrsw m2, m13, m5 |
| pmulhrsw m3, m12, m4 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| pmulhrsw m0, m13, [rsp+32*3] |
| pmulhrsw m1, m12, [rsp+32*2] |
| pmulhrsw m2, m13, [rsp+32*1] |
| pmulhrsw m3, m12, [rsp+32*0] |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| RET |
| |
| INV_TXFM_16X16_FN identity, dct, -92 |
| INV_TXFM_16X16_FN identity, identity |
| |
| cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 |
| vpbroadcastd m15, [pd_5793] |
| vpbroadcastd m7, [pd_5120] |
| lea r6, [rsp+32*4] |
| sub eobd, 36 |
| jl .fast |
| mov r3, -32*8*4 |
| .righthalf: |
| pmulld m0, m15, [cq+r3+32*33] |
| pmulld m1, m15, [cq+r3+32*35] |
| pmulld m2, m15, [cq+r3+32*37] |
| pmulld m3, m15, [cq+r3+32*39] |
| add r6, 32*4 |
| REPX {paddd x, m7}, m0, m1, m2, m3 |
| REPX {psrad x, 13}, m0, m1, m2, m3 |
| mova [r6+32*0], m0 |
| mova [r6+32*1], m1 |
| mova [r6+32*2], m2 |
| mova [r6+32*3], m3 |
| add r3, 32*8 |
| jl .righthalf |
| .fast: |
| pmulld m0, m15, [cq+64* 0] |
| pmulld m1, m15, [cq+64* 1] |
| pmulld m2, m15, [cq+64* 2] |
| pmulld m3, m15, [cq+64* 3] |
| pmulld m4, m15, [cq+64* 4] |
| pmulld m5, m15, [cq+64* 5] |
| pmulld m6, m15, [cq+64* 6] |
| pmulld m8, m15, [cq+64* 7] |
| mova [cq], m8 |
| pmulld m8, m15, [cq+64* 8] |
| pmulld m9, m15, [cq+64* 9] |
| pmulld m10, m15, [cq+64*10] |
| pmulld m11, m15, [cq+64*11] |
| pmulld m12, m15, [cq+64*12] |
| pmulld m13, m15, [cq+64*13] |
| pmulld m14, m15, [cq+64*14] |
| pmulld m15, [cq+64*15] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| paddd m7, [cq] |
| REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| jmp tx2q |
| .pass2: |
| call m(idct_16x16_internal_10bpc).transpose |
| mova [cq+32*0], m15 |
| mova [cq+32*1], m0 |
| vpbroadcastd m15, [pw_1697x16] |
| REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ |
| 8, 9, 10, 11, 12, 13, 14 |
| mova m0, [cq+32*1] |
| mova [cq+32*1], m1 |
| IDTX16 0, 1, 15 |
| mova m1, [cq+32*0] |
| pmulhrsw m15, m1 |
| paddsw m1, m1 |
| paddsw m15, m1 |
| mova m1, [cq+32*1] |
| jmp m(idct_16x16_internal_10bpc).end |
| |
| INV_TXFM_16X16_FN dct, dct, 0, 12 |
| INV_TXFM_16X16_FN dct, identity, 28, 12 |
| INV_TXFM_16X16_FN dct, adst, 0, 12 |
| INV_TXFM_16X16_FN dct, flipadst, 0, 12 |
| |
| cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| jmp m(idct_16x16_internal_10bpc).pass1 |
| .pass2: |
| mova [cq+32* 8], m8 |
| mova [cq+32* 9], m9 |
| mova [cq+32*10], m10 |
| mova [cq+32*11], m11 |
| mova [cq+32*12], m12 |
| mova [cq+32*13], m13 |
| mova [cq+32*14], m14 |
| mova [cq+32*15], m15 |
| call .pass2_main |
| packssdw m0, m1 |
| packssdw m1, m2, m3 |
| packssdw m2, m4, m5 |
| packssdw m3, m6, m7 |
| packssdw m4, m8, m9 |
| packssdw m5, m10, m11 |
| packssdw m6, m12, m13 |
| packssdw m7, m14, m15 |
| mova [r6-32*4], m0 |
| mova [r6-32*3], m1 |
| mova [r6-32*2], m2 |
| mova [r6-32*1], m3 |
| mova [r6+32*0], m4 |
| mova [r6+32*1], m5 |
| mova [r6+32*2], m6 |
| mova [r6+32*3], m7 |
| mova m0, [cq+32* 8] |
| mova m1, [cq+32* 9] |
| mova m2, [cq+32*10] |
| mova m3, [cq+32*11] |
| mova m4, [cq+32*12] |
| mova m5, [cq+32*13] |
| mova m6, [cq+32*14] |
| mova m7, [cq+32*15] |
| mov r5, r6 |
| add r6, 32*16 |
| call .pass2_main |
| jmp m(iadst_16x16_internal_12bpc).end |
| ALIGN function_align |
| .write_16x16: |
| mova [rsp+gprsize+32*0], m8 |
| mova [rsp+gprsize+32*1], m9 |
| mova [rsp+gprsize+32*2], m12 |
| vpbroadcastd m12, [pw_16384] |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| pmulhrsw m2, m12 |
| pmulhrsw m3, m12 |
| call m(idct_16x8_internal_12bpc).write_16x4_start |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| jmp m(idct_16x16_internal_10bpc).write_16x16_2 |
| ALIGN function_align |
| .pass2_main: |
| call m(idct_8x8_internal_12bpc).transpose_8x8 |
| mova [cq+32* 0], m0 |
| mova [cq+32* 1], m2 |
| mova [cq+32* 2], m4 |
| mova [cq+32* 3], m6 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| pmaxsd m0, m12, m1 |
| pmaxsd m1, m12, m3 |
| pmaxsd m2, m12, m5 |
| pmaxsd m3, m12, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3 |
| test eobd, eobd |
| jge .pass2_slow |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| jmp .pass2_fast |
| .pass2_slow: |
| sub r6, 32*8 |
| mova m8, [r6-32*4] |
| mova m4, [r6-32*3] |
| mova m10, [r6-32*2] |
| mova m5, [r6-32*1] |
| mova m12, [r6+32*0] |
| mova m6, [r6+32*1] |
| mova m14, [r6+32*2] |
| mova m7, [r6+32*3] |
| TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15 |
| mova [cq+32* 4], m8 |
| mova [cq+32* 5], m10 |
| mova [cq+32* 6], m12 |
| mova [cq+32* 7], m14 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| REPX {pmaxsd x, m12}, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m4, m5, m6, m7 |
| .pass2_fast: |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m14, [pd_2896] |
| call m(idct_8x16_internal_10bpc).main_oddhalf |
| pmaxsd m0, m12, [cq+32* 0] |
| pmaxsd m1, m12, [cq+32* 1] |
| pmaxsd m2, m12, [cq+32* 2] |
| pmaxsd m3, m12, [cq+32* 3] |
| REPX {pminsd x, m13}, m0, m1, m2, m3 |
| test eobd, eobd |
| jge .pass2_slow2 |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| jmp .pass2_fast2 |
| .pass2_slow2: |
| pmaxsd m4, m12, [cq+32* 4] |
| pmaxsd m5, m12, [cq+32* 5] |
| pmaxsd m6, m12, [cq+32* 6] |
| pmaxsd m7, m12, [cq+32* 7] |
| REPX {pminsd x, m13}, m4, m5, m6, m7 |
| .pass2_fast2: |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| psrad m11, 8 ; pd_8 |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_16x8_internal_10bpc).pass1_rotations |
| REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| ret |
| |
| INV_TXFM_16X16_FN adst, dct, 0, 12 |
| INV_TXFM_16X16_FN adst, adst, 0, 12 |
| INV_TXFM_16X16_FN adst, flipadst, 0, 12 |
| |
| cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 |
| vpbroadcastd m13, [clip_20b_min] |
| vpbroadcastd m14, [clip_20b_max] |
| jmp m(iadst_16x16_internal_10bpc).pass1 |
| .pass2: |
| call .pass2_part1 |
| call m(iadst_16x8_internal_10bpc).pass1_rotations |
| call .pass2_part2 |
| call m(iadst_16x8_internal_10bpc).pass1_rotations |
| .pass2_part3: |
| REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 |
| REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 |
| .end: |
| packssdw m15, m14 |
| packssdw m14, m13, m12 |
| packssdw m13, m11, m10 |
| packssdw m12, m9, m8 |
| packssdw m11, m7, m6 |
| packssdw m10, m5, m4 |
| packssdw m7, m3, m2 |
| packssdw m6, m1, m0 |
| vpblendd m0, m6, [r5-32*4], 0x33 |
| vpblendd m1, m6, [r5-32*4], 0xcc |
| vpblendd m2, m7, [r5-32*3], 0x33 |
| vpblendd m3, m7, [r5-32*3], 0xcc |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| vpermq m2, m2, q3120 |
| vpermq m3, m3, q2031 |
| call m(idct_16x8_internal_12bpc).write_16x4_start |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| vpblendd m0, m10, [r5-32*2], 0x33 |
| vpblendd m1, m10, [r5-32*2], 0xcc |
| vpblendd m2, m11, [r5-32*1], 0x33 |
| vpblendd m3, m11, [r5-32*1], 0xcc |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| vpermq m2, m2, q3120 |
| vpermq m3, m3, q2031 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| vpblendd m0, m12, [r5+32*0], 0x33 |
| vpblendd m1, m12, [r5+32*0], 0xcc |
| vpblendd m2, m13, [r5+32*1], 0x33 |
| vpblendd m3, m13, [r5+32*1], 0xcc |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| vpermq m2, m2, q3120 |
| vpermq m3, m3, q2031 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| vpblendd m0, m14, [r5+32*2], 0x33 |
| vpblendd m1, m14, [r5+32*2], 0xcc |
| vpblendd m2, m15, [r5+32*3], 0x33 |
| vpblendd m3, m15, [r5+32*3], 0xcc |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| vpermq m2, m2, q3120 |
| vpermq m3, m3, q2031 |
| call m(idct_16x8_internal_10bpc).write_16x4_zero |
| RET |
| ALIGN function_align |
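| ; 12bpc pass 2 runs as two 16x8 halves: m8-m15 are spilled to cq so |
| ; each half can go through the shared 16x8 adst kernels, with |
| ; pd_17408/pd_17407 rounding and pd_8 in m15 for the rotations |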
| .pass2_part1: |
| mova [cq+32* 8], m8 |
| mova [cq+32* 9], m9 |
| mova [cq+32*10], m10 |
| mova [cq+32*11], m11 |
| mova [cq+32*12], m12 |
| mova [cq+32*13], m13 |
| mova [cq+32*14], m14 |
| mova [cq+32*15], m15 |
| .pass2_main: |
| call m(idct_8x8_internal_12bpc).transpose_8x8 |
| mova [cq+32* 0], m0 |
| mova [cq+32* 1], m3 |
| mova [cq+32* 2], m4 |
| mova [cq+32* 3], m7 |
| vpbroadcastd m13, [clip_18b_min] |
| vpbroadcastd m14, [clip_18b_max] |
| pmaxsd m0, m13, m2 |
| pmaxsd m2, m13, m6 |
| pmaxsd m5, m13, m5 |
| pmaxsd m7, m13, m1 |
| REPX {pminsd x, m14}, m0, m2, m5, m7 |
| test eobd, eobd |
| jge .pass2_slow |
| pxor m1, m1 |
| REPX {mova x, m1}, m3, m4, m6 |
| jmp .pass2_fast |
| .pass2_slow: |
| sub r6, 32*8 |
| mova m8, [r6-32*4] |
| mova m3, [r6-32*3] |
| mova m4, [r6-32*2] |
| mova m11, [r6-32*1] |
| mova m12, [r6+32*0] |
| mova m1, [r6+32*1] |
| mova m6, [r6+32*2] |
| mova m15, [r6+32*3] |
| TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14 |
| mova [cq+32* 4], m8 |
| mova [cq+32* 5], m11 |
| mova [cq+32* 6], m12 |
| mova [cq+32* 7], m15 |
| vpbroadcastd m13, [clip_18b_min] |
| vpbroadcastd m14, [clip_18b_max] |
| REPX {pmaxsd x, m13}, m1, m3, m4, m6 |
| REPX {pminsd x, m14}, m1, m3, m4, m6 |
| .pass2_fast: |
| vpbroadcastd m12, [pd_2048] |
| vpbroadcastd m15, [pd_2896] |
| call m(iadst_16x8_internal_10bpc).main_part1 |
| pmaxsd m0, m13, [cq+32* 0] ; 0 |
| pmaxsd m7, m13, [cq+32* 1] ; 3 |
| pmaxsd m2, m13, [cq+32* 2] ; 4 |
| pmaxsd m5, m13, [cq+32* 3] ; 7 |
| REPX {pminsd x, m14}, m0, m2, m5, m7 |
| test eobd, eobd |
| jge .pass2_slow2 |
| pxor m1, m1 |
| REPX {mova x, m1}, m3, m4, m6 |
| jmp .pass2_fast2 |
| .pass2_slow2: |
| pmaxsd m4, m13, [cq+32* 4] ; 8 |
| pmaxsd m3, m13, [cq+32* 5] ; 11 |
| pmaxsd m6, m13, [cq+32* 6] ; 12 |
| pmaxsd m1, m13, [cq+32* 7] ; 15 |
| REPX {pminsd x, m14}, m1, m3, m4, m6 |
| .pass2_fast2: |
| call m(iadst_16x8_internal_10bpc).main_part2 |
| vpbroadcastd m14, [pd_17408] |
| psrld m15, 11 ; pd_1 |
| psubd m13, m14, m15 ; pd_17407 |
| pslld m15, 3 ; pd_8 |
| ret |
| ALIGN function_align |
| .pass2_part2: |
| REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 |
| REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 |
| packssdw m0, m1 |
| packssdw m1, m2, m3 |
| packssdw m2, m4, m5 |
| packssdw m3, m6, m7 |
| packssdw m4, m8, m9 |
| packssdw m5, m10, m11 |
| packssdw m6, m12, m13 |
| packssdw m7, m14, m15 |
| mova [r6-32*4], m0 |
| mova [r6-32*3], m1 |
| mova [r6-32*2], m2 |
| mova [r6-32*1], m3 |
| mova [r6+32*0], m4 |
| mova [r6+32*1], m5 |
| mova [r6+32*2], m6 |
| mova [r6+32*3], m7 |
| mova m0, [cq+32* 8] |
| mova m1, [cq+32* 9] |
| mova m2, [cq+32*10] |
| mova m3, [cq+32*11] |
| mova m4, [cq+32*12] |
| mova m5, [cq+32*13] |
| mova m6, [cq+32*14] |
| mova m7, [cq+32*15] |
| mov r5, r6 |
| add r6, 32*16 |
| jmp .pass2_main |
| |
| INV_TXFM_16X16_FN flipadst, dct, 0, 12 |
| INV_TXFM_16X16_FN flipadst, adst, 0, 12 |
| INV_TXFM_16X16_FN flipadst, flipadst, 0, 12 |
| |
| cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 |
| vpbroadcastd m13, [clip_20b_min] |
| vpbroadcastd m14, [clip_20b_max] |
| jmp m(iflipadst_16x16_internal_10bpc).pass1 |
| .pass2: |
| call m(iadst_16x16_internal_12bpc).pass2_part1 |
| call m(iflipadst_16x8_internal_10bpc).pass1_rotations |
| call m(iadst_16x16_internal_12bpc).pass2_part2 |
| call m(iflipadst_16x8_internal_10bpc).pass1_rotations |
| jmp m(iadst_16x16_internal_12bpc).pass2_part3 |
| |
| INV_TXFM_16X16_FN identity, dct, -92, 12 |
| INV_TXFM_16X16_FN identity, identity, 0, 12 |
| |
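| ; out = (x + ((x*1697 + m15) >> 12)) >> 1, identical to the 10bpc |
| ; (x*5793 + m15) >> 13 but split so the intermediate product stays |
| ; within 32 bits for the larger 12bpc input range |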
| %macro IDTX16_12BPC 1 ; src |
| pmulld m6, m7, m%1 |
| paddd m6, m15 |
| psrad m6, 12 |
| paddd m6, m%1 |
| psrad m%1, m6, 1 |
| %endmacro |
| |
| cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 |
| vpbroadcastd m7, [pd_1697] |
| vpbroadcastd m15, [pd_5120] |
| lea r6, [rsp+32*4] |
| sub eobd, 36 |
| jl .fast |
| mov r3, -32*8*4 |
| .righthalf: |
| mova m10, [cq+r3+32*33] |
| mova m11, [cq+r3+32*35] |
| mova m12, [cq+r3+32*37] |
| mova m13, [cq+r3+32*39] |
| add r6, 32*4 |
| pmulld m0, m7, m10 |
| pmulld m1, m7, m11 |
| pmulld m2, m7, m12 |
| pmulld m3, m7, m13 |
| REPX {paddd x, m15}, m0, m1, m2, m3 |
| REPX {psrad x, 12 }, m0, m1, m2, m3 |
| paddd m0, m10 |
| paddd m1, m11 |
| paddd m2, m12 |
| paddd m3, m13 |
| REPX {psrad x, 1 }, m0, m1, m2, m3 |
| mova [r6+32*0], m0 |
| mova [r6+32*1], m1 |
| mova [r6+32*2], m2 |
| mova [r6+32*3], m3 |
| add r3, 32*8 |
| jl .righthalf |
| .fast: |
| mova m0, [cq+64* 0] |
| mova m1, [cq+64* 1] |
| mova m2, [cq+64* 2] |
| mova m3, [cq+64* 3] |
| mova m4, [cq+64* 4] |
| mova m5, [cq+64* 5] |
| mova m8, [cq+64* 6] |
| mova m9, [cq+64* 7] |
| REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9 |
| mova [cq+64*0], m8 |
| mova [cq+64*1], m9 |
| mova m8, [cq+64* 8] |
| mova m9, [cq+64* 9] |
| mova m10, [cq+64*10] |
| mova m11, [cq+64*11] |
| mova m12, [cq+64*12] |
| mova m13, [cq+64*13] |
| mova m14, [cq+64*14] |
| REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14 |
| mova m6, [cq+64*15] |
| pmulld m7, m6 |
| paddd m7, m15 |
| psrad m7, 12 |
| paddd m7, m6 |
| mova m6, [cq+64*0] |
| psrad m15, m7, 1 |
| mova m7, [cq+64*1] |
| jmp tx2q |
| .pass2: |
| call m(iidentity_8x16_internal_12bpc).pass2_main |
| call m(idct_16x16_internal_10bpc).transpose_fast |
| test eobd, eobd |
| jl .pass2_fast |
| mova [cq+32* 8], m0 |
| mova [cq+32* 9], m1 |
| mova [cq+32*10], m2 |
| mova [cq+32*11], m3 |
| mova [cq+32*12], m4 |
| mova [cq+32*13], m5 |
| mova [cq+32*14], m6 |
| mova [cq+32*15], m7 |
| mova m8, [r6-32*4] |
| mova m9, [r6-32*3] |
| mova m10, [r6-32*2] |
| mova m11, [r6-32*1] |
| mova m12, [r6+32*0] |
| mova m13, [r6+32*1] |
| mova m14, [r6+32*2] |
| mova m15, [r6+32*3] |
| sub r6, 32*8 |
| mova m0, [r6-32*4] |
| mova m1, [r6-32*3] |
| mova m2, [r6-32*2] |
| mova m3, [r6-32*1] |
| mova m4, [r6+32*0] |
| mova m5, [r6+32*1] |
| mova m6, [r6+32*2] |
| mova m7, [r6+32*3] |
| call m(iidentity_8x16_internal_12bpc).pass2_main |
| call m(idct_16x8_internal_10bpc).transpose2 |
| mova m8, m0 |
| mova m9, m1 |
| mova m10, m2 |
| mova m11, m3 |
| mova m12, m4 |
| mova m13, m5 |
| mova m14, m6 |
| mova m15, m7 |
| mova m0, [cq+32* 8] |
| mova m1, [cq+32* 9] |
| mova m2, [cq+32*10] |
| mova m3, [cq+32*11] |
| mova m4, [cq+32*12] |
| mova m5, [cq+32*13] |
| mova m6, [cq+32*14] |
| mova m7, [cq+32*15] |
| .pass2_fast: |
| call m(idct_16x16_internal_12bpc).write_16x16 |
| RET |
| |
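| ; merges one idct16 even-half value (m%1) with the idct16 odd half at |
| ; r6 and the 32-point odd half (t16..t31) at r4/r5 into out0+n, |
| ; out15-n, out16+n and out31-n, with clip, round (m11) and shift |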
| %macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack |
| mova m%4, [r6+32*(%1-4)] |
| mova m%2, [r5+32*(3-%1)] |
| mova m%5, [r4+32*(%1-4)] |
| psubd m%3, m%1, m%4 ; idct16 out15 - n |
| paddd m%1, m%4 ; idct16 out0 + n |
| pmaxsd m%1, m12 |
| pmaxsd m%3, m12 |
| pminsd m%1, m13 |
| pminsd m%3, m13 |
| paddd m%1, m11 |
| paddd m%3, m11 |
| psubd m%4, m%1, m%2 ; out31 - n |
| paddd m%1, m%2 ; out0 + n |
| paddd m%2, m%3, m%5 ; out15 - n |
| psubd m%3, m%5 ; out16 + n |
| REPX {psrad x, %6}, m%1, m%3, m%2, m%4 |
| %if %7 & 1 |
| packssdw m%1, m%3 ; out0 + n, out16 + n |
| packssdw m%2, m%4 ; out15 - n, out31 - n |
| %endif |
| %endmacro |
| |
| cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| vbroadcasti128 m14, [idct32_shuf] |
| mov r4, cq |
| call .pass1_main |
| mova [rsp+32*0], m2 |
| mova [rsp+32*1], m3 |
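| ; eob thresholds 43/107/171 select how many groups of 8 input rows |
| ; are non-zero; each .pass1_main call handles one group |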
| cmp eobd, 43 |
| jge .eob43 |
| pxor m4, m4 |
| REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 |
| jmp .pass1_end_fast |
| .eob43: |
| lea r6, [rsp+32*8] |
| mova [r6-32*4], m0 |
| mova [r6-32*3], m1 |
| call .pass1_main |
| mova [rsp+32*2], m2 |
| cmp eobd, 107 |
| jge .eob107 |
| mova m11, m3 |
| mova m2, m0 |
| mova m3, m1 |
| mova m0, [r6-32*4] |
| mova m1, [r6-32*3] |
| pxor m4, m4 |
| .pass1_end_fast: |
| vpbroadcastd m10, [pw_2048] |
| lea r6, [deint_shuf+128] |
| REPX {mova x, m4}, m5, m6, m7 |
| call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast |
| jmp .end |
| .eob107: |
| mova [rsp+32*3], m3 |
| mova [r6-32*2], m0 |
| mova [r6-32*1], m1 |
| call .pass1_main |
| cmp eobd, 171 |
| jge .eob171 |
| pshufd m12, m2, q1032 |
| pshufd m13, m3, q1032 |
| mova m4, m0 |
| mova m5, m1 |
| pxor m6, m6 |
| REPX {mova x, m6}, m7, m14, m15 |
| jmp .pass1_end |
| .eob171: |
| mova [r6+32*0], m0 |
| mova [r6+32*1], m1 |
| mova [r6+32*2], m2 |
| mova [r6+32*3], m3 |
| call .pass1_main |
| pshufd m12, [r6+32*2], q1032 ; out19 out17 |
| pshufd m13, [r6+32*3], q1032 ; out23 out21 |
| mova m4, [r6+32*0] ; out16 out18 |
| mova m5, [r6+32*1] ; out20 out22 |
| pshufd m14, m2, q1032 ; out27 out25 |
| pshufd m15, m3, q1032 ; out31 out29 |
| mova m6, m0 ; out24 out26 |
| mova m7, m1 ; out28 out30 |
| .pass1_end: |
| mova m0, [r6-32*4] ; out0 out2 |
| mova m1, [r6-32*3] ; out4 out6 |
| mova m2, [r6-32*2] ; out8 out10 |
| mova m3, [r6-32*1] ; out12 out14 |
| lea r6, [deint_shuf+128] |
| mova m11, [rsp+32*3] ; out13 out15 |
| vpbroadcastd m10, [pw_2048] |
| call m(inv_txfm_add_dct_dct_8x32_8bpc).main |
| .end: ; [rsp+0*32] = m12 |
| vpbroadcastd m12, [pw_2048] |
| mov cq, r4 |
| mova [rsp+32*1], m8 |
| mova [rsp+32*2], m9 |
| mova [rsp+32*3], m10 |
| mova [rsp+32*4], m11 |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4_start |
| vpermq m0, m2, q3120 |
| vpermq m1, m3, q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m4, q3120 |
| vpermq m1, m5, q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m6, q3120 |
| vpermq m1, m7, q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, [rsp+32*1], q3120 |
| vpermq m1, [rsp+32*2], q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, [rsp+32*3], q3120 |
| vpermq m1, [rsp+32*4], q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, [rsp+32*0], q3120 |
| vpermq m1, m13, q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m14, q3120 |
| vpermq m1, m15, q2031 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| RET |
| .dconly: |
| imul r6d, [cq], 181 |
| vpbroadcastd m2, [dconly_10bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 32 |
| add r6d, 640 |
| sar r6d, 10 |
| jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 |
| ALIGN function_align |
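| ; one pass-1 group: 8-point idct across 8 rows, then +2 rounding |
| ; and a 2-bit down-shift |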
| .pass1_main_part1: |
| mova m0, [cq+128*0] |
| mova m1, [cq+128*1] |
| mova m2, [cq+128*2] |
| mova m3, [cq+128*3] |
| mova m4, [cq+128*4] |
| mova m5, [cq+128*5] |
| mova m6, [cq+128*6] |
| mova m7, [cq+128*7] |
| call m(idct_8x8_internal_10bpc).main |
| psrld m1, m11, 10 ; pd_2 |
| REPX {paddd x, m1}, m0, m6, m5, m3 |
| paddd m1, m6, m7 ; out1 |
| psubd m6, m7 ; out6 |
| psubd m7, m0, m9 ; out7 |
| paddd m0, m9 ; out0 |
| paddd m2, m5, m4 ; out2 |
| psubd m5, m4 ; out5 |
| psubd m4, m3, m8 ; out4 |
| paddd m3, m8 ; out3 |
| REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| ret |
| ALIGN function_align |
| .pass1_main: |
| call .pass1_main_part1 |
| add cq, 32 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| pshufb m0, m14 |
| pshufb m2, m14 |
| pshufb m4, m14 |
| pshufb m6, m14 |
| punpckhdq m3, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m4, m6 |
| punpckhdq m4, m6 |
| vperm2i128 m1, m0, m2, 0x31 ; 4 6 |
| vinserti128 m0, xm2, 1 ; 0 2 |
| vinserti128 m2, m3, xm4, 1 ; 1 3 |
| vperm2i128 m3, m4, 0x31 ; 5 7 |
| ret |
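| ; the _fast variants assume the lower input half is zero, so each |
| ; initial rotation collapses from ITX_MULSUB_2D to two plain pmulld |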
| .main_oddhalf_part1_fast_rect2: |
| REPX {paddd x, m11}, m0, m1, m2, m3 |
| REPX {psrad x, 12 }, m0, m1, m2, m3 |
| .main_oddhalf_part1_fast: ; lower half zero |
| vpbroadcastd m7, [pd_4091] |
| vpbroadcastd m8, [pd_201] |
| vpbroadcastd m6, [pd_m1380] |
| vpbroadcastd m9, [pd_3857] |
| vpbroadcastd m5, [pd_3703] |
| vpbroadcastd m10, [pd_1751] |
| vpbroadcastd m4, [pd_m2751] |
| vpbroadcastd m15, [pd_3035] |
| pmulld m7, m0 |
| pmulld m0, m8 |
| pmulld m6, m1 |
| pmulld m1, m9 |
| pmulld m5, m2 |
| pmulld m2, m10 |
| pmulld m4, m3 |
| pmulld m3, m15 |
| jmp .main_oddhalf_part1_fast2 |
| .main_oddhalf_part1_rect2: |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| .main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 |
| ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a |
| ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a |
| ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a |
| ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a |
| .main_oddhalf_part1_fast2: |
| REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 |
| REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 |
| psubd m8, m0, m4 ; t17 |
| paddd m0, m4 ; t16 |
| psubd m4, m6, m2 ; t18 |
| paddd m6, m2 ; t19 |
| psubd m2, m1, m5 ; t29 |
| paddd m1, m5 ; t28 |
| psubd m5, m7, m3 ; t30 |
| paddd m7, m3 ; t31 |
| REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 |
| REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 |
| vpbroadcastd m15, [pd_4017] |
| vpbroadcastd m10, [pd_799] |
| ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a |
| ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a |
| psubd m3, m0, m6 ; t19a |
| paddd m0, m6 ; t16a |
| psubd m6, m7, m1 ; t28a |
| paddd m7, m1 ; t31a |
| psubd m1, m5, m4 ; t18 |
| paddd m5, m4 ; t17 |
| psubd m4, m8, m2 ; t29 |
| paddd m8, m2 ; t30 |
| REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 |
| REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 |
| vpbroadcastd m15, [pd_3784] |
| vpbroadcastd m10, [pd_1567] |
| ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a |
| ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 |
| mova [r6-32*4], m0 |
| mova [r6-32*3], m5 |
| mova [r6-32*2], m4 |
| mova [r6-32*1], m6 |
| mova [r6+32*0], m3 |
| mova [r6+32*1], m1 |
| mova [r6+32*2], m8 |
| mova [r6+32*3], m7 |
| ret |
| .main_oddhalf_part2_fast_rect2: |
| REPX {paddd x, m11}, m0, m1, m2, m3 |
| REPX {psrad x, 12 }, m0, m1, m2, m3 |
| .main_oddhalf_part2_fast: ; lower half zero |
| vpbroadcastd m7, [pd_m601] |
| vpbroadcastd m8, [pd_4052] |
| vpbroadcastd m6, [pd_3973] |
| vpbroadcastd m9, [pd_995] |
| vpbroadcastd m5, [pd_m2106] |
| vpbroadcastd m10, [pd_3513] |
| vpbroadcastd m4, [pd_3290] |
| vpbroadcastd m15, [pd_2440] |
| pmulld m7, m0 |
| pmulld m0, m8 |
| pmulld m6, m1 |
| pmulld m1, m9 |
| pmulld m5, m2 |
| pmulld m2, m10 |
| pmulld m4, m3 |
| pmulld m3, m15 |
| jmp .main_oddhalf_part2_fast2 |
| .main_oddhalf_part2_rect2: |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| .main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 |
| ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a |
| ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a |
| ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a |
| ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a |
| .main_oddhalf_part2_fast2: |
| REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 |
| REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 |
| psubd m8, m0, m4 ; t25 |
| paddd m0, m4 ; t24 |
| psubd m4, m6, m2 ; t26 |
| paddd m6, m2 ; t27 |
| psubd m2, m1, m5 ; t21 |
| paddd m1, m5 ; t20 |
| psubd m5, m7, m3 ; t22 |
| paddd m7, m3 ; t23 |
| REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 |
| REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 |
| vpbroadcastd m15, [pd_2276] |
| vpbroadcastd m10, [pd_3406] |
| ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a |
| ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a |
| psubd m3, m0, m6 ; t27a |
| paddd m0, m6 ; t24a |
| psubd m6, m7, m1 ; t20a |
| paddd m7, m1 ; t23a |
| psubd m1, m5, m4 ; t21 |
| paddd m5, m4 ; t22 |
| psubd m4, m8, m2 ; t26 |
| paddd m8, m2 ; t25 |
| REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 |
| REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 |
| vpbroadcastd m15, [pd_3784] |
| vpbroadcastd m10, [pd_1567] |
| ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a |
| ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20 |
| mova m9, [r6-32*4] ; t16a |
| mova m10, [r6-32*3] ; t17 |
| psubd m2, m9, m7 ; t23 |
| paddd m9, m7 ; t16 |
| psubd m7, m10, m5 ; t22a |
| paddd m10, m5 ; t17a |
| REPX {pmaxsd x, m12}, m9, m10, m2, m7 |
| REPX {pminsd x, m13}, m9, m10, m2, m7 |
| mova [r6-32*4], m9 |
| mova [r6-32*3], m10 |
| mova m9, [r6-32*2] ; t18a |
| mova m10, [r6-32*1] ; t19 |
| psubd m5, m9, m1 ; t21 |
| paddd m9, m1 ; t18 |
| psubd m1, m10, m6 ; t20a |
| paddd m10, m6 ; t19a |
| REPX {pmaxsd x, m12}, m9, m10, m5, m1 |
| REPX {pminsd x, m13}, m9, m10, m5, m1 |
| mova [r6-32*2], m9 |
| mova [r6-32*1], m10 |
| mova m9, [r6+32*0] ; t28 |
| mova m10, [r6+32*1] ; t29a |
| psubd m6, m9, m3 ; t27a |
| paddd m9, m3 ; t28a |
| psubd m3, m10, m4 ; t26 |
| paddd m10, m4 ; t29 |
| REPX {pmaxsd x, m12}, m9, m10, m6, m3 |
| REPX {pminsd x, m13}, m9, m10, m6, m3 |
| REPX {pmulld x, m14}, m6, m3, m1, m5 |
| paddd m6, m11 |
| paddd m3, m11 |
| psubd m4, m6, m1 ; t20 |
| paddd m6, m1 ; t27 |
| psubd m1, m3, m5 ; t21a |
| paddd m3, m5 ; t26a |
| REPX {psrad x, 12 }, m4, m1, m3, m6 |
| mova [r6+32*0], m4 |
| mova [r6+32*1], m1 |
| mova m4, [r6+32*2] ; t30 |
| mova m1, [r6+32*3] ; t31a |
| psubd m5, m4, m8 ; t25a |
| paddd m4, m8 ; t30a |
| psubd m8, m1, m0 ; t24 |
| paddd m1, m0 ; t31 |
| REPX {pmaxsd x, m12}, m8, m5, m4, m1 |
| REPX {pminsd x, m13}, m8, m5, m4, m1 |
| REPX {pmulld x, m14}, m5, m8, m7, m2 |
| paddd m5, m11 |
| paddd m8, m11 |
| psubd m0, m5, m7 ; t22 |
| paddd m5, m7 ; t25 |
| psubd m7, m8, m2 ; t23a |
| paddd m2, m8 ; t24a |
| REPX {psrad x, 12 }, m0, m7, m2, m5 |
| mova [r6+32*2], m0 |
| mova [r6+32*3], m7 |
| mov r4, r6 |
| add r6, 32*8 |
| mova [r6-32*4], m2 |
| mova [r6-32*3], m5 |
| mova [r6-32*2], m3 |
| mova [r6-32*1], m6 |
| mova [r6+32*0], m9 |
| mova [r6+32*1], m10 |
| mova [r6+32*2], m4 |
| mova [r6+32*3], m1 |
| mov r5, r6 |
| add r6, 32*8 |
| ret |
| ALIGN function_align |
| .main_end: |
| psrld m11, 10 ; pd_2 |
| IDCT32_END 0, 15, 8, 9, 10, 2 |
| IDCT32_END 1, 14, 8, 9, 10, 2 |
| punpckhwd m8, m0, m1 ; 16 17 |
| punpcklwd m0, m1 ; 0 1 |
| punpcklwd m1, m14, m15 ; 14 15 |
| punpckhwd m14, m15 ; 30 31 |
| mova [r5+32*3], m8 |
| mova [r5+32*2], m14 |
| IDCT32_END 2, 15, 8, 9, 10, 2 |
| IDCT32_END 3, 14, 8, 9, 10, 2 |
| punpckhwd m8, m2, m3 ; 18 19 |
| punpcklwd m2, m3 ; 2 3 |
| punpcklwd m3, m14, m15 ; 12 13 |
| punpckhwd m14, m15 ; 28 29 |
| mova [r5+32*1], m8 |
| mova [r5+32*0], m14 |
| IDCT32_END 4, 15, 8, 9, 10, 2 |
| IDCT32_END 5, 14, 8, 9, 10, 2 |
| punpckhwd m8, m4, m5 ; 20 21 |
| punpcklwd m4, m5 ; 4 5 |
| punpcklwd m5, m14, m15 ; 10 11 |
| punpckhwd m14, m15 ; 26 27 |
| mova [r5-32*1], m8 |
| mova [r5-32*2], m14 |
| IDCT32_END 6, 15, 8, 9, 10, 2 |
| IDCT32_END 7, 14, 8, 9, 10, 2 |
| punpckhwd m8, m6, m7 ; 22 23 |
| punpcklwd m6, m7 ; 6 7 |
| punpcklwd m7, m14, m15 ; 8 9 |
| punpckhwd m14, m15 ; 24 25 |
| mova [r5-32*3], m8 |
| mova [r5-32*4], m14 |
| .transpose: |
| punpckhdq m15, m3, m1 |
| punpckldq m3, m1 |
| punpckhdq m1, m4, m6 |
| punpckldq m4, m6 |
| punpckhdq m6, m0, m2 |
| punpckldq m0, m2 |
| punpckhdq m2, m7, m5 |
| punpckldq m7, m5 |
| punpcklqdq m5, m2, m15 |
| punpckhqdq m2, m15 |
| punpckhqdq m15, m7, m3 |
| punpcklqdq m7, m3 |
| punpckhqdq m3, m6, m1 |
| punpcklqdq m6, m1 |
| punpckhqdq m1, m0, m4 |
| punpcklqdq m0, m4 |
| vperm2i128 m4, m0, m7, 0x31 |
| vinserti128 m0, xm7, 1 |
| vperm2i128 m7, m3, m2, 0x31 |
| vinserti128 m3, xm2, 1 |
| vinserti128 m2, m6, xm5, 1 |
| vperm2i128 m6, m5, 0x31 |
| vperm2i128 m5, m1, m15, 0x31 |
| vinserti128 m1, xm15, 1 |
| ret |
| |
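| ; Identity-identity 8x32: both passes are identity transforms, so the |
| ; work reduces to a fixed scale, a 16-bit transpose and adding into dst. |
| ; Each .loop iteration consumes one 8x8 group of 64 coefficients, hence |
| ; the rounding of eob up to a multiple of 64 below. |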
| cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob |
| vpbroadcastd m7, [pixel_10bpc_max] |
| .pass1: |
| vpbroadcastd m5, [pw_5] |
| pxor m6, m6 |
| mov r6d, eobd |
| add eobb, 21 ; round eob up to the loop granularity of 64 |
| cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 (keep eob on byte carry) |
| lea r6, [strideq*3] |
| lea r5, [strideq*5] |
| lea r4, [strideq+r6*2] ; strideq*7 |
| .loop: |
| mova m0, [cq+128*0] |
| packssdw m0, [cq+128*1] |
| mova m1, [cq+128*2] |
| packssdw m1, [cq+128*3] |
| mova m2, [cq+128*4] |
| packssdw m2, [cq+128*5] |
| mova m3, [cq+128*6] |
| packssdw m3, [cq+128*7] |
| REPX {paddsw x, m5}, m0, m1, m2, m3 |
| REPX {psraw x, 3 }, m0, m1, m2, m3 |
| call .main_zero |
| add cq, 32 |
| lea dstq, [dstq+strideq*8] |
| sub eobd, 64 |
| jge .loop |
| RET |
| ALIGN function_align |
| .main_zero: |
| REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 |
| .main: |
| punpckhwd m4, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| punpckhwd m3, m0, m4 |
| punpcklwd m0, m4 |
| punpckhwd m4, m2, m1 |
| punpcklwd m2, m1 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| mova xm4, [dstq+strideq*0] |
| vinserti128 m4, [dstq+strideq*4], 1 |
| paddw m0, m4 |
| mova xm4, [dstq+strideq*1] |
| vinserti128 m4, [dstq+r5 ], 1 |
| paddw m1, m4 |
| mova xm4, [dstq+strideq*2] |
| vinserti128 m4, [dstq+r6*2 ], 1 |
| paddw m2, m4 |
| mova xm4, [dstq+r6 ] |
| vinserti128 m4, [dstq+r4 ], 1 |
| paddw m3, m4 |
| REPX {pmaxsw x, m6}, m0, m1, m2, m3 |
| REPX {pminsw x, m7}, m0, m1, m2, m3 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*4], m0, 1 |
| mova [dstq+strideq*1], xm1 |
| vextracti128 [dstq+r5 ], m1, 1 |
| mova [dstq+strideq*2], xm2 |
| vextracti128 [dstq+r6*2 ], m2, 1 |
| mova [dstq+r6 ], xm3 |
| vextracti128 [dstq+r4 ], m3, 1 |
| ret |
| |
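| ; 12bpc 8x32: pass 1 runs with the wider 20-bit intermediate clipping; |
| ; pass 2 reloads clip_18b_min/max once the range fits in 18 bits again. |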
| cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| mov r4, cq |
| lea r6, [rsp+32*4] |
| call .pass1_main |
| cmp eobd, 43 |
| jge .eob43 |
| jmp .pass2_fast |
| .eob43: |
| call .pass1_main |
| cmp eobd, 107 |
| jge .eob107 |
| .pass2_fast: |
| mov cq, r4 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| pmaxsd m0, m12, [cq+128*1+ 0] |
| pmaxsd m1, m12, [cq+128*7+ 0] |
| pmaxsd m2, m12, [cq+128*1+32] |
| pmaxsd m3, m12, [cq+128*7+32] |
| REPX {pminsd x, m13}, m0, m1, m2, m3 |
| vpbroadcastd m14, [pd_2896] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast |
| pmaxsd m0, m12, [cq+128*3+ 0] |
| pmaxsd m1, m12, [cq+128*5+ 0] |
| pmaxsd m2, m12, [cq+128*3+32] |
| pmaxsd m3, m12, [cq+128*5+32] |
| REPX {pminsd x, m13}, m0, m1, m2, m3 |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast |
| pmaxsd m0, m12, [cq+128*2+ 0] |
| pmaxsd m1, m12, [cq+128*6+ 0] |
| pmaxsd m2, m12, [cq+128*2+32] |
| pmaxsd m3, m12, [cq+128*6+32] |
| REPX {pminsd x, m13}, m0, m1, m2, m3 |
| call m(idct_8x16_internal_10bpc).main_oddhalf_fast |
| pmaxsd m0, m12, [cq+128*0+ 0] |
| pmaxsd m1, m12, [cq+128*4+ 0] |
| pmaxsd m2, m12, [cq+128*0+32] |
| pmaxsd m3, m12, [cq+128*4+32] |
| REPX {pminsd x, m13}, m0, m1, m2, m3 |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| jmp .pass2_end |
| .eob107: |
| call .pass1_main |
| cmp eobd, 171 |
| jge .eob171 |
| jmp .pass2 |
| .eob171: |
| call .pass1_main |
| .pass2: |
| mov cq, r4 |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| pmaxsd m0, m12, [cq+128*1+ 0] |
| pmaxsd m1, m12, [cq+128*7+ 0] |
| pmaxsd m2, m12, [cq+128*1+32] |
| pmaxsd m3, m12, [cq+128*7+32] |
| pmaxsd m4, m12, [cq+128*1+64] |
| pmaxsd m5, m12, [cq+128*7+64] |
| pmaxsd m6, m12, [cq+128*1+96] |
| pmaxsd m7, m12, [cq+128*7+96] |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| vpbroadcastd m14, [pd_2896] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 |
| pmaxsd m0, m12, [cq+128*3+ 0] |
| pmaxsd m1, m12, [cq+128*5+ 0] |
| pmaxsd m2, m12, [cq+128*3+32] |
| pmaxsd m3, m12, [cq+128*5+32] |
| pmaxsd m4, m12, [cq+128*3+64] |
| pmaxsd m5, m12, [cq+128*5+64] |
| pmaxsd m6, m12, [cq+128*3+96] |
| pmaxsd m7, m12, [cq+128*5+96] |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 |
| pmaxsd m0, m12, [cq+128*2+ 0] |
| pmaxsd m1, m12, [cq+128*6+ 0] |
| pmaxsd m2, m12, [cq+128*2+32] |
| pmaxsd m3, m12, [cq+128*6+32] |
| pmaxsd m4, m12, [cq+128*2+64] |
| pmaxsd m5, m12, [cq+128*6+64] |
| pmaxsd m6, m12, [cq+128*2+96] |
| pmaxsd m7, m12, [cq+128*6+96] |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_8x16_internal_10bpc).main_oddhalf |
| pmaxsd m0, m12, [cq+128*0+ 0] |
| pmaxsd m1, m12, [cq+128*4+ 0] |
| pmaxsd m2, m12, [cq+128*0+32] |
| pmaxsd m3, m12, [cq+128*4+32] |
| pmaxsd m4, m12, [cq+128*0+64] |
| pmaxsd m5, m12, [cq+128*4+64] |
| pmaxsd m6, m12, [cq+128*0+96] |
| pmaxsd m7, m12, [cq+128*4+96] |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| .pass2_end: |
| psrld m11, 8 ; pd_8 |
| IDCT32_END 0, 15, 8, 9, 10, 4 |
| IDCT32_END 1, 14, 8, 9, 10, 4 |
| punpckhqdq m8, m0, m1 ; 16 17 (interleaved) |
| punpcklqdq m0, m1 ; 0 1 (interleaved) |
| punpcklqdq m1, m14, m15 ; 14 15 (interleaved) |
| punpckhqdq m14, m15 ; 30 31 (interleaved) |
| mova [r5+32*3], m8 |
| mova [r5+32*2], m14 |
| IDCT32_END 2, 15, 8, 9, 10, 4 |
| IDCT32_END 3, 14, 8, 9, 10, 4 |
| punpckhqdq m8, m2, m3 ; 18 19 (interleaved) |
| punpcklqdq m2, m3 ; 2 3 (interleaved) |
| punpcklqdq m3, m14, m15 ; 12 13 (interleaved) |
| punpckhqdq m14, m15 ; 28 29 (interleaved) |
| mova [r5+32*1], m8 |
| mova [r5+32*0], m14 |
| IDCT32_END 4, 15, 8, 9, 10, 4 |
| IDCT32_END 5, 14, 8, 9, 10, 4 |
| punpckhqdq m8, m4, m5 ; 20 21 (interleaved) |
| punpcklqdq m4, m5 ; 4 5 (interleaved) |
| punpcklqdq m5, m14, m15 ; 10 11 (interleaved) |
| punpckhqdq m14, m15 ; 26 27 (interleaved) |
| mova [r5-32*1], m8 |
| mova [r5-32*2], m14 |
| IDCT32_END 6, 15, 8, 9, 10, 4 |
| IDCT32_END 7, 14, 8, 9, 10, 4 |
| punpckhqdq m8, m6, m7 ; 22 23 (interleaved) |
| punpcklqdq m6, m7 ; 6 7 (interleaved) |
| punpcklqdq m7, m14, m15 ; 8 9 (interleaved) |
| punpckhqdq m14, m15 ; 24 25 (interleaved) |
| mova [r5-32*3], m8 |
| mova [r5-32*4], m14 |
| mova m15, m1 |
| .end: |
| vpermq m0, m0, q3120 |
| vpermq m1, m2, q3120 |
| call m(idct_8x8_internal_12bpc).write_8x4_start |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m4, q3120 |
| vpermq m1, m6, q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m7, q3120 |
| vpermq m1, m5, q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, m3, q3120 |
| vpermq m1, m15, q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, [r5+32*3], q3120 |
| vpermq m1, [r5+32*1], q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, [r5-32*1], q3120 |
| vpermq m1, [r5-32*3], q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, [r5-32*4], q3120 |
| vpermq m1, [r5-32*2], q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| vpermq m0, [r5+32*0], q3120 |
| vpermq m1, [r5+32*2], q3120 |
| call m(idct_8x8_internal_10bpc).write_8x4 |
| RET |
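| ; dc-only shortcut: with a single DC coefficient the output is one flat |
| ; block. (dc*181 + 640) >> 10 equals ((dc*181 + 128) >> 8 + 2) >> 2, |
| ; folding a rounded 1/sqrt(2) gain and the pass-1 (x+2)>>2 rounding into |
| ; one multiply-shift; the shared dconly3 tail handles the final rounding |
| ; and the add/clip loop. |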
| .dconly: |
| imul r6d, [cq], 181 |
| vpbroadcastd m2, [dconly_12bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 32 |
| add r6d, 640 |
| sar r6d, 10 |
| jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 |
| ALIGN function_align |
| .pass1_main: |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1 |
| TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 |
| mova [cq+128*0], m0 |
| mova [cq+128*1], m1 |
| mova [cq+128*2], m2 |
| mova [cq+128*3], m3 |
| mova [cq+128*4], m4 |
| mova [cq+128*5], m5 |
| mova [cq+128*6], m6 |
| mova [cq+128*7], m7 |
| add cq, 32 |
| ret |
| ALIGN function_align |
| .main_end: |
| psrld m11, 10 ; pd_2 |
| IDCT32_END 0, 15, 8, 9, 10, 2, 0 |
| mova [cq+32*16], m8 |
| mova [cq+32*31], m9 |
| IDCT32_END 1, 14, 8, 9, 10, 2, 0 |
| mova [cq+32*17], m8 |
| mova [cq+32*30], m9 |
| mova [cq+32*14], m14 |
| IDCT32_END 2, 14, 8, 9, 10, 2, 0 |
| mova [cq+32*18], m8 |
| mova [cq+32*29], m9 |
| mova [cq+32*13], m14 |
| IDCT32_END 3, 14, 8, 9, 10, 2, 0 |
| mova [cq+32*19], m8 |
| mova [cq+32*28], m9 |
| mova [cq+32*12], m14 |
| IDCT32_END 4, 14, 8, 9, 10, 2, 0 |
| mova [cq+32*20], m8 |
| mova [cq+32*27], m9 |
| mova [cq+32* 0], m0 |
| mova [cq+32* 1], m1 |
| mova [cq+32* 2], m2 |
| IDCT32_END 5, 10, 0, 1, 2, 2, 0 |
| mova [cq+32*21], m0 |
| mova [cq+32*26], m1 |
| IDCT32_END 6, 9, 0, 1, 2, 2, 0 |
| mova [cq+32*22], m0 |
| mova [cq+32*25], m1 |
| IDCT32_END 7, 8, 0, 1, 2, 2, 0 |
| mova [cq+32*23], m0 |
| mova [cq+32*24], m1 |
| mova m0, [cq+32* 0] |
| mova m1, [cq+32* 1] |
| mova m2, [cq+32* 2] |
| mova m11, m14 |
| mova m12, [cq+32*12] |
| mova m13, [cq+32*13] |
| mova m14, [cq+32*14] |
| ret |
| |
| cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob |
| vpbroadcastd m7, [pixel_12bpc_max] |
| jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1 |
| |
| cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jnz .full |
| imul r6d, [cq], 181 |
| vpbroadcastd m3, [dconly_10bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 8 |
| .dconly: |
| add r6d, 640 |
| sar r6d, 10 |
| .dconly2: |
| imul r6d, 181 |
| add r6d, 2176 ; 128 + 8*256: rounding for both shifts |
| sar r6d, 12 ; == ((dc*181 + 128) >> 8 + 8) >> 4 |
| movd xm0, r6d |
| paddsw xm0, xm3 ; add clip bias to dc |
| vpbroadcastw m0, xm0 |
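| ; the dconly bias added above lets the loop clip with two saturating |
| ; ops: paddsw caps the sum at 0x7fff and psubusw both removes the bias |
| ; and clamps the result to [0, pixel_max], with no pmaxsw/pminsw needed. |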
| .dconly_loop: |
| paddsw m1, m0, [dstq+32*0] |
| paddsw m2, m0, [dstq+32*1] |
| psubusw m1, m3 |
| psubusw m2, m3 |
| mova [dstq+32*0], m1 |
| mova [dstq+32*1], m2 |
| add dstq, strideq |
| dec r3d |
| jg .dconly_loop |
| RET |
| .full: |
| PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob |
| lea r6, [rsp+32*4] |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| call .pass1 |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end |
| lea r6, [deint_shuf+128] |
| vpbroadcastd m11, [pw_2048] |
| mov r4, dstq |
| call .pass2 |
| mova m0, [r5+32*3] ; 16 17 |
| mova m1, [r5+32*2] ; 30 31 |
| mova m2, [r5+32*1] ; 18 19 |
| mova m3, [r5+32*0] ; 28 29 |
| mova m4, [r5-32*1] ; 20 21 |
| mova m5, [r5-32*2] ; 26 27 |
| mova m6, [r5-32*3] ; 22 23 |
| mova m7, [r5-32*4] ; 24 25 |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose |
| lea dstq, [r4+32] |
| call .pass2 |
| RET |
| ALIGN function_align |
| .pass2: |
| call m(idct_16x8_internal_8bpc).main |
| REPX {pmulhrsw x, m11}, m0, m1, m2, m3 |
| call m(idct_16x8_internal_10bpc).write_16x4_start |
| pmulhrsw m0, m11, m4 |
| pmulhrsw m1, m11, m5 |
| pmulhrsw m2, m11, m6 |
| pmulhrsw m3, m11, m7 |
| jmp m(idct_16x8_internal_10bpc).write_16x4_zero |
| ALIGN function_align |
| .pass1: |
| mova m0, [cq+32* 1] |
| mova m1, [cq+32* 7] |
| mova m2, [cq+32* 9] |
| mova m3, [cq+32*15] |
| mova m4, [cq+32*17] |
| mova m5, [cq+32*23] |
| mova m6, [cq+32*25] |
| mova m7, [cq+32*31] |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m14, [pd_2896] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 |
| mova m0, [cq+32* 3] |
| mova m1, [cq+32* 5] |
| mova m2, [cq+32*11] |
| mova m3, [cq+32*13] |
| mova m4, [cq+32*19] |
| mova m5, [cq+32*21] |
| mova m6, [cq+32*27] |
| mova m7, [cq+32*29] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 |
| mova m0, [cq+32* 2] |
| mova m1, [cq+32* 6] |
| mova m2, [cq+32*10] |
| mova m3, [cq+32*14] |
| mova m4, [cq+32*18] |
| mova m5, [cq+32*22] |
| mova m6, [cq+32*26] |
| mova m7, [cq+32*30] |
| call m(idct_8x16_internal_10bpc).main_oddhalf |
| mova m0, [cq+32* 0] |
| mova m1, [cq+32* 4] |
| mova m2, [cq+32* 8] |
| mova m3, [cq+32*12] |
| mova m4, [cq+32*16] |
| mova m5, [cq+32*20] |
| mova m6, [cq+32*24] |
| mova m7, [cq+32*28] |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| ret |
| |
| cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob |
| vpbroadcastd m7, [pixel_10bpc_max] |
| .pass1: |
| vpbroadcastd m5, [pw_4096] |
| pxor m6, m6 |
| mov r6d, eobd |
| add eobb, 21 |
| cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 |
| lea r6, [strideq*3] |
| lea r5, [strideq*5] |
| lea r4, [strideq+r6*2] ; strideq*7 |
| .loop: |
| mova m0, [cq+32*0] |
| packssdw m0, [cq+32*1] |
| mova m1, [cq+32*2] |
| packssdw m1, [cq+32*3] |
| REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 |
| add cq, 32*8 |
| mova m2, [cq-32*4] |
| packssdw m2, [cq-32*3] |
| mova m3, [cq-32*2] |
| packssdw m3, [cq-32*1] |
| REPX {pmulhrsw x, m5}, m0, m1, m2, m3 |
| REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 |
| call m(inv_txfm_add_identity_identity_8x32_10bpc).main |
| add dstq, 16 |
| sub eobd, 64 |
| jge .loop |
| RET |
| |
| cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jnz .full |
| imul r6d, [cq], 181 |
| vpbroadcastd m3, [dconly_12bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 8 |
| jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly |
| .full: |
| PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob |
| lea r6, [rsp+32*4] |
| vpbroadcastd m12, [clip_20b_min] |
| vpbroadcastd m13, [clip_20b_max] |
| call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 |
| call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end |
| mov r4, dstq |
| call m(idct_16x8_internal_12bpc).pass2_main |
| mova m0, [cq+32* 0] ; 16 |
| mova m1, [cq+32* 1] ; 17 |
| mova m2, [cq+32* 2] ; 18 |
| mova m3, [cq+32* 3] ; 19 |
| mova m4, [cq+32* 4] ; 20 |
| mova m5, [cq+32* 5] ; 21 |
| mova m6, [cq+32* 6] ; 22 |
| mova m7, [cq+32* 7] ; 23 |
| mova m8, [cq+32* 8] ; 24 |
| mova m9, [cq+32* 9] ; 25 |
| mova m10, [cq+32*10] ; 26 |
| mova m11, [cq+32*11] ; 27 |
| mova m12, [cq+32*12] ; 28 |
| mova m13, [cq+32*13] ; 29 |
| mova m14, [cq+32*14] ; 30 |
| mova m15, [cq+32*15] ; 31 |
| lea dstq, [r4+32] |
| call m(idct_16x8_internal_12bpc).pass2_main |
| RET |
| |
| cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob |
| vpbroadcastd m7, [pixel_12bpc_max] |
| jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 |
| |
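| ; Adds the stored half at [%2] to coefficient m%1 to form the symmetric |
| ; sum/difference pair of idct32 output rows, rounds each with pmulhrsw |
| ; pw_2048 (i.e. (x+8)>>4), adds into dst/r2 and clips to [0, pixel_max]. |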
| %macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] |
| mova m%4, [%2] |
| paddsw m%3, m%1, m%4 |
| psubsw m%1, m%4 |
| %if %1 == 0 |
| pxor m6, m6 |
| %endif |
| pmulhrsw m%3, m15 |
| pmulhrsw m%1, m15 |
| paddw m%3, [dstq+%5] |
| paddw m%1, [r2+%6] |
| pmaxsw m%3, m6 |
| pmaxsw m%1, m6 |
| pminsw m%3, m7 |
| pminsw m%1, m7 |
| mova [dstq+%5], m%3 |
| mova [r2+%6], m%1 |
| %endmacro |
| |
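| ; 16x32: pass 1 (.main) handles the coefficients in slices; the eob |
| ; thresholds (44, 151, 279) select how many slices are non-zero, and |
| ; everything above the last live slice is zero-filled instead of |
| ; transformed. |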
| cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| vpbroadcastd m14, [pd_2896] |
| lea r6, [rsp+32*16] |
| lea r4, [r6+32*8] |
| lea r5, [r6+32*16] |
| call .main |
| sub eobd, 44 |
| jge .eob44 |
| vperm2i128 m2, m0, m3, 0x31 ; 5 |
| vinserti128 m0, xm3, 1 ; 1 |
| vperm2i128 m3, m1, m4, 0x31 ; 7 |
| vinserti128 m1, xm4, 1 ; 3 |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 |
| jmp .fast |
| .dconly: |
| imul r6d, [cq], 181 |
| vpbroadcastd m3, [dconly_10bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 32 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 |
| .eob44: |
| mova [r4+16*0], xm0 |
| mova [r4+16*1], xm3 |
| mova [r4+16*2], xm1 |
| mova [r4+16*3], xm4 |
| vextracti128 [r4+16*4], m0, 1 |
| vextracti128 [r4+16*5], m3, 1 |
| vextracti128 [r4+16*6], m1, 1 |
| vextracti128 [r4+16*7], m4, 1 |
| call .main |
| sub eobd, 107 |
| jge .eob151 |
| vperm2i128 m7, m1, m4, 0x31 ; 15 |
| vinserti128 m5, m1, xm4, 1 ; 11 |
| vperm2i128 m6, m0, m3, 0x31 ; 13 |
| vinserti128 m4, m0, xm3, 1 ; 9 |
| mova m0, [r4+32*0] |
| mova m1, [r4+32*1] |
| mova m2, [r4+32*2] |
| mova m3, [r4+32*3] |
| .fast: |
| lea r6, [pw_5+128] |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 |
| jmp .idct16 |
| .eob151: |
| mova [r4-16*8], xm0 |
| mova [r4-16*7], xm3 |
| mova [r4-16*6], xm1 |
| mova [r4-16*5], xm4 |
| vextracti128 [r4-16*4], m0, 1 |
| vextracti128 [r4-16*3], m3, 1 |
| vextracti128 [r4-16*2], m1, 1 |
| vextracti128 [r4-16*1], m4, 1 |
| call .main |
| sub eobd, 128 |
| jge .eob279 |
| vperm2i128 m10, m0, m3, 0x31 ; 21 |
| vinserti128 m8, m0, xm3, 1 ; 17 |
| vperm2i128 m11, m1, m4, 0x31 ; 23 |
| vinserti128 m9, m1, xm4, 1 ; 19 |
| pxor m12, m12 |
| REPX {mova x, m12}, m13, m14, m15 |
| REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 |
| jmp .full |
| .eob279: |
| mova [r5+16*0], xm0 |
| mova [r5+16*1], xm3 |
| mova [r5+16*2], xm1 |
| mova [r5+16*3], xm4 |
| vextracti128 [r5+16*4], m0, 1 |
| vextracti128 [r5+16*5], m3, 1 |
| vextracti128 [r5+16*6], m1, 1 |
| vextracti128 [r5+16*7], m4, 1 |
| call .main |
| vperm2i128 m14, m0, m3, 0x31 ; 29 |
| vinserti128 m12, m0, xm3, 1 ; 25 |
| vperm2i128 m15, m1, m4, 0x31 ; 31 |
| vinserti128 m13, m1, xm4, 1 ; 27 |
| mova m8, [r5+32*0] |
| mova m9, [r5+32*1] |
| mova m10, [r5+32*2] |
| mova m11, [r5+32*3] |
| .full: |
| mova m0, [r4+32*0] |
| mova m1, [r4+32*1] |
| mova m2, [r4+32*2] |
| mova m3, [r4+32*3] |
| mova m4, [r4-32*4] |
| mova m5, [r4-32*3] |
| mova m6, [r4-32*2] |
| mova m7, [r4-32*1] |
| lea r6, [pw_5+128] |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf |
| lea r3, [rsp+32*8] |
| mova m8, [r3+32*0] |
| mova m9, [r3+32*1] |
| mova m10, [r3+32*2] |
| mova m11, [r3+32*3] |
| mova m12, [r3-32*4] |
| mova m13, [r3-32*3] |
| mova m14, [r3-32*2] |
| mova m15, [r3-32*1] |
| .idct16: |
| lea r3, [rsp+32*16] |
| mova m0, [r3+32*0] |
| mova m1, [r3+32*1] |
| mova m2, [r3+32*2] |
| mova m3, [r3+32*3] |
| mova m4, [r3-32*4] |
| mova m5, [r3-32*3] |
| mova m6, [r3-32*2] |
| mova m7, [r3-32*1] |
| mova [rsp], m15 |
| call m(idct_16x16_internal_8bpc).main |
| imul r2, strideq, 19 |
| lea r3, [strideq*3] |
| add r2, dstq |
| call .pass2_end |
| RET |
| ALIGN function_align |
| .main: |
| pmulld m0, m14, [cq+128* 1] |
| pmulld m1, m14, [cq+128* 3] |
| pmulld m2, m14, [cq+128* 5] |
| pmulld m3, m14, [cq+128* 7] |
| pmulld m4, m14, [cq+128* 9] |
| pmulld m5, m14, [cq+128*11] |
| pmulld m6, m14, [cq+128*13] |
| pmulld m7, m14, [cq+128*15] |
| call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 |
| pmulld m0, m14, [cq+128* 0] |
| pmulld m1, m14, [cq+128* 2] |
| pmulld m2, m14, [cq+128* 4] |
| pmulld m3, m14, [cq+128* 6] |
| pmulld m4, m14, [cq+128* 8] |
| pmulld m5, m14, [cq+128*10] |
| pmulld m6, m14, [cq+128*12] |
| pmulld m7, m14, [cq+128*14] |
| call m(idct_8x8_internal_10bpc).main_rect2 |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| psrld m15, m11, 11 ; pd_1 |
| mova m8, [r6-32*4] |
| mova m9, [r6-32*3] |
| REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 |
| psubd m10, m0, m8 ; out15 |
| paddd m0, m8 ; out0 |
| mova m8, [r6-32*2] |
| paddd m15, m1, m9 ; out1 |
| psubd m1, m9 ; out14 |
| mova m9, [r6-32*1] |
| REPX {psrad x, 1}, m0, m15, m10, m1 |
| packssdw m0, m15 |
| packssdw m1, m10 |
| psubd m10, m2, m8 ; out13 |
| paddd m2, m8 ; out2 |
| mova m8, [r6+32*0] |
| paddd m15, m3, m9 ; out3 |
| psubd m3, m9 ; out12 |
| mova m9, [r6+32*1] |
| REPX {psrad x, 1}, m2, m15, m10, m3 |
| packssdw m2, m15 |
| packssdw m3, m10 |
| psubd m10, m4, m8 ; out11 |
| paddd m4, m8 ; out4 |
| mova m8, [r6+32*2] |
| paddd m15, m5, m9 ; out5 |
| psubd m5, m9 ; out10 |
| mova m9, [r6+32*3] |
| REPX {psrad x, 1}, m4, m10, m15, m5 |
| packssdw m4, m15 |
| packssdw m5, m10 |
| psubd m10, m6, m8 ; out9 |
| paddd m6, m8 ; out6 |
| paddd m15, m7, m9 ; out7 |
| psubd m7, m9 ; out8 |
| REPX {psrad x, 1}, m6, m10, m15, m7 |
| packssdw m6, m15 |
| packssdw m7, m10 |
| punpckhwd m8, m0, m2 |
| punpcklwd m0, m2 |
| punpckhwd m2, m3, m1 |
| punpcklwd m3, m1 |
| punpckhwd m1, m4, m6 |
| punpcklwd m4, m6 |
| punpcklwd m6, m7, m5 |
| punpckhwd m7, m5 |
| pxor m5, m5 |
| mov r7d, 128*13 |
| .main_zero_loop: |
| mova [cq+r7-128*1], m5 |
| mova [cq+r7+128*0], m5 |
| mova [cq+r7+128*1], m5 |
| mova [cq+r7+128*2], m5 |
| sub r7d, 128*4 |
| jg .main_zero_loop |
| add cq, 32 |
| punpcklwd m5, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m2, m4, m1 |
| punpckhwd m4, m1 |
| punpckhwd m1, m0, m8 |
| punpcklwd m0, m8 |
| punpckhwd m8, m6, m7 |
| punpcklwd m6, m7 |
| punpcklqdq m7, m1, m4 |
| punpckhqdq m1, m4 |
| punpckhqdq m4, m8, m3 |
| punpcklqdq m8, m3 |
| punpckhqdq m3, m6, m5 |
| punpcklqdq m6, m5 |
| punpcklqdq m5, m0, m2 |
| punpckhqdq m0, m2 |
| mova [r6+16*0], xm5 |
| mova [r6+16*1], xm6 |
| mova [r6+16*2], xm7 |
| mova [r6+16*3], xm8 |
| vextracti128 [r6+16*4], m5, 1 |
| vextracti128 [r6+16*5], m6, 1 |
| vextracti128 [r6+16*6], m7, 1 |
| vextracti128 [r6+16*7], m8, 1 |
| sub r6, 32*4 |
| ret |
| ALIGN function_align |
| .pass2_end: |
| mova [rsp+gprsize+32*0], m6 |
| mova [rsp+gprsize+32*2], m7 |
| mova [rsp+gprsize+32*3], m15 |
| vpbroadcastd m15, [pw_2048] |
| vpbroadcastd m7, [pixel_10bpc_max] |
| IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 |
| IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 |
| IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 |
| IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 |
| add dstq, strideq |
| sub r2, strideq |
| mova m1, [rsp+gprsize+32*1] |
| IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 |
| IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 |
| IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 |
| IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0 |
| add dstq, strideq |
| sub r2, strideq |
| mova m1, [rsp+gprsize+32*0] |
| IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 |
| IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 |
| IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 |
| IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0 |
| add dstq, strideq |
| sub r2, strideq |
| mova m1, [rsp+gprsize+32*2] |
| mova m2, [rsp+gprsize+32*3] |
| IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4 |
| IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8 |
| IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4 |
| IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 |
| ret |
| |
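| ; Identity-identity 16x32: pmulhrsw with pw_2896x8 applies the rounded |
| ; 1/sqrt(2) rectangular scale, IDTX16 the 2*sqrt(2) identity16 gain |
| ; (via pw_1697x16), and pw_8192 the final >>2 rounding; the eob checks |
| ; walk the live 8x8 coefficient groups. |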
| cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob |
| vpbroadcastd m8, [pw_2896x8] |
| vpbroadcastd m9, [pw_1697x16] |
| vpbroadcastd m11, [pw_8192] |
| vpbroadcastd m7, [pixel_10bpc_max] |
| lea r6, [strideq*5] |
| pxor m6, m6 |
| paddw m10, m11, m11 ; pw_16384 |
| mov r5, dstq |
| call .main |
| sub eobd, 36 |
| jl .ret |
| add cq, 128*8 |
| lea dstq, [r5+16] |
| call .main |
| sub cq, 128*8-32 |
| lea dstq, [r5+strideq*8] |
| mov r5, dstq |
| call .main |
| sub eobd, 107 ; eob < 143 |
| jl .ret |
| add cq, 128*8 |
| lea dstq, [r5+16] |
| call .main |
| sub cq, 128*8-32 |
| lea dstq, [r5+strideq*8] |
| mov r5, dstq |
| call .main |
| sub eobd, 128 ; eob < 271 |
| jl .ret |
| add cq, 128*8 |
| lea dstq, [r5+16] |
| call .main |
| sub cq, 128*8-32 |
| lea dstq, [r5+strideq*8] |
| mov r5, dstq |
| call .main |
| sub eobd, 128 ; eob < 399 |
| jl .ret |
| add cq, 128*8 |
| lea dstq, [r5+16] |
| call .main |
| .ret: |
| RET |
| ALIGN function_align |
| .main: |
| mova m0, [cq+128*0] |
| packssdw m0, [cq+128*1] |
| mova m1, [cq+128*2] |
| packssdw m1, [cq+128*3] |
| mova m2, [cq+128*4] |
| packssdw m2, [cq+128*5] |
| mova m3, [cq+128*6] |
| packssdw m3, [cq+128*7] |
| REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 |
| REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3 |
| REPX {pmulhrsw x, m11}, m0, m1, m2, m3 |
| REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 |
| .main2: |
| punpckhwd m4, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| punpckhwd m3, m0, m4 |
| punpcklwd m0, m4 |
| punpcklwd m4, m2, m1 |
| punpckhwd m2, m1 |
| punpckhqdq m1, m0, m4 |
| punpcklqdq m0, m4 |
| call m(iidentity_8x8_internal_10bpc).write_2x8x2 |
| punpcklqdq m0, m3, m2 |
| punpckhqdq m1, m3, m2 |
| jmp m(iidentity_8x8_internal_10bpc).write_2x8x2 |
| |
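| ; 32x16: a 2:1 rectangular transform, so every pass-1 load is |
| ; pre-multiplied by pd_2896 and the *_rect2 helpers complete the |
| ; rounded 1/sqrt(2) downscale. |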
| cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| lea r6, [rsp+32*4] |
| call .main |
| cmp eobd, 36 |
| jge .full |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] |
| lea r6, [pw_5+128] |
| mov r7, dstq |
| call m(idct_16x16_internal_8bpc).main |
| call .write_16x16 |
| mova m0, [r5+32*3] |
| mova m1, [r5+32*2] |
| mova m2, [r5+32*1] |
| mova m3, [r5+32*0] |
| mova m4, [r5-32*1] |
| mova m5, [r5-32*2] |
| mova m6, [r5-32*3] |
| mova m7, [r5-32*4] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] |
| jmp .end |
| .dconly: |
| imul r6d, [cq], 181 |
| vpbroadcastd m3, [dconly_10bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 16 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| add r6d, 384 |
| sar r6d, 9 |
| jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 |
| .full: |
| add cq, 32 |
| mova [r4+32*3], m0 |
| mova [r4+32*2], m1 |
| mova [r4+32*1], m2 |
| mova [r4+32*0], m3 |
| mova [r4-32*1], m4 |
| mova [r4-32*2], m5 |
| mova [r4-32*3], m6 |
| mova [r4-32*4], m7 |
| call .main |
| sub r4, 32*16 ; topleft 16x8 |
| call .transpose_16x16 |
| lea r6, [pw_5+128] |
| mov r7, dstq |
| call m(idct_16x16_internal_8bpc).main |
| call .write_16x16 |
| mova m0, [r5+32*3] |
| mova m1, [r5+32*2] |
| mova m2, [r5+32*1] |
| mova m3, [r5+32*0] |
| mova m4, [r5-32*1] |
| mova m5, [r5-32*2] |
| mova m6, [r5-32*3] |
| mova m7, [r5-32*4] |
| add r4, 32*8 ; bottomleft 16x8 |
| call .transpose_16x16 |
| .end: |
| lea dstq, [r7+32] |
| call m(idct_16x16_internal_8bpc).main |
| call .write_16x16 |
| RET |
| ALIGN function_align |
| .transpose_16x16: |
| punpckhdq m8, m3, m1 |
| punpckldq m3, m1 |
| punpckhdq m1, m0, m2 |
| punpckldq m0, m2 |
| punpckhdq m2, m7, m5 |
| punpckldq m7, m5 |
| punpckhdq m5, m4, m6 |
| punpckldq m4, m6 |
| punpckhqdq m6, m0, m4 |
| punpcklqdq m0, m4 |
| punpckhqdq m4, m1, m5 |
| punpcklqdq m1, m5 |
| punpckhqdq m5, m7, m3 |
| punpcklqdq m7, m3 |
| punpckhqdq m3, m2, m8 |
| punpcklqdq m2, m8 |
| vinserti128 m8, m0, xm7, 1 |
| vperm2i128 m12, m0, m7, 0x31 |
| vinserti128 m9, m6, xm5, 1 |
| vperm2i128 m13, m6, m5, 0x31 |
| vinserti128 m10, m1, xm2, 1 |
| vperm2i128 m14, m1, m2, 0x31 |
| vinserti128 m11, m4, xm3, 1 |
| vperm2i128 m15, m4, m3, 0x31 |
| mova m0, [r4+32*3] |
| mova m1, [r4+32*2] |
| mova m2, [r4+32*1] |
| mova m3, [r4+32*0] |
| mova m4, [r4-32*1] |
| mova m5, [r4-32*2] |
| mova m6, [r4-32*3] |
| mova m7, [r4-32*4] |
| mova [rsp+gprsize], m15 |
| jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose |
| ALIGN function_align |
| .main: |
| vpbroadcastd m14, [pd_2896] |
| vpbroadcastd m11, [pd_2048] |
| pmulld m0, m14, [cq+64* 1] |
| pmulld m1, m14, [cq+64* 7] |
| pmulld m2, m14, [cq+64* 9] |
| pmulld m3, m14, [cq+64*15] |
| pmulld m4, m14, [cq+64*17] |
| pmulld m5, m14, [cq+64*23] |
| pmulld m6, m14, [cq+64*25] |
| pmulld m7, m14, [cq+64*31] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 |
| pmulld m0, m14, [cq+64* 3] |
| pmulld m1, m14, [cq+64* 5] |
| pmulld m2, m14, [cq+64*11] |
| pmulld m3, m14, [cq+64*13] |
| pmulld m4, m14, [cq+64*19] |
| pmulld m5, m14, [cq+64*21] |
| pmulld m6, m14, [cq+64*27] |
| pmulld m7, m14, [cq+64*29] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 |
| pmulld m0, m14, [cq+64* 2] |
| pmulld m1, m14, [cq+64* 6] |
| pmulld m2, m14, [cq+64*10] |
| pmulld m3, m14, [cq+64*14] |
| pmulld m4, m14, [cq+64*18] |
| pmulld m5, m14, [cq+64*22] |
| pmulld m6, m14, [cq+64*26] |
| pmulld m7, m14, [cq+64*30] |
| call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 |
| pmulld m0, m14, [cq+64* 0] |
| pmulld m1, m14, [cq+64* 4] |
| pmulld m2, m14, [cq+64* 8] |
| pmulld m3, m14, [cq+64*12] |
| pmulld m4, m14, [cq+64*16] |
| pmulld m5, m14, [cq+64*20] |
| pmulld m6, m14, [cq+64*24] |
| pmulld m7, m14, [cq+64*28] |
| call m(idct_8x8_internal_10bpc).main_rect2 |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| pxor m8, m8 |
| mov r7d, 64*30 |
| .main_zero_loop: |
| mova [cq+r7-64*2], m8 |
| mova [cq+r7-64*1], m8 |
| mova [cq+r7+64*0], m8 |
| mova [cq+r7+64*1], m8 |
| sub r7d, 64*4 |
| jg .main_zero_loop |
| .main_end: |
| psrld m11, 11 ; pd_1 |
| IDCT32_END 0, 15, 8, 9, 10, 1 |
| IDCT32_END 1, 14, 8, 9, 10, 1 |
| punpckhwd m8, m0, m1 ; 16 17 |
| punpcklwd m0, m1 ; 0 1 |
| punpcklwd m1, m14, m15 ; 14 15 |
| punpckhwd m14, m15 ; 30 31 |
| mova [r5+32*3], m8 |
| mova [r5+32*2], m14 |
| IDCT32_END 2, 15, 8, 9, 10, 1 |
| IDCT32_END 3, 14, 8, 9, 10, 1 |
| punpckhwd m8, m2, m3 ; 18 19 |
| punpcklwd m2, m3 ; 2 3 |
| punpcklwd m3, m14, m15 ; 12 13 |
| punpckhwd m14, m15 ; 28 29 |
| mova [r5+32*1], m8 |
| mova [r5+32*0], m14 |
| IDCT32_END 4, 15, 8, 9, 10, 1 |
| IDCT32_END 5, 14, 8, 9, 10, 1 |
| punpckhwd m8, m4, m5 ; 20 21 |
| punpcklwd m4, m5 ; 4 5 |
| punpcklwd m5, m14, m15 ; 10 11 |
| punpckhwd m14, m15 ; 26 27 |
| mova [r5-32*1], m8 |
| mova [r5-32*2], m14 |
| IDCT32_END 6, 15, 8, 9, 10, 1 |
| IDCT32_END 7, 14, 8, 9, 10, 1 |
| punpckhwd m8, m6, m7 ; 22 23 |
| punpcklwd m6, m7 ; 6 7 |
| punpcklwd m7, m14, m15 ; 8 9 |
| punpckhwd m14, m15 ; 24 25 |
| mova [r5-32*3], m8 |
| mova [r5-32*4], m14 |
| ret |
| ALIGN function_align |
| .write_16x16: |
| mova m1, [rsp+gprsize+32*1] |
| mova [rsp+gprsize+32*0], m8 |
| mova [rsp+gprsize+32*1], m9 |
| mova [rsp+gprsize+32*2], m12 |
| vpbroadcastd m12, [pw_2048] |
| vpbroadcastd m9, [pixel_10bpc_max] |
| lea r3, [strideq*3] |
| pxor m8, m8 |
| pmulhrsw m0, m12 |
| pmulhrsw m1, m12 |
| pmulhrsw m2, m12 |
| pmulhrsw m3, m12 |
| call m(idct_16x8_internal_10bpc).write_16x4 |
| pmulhrsw m0, m12, m4 |
| pmulhrsw m1, m12, m5 |
| pmulhrsw m2, m12, m6 |
| pmulhrsw m3, m12, m7 |
| call m(idct_16x8_internal_10bpc).write_16x4 |
| pmulhrsw m0, m12, [rsp+gprsize+32*0] |
| pmulhrsw m1, m12, [rsp+gprsize+32*1] |
| pmulhrsw m2, m12, m10 |
| pmulhrsw m3, m12, m11 |
| call m(idct_16x8_internal_10bpc).write_16x4 |
| pmulhrsw m0, m12, [rsp+gprsize+32*2] |
| pmulhrsw m1, m12, m13 |
| pmulhrsw m2, m12, m14 |
| pmulhrsw m3, m12, m15 |
| jmp m(idct_16x8_internal_10bpc).write_16x4 |
| |
| cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob |
| vpbroadcastd m8, [pw_2896x8] |
| vpbroadcastd m9, [pw_1697x16] |
| vpbroadcastd m10, [pw_2048] |
| vpbroadcastd m7, [pixel_10bpc_max] |
| lea r6, [strideq*5] |
| pxor m6, m6 |
| mov r5, dstq |
| call .main |
| sub eobd, 36 |
| jl .ret |
| add cq, 32 |
| lea dstq, [dstq+strideq*4] |
| call .main |
| add cq, 64*8-32 |
| lea dstq, [r5+16*1] |
| call .main |
| sub eobd, 107 ; eob < 143 |
| jl .ret |
| add cq, 32 |
| lea dstq, [dstq+strideq*4] |
| call .main |
| add cq, 64*8-32 |
| lea dstq, [r5+16*2] |
| call .main |
| sub eobd, 128 ; eob < 271 |
| jl .ret |
| add cq, 32 |
| lea dstq, [dstq+strideq*4] |
| call .main |
| add cq, 64*8-32 |
| lea dstq, [r5+16*3] |
| call .main |
| sub eobd, 128 ; eob < 399 |
| jl .ret |
| add cq, 32 |
| lea dstq, [dstq+strideq*4] |
| call .main |
| .ret: |
| RET |
| ALIGN function_align |
| .main: |
| mova m0, [cq+64*0] |
| packssdw m0, [cq+64*1] |
| mova m1, [cq+64*2] |
| packssdw m1, [cq+64*3] |
| mova m2, [cq+64*4] |
| packssdw m2, [cq+64*5] |
| mova m3, [cq+64*6] |
| packssdw m3, [cq+64*7] |
| REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 |
| REPX {paddsw x, x }, m0, m1, m2, m3 |
| REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3 |
| REPX {pmulhrsw x, m10}, m0, m1, m2, m3 |
| REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 |
| jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 |
| |
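| ; 32x32: pass 1 transforms up to four 8-row slices of coefficients, |
| ; gated on eob (36, 136, 300); .fast_loop zero-fills the part of the |
| ; pass-1 buffer belonging to slices that are known to be all zero. |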
| cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| lea r6, [rsp+32*7] |
| call .main |
| cmp eobd, 36 |
| jl .fast |
| call .main |
| cmp eobd, 136 |
| jl .fast |
| call .main |
| cmp eobd, 300 |
| jl .fast |
| call .main |
| jmp .pass2 |
| .dconly: |
| imul r6d, [cq], 181 |
| vpbroadcastd m3, [dconly_10bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 32 |
| jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly |
| .fast: |
| lea r4, [rsp+32*71] |
| pxor m0, m0 |
| .fast_loop: |
| REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 |
| add r6, 32*8 |
| cmp r6, r4 |
| jl .fast_loop |
| .pass2: |
| lea r3, [rsp+32*3] |
| mov r4, r6 |
| lea r5, [r6+32*8] |
| lea r6, [pw_5+128] |
| call .pass2_oddhalf |
| call .pass2_evenhalf |
| imul r2, strideq, 19 |
| lea r3, [strideq*3] |
| add r2, dstq |
| call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end |
| sub dstq, r3 |
| lea r2, [r2+r3+32] |
| add dstq, 32 |
| lea r3, [rsp+32*11] |
| call .pass2_oddhalf |
| call .pass2_evenhalf |
| lea r3, [strideq*3] |
| call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end |
| RET |
| ALIGN function_align |
| .main: |
| mova m0, [cq+128* 1] |
| mova m1, [cq+128* 7] |
| mova m2, [cq+128* 9] |
| mova m3, [cq+128*15] |
| mova m4, [cq+128*17] |
| mova m5, [cq+128*23] |
| mova m6, [cq+128*25] |
| mova m7, [cq+128*31] |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m14, [pd_2896] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 |
| mova m0, [cq+128* 3] |
| mova m1, [cq+128* 5] |
| mova m2, [cq+128*11] |
| mova m3, [cq+128*13] |
| mova m4, [cq+128*19] |
| mova m5, [cq+128*21] |
| mova m6, [cq+128*27] |
| mova m7, [cq+128*29] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 |
| mova m0, [cq+128* 2] |
| mova m1, [cq+128* 6] |
| mova m2, [cq+128*10] |
| mova m3, [cq+128*14] |
| mova m4, [cq+128*18] |
| mova m5, [cq+128*22] |
| mova m6, [cq+128*26] |
| mova m7, [cq+128*30] |
| call m(idct_8x16_internal_10bpc).main_oddhalf |
| mova m0, [cq+128* 0] |
| mova m1, [cq+128* 4] |
| mova m2, [cq+128* 8] |
| mova m3, [cq+128*12] |
| mova m4, [cq+128*16] |
| mova m5, [cq+128*20] |
| mova m6, [cq+128*24] |
| mova m7, [cq+128*28] |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end |
| pxor m15, m15 |
| mov r7d, 128*29 |
| .main_zero_loop: |
| mova [cq+r7-128*1], m15 |
| mova [cq+r7+128*0], m15 |
| mova [cq+r7+128*1], m15 |
| mova [cq+r7+128*2], m15 |
| sub r7d, 128*4 |
| jg .main_zero_loop |
| add cq, 32 |
| mova [r4-32*4], m0 |
| mova [r4-32*3], m1 |
| mova [r4-32*2], m2 |
| mova [r4-32*1], m3 |
| mova [r4+32*0], m4 |
| mova [r4+32*1], m5 |
| mova [r4+32*2], m6 |
| mova [r4+32*3], m7 |
| mova m0, [r5+32*3] |
| mova m1, [r5+32*2] |
| mova m2, [r5+32*1] |
| mova m3, [r5+32*0] |
| mova m4, [r5-32*1] |
| mova m5, [r5-32*2] |
| mova m6, [r5-32*3] |
| mova m7, [r5-32*4] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose |
| mova [r5-32*4], m0 |
| mova [r5-32*3], m1 |
| mova [r5-32*2], m2 |
| mova [r5-32*1], m3 |
| mova [r5+32*0], m4 |
| mova [r5+32*1], m5 |
| mova [r5+32*2], m6 |
| mova [r5+32*3], m7 |
| ret |
| ALIGN function_align |
| .pass2_oddhalf: |
| mova m0, [r3+32* 1] ; 1 |
| mova m1, [r3+32* 3] ; 3 |
| mova m2, [r3+32* 5] ; 5 |
| mova m3, [r3+32* 7] ; 7 |
| mova m4, [r3+32*17] ; 9 |
| mova m5, [r3+32*19] ; 11 |
| mova m6, [r3+32*21] ; 13 |
| mova m7, [r3+32*23] ; 15 |
| mova m8, [r3+32*33] ; 17 |
| mova m9, [r3+32*35] ; 19 |
| mova m10, [r3+32*37] ; 21 |
| mova m11, [r3+32*39] ; 23 |
| mova m12, [r3+32*49] ; 25 |
| mova m13, [r3+32*51] ; 27 |
| mova m14, [r3+32*53] ; 29 |
| mova m15, [r3+32*55] ; 31 |
| jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf |
| ALIGN function_align |
| .pass2_evenhalf: |
| mova m0, [r3+32* 0] ; 0 |
| mova m1, [r3+32* 2] ; 2 |
| mova m2, [r3+32* 4] ; 4 |
| mova m3, [r3+32* 6] ; 6 |
| mova m4, [r3+32*16] ; 8 |
| mova m5, [r3+32*18] ; 10 |
| mova m6, [r3+32*20] ; 12 |
| mova m7, [r3+32*22] ; 14 |
| mova m8, [r3+32*32] ; 16 |
| mova m9, [r3+32*34] ; 18 |
| mova m10, [r3+32*36] ; 20 |
| mova m11, [r3+32*38] ; 22 |
| mova m12, [r3+32*48] ; 24 |
| mova m13, [r3+32*50] ; 26 |
| mova m14, [r3+32*52] ; 28 |
| mova m15, [r3+32*54] ; 30 |
| mova [rsp+gprsize], m15 |
| jmp m(idct_16x16_internal_8bpc).main |
| |
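| ; Identity-identity 32x32: processed as 8x8 coefficient blocks whose |
| ; dst tiles are stepped through in scan order by .main/.main2; the eob |
| ; thresholds (36, 136, 300, 535, 755, 911) decide how many blocks are |
| ; live. |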
| cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m5, [pw_8192] |
| vpbroadcastd m7, [pixel_10bpc_max] |
| pxor m6, m6 |
| lea r6, [strideq*3] |
| lea r5, [strideq*5] |
| lea r4, [strideq+r6*2] ; strideq*7 |
| call .main ; 0 |
| cmp eobd, 36 |
| jl .ret |
| add cq, 128*8 ; 0 1 |
| mov r7, dstq ; 1 |
| add dstq, 16 |
| call .main |
| call .main2 |
| cmp eobd, 136 |
| jl .ret |
| add cq, 128*16-32 ; 0 1 2 |
| lea dstq, [r7+16*2] ; 1 2 |
| call .main ; 2 |
| call .main2 |
| call .main2 |
| cmp eobd, 300 |
| jl .ret |
| add cq, 128*24-64 ; 0 1 2 3 |
| add r7, 16*3 ; 1 2 3 |
| mov dstq, r7 ; 2 3 |
| call .main ; 3 |
| call .main2 |
| call .main2 |
| call .main2 |
| cmp eobd, 535 |
| jl .ret |
| add cq, 128*24-64 ; 0 1 2 3 |
| lea dstq, [r7+strideq*8] ; 1 2 3 4 |
| mov r7, dstq ; 2 3 4 |
| call .main ; 3 4 |
| call .main2 |
| call .main2 |
| cmp eobd, 755 |
| jl .ret |
| add cq, 128*16-32 ; 0 1 2 3 |
| lea dstq, [r7+strideq*8] ; 1 2 3 4 |
| call .main ; 2 3 4 5 |
| call .main2 ; 3 4 5 |
| cmp eobd, 911 |
| jl .ret |
| add cq, 128*8 ; 0 1 2 3 |
| add dstq, 16 ; 1 2 3 4 |
| call .main ; 2 3 4 5 |
| .ret: ; 3 4 5 6 |
| RET |
| ALIGN function_align |
| .main2: |
| sub cq, 128*8-32 |
| lea dstq, [dstq+strideq*8-16] |
| .main: |
| mova m0, [cq+128*0] |
| packssdw m0, [cq+128*1] |
| mova m1, [cq+128*2] |
| packssdw m1, [cq+128*3] |
| mova m2, [cq+128*4] |
| packssdw m2, [cq+128*5] |
| mova m3, [cq+128*6] |
| packssdw m3, [cq+128*7] |
| REPX {pmulhrsw x, m5}, m0, m1, m2, m3 |
| jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero |
| |
| %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) |
| %if %1 & 1 |
| mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n |
| mova m%4, [r4-32*(14+%1)] ; idct32 out31-n |
| %else |
| mova m%5, [r4-32*(45-%1)] |
| mova m%4, [r5-32*(20+%1)] |
| %endif |
| paddsw m%6, m%5, m%4 ; idct32 out 0+n |
| psubsw m%5, m%4 ; idct32 out31-n |
| paddsw m%4, m%5, m%3 ; out31-n |
| psubsw m%5, m%3 ; out32+n |
| paddsw m%3, m%6, m%2 ; out 0+n |
| psubsw m%6, m%2 ; out63-n |
| REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3 |
| %if %1 & 1 |
| %define %%d0 r2 |
| %define %%d1 dstq |
| %else |
| %define %%d0 dstq |
| %define %%d1 r2 |
| %endif |
| paddw m%3, [%%d0+%7 ] |
| paddw m%4, [%%d1+%8 ] |
| paddw m%5, [%%d0+%9 ] |
| paddw m%6, [%%d1+%10] |
| pxor m%2, m%2 |
| REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 |
| vpbroadcastd m%2, [pixel_10bpc_max] |
| REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6 |
| mova [%%d0+%7 ], m%3 |
| mova [%%d1+%8 ], m%4 |
| mova [%%d0+%9 ], m%5 |
| mova [%%d1+%10], m%6 |
| %endmacro |
| |
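| ; 16x64: the 64-point columns are built from an idct16 (inputs 0 mod 4), |
| ; an idct32 odd half (inputs 2 mod 4) and the idct64 odd part computed |
| ; in two main_part1 calls plus main_part2; IDCT64_PART2_END above then |
| ; merges the mirrored output rows while adding into dst. |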
| cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| vpbroadcastd m14, [pd_2896] |
| lea r6, [rsp+32*6] |
| call .main |
| sub eobd, 44 |
| jl .fast |
| call .main |
| sub eobd, 107 |
| jl .fast |
| call .main |
| sub eobd, 128 |
| jl .fast |
| call .main |
| jmp .pass2 |
| .dconly: |
| imul r6d, [cq], 181 |
| vpbroadcastd m3, [dconly_10bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 64 |
| add r6d, 640 |
| sar r6d, 10 |
| jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 |
| .fast: |
| lea r4, [rsp+32*38] |
| pxor m0, m0 |
| .fast_loop: |
| REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 |
| add r6, 32*8 |
| cmp r6, r4 |
| jl .fast_loop |
| .pass2: |
| lea r6, [pw_5+128] |
| mova m0, [rsp+32* 2] ; in0 |
| mova m1, [rsp+32* 6] ; in4 |
| mova m2, [rsp+32*10] ; in8 |
| mova m3, [rsp+32*14] ; in12 |
| mova m4, [rsp+32*18] ; in16 |
| mova m5, [rsp+32*22] ; in20 |
| mova m6, [rsp+32*26] ; in24 |
| mova m7, [rsp+32*30] ; in28 |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| call m(idct_16x16_internal_8bpc).main |
| mova m1, [rsp+32*1] |
| lea r4, [rsp+32*38] |
| mova [r4-32*4], m0 |
| mova [r4-32*3], m1 |
| mova [r4-32*2], m2 |
| mova [r4-32*1], m3 |
| mova [r4+32*0], m4 |
| mova [r4+32*1], m5 |
| mova [r4+32*2], m6 |
| mova [r4+32*3], m7 |
| add r4, 32*8 |
| mova [r4-32*4], m8 |
| mova [r4-32*3], m9 |
| mova [r4-32*2], m10 |
| mova [r4-32*1], m11 |
| mova [r4+32*0], m12 |
| mova [r4+32*1], m13 |
| mova [r4+32*2], m14 |
| mova [r4+32*3], m15 |
| mova m0, [rsp+32* 4] ; in2 |
| mova m1, [rsp+32* 8] ; in6 |
| mova m2, [rsp+32*12] ; in10 |
| mova m3, [rsp+32*16] ; in14 |
| mova m4, [rsp+32*20] ; in18 |
| mova m5, [rsp+32*24] ; in22 |
| mova m6, [rsp+32*28] ; in26 |
| mova m7, [rsp+32*32] ; in30 |
| lea r5, [r4+32*16] |
| add r4, 32*8 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| mova m0, [rsp+32* 3] ; in1 |
| mova m1, [rsp+32*33] ; in31 |
| mova m2, [rsp+32*19] ; in17 |
| mova m3, [rsp+32*17] ; in15 |
| mova m4, [rsp+32*11] ; in9 |
| mova m5, [rsp+32*25] ; in23 |
| mova m6, [rsp+32*27] ; in25 |
| mova m7, [rsp+32* 9] ; in7 |
| lea r6, [idct64_mul - 8] |
| add r4, 32*16 |
| add r5, 32*32 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 |
| mova m0, [rsp+32* 7] ; in5 |
| mova m1, [rsp+32*29] ; in27 |
| mova m2, [rsp+32*23] ; in21 |
| mova m3, [rsp+32*13] ; in11 |
| mova m4, [rsp+32*15] ; in13 |
| mova m5, [rsp+32*21] ; in19 |
| mova m6, [rsp+32*31] ; in29 |
| mova m7, [rsp+32* 5] ; in3 |
| add r6, 8 |
| add r4, 32*8 |
| sub r5, 32*8 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 |
| lea r8, [strideq*4] |
| lea r9, [strideq*5] |
| lea r3, [r9+strideq*1] ; stride*6 |
| lea r7, [r9+strideq*2] ; stride*7 |
| call .main_part2_pass2 |
| RET |
| ALIGN function_align |
| .main: |
| mova m0, [cq+128* 1] |
| mova m1, [cq+128* 3] |
| mova m2, [cq+128* 5] |
| mova m3, [cq+128* 7] |
| mova m4, [cq+128* 9] |
| mova m5, [cq+128*11] |
| mova m6, [cq+128*13] |
| mova m7, [cq+128*15] |
| call m(idct_8x16_internal_10bpc).main_oddhalf |
| mova m0, [cq+128* 0] |
| mova m1, [cq+128* 2] |
| mova m2, [cq+128* 4] |
| mova m3, [cq+128* 6] |
| mova m4, [cq+128* 8] |
| mova m5, [cq+128*10] |
| mova m6, [cq+128*12] |
| mova m7, [cq+128*14] |
| call m(idct_8x8_internal_10bpc).main |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| pxor m15, m15 |
| mov r7d, 128*13 |
| .main_zero_loop: |
| mova [cq+r7-128*1], m15 |
| mova [cq+r7+128*0], m15 |
| mova [cq+r7+128*1], m15 |
| mova [cq+r7+128*2], m15 |
| sub r7d, 128*4 |
| jg .main_zero_loop |
| add cq, 32 |
| psrld m15, m11, 10 ; pd_2 |
| mova m8, [r6-32*4] |
| mova m9, [r6+32*3] |
| REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 |
| psubd m10, m0, m8 ; out15 |
| paddd m0, m8 ; out0 |
| mova m8, [r6-32*3] |
| psubd m15, m7, m9 ; out8 |
| paddd m7, m9 ; out7 |
| mova m9, [r6+32*2] |
| REPX {psrad x, 2}, m0, m15, m10, m7 |
| packssdw m0, m15 |
| packssdw m7, m10 |
| psubd m10, m1, m8 ; out14 |
| paddd m1, m8 ; out1 |
| mova m8, [r6-32*2] |
| psubd m15, m6, m9 ; out9 |
| paddd m6, m9 ; out6 |
| mova m9, [r6+32*1] |
| REPX {psrad x, 2}, m1, m15, m10, m6 |
| packssdw m1, m15 |
| packssdw m6, m10 |
| psubd m10, m2, m8 ; out13 |
| paddd m2, m8 ; out2 |
| mova m8, [r6-32*1] |
| psubd m15, m5, m9 ; out10 |
| paddd m5, m9 ; out5 |
| mova m9, [r6+32*0] |
| REPX {psrad x, 2}, m2, m15, m10, m5 |
| packssdw m2, m15 |
| packssdw m5, m10 |
| psubd m10, m3, m8 ; out12 |
| paddd m3, m8 ; out3 |
| psubd m15, m4, m9 ; out11 |
| paddd m4, m9 ; out4 |
| REPX {psrad x, 2}, m3, m15, m10, m4 |
| packssdw m3, m15 |
| packssdw m4, m10 |
| call m(idct_16x8_internal_10bpc).transpose3 |
| mova [r6-32*4], m0 |
| mova [r6-32*3], m1 |
| mova [r6-32*2], m2 |
| mova [r6-32*1], m3 |
| mova [r6+32*0], m4 |
| mova [r6+32*1], m5 |
| mova [r6+32*2], m6 |
| mova [r6+32*3], m7 |
| add r6, 32*8 |
| ret |
| .main_part2_pass2: |
| vpbroadcastd m11, [pw_1567_3784] |
| vpbroadcastd m12, [pw_m3784_1567] |
| vpbroadcastd m13, [pw_2896_2896] |
| lea r6, [pw_5+128] |
| lea r2, [dstq+r7] |
| .main_part2_pass2_loop: |
| vpbroadcastd m14, [pw_m2896_2896] |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal |
| vpbroadcastd m14, [pw_2048] |
| IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 |
| IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 |
| IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 |
| IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 |
| add dstq, strideq |
| sub r2, strideq |
| cmp r4, r5 |
| jne .main_part2_pass2_loop |
| ret |
| ALIGN function_align |
| .main_part1_rect2: |
| REPX {paddd x, m11}, m0, m1, m2, m3 |
| REPX {psrad x, 12 }, m0, m1, m2, m3 |
| .main_part1: ; idct64 steps 1-5 |
| ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a |
| ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a |
| ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a |
| ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a |
| vpbroadcastd m7, [r5+4*0] |
| vpbroadcastd m8, [r5+4*1] |
| vpbroadcastd m6, [r5+4*2] |
| vpbroadcastd m9, [r5+4*3] |
| vpbroadcastd m5, [r5+4*4] |
| vpbroadcastd m10, [r5+4*5] |
| vpbroadcastd m4, [r5+4*6] |
| vpbroadcastd m15, [r5+4*7] |
| pmulld m7, m0 ; t63a |
| pmulld m0, m8 ; t32a |
| pmulld m6, m1 ; t62a |
| pmulld m1, m9 ; t33a |
| pmulld m5, m2 ; t61a |
| pmulld m2, m10 ; t34a |
| pmulld m4, m3 ; t60a |
| pmulld m3, m15 ; t35a |
| vpbroadcastd m10, [r5+4*8] |
| vpbroadcastd m15, [r5+4*9] |
| REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 |
| REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 |
| psubd m8, m0, m1 ; t33 |
| paddd m0, m1 ; t32 |
| psubd m1, m7, m6 ; t62 |
| paddd m7, m6 ; t63 |
| psubd m6, m3, m2 ; t34 |
| paddd m3, m2 ; t35 |
| psubd m2, m4, m5 ; t61 |
| paddd m4, m5 ; t60 |
| REPX {pmaxsd x, m12}, m8, m1, m6, m2 |
| REPX {pminsd x, m13}, m8, m1, m6, m2 |
| ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a |
| ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a |
| REPX {pmaxsd x, m12}, m0, m3, m7, m4 |
| REPX {pminsd x, m13}, m0, m3, m7, m4 |
| vpbroadcastd m10, [r5+4*10] |
| vpbroadcastd m15, [r5+4*11] |
| psubd m5, m0, m3 ; t35a |
| paddd m0, m3 ; t32a |
| psubd m3, m7, m4 ; t60a |
| paddd m7, m4 ; t63a |
| psubd m4, m1, m6 ; t34 |
| paddd m1, m6 ; t33 |
| psubd m6, m8, m2 ; t61 |
| paddd m8, m2 ; t62 |
| REPX {pmaxsd x, m12}, m5, m3, m4, m6 |
| REPX {pminsd x, m13}, m5, m3, m4, m6 |
| ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 |
| ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a |
| REPX {pmaxsd x, m12}, m0, m7, m1, m8 |
| REPX {pminsd x, m13}, m0, m7, m1, m8 |
| add r5, 4*12 |
| mova [r6-32*4], m0 |
| mova [r6+32*3], m7 |
| mova [r6-32*3], m1 |
| mova [r6+32*2], m8 |
| mova [r6-32*2], m6 |
| mova [r6+32*1], m4 |
| mova [r6-32*1], m3 |
| mova [r6+32*0], m5 |
| add r6, 32*8 |
| ret |
| .main_part2: ; idct64 steps 6-9 |
| lea r5, [r6+32*3] |
| sub r6, 32*4 |
| vpbroadcastd m10, [pd_1567] |
| vpbroadcastd m15, [pd_3784] |
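| ; each iteration combines one mirrored group of t32..t63 terms: a plain |
| ; butterfly, a 1567/3784 rotation for the t39/t56 and t40/t55 pairs, and |
| ; a pd_2896 (1/sqrt(2)) scale for the t47/t48/t40a/t55a outputs; r6 |
| ; walks forward and r5 backward until they meet. |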
| .main_part2_loop: |
| mova m0, [r6-32*32] ; t32a |
| mova m1, [r5-32*24] ; t39a |
| mova m2, [r5-32*32] ; t63a |
| mova m3, [r6-32*24] ; t56a |
| mova m4, [r6-32*16] ; t40a |
| mova m5, [r5-32* 8] ; t47a |
| mova m6, [r5-32*16] ; t55a |
| mova m7, [r6-32* 8] ; t48a |
| psubd m8, m0, m1 ; t39 |
| paddd m0, m1 ; t32 |
| psubd m1, m2, m3 ; t56 |
| paddd m2, m3 ; t63 |
| psubd m3, m5, m4 ; t40 |
| paddd m5, m4 ; t47 |
| psubd m4, m7, m6 ; t55 |
| paddd m7, m6 ; t48 |
| REPX {pmaxsd x, m12}, m8, m1, m3, m4 |
| REPX {pminsd x, m13}, m8, m1, m3, m4 |
| ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a |
| ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a |
| REPX {pmaxsd x, m12}, m0, m2, m5, m7 |
| REPX {pminsd x, m13}, m0, m5, m2, m7 |
| psubd m6, m2, m7 ; t48a |
| paddd m2, m7 ; t63a |
| psubd m7, m0, m5 ; t47a |
| paddd m0, m5 ; t32a |
| psubd m5, m8, m4 ; t55 |
| paddd m8, m4 ; t56 |
| psubd m4, m1, m3 ; t40 |
| paddd m1, m3 ; t39 |
| REPX {pmaxsd x, m12}, m6, m7, m5, m4 |
| REPX {pminsd x, m13}, m6, m7, m5, m4 |
| REPX {pmulld x, m14}, m6, m7, m5, m4 |
| REPX {pmaxsd x, m12}, m2, m0, m8, m1 |
| REPX {pminsd x, m13}, m2, m0, m8, m1 |
| paddd m6, m11 |
| paddd m5, m11 |
| psubd m3, m6, m7 ; t47 |
| paddd m6, m7 ; t48 |
| psubd m7, m5, m4 ; t40a |
| paddd m5, m4 ; t55a |
| REPX {psrad x, 12}, m3, m6, m7, m5 |
| mova [r5-32* 8], m2 |
| mova [r6-32*32], m0 |
| mova [r6-32* 8], m8 |
| mova [r5-32*32], m1 |
| mova [r5-32*24], m3 |
| mova [r6-32*16], m6 |
| mova [r6-32*24], m7 |
| mova [r5-32*16], m5 |
| add r6, 32 |
| sub r5, 32 |
| cmp r6, r5 |
| jl .main_part2_loop |
| ret |
| |
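| ; 32x64: pass 1 is the rect2-scaled 32-point transform; pass 2 reuses |
| ; the 16x64 column machinery (main_part2_pass2), looping over the two |
| ; 16-column halves of the block. |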
| cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| lea r6, [rsp+32*6] |
| call .main |
| cmp eobd, 36 |
| jl .fast |
| call .main |
| cmp eobd, 136 |
| jl .fast |
| call .main |
| cmp eobd, 300 |
| jl .fast |
| call .main |
| jmp .pass2 |
| .dconly: |
| imul r6d, [cq], 181 |
| vpbroadcastd m3, [dconly_10bpc] |
| mov [cq], eobd ; 0 |
| or r3d, 64 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| add r6d, 384 |
| sar r6d, 9 |
| jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 |
| .fast: |
| lea r4, [rsp+32*70] |
| pxor m0, m0 |
| .fast_loop: |
| REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 |
| add r6, 32*8 |
| cmp r6, r4 |
| jl .fast_loop |
| .pass2: |
| lea r6, [pw_5+128] |
| mov r10, rsp |
| lea r8, [strideq*4] |
| lea r9, [strideq*5] |
| lea r3, [r9+strideq*1] ; stride*6 |
| lea r7, [r9+strideq*2] ; stride*7 |
| .pass2_loop: |
| mova m0, [r10+32* 2] ; in0 |
| mova m1, [r10+32* 6] ; in4 |
| mova m2, [r10+32*18] ; in8 |
| mova m3, [r10+32*22] ; in12 |
| mova m4, [r10+32*34] ; in16 |
| mova m5, [r10+32*38] ; in20 |
| mova m6, [r10+32*50] ; in24 |
| mova m7, [r10+32*54] ; in28 |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| call m(idct_16x16_internal_8bpc).main |
| mova m1, [rsp+32*1] |
| lea r4, [rsp+32*70] |
| mova [r4-32*4], m0 |
| mova [r4-32*3], m1 |
| mova [r4-32*2], m2 |
| mova [r4-32*1], m3 |
| mova [r4+32*0], m4 |
| mova [r4+32*1], m5 |
| mova [r4+32*2], m6 |
| mova [r4+32*3], m7 |
| add r4, 32*8 |
| mova [r4-32*4], m8 |
| mova [r4-32*3], m9 |
| mova [r4-32*2], m10 |
| mova [r4-32*1], m11 |
| mova [r4+32*0], m12 |
| mova [r4+32*1], m13 |
| mova [r4+32*2], m14 |
| mova [r4+32*3], m15 |
| mova m0, [r10+32* 4] ; in2 |
| mova m1, [r10+32* 8] ; in6 |
| mova m2, [r10+32*20] ; in10 |
| mova m3, [r10+32*24] ; in14 |
| mova m4, [r10+32*36] ; in18 |
| mova m5, [r10+32*40] ; in22 |
| mova m6, [r10+32*52] ; in26 |
| mova m7, [r10+32*56] ; in30 |
| lea r5, [r4+32*16] |
| add r4, 32*8 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| mova m0, [r10+32* 3] ; in1 |
| mova m1, [r10+32*57] ; in31 |
| mova m2, [r10+32*35] ; in17 |
| mova m3, [r10+32*25] ; in15 |
| mova m4, [r10+32*19] ; in9 |
| mova m5, [r10+32*41] ; in23 |
| mova m6, [r10+32*51] ; in25 |
| mova m7, [r10+32* 9] ; in7 |
| lea r6, [idct64_mul - 8] |
| add r4, 32*16 |
| add r5, 32*32 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 |
| mova m0, [r10+32* 7] ; in5 |
| mova m1, [r10+32*53] ; in27 |
| mova m2, [r10+32*39] ; in21 |
| mova m3, [r10+32*21] ; in11 |
| mova m4, [r10+32*23] ; in13 |
| mova m5, [r10+32*37] ; in19 |
| mova m6, [r10+32*55] ; in29 |
| mova m7, [r10+32* 5] ; in3 |
| add r6, 8 |
| add r4, 32*8 |
| sub r5, 32*8 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 |
| add r10, 32*8 |
| sub r4, 32*98 ; rsp+32*16 |
| sub dstq, r8 |
| add dstq, 32 |
| cmp r10, r4 |
| jl .pass2_loop |
| RET |
| ALIGN function_align |
| .main: |
| vpbroadcastd m14, [pd_2896] |
| vpbroadcastd m11, [pd_2048] |
| pmulld m0, m14, [cq+128* 1] |
| pmulld m1, m14, [cq+128* 7] |
| pmulld m2, m14, [cq+128* 9] |
| pmulld m3, m14, [cq+128*15] |
| pmulld m4, m14, [cq+128*17] |
| pmulld m5, m14, [cq+128*23] |
| pmulld m6, m14, [cq+128*25] |
| pmulld m7, m14, [cq+128*31] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 |
| pmulld m0, m14, [cq+128* 3] |
| pmulld m1, m14, [cq+128* 5] |
| pmulld m2, m14, [cq+128*11] |
| pmulld m3, m14, [cq+128*13] |
| pmulld m4, m14, [cq+128*19] |
| pmulld m5, m14, [cq+128*21] |
| pmulld m6, m14, [cq+128*27] |
| pmulld m7, m14, [cq+128*29] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 |
| pmulld m0, m14, [cq+128* 2] |
| pmulld m1, m14, [cq+128* 6] |
| pmulld m2, m14, [cq+128*10] |
| pmulld m3, m14, [cq+128*14] |
| pmulld m4, m14, [cq+128*18] |
| pmulld m5, m14, [cq+128*22] |
| pmulld m6, m14, [cq+128*26] |
| pmulld m7, m14, [cq+128*30] |
| call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 |
| pmulld m0, m14, [cq+128* 0] |
| pmulld m1, m14, [cq+128* 4] |
| pmulld m2, m14, [cq+128* 8] |
| pmulld m3, m14, [cq+128*12] |
| pmulld m4, m14, [cq+128*16] |
| pmulld m5, m14, [cq+128*20] |
| pmulld m6, m14, [cq+128*24] |
| pmulld m7, m14, [cq+128*28] |
| pxor m15, m15 |
| mov r7d, 128*29 |
| .main_zero_loop: |
| mova [cq+r7-128*1], m15 |
| mova [cq+r7+128*0], m15 |
| mova [cq+r7+128*1], m15 |
| mova [cq+r7+128*2], m15 |
| sub r7d, 128*4 |
| jg .main_zero_loop |
| add cq, 32 |
| call m(idct_8x8_internal_10bpc).main_rect2 |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose |
| mova [r4-32*4], m0 |
| mova [r4-32*3], m1 |
| mova [r4-32*2], m2 |
| mova [r4-32*1], m3 |
| mova [r4+32*0], m4 |
| mova [r4+32*1], m5 |
| mova [r4+32*2], m6 |
| mova [r4+32*3], m7 |
| mova m0, [r5+32*3] |
| mova m1, [r5+32*2] |
| mova m2, [r5+32*1] |
| mova m3, [r5+32*0] |
| mova m4, [r5-32*1] |
| mova m5, [r5-32*2] |
| mova m6, [r5-32*3] |
| mova m7, [r5-32*4] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose |
| mova [r5-32*4], m0 |
| mova [r5-32*3], m1 |
| mova [r5-32*2], m2 |
| mova [r5-32*1], m3 |
| mova [r5+32*0], m4 |
| mova [r5+32*1], m5 |
| mova [r5+32*2], m6 |
| mova [r5+32*3], m7 |
| ret |
| |
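; 64x16 inverse DCT. AV1 codes at most the lowest 32 coefficients of a
; 64-point dimension, so the first pass reads a 32x16 coefficient block
; in two 8-dword strips.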
| cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jnz .normal |
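; DC-only shortcut: scale the single coefficient through both passes and
; add the broadcast result to all pixels. .dconly/.dconly2 are shared
; with the 64x32 and 64x64 functions below.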
imul r6d, [cq], 181
mov [cq], eobd ; zero the DC coefficient (eobd == 0)
or r3d, 16 ; eobd is 0, so this sets the output row count to 16
| .dconly: |
| add r6d, 640 |
| sar r6d, 10 |
| .dconly2: |
| vpbroadcastd m5, [dconly_10bpc] |
| imul r6d, 181 |
| add r6d, 2176 |
| sar r6d, 12 |
| movd xm0, r6d |
| paddsw xm0, xm5 |
| vpbroadcastw m0, xm0 |
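; m0 = dc + 0x7c00: the bias makes paddsw saturate anything above the
; 10-bit maximum, and the psubusw below clamps negative results to zero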
| .dconly_loop: |
| paddsw m1, m0, [dstq+32*0] |
| paddsw m2, m0, [dstq+32*1] |
| paddsw m3, m0, [dstq+32*2] |
| paddsw m4, m0, [dstq+32*3] |
| REPX {psubusw x, m5}, m1, m2, m3, m4 |
| mova [dstq+32*0], m1 |
| mova [dstq+32*1], m2 |
| mova [dstq+32*2], m3 |
| mova [dstq+32*3], m4 |
| add dstq, strideq |
| dec r3d |
| jg .dconly_loop |
| RET |
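; Full path: the first pass runs over one or both 8-dword coefficient
; strips (for eob < 36 the second strip is all zero and is replaced by
; zero-fill); the second pass emits four 16x16 tiles via write_16x16.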
| .normal: |
| PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| vpbroadcastd m14, [pd_2896] |
| lea r6, [rsp+32*4] |
| call .main |
| call .shift_transpose |
| cmp eobd, 36 |
| jl .fast |
| call .main |
| call .shift_transpose |
| jmp .pass2 |
| .fast: |
| pxor m0, m0 |
mov r3d, 4 ; zero-fill 32 rows (4 iterations of 8)
| .fast_loop: |
| REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 |
| add r6, 32*8 |
| dec r3d |
| jg .fast_loop |
| .pass2: |
| lea r7, [r6-32*64] |
| lea r4, [r6-32*32] |
lea r6, [pw_5+128] ; constant-pool base expected by the 8bpc helpers
| mov r5, dstq |
| .pass2_loop: |
| mova m0, [r7-32*4] |
| mova m1, [r7-32*3] |
| mova m2, [r7-32*2] |
| mova m3, [r7-32*1] |
| mova m4, [r7+32*0] |
| mova m5, [r7+32*1] |
| mova m6, [r7+32*2] |
| mova m7, [r7+32*3] |
| add r7, 32*32 |
| mova m8, [r7-32*4] |
| mova m9, [r7-32*3] |
| mova m10, [r7-32*2] |
| mova m11, [r7-32*1] |
| mova m12, [r7+32*0] |
| mova m13, [r7+32*1] |
| mova m14, [r7+32*2] |
| mova m15, [r7+32*3] |
| sub r7, 32*24 |
| mova [rsp], m15 |
| call m(idct_16x16_internal_8bpc).main |
| mova m1, [rsp+32*1] |
| call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16 |
| add r5, 32 |
| mov dstq, r5 |
| cmp r7, r4 |
| jl .pass2_loop |
| RET |
| ALIGN function_align |
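; First-pass strip: the odd 16 inputs (in1..in31) build the idct64 odd
; part via four main_part1 calls plus main_part2, in2..in30 feed the
; idct32 odd half, in4..in28 the idct16 odd half, and in0/8/16/24 the
; idct8 core; .main_end then merges everything in the butterfly loop.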
| .main: |
| lea r5, [idct64_mul_16bpc] |
| mova m0, [cq+64* 1] |
| mova m1, [cq+64*31] |
| mova m2, [cq+64*17] |
| mova m3, [cq+64*15] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 |
| mova m0, [cq+64* 7] |
| mova m1, [cq+64*25] |
| mova m2, [cq+64*23] |
| mova m3, [cq+64* 9] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 |
| mova m0, [cq+64* 5] |
| mova m1, [cq+64*27] |
| mova m2, [cq+64*21] |
| mova m3, [cq+64*11] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 |
| mova m0, [cq+64* 3] |
| mova m1, [cq+64*29] |
| mova m2, [cq+64*19] |
| mova m3, [cq+64*13] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 |
| mova m0, [cq+64* 2] |
| mova m1, [cq+64*14] |
| mova m2, [cq+64*18] |
| mova m3, [cq+64*30] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast |
| mova m0, [cq+64* 6] |
| mova m1, [cq+64*10] |
| mova m2, [cq+64*22] |
| mova m3, [cq+64*26] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast |
| mova m0, [cq+64* 4] |
| mova m1, [cq+64*12] |
| mova m2, [cq+64*20] |
| mova m3, [cq+64*28] |
| call m(idct_8x16_internal_10bpc).main_oddhalf_fast |
| mova m0, [cq+64* 0] |
| mova m1, [cq+64* 8] |
| mova m2, [cq+64*16] |
| mova m3, [cq+64*24] |
| pxor m15, m15 |
| mov r7d, 64*30 |
| .main_zero_loop: |
| mova [cq+r7-64*2], m15 |
| mova [cq+r7-64*1], m15 |
| mova [cq+r7+64*0], m15 |
| mova [cq+r7+64*1], m15 |
| sub r7d, 64*4 |
| jg .main_zero_loop |
| .main_end: |
psrld m15, m11, 10 ; pd_2 (rounding bias for the final >>2)
| .main_end2: |
| add cq, 32 |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| call m(idct_8x8_internal_10bpc).main |
| add r6, 32*8 |
| call m(idct_8x16_internal_10bpc).main_evenhalf |
| mova [r6+32*2], m1 |
| mova [r6+32*1], m2 |
| mova [r6+32*0], m3 |
| mova [r6-32*1], m4 |
| mova [r6-32*2], m5 |
| mova [r6-32*3], m6 |
| mova [r6-32*4], m7 |
| jmp .main_end_loop_start |
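; Merge loop: the partial idct8/16/32/64 results are stored mirror-
; symmetrically; r5 advances and r6 retreats until they cross, each
; iteration producing eight output vectors (out 0+n .. 63-n).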
| .main_end_loop: |
| mova m0, [r6+32* 3] ; idct8 0 + n |
| .main_end_loop_start: |
| mova m1, [r5+32* 4] ; idct16 15 - n |
| mova m2, [r5-32*12] ; idct32 16 + n |
| mova m3, [r6-32*13] ; idct32 31 - n |
| mova m4, [r6-32*29] ; idct64 63 - n |
| mova m5, [r5-32*28] ; idct64 48 + n |
| mova m6, [r6-32*45] ; idct64 47 - n |
| mova m7, [r5-32*44] ; idct64 32 + n |
| paddd m8, m0, m1 ; idct16 out0 + n |
| psubd m0, m1 ; idct16 out15 - n |
| REPX {pmaxsd x, m12}, m8, m0 |
| REPX {pminsd x, m13}, m8, m0 |
| paddd m1, m8, m3 ; idct32 out0 + n |
| psubd m8, m3 ; idct32 out31 - n |
| paddd m3, m0, m2 ; idct32 out15 - n |
| psubd m0, m2 ; idct32 out16 + n |
| REPX {pmaxsd x, m12}, m1, m8, m3, m0 |
| REPX {pminsd x, m13}, m1, m3, m8, m0 |
| REPX {paddd x, m15}, m1, m3, m0, m8 |
| paddd m2, m1, m4 ; idct64 out0 + n (unshifted) |
| psubd m1, m4 ; idct64 out63 - n (unshifted) |
| paddd m4, m3, m5 ; idct64 out15 - n (unshifted) |
| psubd m3, m5 ; idct64 out48 + n (unshifted) |
| paddd m5, m0, m6 ; idct64 out16 + n (unshifted) |
| psubd m0, m6 ; idct64 out47 - n (unshifted) |
| paddd m6, m8, m7 ; idct64 out31 - n (unshifted) |
| psubd m8, m7 ; idct64 out32 + n (unshifted) |
| mova [r5-32*44], m2 |
| mova [r6+32* 3], m1 |
| mova [r6-32*45], m4 |
| mova [r5+32* 4], m3 |
| mova [r5-32*28], m5 |
| mova [r6-32*13], m0 |
| mova [r6-32*29], m6 |
| mova [r5-32*12], m8 |
| add r5, 32 |
| sub r6, 32 |
| cmp r5, r6 |
| jl .main_end_loop |
| ret |
| .shift_transpose: |
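; Shift the (already rounding-biased) 32-bit results right, pack to
; 16-bit with signed saturation, and transpose 16x8 tiles back into the
; same buffer. The shift differs between the 64x16 (2) and 64x32 (1)
; cases, hence the macro.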
| %macro IDCT64_SHIFT_TRANSPOSE 1 ; shift |
| sub r6, 32*48 |
| mov r5, r6 |
| %%loop: |
| mova m0, [r6-32* 4] |
| mova m4, [r6+32* 4] |
| mova m1, [r6-32* 3] |
| mova m5, [r6+32* 5] |
| mova m2, [r6-32* 2] |
| mova m6, [r6+32* 6] |
| mova m3, [r6-32* 1] |
| mova m7, [r6+32* 7] |
| REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 |
| packssdw m0, m4 |
| packssdw m1, m5 |
| packssdw m2, m6 |
| packssdw m3, m7 |
| mova m4, [r6+32* 0] |
| mova m6, [r6+32* 8] |
| mova m5, [r6+32* 1] |
| mova m7, [r6+32* 9] |
| REPX {psrad x, %1}, m4, m6, m5, m7 |
| packssdw m4, m6 |
| packssdw m5, m7 |
| mova m6, [r6+32* 2] |
| mova m8, [r6+32*10] |
| mova m7, [r6+32* 3] |
| mova m9, [r6+32*11] |
| REPX {psrad x, %1}, m6, m8, m7, m9 |
| packssdw m6, m8 |
| packssdw m7, m9 |
| call m(idct_16x8_internal_10bpc).transpose3 |
| mova [r5-32*4], m0 |
| mova [r5-32*3], m1 |
| mova [r5-32*2], m2 |
| mova [r5-32*1], m3 |
| mova [r5+32*0], m4 |
| mova [r5+32*1], m5 |
| mova [r5+32*2], m6 |
| mova [r5+32*3], m7 |
| add r6, 32*16 |
| add r5, 32*8 |
| cmp r5, r4 |
| jl %%loop |
| mov r6, r4 |
| %endmacro |
| IDCT64_SHIFT_TRANSPOSE 2 |
| ret |
| |
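; 64x32 inverse DCT. Rectangular: coefficients are pre-scaled by
; 2896/4096 (1/sqrt(2)), and the eob thresholds 36/136/300 select how
; many of the four 8-dword coefficient strips actually need transforming.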
| cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| vpbroadcastd m14, [pd_2896] |
| lea r6, [rsp+32*7] |
| call .main |
| cmp eobd, 36 |
| jl .fast |
| call .main |
| cmp eobd, 136 |
| jl .fast |
| call .main |
| cmp eobd, 300 |
| jl .fast |
| call .main |
| jmp .pass2 |
| .dconly: |
| imul r6d, [cq], 181 |
| mov [cq], eobd ; 0 |
or r3d, 32 ; 32 output rows
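; dc = (dc*181 + 128) >> 8, then (dc*181 + 384) >> 9: rounded 1/sqrt(2)
; scalings before the shared dconly2 tail applies the last one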
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| add r6d, 384 |
| sar r6d, 9 |
| jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 |
| .fast: |
| pxor m0, m0 |
| lea r4, [rsp+32*135] |
| .fast_loop: |
| REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 |
| add r6, 32*8 |
| cmp r6, r4 |
| jl .fast_loop |
| .pass2: |
| lea r7, [r6-32*32] |
| lea r5, [r6+32*8] |
| lea r6, [pw_5+128] |
| imul r2, strideq, 19 |
| lea r3, [strideq*3] |
| add r2, dstq |
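; r2 = dstq + stride*19: second write pointer consumed by pass2_end,
; stepped in tandem with dstq at the bottom of the loop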
| .pass2_loop: |
| mova m0, [r7-32*99] |
| mova m1, [r7-32*97] |
| mova m2, [r7-32*95] |
| mova m3, [r7-32*93] |
| mova m4, [r7-32*67] |
| mova m5, [r7-32*65] |
| mova m6, [r7-32*63] |
| mova m7, [r7-32*61] |
| mova m8, [r7-32*35] |
| mova m9, [r7-32*33] |
| mova m10, [r7-32*31] |
| mova m11, [r7-32*29] |
| mova m12, [r7-32* 3] |
| mova m13, [r7-32* 1] |
| mova m14, [r7+32* 1] |
| mova m15, [r7+32* 3] |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf |
| mova m0, [r7-32*100] |
| mova m1, [r7-32*98] |
| mova m2, [r7-32*96] |
| mova m3, [r7-32*94] |
| mova m4, [r7-32*68] |
| mova m5, [r7-32*66] |
| mova m6, [r7-32*64] |
| mova m7, [r7-32*62] |
| mova m8, [r7-32*36] |
| mova m9, [r7-32*34] |
| mova m10, [r7-32*32] |
| mova m11, [r7-32*30] |
| mova m12, [r7-32* 4] |
| mova m13, [r7-32* 2] |
| mova m14, [r7+32* 0] |
| mova m15, [r7+32* 2] |
| add r7, 32*8 |
| mova [rsp], m15 |
| call m(idct_16x16_internal_8bpc).main |
| call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end |
| sub dstq, r3 |
| lea r2, [r2+r3+32] |
| add dstq, 32 |
| cmp r7, r4 |
| jl .pass2_loop |
| RET |
| ALIGN function_align |
| .main: |
| lea r5, [idct64_mul_16bpc] |
| pmulld m0, m14, [cq+128* 1] |
| pmulld m1, m14, [cq+128*31] |
| pmulld m2, m14, [cq+128*17] |
| pmulld m3, m14, [cq+128*15] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 |
| pmulld m0, m14, [cq+128* 7] |
| pmulld m1, m14, [cq+128*25] |
| pmulld m2, m14, [cq+128*23] |
| pmulld m3, m14, [cq+128* 9] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 |
| pmulld m0, m14, [cq+128* 5] |
| pmulld m1, m14, [cq+128*27] |
| pmulld m2, m14, [cq+128*21] |
| pmulld m3, m14, [cq+128*11] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 |
| pmulld m0, m14, [cq+128* 3] |
| pmulld m1, m14, [cq+128*29] |
| pmulld m2, m14, [cq+128*19] |
| pmulld m3, m14, [cq+128*13] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 |
| pmulld m0, m14, [cq+128* 2] |
| pmulld m1, m14, [cq+128*14] |
| pmulld m2, m14, [cq+128*18] |
| pmulld m3, m14, [cq+128*30] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2 |
| pmulld m0, m14, [cq+128* 6] |
| pmulld m1, m14, [cq+128*10] |
| pmulld m2, m14, [cq+128*22] |
| pmulld m3, m14, [cq+128*26] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2 |
| pmulld m0, m14, [cq+128* 4] |
| pmulld m1, m14, [cq+128*12] |
| pmulld m2, m14, [cq+128*20] |
| pmulld m3, m14, [cq+128*28] |
| call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2 |
| pmulld m0, m14, [cq+128* 0] |
| pmulld m1, m14, [cq+128* 8] |
| pmulld m2, m14, [cq+128*16] |
| pmulld m3, m14, [cq+128*24] |
| pxor m15, m15 |
| mov r7d, 128*29 |
| .main_zero_loop: |
| mova [cq+r7-128*1], m15 |
| mova [cq+r7+128*0], m15 |
| mova [cq+r7+128*1], m15 |
| mova [cq+r7+128*2], m15 |
| sub r7d, 128*4 |
| jg .main_zero_loop |
psrld m15, m11, 11 ; pd_1 (rounding bias for the final >>1)
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3 ; (x*2896 + 2048) >> 12: rect2 rounding
| call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2 |
| IDCT64_SHIFT_TRANSPOSE 1 |
| ret |
| |
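; 64x64 inverse DCT. No rect2 scaling; up to four first-pass calls
; (eob thresholds as in 64x32) cover the 32x32 coefficient block, and
; each second-pass iteration outputs a full 64-row, 16-column slice.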
| cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob |
| %undef cmp |
| vpbroadcastd m11, [pd_2048] |
| vpbroadcastd m12, [clip_18b_min] |
| vpbroadcastd m13, [clip_18b_max] |
| vpbroadcastd m14, [pd_2896] |
| lea r6, [rsp+32*7] |
| call .main |
| cmp eobd, 36 |
| jl .fast |
| call .main |
| cmp eobd, 136 |
| jl .fast |
| call .main |
| cmp eobd, 300 |
| jl .fast |
| call .main |
| jmp .pass2 |
| .dconly: |
| imul r6d, [cq], 181 |
| mov [cq], eobd ; 0 |
or r3d, 64 ; 64 output rows
| jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly |
| .fast: |
| pxor m0, m0 |
| lea r4, [rsp+32*135] |
| .fast_loop: |
| REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 |
| add r6, 32*8 |
| cmp r6, r4 |
| jl .fast_loop |
| .pass2: |
| lea r10, [r6-32*32] |
| lea r6, [pw_5+128] |
| lea r8, [strideq*4] |
| lea r9, [strideq*5] |
| lea r3, [r9+strideq*1] ; stride*6 |
| lea r7, [r9+strideq*2] ; stride*7 |
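; Each iteration: in0..in28 (even) go through the 8bpc idct16, in2..in30
; through the 16x32 odd half, the 16 odd inputs through two 16x64
; main_part1 calls, then main_part2_pass2 merges and writes the slice.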
| .pass2_loop: |
| mova m0, [r10-32*100] ; in0 |
| mova m1, [r10-32*96] ; in4 |
| mova m2, [r10-32*68] ; in8 |
| mova m3, [r10-32*64] ; in12 |
| mova m4, [r10-32*36] ; in16 |
| mova m5, [r10-32*32] ; in20 |
| mova m6, [r10-32* 4] ; in24 |
| mova m7, [r10+32* 0] ; in28 |
| pxor m8, m8 |
| REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 |
| mova [rsp], m8 |
| call m(idct_16x16_internal_8bpc).main |
| mova m1, [rsp+32*1] |
| mova [r4-32*4], m0 |
| mova [r4-32*3], m1 |
| mova [r4-32*2], m2 |
| mova [r4-32*1], m3 |
| mova [r4+32*0], m4 |
| mova [r4+32*1], m5 |
| mova [r4+32*2], m6 |
| mova [r4+32*3], m7 |
| add r4, 32*8 |
| mova [r4-32*4], m8 |
| mova [r4-32*3], m9 |
| mova [r4-32*2], m10 |
| mova [r4-32*1], m11 |
| mova [r4+32*0], m12 |
| mova [r4+32*1], m13 |
| mova [r4+32*2], m14 |
| mova [r4+32*3], m15 |
| mova m0, [r10-32*98] ; in2 |
| mova m1, [r10-32*94] ; in6 |
| mova m2, [r10-32*66] ; in10 |
| mova m3, [r10-32*62] ; in14 |
| mova m4, [r10-32*34] ; in18 |
| mova m5, [r10-32*30] ; in22 |
| mova m6, [r10-32* 2] ; in26 |
| mova m7, [r10+32* 2] ; in30 |
| lea r5, [r4+32*16] |
| add r4, 32*8 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| mova m0, [r10-32*99] ; in1 |
| mova m1, [r10+32* 3] ; in31 |
| mova m2, [r10-32*35] ; in17 |
| mova m3, [r10-32*61] ; in15 |
| mova m4, [r10-32*67] ; in9 |
| mova m5, [r10-32*29] ; in23 |
| mova m6, [r10-32* 3] ; in25 |
| mova m7, [r10-32*93] ; in7 |
| lea r6, [idct64_mul - 8] |
| add r4, 32*16 |
| add r5, 32*32 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 |
| mova m0, [r10-32*95] ; in5 |
| mova m1, [r10-32* 1] ; in27 |
| mova m2, [r10-32*31] ; in21 |
| mova m3, [r10-32*65] ; in11 |
| mova m4, [r10-32*63] ; in13 |
| mova m5, [r10-32*33] ; in19 |
| mova m6, [r10+32* 1] ; in29 |
| mova m7, [r10-32*97] ; in3 |
| add r6, 8 |
| add r4, 32*8 |
| sub r5, 32*8 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 |
| add r10, 32*8 |
| sub dstq, r8 |
| sub r4, 32*44 |
| add dstq, 32 |
| cmp r10, r4 |
| jl .pass2_loop |
| RET |
| ALIGN function_align |
| .main: |
| lea r5, [idct64_mul_16bpc] |
| mova m0, [cq+128* 1] |
| mova m1, [cq+128*31] |
| mova m2, [cq+128*17] |
| mova m3, [cq+128*15] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 |
| mova m0, [cq+128* 7] |
| mova m1, [cq+128*25] |
| mova m2, [cq+128*23] |
| mova m3, [cq+128* 9] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 |
| mova m0, [cq+128* 5] |
| mova m1, [cq+128*27] |
| mova m2, [cq+128*21] |
| mova m3, [cq+128*11] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 |
| mova m0, [cq+128* 3] |
| mova m1, [cq+128*29] |
| mova m2, [cq+128*19] |
| mova m3, [cq+128*13] |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 |
| call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 |
| mova m0, [cq+128* 2] |
| mova m1, [cq+128*14] |
| mova m2, [cq+128*18] |
| mova m3, [cq+128*30] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast |
| mova m0, [cq+128* 6] |
| mova m1, [cq+128*10] |
| mova m2, [cq+128*22] |
| mova m3, [cq+128*26] |
| call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast |
| mova m0, [cq+128* 4] |
| mova m1, [cq+128*12] |
| mova m2, [cq+128*20] |
| mova m3, [cq+128*28] |
| call m(idct_8x16_internal_10bpc).main_oddhalf_fast |
| mova m0, [cq+128* 0] |
| mova m1, [cq+128* 8] |
| mova m2, [cq+128*16] |
| mova m3, [cq+128*24] |
| pxor m15, m15 |
| mov r7d, 128*29 |
| .main_zero_loop: |
| mova [cq+r7-128*1], m15 |
| mova [cq+r7+128*0], m15 |
| mova [cq+r7+128*1], m15 |
| mova [cq+r7+128*2], m15 |
| sub r7d, 128*4 |
| jg .main_zero_loop |
| call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end |
| jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose |
| |
| %endif ; ARCH_X86_64 |