| ; Copyright © 2021, VideoLAN and dav1d authors |
| ; Copyright © 2021, Two Orioles, LLC |
| ; Copyright © 2017-2021, The rav1e contributors |
| ; Copyright © 2020, Nathan Egge |
| ; Copyright © 2021, Matthias Dressel |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| SECTION_RODATA |
| %macro COEF 1-2 |
| pd_%1: times 4 dd %1 |
| %if %0 == 2 |
| pd_m%1: times 4 dd -%1 |
| %endif |
| %endmacro |
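| ; e.g. "COEF 601, 1" emits both the constant and its negation: |
| ;   pd_601: times 4 dd 601 |
| ;   pd_m601: times 4 dd -601 |
| ; while "COEF 201" emits only pd_201 |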
| |
| COEF 201 |
| COEF 401 |
| COEF 601, 1 |
| COEF 799 |
| COEF 995 |
| COEF 1189, 1 |
| COEF 1380, 1 |
| COEF 1567 |
| COEF 1751 |
| COEF 1931 |
| COEF 2106, 1 |
| COEF 2276, 1 |
| COEF 2440 |
| COEF 2598, 1 |
| COEF 2751, 1 |
| COEF 2896 |
| COEF 3035 |
| COEF 3166 |
| COEF 3290 |
| COEF 3406 |
| COEF 3513 |
| COEF 3612 |
| COEF 3703 |
| COEF 3784 |
| COEF 3857 |
| COEF 3920 |
| COEF 3973 |
| COEF 4017 |
| COEF 4052 |
| COEF 4076 |
| COEF 4091 |
| |
| deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 |
| |
| %if ARCH_X86_32 |
| pd_1: times 4 dd 1 |
| %endif |
| pd_2: times 4 dd 2 |
| pw_5: times 8 dw 5 |
| pd_1321: times 4 dd 1321 |
| pd_2482: times 4 dd 2482 |
| pd_m3344: times 4 dd -3344 |
| pd_2048: times 4 dd 2048 |
| pw_4x2048_4xm2048: times 4 dw 2048 |
| times 4 dw -2048 |
| pw_4xm2048_4x2048: times 4 dw -2048 |
| times 4 dw 2048 |
| pw_2048: times 8 dw 2048 |
| pw_m2048: times 8 dw -2048 |
| pd_3803: times 4 dd 3803 |
| pw_4096: times 8 dw 4096 |
| pd_5793: times 4 dd 5793 |
| pd_6144: times 4 dd 6144 |
| pw_8192: times 8 dw 8192 |
| pd_10240: times 4 dd 10240 |
| pd_11586: times 4 dd 11586 |
| pw_1697x8: times 8 dw 1697*8 |
| pw_2896x8: times 8 dw 2896*8 |
| pw_1697x16: times 8 dw 1697*16 |
| pw_16384: times 8 dw 16384 |
| pixel_10bpc_max: times 8 dw 0x03ff |
| |
| pw_1567_3784: times 4 dw 1567, 3784 |
| pw_m3784_1567: times 4 dw -3784, 1567 |
| pw_2896_2896: times 4 dw 2896, 2896 |
| pw_m2896_2896: times 4 dw -2896, 2896 |
| |
| clip_18b_min: times 4 dd -0x20000 |
| clip_18b_max: times 4 dd 0x1ffff |
| |
| idct64_mul_16bpc: |
| dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 |
| dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 |
| dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 |
| dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 |
| |
| cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3 |
| cextern iadst_4x4_internal_8bpc_ssse3.main |
| cextern idct_4x8_internal_8bpc_ssse3.main |
| cextern iadst_4x8_internal_8bpc_ssse3.main |
| cextern idct_16x4_internal_8bpc_ssse3.main |
| cextern iadst_16x4_internal_8bpc_ssse3.main |
| cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end |
| cextern idct_8x4_internal_8bpc_ssse3.main |
| cextern iadst_8x4_internal_8bpc_ssse3.main |
| cextern idct_8x8_internal_8bpc_ssse3.main |
| cextern idct_8x8_internal_8bpc_ssse3.pass1_end3 |
| cextern iadst_8x8_internal_8bpc_ssse3.main |
| cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end |
| cextern idct_16x8_internal_8bpc_ssse3.main |
| cextern iadst_16x8_internal_8bpc_ssse3.main |
| cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end |
| cextern idct_8x32_internal_8bpc_ssse3.main |
| cextern idct_8x32_internal_8bpc_ssse3.main_fast |
| cextern idct_8x32_internal_8bpc_ssse3.main_veryfast |
| cextern idct_16x64_internal_8bpc_ssse3.main |
| cextern idct_16x64_internal_8bpc_ssse3.main_fast |
| |
| tbl_4x16_2d: db 0, 13, 29, 45 |
| tbl_4x16_h: db 0, 16, 32, 48 |
| tbl_4x16_v: db 0, 4, 8, 12 |
| |
| tbl_8x16_2d: db 0, 14, 30, 46 |
| tbl_8x16_v: db 0, 4, 8, 12 |
| tbl_8x16_h: db 0, 32, 64, 96 |
| |
| tbl_16x16_2d: db 0, 10, 36, 78 |
| tbl_16x16_v: db 0, 4, 8, 12 |
| tbl_16x16_h: db 0, 64, 128, 192 |
| |
| tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203 |
| |
| tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343 |
| |
| tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one |
| tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406 |
| |
| tbl_Nx32_odd_offset: db 2*16, 2*23 |
| db 2*20, 2*19 |
| db 2*18, 2*21 |
| db 2*22, 2*17 |
| db 2*30, 2*25 |
| db 2*26, 2*29 |
| db 2*28, 2*27 |
| db 2*24, 2*31 |
| |
| tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46 |
| db 2* 8, 2*40, 2*23, 2*38 |
| db 2* 1, 2*36, 2*20, 2*42 |
| db 2* 9, 2*44, 2*19, 2*34 |
| db 2* 2, 2*60, 2*18, 2*50 |
| db 2*10, 2*52, 2*21, 2*58 |
| db 2* 3, 2*56, 2*22, 2*54 |
| db 2*11, 2*48, 2*17, 2*62 |
| |
| SECTION .text |
| |
| %define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx) |
| %define m(x) m_suffix(x, SUFFIX) |
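| ; e.g. with dav1d's private_prefix and "INIT_XMM sse4", |
| ; m(idct_4x4_internal_16bpc) mangles to dav1d_idct_4x4_internal_16bpc_sse4 |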
| |
| ; This refers to the first function in itx_sse, i.e. the start of the text |
| ; section, which is needed as a base pointer for constants. |
| %define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3) |
| |
| %if ARCH_X86_64 |
| %define o(x) x |
| %else |
| %define o(x) r6-$$+x ; PIC |
| %endif |
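| ; (on x86-32, INV_TXFM_FN sets r6 to $$ with LEA, so o(x) rebases constant |
| ; addresses relative to the load address for position-independent code) |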
| |
| %macro IWHT4_1D 0 |
| ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 |
| paddd m0, m1 ; in0 += in1 |
| psubd m4, m2, m3 ; tmp0 = in2 - in3 |
| psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 |
| psrad m5, 1 |
| psubd m2, m5, m1 ; in2 = tmp1 - in1 |
| psubd m5, m3 ; in1 = tmp1 - in3 |
| psubd m0, m5 ; in0 -= in1 |
| paddd m4, m2 ; in3 = tmp0 + in2 |
| ; m0 = out0, m1 = in1, m2 = out2, m3 = in3 |
| ; m4 = out3, m5 = out1 |
| %endmacro |
| |
| INIT_XMM sse2 |
| cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax |
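| ; eob is unused here; bdmax is accessed via bdmaxm (register or stack, |
| ; depending on the ABI) |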
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| mova m2, [cq+16*2] |
| mova m3, [cq+16*3] |
| REPX {psrad x, 2}, m0, m1, m2, m3 |
| IWHT4_1D |
| punpckldq m1, m0, m5 |
| punpckhdq m3, m0, m5 |
| punpckldq m5, m2, m4 |
| punpckhdq m2, m4 |
| punpcklqdq m0, m1, m5 |
| punpckhqdq m1, m5 |
| punpcklqdq m4, m3, m2 |
| punpckhqdq m3, m2 |
| mova m2, m4 |
| IWHT4_1D |
| packssdw m0, m4 ; low: out3, high: out0 |
| packssdw m2, m5 ; low: out2, high: out1 |
| pxor m4, m4 |
| mova [cq+16*0], m4 |
| mova [cq+16*1], m4 |
| mova [cq+16*2], m4 |
| mova [cq+16*3], m4 |
| lea r2, [dstq+strideq*2] |
| movq m1, [dstq+strideq*0] |
| movhps m1, [r2 +strideq*1] |
| movq m3, [r2 +strideq*0] |
| movhps m3, [dstq+strideq*1] |
| movd m5, bdmaxm |
| pshuflw m5, m5, q0000 ; broadcast |
| punpcklqdq m5, m5 ; broadcast |
| paddsw m0, m1 |
| paddsw m2, m3 |
| pmaxsw m0, m4 |
| pmaxsw m2, m4 |
| pminsw m0, m5 |
| pminsw m2, m5 |
| movhps [r2 +strideq*1], m0 ; write out0 |
| movhps [dstq+strideq*1], m2 ; write out1 |
| movq [r2 +strideq*0], m2 ; write out2 |
| movq [dstq+strideq*0], m0 ; write out3 |
| RET |
| |
| ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 |
| ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 |
| ; flags: 2 = inv_dst1, 4 = inv_dst2 |
| ; skip round/shift if rnd is not a number |
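| ; a coef value < 32 is treated as a register number: the multiplier is |
| ; taken from that (already loaded) register instead of from [o(pd_coef)] |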
| %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags |
| ; %1 dst/src[1] |
| ; %2 dst/src[2] |
| ; %3 tmp[1] |
| ; %4 tmp[2] |
| ; %5 tmp[3] |
| ; %6 rnd |
| ; %7 coef[1] |
| ; %8 coef[2] |
| ; %9 flags |
| %ifnidn %7,%8 ; optimize when coef1 == coef2 |
| %if %8 < 32 |
| pmulld m%4, m%1, m%8 |
| pmulld m%3, m%2, m%8 |
| %else |
| mova m%3, [o(pd_%8)] |
| pmulld m%4, m%1, m%3 |
| pmulld m%3, m%2 |
| %endif |
| %endif |
| %if %7 < 32 |
| pmulld m%1, m%7 |
| pmulld m%2, m%7 |
| %else |
| mova m%5, [o(pd_%7)] |
| pmulld m%1, m%5 |
| pmulld m%2, m%5 |
| %endif |
| %if %9 & 4 ; invert dst2 |
| paddd m%4, m%2 |
| psubd m%2, m%6, m%4 |
| %else |
| %ifnum %6 |
| %ifnidn %7,%8 |
| paddd m%4, m%6 |
| %else |
| paddd m%1, m%6 |
| %endif |
| %endif |
| %ifnidn %7,%8 |
| paddd m%2, m%4 |
| %else |
| mova m%3, m%2 |
| paddd m%2, m%1 |
| %endif |
| %endif |
| %if %9 & 2 ; invert dst1 |
| psubd m%3, m%1 |
| paddd m%1, m%3, m%6 |
| %else |
| %ifnum %6 |
| %ifnidn %7,%8 |
| paddd m%1, m%6 |
| %endif |
| %endif |
| psubd m%1, m%3 |
| %endif |
| %ifnum %6 |
| psrad m%2, 12 |
| psrad m%1, 12 |
| %endif |
| %endmacro |
| |
| %macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack |
| cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 |
| %define %%p1 m(i%1_%4_internal_16bpc) |
| %if ARCH_X86_32 |
| LEA r6, $$ |
| %endif |
| %if has_epilogue |
| %ifidn %1_%2, dct_dct |
| test eobd, eobd |
| jz %%end |
| %endif |
| lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] |
| %ifnum %3 |
| %if %3 |
| add eobd, %3 |
| %endif |
| %else |
| lea r5, [o(%3)] |
| %endif |
| call %%p1 |
| RET |
| %%end: |
| %else |
| ; Jump to the 1st txfm function if we're not taking the fast path, which |
| ; in turn performs an indirect jump to the 2nd txfm function. |
| lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] |
| %ifnum %3 |
| %if %3 |
| add eobd, %3 |
| %endif |
| %else |
| lea r5, [o(%3)] |
| %endif |
| %ifidn %1_%2, dct_dct |
| test eobd, eobd |
| jnz %%p1 |
| %else |
| ; jump to the 1st txfm function unless it's located directly after this |
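| ; ("times" repeats the jmp 0 or 1 times: the sign bit of %%end - %%p1 is |
| ;  set only when %%p1 lies beyond %%end) |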
| times ((%%end - %%p1) >> 31) & 1 jmp %%p1 |
| ALIGN function_align |
| %%end: |
| %endif |
| %endif |
| %endmacro |
| |
| %macro INV_TXFM_4X4_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 0, 4x4 |
| %ifidn %1_%2, dct_dct |
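| ; dc-only path: 181 == 2896/16 and 128 == 2048/16, so (dc*181 + 128) >> 8 |
| ; equals the (dc*2896 + 2048) >> 12 DC scaling of a full 1-D pass |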
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 4 |
| .dconly: |
| add r5d, 128 |
| sar r5d, 8 |
| .dconly2: |
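| ; (dc*2896 + 34816) >> 16, taken as word 1 of the dword product below, |
| ; fuses the rounded *2896 >> 12 scaling with the final rounded >> 4, |
| ; since 34816 == 2048 + (8 << 12) |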
| imul r5d, 2896 |
| mova m2, [o(pixel_10bpc_max)] |
| add r5d, 34816 |
| movd m0, r5d |
| pshuflw m0, m0, q1111 |
| pxor m3, m3 |
| punpcklqdq m0, m0 |
| .dconly_loop: |
| movq m1, [dstq+strideq*0] |
| movhps m1, [dstq+strideq*1] |
| paddw m1, m0 |
| pminsw m1, m2 |
| pmaxsw m1, m3 |
| movq [dstq+strideq*0], m1 |
| movhps [dstq+strideq*1], m1 |
| lea dstq, [dstq+strideq*2] |
| sub r3d, 2 |
| jg .dconly_loop |
| RET |
| %endif |
| %endmacro |
| |
| %macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd |
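| ; scalar reference (per lane), with rnd = pd_2048: |
| ;   t0 = (in0 + in2) * 2896 >> 12, t1 = (in0 - in2) * 2896 >> 12 |
| ;   t2 = (in1*1567 - in3*3784) >> 12, t3 = (in1*3784 + in3*1567) >> 12 |
| ;   out0 = t0+t3, out1 = t1+t2, out2 = t1-t2, out3 = t0-t3 |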
| ; butterfly rotation |
| ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0 |
| ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3 |
| ; Hadamard rotation |
| psubd m%5, m%1, m%2 |
| paddd m%2, m%1 |
| paddd m%1, m%3, m%4 |
| psubd m%3, m%4 |
| ; %1 (src1) = out0 |
| ; %2 (src2) = out1 |
| ; %3 (src3) = out3 |
| ; %5 (tmp1) = out2 |
| %endmacro |
| |
| INIT_XMM sse4 |
| |
| INV_TXFM_4X4_FN dct, dct |
| INV_TXFM_4X4_FN dct, identity |
| INV_TXFM_4X4_FN dct, adst |
| INV_TXFM_4X4_FN dct, flipadst |
| |
| cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| mova m2, [cq+16*2] |
| mova m3, [cq+16*3] |
| mova m5, [o(pd_2048)] |
| call .pass1_main |
| packssdw m0, m1 ; out0 out1 |
| packssdw m4, m2 ; out2 out3 |
| ; transpose |
| punpckhwd m2, m0, m4 |
| punpcklwd m0, m4 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| ; m0 = out0 out1 |
| ; m1 = out2 out3 |
| ; m5 = pd_2048 |
| jmp tx2q |
| .pass1_main: |
| IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 |
| ret |
| .pass2: |
| ; m0 = in0 in1 |
| ; m1 = in2 in3 |
| ; m5 = pd_2048 |
| punpckhwd m2, m1, m0 |
| punpcklwd m1, m0 |
| pmaddwd m4, m2, [o(pw_m3784_1567)] |
| pmaddwd m2, [o(pw_1567_3784)] |
| pmaddwd m0, m1, [o(pw_m2896_2896)] |
| pmaddwd m1, [o(pw_2896_2896)] |
| REPX {paddd x, m5}, m4, m2, m0, m1 |
| packssdw m5, m5 ; pw_2048 |
| REPX {psrad x, 12}, m4, m2, m0, m1 |
| packssdw m2, m4 ; t3 t2 |
| packssdw m1, m0 ; t0 t1 |
| paddsw m0, m1, m2 ; out0 out1 |
| psubsw m1, m2 ; out3 out2 |
| pmulhrsw m0, m5 |
| pmulhrsw m1, m5 |
| movq m2, [dstq+strideq*0] |
| movhps m2, [dstq+strideq*1] |
| lea r5, [dstq+strideq*2] |
| movq m3, [r5 +strideq*1] |
| movhps m3, [r5 +strideq*0] |
| mova m5, [o(pixel_10bpc_max)] |
| pxor m4, m4 |
| mova [cq+16*0], m4 |
| mova [cq+16*1], m4 |
| mova [cq+16*2], m4 |
| mova [cq+16*3], m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m4 |
| pmaxsw m1, m4 |
| pminsw m0, m5 |
| pminsw m1, m5 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| movhps [r5 +strideq*0], m1 |
| movq [r5 +strideq*1], m1 |
| RET |
| |
| INV_TXFM_4X4_FN adst, dct |
| INV_TXFM_4X4_FN adst, adst |
| INV_TXFM_4X4_FN adst, flipadst |
| INV_TXFM_4X4_FN adst, identity |
| |
| cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| call .main |
| packssdw m0, m2 ; out0 out1 |
| packssdw m1, m4 ; out2 out3 |
| ; transpose |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| ; m0 = out0 out1 |
| ; m1 = out2 out3 |
| ; m5 = pd_2048 |
| jmp tx2q |
| .pass2: |
| ; m0 = in0 in1 |
| ; m1 = in2 in3 |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main |
| .end: |
| mova m4, [o(pw_2048)] |
| movq m2, [dstq+strideq*0] |
| movhps m2, [dstq+strideq*1] |
| lea r5, [dstq+strideq*2] |
| movq m3, [r5 +strideq*0] |
| movhps m3, [r5 +strideq*1] |
| mova m5, [o(pixel_10bpc_max)] |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| pxor m4, m4 |
| mova [cq+16*0], m4 |
| mova [cq+16*1], m4 |
| mova [cq+16*2], m4 |
| mova [cq+16*3], m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m4 |
| pmaxsw m1, m4 |
| pminsw m0, m5 |
| pminsw m1, m5 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| movq [r5 +strideq*0], m1 |
| movhps [r5 +strideq*1], m1 |
| RET |
| ALIGN function_align |
| .main: |
| mova m1, [cq+16*2] |
| mova m3, [cq+16*3] |
| mova m5, [cq+16*0] |
| lea r3, [cq+16*1] |
| .main2: |
| mova m0, [o(pd_1321)] ; SINPI_1_9 |
| mova m2, [o(pd_2482)] ; SINPI_2_9 |
| mova m6, [o(pd_3803)] ; SINPI_4_9 |
| pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2] |
| pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3] |
| pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2] |
| pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0] |
| psubd m1, m3 ; T[2] - T[3] |
| pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3] |
| pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0] |
| paddd m0, m6 ; s[0] += s[3] |
| paddd m0, m3 ; s[0] += s[5] |
| mova m3, [o(pd_m3344)] ; -SINPI_3_9 |
| psubd m2, m4 ; s[1] -= s[4] |
| psubd m2, m7 ; s[1] -= s[6] |
| psubd m1, m5 ; -b7 = (T[2] - T[3]) - T[0] |
| pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7 |
| pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1] |
| mova m5, [o(pd_2048)] |
| REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048 |
| paddd m4, m0, m2 ; x[3] = s[0] + s[1] |
| psubd m2, m3 ; x[1] = s[1] + s[3] |
| psubd m0, m3 ; x[0] = s[0] + s[3] |
| paddd m4, m3 ; x[3] -= s[3] |
| paddd m2, m5 ; x[1] + 2048 |
| REPX {psrad x, 12}, m0, m2, m1, m4 |
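| ; m0 = x[0], m2 = x[1], m1 = x[2], m4 = x[3] |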
| ret |
| |
| INV_TXFM_4X4_FN flipadst, dct |
| INV_TXFM_4X4_FN flipadst, adst |
| INV_TXFM_4X4_FN flipadst, flipadst |
| INV_TXFM_4X4_FN flipadst, identity |
| |
| cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| call m(iadst_4x4_internal_16bpc).main |
| packssdw m0, m2 ; out0 out1 |
| packssdw m1, m4 ; out2 out3 |
| ; transpose |
| punpcklwd m2, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| ; m0 = out0 out1 |
| ; m1 = out2 out3 |
| ; m5 = pd_2048 |
| jmp tx2q |
| .pass2: |
| ; m0 = in0 in1 |
| ; m1 = in2 in3 |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main |
| mova m4, [o(pw_2048)] |
| movq m3, [dstq+strideq*1] |
| movhps m3, [dstq+strideq*0] |
| lea r5, [dstq+strideq*2] |
| movq m2, [r5 +strideq*1] |
| movhps m2, [r5 +strideq*0] |
| mova m5, [o(pixel_10bpc_max)] |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| pxor m4, m4 |
| mova [cq+16*0], m4 |
| mova [cq+16*1], m4 |
| mova [cq+16*2], m4 |
| mova [cq+16*3], m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m4 |
| pmaxsw m1, m4 |
| pminsw m0, m5 |
| pminsw m1, m5 |
| movhps [dstq+strideq*0], m1 |
| movq [dstq+strideq*1], m1 |
| movhps [r5 +strideq*0], m0 |
| movq [r5 +strideq*1], m0 |
| RET |
| |
| INV_TXFM_4X4_FN identity, dct |
| INV_TXFM_4X4_FN identity, adst |
| INV_TXFM_4X4_FN identity, flipadst |
| INV_TXFM_4X4_FN identity, identity |
| |
| cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| mova m3, [o(pd_5793)] |
| pmulld m0, m3, [cq+16*0] |
| pmulld m1, m3, [cq+16*1] |
| pmulld m2, m3, [cq+16*2] |
| pmulld m3, [cq+16*3] |
| mova m5, [o(pd_2048)] |
| REPX {paddd x, m5}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| ; transpose |
| punpckhwd m3, m0, m2 |
| punpcklwd m0, m2 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| ; m0 = out0 out1 |
| ; m1 = out2 out3 |
| ; m5 = pd_2048 |
| jmp tx2q |
| .pass2: |
| ; m0 = in0 in1 |
| ; m1 = in2 in3 |
| ; m5 = pd_2048 |
| mova m4, [o(pw_1697x8)] |
| movq m2, [dstq+strideq*0] |
| movhps m2, [dstq+strideq*1] |
| lea r5, [dstq+strideq*2] |
| pmulhrsw m3, m4, m0 |
| pmulhrsw m4, m1 |
| paddsw m0, m3 |
| paddsw m1, m4 |
| movq m3, [r5 +strideq*0] |
| movhps m3, [r5 +strideq*1] |
| mova m4, [o(pixel_10bpc_max)] |
| packssdw m5, m5 ; pw_2048 |
| pmulhrsw m0, m5 |
| pmulhrsw m1, m5 |
| pxor m5, m5 |
| mova [cq+16*0], m5 |
| mova [cq+16*1], m5 |
| mova [cq+16*2], m5 |
| mova [cq+16*3], m5 |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m5 |
| pmaxsw m1, m5 |
| pminsw m0, m4 |
| pminsw m1, m4 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| movq [r5 +strideq*0], m1 |
| movhps [r5 +strideq*1], m1 |
| RET |
| |
| %macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset |
| INV_TXFM_FN %1, %2, %3, 4x8 |
| %ifidn %1_%2, dct_dct |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 8 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 181 |
| jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly |
| %endif |
| %endmacro |
| |
| INV_TXFM_4X8_FN dct, dct |
| INV_TXFM_4X8_FN dct, identity, 9 |
| INV_TXFM_4X8_FN dct, adst |
| INV_TXFM_4X8_FN dct, flipadst |
| |
| cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %undef cmp |
| mova m5, [o(pd_2048)] |
| %if ARCH_X86_64 |
| xor r5d, r5d |
| cmp eobd, 13 |
| setge r5b |
| %else |
| mov r5d, 1 |
| cmp eobd, 13 |
| sbb r5d, 0 |
| %endif |
| shl r5d, 4 |
| .loop_pass1: |
| mova m3, [o(pd_2896)] |
| pmulld m0, m3, [cq+32*0+r5] |
| pmulld m1, m3, [cq+32*1+r5] |
| pmulld m2, m3, [cq+32*2+r5] |
| pmulld m3, [cq+32*3+r5] |
| REPX {paddd x, m5}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| call m(idct_4x4_internal_16bpc).pass1_main |
| packssdw m0, m1 ; out0 out1 |
| packssdw m4, m2 ; out2 out3 |
| test r5d, r5d |
| jz .end_pass1 |
| mova [cq+32*0+16], m0 |
| mova [cq+32*1+16], m4 |
| xor r5d, r5d |
| jmp .loop_pass1 |
| .end_pass1: |
| punpckhwd m2, m0, m4 |
| punpcklwd m0, m4 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| mova m2, [cq+32*0+16] |
| mova m6, [cq+32*1+16] |
| punpckhwd m4, m2, m6 |
| punpcklwd m2, m6 |
| punpckhwd m3, m2, m4 |
| punpcklwd m2, m4 |
| ; m0-3 = packed & transposed output |
| jmp tx2q |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(idct_4x8_internal_8bpc, _ssse3).main |
| ; m0-3 now hold out0/1,3/2,4/5,7/6 |
| mova m4, [o(pw_2048)] |
| shufps m1, m1, q1032 |
| shufps m3, m3, q1032 |
| .end: |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| pxor m4, m4 |
| REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 |
| mova m7, [o(pixel_10bpc_max)] |
| lea r2, [strideq*3] |
| movq m5, [dstq+strideq*0] |
| movq m6, [dstq+strideq*2] |
| movhps m5, [dstq+strideq*1] |
| movhps m6, [dstq+r2] |
| lea r4, [dstq+strideq*4] |
| paddw m0, m5 |
| paddw m1, m6 |
| movq m5, [r4+strideq*0] |
| movq m6, [r4+strideq*2] |
| movhps m5, [r4+strideq*1] |
| movhps m6, [r4+r2] |
| paddw m2, m5 |
| paddw m3, m6 |
| REPX {pminsw x, m7}, m0, m1, m2, m3 |
| REPX {pmaxsw x, m4}, m0, m1, m2, m3 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| movq [dstq+strideq*2], m1 |
| movhps [dstq+r2 ], m1 |
| movq [r4 +strideq*0], m2 |
| movhps [r4 +strideq*1], m2 |
| movq [r4 +strideq*2], m3 |
| movhps [r4 +r2 ], m3 |
| RET |
| |
| INV_TXFM_4X8_FN adst, dct |
| INV_TXFM_4X8_FN adst, adst |
| INV_TXFM_4X8_FN adst, flipadst |
| INV_TXFM_4X8_FN adst, identity, 9 |
| |
| cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| call .pass1_main |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| mova m2, [cq+32*2+16] |
| mova m6, [cq+32*3+16] |
| punpckhwd m4, m2, m6 |
| punpcklwd m2, m6 |
| punpckhwd m3, m2, m4 |
| punpcklwd m2, m4 |
| ; m0-3 = packed & transposed output |
| jmp tx2q |
| .pass1_main: |
| %undef cmp |
| %if ARCH_X86_64 |
| xor r5d, r5d |
| cmp eobd, 13 |
| setge r5b |
| %else |
| mov r5d, 1 |
| cmp eobd, 13 |
| sbb r5d, 0 |
| %endif |
| shl r5d, 4 |
| lea r3, [cq+32*1+16] |
| .loop_pass1: |
| mova m0, [o(pd_2048)] |
| mova m3, [o(pd_2896)] |
| pmulld m5, m3, [cq+32*0+r5] |
| pmulld m2, m3, [cq+32*1+r5] |
| pmulld m1, m3, [cq+32*2+r5] |
| pmulld m3, [cq+32*3+r5] |
| REPX {paddd x, m0}, m5, m2, m1, m3 |
| REPX {psrad x, 12}, m5, m2, m1, m3 |
| mova [r3], m2 |
| call m(iadst_4x4_internal_16bpc).main2 |
| packssdw m0, m2 ; out0 out1 |
| packssdw m1, m4 ; out2 out3 |
| test r5d, r5d |
| jz .end_pass1 |
| mova [cq+32*2+16], m0 |
| mova [cq+32*3+16], m1 |
| xor r5d, r5d |
| jmp .loop_pass1 |
| .end_pass1: |
| ret |
| .pass2: |
| shufps m0, m0, q1032 |
| shufps m1, m1, q1032 |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main |
| mova m4, [o(pw_4x2048_4xm2048)] |
| jmp m(idct_4x8_internal_16bpc).end |
| |
| INV_TXFM_4X8_FN flipadst, dct |
| INV_TXFM_4X8_FN flipadst, adst |
| INV_TXFM_4X8_FN flipadst, flipadst |
| INV_TXFM_4X8_FN flipadst, identity, 9 |
| |
| cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| call m(iadst_4x8_internal_16bpc).pass1_main |
| punpcklwd m2, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| mova m6, [cq+32*2+16] |
| mova m2, [cq+32*3+16] |
| punpcklwd m4, m2, m6 |
| punpckhwd m2, m6 |
| punpckhwd m3, m2, m4 |
| punpcklwd m2, m4 |
| ; m0-3 = packed & transposed output |
| jmp tx2q |
| .pass2: |
| shufps m0, m0, q1032 |
| shufps m1, m1, q1032 |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main |
| mova m4, m0 |
| mova m5, m1 |
| pshufd m0, m3, q1032 |
| pshufd m1, m2, q1032 |
| pshufd m2, m5, q1032 |
| pshufd m3, m4, q1032 |
| mova m4, [o(pw_4xm2048_4x2048)] |
| jmp m(idct_4x8_internal_16bpc).end |
| |
| INV_TXFM_4X8_FN identity, dct |
| INV_TXFM_4X8_FN identity, adst |
| INV_TXFM_4X8_FN identity, flipadst |
| INV_TXFM_4X8_FN identity, identity, 3 |
| |
| cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %undef cmp |
| mova m5, [o(pd_2048)] |
| mova m4, [o(pd_2896)] |
| mova m6, [o(pd_5793)] |
| ; clear m7 in case we skip the bottom square |
| pxor m7, m7 |
| %if ARCH_X86_64 |
| xor r5d, r5d |
| cmp eobd, 16 |
| setge r5b |
| %else |
| mov r5d, 1 |
| cmp eobd, 16 |
| sbb r5d, 0 |
| %endif |
| shl r5d, 4 |
| .loop_pass1: |
| pmulld m0, m4, [cq+32*0+r5] |
| pmulld m1, m4, [cq+32*1+r5] |
| pmulld m2, m4, [cq+32*2+r5] |
| pmulld m3, m4, [cq+32*3+r5] |
| REPX {paddd x, m5}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| REPX {pmulld x, m6}, m0, m1, m2, m3 |
| REPX {paddd x, m5}, m0, m1, m2, m3 |
| REPX {psrad x, 12}, m0, m1, m2, m3 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| test r5d, r5d |
| jz .end_pass1 |
| mova [cq+32*0+16], m0 |
| mova m7, m2 |
| xor r5d, r5d |
| jmp .loop_pass1 |
| .end_pass1: |
| punpckhwd m4, m0, m2 |
| punpcklwd m0, m2 |
| punpckhwd m1, m0, m4 |
| punpcklwd m0, m4 |
| mova m2, [cq+32*0+16] |
| punpckhwd m4, m2, m7 |
| punpcklwd m2, m7 |
| punpckhwd m3, m2, m4 |
| punpcklwd m2, m4 |
| ; m0-3 = packed & transposed output |
| jmp tx2q |
| .pass2: |
| mova m4, [o(pw_4096)] |
| jmp m(idct_4x8_internal_16bpc).end |
| |
| %macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix |
| INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 |
| %ifidn %1_%2, dct_dct |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 16 |
| add r5d, 384 |
| sar r5d, 9 |
| jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2 |
| %endif |
| %endmacro |
| |
| INV_TXFM_4X16_FN dct, dct |
| INV_TXFM_4X16_FN dct, identity, v |
| INV_TXFM_4X16_FN dct, adst |
| INV_TXFM_4X16_FN dct, flipadst |
| |
| cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %undef cmp |
| %if ARCH_X86_32 |
| mov r5m, r6d |
| %endif |
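| ; r5 holds the eob threshold table (loaded by INV_TXFM_FN); walk it |
| ; backwards to find the last coefficient group that needs processing, |
| ; leaving its 16-byte offset in r5d |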
| mov r6d, 4 |
| .zero_loop: |
| dec r6d |
| cmp eobb, byte [r5+r6] |
| jl .zero_loop |
| mov r5d, r6d |
| shl r5d, 4 |
| %if ARCH_X86_32 |
| ; restore pic-ptr |
| mov r6, r5m |
| %endif |
| mova m5, [o(pd_2048)] |
| .loop_pass1: |
| mova m0, [cq+64*0+r5] |
| mova m1, [cq+64*1+r5] |
| mova m2, [cq+64*2+r5] |
| mova m3, [cq+64*3+r5] |
| call m(idct_4x4_internal_16bpc).pass1_main |
| pcmpeqd m3, m3 |
| REPX {psubd x, m3}, m0, m1, m4, m2 |
| REPX {psrad x, 1}, m0, m1, m4, m2 |
| packssdw m0, m1 ; out0 out1 |
| packssdw m4, m2 ; out2 out3 |
| punpckhwd m2, m0, m4 |
| punpcklwd m0, m4 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| test r5d, r5d |
| jz .end_pass1 |
| mova [cq+64*0+r5], m0 |
| mova [cq+64*1+r5], m1 |
| sub r5d, 16 |
| jmp .loop_pass1 |
| .end_pass1: |
| mova m2, [cq+64*0+16] |
| mova m3, [cq+64*1+16] |
| mova m4, [cq+64*0+32] |
| mova m5, [cq+64*1+32] |
| mova m6, [cq+64*0+48] |
| mova m7, [cq+64*1+48] |
| ; m0-7 = packed & transposed output |
| jmp tx2q |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(idct_16x4_internal_8bpc, _ssse3).main |
| ; m0-6 now hold out0-13 [with odd registers having their halves swapped] |
| ; [cq+16*7] has out15/14 |
| mova m7, [o(pw_2048)] |
| REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pmulhrsw m7, [cq+16*7] |
| REPX {shufps x, x, q1032}, m1, m3, m5, m7 |
| mova [cq+16*0], m4 |
| mova [cq+16*1], m5 |
| mova [cq+16*2], m6 |
| mova [cq+16*3], m7 |
| .end: |
| pxor m4, m4 |
| REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| mova m7, [o(pixel_10bpc_max)] |
| mov r5d, 2 |
| lea r3, [strideq*3] |
| .loop: |
| movq m5, [dstq+strideq*0] |
| movq m6, [dstq+strideq*2] |
| movhps m5, [dstq+strideq*1] |
| movhps m6, [dstq+r3] |
| lea r4, [dstq+strideq*4] |
| paddw m0, m5 |
| paddw m1, m6 |
| movq m5, [r4+strideq*0] |
| movq m6, [r4+strideq*2] |
| movhps m5, [r4+strideq*1] |
| movhps m6, [r4+r3] |
| paddw m2, m5 |
| paddw m3, m6 |
| REPX {pminsw x, m7}, m0, m1, m2, m3 |
| REPX {pmaxsw x, m4}, m0, m1, m2, m3 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| movq [dstq+strideq*2], m1 |
| movhps [dstq+r3 ], m1 |
| movq [r4 +strideq*0], m2 |
| movhps [r4 +strideq*1], m2 |
| movq [r4 +strideq*2], m3 |
| movhps [r4 +r3 ], m3 |
| dec r5d |
| jz .end2 |
| lea dstq, [dstq+strideq*8] |
| mova m0, [cq+0*16] |
| mova m1, [cq+1*16] |
| mova m2, [cq+2*16] |
| mova m3, [cq+3*16] |
| REPX {mova [cq+x*16], m4}, 0, 1, 2, 3 |
| jmp .loop |
| .end2: |
| RET |
| |
| INV_TXFM_4X16_FN adst, dct |
| INV_TXFM_4X16_FN adst, adst |
| INV_TXFM_4X16_FN adst, flipadst |
| INV_TXFM_4X16_FN adst, identity, v |
| |
| cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %undef cmp |
| %if ARCH_X86_32 |
| mov r5m, r6d |
| %endif |
| mov r6d, 4 |
| .zero_loop: |
| dec r6d |
| cmp eobb, byte [r5+r6] |
| jl .zero_loop |
| mov r5d, r6d |
| shl r5d, 4 |
| %if ARCH_X86_32 |
| ; restore pic-ptr |
| mov r6, r5m |
| %endif |
| .loop_pass1: |
| mova m5, [cq+64*0+r5] |
| lea r3, [cq+64*1+r5] |
| mova m1, [cq+64*2+r5] |
| mova m3, [cq+64*3+r5] |
| call m(iadst_4x4_internal_16bpc).main2 |
| pcmpeqd m3, m3 |
| REPX {psubd x, m3}, m0, m2, m1, m4 |
| REPX {psrad x, 1}, m0, m2, m1, m4 |
| packssdw m0, m2 ; out0 out1 |
| packssdw m1, m4 ; out2 out3 |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| test r5d, r5d |
| jz m(idct_4x16_internal_16bpc).end_pass1 |
| mova [cq+64*0+r5], m0 |
| mova [cq+64*1+r5], m1 |
| sub r5d, 16 |
| jmp .loop_pass1 |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main |
| call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end |
| ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8 |
| ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13 |
| mova m1, [o(pw_4x2048_4xm2048)] |
| REPX {pmulhrsw x, m1}, m7, m2, m0 |
| pshufd m6, m1, q1032 ; 4x-2048,4x2048 |
| pmulhrsw m1, [cq+16*7] |
| REPX {pmulhrsw x, m6}, m5, m4, m3 |
| pmulhrsw m6, [cq+16*6] |
| ; m7/5/2/4 = out4/11,5/10,6/9,7/8 |
| ; m0/3/6/1 = out0/15,3/12,1/14,2/13 |
| ; output should end up in m0-3 for out0-7, and in [cq+16*0..3] for out8-15 |
| movhps [cq+0*8], m4 |
| movhps [cq+1*8], m2 |
| movhps [cq+2*8], m5 |
| movhps [cq+3*8], m7 |
| movhps [cq+4*8], m3 |
| movhps [cq+5*8], m1 |
| movhps [cq+6*8], m6 |
| movhps [cq+7*8], m0 |
| punpcklqdq m0, m6 |
| punpcklqdq m1, m3 |
| punpcklqdq m3, m2, m4 |
| punpcklqdq m2, m7, m5 |
| jmp m(idct_4x16_internal_16bpc).end |
| |
| INV_TXFM_4X16_FN flipadst, dct |
| INV_TXFM_4X16_FN flipadst, adst |
| INV_TXFM_4X16_FN flipadst, flipadst |
| INV_TXFM_4X16_FN flipadst, identity, v |
| |
| cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %undef cmp |
| %if ARCH_X86_32 |
| mov r5m, r6d |
| %endif |
| mov r6d, 4 |
| .zero_loop: |
| dec r6d |
| cmp eobb, byte [r5+r6] |
| jl .zero_loop |
| mov r5d, r6d |
| shl r5d, 4 |
| %if ARCH_X86_32 |
| ; restore pic-ptr |
| mov r6, r5m |
| %endif |
| .loop_pass1: |
| mova m5, [cq+64*0+r5] |
| lea r3, [cq+64*1+r5] |
| mova m1, [cq+64*2+r5] |
| mova m3, [cq+64*3+r5] |
| call m(iadst_4x4_internal_16bpc).main2 |
| pcmpeqd m3, m3 |
| REPX {psubd x, m3}, m0, m2, m1, m4 |
| REPX {psrad x, 1}, m0, m2, m1, m4 |
| packssdw m0, m2 ; out3 out2 |
| packssdw m1, m4 ; out1 out0 |
| punpcklwd m2, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| test r5d, r5d |
| jz m(idct_4x16_internal_16bpc).end_pass1 |
| mova [cq+64*0+r5], m0 |
| mova [cq+64*1+r5], m1 |
| sub r5d, 16 |
| jmp .loop_pass1 |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main |
| call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end |
| ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7 |
| ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2 |
| mova m1, [o(pw_4x2048_4xm2048)] |
| REPX {pmulhrsw x, m1}, m7, m2, m0 |
| pshufd m6, m1, q1032 ; 4x-2048,4x2048 |
| pmulhrsw m1, [cq+16*7] |
| REPX {pmulhrsw x, m6}, m5, m4, m3 |
| pmulhrsw m6, [cq+16*6] |
| ; m7/5/2/4 = out11/4,10/5,9/6,8/7 |
| ; m0/3/6/1 = out15/0,12/3,14/1,13/2 |
| ; output should end up in m0-3 for out0-7, and in [cq+16*0..3] for out8-15 |
| movq [cq+0*8], m4 |
| movq [cq+1*8], m2 |
| movq [cq+2*8], m5 |
| movq [cq+3*8], m7 |
| movq [cq+4*8], m3 |
| movq [cq+5*8], m1 |
| movq [cq+6*8], m6 |
| movq [cq+7*8], m0 |
| punpckhqdq m0, m6 |
| punpckhqdq m1, m3 |
| punpckhqdq m3, m2, m4 |
| punpckhqdq m2, m7, m5 |
| jmp m(idct_4x16_internal_16bpc).end |
| |
| INV_TXFM_4X16_FN identity, dct, h |
| INV_TXFM_4X16_FN identity, adst, h |
| INV_TXFM_4X16_FN identity, flipadst, h |
| INV_TXFM_4X16_FN identity, identity |
| |
| cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %undef cmp |
| %if ARCH_X86_32 |
| mov r5m, r6d |
| %endif |
| mov r6d, 4 |
| .zero_loop: |
| dec r6d |
| cmp eobb, byte [r5+r6] |
| jl .zero_loop |
| mov r5d, r6d |
| shl r5d, 4 |
| %if ARCH_X86_32 |
| ; restore pic-ptr |
| mov r6, r5m |
| %endif |
| mova m5, [o(pd_6144)] |
| mova m4, [o(pd_5793)] |
| .loop_pass1: |
| pmulld m0, m4, [cq+64*0+r5] |
| pmulld m1, m4, [cq+64*1+r5] |
| pmulld m2, m4, [cq+64*2+r5] |
| pmulld m3, m4, [cq+64*3+r5] |
| REPX {paddd x, m5}, m0, m1, m2, m3 |
| REPX {psrad x, 13}, m0, m1, m2, m3 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| punpckhwd m3, m0, m2 |
| punpcklwd m0, m2 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| test r5d, r5d |
| jz m(idct_4x16_internal_16bpc).end_pass1 |
| mova [cq+64*0+r5], m0 |
| mova [cq+64*1+r5], m1 |
| sub r5d, 16 |
| jmp .loop_pass1 |
| .pass2: |
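| ; identity16 scaling: x*2 + (x*1697*16 + 0x4000 >> 15) == x*2.8284, |
| ; i.e. ~2*sqrt(2), split below into paddsw + pmulhrsw with pw_1697x16 |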
| mova [cq+16*4], m0 |
| mova [cq+16*5], m1 |
| mova [cq+16*6], m2 |
| mova [cq+16*7], m7 |
| mova m0, [o(pw_1697x16)] |
| mova m7, [o(pw_2048)] |
| pmulhrsw m1, m0, m4 |
| pmulhrsw m2, m0, m5 |
| REPX {paddsw x, x}, m4, m5 |
| paddsw m4, m1 |
| paddsw m5, m2 |
| REPX {pmulhrsw x, m7}, m4, m5 |
| mova [cq+16*0], m4 |
| mova [cq+16*1], m5 |
| mova m4, [cq+16*7] |
| pmulhrsw m1, m0, m6 |
| pmulhrsw m2, m0, m4 |
| REPX {paddsw x, x}, m6, m4 |
| paddsw m6, m1 |
| paddsw m4, m2 |
| REPX {pmulhrsw x, m7}, m6, m4 |
| mova [cq+16*2], m6 |
| mova [cq+16*3], m4 |
| mova m4, [cq+16*4] |
| mova m1, [cq+16*5] |
| mova m2, [cq+16*6] |
| pmulhrsw m5, m0, m2 |
| pmulhrsw m6, m0, m3 |
| REPX {paddsw x, x}, m2, m3 |
| paddsw m2, m5 |
| paddsw m3, m6 |
| pmulhrsw m6, m0, m1 |
| pmulhrsw m0, m4 |
| REPX {paddsw x, x}, m1, m4 |
| paddsw m1, m6 |
| paddsw m0, m4 |
| REPX {pmulhrsw x, m7}, m2, m3, m1, m0 |
| jmp m(idct_4x16_internal_16bpc).end |
| |
| %macro INV_TXFM_8X4_FN 2 ; type1, type2 |
| %if ARCH_X86_64 |
| INV_TXFM_FN %1, %2, 0, 8x4, 15 |
| %else |
| INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 |
| %endif |
| %ifidn %1_%2, dct_dct |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 181 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 2896 |
| add r5d, 34816 |
| movd m0, r5d |
| pshuflw m0, m0, q1111 |
| punpcklqdq m0, m0 |
| mova m6, [o(pixel_10bpc_max)] |
| pxor m5, m5 |
| lea r2, [strideq*3] |
| mova m1, [dstq+strideq*0] |
| mova m2, [dstq+strideq*1] |
| mova m3, [dstq+strideq*2] |
| mova m4, [dstq+r2] |
| REPX {paddw x, m0}, m1, m2, m3, m4 |
| REPX {pmaxsw x, m5}, m1, m2, m3, m4 |
| REPX {pminsw x, m6}, m1, m2, m3, m4 |
| mova [dstq+strideq*0], m1 |
| mova [dstq+strideq*1], m2 |
| mova [dstq+strideq*2], m3 |
| mova [dstq+r2 ], m4 |
| RET |
| %endif |
| %endmacro |
| |
| INV_TXFM_8X4_FN dct, dct |
| INV_TXFM_8X4_FN dct, identity |
| INV_TXFM_8X4_FN dct, adst |
| INV_TXFM_8X4_FN dct, flipadst |
| |
| cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| lea r5, [o(.main)] |
| .pass1_entry: |
| %if ARCH_X86_32 |
| lea r3, [rsp+gprsize] |
| %else |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+0*16] |
| mova m1, [cq+1*16] |
| mova m2, [cq+2*16] |
| mova m3, [cq+3*16] |
| mova m4, [cq+4*16] |
| mova m5, [cq+5*16] |
| mova m6, [cq+6*16] |
| mova m7, [cq+7*16] |
| call .rect2_mul |
| call r5 |
| call .transpose4x8packed |
| ; m0-3 = packed & transposed output |
| jmp tx2q |
| .transpose4x8packed: |
| ; transpose |
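| ; in: m0 = out0|out1, m2 = out2|out3, m4 = out4|out5, m6 = out6|out7 |
| ; (two packed 4-word half-rows per register) |
| ; out: m0-3 = rows 0-3 of the transpose, one 8-word row each |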
| punpcklwd m1, m2, m6 |
| punpckhwd m2, m6 |
| punpckhwd m6, m0, m4 |
| punpcklwd m0, m4 |
| |
| punpckhwd m3, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m4, m6, m2 |
| punpcklwd m6, m2 |
| |
| punpcklwd m2, m3, m4 |
| punpckhwd m3, m4 |
| punpckhwd m1, m0, m6 |
| punpcklwd m0, m6 |
| ret |
| .main: |
| call .main_pass1 |
| call .round |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| ret |
| .rect2_mul: |
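| ; (x*2896 + 2048) >> 12, i.e. scale by 1/sqrt(2) with rounding, used to |
| ; pre-scale the coefficients of rectangular (2:1 aspect ratio) transforms |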
| %if ARCH_X86_64 |
| REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| %else |
| mova [r3], m7 |
| mova m7, [o(pd_2896)] |
| REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pmulld m7, [r3] |
| mova [r3], m7 |
| mova m7, [o(pd_2048)] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| paddd m7, [r3] |
| %endif |
| REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| ret |
| %if ARCH_X86_64 |
| .main_pass1_fast: |
| pmulld m5, m3, [o(pd_m2276)] |
| pmulld m3, [o(pd_3406)] |
| pmulld m7, m1, [o(pd_4017)] |
| pmulld m1, [o(pd_799)] |
| pmulld m6, m2, [o(pd_3784)] |
| pmulld m2, [o(pd_1567)] |
| pmulld m0, m14 |
| pxor m4, m4 |
| jmp .main_pass1_fast2 |
| .main_pass1: |
| ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a |
| ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a |
| ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3 |
| REPX {pmulld x, m14}, m0, m4 |
| .main_pass1_fast2: |
| REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7 |
| REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7 |
| paddd m8, m1, m5 ; t4 |
| psubd m1, m5 ; t5a |
| paddd m9, m7, m3 ; t7 |
| psubd m7, m3 ; t6a |
| REPX {pmaxsd x, m12}, m1, m8, m7, m9 |
| REPX {pminsd x, m13}, m1, m8, m7, m9 |
| REPX {pmulld x, m14}, m7, m1 |
| paddd m0, m11 |
| paddd m7, m11 |
| psubd m5, m0, m4 |
| paddd m0, m4 |
| psubd m4, m7, m1 |
| paddd m7, m1 |
| REPX {psrad x, 12 }, m5, m0, m4, m7 |
| psubd m3, m0, m6 ; dct4 out3 |
| paddd m0, m6 ; dct4 out0 |
| paddd m6, m5, m2 ; dct4 out1 |
| psubd m5, m2 ; dct4 out2 |
| REPX {pmaxsd x, m12}, m0, m6, m5, m3 |
| REPX {pminsd x, m13}, m0, m6, m5, m3 |
| ret |
| .round: |
| paddd m1, m6, m7 ; out1 |
| psubd m6, m7 ; out6 |
| psubd m7, m0, m9 ; out7 |
| paddd m0, m9 ; out0 |
| paddd m2, m5, m4 ; out2 |
| psubd m5, m4 ; out5 |
| psubd m4, m3, m8 ; out4 |
| paddd m3, m8 ; out3 |
| %else |
| .main_pass1_fast: |
| pmulld m5, m3, [o(pd_m2276)] |
| pmulld m3, [o(pd_3406)] |
| pmulld m7, m1, [o(pd_4017)] |
| pmulld m1, [o(pd_799)] |
| pmulld m6, m2, [o(pd_3784)] |
| pmulld m2, [o(pd_1567)] |
| mova m4, [o(pd_2048)] |
| mova [r3+0*16], m2 |
| REPX {paddd x, m4}, m5, m3, m7, m1 |
| REPX {psrad x, 12}, m5, m3, m7, m1 |
| paddd m2, m1, m5 ; t4 |
| psubd m1, m5 ; t5a |
| pmulld m5, m0, [o(pd_2896)] |
| mova m0, m4 |
| paddd m4, m7, m3 ; t7 |
| psubd m7, m3 ; t6a |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3 }, m1, m2, m7, m4 |
| mova m3, [o(clip_18b_max)] |
| REPX {pminsd x, m3 }, m1, m2, m7, m4 |
| mova [r3+3*16], m2 |
| mova [r3+1*16], m4 |
| pxor m4, m4 |
| mova m2, [r3+0*16] |
| mova m3, [o(pd_2896)] |
| jmp .main_pass1_fast2 |
| .main_pass1: |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m2 |
| mova [r3+2*16], m4 |
| mova [r3+3*16], m6 |
| mova m0, [o(pd_2048)] |
| ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a |
| ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a |
| paddd m2, m1, m5 ; t4 |
| psubd m1, m5 ; t5a |
| paddd m4, m7, m3 ; t7 |
| psubd m7, m3 ; t6a |
| mova m6, [o(clip_18b_min)] |
| REPX {pmaxsd x, m6 }, m1, m2, m7, m4 |
| mova m6, [o(clip_18b_max)] |
| REPX {pminsd x, m6 }, m1, m2, m7, m4 |
| mova m6, [r3+3*16] |
| mova [r3+3*16], m2 |
| mova m2, [r3+1*16] |
| mova [r3+1*16], m4 |
| |
| ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3 |
| mova m3, [o(pd_2896)] |
| mova m5, [r3+0*16] |
| mova m4, [r3+2*16] |
| REPX {pmulld x, m3 }, m5, m4 |
| .main_pass1_fast2: |
| REPX {paddd x, m0 }, m2, m6 |
| REPX {psrad x, 12 }, m2, m6 |
| REPX {pmulld x, m3 }, m7, m1 |
| paddd m7, m0 |
| paddd m0, m5 |
| |
| psubd m5, m0, m4 |
| paddd m0, m4 |
| psubd m4, m7, m1 |
| paddd m7, m1 |
| REPX {psrad x, 12 }, m5, m0, m4, m7 |
| psubd m3, m0, m6 ; dct4 out3 |
| paddd m0, m6 ; dct4 out0 |
| paddd m6, m5, m2 ; dct4 out1 |
| psubd m5, m2 ; dct4 out2 |
| |
| mova m1, [o(clip_18b_min)] |
| REPX {pmaxsd x, m1 }, m0, m6, m5, m3 |
| mova m1, [o(clip_18b_max)] |
| REPX {pminsd x, m1 }, m0, m6, m5, m3 |
| ret |
| .round: |
| paddd m1, m6, m7 ; out1 |
| psubd m6, m7 ; out6 |
| mova [r3+0*16], m6 |
| mova m6, [r3+1*16] |
| psubd m7, m0, m6 ; out7 |
| paddd m0, m6 ; out0 |
| paddd m2, m5, m4 ; out2 |
| psubd m5, m4 ; out5 |
| mova m6, [r3+3*16] |
| psubd m4, m3, m6 ; out4 |
| paddd m3, m6 ; out3 |
| mova m6, [r3+0*16] |
| %endif |
| ret |
| |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(idct_8x4_internal_8bpc, _ssse3).main |
| .end: |
| lea r3, [strideq*3] |
| call .round2_and_write_8x4 |
| REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 |
| RET |
| .round2_and_write_8x4: |
| pxor m6, m6 |
| mova m5, [o(pixel_10bpc_max)] |
| mova m4, [o(pw_2048)] |
| .round1_and_write_8x4: |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| .write_8x4: |
| paddw m0, [dstq+strideq*0] |
| paddw m1, [dstq+strideq*1] |
| paddw m2, [dstq+strideq*2] |
| paddw m3, [dstq+r3] |
| REPX {pminsw x, m5}, m0, m1, m2, m3 |
| REPX {pmaxsw x, m6}, m0, m1, m2, m3 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+r3 ], m3 |
| ret |
| |
| INV_TXFM_8X4_FN adst, dct |
| INV_TXFM_8X4_FN adst, adst |
| INV_TXFM_8X4_FN adst, flipadst |
| INV_TXFM_8X4_FN adst, identity |
| |
| cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| lea r5, [o(.main)] |
| jmp m(idct_8x4_internal_16bpc).pass1_entry |
| .main: |
| call .main_pass1 |
| call .round |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| ret |
| .main_pass1: |
| %if ARCH_X86_64 |
| ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a |
| ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a |
| ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a |
| ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a |
| psubd m8, m2, m6 ; t6 |
| paddd m2, m6 ; t2 |
| psubd m6, m0, m4 ; t4 |
| paddd m0, m4 ; t0 |
| psubd m4, m5, m1 ; t7 |
| paddd m5, m1 ; t3 |
| psubd m1, m7, m3 ; t5 |
| paddd m7, m3 ; t1 |
| REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7 |
| REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7 |
| ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a |
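| ; m10 still holds pd_1567 from the call above, so it can be passed by |
| ; register number (10) as coef2 below |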
| ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a |
| psubd m9, m6, m8 ; t7 |
| paddd m6, m8 ; out6 |
| mova m8, [o(pd_2896)] |
| psubd m3, m7, m5 ; t3 |
| paddd m7, m5 ; -out7 |
| psubd m5, m0, m2 ; t2 |
| paddd m0, m2 ; out0 |
| psubd m2, m1, m4 ; t6 |
| paddd m1, m4 ; -out1 |
| REPX {pmaxsd x, m12}, m5, m3, m2, m9 |
| REPX {pminsd x, m13}, m5, m3, m2, m9 |
| REPX {pmulld x, m14}, m5, m3, m2, m9 |
| psubd m4, m5, m3 ; (t2 - t3) * 2896 |
| paddd m3, m5 ; (t2 + t3) * 2896 |
| psubd m5, m2, m9 ; (t6 - t7) * 2896 |
| paddd m2, m9 ; (t6 + t7) * 2896 |
| ret |
| .round: |
| ; m0 = out0, m1 = -out1, m6 = out6, m7 = -out7 |
| pcmpeqd m8, m8 |
| REPX {pxor x, m8 }, m1, m7, m3, m5 |
| REPX {psubd x, m8 }, m1, m7 |
| REPX {paddd x, m11}, m2, m3, m4, m5 |
| REPX {psrad x, 12 }, m2, m3, m4, m5 |
| %else |
| mova [r3+0*16], m2 |
| mova [r3+1*16], m3 |
| mova [r3+2*16], m4 |
| mova [r3+3*16], m5 |
| mova m5, [o(pd_2048)] |
| |
| ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a |
| ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a |
| mova m2, [r3+0*16] |
| mova m3, [r3+1*16] |
| mova m4, [r3+2*16] |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m1 |
| mova [r3+2*16], m6 |
| mova m1, [r3+3*16] |
| mova [r3+3*16], m7 |
| ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a |
| ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a |
| mova m0, [r3+0*16] |
| mova m6, [r3+2*16] |
| psubd m7, m2, m6 ; t6 |
| paddd m2, m6 ; t2 |
| psubd m6, m0, m4 ; t4 |
| paddd m0, m4 ; t0 |
| mova [r3+0*16], m7 |
| mova m5, [r3+1*16] |
| mova m7, [r3+3*16] |
| psubd m4, m1, m5 ; t7 |
| paddd m5, m1 ; t3 |
| psubd m1, m7, m3 ; t5 |
| paddd m7, m3 ; t1 |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 |
| mova [r3+1*16], m7 |
| mova m7, [o(clip_18b_max)] |
| pmaxsd m3, [r3+0*16] |
| REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5 |
| pminsd m7, [r3+1*16] |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m2 |
| mova [r3+2*16], m5 |
| mova [r3+3*16], m7 |
| mova m0, [o(pd_2048)] |
| ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a |
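| ; as on x86-64, m7 still holds pd_1567 and is reused by register number |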
| ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a |
| mova m5, [r3+2*16] |
| mova m7, [r3+3*16] |
| psubd m2, m6, m3 ; t7 |
| paddd m6, m3 ; out6 |
| mova [r3+3*16], m6 |
| mova m0, [r3+0*16] |
| mova m6, [r3+1*16] |
| psubd m3, m7, m5 ; t3 |
| paddd m7, m5 ; -out7 |
| psubd m5, m0, m6 ; t2 |
| paddd m0, m6 ; out0 |
| psubd m6, m1, m4 ; t6 |
| paddd m1, m4 ; -out1 |
| mova m4, [o(clip_18b_min)] |
| REPX {pmaxsd x, m4 }, m5, m3, m6, m2 |
| mova m4, [o(clip_18b_max)] |
| REPX {pminsd x, m4 }, m5, m3, m6, m2 |
| mova m4, [o(pd_2896)] |
| REPX {pmulld x, m4 }, m5, m3, m6, m2 |
| psubd m4, m5, m3 ; (t2 - t3) * 2896 |
| paddd m3, m5 ; (t2 + t3) * 2896 |
| psubd m5, m6, m2 ; (t6 - t7) * 2896 |
| paddd m2, m6 ; (t6 + t7) * 2896 |
| ret |
| .round: |
| mova [r3+2*16], m0 |
| |
| pcmpeqd m0, m0 |
| mova m6, [o(pd_2048)] |
| REPX {pxor x, m0 }, m1, m7, m3, m5 |
| REPX {psubd x, m0 }, m1, m7 |
| REPX {paddd x, m6 }, m2, m3, m4, m5 |
| REPX {psrad x, 12 }, m2, m3, m4, m5 |
| |
| mova m6, [r3+3*16] |
| mova m0, [r3+2*16] |
| %endif |
| ret |
| |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main |
| jmp m(idct_8x4_internal_16bpc).end |
| |
| INV_TXFM_8X4_FN flipadst, dct |
| INV_TXFM_8X4_FN flipadst, adst |
| INV_TXFM_8X4_FN flipadst, flipadst |
| INV_TXFM_8X4_FN flipadst, identity |
| |
| cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| lea r5, [o(.main)] |
| jmp m(idct_8x4_internal_16bpc).pass1_entry |
| .main: |
| call m(iadst_8x4_internal_16bpc).main_pass1 |
| call m(iadst_8x4_internal_16bpc).round |
| packssdw m7, m6 |
| packssdw m5, m4 |
| packssdw m3, m2 |
| packssdw m1, m0 |
| mova m0, m7 |
| mova m2, m5 |
| mova m4, m3 |
| mova m6, m1 |
| ret |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main |
| lea r3, [strideq*3] |
| add dstq, r3 |
| neg strideq |
| jmp m(idct_8x4_internal_16bpc).end |
| |
| INV_TXFM_8X4_FN identity, dct |
| INV_TXFM_8X4_FN identity, adst |
| INV_TXFM_8X4_FN identity, flipadst |
| INV_TXFM_8X4_FN identity, identity |
| |
| cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| lea r5, [o(.main)] |
| jmp m(idct_8x4_internal_16bpc).pass1_entry |
| .main: |
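| ; identity8 scales by exactly 2 |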
| REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| ret |
| .pass2: |
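| ; identity4 scaling: x + (x*1697*8 + 0x4000 >> 15) ~= x*sqrt(2) |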
| mova m7, [o(pw_1697x8)] |
| pmulhrsw m4, m7, m0 |
| pmulhrsw m5, m7, m1 |
| pmulhrsw m6, m7, m2 |
| pmulhrsw m7, m3 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| paddsw m2, m6 |
| paddsw m3, m7 |
| jmp m(idct_8x4_internal_16bpc).end |
| |
| %macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset |
| %if ARCH_X86_64 |
| INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16 |
| %else |
| INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 |
| %endif |
| %ifidn %1_%2, dct_dct |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 2 |
| .end: |
| add r5d, 384 |
| sar r5d, 9 |
| .end2: |
| imul r5d, 2896 |
| add r5d, 34816 |
| movd m0, r5d |
| pshuflw m0, m0, q1111 |
| punpcklqdq m0, m0 |
| mova m6, [o(pixel_10bpc_max)] |
| pxor m5, m5 |
| lea r2, [strideq*3] |
| .loop: |
| mova m1, [dstq+strideq*0] |
| mova m2, [dstq+strideq*1] |
| mova m3, [dstq+strideq*2] |
| mova m4, [dstq+r2] |
| REPX {paddw x, m0}, m1, m2, m3, m4 |
| REPX {pmaxsw x, m5}, m1, m2, m3, m4 |
| REPX {pminsw x, m6}, m1, m2, m3, m4 |
| mova [dstq+strideq*0], m1 |
| mova [dstq+strideq*1], m2 |
| mova [dstq+strideq*2], m3 |
| mova [dstq+r2 ], m4 |
| lea dstq, [dstq+strideq*4] |
| dec r3d |
| jg .loop |
| RET |
| %endif |
| %endmacro |
| |
| INV_TXFM_8X8_FN dct, dct |
| INV_TXFM_8X8_FN dct, identity, 6 |
| INV_TXFM_8X8_FN dct, adst |
| INV_TXFM_8X8_FN dct, flipadst |
| |
| cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 1 |
| mov [rsp+4*16+1*gprsize], r1 |
| %else |
| DECLARE_REG_TMP 6 |
| %endif |
| lea t0, [o(.pass1_main)] |
| |
| .pass1_full: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| %undef cmp |
| %if ARCH_X86_64 |
| xor r5d, r5d |
| cmp eobd, 10 |
| setge r5b |
| %else |
| mov r5d, 1 |
| cmp eobd, 10 |
| sbb r5d, 0 |
| %endif |
| shl r5d, 4 |
| %if ARCH_X86_32 |
| lea r3, [rsp+gprsize] |
| %endif |
| .loop_pass1: |
| mova m0, [cq+0*32+r5] |
| mova m1, [cq+1*32+r5] |
| mova m2, [cq+2*32+r5] |
| mova m3, [cq+3*32+r5] |
| mova m4, [cq+4*32+r5] |
| mova m5, [cq+5*32+r5] |
| mova m6, [cq+6*32+r5] |
| mova m7, [cq+7*32+r5] |
| call t0 |
| |
| test r5d, r5d |
| jz .end_pass1 |
| |
| mova [cq+0*32+16], m0 |
| mova [cq+1*32+16], m1 |
| mova [cq+2*32+16], m2 |
| mova [cq+3*32+16], m3 |
| |
| sub r5d, 16 |
| jmp .loop_pass1 |
| .end_pass1: |
| mova m4, [cq+0*32+16] |
| mova m5, [cq+1*32+16] |
| mova m6, [cq+2*32+16] |
| mova m7, [cq+3*32+16] |
| %if ARCH_X86_32 |
| mov r1, [rsp+4*16+1*gprsize] |
| %endif |
| jmp tx2q |
| .pass1_main: |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| pcmpeqd m1, m1 |
| REPX {psubd x, m1}, m0, m6, m5, m3 |
| call m(idct_8x4_internal_16bpc).round |
| REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| .pack_and_transpose: |
| packssdw m2, m3 |
| packssdw m6, m7 |
| packssdw m0, m1 |
| packssdw m4, m5 |
| jmp m(idct_8x4_internal_16bpc).transpose4x8packed |
| |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(idct_8x8_internal_8bpc, _ssse3).main |
| lea r3, [strideq*3] |
| %if ARCH_X86_64 |
| mova m10, [o(pixel_10bpc_max)] |
| pxor m9, m9 |
| %endif |
| call .round3_and_write_8x8 |
| .zero: |
| %if ARCH_X86_64 |
| %define mzero m9 |
| %else |
| %define mzero m7 |
| pxor m7, m7 |
| %endif |
| REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| %undef mzero |
| RET |
| |
| ; round (rounded right-shift by 4, via pmulhrsw with pw_2048) before writing |
| ; data in m0-7 |
| ; on x86-64, the rounding constant is expected in m8 (usually pw_2048) |
| ; .round1 is for m0-7 |
| ; .round2 is for m0-6 & [rsp+gprsize*2] |
| ; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32) |
| ; .round4 is x86-32-only; it is similar to .round2 but with the constant already in m7 |
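| ; (pmulhrsw x, 2048 == (x*2048 + 0x4000) >> 15 == (x + 8) >> 4) |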
| %if ARCH_X86_32 |
| .round1_and_write_8x8: |
| mova [rsp+gprsize*2], m7 |
| .round2_and_write_8x8: |
| %endif |
| .round3_and_write_8x8: |
| mova m7, [o(pw_2048)] |
| %if ARCH_X86_32 |
| .round4_and_write_8x8: |
| %endif |
| REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pmulhrsw m7, [rsp+gprsize*2] |
| %if ARCH_X86_64 |
| jmp .write_8x8 |
| .round2_and_write_8x8: |
| mova m7, [rsp+gprsize*2] |
| .round1_and_write_8x8: |
| REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| %endif |
| |
| ; m0-7 have to-be-written data [pre-rounded] |
| ; on x86-64, m9-10 contain a zero/pixel_max |
| ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch |
| ; r0,1,3 contain dstq/strideq/stride3q |
| ; r5 is a scratch register |
| .write_8x8: |
| lea r5, [dstq+strideq*4] |
| paddw m0, [dstq+strideq*0] |
| paddw m1, [dstq+strideq*1] |
| paddw m2, [dstq+strideq*2] |
| paddw m3, [dstq+r3] |
| paddw m4, [r5 +strideq*0] |
| paddw m5, [r5 +strideq*1] |
| paddw m6, [r5 +strideq*2] |
| paddw m7, [r5 +r3] |
| %if ARCH_X86_64 |
| REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 |
| %else |
| mova [rsp+gprsize*2], m7 |
| pxor m7, m7 |
| REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pmaxsw m7, [rsp+gprsize*2] |
| mova [rsp+gprsize*2], m7 |
| mova m7, [o(pixel_10bpc_max)] |
| REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pminsw m7, [rsp+gprsize*2] |
| %endif |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+r3 ], m3 |
| mova [r5 +strideq*0], m4 |
| mova [r5 +strideq*1], m5 |
| mova [r5 +strideq*2], m6 |
| mova [r5 +r3 ], m7 |
| ret |
| |
| INV_TXFM_8X8_FN adst, dct |
| INV_TXFM_8X8_FN adst, adst |
| INV_TXFM_8X8_FN adst, flipadst |
| INV_TXFM_8X8_FN adst, identity, 6 |
| |
| cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_32 |
| mov [rsp+4*16+1*gprsize], r1 |
| %endif |
| lea t0, [o(.pass1_main)] |
| jmp m(idct_8x8_internal_16bpc).pass1_full |
| .pass1_main: |
| call m(iadst_8x4_internal_16bpc).main_pass1 |
| call .round |
| jmp m(idct_8x8_internal_16bpc).pack_and_transpose |
| .round: |
| %if ARCH_X86_64 |
| pcmpeqd m8, m8 ; -1 |
| REPX {psubd x, m8 }, m0, m6 |
| REPX {pxor x, m8 }, m1, m7, m3, m5 |
| REPX {psrad x, 1 }, m0, m1, m6, m7 |
| REPX {psubd x, m8 }, m1, m7 |
| mova m8, [o(pd_6144)] |
| REPX {paddd x, m8 }, m2, m3, m4, m5 |
| REPX {psrad x, 13 }, m2, m3, m4, m5 |
| %else |
| mova [r3+2*16], m0 |
| |
| pcmpeqd m0, m0 ; -1 |
| mova m6, [o(pd_6144)] |
| REPX {pxor x, m0 }, m1, m7, m3, m5 |
| REPX {psrad x, 1 }, m1, m7 |
| REPX {psubd x, m0 }, m1, m7 |
| REPX {paddd x, m6 }, m2, m3, m4, m5 |
| REPX {psrad x, 13 }, m2, m3, m4, m5 |
| |
| mova m0, [r3+2*16] |
| psrld m6, 12 ; +1 |
| paddd m0, m6 |
| paddd m6, [r3+3*16] |
| REPX {psrad x, 1 }, m0, m6 |
| %endif |
| ret |
| |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main |
| call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end |
| lea r3, [strideq*3] |
| %if ARCH_X86_64 |
| mova m10, [o(pixel_10bpc_max)] |
| pxor m9, m9 |
| %endif |
| call .round3_and_write_8x8 |
| jmp m(idct_8x8_internal_16bpc).zero |
| |
| ; round (rounded right-shift by 4) before writing; odd registers are negated |
| ; data in m0-7 |
| ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11 |
| ; .round1 is for m0-7 |
| ; .round2 is for m0-6 & [rsp+gprsize*2] |
| ; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32) |
| %if ARCH_X86_64 |
| .round2_and_write_8x8: |
| mova m7, [rsp+gprsize*2] |
| .round1_and_write_8x8: |
| REPX {pmulhrsw x, m8 }, m0, m2, m4, m6 |
| REPX {pmulhrsw x, m11}, m1, m3, m5, m7 |
| jmp m(idct_8x8_internal_16bpc).write_8x8 |
| %else |
| .round1_and_write_8x8: |
| mova [rsp+gprsize*2], m7 |
| .round2_and_write_8x8: |
| %endif |
| .round3_and_write_8x8: |
| mova m7, [o(pw_2048)] |
| REPX {pmulhrsw x, m7}, m0, m2, m4, m6 |
| mova m7, [o(pw_m2048)] |
| REPX {pmulhrsw x, m7}, m1, m3, m5 |
| pmulhrsw m7, [rsp+gprsize*2] |
| jmp m(idct_8x8_internal_16bpc).write_8x8 |
| |
| INV_TXFM_8X8_FN flipadst, dct |
| INV_TXFM_8X8_FN flipadst, adst |
| INV_TXFM_8X8_FN flipadst, flipadst |
| INV_TXFM_8X8_FN flipadst, identity, 6 |
| |
| cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_32 |
| mov [rsp+4*16+1*gprsize], r1 |
| %endif |
| lea t0, [o(.pass1_main)] |
| jmp m(idct_8x8_internal_16bpc).pass1_full |
| .pass1_main: |
| call m(iadst_8x4_internal_16bpc).main_pass1 |
| call m(iadst_8x8_internal_16bpc).round |
| ; invert registers |
| packssdw m7, m6 |
| packssdw m5, m4 |
| packssdw m3, m2 |
| packssdw m1, m0 |
| mova m0, m7 |
| mova m2, m5 |
| mova m4, m3 |
| mova m6, m1 |
| jmp m(idct_8x4_internal_16bpc).transpose4x8packed |
| |
| .pass2: |
| lea dstq, [dstq+strideq*8] |
| sub dstq, strideq |
| neg strideq |
| jmp m(iadst_8x8_internal_16bpc).pass2 |
| |
| INV_TXFM_8X8_FN identity, dct |
| INV_TXFM_8X8_FN identity, adst |
| INV_TXFM_8X8_FN identity, flipadst |
| INV_TXFM_8X8_FN identity, identity |
| |
| cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| mova m0, [cq+0*32] |
| mova m1, [cq+1*32] |
| mova m2, [cq+2*32] |
| mova m3, [cq+3*32] |
| mova m4, [cq+4*32] |
| mova m5, [cq+5*32] |
| mova m6, [cq+6*32] |
| mova m7, [cq+7*32] |
| packssdw m0, [cq+0*32+16] |
| packssdw m1, [cq+1*32+16] |
| packssdw m2, [cq+2*32+16] |
| packssdw m3, [cq+3*32+16] |
| packssdw m4, [cq+4*32+16] |
| packssdw m5, [cq+5*32+16] |
| packssdw m6, [cq+6*32+16] |
| packssdw m7, [cq+7*32+16] |
| mova [rsp+gprsize+16*1], m6 |
| jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3 |
| |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| lea r3, [strideq*3] |
| %if ARCH_X86_64 |
| mova m10, [o(pixel_10bpc_max)] |
| pxor m9, m9 |
| mova m8, [o(pw_4096)] |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| %else |
| mova [rsp+gprsize], m7 |
| mova m7, [o(pw_4096)] |
| call m(idct_8x8_internal_16bpc).round4_and_write_8x8 |
| %endif |
| jmp m(idct_8x8_internal_16bpc).zero |
| |
| %macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix |
| %if ARCH_X86_64 |
| INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16 |
| %else |
| INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 |
| %endif |
| %ifidn %1_%2, dct_dct |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 181 |
| mov r3d, 4 |
| %if stack_size_padded > 0 |
| ; adjust to caller's stack allocation |
| add rsp, (12+ARCH_X86_64)*16 |
| %endif |
| jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end |
| %endif |
| %endmacro |
| |
| INV_TXFM_8X16_FN dct, dct |
| INV_TXFM_8X16_FN dct, identity, v |
| INV_TXFM_8X16_FN dct, adst |
| INV_TXFM_8X16_FN dct, flipadst |
| |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 7 |
| %endif |
| |
| cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if WIN64 |
| PUSH r7 |
| %elif ARCH_X86_32 |
| mov [rsp+16*16+gprsize*1], r1 |
| mov [rsp+16*16+gprsize*2], r6 |
| %endif |
| lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)] |
| .pass1_full: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| %undef cmp |
| mov r6d, 4 |
| .zero_loop: |
| dec r6d |
| cmp eobb, byte [r5+r6] |
| jl .zero_loop |
| mov r5d, r6d |
| shl r5d, 4 |
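| ; (r5 points at the eob threshold table, one byte per 4-row group; the loop
| ; walks it backwards until eob reaches a group's threshold, so r6d indexes the
| ; last group with coefficients and r5d = r6d*16 is its byte offset)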
| %if ARCH_X86_32 |
| ; restore pic-ptr |
| mov r6, [rsp+16*16+2*gprsize] |
| ; setup stack pointer |
| lea r3, [rsp+gprsize] |
| %endif |
| .loop_pass1: |
| mova m0, [cq+0*64+r5] |
| mova m1, [cq+1*64+r5] |
| mova m2, [cq+2*64+r5] |
| mova m3, [cq+3*64+r5] |
| mova m4, [cq+4*64+r5] |
| mova m5, [cq+5*64+r5] |
| mova m6, [cq+6*64+r5] |
| mova m7, [cq+7*64+r5] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call t0 |
| |
| mova [cq+0*64+r5], m0 |
| mova [cq+1*64+r5], m1 |
| mova [cq+2*64+r5], m2 |
| mova [cq+3*64+r5], m3 |
| sub r5d, 16 |
| jge .loop_pass1 |
| %if WIN64 |
| POP r7 |
| %elif ARCH_X86_32 |
| mov r1, [rsp+16*16+1*gprsize] |
| %endif |
| jmp tx2q |
| |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| |
| ; input is in cq+N*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
| ; some are still pre-loaded from the final loop iteration of pass 1
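| ; (layout note: pass 1 stored its transposed 4x8 chunks at a 64-byte stride,
| ; which is what produces the interleaved order)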
| |
| mova m1, m2 |
| mova m2, [cq+ 1*16] |
| mova m3, [cq+ 9*16] |
| mova m4, [cq+ 2*16] |
| mova m5, [cq+10*16] |
| mova m6, [cq+ 3*16] |
| mova m7, [cq+11*16] |
| call m_suffix(idct_8x8_internal_8bpc, _ssse3).main |
| mova [rsp+gprsize+3*16], m0 |
| mova [rsp+gprsize+4*16], m1 |
| mova [rsp+gprsize+5*16], m2 |
| mova [rsp+gprsize+6*16], m3 |
| mova [rsp+gprsize+7*16], m4 |
| mova [rsp+gprsize+8*16], m5 |
| mova [rsp+gprsize+9*16], m6 |
| ; m7 is already stored in [rsp+gprsize+0*16] |
| mova m0, [cq+ 4*16] |
| mova m1, [cq+12*16] |
| mova m2, [cq+ 5*16] |
| mova m3, [cq+13*16] |
| mova m4, [cq+ 6*16] |
| mova m5, [cq+14*16] |
| mova m6, [cq+ 7*16] |
| mova m7, [cq+15*16] |
| call m_suffix(idct_16x8_internal_8bpc, _ssse3).main |
| |
| ; out0-7 are in rsp+gprsize+3-10*mmsize
| ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
| |
| %if ARCH_X86_64 |
| mova m8, [o(pw_2048)] |
| mova m10, [o(pixel_10bpc_max)] |
| pxor m9, m9 |
| mov r6, dstq |
| %else |
| mov [rsp+16*16+gprsize*1], dstq |
| %endif |
| lea r3, [strideq*3] |
| lea dstq, [dstq+strideq*8] |
| call m(idct_8x8_internal_16bpc).round2_and_write_8x8 |
| %if ARCH_X86_64 |
| %define mzero m9 |
| %else |
| %define mzero m7 |
| pxor m7, m7 |
| %endif |
| REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ |
| 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 |
| %undef mzero |
| mova m0, [rsp+gprsize+ 3*16] |
| mova m1, [rsp+gprsize+ 4*16] |
| mova m2, [rsp+gprsize+ 5*16] |
| mova m3, [rsp+gprsize+ 6*16] |
| mova m4, [rsp+gprsize+ 7*16] |
| mova m5, [rsp+gprsize+ 8*16] |
| mova m6, [rsp+gprsize+ 9*16] |
| mova m7, [rsp+gprsize+10*16] |
| %if ARCH_X86_64 |
| mov dstq, r6 |
| %else |
| mov dstq, [rsp+16*16+gprsize*1] |
| %endif |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| RET |
| |
| INV_TXFM_8X16_FN adst, dct |
| INV_TXFM_8X16_FN adst, adst |
| INV_TXFM_8X16_FN adst, flipadst |
| INV_TXFM_8X16_FN adst, identity, v |
| |
| cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if WIN64 |
| PUSH r7 |
| %elif ARCH_X86_32 |
| mov [rsp+16*16+gprsize*1], r1 |
| mov [rsp+16*16+gprsize*2], r6 |
| %endif |
| lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)] |
| jmp m(idct_8x16_internal_16bpc).pass1_full |
| |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| mova m4, [cq+ 9*16] |
| mova m5, [cq+13*16] |
| mova [rsp+gprsize+7*16], m0 |
| mova [rsp+gprsize+8*16], m1 |
| mova [rsp+gprsize+5*16], m4 |
| mova [rsp+gprsize+6*16], m5 |
| mova m0, m2 |
| mova m1, m3 |
| mova m2, [cq+ 1*16] |
| mova m3, [cq+ 5*16] |
| mova m4, [cq+ 2*16] |
| mova m5, [cq+ 6*16] |
| mova m6, [cq+11*16] |
| mova m7, [cq+15*16] |
| mova [rsp+gprsize+ 3*16], m4 |
| mova [rsp+gprsize+ 4*16], m5 |
| mova [rsp+gprsize+ 9*16], m6 |
| mova [rsp+gprsize+10*16], m7 |
| mova m4, [cq+10*16] |
| mova m5, [cq+14*16] |
| mova m6, [cq+ 3*16] |
| mova m7, [cq+ 7*16] |
| call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main |
| call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end |
| |
| %if ARCH_X86_64 |
| mova m11, [o(pw_m2048)] |
| mova m8, [o(pw_2048)] |
| mova m10, [o(pixel_10bpc_max)] |
| pxor m9, m9 |
| mov r6, dstq |
| %else |
| mov [rsp+16*16+gprsize*1], dstq |
| %endif |
| lea r3, [strideq*3] |
| lea dstq, [dstq+strideq*8] |
| call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 |
| %if ARCH_X86_64 |
| %define mzero m9 |
| %else |
| %define mzero m7 |
| pxor m7, m7 |
| %endif |
| REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ |
| 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 |
| %undef mzero |
| mova m0, [rsp+gprsize+ 3*16] |
| mova m1, [rsp+gprsize+ 4*16] |
| mova m2, [rsp+gprsize+ 5*16] |
| mova m3, [rsp+gprsize+ 6*16] |
| mova m4, [rsp+gprsize+ 7*16] |
| mova m5, [rsp+gprsize+ 8*16] |
| mova m6, [rsp+gprsize+ 9*16] |
| mova m7, [rsp+gprsize+10*16] |
| %if ARCH_X86_64 |
| mov dstq, r6 |
| %else |
| mov dstq, [rsp+16*16+gprsize*1] |
| %endif |
| call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 |
| RET |
| |
| INV_TXFM_8X16_FN flipadst, dct |
| INV_TXFM_8X16_FN flipadst, adst |
| INV_TXFM_8X16_FN flipadst, flipadst |
| INV_TXFM_8X16_FN flipadst, identity, v |
| |
| cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if WIN64 |
| PUSH r7 |
| %elif ARCH_X86_32 |
| mov [rsp+16*16+gprsize*1], r1 |
| mov [rsp+16*16+gprsize*2], r6 |
| %endif |
| lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)] |
| jmp m(idct_8x16_internal_16bpc).pass1_full |
| |
| .pass2: |
| lea r3, [strideq*3] |
| lea r3, [r3*5] |
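| ; r3 = strideq*15 ([r3*5] is encoded as [r3+r3*4])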
| add dstq, r3 |
| neg strideq |
| jmp m(iadst_8x16_internal_16bpc).pass2 |
| |
| INV_TXFM_8X16_FN identity, dct, h |
| INV_TXFM_8X16_FN identity, adst, h |
| INV_TXFM_8X16_FN identity, flipadst, h |
| INV_TXFM_8X16_FN identity, identity |
| |
| cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if WIN64 |
| PUSH r7 |
| %elif ARCH_X86_32 |
| mov [rsp+16*16+gprsize*1], r1 |
| mov [rsp+16*16+gprsize*2], r6 |
| %endif |
| lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)] |
| jmp m(idct_8x16_internal_16bpc).pass1_full |
| |
| .pass2: |
| %if ARCH_X86_64 |
| mova m4, [o(pw_2048)] |
| mova m5, [o(pixel_10bpc_max)] |
| pxor m6, m6 |
| mova m7, [o(pw_1697x16)] |
| %endif |
| mov r5d, 4 |
| lea r3, [strideq*3] |
| .pass2_loop: |
| call .main |
| %if ARCH_X86_64 |
| call m(idct_8x4_internal_16bpc).round1_and_write_8x4 |
| %else |
| call m(idct_8x4_internal_16bpc).round2_and_write_8x4 |
| %endif |
| REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28 |
| dec r5d |
| jle .end |
| add cq, 16 |
| lea dstq, [dstq+strideq*4] |
| mova m0, [cq+ 0*16] |
| mova m1, [cq+ 4*16] |
| mova m2, [cq+ 8*16] |
| mova m3, [cq+12*16] |
| jmp .pass2_loop |
| .end: |
| RET |
| .main: |
| ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y) |
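| ; (scale check: pw_1697x16/32768 = 1697/2048 ~= 0.8284, so the result is
| ; x*(2 + 0.8284) ~= 2*sqrt(2)*x, the identity-16 gain)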
| %if ARCH_X86_32 |
| mova m7, [o(pw_1697x16)] |
| pmulhrsw m4, m7, m0 |
| pmulhrsw m5, m7, m1 |
| pmulhrsw m6, m7, m2 |
| pmulhrsw m7, m3 |
| %else |
| pmulhrsw m8, m7, m0 |
| pmulhrsw m9, m7, m1 |
| pmulhrsw m10, m7, m2 |
| pmulhrsw m11, m7, m3 |
| %endif |
| REPX {paddsw x, x}, m0, m1, m2, m3 |
| %if ARCH_X86_64 |
| paddsw m0, m8 |
| paddsw m1, m9 |
| paddsw m2, m10 |
| paddsw m3, m11 |
| %else |
| paddsw m0, m4 |
| paddsw m1, m5 |
| paddsw m2, m6 |
| paddsw m3, m7 |
| %endif |
| ret |
| |
| %macro INV_TXFM_16X4_FN 2 ; type1, type2 |
| %if ARCH_X86_64 |
| INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16 |
| %else |
| INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16 |
| %endif |
| %ifidn %1_%2, dct_dct |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 4 |
| .dconly: |
| add r5d, 384 |
| sar r5d, 9 |
| .dconly2: |
| imul r5d, 2896 |
| add r5d, 34816 |
| movd m0, r5d |
| pshuflw m0, m0, q1111 |
| punpcklqdq m0, m0 |
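| ; (word 1 of r5d is (r5*2896 + 34816) >> 16, the rounded dc value; pshuflw
| ; q1111 plus punpcklqdq broadcast that word to all 8 lanes)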
| mova m3, [o(pixel_10bpc_max)] |
| pxor m4, m4 |
| .loop: |
| mova m1, [dstq+ 0] |
| mova m2, [dstq+16] |
| REPX {paddw x, m0}, m1, m2 |
| REPX {pminsw x, m3}, m1, m2 |
| REPX {pmaxsw x, m4}, m1, m2 |
| mova [dstq+ 0], m1 |
| mova [dstq+16], m2 |
| add dstq, strideq |
| dec r3d |
| jg .loop |
| RET |
| %endif |
| %endmacro |
| |
| INV_TXFM_16X4_FN dct, dct |
| INV_TXFM_16X4_FN dct, identity |
| INV_TXFM_16X4_FN dct, adst |
| INV_TXFM_16X4_FN dct, flipadst |
| |
| cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| ; setup stack pointer |
| lea r3, [rsp+gprsize] |
| |
| mova m0, [cq+ 1*16] |
| mova m1, [cq+ 3*16] |
| mova m2, [cq+ 5*16] |
| mova m3, [cq+ 7*16] |
| mova m4, [cq+ 9*16] |
| mova m5, [cq+11*16] |
| mova m6, [cq+13*16] |
| mova m7, [cq+15*16] |
| call .main_oddhalf |
| mova m0, [cq+ 0*16] |
| mova m1, [cq+ 2*16] |
| mova m2, [cq+ 4*16] |
| mova m3, [cq+ 6*16] |
| mova m4, [cq+ 8*16] |
| mova m5, [cq+10*16] |
| mova m6, [cq+12*16] |
| mova m7, [cq+14*16] |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| ; t0-7 are in m0-7
| |
| call .round |
| |
| %if ARCH_X86_64 |
| .pack_transpose: |
| ; transpose in two parts |
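| ; (packssdw saturates the 32-bit intermediates to 16 bits; the two 4x8
| ; transposes below then cover the low half in m0-7 and the high half in m8-15)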
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| .transpose: |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| call .transpose4x8packed_hi |
| %else |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m1 |
| mova [r3+2*16], m2 |
| mova [r3+3*16], m3 |
| mova m0, [r3+ 8*16] |
| mova m2, [r3+ 9*16] |
| mova m4, [r3+10*16] |
| mova m6, [r3+11*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| %endif |
| jmp tx2q |
| %if ARCH_X86_64 |
| .transpose4x8packed_hi: |
| punpcklwd m9, m10, m14 |
| punpckhwd m10, m14 |
| punpckhwd m14, m8, m12 |
| punpcklwd m8, m12 |
| |
| punpckhwd m11, m8, m9 |
| punpcklwd m8, m9 |
| punpckhwd m12, m14, m10 |
| punpcklwd m14, m10 |
| |
| punpcklwd m10, m11, m12 |
| punpckhwd m11, m12 |
| punpckhwd m9, m8, m14 |
| punpcklwd m8, m14 |
| ret |
| %endif |
| .main_oddhalf_fast: ; lower half zero |
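| ; (with the second half of the inputs zero, each rotation collapses to two
| ; plain multiplies, e.g. t8a = in1*401 and t15a = in1*4076, rounded and
| ; shifted down by 12 afterwards)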
| pmulld m7, m0, [o(pd_4076)] |
| pmulld m0, [o(pd_401)] |
| pmulld m6, m1, [o(pd_m1189)] |
| pmulld m1, [o(pd_3920)] |
| %if ARCH_X86_32 |
| mova m4, [o(pd_2048)] |
| REPX {paddd x, m4}, m1, m6 |
| REPX {psrad x, 12}, m1, m6 |
| mova [r3+1*16], m1 |
| %endif |
| pmulld m5, m2, [o(pd_3612)] |
| pmulld m2, [o(pd_1931)] |
| %if ARCH_X86_32 |
| pmulld m1, m3, [o(pd_m2598)] |
| %else |
| pmulld m4, m3, [o(pd_m2598)] |
| %endif |
| pmulld m3, [o(pd_3166)] |
| jmp .main_oddhalf_fast2 |
| .main_oddhalf: |
| %if ARCH_X86_64 |
| ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a |
| ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a |
| ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a |
| ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a |
| .main_oddhalf_fast2: |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| psubd m8, m0, m4 ; t9 |
| paddd m0, m4 ; t8 |
| psubd m4, m6, m2 ; t10 |
| paddd m2, m6 ; t11 |
| psubd m6, m1, m5 ; t13 |
| paddd m5, m1 ; t12 |
| psubd m1, m7, m3 ; t14 |
| paddd m7, m3 ; t15 |
| REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 |
| REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 |
| mova m15, [o(pd_3784)] |
| mova m10, [o(pd_1567)] |
| ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 |
| ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4 |
| psubd m3, m1, m4 ; t10 |
| paddd m1, m4 ; t9 |
| psubd m4, m0, m2 ; t11a |
| paddd m0, m2 ; t8a |
| psubd m2, m8, m6 ; t13 |
| paddd m6, m8 ; t14 |
| psubd m8, m7, m5 ; t12a |
| paddd m7, m5 ; t15a |
| REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 |
| REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 |
| REPX {pmulld x, m14}, m2, m8, m3, m4 |
| paddd m2, m11 |
| paddd m8, m11 |
| paddd m5, m2, m3 ; t13a |
| psubd m2, m3 ; t10a |
| psubd m3, m8, m4 ; t11 |
| paddd m4, m8 ; t12 |
| REPX {psrad x, 12}, m5, m2, m3, m4 |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m1 |
| mova [r3+2*16], m2 |
| mova [r3+3*16], m3 |
| mova [r3+4*16], m4 |
| mova [r3+5*16], m5 |
| mova [r3+6*16], m6 |
| mova [r3+7*16], m7 |
| %else |
| mova [r3+0*16], m2 |
| mova [r3+1*16], m3 |
| mova [r3+2*16], m4 |
| mova [r3+3*16], m5 |
| mova m4, [o(pd_2048)] |
| |
| ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a |
| ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a |
| |
| mova m2, [r3+0*16] |
| mova m3, [r3+1*16] |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m1 |
| mova m1, [r3+2*16] |
| mova m5, [r3+3*16] |
| mova [r3+2*16], m6 |
| mova [r3+3*16], m7 |
| |
| ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a |
| ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a |
| |
| mova m0, [r3+0*16] |
| mova m6, [r3+2*16] |
| mova m7, [r3+3*16] |
| .main_oddhalf_fast2: |
| REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3 |
| REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3 |
| psubd m4, m0, m1 ; t9 |
| paddd m0, m1 ; t8 |
| mova m1, [r3+1*16] |
| mova [r3+0*16], m4 |
| psubd m4, m6, m2 ; t10 |
| paddd m2, m6 ; t11 |
| psubd m6, m1, m5 ; t13 |
| paddd m5, m1 ; t12 |
| psubd m1, m7, m3 ; t14 |
| paddd m7, m3 ; t15 |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7 |
| pmaxsd m3, [r3+0*16] |
| mova [r3+0*16], m3 |
| mova m3, [o(clip_18b_max)] |
| REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7 |
| pminsd m3, [r3+0*16] |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m2 |
| mova [r3+2*16], m5 |
| mova [r3+3*16], m7 |
| mova m7, [o(pd_2048)] |
| ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784 |
| ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4 |
| mova m0, [r3+0*16] |
| mova m2, [r3+1*16] |
| psubd m5, m1, m4 ; t10 |
| mova [r3+1*16], m5 |
| paddd m1, m4 ; t9 |
| psubd m4, m0, m2 ; t11a |
| paddd m0, m2 ; t8a |
| mova m5, [r3+2*16] |
| mova m7, [r3+3*16] |
| psubd m2, m3, m6 ; t13 |
| paddd m6, m3 ; t14 |
| paddd m3, m7, m5 ; t15a |
| psubd m7, m5 ; t12a |
| mova [r3+0*16], m3 |
| mova m3, [r3+1*16] |
| mova m5, [o(clip_18b_min)] |
| REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6 |
| pmaxsd m5, [r3+0*16] |
| mova [r3+0*16], m5 |
| mova m5, [o(clip_18b_max)] |
| REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6 |
| pminsd m5, [r3+0*16] |
| mova [r3+0*16], m5 |
| mova m5, [o(pd_2896)] |
| REPX {pmulld x, m5}, m2, m7, m3, m4 |
| mova m5, [o(pd_2048)] |
| REPX {paddd x, m5}, m2, m7 |
| paddd m5, m2, m3 ; t13a |
| psubd m2, m3 ; t10a |
| psubd m3, m7, m4 ; t11 |
| paddd m4, m7 ; t12 |
| REPX {psrad x, 12}, m5, m2, m3, m4 |
| mova m7, [r3+0*16] |
| mova [r3+11*16], m0 |
| mova [r3+10*16], m1 |
| mova [r3+9*16], m2 |
| mova [r3+8*16], m3 |
| mova [r3+7*16], m4 |
| mova [r3+6*16], m5 |
| mova [r3+5*16], m6 |
| mova [r3+4*16], m7 |
| %endif |
| ret |
| .round: |
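| ; (the pcmpeqd/psubd pair below adds 1 to the clamped even half, so both
| ; butterfly outputs come out as (even +/- odd + 1) >> 1, the pass-1 rounding)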
| %if ARCH_X86_64 |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| pcmpeqd m8, m8 |
| REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| mova m8, [r3+1*16] |
| mova m9, [r3+2*16] |
| mova m10, [r3+3*16] |
| mova m11, [r3+4*16] |
| mova m12, [r3+5*16] |
| mova m13, [r3+6*16] |
| mova m14, [r3+7*16] |
| psubd m15, m0, m14 ; out15 |
| paddd m0, m14 ; out0 |
| psubd m14, m1, m13 ; out14 |
| paddd m1, m13 ; out1 |
| psubd m13, m2, m12 ; out13 |
| paddd m2, m12 ; out2 |
| psubd m12, m3, m11 ; out12 |
| paddd m3, m11 ; out3 |
| psubd m11, m4, m10 ; out11 |
| paddd m4, m10 ; out4 |
| psubd m10, m5, m9 ; out10 |
| paddd m5, m9 ; out5 |
| psubd m9, m6, m8 ; out9 |
| paddd m6, m8 ; out6 |
| psubd m8, m7, [r3+0*16] ; out8 |
| paddd m7, [r3+0*16] ; out7 |
| REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| ; and out0-15 are now in m0-15
| %else |
| mova [r3+ 0*16], m0 |
| mova m0, [o(clip_18b_min)] |
| REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 |
| pmaxsd m0, [r3+ 0*16] |
| mova [r3+ 0*16], m7 |
| mova m7, [o(clip_18b_max)] |
| REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pminsd m7, [r3+ 0*16] |
| mova [r3+ 0*16], m0 |
| pcmpeqd m0, m0 |
| REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7 |
| mova [r3+ 1*16], m1 |
| mova [r3+ 2*16], m2 |
| mova m1, [r3+ 0*16] |
| psubd m1, m0 |
| mova [r3+ 0*16], m1 |
| mova m1, [r3+11*16] |
| mova m2, [r3+10*16] |
| psubd m0, m7, m1 |
| paddd m7, m1 |
| psubd m1, m6, m2 |
| paddd m6, m2 |
| REPX {psrad x, 1}, m0, m1, m6, m7 |
| packssdw m0, m1 ; out8-9 |
| packssdw m6, m7 ; out6-7 |
| mova [r3+11*16], m6 |
| mova m1, [r3+9*16] |
| mova m7, [r3+8*16] |
| psubd m2, m5, m1 |
| paddd m5, m1 |
| psubd m1, m4, m7 |
| paddd m4, m7 |
| REPX {psrad x, 1}, m2, m1, m4, m5 |
| packssdw m2, m1 ; out10-11 |
| packssdw m4, m5 ; out4-5 |
| mova m1, [r3+2*16] |
| mova [r3+10*16], m4 |
| mova m6, [r3+7*16] |
| mova m7, [r3+6*16] |
| psubd m4, m3, m6 |
| paddd m3, m6 |
| psubd m6, m1, m7 |
| paddd m1, m7 |
| REPX {psrad x, 1}, m4, m6, m1, m3 |
| packssdw m4, m6 ; out12-13 |
| packssdw m1, m3 ; out2-3 |
| mova m3, [r3+1*16] |
| mova [r3+9*16], m1 |
| mova m1, [r3+0*16] |
| mova m5, [r3+5*16] |
| mova m7, [r3+4*16] |
| psubd m6, m3, m5 |
| paddd m3, m5 |
| psubd m5, m1, m7 |
| paddd m1, m7 |
| REPX {psrad x, 1}, m6, m5, m1, m3 |
| packssdw m6, m5 ; out14-15 |
| packssdw m1, m3 ; out0-1 |
| mova [r3+8*16], m1 |
| %endif |
| ret |
| |
| .pass2: |
| lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)] |
| .pass2_loop: |
| lea r3, [strideq*3] |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call r4 |
| call m(idct_8x4_internal_16bpc).round2_and_write_8x4 |
| REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| %if ARCH_X86_64 |
| mova m0, m8 |
| mova m1, m9 |
| mova m2, m10 |
| mova m3, m11 |
| %else |
| mova m0, [rsp+gprsize+0*16] |
| mova m1, [rsp+gprsize+1*16] |
| mova m2, [rsp+gprsize+2*16] |
| mova m3, [rsp+gprsize+3*16] |
| %endif |
| add dstq, 16 |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call r4 |
| call m(idct_8x4_internal_16bpc).round2_and_write_8x4 |
| RET |
| |
| INV_TXFM_16X4_FN adst, dct |
| INV_TXFM_16X4_FN adst, adst |
| INV_TXFM_16X4_FN adst, flipadst |
| INV_TXFM_16X4_FN adst, identity |
| |
| cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| ; setup stack pointer |
| lea r3, [rsp+gprsize] |
| call .main |
| %if ARCH_X86_64 |
| jmp m(idct_16x4_internal_16bpc).pack_transpose |
| %else |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+gprsize+0*16], m0 |
| mova [rsp+gprsize+1*16], m1 |
| mova [rsp+gprsize+2*16], m2 |
| mova [rsp+gprsize+3*16], m3 |
| mova m0, [rsp+gprsize+ 8*16] |
| mova m2, [rsp+gprsize+ 9*16] |
| mova m4, [rsp+gprsize+10*16] |
| mova m6, [rsp+gprsize+11*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| jmp tx2q |
| %endif |
| |
| .main: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+ 2*16] |
| mova m1, [cq+13*16] |
| mova m2, [cq+ 6*16] |
| mova m3, [cq+ 9*16] |
| mova m4, [cq+10*16] |
| mova m5, [cq+ 5*16] |
| mova m6, [cq+14*16] |
| mova m7, [cq+ 1*16] |
| call .main_part1 |
| mova m0, [cq+ 0*16] |
| mova m1, [cq+15*16] |
| mova m2, [cq+ 4*16] |
| mova m3, [cq+11*16] |
| mova m4, [cq+ 8*16] |
| mova m5, [cq+ 7*16] |
| mova m6, [cq+12*16] |
| mova m7, [cq+ 3*16] |
| call .main_part2 |
| .round: |
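| ; (rounding layout: out0-3 and out12-15 take (+/-x + 1) >> 1; out4-11 arrive
| ; pre-multiplied by 2896 and take (+/-x*2896 + 6144) >> 13, where
| ; 6144 = 2048 + 4096 merges the >>12 rounding of the sqrt(2)/2 scale with
| ; the >>1 pass-1 rounding)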
| %if ARCH_X86_64 |
| mova m15, [o(pd_6144)] |
| psrld m14, 11 ; pd_1 |
| pcmpeqd m8, m8 ; -1 |
| psubd m13, m15, m14 ; pd_6143 |
| REPX {paddd x, m14}, m0, m2 |
| REPX {paddd x, m15}, m4, m6 |
| REPX {pxor x, m8 }, m1, m3, m5, m7 |
| REPX {psrad x, 1 }, m1, m3 |
| REPX {paddd x, m15}, m5, m7 |
| REPX {psubd x, m8 }, m1, m3 |
| paddd m8, m15, m9 |
| psubd m9, m13, m10 |
| paddd m10, m15, m11 |
| psubd m11, m13, m12 |
| paddd m12, m14, [r3+3*16] |
| psubd m13, m14, [r3+2*16] |
| psubd m15, m14, [r3+0*16] |
| paddd m14, [r3+1*16] |
| REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15 |
| REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 |
| %else |
| mova [r3+8*16], m1 |
| mova [r3+9*16], m3 |
| mova m3, [o(pd_6144)] |
| pcmpeqd m1, m1 |
| REPX {pxor x, m1}, m5, m7 |
| REPX {paddd x, m3}, m4, m5, m6, m7 |
| REPX {psrad x, 13}, m4, m5, m6, m7 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| mova [r3+10*16], m4 |
| mova [r3+11*16], m6 |
| mova m4, [r3+4*16] |
| mova m5, [r3+5*16] |
| mova m6, [r3+6*16] |
| mova m7, [r3+7*16] |
| REPX {pxor x, m1}, m5, m7 |
| REPX {psubd x, m1}, m4, m6 |
| REPX {psrad x, 1 }, m4, m5, m6, m7 |
| REPX {psubd x, m1}, m5, m7 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| mova m5, [r3+8*16] |
| mova m7, [r3+9*16] |
| mova [r3+8*16], m4 |
| mova [r3+9*16], m6 |
| REPX {pxor x, m1}, m5, m7 |
| REPX {paddd x, m3}, m0, m5, m2, m7 |
| REPX {psrad x, 13}, m0, m5, m2, m7 |
| packssdw m0, m5 |
| packssdw m2, m7 |
| mova m4, [r3+0*16] |
| mova m5, [r3+1*16] |
| mova m6, [r3+2*16] |
| mova m7, [r3+3*16] |
| REPX {psubd x, m1}, m4, m6 |
| REPX {pxor x, m1}, m5, m7 |
| REPX {psrad x, 1 }, m4, m5, m6, m7 |
| REPX {psubd x, m1}, m5, m7 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| %endif |
| ret |
| |
| .main_part2: |
| %if ARCH_X86_64 |
| ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091 |
| ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703 |
| ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751 |
| ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380 |
| psubd m8, m0, m4 ; t8a |
| paddd m0, m4 ; t0a |
| psubd m4, m1, m5 ; t9a |
| paddd m1, m5 ; t1a |
| psubd m5, m2, m6 ; t12a |
| paddd m2, m6 ; t4a |
| psubd m6, m3, m7 ; t13a |
| paddd m7, m3 ; t5a |
| REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7 |
| REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 |
| mova m15, [o(pd_4017)] |
| mova m10, [o(pd_799)] |
| ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15 |
| ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10 |
| psubd m3, m0, m2 ; t4 |
| paddd m0, m2 ; t0 |
| psubd m2, m1, m7 ; t5 |
| paddd m1, m7 ; t1 |
| psubd m7, m4, m6 ; t12a |
| paddd m4, m6 ; t8a |
| psubd m6, m8, m5 ; t13a |
| paddd m5, m8 ; t9a |
| REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5 |
| REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 |
| mova m15, [o(pd_3784)] |
| mova m10, [o(pd_1567)] |
| ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15 |
| ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15 |
| mova m10, [r3+0*16] ; t2 |
| mova m8, [r3+1*16] ; t3 |
| psubd m9, m0, m10 ; t2a |
| paddd m0, m10 ; out0 |
| psubd m10, m1, m8 ; t3a |
| paddd m1, m8 ; -out15 |
| mova [r3+0*16], m1 |
| mova m15, [r3+3*16] ; t7a |
| mova m1, [r3+2*16] ; t6a |
| psubd m8, m3, m15 ; t7 |
| paddd m15, m3 ; out12 |
| paddd m3, m2, m1 ; -out3 |
| psubd m2, m1 ; t6 |
| mova [r3+3*16], m15 |
| mova [r3+1*16], m2 |
| mova m1, [r3+7*16] ; t15 |
| mova m2, [r3+6*16] ; t14 |
| paddd m15, m7, m1 ; -out13 |
| psubd m7, m1 ; t15a |
| psubd m11, m6, m2 ; t14a |
| paddd m2, m6 ; out2 |
| mova [r3+2*16], m15 |
| mova m1, [r3+4*16] ; t10a |
| mova m15, [r3+5*16] ; t11a |
| psubd m6, m4, m1 ; t10 |
| paddd m1, m4 ; -out1 |
| psubd m4, m5, m15 ; t11 |
| paddd m5, m15 ; out14 |
| REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8 |
| pmaxsd m12, [r3+1*16] ; t6 |
| mova [r3+1*16], m5 |
| REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8 |
| REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8 |
| paddd m5, m11, m7 ; -out5 (unshifted) |
| psubd m11, m7 ; out10 (unshifted) |
| paddd m7, m9, m10 ; -out7 (unshifted) |
| psubd m9, m10 ; out8 (unshifted) |
| psubd m10, m6, m4 ; -out9 (unshifted) |
| paddd m6, m4 ; out6 (unshifted) |
| paddd m4, m12, m8 ; out4 (unshifted) |
| psubd m12, m8 ; -out11 (unshifted) |
| %else |
| mova [r3+8*16], m0 |
| mova [r3+9*16], m1 |
| mova [r3+10*16], m2 |
| mova [r3+11*16], m3 |
| mova m3, [o(pd_2048)] |
| ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751 |
| ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380 |
| mova m0, [r3+8*16] |
| mova m1, [r3+9*16] |
| mova [r3+8*16], m4 |
| mova m4, [r3+10*16] |
| mova [r3+9*16], m5 |
| mova [r3+10*16], m6 |
| mova m5, [r3+11*16] |
| mova [r3+11*16], m7 |
| ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091 |
| ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703 |
| mova m2, [r3+8*16] |
| mova m6, [r3+9*16] |
| psubd m3, m0, m2 ; t8a |
| paddd m0, m2 ; t0a |
| mova [r3+8*16], m3 |
| psubd m2, m1, m6 ; t9a |
| paddd m1, m6 ; t1a |
| mova m3, [r3+10*16] |
| psubd m6, m4, m3 ; t12a |
| paddd m4, m3 ; t4a |
| mova m3, [r3+11*16] |
| psubd m7, m5, m3 ; t13a |
| paddd m5, m3 ; t5a |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5 |
| pmaxsd m3, [r3+8*16] |
| mova [r3+8*16], m3 |
| mova m3, [o(clip_18b_max)] |
| REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5 |
| pminsd m3, [r3+8*16] |
| mova [r3+8*16], m3 |
| psubd m3, m0, m4 ; t4 |
| paddd m0, m4 ; t0 |
| psubd m4, m1, m5 ; t5 |
| paddd m1, m5 ; t1 |
| mova m5, [o(pd_2048)] |
| mova [r3+9*16], m1 |
| mova [r3+10*16], m4 |
| mova [r3+11*16], m3 |
| mova m3, [r3+8*16] |
| mova [r3+8*16], m0 |
| ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017 |
| ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4 |
| psubd m5, m2, m7 ; t12a |
| paddd m2, m7 ; t8a |
| psubd m7, m3, m6 ; t13a |
| paddd m6, m3 ; t9a |
| mova m0, [r3+8*16] |
| mova m1, [r3+9*16] |
| mova m4, [r3+10*16] |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6 |
| pmaxsd m3, [r3+11*16] |
| mova [r3+8*16], m3 |
| mova m3, [o(clip_18b_max)] |
| REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6 |
| pminsd m3, [r3+8*16] |
| mova [r3+8*16], m0 |
| mova [r3+9*16], m1 |
| mova [r3+10*16], m2 |
| mova [r3+11*16], m6 |
| mova m0, [o(pd_2048)] |
| ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784 |
| ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784 |
| mova m0, [r3+7*16] ; t7a |
| mova m2, [r3+6*16] ; t6a |
| psubd m1, m3, m0 ; t7 |
| paddd m0, m3 ; out12 |
| paddd m3, m4, m2 ; -out3 |
| psubd m4, m2 ; t6 |
| mova [r3+7*16], m3 |
| mova m3, [r3+3*16] ; t15 |
| mova m2, [r3+2*16] ; t14 |
| paddd m6, m5, m3 ; -out13 |
| psubd m5, m3 ; t15a |
| psubd m3, m7, m2 ; t14a |
| paddd m2, m7 ; out2 |
| mova [r3+6*16], m2 |
| mova m7, [r3+0*16] ; t10a |
| mova m2, [r3+1*16] ; t11a |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m6 |
| mova m6, [r3+11*16] |
| psubd m0, m6, m2 ; t11 |
| paddd m6, m2 ; out14 |
| mova [r3+2*16], m6 |
| mova m2, [r3+10*16] |
| psubd m6, m2, m7 ; t10 |
| paddd m2, m7 ; -out1 |
| mova m7, [r3+5*16] ; t3 |
| mova [r3+5*16], m2 |
| mova [r3+10*16], m1 |
| mova m1, [r3+9*16] |
| psubd m2, m1, m7 ; t3a |
| paddd m1, m7 ; -out15 |
| mova [r3+3*16], m1 |
| mova m1, [r3+4*16] ; t2 |
| mova m7, [r3+8*16] |
| psubd m7, m1 ; t2a |
| paddd m1, [r3+8*16] ; out0 |
| mova [r3+4*16], m1 |
| mova m1, [o(clip_18b_min)] |
| REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7 |
| pmaxsd m1, [r3+10*16] |
| mova [r3+10*16], m1 |
| mova m1, [o(clip_18b_max)] |
| REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7 |
| pminsd m1, [r3+10*16] |
| mova [r3+10*16], m1 |
| mova m1, [o(pd_2896)] |
| REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7 |
| pmulld m1, [r3+10*16] |
| mova [r3+11*16], m3 |
| psubd m3, m4, m1 ; -out11 (unshifted) |
| paddd m4, m1 ; out4 (unshifted) |
| psubd m1, m6, m0 ; -out9 (unshifted) |
| paddd m6, m0 ; out6 (unshifted) |
| psubd m0, m7, m2 ; out8 (unshifted) |
| paddd m7, m2 ; -out7 (unshifted) |
| mova m2, [r3+11*16] |
| mova [r3+11*16], m5 |
| paddd m5, m2 ; -out5 (unshifted) |
| psubd m2, [r3+11*16] ; out10 (unshifted) |
| ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted) |
| ; [r3+0*16..3*16] contain out12-15 and [r3+4*16..7*16] contain out0-3
| ; (odd outputs negated)
| %endif |
| ret |
| .main_part1: |
| %if ARCH_X86_64 |
| ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973 |
| ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290 |
| ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106 |
| ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601 |
| psubd m8, m0, m4 ; t10a |
| paddd m0, m4 ; t2a |
| psubd m4, m1, m5 ; t11a |
| paddd m1, m5 ; t3a |
| psubd m5, m2, m6 ; t14a |
| paddd m2, m6 ; t6a |
| psubd m6, m3, m7 ; t15a |
| paddd m7, m3 ; t7a |
| REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7 |
| REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 |
| mova m15, [o(pd_2276)] |
| mova m10, [o(pd_3406)] |
| ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15 |
| ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10 |
| psubd m3, m0, m2 ; t6 |
| paddd m0, m2 ; t2 |
| psubd m2, m1, m7 ; t7 |
| paddd m1, m7 ; t3 |
| psubd m7, m4, m6 ; t14a |
| paddd m4, m6 ; t10a |
| psubd m6, m8, m5 ; t15a |
| paddd m5, m8 ; t11a |
| REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5 |
| REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 |
| mova m15, [o(pd_1567)] |
| mova m10, [o(pd_3784)] |
| ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15 |
| ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15 |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m1 |
| mova [r3+4*16], m4 |
| mova [r3+5*16], m5 |
| mova [r3+2*16], m2 |
| mova [r3+3*16], m3 |
| mova [r3+6*16], m6 |
| mova [r3+7*16], m7 |
| %else |
| mova [r3+4*16], m0 |
| mova [r3+5*16], m1 |
| mova [r3+6*16], m2 |
| mova [r3+7*16], m3 |
| mova m3, [o(pd_2048)] |
| ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106 |
| ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601 |
| mova [r3+0*16], m4 |
| mova [r3+1*16], m5 |
| mova [r3+2*16], m6 |
| mova [r3+3*16], m7 |
| mova m0, [r3+4*16] |
| mova m1, [r3+5*16] |
| mova m2, [r3+6*16] |
| mova m7, [r3+7*16] |
| ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973 |
| ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290 |
| mova m4, [r3+0*16] |
| mova m5, [r3+1*16] |
| psubd m6, m0, m4 ; t10a |
| paddd m0, m4 ; t2a |
| mova [r3+4*16], m6 |
| mova m6, [r3+2*16] |
| mova m3, [r3+3*16] |
| psubd m4, m1, m5 ; t11a |
| paddd m1, m5 ; t3a |
| psubd m5, m2, m6 ; t14a |
| paddd m2, m6 ; t6a |
| psubd m6, m7, m3 ; t15a |
| paddd m7, m3 ; t7a |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7 |
| pmaxsd m3, [r3+4*16] |
| mova [r3+4*16], m3 |
| mova m3, [o(clip_18b_max)] |
| REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7 |
| pminsd m3, [r3+4*16] |
| mova [r3+4*16], m3 |
| psubd m3, m0, m2 ; t6 |
| paddd m0, m2 ; t2 |
| psubd m2, m1, m7 ; t7 |
| paddd m1, m7 ; t3 |
| mova [r3+5*16], m1 |
| mova [r3+6*16], m3 |
| mova [r3+7*16], m2 |
| mova m1, [r3+4*16] |
| mova [r3+4*16], m0 |
| mova m3, [o(pd_2048)] |
| ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276 |
| ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2 |
| psubd m7, m4, m6 ; t14a |
| paddd m4, m6 ; t10a |
| psubd m6, m1, m5 ; t15a |
| paddd m5, m1 ; t11a |
| mova m1, [r3+5*16] |
| mova m3, [r3+6*16] |
| mova m2, [r3+7*16] |
| mova m0, [o(clip_18b_min)] |
| REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5 |
| pmaxsd m0, [r3+4*16] |
| mova [r3+4*16], m0 |
| mova m0, [o(clip_18b_max)] |
| REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5 |
| pminsd m0, [r3+4*16] |
| mova [r3+4*16], m0 |
| mova [r3+5*16], m1 |
| mova [r3+0*16], m4 |
| mova [r3+1*16], m5 |
| mova m0, [o(pd_2048)] |
| ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567 |
| ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567 |
| mova [r3+6*16], m2 |
| mova [r3+7*16], m3 |
| mova [r3+2*16], m6 |
| mova [r3+3*16], m7 |
| %endif |
| ret |
| |
| .pass2: |
| lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)] |
| jmp m(idct_16x4_internal_16bpc).pass2_loop |
| |
| INV_TXFM_16X4_FN flipadst, dct |
| INV_TXFM_16X4_FN flipadst, adst |
| INV_TXFM_16X4_FN flipadst, flipadst |
| INV_TXFM_16X4_FN flipadst, identity |
| |
| cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| lea r3, [rsp+gprsize] |
| call m(iadst_16x4_internal_16bpc).main |
| %if ARCH_X86_64 |
| packssdw m1, m0 |
| packssdw m3, m2 |
| packssdw m5, m4 |
| packssdw m7, m6 |
| packssdw m9, m8 |
| packssdw m11, m10 |
| packssdw m13, m12 |
| packssdw m15, m14 |
| mova m0, m15 |
| mova m2, m13 |
| mova m4, m11 |
| mova m6, m9 |
| mova m8, m7 |
| mova m10, m5 |
| mova m12, m3 |
| mova m14, m1 |
| jmp m(idct_16x4_internal_16bpc).transpose |
| %else |
| mova [rsp+gprsize+4*16], m0 |
| mova [rsp+gprsize+5*16], m2 |
| mova [rsp+gprsize+6*16], m4 |
| mova [rsp+gprsize+7*16], m6 |
| pshufd m6, [rsp+gprsize+ 8*16], q1032 |
| pshufd m4, [rsp+gprsize+ 9*16], q1032 |
| pshufd m2, [rsp+gprsize+10*16], q1032 |
| pshufd m0, [rsp+gprsize+11*16], q1032 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+gprsize+0*16], m0 |
| mova [rsp+gprsize+1*16], m1 |
| mova [rsp+gprsize+2*16], m2 |
| mova [rsp+gprsize+3*16], m3 |
| pshufd m6, [rsp+gprsize+ 4*16], q1032 |
| pshufd m4, [rsp+gprsize+ 5*16], q1032 |
| pshufd m2, [rsp+gprsize+ 6*16], q1032 |
| pshufd m0, [rsp+gprsize+ 7*16], q1032 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| jmp tx2q |
| %endif |
| |
| .pass2: |
| lea r3, [strideq*3] |
| lea dstq, [dstq+r3] |
| neg strideq |
| lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)] |
| jmp m(idct_16x4_internal_16bpc).pass2_loop |
| |
| INV_TXFM_16X4_FN identity, dct |
| INV_TXFM_16X4_FN identity, adst |
| INV_TXFM_16X4_FN identity, flipadst |
| INV_TXFM_16X4_FN identity, identity |
| |
| cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
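| ; (fixed-point note: 11586 = 2*5793 ~= 2*sqrt(2)*4096, the identity-16 gain
| ; in .12 format; (x*11586 + 6144) >> 13 applies it together with the >>1
| ; pass-1 downshift, 6144 = 2048 + 4096 being the merged rounding offsets)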
| %if ARCH_X86_64 |
| mova m15, [o(pd_11586)] |
| pmulld m0, m15, [cq+ 0*16] |
| pmulld m1, m15, [cq+ 1*16] |
| pmulld m2, m15, [cq+ 2*16] |
| pmulld m3, m15, [cq+ 3*16] |
| pmulld m4, m15, [cq+ 4*16] |
| pmulld m5, m15, [cq+ 5*16] |
| pmulld m6, m15, [cq+ 6*16] |
| pmulld m7, m15, [cq+ 7*16] |
| pmulld m8, m15, [cq+ 8*16] |
| pmulld m9, m15, [cq+ 9*16] |
| pmulld m10, m15, [cq+10*16] |
| pmulld m11, m15, [cq+11*16] |
| pmulld m12, m15, [cq+12*16] |
| pmulld m13, m15, [cq+13*16] |
| pmulld m14, m15, [cq+14*16] |
| pmulld m15, [cq+15*16] |
| mova [cq+ 0*16], m15 |
| mova m15, [o(pd_6144)] |
| REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14 |
| paddd m15, [cq+ 0*16] |
| REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| jmp m(idct_16x4_internal_16bpc).pack_transpose |
| %else |
| add cq, 8*16 |
| mov r5d, 2 |
| .loop_pass1: |
| mova m7, [o(pd_11586)] |
| pmulld m0, m7, [cq+0*16] |
| pmulld m1, m7, [cq+1*16] |
| pmulld m2, m7, [cq+2*16] |
| pmulld m3, m7, [cq+3*16] |
| pmulld m4, m7, [cq+4*16] |
| pmulld m5, m7, [cq+5*16] |
| pmulld m6, m7, [cq+6*16] |
| pmulld m7, [cq+7*16] |
| mova [cq+7*16], m7 |
| mova m7, [o(pd_6144)] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| paddd m7, [cq+7*16] |
| REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| dec r5d |
| jz .end_pass1 |
| mova [rsp+gprsize+0*16], m0 |
| mova [rsp+gprsize+1*16], m1 |
| mova [rsp+gprsize+2*16], m2 |
| mova [rsp+gprsize+3*16], m3 |
| sub cq, 8*16 |
| jmp .loop_pass1 |
| .end_pass1: |
| jmp tx2q |
| %endif |
| |
| .pass2: |
| %if ARCH_X86_64 |
| mova m12, [o(pw_1697x8)] |
| %endif |
| lea r4, [o(.main)] |
| jmp m(idct_16x4_internal_16bpc).pass2_loop |
| .main: |
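| ; (scale check: 1 + 1697/4096 ~= sqrt(2), the identity-4 gain, implemented
| ; below as x += pmulhrsw(x, 1697*8))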
| %if ARCH_X86_64 |
| pmulhrsw m4, m0, m12 |
| pmulhrsw m5, m1, m12 |
| pmulhrsw m6, m2, m12 |
| pmulhrsw m7, m3, m12 |
| %else |
| mova m7, [o(pw_1697x8)] |
| pmulhrsw m4, m0, m7 |
| pmulhrsw m5, m1, m7 |
| pmulhrsw m6, m2, m7 |
| pmulhrsw m7, m3 |
| %endif |
| paddsw m0, m4 |
| paddsw m1, m5 |
| paddsw m2, m6 |
| paddsw m3, m7 |
| ret |
| |
| %macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset |
| %if ARCH_X86_64 |
| INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16 |
| %else |
| INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16 |
| %endif |
| %ifidn %1_%2, dct_dct |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 8 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 181 |
| %if ARCH_X86_32 |
| add rsp, 1*16 |
| %endif |
| jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly |
| %endif |
| %endmacro |
| |
| INV_TXFM_16X8_FN dct, dct |
| INV_TXFM_16X8_FN dct, identity, 6 |
| INV_TXFM_16X8_FN dct, adst |
| INV_TXFM_16X8_FN dct, flipadst |
| |
| cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 6, 4, 6 |
| %else |
| mov [rsp+gprsize+12*16], r1 |
| DECLARE_REG_TMP 1, 4, 3 |
| %endif |
| lea t0, [o(.main)] |
| .loop_main: |
| %undef cmp |
| %if ARCH_X86_64 |
| xor r5d, r5d |
| cmp eobd, 10 |
| setge r5b |
| %else |
| mov r5d, 1 |
| cmp eobd, 10 |
| sbb r5d, 0 |
| %endif |
| shl r5d, 4 |
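| ; r5d = (eobd >= 10) ? 16 : 0, computed branchlessly via setge/sbb above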
| |
| lea r3, [rsp+gprsize] |
| .loop_pass1: |
| call t0 |
| %if ARCH_X86_64 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [cq+4*32+r5], m8 |
| mova [cq+5*32+r5], m9 |
| mova [cq+6*32+r5], m10 |
| mova [cq+7*32+r5], m11 |
| %else |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+4*32+r5], m0 |
| mova [cq+5*32+r5], m1 |
| mova [cq+6*32+r5], m2 |
| mova [cq+7*32+r5], m3 |
| mova m0, [rsp+gprsize+ 8*16] |
| mova m2, [rsp+gprsize+ 9*16] |
| mova m4, [rsp+gprsize+10*16] |
| mova m6, [rsp+gprsize+11*16] |
| %endif |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| pxor m7, m7 |
| REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15 |
| test r5d, r5d |
| jz .end |
| mova [cq+0*32+r5], m0 |
| mova [cq+1*32+r5], m1 |
| mova [cq+2*32+r5], m2 |
| mova [cq+3*32+r5], m3 |
| xor r5d, r5d |
| jmp .loop_pass1 |
| .end:
| jmp tx2q
| .main: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+ 1*32+r5] |
| mova m1, [cq+ 3*32+r5] |
| mova m2, [cq+ 5*32+r5] |
| mova m3, [cq+ 7*32+r5] |
| mova m4, [cq+ 9*32+r5] |
| mova m5, [cq+11*32+r5] |
| mova m6, [cq+13*32+r5] |
| mova m7, [cq+15*32+r5] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(idct_16x4_internal_16bpc).main_oddhalf |
| |
| mova m0, [cq+ 0*32+r5] |
| mova m1, [cq+ 2*32+r5] |
| mova m2, [cq+ 4*32+r5] |
| mova m3, [cq+ 6*32+r5] |
| mova m4, [cq+ 8*32+r5] |
| mova m5, [cq+10*32+r5] |
| mova m6, [cq+12*32+r5] |
| mova m7, [cq+14*32+r5] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| call m(idct_16x4_internal_16bpc).round |
| %if ARCH_X86_64 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %endif |
| ret |
| |
| .pass2: |
| %if ARCH_X86_32 |
| mov strideq, [rsp+gprsize+12*16] |
| %endif |
| mov r4d, 2 |
| .pass2_main: |
| %if ARCH_X86_64 |
| mova m8, [o(pw_2048)] |
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| %endif |
| lea r3, [strideq*3] |
| jmp .loop_pass2_entry |
| .loop_pass2: |
| mova m0, [cq+0*32+ 0] |
| mova m1, [cq+1*32+ 0] |
| mova m2, [cq+2*32+ 0] |
| mova m3, [cq+3*32+ 0] |
| .loop_pass2_entry: |
| mova m4, [cq+0*32+16] |
| mova m5, [cq+1*32+16] |
| mova m6, [cq+2*32+16] |
| mova m7, [cq+3*32+16] |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(idct_8x8_internal_8bpc, _ssse3).main |
| call m(idct_8x8_internal_16bpc).round2_and_write_8x8 |
| %if ARCH_X86_64 |
| %define mzero m9 |
| %else |
| %define mzero m7 |
| pxor m7, m7 |
| %endif |
| REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 |
| add dstq, 16 |
| add cq, 4*32 |
| dec r4d |
| jg .loop_pass2 |
| RET |
| |
| INV_TXFM_16X8_FN adst, dct |
| INV_TXFM_16X8_FN adst, adst |
| INV_TXFM_16X8_FN adst, flipadst |
| INV_TXFM_16X8_FN adst, identity, 6 |
| |
| cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_32 |
| mov [rsp+gprsize+12*16], r1 |
| %endif |
| lea t0, [o(.main)] |
| jmp m(idct_16x8_internal_16bpc).loop_main |
| |
| .main: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+ 2*32+r5] |
| mova m1, [cq+13*32+r5] |
| mova m2, [cq+ 6*32+r5] |
| mova m3, [cq+ 9*32+r5] |
| mova m4, [cq+10*32+r5] |
| mova m5, [cq+ 5*32+r5] |
| mova m6, [cq+14*32+r5] |
| mova m7, [cq+ 1*32+r5] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(iadst_16x4_internal_16bpc).main_part1 |
| mova m0, [cq+ 0*32+r5] |
| mova m1, [cq+15*32+r5] |
| mova m2, [cq+ 4*32+r5] |
| mova m3, [cq+11*32+r5] |
| mova m4, [cq+ 8*32+r5] |
| mova m5, [cq+ 7*32+r5] |
| mova m6, [cq+12*32+r5] |
| mova m7, [cq+ 3*32+r5] |
| %if ARCH_X86_32 |
| add r3, 8*16 |
| %endif |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| %if ARCH_X86_32 |
| sub r3, 8*16 |
| %endif |
| call m(iadst_16x4_internal_16bpc).main_part2 |
| call m(iadst_16x4_internal_16bpc).round |
| %if ARCH_X86_64 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %endif |
| ret |
| |
| .pass2: |
| %if ARCH_X86_32 |
| mov strideq, [rsp+gprsize+12*16] |
| %endif |
| mov r4d, 2 |
| %if ARCH_X86_64 |
| mova m8, [o(pw_2048)] |
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| mova m11, [o(pw_m2048)] |
| %endif |
| lea r3, [strideq*3] |
| jmp .loop_pass2_entry |
| .loop_pass2: |
| mova m0, [cq+0*32+ 0] |
| mova m1, [cq+1*32+ 0] |
| mova m2, [cq+2*32+ 0] |
| mova m3, [cq+3*32+ 0] |
| .loop_pass2_entry: |
| mova m4, [cq+0*32+16] |
| mova m5, [cq+1*32+16] |
| mova m6, [cq+2*32+16] |
| mova m7, [cq+3*32+16] |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main |
| call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end |
| call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 |
| %if ARCH_X86_64 |
| %define mzero m9 |
| %else |
| %define mzero m7 |
| pxor m7, m7 |
| %endif |
| REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 |
| add dstq, 16 |
| add cq, 4*32 |
| dec r4d |
| jg .loop_pass2 |
| RET |
| |
| INV_TXFM_16X8_FN flipadst, dct |
| INV_TXFM_16X8_FN flipadst, adst |
| INV_TXFM_16X8_FN flipadst, flipadst |
| INV_TXFM_16X8_FN flipadst, identity, 6 |
| |
| cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_32 |
| mov [rsp+gprsize+12*16], r1 |
| %endif |
| lea t0, [o(.main)] |
| jmp m(idct_16x8_internal_16bpc).loop_main |
| .main: |
| call m(iadst_16x8_internal_16bpc).main |
| %if ARCH_X86_64 |
| pshufd m1, m0, q1032 |
| pshufd m3, m2, q1032 |
| pshufd m5, m4, q1032 |
| pshufd m7, m6, q1032 |
| pshufd m0, m14, q1032 |
| pshufd m2, m12, q1032 |
| pshufd m4, m10, q1032 |
| pshufd m6, m8, q1032 |
| mova m14, m1 |
| mova m12, m3 |
| mova m10, m5 |
| mova m8, m7 |
| %else |
| pshufd m1, m0, q1032 |
| pshufd m3, m2, q1032 |
| pshufd m5, m4, q1032 |
| pshufd m7, m6, q1032 |
| pshufd m0, [r3+11*16], q1032 |
| pshufd m2, [r3+10*16], q1032 |
| pshufd m4, [r3+9*16], q1032 |
| pshufd m6, [r3+8*16], q1032 |
| mova [r3+8*16], m7 |
| mova [r3+9*16], m5 |
| mova [r3+10*16], m3 |
| mova [r3+11*16], m1 |
| %endif |
| ret |
| |
| .pass2: |
| %if ARCH_X86_32 |
| mov strideq, [rsp+gprsize+12*16] |
| %endif |
| lea dstq, [dstq+strideq*8] |
| neg strideq |
| add dstq, strideq |
| %if ARCH_X86_32 |
| mov [rsp+gprsize+12*16], strideq |
| %endif |
| jmp m(iadst_16x8_internal_16bpc).pass2 |
| |
| INV_TXFM_16X8_FN identity, dct, -54 |
| INV_TXFM_16X8_FN identity, adst, -54 |
| INV_TXFM_16X8_FN identity, flipadst, -54 |
| INV_TXFM_16X8_FN identity, identity |
| |
| cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_32 |
| mov [rsp+gprsize+12*16], r1 |
| %endif |
| lea t0, [o(.main)] |
| jmp m(idct_16x8_internal_16bpc).loop_main |
| .main: |
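| ; (the pd_2896 multiply and >>12 scale by 2896/4096 = 1/sqrt(2), the 2:1
| ; rect correction; (x*11586 + 6144) >> 13 then applies the identity-16 gain
| ; folded with the pass-1 downshift, as in the 16x4 case)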
| %if ARCH_X86_64 |
| mova m15, [o(pd_2896)] |
| pmulld m0, m15, [cq+ 0*32+r5] |
| pmulld m1, m15, [cq+ 1*32+r5] |
| pmulld m2, m15, [cq+ 2*32+r5] |
| pmulld m3, m15, [cq+ 3*32+r5] |
| pmulld m4, m15, [cq+ 4*32+r5] |
| pmulld m5, m15, [cq+ 5*32+r5] |
| pmulld m6, m15, [cq+ 6*32+r5] |
| pmulld m7, m15, [cq+ 7*32+r5] |
| pmulld m8, m15, [cq+ 8*32+r5] |
| pmulld m9, m15, [cq+ 9*32+r5] |
| pmulld m10, m15, [cq+10*32+r5] |
| pmulld m11, m15, [cq+11*32+r5] |
| pmulld m12, m15, [cq+12*32+r5] |
| pmulld m13, m15, [cq+13*32+r5] |
| pmulld m14, m15, [cq+14*32+r5] |
| pmulld m15, [cq+15*32+r5] |
| mova [r3], m15 |
| mova m15, [o(pd_2048)] |
| REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14 |
| paddd m15, [r3] |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| mova [r3], m15 |
| mova m15, [o(pd_11586)] |
| REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14 |
| pmulld m15, [r3] |
| mova [r3], m15 |
| mova m15, [o(pd_6144)] |
| REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14 |
| paddd m15, [r3] |
| REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %else |
| mova m0, [cq+ 0*32+r5] |
| mova m1, [cq+ 1*32+r5] |
| mova m2, [cq+ 2*32+r5] |
| mova m3, [cq+ 3*32+r5] |
| mova m4, [cq+ 4*32+r5] |
| mova m5, [cq+ 5*32+r5] |
| mova m6, [cq+ 6*32+r5] |
| mova m7, [cq+ 7*32+r5] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| mova [r3], m7 |
| mova m7, [o(pd_11586)] |
| REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pmulld m7, [r3] |
| mova [r3], m7 |
| mova m7, [o(pd_6144)] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| paddd m7, [r3] |
| REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| mova [r3+ 8*16], m0 |
| mova [r3+ 9*16], m2 |
| mova [r3+10*16], m4 |
| mova [r3+11*16], m6 |
| mova m0, [cq+ 8*32+r5] |
| mova m1, [cq+ 9*32+r5] |
| mova m2, [cq+10*32+r5] |
| mova m3, [cq+11*32+r5] |
| mova m4, [cq+12*32+r5] |
| mova m5, [cq+13*32+r5] |
| mova m6, [cq+14*32+r5] |
| mova m7, [cq+15*32+r5] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| mova [r3], m7 |
| mova m7, [o(pd_11586)] |
| REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pmulld m7, [r3] |
| mova [r3], m7 |
| mova m7, [o(pd_6144)] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| paddd m7, [r3] |
| REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| %endif |
| ret |
| .pass2: |
| %if ARCH_X86_32 |
| mov strideq, [rsp+gprsize+12*16] |
| %endif |
| mov r4d, 2 |
| %if ARCH_X86_64 |
| mova m8, [o(pw_4096)] |
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| %endif |
| lea r3, [strideq*3] |
| jmp .loop_pass2_entry |
| .loop_pass2: |
| mova m0, [cq+0*32+ 0] |
| mova m1, [cq+1*32+ 0] |
| mova m2, [cq+2*32+ 0] |
| mova m3, [cq+3*32+ 0] |
| .loop_pass2_entry: |
| mova m4, [cq+0*32+16] |
| mova m5, [cq+1*32+16] |
| mova m6, [cq+2*32+16] |
| mova m7, [cq+3*32+16] |
| %if ARCH_X86_64 |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| %else |
| mova [rsp+gprsize], m7 |
| mova m7, [o(pw_4096)] |
| call m(idct_8x8_internal_16bpc).round4_and_write_8x8 |
| %endif |
| %if ARCH_X86_64 |
| %define mzero m9 |
| %else |
| %define mzero m7 |
| pxor m7, m7 |
| %endif |
| REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 |
| add dstq, 16 |
| add cq, 4*32 |
| dec r4d |
| jg .loop_pass2 |
| RET |
| |
| %macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix |
| %if ARCH_X86_64 |
| INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16 |
| %else |
| INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16 |
| %endif |
| %ifidn %1_%2, dct_dct |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 16 |
| add r5d, 640 |
| sar r5d, 10 |
| add rsp, (5+ARCH_X86_64*3+WIN64)*16 |
| jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 |
| %endif |
| %endmacro |
| |
| INV_TXFM_16X16_FN dct, dct |
| INV_TXFM_16X16_FN dct, identity, v |
| INV_TXFM_16X16_FN dct, adst |
| INV_TXFM_16X16_FN dct, flipadst |
| |
| cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 6, 7 |
| %if WIN64 |
| mov [rsp+16*16+gprsize], r7 |
| %endif |
| %elif ARCH_X86_32 |
| DECLARE_REG_TMP 1, 6 |
| mov [rsp+16*16+gprsize*1], r1 |
| mov [rsp+16*16+gprsize*2], r6 |
| %endif |
| lea t0, [o(.main)] |
| .pass1_full: |
| %undef cmp |
| mov t1d, 4 |
| .zero_loop: |
| dec t1d |
| cmp eobb, byte [r5+t1] |
| jb .zero_loop |
| mov r5d, t1d |
| shl r5d, 4 |
| %if ARCH_X86_32 |
| ; restore pic-ptr |
| mov r6, [rsp+16*16+2*gprsize] |
| %endif |
| ; setup stack pointer |
| lea r3, [rsp+gprsize] |
| .loop_pass1: |
| call t0 |
| %if ARCH_X86_64 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [cq+4*64+r5], m8 |
| mova [cq+5*64+r5], m9 |
| mova [cq+6*64+r5], m10 |
| mova [cq+7*64+r5], m11 |
| %else |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+4*64+r5], m0 |
| mova [cq+5*64+r5], m1 |
| mova [cq+6*64+r5], m2 |
| mova [cq+7*64+r5], m3 |
| mova m0, [rsp+gprsize+ 8*16] |
| mova m2, [rsp+gprsize+ 9*16] |
| mova m4, [rsp+gprsize+10*16] |
| mova m6, [rsp+gprsize+11*16] |
| %endif |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+0*64+r5], m0 |
| mova [cq+1*64+r5], m1 |
| mova [cq+2*64+r5], m2 |
| mova [cq+3*64+r5], m3 |
| pxor m0, m0 |
| REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15 |
| sub r5d, 16 |
| jge .loop_pass1 |
| |
| %if ARCH_X86_32 |
| ; restore pic-ptr |
| mov r1, [rsp+16*16+1*gprsize] |
| %endif |
| jmp tx2q |
| .main: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| |
| mova m0, [cq+ 1*64+r5] |
| mova m1, [cq+ 3*64+r5] |
| mova m2, [cq+ 5*64+r5] |
| mova m3, [cq+ 7*64+r5] |
| mova m4, [cq+ 9*64+r5] |
| mova m5, [cq+11*64+r5] |
| mova m6, [cq+13*64+r5] |
| mova m7, [cq+15*64+r5] |
| call m(idct_16x4_internal_16bpc).main_oddhalf |
| |
| mova m0, [cq+ 0*64+r5] |
| mova m1, [cq+ 2*64+r5] |
| mova m2, [cq+ 4*64+r5] |
| mova m3, [cq+ 6*64+r5] |
| mova m4, [cq+ 8*64+r5] |
| mova m5, [cq+10*64+r5] |
| mova m6, [cq+12*64+r5] |
| mova m7, [cq+14*64+r5] |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| call .round |
| %if ARCH_X86_64 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %endif |
| ret |
| .round: |
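| ; (pass-1 rounding for 16x16 is (x + 2) >> 2; the 2 is derived from pd_2048
| ; via a >>10 to save a constant load on x86-64)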
| %if ARCH_X86_64 |
| REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 |
| psrld m8, m11, 10 ; 2 |
| REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| mova m8, [r3+1*16] |
| mova m9, [r3+2*16] |
| mova m10, [r3+3*16] |
| mova m11, [r3+4*16] |
| mova m12, [r3+5*16] |
| mova m13, [r3+6*16] |
| mova m14, [r3+7*16] |
| psubd m15, m0, m14 ; out15 |
| paddd m0, m14 ; out0 |
| psubd m14, m1, m13 ; out14 |
| paddd m1, m13 ; out1 |
| psubd m13, m2, m12 ; out13 |
| paddd m2, m12 ; out2 |
| psubd m12, m3, m11 ; out12 |
| paddd m3, m11 ; out3 |
| psubd m11, m4, m10 ; out11 |
| paddd m4, m10 ; out4 |
| psubd m10, m5, m9 ; out10 |
| paddd m5, m9 ; out5 |
| psubd m9, m6, m8 ; out9 |
| paddd m6, m8 ; out6 |
| psubd m8, m7, [r3+0*16] ; out8 |
| paddd m7, [r3+0*16] ; out7 |
| REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| ; and out0-15 are now in m0-15
| %else |
| mova [r3+ 0*16], m0 |
| mova m0, [o(clip_18b_min)] |
| REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 |
| pmaxsd m0, [r3+ 0*16] |
| mova [r3+ 0*16], m7 |
| mova m7, [o(clip_18b_max)] |
| REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| pminsd m7, [r3+ 0*16] |
| mova [r3+ 0*16], m0 |
| mova m0, [o(pd_2)] |
| REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7 |
| paddd m0, [r3+ 0*16] |
| mova [r3+ 0*16], m0 |
| mova [r3+ 1*16], m1 |
| mova [r3+ 2*16], m2 |
| mova m1, [r3+11*16] |
| mova m2, [r3+10*16] |
| psubd m0, m7, m1 |
| paddd m7, m1 |
| psubd m1, m6, m2 |
| paddd m6, m2 |
| REPX {psrad x, 2}, m0, m1, m6, m7 |
| packssdw m0, m1 ; out8-9 |
| packssdw m6, m7 ; out6-7 |
| mova [r3+11*16], m6 |
| mova m1, [r3+9*16] |
| mova m7, [r3+8*16] |
| psubd m2, m5, m1 |
| paddd m5, m1 |
| psubd m1, m4, m7 |
| paddd m4, m7 |
| REPX {psrad x, 2}, m2, m1, m4, m5 |
| packssdw m2, m1 ; out10-11 |
| packssdw m4, m5 ; out4-5 |
| mova m1, [r3+2*16] |
| mova [r3+10*16], m4 |
| mova m6, [r3+7*16] |
| mova m7, [r3+6*16] |
| psubd m4, m3, m6 |
| paddd m3, m6 |
| psubd m6, m1, m7 |
| paddd m1, m7 |
| REPX {psrad x, 2}, m4, m6, m1, m3 |
| packssdw m4, m6 ; out12-13 |
| packssdw m1, m3 ; out2-3 |
| mova m3, [r3+1*16] |
| mova [r3+9*16], m1 |
| mova m1, [r3+0*16] |
| mova m5, [r3+5*16] |
| mova m7, [r3+4*16] |
| psubd m6, m3, m5 |
| paddd m3, m5 |
| psubd m5, m1, m7 |
| paddd m1, m7 |
| REPX {psrad x, 2}, m6, m5, m1, m3 |
| packssdw m6, m5 ; out14-15 |
| packssdw m1, m3 ; out0-1 |
| mova [r3+8*16], m1 |
| %endif |
| ret |
| |
| .pass2: |
| %if ARCH_X86_64 |
| mova m8, [o(pw_2048)] |
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| mov r7, dstq |
| %else |
| mov [rsp+2*gprsize+16*16], dstq |
| %endif |
| lea r3, [strideq*3] |
| mov r4d, 2 |
| .loop_pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| mova m0, [cq+0*64+ 0] |
| mova m1, [cq+2*64+ 0] |
| mova m2, [cq+0*64+16] |
| mova m3, [cq+2*64+16] |
| mova m4, [cq+0*64+32] |
| mova m5, [cq+2*64+32] |
| mova m6, [cq+0*64+48] |
| mova m7, [cq+2*64+48] |
| call m_suffix(idct_8x8_internal_8bpc, _ssse3).main |
| mova [rsp+gprsize+3*16], m0 |
| mova [rsp+gprsize+4*16], m1 |
| mova [rsp+gprsize+5*16], m2 |
| mova [rsp+gprsize+6*16], m3 |
| mova [rsp+gprsize+7*16], m4 |
| mova [rsp+gprsize+8*16], m5 |
| mova [rsp+gprsize+9*16], m6 |
| ; m7 is already stored in [rsp+gprsize+0*16] |
| mova m0, [cq+1*64+ 0] |
| mova m1, [cq+3*64+ 0] |
| mova m2, [cq+1*64+16] |
| mova m3, [cq+3*64+16] |
| mova m4, [cq+1*64+32] |
| mova m5, [cq+3*64+32] |
| mova m6, [cq+1*64+48] |
| mova m7, [cq+3*64+48] |
| call m_suffix(idct_16x8_internal_8bpc, _ssse3).main |
| |
| ; out0-7 are in rsp+gprsize+3-10*mmsize
| ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
| |
| %if ARCH_X86_64 |
| lea dstq, [r7+strideq*8] |
| %else |
| mov dstq, [rsp+2*gprsize+16*16] |
| lea dstq, [dstq+strideq*8] |
| %endif |
| call m(idct_8x8_internal_16bpc).round2_and_write_8x8 |
| %if ARCH_X86_64 |
| mov dstq, r7 |
| %else |
| mov dstq, [rsp+2*gprsize+16*16] |
| %endif |
| mova m0, [rsp+gprsize+ 3*16] |
| mova m1, [rsp+gprsize+ 4*16] |
| mova m2, [rsp+gprsize+ 5*16] |
| mova m3, [rsp+gprsize+ 6*16] |
| mova m4, [rsp+gprsize+ 7*16] |
| mova m5, [rsp+gprsize+ 8*16] |
| mova m6, [rsp+gprsize+ 9*16] |
| mova m7, [rsp+gprsize+10*16] |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| %if ARCH_X86_64 |
| add r7, 16 |
| %define mzero m9 |
| %else |
| add dword [rsp+2*gprsize+16*16], 16 |
| %define mzero m7 |
| pxor m7, m7 |
| %endif |
| REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 |
| add cq, 64*4 |
| REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1 |
| %undef mzero |
| dec r4d |
| jg .loop_pass2 |
| %if WIN64 |
| mov r7, [rsp+16*16+gprsize] |
| %endif |
| RET |
| |
| INV_TXFM_16X16_FN adst, dct |
| INV_TXFM_16X16_FN adst, adst |
| INV_TXFM_16X16_FN adst, flipadst |
| |
| cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if WIN64 |
| mov [rsp+16*16+gprsize], r7 |
| %elif ARCH_X86_32 |
| mov [rsp+16*16+gprsize*1], r1 |
| mov [rsp+16*16+gprsize*2], r6 |
| %endif |
| lea t0, [o(.main)] |
| jmp m(idct_16x16_internal_16bpc).pass1_full |
| |
| .main: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+ 2*64+r5] |
| mova m1, [cq+13*64+r5] |
| mova m2, [cq+ 6*64+r5] |
| mova m3, [cq+ 9*64+r5] |
| mova m4, [cq+10*64+r5] |
| mova m5, [cq+ 5*64+r5] |
| mova m6, [cq+14*64+r5] |
| mova m7, [cq+ 1*64+r5] |
| call m(iadst_16x4_internal_16bpc).main_part1 |
| mova m0, [cq+ 0*64+r5] |
| mova m1, [cq+15*64+r5] |
| mova m2, [cq+ 4*64+r5] |
| mova m3, [cq+11*64+r5] |
| mova m4, [cq+ 8*64+r5] |
| mova m5, [cq+ 7*64+r5] |
| mova m6, [cq+12*64+r5] |
| mova m7, [cq+ 3*64+r5] |
| call m(iadst_16x4_internal_16bpc).main_part2 |
| call .round |
| %if ARCH_X86_64 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %endif |
| ret |
| .round: |
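| ; adst pass-1 rounding. Lanes still carrying a 4096-scaled product get |
| ; (x + 10240) >> 14, which folds the (x + 2048) >> 12 multiply rounding |
| ; and the (x + 2) >> 2 pass-1 downshift into one shift, since |
| ; (((x + 2048) >> 12) + 2) >> 2 == (x + 10240) >> 14. Already-descaled |
| ; lanes only get (x + 2) >> 2. Negated outputs use pxor with -1 |
| ; (NOT x == -x - 1); the >> 2 lanes add +3 instead of +2 to compensate. |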
| %if ARCH_X86_64 |
| pcmpeqd m8, m8 ; -1 |
| mova m15, [o(pd_10240)] |
| psrld m14, 10 ; +2 |
| psubd m13, m14, m8 ; +3 |
| REPX {pxor x, m8 }, m1, m3, m5, m7 |
| REPX {paddd x, m14}, m0, m2 |
| REPX {paddd x, m13}, m1, m3 |
| REPX {paddd x, m15}, m4, m5, m6, m7 |
| paddd m13, m15, m8 ; +10239 |
| paddd m8, m15, m9 |
| psubd m9, m13, m10 |
| paddd m10, m15, m11 |
| psubd m11, m13, m12 |
| paddd m12, m14, [r3+3*16] |
| psubd m13, m14, [r3+2*16] |
| psubd m15, m14, [r3+0*16] |
| paddd m14, [r3+1*16] |
| REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 |
| REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 |
| %else |
| mova [r3+8*16], m1 |
| mova [r3+9*16], m3 |
| mova m3, [o(pd_10240)] |
| pcmpeqd m1, m1 |
| REPX {pxor x, m1}, m5, m7 |
| REPX {paddd x, m3}, m4, m5, m6, m7 |
| REPX {psrad x, 14}, m4, m5, m6, m7 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| mova [r3+10*16], m4 |
| mova [r3+11*16], m6 |
| mova m4, [r3+4*16] |
| mova m5, [r3+5*16] |
| mova m6, [r3+6*16] |
| mova m7, [r3+7*16] |
| mova m3, [o(pd_2)] |
| REPX {pxor x, m1}, m5, m7 |
| REPX {paddd x, m3}, m4, m6 |
| psubd m3, m1 |
| REPX {paddd x, m3}, m5, m7 |
| REPX {psrad x, 2 }, m4, m5, m6, m7 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| mova m5, [r3+8*16] |
| mova m7, [r3+9*16] |
| mova [r3+8*16], m4 |
| mova [r3+9*16], m6 |
| mova m3, [o(pd_10240)] |
| REPX {pxor x, m1}, m5, m7 |
| REPX {paddd x, m3}, m0, m5, m2, m7 |
| REPX {psrad x, 14}, m0, m5, m2, m7 |
| packssdw m0, m5 |
| packssdw m2, m7 |
| mova m4, [r3+0*16] |
| mova m5, [r3+1*16] |
| mova m6, [r3+2*16] |
| mova m7, [r3+3*16] |
| mova m3, [o(pd_2)] |
| REPX {pxor x, m1}, m5, m7 |
| REPX {paddd x, m3}, m4, m6 |
| psubd m3, m1 |
| REPX {paddd x, m3}, m5, m7 |
| REPX {psrad x, 2 }, m4, m5, m6, m7 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| %endif |
| ret |
| .pass2: |
| %if ARCH_X86_64 |
| mova m8, [o(pw_2048)] |
| mova m11, [o(pw_m2048)] |
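| ; pw_m2048 presumably folds the sign flip of the outputs that the |
| ; 8-bit adst kernel leaves negated into the final pmulhrsw rounding |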
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| mov r7, dstq |
| %else |
| mov [rsp+2*gprsize+16*16], dstq |
| %endif |
| lea r3, [strideq*3] |
| mov r4d, 2 |
| .loop_pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| mova m0, [cq+0*64+32] |
| mova m1, [cq+1*64+32] |
| mova m2, [cq+2*64+16] |
| mova m3, [cq+3*64+16] |
| mova m4, [cq+0*64+ 0] |
| mova m5, [cq+1*64+ 0] |
| mova m6, [cq+2*64+48] |
| mova m7, [cq+3*64+48] |
| mova [rsp+gprsize+3*16], m0 |
| mova [rsp+gprsize+4*16], m1 |
| mova [rsp+gprsize+5*16], m2 |
| mova [rsp+gprsize+6*16], m3 |
| mova [rsp+gprsize+7*16], m4 |
| mova [rsp+gprsize+8*16], m5 |
| mova [rsp+gprsize+9*16], m6 |
| mova [rsp+gprsize+10*16], m7 |
| mova m0, [cq+2*64+ 0] |
| mova m1, [cq+3*64+ 0] |
| mova m2, [cq+0*64+16] |
| mova m3, [cq+1*64+16] |
| mova m4, [cq+2*64+32] |
| mova m5, [cq+3*64+32] |
| mova m6, [cq+0*64+48] |
| mova m7, [cq+1*64+48] |
| call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main |
| call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end |
| |
| ; out0-7 are in rsp+gprsize+3-10*mmsize |
| ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize |
| |
| %if ARCH_X86_64 |
| lea dstq, [r7+strideq*8] |
| %else |
| mov dstq, [rsp+2*gprsize+16*16] |
| lea dstq, [dstq+strideq*8] |
| %endif |
| call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 |
| %if ARCH_X86_64 |
| mov dstq, r7 |
| %else |
| mov dstq, [rsp+2*gprsize+16*16] |
| %endif |
| mova m0, [rsp+gprsize+ 3*16] |
| mova m1, [rsp+gprsize+ 4*16] |
| mova m2, [rsp+gprsize+ 5*16] |
| mova m3, [rsp+gprsize+ 6*16] |
| mova m4, [rsp+gprsize+ 7*16] |
| mova m5, [rsp+gprsize+ 8*16] |
| mova m6, [rsp+gprsize+ 9*16] |
| mova m7, [rsp+gprsize+10*16] |
| call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 |
| %if ARCH_X86_64 |
| add r7, 16 |
| %define mzero m9 |
| %else |
| add dword [rsp+2*gprsize+16*16], 16 |
| %define mzero m7 |
| pxor m7, m7 |
| %endif |
| REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 |
| add cq, 64*4 |
| REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1 |
| %undef mzero |
| dec r4d |
| jg .loop_pass2 |
| %if WIN64 |
| mov r7, [rsp+16*16+gprsize] |
| %endif |
| RET |
| |
| INV_TXFM_16X16_FN flipadst, dct |
| INV_TXFM_16X16_FN flipadst, adst |
| INV_TXFM_16X16_FN flipadst, flipadst |
| |
| cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if WIN64 |
| mov [rsp+16*16+gprsize], r7 |
| %elif ARCH_X86_32 |
| mov [rsp+16*16+gprsize*1], r1 |
| mov [rsp+16*16+gprsize*2], r6 |
| %endif |
| lea t0, [o(.main)] |
| jmp m(idct_16x16_internal_16bpc).pass1_full |
| |
| .main: |
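| ; flipadst is adst with the 16 outputs in reverse order; each register |
| ; packs two outputs, so the reversal is a register swap plus a pshufd |
| ; q1032 (swapping the two qword halves of each register) |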
| call m(iadst_16x16_internal_16bpc).main |
| %if ARCH_X86_64 |
| mova m1, m0 |
| mova m3, m2 |
| mova m5, m4 |
| mova m7, m6 |
| pshufd m0, m14, q1032 |
| pshufd m2, m12, q1032 |
| pshufd m4, m10, q1032 |
| pshufd m6, m8, q1032 |
| pshufd m8, m7, q1032 |
| pshufd m10, m5, q1032 |
| pshufd m12, m3, q1032 |
| pshufd m14, m1, q1032 |
| %else |
| pshufd m1, m0, q1032 |
| pshufd m3, m2, q1032 |
| pshufd m5, m4, q1032 |
| pshufd m7, m6, q1032 |
| pshufd m0, [r3+11*16], q1032 |
| pshufd m2, [r3+10*16], q1032 |
| pshufd m4, [r3+9*16], q1032 |
| pshufd m6, [r3+8*16], q1032 |
| mova [r3+11*16], m1 |
| mova [r3+10*16], m3 |
| mova [r3+ 9*16], m5 |
| mova [r3+ 8*16], m7 |
| %endif |
| ret |
| |
| .pass2: |
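| ; write rows bottom-up: point dstq at row 15 (strideq*15) and negate |
| ; the stride, then reuse the adst pass 2 unchanged |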
| lea r3, [strideq*3] |
| lea r3, [r3*5] |
| add dstq, r3 |
| neg strideq |
| jmp m(iadst_16x16_internal_16bpc).pass2 |
| |
| INV_TXFM_16X16_FN identity, dct, h |
| INV_TXFM_16X16_FN identity, identity |
| |
| cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 |
| %if WIN64 |
| mov [rsp+16*16+gprsize], r7 |
| %elif ARCH_X86_32 |
| mov [rsp+16*16+gprsize*1], r1 |
| mov [rsp+16*16+gprsize*2], r6 |
| %endif |
| lea t0, [o(.main)] |
| jmp m(idct_16x16_internal_16bpc).pass1_full |
| |
| .main: |
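| ; identity16 scales by 2*sqrt(2), i.e. 11586/4096 (11586 = 2*5793); |
| ; (x*11586 + 10240) >> 14 merges the (x + 2048) >> 12 multiply rounding |
| ; with the (x + 2) >> 2 pass-1 downshift into a single shift |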
| %if ARCH_X86_64 |
| mova m15, [o(pd_11586)] |
| pmulld m0, m15, [cq+ 0*64+r5] |
| pmulld m1, m15, [cq+ 1*64+r5] |
| pmulld m2, m15, [cq+ 2*64+r5] |
| pmulld m3, m15, [cq+ 3*64+r5] |
| pmulld m4, m15, [cq+ 4*64+r5] |
| pmulld m5, m15, [cq+ 5*64+r5] |
| pmulld m6, m15, [cq+ 6*64+r5] |
| pmulld m7, m15, [cq+ 7*64+r5] |
| pmulld m8, m15, [cq+ 8*64+r5] |
| pmulld m9, m15, [cq+ 9*64+r5] |
| pmulld m10, m15, [cq+10*64+r5] |
| pmulld m11, m15, [cq+11*64+r5] |
| pmulld m12, m15, [cq+12*64+r5] |
| pmulld m13, m15, [cq+13*64+r5] |
| pmulld m14, m15, [cq+14*64+r5] |
| pmulld m15, [cq+15*64+r5] |
| mova [r3], m15 |
| mova m15, [o(pd_10240)] |
| REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14 |
| paddd m15, [r3] |
| REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m8, m9, m10, m11, m12, m13, m14, m15 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %else |
| mova m7, [o(pd_11586)] |
| pmulld m0, m7, [cq+ 0*64+r5] |
| pmulld m1, m7, [cq+ 1*64+r5] |
| pmulld m2, m7, [cq+ 2*64+r5] |
| pmulld m3, m7, [cq+ 3*64+r5] |
| pmulld m4, m7, [cq+ 4*64+r5] |
| pmulld m5, m7, [cq+ 5*64+r5] |
| pmulld m6, m7, [cq+ 6*64+r5] |
| pmulld m7, [cq+ 7*64+r5] |
| mova [r3], m7 |
| mova m7, [o(pd_10240)] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| paddd m7, [r3] |
| REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| mova [r3+8*16], m0 |
| mova [r3+9*16], m2 |
| mova [r3+10*16], m4 |
| mova [r3+11*16], m6 |
| mova m7, [o(pd_11586)] |
| pmulld m0, m7, [cq+ 8*64+r5] |
| pmulld m1, m7, [cq+ 9*64+r5] |
| pmulld m2, m7, [cq+10*64+r5] |
| pmulld m3, m7, [cq+11*64+r5] |
| pmulld m4, m7, [cq+12*64+r5] |
| pmulld m5, m7, [cq+13*64+r5] |
| pmulld m6, m7, [cq+14*64+r5] |
| pmulld m7, [cq+15*64+r5] |
| mova [r3], m7 |
| mova m7, [o(pd_10240)] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| paddd m7, [r3] |
| REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| %endif |
| ret |
| |
| .pass2: |
| %if ARCH_X86_64 |
| mova m4, [o(pw_2048)] |
| mova m5, [o(pixel_10bpc_max)] |
| pxor m6, m6 |
| mova m7, [o(pw_1697x16)] |
| mov r7, dstq |
| %else |
| mov [rsp+2*gprsize+16*16], dstq |
| %endif |
| mov r5d, 4 |
| lea r3, [strideq*3] |
| .pass2_loop: |
| mova m0, [cq+0*64+0] |
| mova m1, [cq+1*64+0] |
| mova m2, [cq+2*64+0] |
| mova m3, [cq+3*64+0] |
| call m(iidentity_8x16_internal_16bpc).main |
| %if ARCH_X86_64 |
| call m(idct_8x4_internal_16bpc).round1_and_write_8x4 |
| %else |
| call m(idct_8x4_internal_16bpc).round2_and_write_8x4 |
| %endif |
| REPX {mova [cq+x*16], m6}, 0, 4, 8, 12 |
| add cq, 16 |
| lea dstq, [dstq+strideq*4] |
| dec r5w |
| jg .pass2_loop |
| add cq, 64*3 |
| btc r5d, 16 |
| jc .end |
| %if ARCH_X86_64 |
| lea dstq, [r7+16] |
| %else |
| mov dstq, [rsp+2*gprsize+16*16] |
| add dstq, 16 |
| %endif |
| add r5d, 4 |
| jmp .pass2_loop |
| .end: |
| %if WIN64 |
| mov r7, [rsp+16*16+gprsize] |
| %endif |
| RET |
| |
| cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob |
| %if ARCH_X86_32 |
| LEA r6, $$ |
| %endif |
| mova m5, [o(pw_5)] |
| mova m7, [o(pixel_10bpc_max)] |
| pxor m6, m6 |
| mov r5d, eobd |
| add eobb, 21 |
| cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192 |
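| ; the +21 rounds the eob thresholds up to multiples of 64 for the |
| ; sub eobd, 64 loop below; cmovc restores eob if the low byte wrapped |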
| lea r4, [strideq*3] |
| .loop: |
| mova m0, [cq+128*0] |
| packssdw m0, [cq+128*1] |
| mova m1, [cq+128*2] |
| packssdw m1, [cq+128*3] |
| mova m2, [cq+128*4] |
| packssdw m2, [cq+128*5] |
| mova m3, [cq+128*6] |
| packssdw m3, [cq+128*7] |
| REPX {paddsw x, m5}, m0, m1, m2, m3 |
| REPX {psraw x, 3 }, m0, m1, m2, m3 |
| call .main_zero |
| add cq, 16 |
| lea dstq, [dstq+strideq*4] |
| btc eobd, 16 |
| jnc .loop |
| sub eobd, 64 |
| jge .loop |
| RET |
| ALIGN function_align |
| .main_zero: |
| REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 |
| .main: |
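| ; reorder the packed words back to raster order (a 4x4-pair word |
| ; transpose), then add to dst and clamp to [0, pixel_10bpc_max] |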
| punpckhwd m4, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m2, m3 |
| punpcklwd m2, m3 |
| punpckhwd m3, m0, m4 |
| punpcklwd m0, m4 |
| punpckhwd m4, m2, m1 |
| punpcklwd m2, m1 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| paddw m0, [dstq+strideq*0] |
| paddw m1, [dstq+strideq*1] |
| paddw m2, [dstq+strideq*2] |
| paddw m3, [dstq+r4 ] |
| REPX {pmaxsw x, m6}, m0, m1, m2, m3 |
| REPX {pminsw x, m7}, m0, m1, m2, m3 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+r4 ], m3 |
| ret |
| |
| cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob |
| %if ARCH_X86_32 |
| LEA r6, $$ |
| %endif |
| mova m5, [o(pw_4096)] |
| mova m7, [o(pixel_10bpc_max)] |
| pxor m6, m6 |
| mov r4d, eobd |
| add eobb, 21 |
| cmovc eobd, r4d |
| lea r4, [strideq*3] |
| mov r5, dstq |
| .loop: |
| mova m0, [cq+32*0] |
| packssdw m0, [cq+32*1] |
| mova m1, [cq+32*2] |
| packssdw m1, [cq+32*3] |
| mova m2, [cq+32*4] |
| packssdw m2, [cq+32*5] |
| mova m3, [cq+32*6] |
| packssdw m3, [cq+32*7] |
| REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 |
| REPX {pmulhrsw x, m5}, m0, m1, m2, m3 |
| call m(inv_txfm_add_identity_identity_8x32_16bpc).main |
| lea dstq, [dstq+strideq*4] |
| add cq, 16 |
| btc eobd, 16 |
| jnc .loop |
| add cq, 32*8-32 |
| add r5, 16 |
| mov dstq, r5 |
| sub eobd, 64 |
| jge .loop |
| RET |
| |
| cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob |
| %if ARCH_X86_32 |
| LEA r6, $$ |
| %else |
| mova m8, [o(pw_2896x8)] |
| mova m9, [o(pw_1697x16)] |
| mova m11, [o(pw_8192)] |
| %endif |
| mova m7, [o(pixel_10bpc_max)] |
| lea r4, [strideq*3] |
| pxor m6, m6 |
| %if ARCH_X86_64 |
| paddw m10, m11, m11 ; pw_16384 |
| %endif |
| mov r5, dstq |
| call .main |
| sub eobd, 36 |
| jl .ret |
| add cq, 128*8-32 |
| lea dstq, [r5+16] |
| call .main |
| sub cq, 128*8 |
| lea dstq, [r5+strideq*8] |
| mov r5, dstq |
| call .main |
| sub eobd, 107 ; eob < 143 |
| jl .ret |
| add cq, 128*8-32 |
| lea dstq, [r5+16] |
| call .main |
| sub cq, 128*8 |
| lea dstq, [r5+strideq*8] |
| mov r5, dstq |
| call .main |
| sub eobd, 128 ; eob < 271 |
| jl .ret |
| add cq, 128*8-32 |
| lea dstq, [r5+16] |
| call .main |
| sub cq, 128*8 |
| lea dstq, [r5+strideq*8] |
| mov r5, dstq |
| call .main |
| sub eobd, 128 ; eob < 399 |
| jl .ret |
| add cq, 128*8-32 |
| lea dstq, [r5+16] |
| call .main |
| .ret: |
| RET |
| ALIGN function_align |
| .main: |
| mova m0, [cq+128*0] |
| packssdw m0, [cq+128*1] |
| mova m1, [cq+128*2] |
| packssdw m1, [cq+128*3] |
| mova m2, [cq+128*4] |
| packssdw m2, [cq+128*5] |
| mova m3, [cq+128*6] |
| packssdw m3, [cq+128*7] |
| %if ARCH_X86_64 |
| REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 |
| pmulhrsw m4, m9, m0 |
| pmulhrsw m5, m9, m1 |
| REPX {pmulhrsw x, m10}, m4, m5 |
| %else |
| mova m6, [o(pw_2896x8)] |
| REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 |
| mova m5, [o(pw_1697x16)] |
| pmulhrsw m4, m5, m0 |
| pmulhrsw m5, m1 |
| mova m6, [o(pw_16384)] |
| REPX {pmulhrsw x, m6 }, m4, m5 |
| %endif |
| paddsw m0, m4 |
| paddsw m1, m5 |
| %if ARCH_X86_64 |
| pmulhrsw m4, m9, m2 |
| pmulhrsw m5, m9, m3 |
| REPX {pmulhrsw x, m10}, m4, m5 |
| %else |
| mova m5, [o(pw_1697x16)] |
| pmulhrsw m4, m5, m2 |
| pmulhrsw m5, m3 |
| REPX {pmulhrsw x, m6 }, m4, m5 |
| %endif |
| paddsw m2, m4 |
| paddsw m3, m5 |
| %if ARCH_X86_64 |
| REPX {pmulhrsw x, m11}, m0, m1, m2, m3 |
| %else |
| psrlw m6, 1 ; pw_8192 |
| REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 |
| pxor m6, m6 |
| %endif |
| call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero |
| lea dstq, [dstq+strideq*4] |
| add cq, 16 |
| btc eobd, 16 |
| jnc .main |
| ret |
| |
| cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob |
| %if ARCH_X86_32 |
| LEA r6, $$ |
| %else |
| mova m8, [o(pw_2896x8)] |
| mova m9, [o(pw_1697x16)] |
| mova m10, [o(pw_2048)] |
| %endif |
| mova m7, [o(pixel_10bpc_max)] |
| lea r4, [strideq*3] |
| pxor m6, m6 |
| mov r5, dstq |
| call .main |
| sub eobd, 36 |
| jl .ret |
| call .main |
| add cq, 64*8-64 |
| lea dstq, [r5+16*1] |
| call .main |
| sub eobd, 107 ; eob < 143 |
| jl .ret |
| call .main |
| add cq, 64*8-64 |
| lea dstq, [r5+16*2] |
| call .main |
| sub eobd, 128 ; eob < 271 |
| jl .ret |
| call .main |
| add cq, 64*8-64 |
| lea dstq, [r5+16*3] |
| call .main |
| sub eobd, 128 ; eob < 399 |
| jl .ret |
| call .main |
| .ret: |
| RET |
| ALIGN function_align |
| .main: |
| mova m0, [cq+64*0] |
| packssdw m0, [cq+64*1] |
| mova m1, [cq+64*2] |
| packssdw m1, [cq+64*3] |
| mova m2, [cq+64*4] |
| packssdw m2, [cq+64*5] |
| mova m3, [cq+64*6] |
| packssdw m3, [cq+64*7] |
| %if ARCH_X86_64 |
| REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 |
| %else |
| mova m6, [o(pw_2896x8)] |
| REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 |
| %endif |
| REPX {paddsw x, x }, m0, m1, m2, m3 |
| %if ARCH_X86_64 |
| pmulhrsw m4, m9, m0 |
| pmulhrsw m5, m9, m1 |
| %else |
| mova m6, [o(pw_1697x16)] |
| pmulhrsw m4, m6, m0 |
| pmulhrsw m5, m6, m1 |
| %endif |
| REPX {paddsw x, x }, m0, m1 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| %if ARCH_X86_64 |
| pmulhrsw m4, m9, m2 |
| pmulhrsw m5, m9, m3 |
| %else |
| pmulhrsw m4, m6, m2 |
| pmulhrsw m6, m3 |
| %endif |
| REPX {paddsw x, x }, m2, m3 |
| paddsw m2, m4 |
| %if ARCH_X86_64 |
| paddsw m3, m5 |
| REPX {pmulhrsw x, m10}, m0, m1, m2, m3 |
| %else |
| paddsw m3, m6 |
| mova m6, [o(pw_2048)] |
| REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 |
| pxor m6, m6 |
| %endif |
| REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 |
| call m(inv_txfm_add_identity_identity_8x32_16bpc).main |
| lea dstq, [dstq+strideq*4] |
| add cq, 16 |
| btc eobd, 16 |
| jnc .main |
| ret |
| |
| cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob |
| %undef cmp |
| %if ARCH_X86_32 |
| LEA r6, $$ |
| %endif |
| mova m5, [o(pw_8192)] |
| mova m7, [o(pixel_10bpc_max)] |
| pxor m6, m6 |
| lea r4, [strideq*3] |
| mov r5, dstq |
| call .main ; 0 |
| cmp eobd, 36 |
| jl .ret |
| add cq, 128*8-32 ; 0 1 |
| lea dstq, [r5+16] ; 1 |
| call .main |
| call .main2 |
| cmp eobd, 136 |
| jl .ret |
| add cq, 128*16-64 ; 0 1 2 |
| lea dstq, [r5+16*2] ; 1 2 |
| call .main ; 2 |
| call .main2 |
| call .main2 |
| cmp eobd, 300 |
| jl .ret |
| add cq, 128*24-96 ; 0 1 2 3 |
| add r5, 16*3 ; 1 2 3 |
| mov dstq, r5 ; 2 3 |
| call .main ; 3 |
| call .main2 |
| call .main2 |
| call .main2 |
| cmp eobd, 535 |
| jl .ret |
| add cq, 128*24-96 ; 0 1 2 3 |
| lea dstq, [r5+strideq*8] ; 1 2 3 4 |
| mov r5, dstq ; 2 3 4 |
| call .main ; 3 4 |
| call .main2 |
| call .main2 |
| cmp eobd, 755 |
| jl .ret |
| add cq, 128*16-64 ; 0 1 2 3 |
| lea dstq, [r5+strideq*8] ; 1 2 3 4 |
| mov r5, dstq ; 2 3 4 5 |
| call .main ; 3 4 5 |
| call .main2 |
| cmp eobd, 911 |
| jl .ret |
| add cq, 128*8-32 ; 0 1 2 3 |
| lea dstq, [r5+strideq*8] ; 1 2 3 4 |
| call .main ; 2 3 4 5 |
| .ret: ; 3 4 5 6 |
| RET |
| ALIGN function_align |
| .main2: |
| sub cq, 128*8 |
| sub dstq, 16 |
| .main: |
| mova m0, [cq+128*0] |
| packssdw m0, [cq+128*1] |
| mova m1, [cq+128*2] |
| packssdw m1, [cq+128*3] |
| mova m2, [cq+128*4] |
| packssdw m2, [cq+128*5] |
| mova m3, [cq+128*6] |
| packssdw m3, [cq+128*7] |
| REPX {pmulhrsw x, m5}, m0, m1, m2, m3 |
| call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero |
| lea dstq, [dstq+strideq*4] |
| add cq, 16 |
| btc eobd, 16 |
| jnc .main |
| ret |
| |
| cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \ |
| dst, stride, c, eob |
| %if ARCH_X86_32 |
| LEA r6, $$ |
| %define base $$ |
| DECLARE_REG_TMP 0, 4 |
| %else |
| lea r6, [tbl_Nx32_odd_offset] |
| %define base tbl_Nx32_odd_offset |
| DECLARE_REG_TMP 4, 7 |
| %if WIN64 |
| mov [rsp+gprsize*1+35*16], r7 |
| %endif |
| %endif |
| %define o2(x) r6-base+x |
| test eobd, eobd |
| jz .dconly |
| |
| %if ARCH_X86_32 |
| mov [rsp+gprsize*1+35*16], r0 |
| %endif |
| %undef cmp |
| ; remove entirely-zero iterations |
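| ; r5 indexes 4-row input groups from the last one down; tbl_8x32_2d |
| ; holds, per group, the lowest eob that can reach it, so groups beyond |
| ; the current eob are never transformed and only their transposed |
| ; output slots get cleared |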
| mov r5d, 7*2 |
| cmp eobw, word [o2(tbl_8x32_2d)+r5] |
| jge .end_zero_loop |
| pxor m0, m0 |
| .zero_loop: |
| movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] |
| movzx t1d, t0b |
| shr t0d, 8 |
| mova [rsp+ 3*16+r5*8], m0 |
| mova [rsp+11*16+r5*8], m0 |
| mova [rsp+ 3*16+t0*8], m0 |
| mova [rsp+ 3*16+t1*8], m0 |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_8x32_2d)+r5] |
| jl .zero_loop |
| .end_zero_loop: |
| ; actual first pass after skipping all-zero data |
| mov [rsp+gprsize*0+35*16], eobd |
| mov r3, rsp |
| .loop_pass1: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+0*128+r5*8] |
| mova m1, [cq+1*128+r5*8] |
| mova m2, [cq+2*128+r5*8] |
| mova m3, [cq+3*128+r5*8] |
| mova m4, [cq+4*128+r5*8] |
| mova m5, [cq+5*128+r5*8] |
| mova m6, [cq+6*128+r5*8] |
| mova m7, [cq+7*128+r5*8] |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| mova m1, [o(pd_2)] |
| REPX {paddd x, m1}, m0, m6, m5, m3 |
| call m(idct_8x4_internal_16bpc).round |
| REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| |
| movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] |
| movzx t1d, t0b |
| shr t0d, 8 |
| mova [r3+ 3*16+r5*8], m0 |
| mova [r3+11*16+r5*8], m2 |
| mova [r3+ 3*16+t1*8], m1 |
| mova [r3+ 3*16+t0*8], m3 |
| pxor m7, m7 |
| REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7 |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass 2 code starts here |
| ; m0 is already loaded from last iteration of first pass |
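| ; pick the 8-bit idct32 kernel variant by eob: the _fast/_veryfast |
| ; variants can assume the upper input rows are zero and skip whole |
| ; butterfly groups |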
| %if ARCH_X86_32 |
| mov r0, [rsp+gprsize*1+35*16] |
| %endif |
| mov eobd, [rsp+gprsize*0+35*16] |
| cmp eobd, 43 |
| jl .load_veryfast |
| cmp eobd, 107 |
| jl .load_fast |
| ; load normal |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] |
| jmp .run |
| .load_fast: |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] |
| jmp .run |
| .load_veryfast: |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] |
| ; fall-through |
| .run: |
| call .pass2 |
| %if WIN64 |
| mov r7, [rsp+gprsize*1+35*16] |
| %endif |
| RET |
| |
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| mova m1, [rsp+gprsize+16* 4] |
| mova m2, [rsp+gprsize+16* 5] |
| mova m3, [rsp+gprsize+16* 6] |
| mova m4, [rsp+gprsize+16* 7] |
| mova m5, [rsp+gprsize+16* 8] |
| mova m6, [rsp+gprsize+16* 9] |
| mova m7, [rsp+gprsize+16*10] |
| call m_suffix(idct_8x8_internal_8bpc, _ssse3).main |
| mova [rsp+gprsize+ 3*16], m0 |
| mova [rsp+gprsize+ 4*16], m1 |
| mova [rsp+gprsize+ 5*16], m2 |
| mova [rsp+gprsize+ 6*16], m3 |
| mova [rsp+gprsize+ 7*16], m4 |
| mova [rsp+gprsize+ 8*16], m5 |
| mova [rsp+gprsize+ 9*16], m6 |
| mova m0, [rsp+gprsize+11*16] |
| mova m1, [rsp+gprsize+12*16] |
| mova m2, [rsp+gprsize+13*16] |
| mova m3, [rsp+gprsize+14*16] |
| mova m4, [rsp+gprsize+15*16] |
| mova m5, [rsp+gprsize+16*16] |
| mova m6, [rsp+gprsize+17*16] |
| mova m7, [rsp+gprsize+18*16] |
| call m_suffix(idct_16x8_internal_8bpc, _ssse3).main |
| mova m7, [rsp+gprsize+ 0*16] |
| mova [rsp+gprsize+11*16], m0 |
| mova [rsp+gprsize+12*16], m1 |
| mova [rsp+gprsize+13*16], m2 |
| mova [rsp+gprsize+14*16], m3 |
| mova [rsp+gprsize+15*16], m4 |
| mova [rsp+gprsize+16*16], m5 |
| mova [rsp+gprsize+17*16], m6 |
| mova [rsp+gprsize+18*16], m7 |
| call r4 |
| %if ARCH_X86_64 |
| mova m8, [o(pw_2048)] |
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| %endif |
| lea r3, [strideq*3] |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| lea dstq, [dstq+strideq*8] |
| mova m0, [rsp+gprsize+11*16] |
| mova m1, [rsp+gprsize+12*16] |
| mova m2, [rsp+gprsize+13*16] |
| mova m3, [rsp+gprsize+14*16] |
| mova m4, [rsp+gprsize+15*16] |
| mova m5, [rsp+gprsize+16*16] |
| mova m6, [rsp+gprsize+17*16] |
| mova m7, [rsp+gprsize+18*16] |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| lea dstq, [dstq+strideq*8] |
| mova m0, [rsp+gprsize+19*16] |
| mova m1, [rsp+gprsize+20*16] |
| mova m2, [rsp+gprsize+21*16] |
| mova m3, [rsp+gprsize+22*16] |
| mova m4, [rsp+gprsize+23*16] |
| mova m5, [rsp+gprsize+24*16] |
| mova m6, [rsp+gprsize+25*16] |
| mova m7, [rsp+gprsize+26*16] |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| lea dstq, [dstq+strideq*8] |
| mova m0, [rsp+gprsize+27*16] |
| mova m1, [rsp+gprsize+28*16] |
| mova m2, [rsp+gprsize+29*16] |
| mova m3, [rsp+gprsize+30*16] |
| mova m4, [rsp+gprsize+31*16] |
| mova m5, [rsp+gprsize+32*16] |
| mova m6, [rsp+gprsize+33*16] |
| mova m7, [rsp+gprsize+34*16] |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| ret |
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 32 |
| add r5d, 640 |
| sar r5d, 10 |
| add rsp, (31+2*ARCH_X86_64)*16 |
| jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2 |
| |
| cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \ |
| dst, stride, c, eob |
| LEA r6, base |
| test eobd, eobd |
| jz .dconly |
| |
| %if ARCH_X86_32 |
| mov [rsp+gprsize*1+76*16], r0 |
| %elif WIN64 |
| mov [rsp+gprsize*1+76*16], r7 |
| %endif |
| %undef cmp |
| ; remove entirely-zero iterations |
| mov r5d, 7*2 |
| cmp eobw, word [o2(tbl_16x32_2d)+r5] |
| jge .end_zero_loop |
| pxor m0, m0 |
| .zero_loop: |
| movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] |
| movzx t1d, t0b |
| shr t0d, 8 |
| mova [rsp+12*16+r5*8], m0 |
| mova [rsp+20*16+r5*8], m0 |
| mova [rsp+12*16+t0*8], m0 |
| mova [rsp+12*16+t1*8], m0 |
| mova [rsp+44*16+r5*8], m0 |
| mova [rsp+52*16+r5*8], m0 |
| mova [rsp+44*16+t0*8], m0 |
| mova [rsp+44*16+t1*8], m0 |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_16x32_2d)+r5] |
| jl .zero_loop |
| .end_zero_loop: |
| ; actual first pass after skipping all-zero data |
| mov [rsp+gprsize*0+76*16], eobd |
| mov r3, rsp |
| .loop_pass1: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+ 1*128+r5*8] |
| mova m1, [cq+ 3*128+r5*8] |
| mova m2, [cq+ 5*128+r5*8] |
| mova m3, [cq+ 7*128+r5*8] |
| mova m4, [cq+ 9*128+r5*8] |
| mova m5, [cq+11*128+r5*8] |
| mova m6, [cq+13*128+r5*8] |
| mova m7, [cq+15*128+r5*8] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(idct_16x4_internal_16bpc).main_oddhalf |
| |
| mova m0, [cq+ 0*128+r5*8] |
| mova m1, [cq+ 2*128+r5*8] |
| mova m2, [cq+ 4*128+r5*8] |
| mova m3, [cq+ 6*128+r5*8] |
| mova m4, [cq+ 8*128+r5*8] |
| mova m5, [cq+10*128+r5*8] |
| mova m6, [cq+12*128+r5*8] |
| mova m7, [cq+14*128+r5*8] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| call m(idct_16x4_internal_16bpc).round |
| %if ARCH_X86_64 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %endif |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] |
| movzx t1d, t0b |
| shr t0d, 8 |
| %if ARCH_X86_64 |
| mova [rsp+12*16+r5*8], m0 |
| mova [rsp+20*16+r5*8], m2 |
| mova [rsp+12*16+t1*8], m1 |
| mova [rsp+12*16+t0*8], m3 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [rsp+44*16+r5*8], m8 |
| mova [rsp+52*16+r5*8], m10 |
| mova [rsp+44*16+t1*8], m9 |
| mova [rsp+44*16+t0*8], m11 |
| %else |
| mova [rsp+44*16+r5*8], m0 |
| mova [rsp+52*16+r5*8], m2 |
| mova [rsp+44*16+t1*8], m1 |
| mova [rsp+44*16+t0*8], m3 |
| mova m0, [r3+ 8*16] |
| mova m2, [r3+ 9*16] |
| mova m4, [r3+10*16] |
| mova m6, [r3+11*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+12*16+r5*8], m0 |
| mova [rsp+20*16+r5*8], m2 |
| mova [rsp+12*16+t1*8], m1 |
| mova [rsp+12*16+t0*8], m3 |
| %endif |
| pxor m7, m7 |
| REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass=2 |
| add rsp, 9*16 |
| %if ARCH_X86_64 |
| mov r6, dstq |
| %else |
| mov dstq, [rsp+gprsize*1+67*16] |
| %endif |
| mov eobd, [rsp+gprsize*0+67*16] |
| cmp eobd, 44 |
| jl .load_veryfast |
| cmp eobd, 151 |
| jl .load_fast |
| ; load normal |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] |
| jmp .run |
| .load_fast: |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] |
| jmp .run |
| .load_veryfast: |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] |
| ; fall-through |
| .run: |
| %if ARCH_X86_64 |
| lea r2, [dstq+32] |
| mov r7, -4 |
| %else |
| lea r2, [rsp+67*16] |
| mov dword [r2+0*gprsize], 2 |
| %endif |
| jmp .loop_pass2_entry |
| .loop_pass2: |
| mova m0, [rsp+16* 3] |
| .loop_pass2_entry: |
| %if ARCH_X86_32 |
| mov dstq, [r2+1*gprsize] |
| %endif |
| call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2 |
| add rsp, 32*16 |
| %if ARCH_X86_64 |
| add r7, 2 |
| lea dstq, [r2+r7*8] |
| jl .loop_pass2 |
| %if WIN64 |
| mov r7, [rsp+gprsize*1+3*16] |
| %endif |
| %else |
| add dword [r2+1*gprsize], 16 |
| dec dword [r2+0*gprsize] |
| jg .loop_pass2 |
| %endif |
| %assign stack_size (stack_size-73*16) |
| %if STACK_ALIGNMENT >= 16 |
| %assign stack_size_padded (stack_size_padded-73*16) |
| %assign stack_offset (stack_offset-73*16) |
| %else |
| %xdefine rstkm [rsp + stack_size] |
| %endif |
| RET |
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 32 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 181 |
| add rsp, (65+4*ARCH_X86_64)*16 |
| jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly |
| |
| cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ |
| dst, stride, c, eob |
| %if ARCH_X86_32 |
| LEA r6, $$ |
| %endif |
| test eobd, eobd |
| jz .dconly |
| |
| ; remove entirely-zero iterations |
| %undef cmp |
| %if ARCH_X86_64 |
| xor r5d, r5d |
| cmp eobd, 10 |
| setge r5b |
| %else |
| mov r5d, 1 |
| cmp eobd, 10 |
| sbb r5d, 0 |
| %endif |
| add r5d, r5d |
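| ; r5d = 2 if eob >= 10, else 0: with eob < 10 only the first 4-row |
| ; group can hold coefficients, so pass 1 runs a single iteration |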
| |
| ; actual first pass after skipping all-zero data |
| .loop_pass1: |
| mova m0, [cq+32* 1+r5*8] |
| mova m1, [cq+32* 7+r5*8] |
| mova m2, [cq+32* 9+r5*8] |
| mova m3, [cq+32*15+r5*8] |
| mova m4, [cq+32*17+r5*8] |
| mova m5, [cq+32*23+r5*8] |
| mova m6, [cq+32*25+r5*8] |
| mova m7, [cq+32*31+r5*8] |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mov r3, rsp |
| call .main_oddhalf_part1 |
| mova m0, [cq+32* 3+r5*8] |
| mova m1, [cq+32* 5+r5*8] |
| mova m2, [cq+32*11+r5*8] |
| mova m3, [cq+32*13+r5*8] |
| mova m4, [cq+32*19+r5*8] |
| mova m5, [cq+32*21+r5*8] |
| mova m6, [cq+32*27+r5*8] |
| mova m7, [cq+32*29+r5*8] |
| call .main_oddhalf_part2 |
| mova m0, [cq+32* 2+r5*8] |
| mova m1, [cq+32* 6+r5*8] |
| mova m2, [cq+32*10+r5*8] |
| mova m3, [cq+32*14+r5*8] |
| mova m4, [cq+32*18+r5*8] |
| mova m5, [cq+32*22+r5*8] |
| mova m6, [cq+32*26+r5*8] |
| mova m7, [cq+32*30+r5*8] |
| add r3, 16*(16+4*ARCH_X86_32) |
| call m(idct_16x4_internal_16bpc).main_oddhalf |
| mova m0, [cq+32* 0+r5*8] |
| mova m1, [cq+32* 4+r5*8] |
| mova m2, [cq+32* 8+r5*8] |
| mova m3, [cq+32*12+r5*8] |
| mova m4, [cq+32*16+r5*8] |
| mova m5, [cq+32*20+r5*8] |
| mova m6, [cq+32*24+r5*8] |
| mova m7, [cq+32*28+r5*8] |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| sub r3, 16*(16+4*ARCH_X86_32) |
| call .round_dct32 |
| %if ARCH_X86_64 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [cq+32* 8+r5*8], m8 |
| mova [cq+32* 9+r5*8], m9 |
| mova [cq+32*10+r5*8], m10 |
| mova [cq+32*11+r5*8], m11 |
| mova m8, [r3+16* 9] ; 8 9 |
| mova m10, [r3+16*11] ; 10 11 |
| mova m12, [r3+16*13] ; 12 13 |
| mova m14, [r3+16*15] ; 14 15 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [cq+32* 4+r5*8], m8 |
| mova [cq+32* 5+r5*8], m9 |
| mova [cq+32* 6+r5*8], m10 |
| mova [cq+32* 7+r5*8], m11 |
| mova m8, [r3+16* 8] ; 24 25 |
| mova m10, [r3+16*10] ; 26 27 |
| mova m12, [r3+16*12] ; 28 29 |
| mova m14, [r3+16*14] ; 30 31 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [cq+32*12+r5*8], m8 |
| mova [cq+32*13+r5*8], m9 |
| mova [cq+32*14+r5*8], m10 |
| mova [cq+32*15+r5*8], m11 |
| %else |
| sub r3, 8*16 |
| mova m0, [r3+ 8*16] |
| mova m2, [r3+10*16] |
| mova m4, [r3+12*16] |
| mova m6, [r3+14*16] |
| packssdw m0, [r3+ 9*16] |
| packssdw m2, [r3+11*16] |
| packssdw m4, [r3+13*16] |
| packssdw m6, [r3+15*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+32* 4+r5*8], m0 |
| mova [cq+32* 5+r5*8], m1 |
| mova [cq+32* 6+r5*8], m2 |
| mova [cq+32* 7+r5*8], m3 |
| mova m0, [r3+16*16] |
| mova m2, [r3+18*16] |
| mova m4, [r3+20*16] |
| mova m6, [r3+22*16] |
| packssdw m0, [r3+17*16] |
| packssdw m2, [r3+19*16] |
| packssdw m4, [r3+21*16] |
| packssdw m6, [r3+23*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+32* 8+r5*8], m0 |
| mova [cq+32* 9+r5*8], m1 |
| mova [cq+32*10+r5*8], m2 |
| mova [cq+32*11+r5*8], m3 |
| mova m0, [r3+31*16] |
| mova m2, [r3+29*16] |
| mova m4, [r3+27*16] |
| mova m6, [r3+25*16] |
| packssdw m0, [r3+30*16] |
| packssdw m2, [r3+28*16] |
| packssdw m4, [r3+26*16] |
| packssdw m6, [r3+24*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+32*12+r5*8], m0 |
| mova [cq+32*13+r5*8], m1 |
| mova [cq+32*14+r5*8], m2 |
| mova [cq+32*15+r5*8], m3 |
| mova m0, [r3+ 0*16] |
| mova m2, [r3+ 2*16] |
| mova m4, [r3+ 4*16] |
| mova m6, [r3+ 6*16] |
| packssdw m0, [r3+ 1*16] |
| packssdw m2, [r3+ 3*16] |
| packssdw m4, [r3+ 5*16] |
| packssdw m6, [r3+ 7*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| %endif |
| pxor m7, m7 |
| ; clear lower half of [cq] |
| REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \ |
| 24, 25, 26, 27, 28, 29, 30, 31 |
| test r5d, r5d |
| jz .end_pass1 |
| mova [cq+32* 0+r5*8], m0 |
| mova [cq+32* 1+r5*8], m1 |
| mova [cq+32* 2+r5*8], m2 |
| mova [cq+32* 3+r5*8], m3 |
| sub r5d, 2 |
| jmp .loop_pass1 |
| .end_pass1: |
| |
| ; pass=2; this must be invoked via call, otherwise the stack pointer |
| ; would have the wrong offset in the 8-bit code |
| mov r4d, 4 |
| call m(idct_16x8_internal_16bpc).pass2_main |
| RET |
| |
| .main_oddhalf_part1_fast: ; lower half zero |
| pmulld m7, m0, [o(pd_4091)] |
| pmulld m0, [o(pd_201)] |
| pmulld m4, m3, [o(pd_m2751)] |
| %if ARCH_X86_32 |
| pmulld m3, [o(pd_3035)] |
| mova m5, [o(pd_2048)] |
| REPX {paddd x, m5}, m0, m7 |
| REPX {psrad x, 12}, m0, m7 |
| mova [r3+3*16], m7 |
| mova m7, m3 |
| mova m3, m5 |
| %else |
| pmulld m3, [o(pd_3035)] |
| %endif |
| pmulld m6, m1, [o(pd_m1380)] |
| pmulld m1, [o(pd_3857)] |
| pmulld m5, m2, [o(pd_3703)] |
| pmulld m2, [o(pd_1751)] |
| jmp .main_oddhalf_part1_fast2 |
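| ; idct32 odd-half butterflies, first half: consumes the odd-index |
| ; inputs and leaves the t16..t31 stage results in [r3+16*0..7] for |
| ; .main_oddhalf_part2 |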
| .main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 |
| %if ARCH_X86_64 |
| ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a |
| ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a |
| ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a |
| ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a |
| .main_oddhalf_part1_fast2: |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| psubd m8, m0, m4 ; t17 |
| paddd m0, m4 ; t16 |
| psubd m4, m6, m2 ; t18 |
| paddd m6, m2 ; t19 |
| psubd m2, m1, m5 ; t29 |
| paddd m1, m5 ; t28 |
| psubd m5, m7, m3 ; t30 |
| paddd m7, m3 ; t31 |
| REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 |
| REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 |
| mova m15, [o(pd_4017)] |
| mova m10, [o(pd_799)] |
| ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a |
| ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a |
| psubd m3, m0, m6 ; t19a |
| paddd m0, m6 ; t16a |
| psubd m6, m7, m1 ; t28a |
| paddd m7, m1 ; t31a |
| psubd m1, m5, m4 ; t18 |
| paddd m5, m4 ; t17 |
| psubd m4, m8, m2 ; t29 |
| paddd m8, m2 ; t30 |
| REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 |
| REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 |
| mova m15, [o(pd_3784)] |
| mova m10, [o(pd_1567)] |
| ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a |
| ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 |
| mova [r3+16*0], m0 |
| mova [r3+16*1], m5 |
| mova [r3+16*2], m4 |
| mova [r3+16*3], m6 |
| mova [r3+16*4], m3 |
| mova [r3+16*5], m1 |
| mova [r3+16*6], m8 |
| mova [r3+16*7], m7 |
| %else |
| mova [r3+0*16], m2 |
| mova [r3+1*16], m3 |
| mova [r3+2*16], m4 |
| mova [r3+3*16], m5 |
| mova m3, [o(pd_2048)] |
| ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a |
| ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a |
| mova m4, [r3+2*16] |
| mova m5, [r3+3*16] |
| mova [r3+2*16], m6 |
| mova [r3+3*16], m7 |
| mova m2, [r3+0*16] |
| mova m7, [r3+1*16] |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m1 |
| ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a |
| ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a |
| mova m0, [r3+0*16] |
| mova m1, [r3+1*16] |
| mova m6, [r3+2*16] |
| .main_oddhalf_part1_fast2: |
| REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7 |
| REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7 |
| psubd m3, m0, m4 ; t17 |
| mova [r3+0*16], m3 |
| mova m3, [r3+3*16] |
| paddd m0, m4 ; t16 |
| psubd m4, m6, m2 ; t18 |
| paddd m6, m2 ; t19 |
| psubd m2, m1, m5 ; t29 |
| paddd m1, m5 ; t28 |
| psubd m5, m3, m7 ; t30 |
| paddd m7, m3 ; t31 |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 |
| pmaxsd m3, [r3+0*16] |
| mova [r3+0*16], m3 |
| mova m3, [o(clip_18b_max)] |
| REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 |
| pminsd m3, [r3+0*16] |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m1 |
| mova [r3+2*16], m6 |
| mova [r3+3*16], m7 |
| mova m0, [o(pd_2048)] |
| ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a |
| ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a |
| psubd m1, m5, m4 ; t18 |
| paddd m5, m4 ; t17 |
| psubd m4, m3, m2 ; t29 |
| paddd m3, m2 ; t30 |
| mova m0, [r3+0*16] |
| mova m2, [r3+1*16] |
| mova m6, [r3+2*16] |
| mova m7, [r3+3*16] |
| mova [r3+0*16], m3 |
| psubd m3, m0, m6 ; t19a |
| paddd m0, m6 ; t16a |
| psubd m6, m7, m2 ; t28a |
| paddd m7, m2 ; t31a |
| mova m2, [o(clip_18b_min)] |
| REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 |
| pmaxsd m2, [r3+0*16] |
| mova [r3+0*16], m2 |
| mova m2, [o(clip_18b_max)] |
| REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 |
| pminsd m2, [r3+0*16] |
| mova [r3+16*0], m0 |
| mova [r3+16*1], m5 |
| mova [r3+16*6], m2 |
| mova [r3+16*7], m7 |
| mova m7, [o(pd_2048)] |
| ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a |
| ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28 |
| mova [r3+16*2], m4 |
| mova [r3+16*3], m6 |
| mova [r3+16*4], m3 |
| mova [r3+16*5], m1 |
| %endif |
| ret |
| .main_oddhalf_part2_fast: ; lower half zero |
| pmulld m7, m0, [o(pd_m601)] |
| pmulld m0, [o(pd_4052)] |
| pmulld m4, m3, [o(pd_3290)] |
| %if ARCH_X86_32 |
| pmulld m3, [o(pd_2440)] |
| mova m5, [o(pd_2048)] |
| REPX {paddd x, m5}, m0, m7 |
| REPX {psrad x, 12}, m0, m7 |
| mova [r3+11*16], m7 |
| mova m7, m3 |
| mova m3, m5 |
| %else |
| pmulld m3, [o(pd_2440)] |
| %endif |
| pmulld m6, m1, [o(pd_3973)] |
| pmulld m1, [o(pd_995)] |
| pmulld m5, m2, [o(pd_m2106)] |
| pmulld m2, [o(pd_3513)] |
| jmp .main_oddhalf_part2_fast2 |
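| ; odd-half butterflies, second half: on return [r3+16*0..15] hold the |
| ; finished t16..t31 terms consumed by .round_dct32 |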
| .main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 |
| %if ARCH_X86_64 |
| ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a |
| ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a |
| ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a |
| ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a |
| .main_oddhalf_part2_fast2: |
| REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 |
| psubd m8, m0, m4 ; t25 |
| paddd m0, m4 ; t24 |
| psubd m4, m6, m2 ; t26 |
| paddd m6, m2 ; t27 |
| psubd m2, m1, m5 ; t21 |
| paddd m1, m5 ; t20 |
| psubd m5, m7, m3 ; t22 |
| paddd m7, m3 ; t23 |
| REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 |
| REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 |
| mova m15, [o(pd_2276)] |
| mova m10, [o(pd_3406)] |
| ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a |
| ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a |
| psubd m3, m0, m6 ; t27a |
| paddd m0, m6 ; t24a |
| psubd m6, m7, m1 ; t20a |
| paddd m7, m1 ; t23a |
| psubd m1, m5, m4 ; t21 |
| paddd m5, m4 ; t22 |
| psubd m4, m8, m2 ; t26 |
| paddd m8, m2 ; t25 |
| REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 |
| REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 |
| mova m15, [o(pd_3784)] |
| mova m10, [o(pd_1567)] |
| ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a |
| ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 |
| mova m9, [r3+16*0] ; t16a |
| mova m10, [r3+16*1] ; t17 |
| psubd m2, m9, m7 ; t23 |
| paddd m9, m7 ; t16 |
| psubd m7, m10, m5 ; t22a |
| paddd m10, m5 ; t17a |
| REPX {pmaxsd x, m12}, m9, m10, m2, m7 |
| REPX {pminsd x, m13}, m9, m10, m2, m7 |
| mova [r3+16*0], m9 |
| mova [r3+16*1], m10 |
| mova m9, [r3+16*2] ; t18a |
| mova m10, [r3+16*3] ; t19 |
| psubd m5, m9, m1 ; t21 |
| paddd m9, m1 ; t18 |
| psubd m1, m10, m6 ; t20a |
| paddd m10, m6 ; t19a |
| REPX {pmaxsd x, m12}, m9, m10, m5, m1 |
| REPX {pminsd x, m13}, m9, m10, m5, m1 |
| mova [r3+16*2], m9 |
| mova [r3+16*3], m10 |
| mova m9, [r3+16*4] ; t28 |
| mova m10, [r3+16*5] ; t29a |
| psubd m6, m9, m3 ; t27a |
| paddd m9, m3 ; t28a |
| psubd m3, m10, m4 ; t26 |
| paddd m10, m4 ; t29 |
| REPX {pmaxsd x, m12}, m9, m10, m6, m3 |
| REPX {pminsd x, m13}, m9, m10, m6, m3 |
| REPX {pmulld x, m14}, m6, m3, m1, m5 |
| paddd m6, m11 |
| paddd m3, m11 |
| psubd m4, m6, m1 ; t20 |
| paddd m6, m1 ; t27 |
| psubd m1, m3, m5 ; t21a |
| paddd m3, m5 ; t26a |
| REPX {psrad x, 12 }, m4, m1, m3, m6 |
| mova [r3+16*4], m4 |
| mova [r3+16*5], m1 |
| mova m4, [r3+16*6] ; t30 |
| mova m1, [r3+16*7] ; t31a |
| psubd m5, m4, m8 ; t25a |
| paddd m4, m8 ; t30a |
| psubd m8, m1, m0 ; t24 |
| paddd m1, m0 ; t31 |
| REPX {pmaxsd x, m12}, m8, m5, m4, m1 |
| REPX {pminsd x, m13}, m8, m5, m4, m1 |
| REPX {pmulld x, m14}, m5, m8, m7, m2 |
| paddd m5, m11 |
| paddd m8, m11 |
| psubd m0, m5, m7 ; t22 |
| paddd m5, m7 ; t25 |
| psubd m7, m8, m2 ; t23a |
| paddd m2, m8 ; t24a |
| REPX {psrad x, 12 }, m0, m7, m2, m5 |
| mova [r3+16*6], m0 |
| mova [r3+16*7], m7 |
| mova [r3+16*8], m2 |
| mova [r3+16*9], m5 |
| mova [r3+16*10], m3 |
| mova [r3+16*11], m6 |
| mova [r3+16*12], m9 |
| mova [r3+16*13], m10 |
| mova [r3+16*14], m4 |
| mova [r3+16*15], m1 |
| %else |
| mova [r3+ 8*16], m2 |
| mova [r3+ 9*16], m3 |
| mova [r3+10*16], m4 |
| mova [r3+11*16], m5 |
| mova m3, [o(pd_2048)] |
| ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a |
| ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a |
| mova m2, [r3+ 8*16] |
| mova m4, [r3+10*16] |
| mova m5, [r3+11*16] |
| mova [r3+ 8*16], m0 |
| mova [r3+10*16], m6 |
| mova [r3+11*16], m7 |
| mova m7, [r3+ 9*16] |
| mova [r3+ 9*16], m1 |
| ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a |
| ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a |
| mova m0, [r3+ 8*16] |
| mova m1, [r3+ 9*16] |
| mova m6, [r3+10*16] |
| .main_oddhalf_part2_fast2: |
| REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6 |
| REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6 |
| psubd m3, m0, m4 ; t25 |
| mova [r3+ 8*16], m3 |
| mova m3, [r3+11*16] |
| paddd m0, m4 ; t24 |
| psubd m4, m6, m2 ; t26 |
| paddd m6, m2 ; t27 |
| psubd m2, m1, m5 ; t21 |
| paddd m1, m5 ; t20 |
| psubd m5, m3, m7 ; t22 |
| paddd m7, m3 ; t23 |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 |
| pmaxsd m3, [r3+ 8*16] |
| mova [r3+ 8*16], m3 |
| mova m3, [o(clip_18b_max)] |
| REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 |
| pminsd m3, [r3+ 8*16] |
| mova [r3+ 8*16], m0 |
| mova [r3+ 9*16], m1 |
| mova [r3+10*16], m6 |
| mova [r3+11*16], m7 |
| mova m7, [o(pd_2048)] |
| ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a |
| ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a |
| psubd m1, m5, m4 ; t21 |
| paddd m5, m4 ; t22 |
| psubd m4, m3, m2 ; t26 |
| paddd m3, m2 ; t25 |
| mova m0, [r3+ 8*16] |
| mova m2, [r3+ 9*16] |
| mova m6, [r3+10*16] |
| mova m7, [r3+11*16] |
| mova [r3+ 8*16], m3 |
| psubd m3, m0, m6 ; t27a |
| paddd m0, m6 ; t24a |
| psubd m6, m7, m2 ; t20a |
| paddd m7, m2 ; t23a |
| mova m2, [o(clip_18b_min)] |
| REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 |
| pmaxsd m2, [r3+ 8*16] |
| mova [r3+ 8*16], m2 |
| mova m2, [o(clip_18b_max)] |
| REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 |
| pminsd m2, [r3+ 8*16] |
| mova [r3+ 8*16], m0 |
| mova [r3+ 9*16], m2 |
| mova [r3+14*16], m5 |
| mova [r3+15*16], m7 |
| mova m0, [o(pd_2048)] |
| ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a |
| ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20 |
| mova [r3+10*16], m3 |
| mova m0, [o(clip_18b_min)] |
| mova m2, [o(clip_18b_max)] |
| mova m5, [r3+16*2] ; t18a |
| mova m7, [r3+16*3] ; t19 |
| psubd m3, m5, m1 ; t21 |
| paddd m5, m1 ; t18 |
| psubd m1, m7, m6 ; t20a |
| paddd m7, m6 ; t19a |
| REPX {pmaxsd x, m0}, m5, m7, m3, m1 |
| REPX {pminsd x, m2}, m5, m7, m3, m1 |
| mova [r3+16*2], m5 |
| mova [r3+16*3], m7 |
| mova [r3+11*16], m3 |
| mova m3, [r3+10*16] |
| mova m5, [r3+16*4] ; t28 |
| mova m7, [r3+16*5] ; t29a |
| psubd m6, m5, m3 ; t27a |
| paddd m5, m3 ; t28a |
| psubd m3, m7, m4 ; t26 |
| paddd m7, m4 ; t29 |
| REPX {pmaxsd x, m0}, m5, m7, m6, m3 |
| REPX {pminsd x, m2}, m5, m7, m6, m3 |
| mova [r3+16*12], m5 |
| mova [r3+16*13], m7 |
| mova m5, [o(pd_2048)] |
| mova m7, [o(pd_2896)] |
| mova m4, [r3+11*16] |
| REPX {pmulld x, m7}, m6, m3, m1, m4 |
| paddd m6, m5 |
| paddd m3, m5 |
| psubd m5, m6, m1 ; t20 |
| paddd m6, m1 ; t27 |
| psubd m1, m3, m4 ; t21a |
| paddd m3, m4 ; t26a |
| REPX {psrad x, 12}, m5, m1, m3, m6 |
| mova [r3+16*4], m5 |
| mova [r3+16*5], m1 |
| mova [r3+16*10], m3 |
| mova [r3+16*11], m6 |
| |
| mova m5, [r3+14*16] |
| mova m6, [r3+15*16] |
| mova m3, [r3+16*0] ; t16a |
| mova m4, [r3+16*1] ; t17 |
| psubd m1, m3, m6 ; t23 |
| paddd m3, m6 ; t16 |
| psubd m6, m4, m5 ; t22a |
| paddd m4, m5 ; t17a |
| REPX {pmaxsd x, m0}, m3, m4, m1, m6 |
| REPX {pminsd x, m2}, m3, m4, m1, m6 |
| mova [r3+16*0], m3 |
| mova [r3+16*1], m4 |
| mova m5, [r3+ 8*16] |
| mova m3, [r3+ 9*16] |
| mova [r3+ 8*16], m1 |
| mova [r3+ 9*16], m6 |
| mova m4, [r3+16*6] ; t30 |
| mova m1, [r3+16*7] ; t31a |
| psubd m6, m1, m5 ; t24 |
| paddd m1, m5 ; t31 |
| psubd m5, m4, m3 ; t25a |
| paddd m4, m3 ; t30a |
| REPX {pmaxsd x, m0}, m6, m5, m4, m1 |
| REPX {pminsd x, m2}, m6, m5, m4, m1 |
| mova [r3+16*14], m4 |
| mova [r3+16*15], m1 |
| mova m4, [o(pd_2048)] |
| mova m1, [r3+ 9*16] |
| mova m2, [r3+ 8*16] |
| REPX {pmulld x, m7}, m5, m6, m1, m2 |
| paddd m5, m4 |
| paddd m6, m4 |
| psubd m0, m5, m1 ; t22 |
| paddd m5, m1 ; t25 |
| psubd m1, m6, m2 ; t23a |
| paddd m2, m6 ; t24a |
| REPX {psrad x, 12}, m0, m1, m2, m5 |
| mova [r3+16*6], m0 |
| mova [r3+16*7], m1 |
| mova [r3+16*8], m2 |
| mova [r3+16*9], m5 |
| %endif |
| ret |
| |
| ; final sumsub for idct16 as well as idct32, plus final downshift |
| %macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift |
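| ; each invocation combines idct16 output %1 with the stored odd-half |
| ; terms to produce out(%1), out(15-%1), out(16+%1) and out(31-%1), |
| ; already shifted down by %6 |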
| mova m%4, [r3+16*(23-%1)] |
| pmaxsd m%1, m12 |
| pminsd m%1, m13 |
| psubd m%3, m%1, m%4 ; idct16 out15 - n |
| paddd m%1, m%4 ; idct16 out0 + n |
| pmaxsd m%1, m12 |
| pmaxsd m%3, m12 |
| pminsd m%1, m13 |
| pminsd m%3, m13 |
| paddd m%1, m11 |
| paddd m%3, m11 |
| mova m%5, [r3+16*( 0+%1)] |
| mova m%2, [r3+16*(15-%1)] |
| psubd m%4, m%1, m%2 ; out31 - n |
| paddd m%1, m%2 ; out0 + n |
| paddd m%2, m%3, m%5 ; out15 - n |
| psubd m%3, m%5 ; out16 + n |
| REPX {psrad x, %6}, m%1, m%3, m%2, m%4 |
| %endmacro |
| |
| .round_dct32: |
| %if ARCH_X86_64 |
| psrld m11, 10 ; pd_2 |
| IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31 |
| mova [r3+ 0*16], m6 |
| mova [r3+23*16], m7 |
| IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30 |
| packssdw m0, m1 ; 0 1 |
| packssdw m14, m15 ; 14 15 |
| packssdw m8, m6 ; 16 17 |
| packssdw m7, m9 ; 30 31 |
| mova [r3+16*15], m14 |
| mova [r3+16*14], m7 |
| IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29 |
| IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28 |
| packssdw m2, m3 ; 2 3 |
| packssdw m14, m15 ; 12 13 |
| packssdw m10, m1 ; 18 19 |
| packssdw m9, m7 ; 28 29 |
| mova [r3+16*13], m14 |
| mova [r3+16*12], m9 |
| IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27 |
| IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26 |
| packssdw m4, m5 ; 4 5 |
| packssdw m14, m15 ; 10 11 |
| packssdw m1, m3 ; 20 21 |
| packssdw m9, m7 ; 26 27 |
| mova [r3+16*11], m14 |
| mova [r3+16*10], m9 |
| mova m6, [r3+ 0*16] |
| mova m7, [r3+23*16] |
| IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25 |
| IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24 |
| packssdw m6, m7 ; 6 7 |
| packssdw m11, m15 ; 8 9 |
| packssdw m14, m3 ; 22 23 |
| packssdw m9, m5 ; 24 25 |
| mova [r3+16*9], m11 |
| mova [r3+16*8], m9 |
| mova m12, m1 |
| ret |
| %else |
| mova [r3+16*16], m0 |
| mova [r3+17*16], m1 |
| mova [r3+18*16], m2 |
| mova [r3+19*16], m3 |
| mova [r3+20*16], m4 |
| mova [r3+21*16], m5 |
| mova [r3+22*16], m6 |
| mova [r3+23*16], m7 |
| mova m1, [o(pd_2)] |
| mova m2, [o(clip_18b_min)] |
| mova m3, [o(clip_18b_max)] |
| |
| mov r4, 15*16 |
| .loop_dct32_end: |
| mova m0, [r3+16*16] |
| mova m6, [r3+16*24] |
| pmaxsd m0, m2 |
| pminsd m0, m3 |
| psubd m5, m0, m6 ; idct16 out15 - n |
| paddd m0, m6 ; idct16 out0 + n |
| pmaxsd m0, m2 |
| pmaxsd m5, m2 |
| pminsd m0, m3 |
| pminsd m5, m3 |
| paddd m0, m1 |
| paddd m5, m1 |
| mova m7, [r3] |
| mova m4, [r3+r4] |
| psubd m6, m0, m4 ; out31 - n |
| paddd m0, m4 ; out0 + n |
| paddd m4, m5, m7 ; out15 - n |
| psubd m5, m7 ; out16 + n |
| REPX {psrad x, 2}, m0, m5, m4, m6 |
| mova [r3], m0 |
| mova [r3+r4], m4 |
| mova [r3+16*16], m5 |
| mova [r3+24*16], m6 |
| add r3, 16 |
| sub r4, 32 |
| jg .loop_dct32_end |
| ret |
| %endif |
| |
| .dconly: |
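| ; dc-only shortcut: the dc term is propagated through both passes as |
| ; plain scaling; 181/256 and 2896/4096 both approximate 1/sqrt(2), and |
| ; the bias constants fold the per-pass rounding shifts together, e.g. |
| ; (x + 640) >> 10 == (((x + 128) >> 8) + 2) >> 2 |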
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 8 |
| .dconly1: |
| add r5d, 640 |
| sar r5d, 10 |
| .dconly2: |
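| ; (dc*2896 + 34816) >> 16 == (((dc*2896 + 2048) >> 12) + 8) >> 4, i.e. |
| ; the 12-bit multiply rounding plus the final >> 4 pixel rounding in |
| ; one step; pshuflw q1111 then broadcasts that high word to all lanes |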
| imul r5d, 2896 |
| add r5d, 34816 |
| movd m0, r5d |
| pshuflw m0, m0, q1111 |
| punpcklqdq m0, m0 |
| mova m6, [o(pixel_10bpc_max)] |
| pxor m5, m5 |
| .dconly_loop: |
| mova m1, [dstq+16*0] |
| mova m2, [dstq+16*1] |
| mova m3, [dstq+16*2] |
| mova m4, [dstq+16*3] |
| REPX {paddw x, m0}, m1, m2, m3, m4 |
| REPX {pminsw x, m6}, m1, m2, m3, m4 |
| REPX {pmaxsw x, m5}, m1, m2, m3, m4 |
| mova [dstq+16*0], m1 |
| mova [dstq+16*1], m2 |
| mova [dstq+16*2], m3 |
| mova [dstq+16*3], m4 |
| add dstq, strideq |
| dec r3d |
| jg .dconly_loop |
| RET |
| |
| cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ |
| dst, stride, c, eob |
| LEA r6, base |
| test eobd, eobd |
| jz .dconly |
| |
| ; remove entirely-zero iterations |
| %undef cmp |
| mov r5d, 8 |
| .zero_loop: |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_32x16_2d)+r5] |
| jl .zero_loop |
| |
| ; actual first pass after skipping all-zero data |
| .loop_pass1: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+64* 1+r5*8] |
| mova m1, [cq+64* 7+r5*8] |
| mova m2, [cq+64* 9+r5*8] |
| mova m3, [cq+64*15+r5*8] |
| mova m4, [cq+64*17+r5*8] |
| mova m5, [cq+64*23+r5*8] |
| mova m6, [cq+64*25+r5*8] |
| mova m7, [cq+64*31+r5*8] |
| mov r3, rsp |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 |
| |
| mova m0, [cq+64* 3+r5*8] |
| mova m1, [cq+64* 5+r5*8] |
| mova m2, [cq+64*11+r5*8] |
| mova m3, [cq+64*13+r5*8] |
| mova m4, [cq+64*19+r5*8] |
| mova m5, [cq+64*21+r5*8] |
| mova m6, [cq+64*27+r5*8] |
| mova m7, [cq+64*29+r5*8] |
| %if ARCH_X86_32 |
| add r3, 16*8 |
| %endif |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| %if ARCH_X86_32 |
| sub r3, 16*8 |
| %endif |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 |
| add r3, 16*(16+4*ARCH_X86_32) |
| |
| mova m0, [cq+64* 2+r5*8] |
| mova m1, [cq+64* 6+r5*8] |
| mova m2, [cq+64*10+r5*8] |
| mova m3, [cq+64*14+r5*8] |
| mova m4, [cq+64*18+r5*8] |
| mova m5, [cq+64*22+r5*8] |
| mova m6, [cq+64*26+r5*8] |
| mova m7, [cq+64*30+r5*8] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(idct_16x4_internal_16bpc).main_oddhalf |
| |
| mova m0, [cq+64* 0+r5*8] |
| mova m1, [cq+64* 4+r5*8] |
| mova m2, [cq+64* 8+r5*8] |
| mova m3, [cq+64*12+r5*8] |
| mova m4, [cq+64*16+r5*8] |
| mova m5, [cq+64*20+r5*8] |
| mova m6, [cq+64*24+r5*8] |
| mova m7, [cq+64*28+r5*8] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| sub r3, 16*(16+4*ARCH_X86_32) |
| call .round_dct32 |
| |
| %if ARCH_X86_64 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [cq+64* 8+r5*8], m8 |
| mova [cq+64* 9+r5*8], m9 |
| mova [cq+64*10+r5*8], m10 |
| mova [cq+64*11+r5*8], m11 |
| mova m8, [r3+16* 9] ; 8 9 |
| mova m10, [r3+16*11] ; 10 11 |
| mova m12, [r3+16*13] ; 12 13 |
| mova m14, [r3+16*15] ; 14 15 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [cq+64* 4+r5*8], m8 |
| mova [cq+64* 5+r5*8], m9 |
| mova [cq+64* 6+r5*8], m10 |
| mova [cq+64* 7+r5*8], m11 |
| mova m8, [r3+16* 8] ; 24 25 |
| mova m10, [r3+16*10] ; 26 27 |
| mova m12, [r3+16*12] ; 28 29 |
| mova m14, [r3+16*14] ; 30 31 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [cq+64*12+r5*8], m8 |
| mova [cq+64*13+r5*8], m9 |
| mova [cq+64*14+r5*8], m10 |
| mova [cq+64*15+r5*8], m11 |
| %else |
| sub r3, 8*16 |
| mova m0, [r3+ 8*16] |
| mova m2, [r3+10*16] |
| mova m4, [r3+12*16] |
| mova m6, [r3+14*16] |
| packssdw m0, [r3+ 9*16] |
| packssdw m2, [r3+11*16] |
| packssdw m4, [r3+13*16] |
| packssdw m6, [r3+15*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+64* 4+r5*8], m0 |
| mova [cq+64* 5+r5*8], m1 |
| mova [cq+64* 6+r5*8], m2 |
| mova [cq+64* 7+r5*8], m3 |
| mova m0, [r3+16*16] |
| mova m2, [r3+18*16] |
| mova m4, [r3+20*16] |
| mova m6, [r3+22*16] |
| packssdw m0, [r3+17*16] |
| packssdw m2, [r3+19*16] |
| packssdw m4, [r3+21*16] |
| packssdw m6, [r3+23*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+64* 8+r5*8], m0 |
| mova [cq+64* 9+r5*8], m1 |
| mova [cq+64*10+r5*8], m2 |
| mova [cq+64*11+r5*8], m3 |
| mova m0, [r3+31*16] |
| mova m2, [r3+29*16] |
| mova m4, [r3+27*16] |
| mova m6, [r3+25*16] |
| packssdw m0, [r3+30*16] |
| packssdw m2, [r3+28*16] |
| packssdw m4, [r3+26*16] |
| packssdw m6, [r3+24*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [cq+64*12+r5*8], m0 |
| mova [cq+64*13+r5*8], m1 |
| mova [cq+64*14+r5*8], m2 |
| mova [cq+64*15+r5*8], m3 |
| mova m0, [r3+ 0*16] |
| mova m2, [r3+ 2*16] |
| mova m4, [r3+ 4*16] |
| mova m6, [r3+ 6*16] |
| packssdw m0, [r3+ 1*16] |
| packssdw m2, [r3+ 3*16] |
| packssdw m4, [r3+ 5*16] |
| packssdw m6, [r3+ 7*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| %endif |
| mova [cq+64* 0+r5*8], m0 |
| mova [cq+64* 1+r5*8], m1 |
| mova [cq+64* 2+r5*8], m2 |
| mova [cq+64* 3+r5*8], m3 |
| pxor m0, m0 |
| REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ |
| 24, 25, 26, 27, 28, 29, 30, 31 |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass=2; this must be invoked via call, otherwise the stack pointer |
| ; would have the wrong offset in the 8-bit code |
| call .pass2 |
| RET |
| |
| .pass2: |
| %if ARCH_X86_64 |
| mova m8, [o(pw_2048)] |
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| %if WIN64 |
| mov [rsp+16*16+gprsize], r7 |
| %endif |
| mov r7, dstq |
| %else |
| mov [rsp+2*gprsize+16*16], dstq |
| %endif |
| lea r3, [strideq*3] |
| mov r4d, 4 |
| jmp m(idct_16x16_internal_16bpc).loop_pass2 |
| |
| .round_dct32: |
| %if ARCH_X86_64 |
| psrld m11, 11 ; pd_1 |
| IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31 |
| mova [r3+ 0*16], m6 |
| mova [r3+23*16], m7 |
| IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30 |
| packssdw m0, m1 ; 0 1 |
| packssdw m14, m15 ; 14 15 |
| packssdw m8, m6 ; 16 17 |
| packssdw m7, m9 ; 30 31 |
| mova [r3+16*15], m14 |
| mova [r3+16*14], m7 |
| IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29 |
| IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28 |
| packssdw m2, m3 ; 2 3 |
| packssdw m14, m15 ; 12 13 |
| packssdw m10, m1 ; 18 19 |
| packssdw m9, m7 ; 28 29 |
| mova [r3+16*13], m14 |
| mova [r3+16*12], m9 |
| IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27 |
| IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26 |
| packssdw m4, m5 ; 4 5 |
| packssdw m14, m15 ; 10 11 |
| packssdw m1, m3 ; 20 21 |
| packssdw m9, m7 ; 26 27 |
| mova [r3+16*11], m14 |
| mova [r3+16*10], m9 |
| mova m6, [r3+ 0*16] |
| mova m7, [r3+23*16] |
| IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25 |
| IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24 |
| packssdw m6, m7 ; 6 7 |
| packssdw m11, m15 ; 8 9 |
| packssdw m14, m3 ; 22 23 |
| packssdw m9, m5 ; 24 25 |
| mova [r3+16*9], m11 |
| mova [r3+16*8], m9 |
| mova m12, m1 |
| ret |
| %else |
| mova [r3+16*16], m0 |
| mova [r3+17*16], m1 |
| mova [r3+18*16], m2 |
| mova [r3+19*16], m3 |
| mova [r3+20*16], m4 |
| mova [r3+21*16], m5 |
| mova [r3+22*16], m6 |
| mova [r3+23*16], m7 |
| pcmpeqd m1, m1 ; -1 |
| mova m2, [o(clip_18b_min)] |
| mova m3, [o(clip_18b_max)] |
| |
| mov r4, 15*16 |
| .loop_dct32_end: |
| mova m0, [r3+16*16] |
| mova m6, [r3+16*24] |
| psubd m5, m0, m6 ; idct16 out15 - n |
| paddd m0, m6 ; idct16 out0 + n |
| pmaxsd m0, m2 |
| pmaxsd m5, m2 |
| pminsd m0, m3 |
| pminsd m5, m3 |
| psubd m0, m1 |
| psubd m5, m1 |
| mova m7, [r3] |
| mova m4, [r3+r4] |
| psubd m6, m0, m4 ; out31 - n |
| paddd m0, m4 ; out0 + n |
| paddd m4, m5, m7 ; out15 - n |
| psubd m5, m7 ; out16 + n |
| REPX {psrad x, 1}, m0, m5, m4, m6 |
| mova [r3], m0 |
| mova [r3+r4], m4 |
| mova [r3+16*16], m5 |
| mova [r3+24*16], m6 |
| add r3, 16 |
| sub r4, 32 |
| jg .loop_dct32_end |
| ret |
| %endif |
| |
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 16 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 181 |
| add r5d, 384 |
| sar r5d, 9 |
| jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 |
| |
| cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ |
| dst, stride, c, eob |
| LEA r6, base |
| test eobd, eobd |
| jz .dconly |
| |
| ; remove entirely-zero iterations |
| %if ARCH_X86_32 |
| mov [rsp+5*32*16+1*gprsize], dstq |
| %elif WIN64 |
| mov [rsp+5*32*16+1*gprsize], r7 |
| %endif |
| %undef cmp |
| mov r5d, 14 |
| cmp eobw, word [o2(tbl_32x32_2d)+r5] |
| jge .end_zero_loop |
| pxor m0, m0 |
| .zero_loop: |
| movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] |
| movzx t1d, t0b |
| shr t0d, 8 |
| mova [rsp+32*16+r5*8+0*32*16], m0 |
| mova [rsp+40*16+r5*8+0*32*16], m0 |
| mova [rsp+32*16+t0*8+0*32*16], m0 |
| mova [rsp+32*16+t1*8+0*32*16], m0 |
| mova [rsp+32*16+r5*8+1*32*16], m0 |
| mova [rsp+40*16+r5*8+1*32*16], m0 |
| mova [rsp+32*16+t0*8+1*32*16], m0 |
| mova [rsp+32*16+t1*8+1*32*16], m0 |
| mova [rsp+32*16+r5*8+2*32*16], m0 |
| mova [rsp+40*16+r5*8+2*32*16], m0 |
| mova [rsp+32*16+t0*8+2*32*16], m0 |
| mova [rsp+32*16+t1*8+2*32*16], m0 |
| mova [rsp+32*16+r5*8+3*32*16], m0 |
| mova [rsp+40*16+r5*8+3*32*16], m0 |
| mova [rsp+32*16+t0*8+3*32*16], m0 |
| mova [rsp+32*16+t1*8+3*32*16], m0 |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_32x32_2d)+r5] |
| jl .zero_loop |
| .end_zero_loop: |
| |
| ; actual first pass after skipping all-zero data |
| mov [rsp+gprsize*0+5*32*16], eobd |
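| ; each pass-1 iteration transforms four coefficient columns; the dct32 |
| ; odd half is split between part1 (rows 1,7,9,15,17,23,25,31) and part2 |
| ; (rows 3,5,11,13,19,21,27,29), the dct16 odd half takes rows 2,6,...,30 |
| ; and the dct8 core rows 0,4,...,28. |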
| .loop_pass1: |
| mova m0, [cq+128* 1+r5*8] |
| mova m1, [cq+128* 7+r5*8] |
| mova m2, [cq+128* 9+r5*8] |
| mova m3, [cq+128*15+r5*8] |
| mova m4, [cq+128*17+r5*8] |
| mova m5, [cq+128*23+r5*8] |
| mova m6, [cq+128*25+r5*8] |
| mova m7, [cq+128*31+r5*8] |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mov r3, rsp |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 |
| mova m0, [cq+128* 3+r5*8] |
| mova m1, [cq+128* 5+r5*8] |
| mova m2, [cq+128*11+r5*8] |
| mova m3, [cq+128*13+r5*8] |
| mova m4, [cq+128*19+r5*8] |
| mova m5, [cq+128*21+r5*8] |
| mova m6, [cq+128*27+r5*8] |
| mova m7, [cq+128*29+r5*8] |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 |
| mova m0, [cq+128* 2+r5*8] |
| mova m1, [cq+128* 6+r5*8] |
| mova m2, [cq+128*10+r5*8] |
| mova m3, [cq+128*14+r5*8] |
| mova m4, [cq+128*18+r5*8] |
| mova m5, [cq+128*22+r5*8] |
| mova m6, [cq+128*26+r5*8] |
| mova m7, [cq+128*30+r5*8] |
| add r3, 16*(16+4*ARCH_X86_32) |
| call m(idct_16x4_internal_16bpc).main_oddhalf |
| mova m0, [cq+128* 0+r5*8] |
| mova m1, [cq+128* 4+r5*8] |
| mova m2, [cq+128* 8+r5*8] |
| mova m3, [cq+128*12+r5*8] |
| mova m4, [cq+128*16+r5*8] |
| mova m5, [cq+128*20+r5*8] |
| mova m6, [cq+128*24+r5*8] |
| mova m7, [cq+128*28+r5*8] |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| sub r3, 16*(16+4*ARCH_X86_32) |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32 |
| movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] |
| movzx t1d, t0b |
| shr t0d, 8 |
| %if ARCH_X86_64 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [rsp+32*16+r5*8+2*32*16], m8 |
| mova [rsp+40*16+r5*8+2*32*16], m10 |
| mova [rsp+32*16+t1*8+2*32*16], m9 |
| mova [rsp+32*16+t0*8+2*32*16], m11 |
| mova m8, [r3+16* 9] ; 8 9 |
| mova m10, [r3+16*11] ; 10 11 |
| mova m12, [r3+16*13] ; 12 13 |
| mova m14, [r3+16*15] ; 14 15 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [rsp+32*16+r5*8+1*32*16], m8 |
| mova [rsp+40*16+r5*8+1*32*16], m10 |
| mova [rsp+32*16+t1*8+1*32*16], m9 |
| mova [rsp+32*16+t0*8+1*32*16], m11 |
| mova m8, [r3+16* 8] ; 24 25 |
| mova m10, [r3+16*10] ; 26 27 |
| mova m12, [r3+16*12] ; 28 29 |
| mova m14, [r3+16*14] ; 30 31 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [rsp+32*16+r5*8+3*32*16], m8 |
| mova [rsp+40*16+r5*8+3*32*16], m10 |
| mova [rsp+32*16+t1*8+3*32*16], m9 |
| mova [rsp+32*16+t0*8+3*32*16], m11 |
| %else |
| sub r3, 8*16 |
| mova m0, [r3+ 8*16] |
| mova m2, [r3+10*16] |
| mova m4, [r3+12*16] |
| mova m6, [r3+14*16] |
| packssdw m0, [r3+ 9*16] |
| packssdw m2, [r3+11*16] |
| packssdw m4, [r3+13*16] |
| packssdw m6, [r3+15*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+32*16+r5*8+1*32*16], m0 |
| mova [rsp+40*16+r5*8+1*32*16], m2 |
| mova [rsp+32*16+t1*8+1*32*16], m1 |
| mova [rsp+32*16+t0*8+1*32*16], m3 |
| mova m0, [r3+16*16] |
| mova m2, [r3+18*16] |
| mova m4, [r3+20*16] |
| mova m6, [r3+22*16] |
| packssdw m0, [r3+17*16] |
| packssdw m2, [r3+19*16] |
| packssdw m4, [r3+21*16] |
| packssdw m6, [r3+23*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+32*16+r5*8+2*32*16], m0 |
| mova [rsp+40*16+r5*8+2*32*16], m2 |
| mova [rsp+32*16+t1*8+2*32*16], m1 |
| mova [rsp+32*16+t0*8+2*32*16], m3 |
| mova m0, [r3+31*16] |
| mova m2, [r3+29*16] |
| mova m4, [r3+27*16] |
| mova m6, [r3+25*16] |
| packssdw m0, [r3+30*16] |
| packssdw m2, [r3+28*16] |
| packssdw m4, [r3+26*16] |
| packssdw m6, [r3+24*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+32*16+r5*8+3*32*16], m0 |
| mova [rsp+40*16+r5*8+3*32*16], m2 |
| mova [rsp+32*16+t1*8+3*32*16], m1 |
| mova [rsp+32*16+t0*8+3*32*16], m3 |
| mova m0, [r3+ 0*16] |
| mova m2, [r3+ 2*16] |
| mova m4, [r3+ 4*16] |
| mova m6, [r3+ 6*16] |
| packssdw m0, [r3+ 1*16] |
| packssdw m2, [r3+ 3*16] |
| packssdw m4, [r3+ 5*16] |
| packssdw m6, [r3+ 7*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| %endif |
| pxor m7, m7 |
| ; zero the coefficient columns consumed by this iteration |
| REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ |
| 8, 9, 10, 11, 12, 13, 14, 15, \ |
| 16, 17, 18, 19, 20, 21, 22, 23, \ |
| 24, 25, 26, 27, 28, 29, 30, 31 |
| mova [rsp+32*16+r5*8+0*32*16], m0 |
| mova [rsp+40*16+r5*8+0*32*16], m2 |
| mova [rsp+32*16+t1*8+0*32*16], m1 |
| mova [rsp+32*16+t0*8+0*32*16], m3 |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass=2 code starts here |
| mov eobd, [rsp+gprsize*0+5*32*16] |
| add rsp, 29*16 |
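| ; pass 2 reuses the 8bpc ssse3 idct, since pass-1 output fits in 16 bits; |
| ; the eob thresholds select variants that assume more of the input is |
| ; zero: <36 -> main_veryfast, <136 -> main_fast, else the full main. |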
| cmp eobd, 36 |
| jl .load_veryfast |
| cmp eobd, 136 |
| jl .load_fast |
| ; load normal |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] |
| jmp .run |
| .load_fast: |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] |
| jmp .run |
| .load_veryfast: |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] |
| ; fall-through |
| .run: |
| %if ARCH_X86_64 |
| lea r2, [dstq+64] |
| mov r7, -8 |
| %else |
| lea r2, [rsp+(4*32+3)*16] |
| mov dword [r2+0*gprsize], 4 |
| %endif |
| jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry |
| |
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 32 |
| add rsp, (5*32+1-(24+8*ARCH_X86_32))*16 |
| jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1 |
| |
| cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \ |
| 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \ |
| dst, stride, c, eob |
| LEA r6, base |
| test eobd, eobd |
| jz .dconly |
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 4, 1, 2, 0 |
| mov [rsp+gprsize*1+(64*2+12)*16], r0 |
| mov [rsp+gprsize*2+(64*2+12)*16], r1 |
| mov [rsp+gprsize*3+(64*2+12)*16], r2 |
| %else |
| DECLARE_REG_TMP 8, 9, 4, 7 |
| mov [rsp+gprsize*1+(64*2+12)*16], r9 |
| %if WIN64 |
| mov [rsp+gprsize*2+(64*2+12)*16], r7 |
| mov [rsp+gprsize*3+(64*2+12)*16], r8 |
| %endif |
| %endif |
| %undef cmp |
| ; remove entirely-zero iterations |
| mov r5d, 7*2 |
| cmp eobw, word [o2(tbl_16x32_2d)+r5] |
| jge .end_zero_loop |
| pxor m0, m0 |
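| ; tbl_Nx64_offset packs two byte offsets per word, two words per column |
| ; group; the loop zeroes the matching slots in both halves of the pass-1 |
| ; buffer so the skipped groups read back as zero in pass 2. |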
| .zero_loop: |
| movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] |
| movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] |
| movzx t0d, t1b |
| movzx t2d, t3b |
| shr t1d, 8 |
| shr t3d, 8 |
| mova [rsp+12*16+t0*8], m0 |
| mova [rsp+12*16+t1*8], m0 |
| mova [rsp+12*16+t2*8], m0 |
| mova [rsp+12*16+t3*8], m0 |
| mova [rsp+76*16+t0*8], m0 |
| mova [rsp+76*16+t1*8], m0 |
| mova [rsp+76*16+t2*8], m0 |
| mova [rsp+76*16+t3*8], m0 |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_16x32_2d)+r5] |
| jl .zero_loop |
| .end_zero_loop: |
| ; actual first pass after skipping all-zero data |
| mov [rsp+gprsize*0+(64*2+12)*16], eobd |
| mov r3, rsp |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 4, 1, 6, 0 |
| mov r2, [rsp+gprsize*3+(64*2+12)*16] |
| mov [rsp+gprsize*3+(64*2+12)*16], r6 |
| %endif |
| .loop_pass1: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+ 1*128+r5*8] |
| mova m1, [cq+ 3*128+r5*8] |
| mova m2, [cq+ 5*128+r5*8] |
| mova m3, [cq+ 7*128+r5*8] |
| mova m4, [cq+ 9*128+r5*8] |
| mova m5, [cq+11*128+r5*8] |
| mova m6, [cq+13*128+r5*8] |
| mova m7, [cq+15*128+r5*8] |
| call m(idct_16x4_internal_16bpc).main_oddhalf |
| |
| mova m0, [cq+ 0*128+r5*8] |
| mova m1, [cq+ 2*128+r5*8] |
| mova m2, [cq+ 4*128+r5*8] |
| mova m3, [cq+ 6*128+r5*8] |
| mova m4, [cq+ 8*128+r5*8] |
| mova m5, [cq+10*128+r5*8] |
| mova m6, [cq+12*128+r5*8] |
| mova m7, [cq+14*128+r5*8] |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| call m(idct_16x16_internal_16bpc).round |
| %if ARCH_X86_64 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| packssdw m8, m9 |
| packssdw m10, m11 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %endif |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] |
| movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] |
| movzx t0d, t1b |
| movzx t2d, t3b |
| shr t1d, 8 |
| shr t3d, 8 |
| %if ARCH_X86_64 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [rsp+76*16+t0*8], m8 |
| mova [rsp+76*16+t1*8], m9 |
| mova [rsp+76*16+t2*8], m10 |
| mova [rsp+76*16+t3*8], m11 |
| %else |
| mova [rsp+76*16+t0*8], m0 |
| mova [rsp+76*16+t1*8], m1 |
| mova [rsp+76*16+t2*8], m2 |
| mova [rsp+76*16+t3*8], m3 |
| mova m0, [rsp+ 8*16] |
| mova m2, [rsp+ 9*16] |
| mova m4, [rsp+10*16] |
| mova m6, [rsp+11*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| %endif |
| mova [rsp+12*16+t0*8], m0 |
| mova [rsp+12*16+t1*8], m1 |
| mova [rsp+12*16+t2*8], m2 |
| mova [rsp+12*16+t3*8], m3 |
| %if ARCH_X86_32 |
| mov r6, [rsp+gprsize*3+(64*2+12)*16] |
| %endif |
| pxor m7, m7 |
| REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ |
| 8, 9, 10, 11, 12, 13, 14, 15 |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass=2 |
| mov eobd, [rsp+gprsize*0+(64*2+12)*16] |
| cmp eobd, 151 |
| jl .fast |
| ; fall-through |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 8, 9 |
| %else |
| DECLARE_REG_TMP 1, 5 |
| %endif |
| lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] |
| lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] |
| jmp .run |
| .fast: |
| lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] |
| lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] |
| .run: |
| add rsp, 9*16 |
| |
| %if ARCH_X86_64 |
| lea r2, [dstq+32] |
| mov r7, -4 |
| %else |
| lea r2, [rsp+(64*2+3)*16] |
| mov [r2+4*gprsize], t0 |
| mov [r2+5*gprsize], t1 |
| mov r1, [r2+2*gprsize] |
| mov dword [r2+0*gprsize], 2 |
| %endif |
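| ; pass 2 walks the 16 output columns as two 8-pixel-wide strips: on |
| ; x86-64 r7 steps from -4 to 0 and dstq is recomputed from r2, on x86-32 |
| ; the strip count and dst pointer live in the context block at r2. |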
| .loop_pass2: |
| %if ARCH_X86_32 |
| mov dstq, [r2+1*gprsize] |
| %endif |
| call .pass2 |
| add rsp, 64*16 |
| %if ARCH_X86_64 |
| add r7, 2 |
| lea dstq, [r2+r7*8] |
| jl .loop_pass2 |
| %else |
| add dword [r2+1*gprsize], 16 |
| dec dword [r2+0*gprsize] |
| jg .loop_pass2 |
| %endif |
| %assign stack_size (stack_size-(64*2+9)*16) |
| %if STACK_ALIGNMENT >= 16 |
| %assign stack_size_padded (stack_size_padded-(64*2+9)*16) |
| %assign stack_offset (stack_offset-(64*2+9)*16) |
| %else |
| %xdefine rstkm [rsp + stack_size] |
| %endif |
| %if ARCH_X86_64 |
| mov r9, [rsp+gprsize*1+3*16] |
| %if WIN64 |
| mov r7, [rsp+gprsize*2+3*16] |
| mov r8, [rsp+gprsize*3+3*16] |
| %endif |
| %endif |
| RET |
| |
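| ; .pass2 runs one 8-column strip through the 8bpc ssse3 column chain: |
| ; idct_8x8 and idct_16x8 handle the even stages (inputs beyond row 31 |
| ; are always zero for 64-point transforms, hence the zeroed registers), |
| ; then the indirect calls run the 8x32 and 16x64 stages selected above |
| ; and .loop_write adds all 64 output rows to the destination. |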
| .pass2: |
| %if ARCH_X86_32 |
| lea r5, [o(itx8_start)] |
| %endif |
| mova m0, [rsp+gprsize+16* 3] |
| mova m1, [rsp+gprsize+16* 4] |
| mova m2, [rsp+gprsize+16* 5] |
| mova m3, [rsp+gprsize+16* 6] |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| call m_suffix(idct_8x8_internal_8bpc, _ssse3).main |
| mova [rsp+gprsize+ 3*16], m0 |
| mova [rsp+gprsize+ 4*16], m1 |
| mova [rsp+gprsize+ 5*16], m2 |
| mova [rsp+gprsize+ 6*16], m3 |
| mova [rsp+gprsize+ 7*16], m4 |
| mova [rsp+gprsize+ 8*16], m5 |
| mova [rsp+gprsize+ 9*16], m6 |
| mova [rsp+gprsize+10*16], m7 |
| mova m0, [rsp+gprsize+16*11] |
| mova m1, [rsp+gprsize+16*12] |
| mova m2, [rsp+gprsize+16*13] |
| mova m3, [rsp+gprsize+16*14] |
| pxor m4, m4 |
| REPX {mova x, m4}, m5, m6, m7 |
| call m_suffix(idct_16x8_internal_8bpc, _ssse3).main |
| mova m7, [rsp+gprsize+ 0*16] |
| mova [rsp+gprsize+11*16], m0 |
| mova [rsp+gprsize+12*16], m1 |
| mova [rsp+gprsize+13*16], m2 |
| mova [rsp+gprsize+14*16], m3 |
| mova [rsp+gprsize+15*16], m4 |
| mova [rsp+gprsize+16*16], m5 |
| mova [rsp+gprsize+17*16], m6 |
| mova [rsp+gprsize+18*16], m7 |
| %if ARCH_X86_64 |
| call r8 |
| %else |
| call [r2+4*gprsize] |
| %endif |
| mova [rsp+gprsize+ 3*16], m0 |
| mova [rsp+gprsize+ 5*16], m2 |
| mova [rsp+gprsize+ 8*16], m5 |
| mova [rsp+gprsize+10*16], m7 |
| %if ARCH_X86_64 |
| call r9 |
| mova m8, [o(pw_2048)] |
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| %else |
| call [r2+5*gprsize] |
| %endif |
| lea r3, [strideq*3] |
| lea r4, [rsp+gprsize+ 3*16] |
| %if ARCH_X86_64 |
| mov r6d, 8 |
| %else |
| mov dword [r2+2*gprsize], 8 |
| %endif |
| .loop_write: |
| mova m0, [r4+0*16] |
| mova m1, [r4+1*16] |
| mova m2, [r4+2*16] |
| mova m3, [r4+3*16] |
| mova m4, [r4+4*16] |
| mova m5, [r4+5*16] |
| mova m6, [r4+6*16] |
| mova m7, [r4+7*16] |
| call m(idct_8x8_internal_16bpc).round1_and_write_8x8 |
| lea dstq, [dstq+strideq*8] |
| add r4, 8*16 |
| %if ARCH_X86_64 |
| dec r6d |
| %else |
| dec dword [r2+2*gprsize] |
| %endif |
| jg .loop_write |
| ret |
| |
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 64 |
| add r5d, 640 |
| sar r5d, 10 |
| add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16 |
| jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 |
| |
| cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \ |
| 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \ |
| dst, stride, c, eob |
| LEA r6, base |
| test eobd, eobd |
| jz .dconly |
| |
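| ; 32x64 is a rect2 (2:1) transform: every coefficient group below is |
| ; pre-scaled by 1/sqrt(2) via idct_8x4's rect2_mul before the dct32 |
| ; stages run. |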
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 4, 1, 2, 0 |
| mov [rsp+gprsize*1+(64*4+32)*16], r0 |
| mov [rsp+gprsize*2+(64*4+32)*16], r1 |
| mov [rsp+gprsize*3+(64*4+32)*16], r2 |
| %else |
| DECLARE_REG_TMP 8, 9, 4, 7 |
| mov [rsp+gprsize*1+(64*4+32)*16], r9 |
| %if WIN64 |
| mov [rsp+gprsize*2+(64*4+32)*16], r7 |
| mov [rsp+gprsize*3+(64*4+32)*16], r8 |
| %endif |
| %endif |
| %undef cmp |
| ; remove entirely-zero iterations |
| mov r5d, 7*2 |
| cmp eobw, word [o2(tbl_32x32_2d)+r5] |
| jge .end_zero_loop |
| pxor m0, m0 |
| .zero_loop: |
| movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] |
| movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] |
| movzx t0d, t1b |
| movzx t2d, t3b |
| shr t1d, 8 |
| shr t3d, 8 |
| mova [rsp+ 32*16+t0*8], m0 |
| mova [rsp+ 32*16+t1*8], m0 |
| mova [rsp+ 32*16+t2*8], m0 |
| mova [rsp+ 32*16+t3*8], m0 |
| mova [rsp+ 96*16+t0*8], m0 |
| mova [rsp+ 96*16+t1*8], m0 |
| mova [rsp+ 96*16+t2*8], m0 |
| mova [rsp+ 96*16+t3*8], m0 |
| mova [rsp+160*16+t0*8], m0 |
| mova [rsp+160*16+t1*8], m0 |
| mova [rsp+160*16+t2*8], m0 |
| mova [rsp+160*16+t3*8], m0 |
| mova [rsp+224*16+t0*8], m0 |
| mova [rsp+224*16+t1*8], m0 |
| mova [rsp+224*16+t2*8], m0 |
| mova [rsp+224*16+t3*8], m0 |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_32x32_2d)+r5] |
| jl .zero_loop |
| .end_zero_loop: |
| ; actual first pass after skipping all-zero data |
| mov [rsp+gprsize*0+(64*4+32)*16], eobd |
| mov r3, rsp |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 4, 1, 6, 0 |
| mov r2, [rsp+gprsize*3+(64*4+32)*16] |
| mov [rsp+gprsize*3+(64*4+32)*16], r6 |
| %endif |
| .loop_pass1: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| mova m0, [cq+128* 1+r5*8] |
| mova m1, [cq+128* 7+r5*8] |
| mova m2, [cq+128* 9+r5*8] |
| mova m3, [cq+128*15+r5*8] |
| mova m4, [cq+128*17+r5*8] |
| mova m5, [cq+128*23+r5*8] |
| mova m6, [cq+128*25+r5*8] |
| mova m7, [cq+128*31+r5*8] |
| mov r3, rsp |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 |
| |
| mova m0, [cq+128* 3+r5*8] |
| mova m1, [cq+128* 5+r5*8] |
| mova m2, [cq+128*11+r5*8] |
| mova m3, [cq+128*13+r5*8] |
| mova m4, [cq+128*19+r5*8] |
| mova m5, [cq+128*21+r5*8] |
| mova m6, [cq+128*27+r5*8] |
| mova m7, [cq+128*29+r5*8] |
| %if ARCH_X86_32 |
| add r3, 16*8 |
| %endif |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| %if ARCH_X86_32 |
| sub r3, 16*8 |
| %endif |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 |
| add r3, 16*(16+4*ARCH_X86_32) |
| |
| mova m0, [cq+128* 2+r5*8] |
| mova m1, [cq+128* 6+r5*8] |
| mova m2, [cq+128*10+r5*8] |
| mova m3, [cq+128*14+r5*8] |
| mova m4, [cq+128*18+r5*8] |
| mova m5, [cq+128*22+r5*8] |
| mova m6, [cq+128*26+r5*8] |
| mova m7, [cq+128*30+r5*8] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(idct_16x4_internal_16bpc).main_oddhalf |
| |
| mova m0, [cq+128* 0+r5*8] |
| mova m1, [cq+128* 4+r5*8] |
| mova m2, [cq+128* 8+r5*8] |
| mova m3, [cq+128*12+r5*8] |
| mova m4, [cq+128*16+r5*8] |
| mova m5, [cq+128*20+r5*8] |
| mova m6, [cq+128*24+r5*8] |
| mova m7, [cq+128*28+r5*8] |
| call m(idct_8x4_internal_16bpc).rect2_mul |
| call m(idct_8x4_internal_16bpc).main_pass1 |
| call m(idct_8x4_internal_16bpc).round |
| sub r3, 16*(16+4*ARCH_X86_32) |
| call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32 |
| |
| movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] |
| movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] |
| movzx t0d, t1b |
| movzx t2d, t3b |
| shr t1d, 8 |
| shr t3d, 8 |
| %if ARCH_X86_64 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [rsp+160*16+t0*8], m8 |
| mova [rsp+160*16+t1*8], m9 |
| mova [rsp+160*16+t2*8], m10 |
| mova [rsp+160*16+t3*8], m11 |
| mova m8, [r3+16* 9] ; 8 9 |
| mova m10, [r3+16*11] ; 10 11 |
| mova m12, [r3+16*13] ; 12 13 |
| mova m14, [r3+16*15] ; 14 15 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [rsp+ 96*16+t0*8], m8 |
| mova [rsp+ 96*16+t1*8], m9 |
| mova [rsp+ 96*16+t2*8], m10 |
| mova [rsp+ 96*16+t3*8], m11 |
| mova m8, [r3+16* 8] ; 24 25 |
| mova m10, [r3+16*10] ; 26 27 |
| mova m12, [r3+16*12] ; 28 29 |
| mova m14, [r3+16*14] ; 30 31 |
| call m(idct_16x4_internal_16bpc).transpose4x8packed_hi |
| mova [rsp+224*16+t0*8], m8 |
| mova [rsp+224*16+t1*8], m9 |
| mova [rsp+224*16+t2*8], m10 |
| mova [rsp+224*16+t3*8], m11 |
| %else |
| sub r3, 8*16 |
| mova m0, [r3+ 8*16] |
| mova m2, [r3+10*16] |
| mova m4, [r3+12*16] |
| mova m6, [r3+14*16] |
| packssdw m0, [r3+ 9*16] |
| packssdw m2, [r3+11*16] |
| packssdw m4, [r3+13*16] |
| packssdw m6, [r3+15*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+ 96*16+t0*8], m0 |
| mova [rsp+ 96*16+t1*8], m1 |
| mova [rsp+ 96*16+t2*8], m2 |
| mova [rsp+ 96*16+t3*8], m3 |
| mova m0, [r3+16*16] |
| mova m2, [r3+18*16] |
| mova m4, [r3+20*16] |
| mova m6, [r3+22*16] |
| packssdw m0, [r3+17*16] |
| packssdw m2, [r3+19*16] |
| packssdw m4, [r3+21*16] |
| packssdw m6, [r3+23*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+160*16+t0*8], m0 |
| mova [rsp+160*16+t1*8], m1 |
| mova [rsp+160*16+t2*8], m2 |
| mova [rsp+160*16+t3*8], m3 |
| mova m0, [r3+31*16] |
| mova m2, [r3+29*16] |
| mova m4, [r3+27*16] |
| mova m6, [r3+25*16] |
| packssdw m0, [r3+30*16] |
| packssdw m2, [r3+28*16] |
| packssdw m4, [r3+26*16] |
| packssdw m6, [r3+24*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [rsp+224*16+t0*8], m0 |
| mova [rsp+224*16+t1*8], m1 |
| mova [rsp+224*16+t2*8], m2 |
| mova [rsp+224*16+t3*8], m3 |
| mova m0, [r3+ 0*16] |
| mova m2, [r3+ 2*16] |
| mova m4, [r3+ 4*16] |
| mova m6, [r3+ 6*16] |
| packssdw m0, [r3+ 1*16] |
| packssdw m2, [r3+ 3*16] |
| packssdw m4, [r3+ 5*16] |
| packssdw m6, [r3+ 7*16] |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| %endif |
| mova [rsp+ 32*16+t0*8], m0 |
| mova [rsp+ 32*16+t1*8], m1 |
| mova [rsp+ 32*16+t2*8], m2 |
| mova [rsp+ 32*16+t3*8], m3 |
| pxor m0, m0 |
| REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \ |
| 8, 9, 10, 11, 12, 13, 14, 15, \ |
| 16, 17, 18, 19, 20, 21, 22, 23, \ |
| 24, 25, 26, 27, 28, 29, 30, 31 |
| %if ARCH_X86_32 |
| mov r6, [rsp+gprsize*3+(64*4+32)*16] |
| %endif |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass=2 |
| mov eobd, [rsp+gprsize*0+(64*4+32)*16] |
| cmp eobd, 136 |
| jl .fast |
| ; fall-through |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 8, 9 |
| %else |
| DECLARE_REG_TMP 1, 5 |
| %endif |
| lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] |
| lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] |
| jmp .run |
| .fast: |
| lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] |
| lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] |
| .run: |
| add rsp, 29*16 |
| |
| %if ARCH_X86_64 |
| lea r2, [dstq+64] |
| mov r7, -8 |
| %else |
| lea r2, [rsp+(64*4+3)*16] |
| mov [r2+4*gprsize], t0 |
| mov [r2+5*gprsize], t1 |
| mov r1, [r2+2*gprsize] |
| mov dword [r2+0*gprsize], 4 |
| %endif |
| jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 |
| |
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 64 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 181 |
| add r5d, 384 |
| sar r5d, 9 |
| add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16 |
| jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 |
| |
| cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ |
| dst, stride, c, eob |
| LEA r6, base |
| test eobd, eobd |
| jz .dconly |
| |
| ; remove entirely-zero iterations |
| %undef cmp |
| mov r5d, 8 |
| .zero_loop: |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_32x16_2d)+r5] |
| jl .zero_loop |
| |
| ; actual first pass after skipping all-zero data |
| .loop_pass1: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| |
| mov r3, rsp |
| lea r4, [o(idct64_mul_16bpc)] |
| mova m0, [cq+64* 1+r5*8] |
| mova m1, [cq+64*31+r5*8] |
| mova m2, [cq+64*17+r5*8] |
| mova m3, [cq+64*15+r5*8] |
| call .main_part1 |
| mova m0, [cq+64* 7+r5*8] |
| mova m1, [cq+64*25+r5*8] |
| mova m2, [cq+64*23+r5*8] |
| mova m3, [cq+64* 9+r5*8] |
| call .main_part1 |
| mova m0, [cq+64* 5+r5*8] |
| mova m1, [cq+64*27+r5*8] |
| mova m2, [cq+64*21+r5*8] |
| mova m3, [cq+64*11+r5*8] |
| call .main_part1 |
| mova m0, [cq+64* 3+r5*8] |
| mova m1, [cq+64*29+r5*8] |
| mova m2, [cq+64*19+r5*8] |
| mova m3, [cq+64*13+r5*8] |
| call .main_part1 |
| call .main_part2 |
| |
| mova m0, [cq+64* 2+r5*8] |
| mova m1, [cq+64*14+r5*8] |
| mova m2, [cq+64*18+r5*8] |
| mova m3, [cq+64*30+r5*8] |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast |
| |
| mova m0, [cq+64* 6+r5*8] |
| mova m1, [cq+64*10+r5*8] |
| mova m2, [cq+64*22+r5*8] |
| mova m3, [cq+64*26+r5*8] |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast |
| add r3, 16*(24+4*ARCH_X86_32) |
| |
| mova m0, [cq+64* 4+r5*8] |
| mova m1, [cq+64*12+r5*8] |
| mova m2, [cq+64*20+r5*8] |
| mova m3, [cq+64*28+r5*8] |
| call m(idct_16x4_internal_16bpc).main_oddhalf_fast |
| |
| mova m0, [cq+64* 0+r5*8] |
| mova m1, [cq+64* 8+r5*8] |
| mova m2, [cq+64*16+r5*8] |
| mova m3, [cq+64*24+r5*8] |
| call m(idct_8x4_internal_16bpc).main_pass1_fast |
| call m(idct_8x4_internal_16bpc).round |
| mova [r3-(7+4*ARCH_X86_32)*16], m1 |
| mova [r3-(6+4*ARCH_X86_32)*16], m2 |
| mova [r3-(5+4*ARCH_X86_32)*16], m3 |
| mova [r3-(4+4*ARCH_X86_32)*16], m4 |
| mova [r3-(3+4*ARCH_X86_32)*16], m5 |
| mova [r3-(2+4*ARCH_X86_32)*16], m6 |
| mova [r3-(1+4*ARCH_X86_32)*16], m7 |
| sub r3, 16*(40+4*ARCH_X86_32-4) |
| |
| %if ARCH_X86_64 |
| psrld m15, m11, 10 ; pd_2 |
| %else |
| mova m7, [o(pd_2)] |
| %endif |
| call .main_end_loop_start |
| |
| lea r3, [rsp+56*16] |
| lea r4, [cq+r5*8+64*28] |
| call .shift_transpose |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass=2: entered via call rather than a fall-through, otherwise the |
| ; stack pointer would have the wrong offset in the 8-bit code |
| call .pass2 |
| RET |
| |
| .pass2: |
| %if ARCH_X86_64 |
| mova m8, [o(pw_2048)] |
| pxor m9, m9 |
| mova m10, [o(pixel_10bpc_max)] |
| %if WIN64 |
| mov [rsp+16*16+gprsize], r7 |
| %endif |
| mov r7, dstq |
| %else |
| mov [rsp+2*gprsize+16*16], dstq |
| %endif |
| lea r3, [strideq*3] |
| mov r4d, 8 |
| jmp m(idct_16x16_internal_16bpc).loop_pass2 |
| |
| .main_part1: ; idct64 steps 1-5 |
| ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a |
| ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a |
| ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a |
| ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a |
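| ; each call consumes four odd-index input rows and the next 12 dwords of |
| ; the idct64_mul_16bpc table (r4 advances 4*12), producing eight clipped |
| ; t-values spilled at r3, which advances by 8*16 per call. |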
| %if ARCH_X86_64 |
| movd m7, [r4+4*0] |
| movd m8, [r4+4*1] |
| movd m6, [r4+4*2] |
| movd m9, [r4+4*3] |
| movd m5, [r4+4*4] |
| movd m10, [r4+4*5] |
| movd m4, [r4+4*6] |
| movd m15, [r4+4*7] |
| REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15 |
| pmulld m7, m0 ; t63a |
| pmulld m0, m8 ; t32a |
| pmulld m6, m1 ; t62a |
| pmulld m1, m9 ; t33a |
| pmulld m5, m2 ; t61a |
| pmulld m2, m10 ; t34a |
| pmulld m4, m3 ; t60a |
| pmulld m3, m15 ; t35a |
| movd m10, [r4+4*8] |
| movd m15, [r4+4*9] |
| REPX {pshufd x, x, q0000}, m10, m15 |
| REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 |
| REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 |
| psubd m8, m0, m1 ; t33 |
| paddd m0, m1 ; t32 |
| psubd m1, m7, m6 ; t62 |
| paddd m7, m6 ; t63 |
| psubd m6, m3, m2 ; t34 |
| paddd m3, m2 ; t35 |
| psubd m2, m4, m5 ; t61 |
| paddd m4, m5 ; t60 |
| REPX {pmaxsd x, m12}, m8, m1, m6, m2 |
| REPX {pminsd x, m13}, m8, m1, m6, m2 |
| ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a |
| ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a |
| REPX {pmaxsd x, m12}, m0, m3, m7, m4 |
| REPX {pminsd x, m13}, m0, m3, m7, m4 |
| movd m10, [r4+4*10] |
| movd m15, [r4+4*11] |
| REPX {pshufd x, x, q0000}, m10, m15 |
| psubd m5, m0, m3 ; t35a |
| paddd m0, m3 ; t32a |
| psubd m3, m7, m4 ; t60a |
| paddd m7, m4 ; t63a |
| psubd m4, m1, m6 ; t34 |
| paddd m1, m6 ; t33 |
| psubd m6, m8, m2 ; t61 |
| paddd m8, m2 ; t62 |
| REPX {pmaxsd x, m12}, m5, m3, m4, m6 |
| REPX {pminsd x, m13}, m5, m3, m4, m6 |
| ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 |
| ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a |
| REPX {pmaxsd x, m12}, m0, m7, m1, m8 |
| REPX {pminsd x, m13}, m0, m7, m1, m8 |
| add r4, 4*12 |
| mova [r3+16*0], m0 |
| mova [r3+16*7], m7 |
| mova [r3+16*1], m1 |
| mova [r3+16*6], m8 |
| mova [r3+16*2], m6 |
| mova [r3+16*5], m4 |
| mova [r3+16*3], m3 |
| mova [r3+16*4], m5 |
| %else |
| movd m7, [r4+4*0] |
| movd m6, [r4+4*2] |
| movd m5, [r4+4*4] |
| movd m4, [r4+4*6] |
| REPX {pshufd x, x, q0000}, m7, m6, m5, m4 |
| pmulld m7, m0 ; t63a |
| pmulld m6, m1 ; t62a |
| pmulld m5, m2 ; t61a |
| pmulld m4, m3 ; t60a |
| mova [r3+0*16], m6 |
| mova [r3+1*16], m7 |
| movd m6, [r4+4*1] |
| movd m7, [r4+4*3] |
| REPX {pshufd x, x, q0000}, m7, m6 |
| pmulld m0, m6 ; t32a |
| pmulld m1, m7 ; t33a |
| movd m6, [r4+4*5] |
| movd m7, [r4+4*7] |
| REPX {pshufd x, x, q0000}, m7, m6 |
| pmulld m2, m6 ; t34a |
| pmulld m3, m7 ; t35a |
| mova m6, [r3+0*16] |
| mova m7, [o(pd_2048)] |
| REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 |
| paddd m7, [r3+1*16] |
| REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4 |
| mova [r3+0*16], m5 |
| psubd m5, m0, m1 ; t33 |
| paddd m0, m1 ; t32 |
| mova [r3+1*16], m0 |
| mova m0, [r3+0*16] |
| psubd m1, m7, m6 ; t62 |
| paddd m7, m6 ; t63 |
| psubd m6, m3, m2 ; t34 |
| paddd m3, m2 ; t35 |
| psubd m2, m4, m0 ; t61 |
| paddd m4, m0 ; t60 |
| mova m0, [o(clip_18b_min)] |
| REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4 |
| pmaxsd m0, [r3+1*16] |
| mova [r3+0*16], m0 |
| mova m0, [o(clip_18b_max)] |
| REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4 |
| pminsd m0, [r3+0*16] |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m3 |
| mova [r3+2*16], m4 |
| mova [r3+3*16], m7 |
| mova m0, [o(pd_2048)] |
| movd m3, [r4+4*8] |
| movd m4, [r4+4*9] |
| REPX {pshufd x, x, q0000}, m3, m4 |
| mova [r3+4*16], m2 |
| ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a |
| mova m2, [r3+4*16] |
| mova [r3+4*16], m5 |
| ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a |
| mova m0, [r3+0*16] |
| mova m3, [r3+1*16] |
| mova m4, [r3+2*16] |
| mova m7, [r3+3*16] |
| psubd m5, m0, m3 ; t35a |
| paddd m0, m3 ; t32a |
| mova [r3+0*16], m5 |
| mova m5, [r3+4*16] |
| psubd m3, m7, m4 ; t60a |
| paddd m7, m4 ; t63a |
| psubd m4, m1, m6 ; t34 |
| paddd m1, m6 ; t33 |
| psubd m6, m5, m2 ; t61 |
| paddd m2, m5 ; t62 |
| mova m5, [o(clip_18b_min)] |
| REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2 |
| pmaxsd m5, [r3+0*16] |
| mova [r3+0*16], m5 |
| mova m5, [o(clip_18b_max)] |
| REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2 |
| pminsd m5, [r3+0*16] |
| mova [r3+16*0], m0 |
| mova [r3+16*7], m7 |
| mova [r3+16*1], m1 |
| mova [r3+16*6], m2 |
| mova [r3+16*2], m4 |
| mova m7, [o(pd_2048)] |
| movd m0, [r4+4*10] |
| movd m1, [r4+4*11] |
| REPX {pshufd x, x, q0000}, m0, m1 |
| ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60 |
| mova [r3+16*3], m3 |
| mova [r3+16*4], m5 |
| mova m4, [r3+2*16] |
| ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a |
| add r4, 4*12 |
| mova [r3+16*2], m6 |
| mova [r3+16*5], m4 |
| %endif |
| add r3, 16*8 |
| ret |
| |
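| ; steps 6-9 walk r3 upward and r4 downward through the spilled t-values, |
| ; pairing t32+n with t63-n etc.; the 1567/3784 rotations and the final |
| ; 2896 (~1/sqrt(2)) butterflies finish the odd half of the idct64. |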
| .main_part2: ; idct64 steps 6-9 |
| lea r4, [r3+16*7] |
| %if ARCH_X86_64 |
| mova m10, [o(pd_1567)] |
| mova m15, [o(pd_3784)] |
| .main_part2_loop: |
| mova m0, [r3-16*32] ; t32a |
| mova m1, [r4-16*24] ; t39a |
| mova m2, [r4-16*32] ; t63a |
| mova m3, [r3-16*24] ; t56a |
| mova m4, [r3-16*16] ; t40a |
| mova m5, [r4-16* 8] ; t47a |
| mova m6, [r4-16*16] ; t55a |
| mova m7, [r3-16* 8] ; t48a |
| psubd m8, m0, m1 ; t39 |
| paddd m0, m1 ; t32 |
| psubd m1, m2, m3 ; t56 |
| paddd m2, m3 ; t63 |
| psubd m3, m5, m4 ; t40 |
| paddd m5, m4 ; t47 |
| psubd m4, m7, m6 ; t55 |
| paddd m7, m6 ; t48 |
| REPX {pmaxsd x, m12}, m8, m1, m3, m4 |
| REPX {pminsd x, m13}, m8, m1, m3, m4 |
| ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a |
| ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a |
| REPX {pmaxsd x, m12}, m0, m2, m5, m7 |
| REPX {pminsd x, m13}, m0, m5, m2, m7 |
| psubd m6, m2, m7 ; t48a |
| paddd m2, m7 ; t63a |
| psubd m7, m0, m5 ; t47a |
| paddd m0, m5 ; t32a |
| psubd m5, m8, m4 ; t55 |
| paddd m8, m4 ; t56 |
| psubd m4, m1, m3 ; t40 |
| paddd m1, m3 ; t39 |
| REPX {pmaxsd x, m12}, m6, m7, m5, m4 |
| REPX {pminsd x, m13}, m6, m7, m5, m4 |
| REPX {pmulld x, m14}, m6, m7, m5, m4 |
| REPX {pmaxsd x, m12}, m2, m0, m8, m1 |
| REPX {pminsd x, m13}, m2, m0, m8, m1 |
| paddd m6, m11 |
| paddd m5, m11 |
| psubd m3, m6, m7 ; t47 |
| paddd m6, m7 ; t48 |
| psubd m7, m5, m4 ; t40a |
| paddd m5, m4 ; t55a |
| REPX {psrad x, 12}, m3, m6, m7, m5 |
| mova [r4-16* 8], m2 |
| mova [r3-16*32], m0 |
| mova [r3-16* 8], m8 |
| mova [r4-16*32], m1 |
| mova [r4-16*24], m3 |
| mova [r3-16*16], m6 |
| mova [r3-16*24], m7 |
| mova [r4-16*16], m5 |
| %else |
| .main_part2_loop: |
| mova m0, [r3-16*32] ; t32a |
| mova m1, [r4-16*24] ; t39a |
| mova m2, [r4-16*32] ; t63a |
| mova m3, [r3-16*24] ; t56a |
| mova m4, [r3-16*16] ; t40a |
| mova m5, [r4-16* 8] ; t47a |
| mova m6, [r4-16*16] ; t55a |
| psubd m7, m0, m1 ; t39 |
| paddd m0, m1 ; t32 |
| mova [r3+0*16], m7 |
| mova m7, [r3-16* 8] ; t48a |
| psubd m1, m2, m3 ; t56 |
| paddd m2, m3 ; t63 |
| psubd m3, m5, m4 ; t40 |
| paddd m5, m4 ; t47 |
| psubd m4, m7, m6 ; t55 |
| paddd m7, m6 ; t48 |
| mova m6, [o(clip_18b_min)] |
| REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7 |
| pmaxsd m6, [r3+0*16] |
| mova [r3+0*16], m6 |
| mova m6, [o(clip_18b_max)] |
| REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7 |
| pminsd m6, [r3+0*16] |
| mova [r3+0*16], m0 |
| mova [r3+1*16], m2 |
| mova [r3+2*16], m5 |
| mova [r3+3*16], m7 |
| mova m0, [o(pd_2048)] |
| ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a |
| ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a |
| mova m2, [r3+1*16] |
| mova m7, [r3+3*16] |
| psubd m5, m2, m7 ; t48a |
| paddd m2, m7 ; t63a |
| mova [r3+1*16], m5 |
| mova m0, [r3+0*16] |
| mova m5, [r3+2*16] |
| psubd m7, m0, m5 ; t47a |
| paddd m0, m5 ; t32a |
| psubd m5, m6, m4 ; t55 |
| paddd m6, m4 ; t56 |
| psubd m4, m1, m3 ; t40 |
| paddd m1, m3 ; t39 |
| mova m3, [o(clip_18b_min)] |
| REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1 |
| pmaxsd m3, [r3+1*16] |
| mova [r3+0*16], m3 |
| mova m3, [o(clip_18b_max)] |
| REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1 |
| pminsd m3, [r3+0*16] |
| mova [r4-16* 8], m2 |
| mova [r3-16*32], m0 |
| mova [r3-16* 8], m6 |
| mova [r4-16*32], m1 |
| mova m0, [o(pd_2896)] |
| mova m1, [o(pd_2048)] |
| REPX {pmulld x, m0}, m3, m7, m5, m4 |
| REPX {paddd x, m1}, m3, m5 |
| psubd m6, m3, m7 ; t47 |
| paddd m3, m7 ; t48 |
| psubd m7, m5, m4 ; t40a |
| paddd m5, m4 ; t55a |
| REPX {psrad x, 12}, m6, m3, m7, m5 |
| mova [r4-16*24], m6 |
| mova [r3-16*16], m3 |
| mova [r3-16*24], m7 |
| mova [r4-16*16], m5 |
| %endif |
| add r3, 16 |
| sub r4, 16 |
| cmp r3, r4 |
| jl .main_part2_loop |
| sub r3, 4*16 |
| ret |
| |
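| ; final reconstruction: combine the idct8, idct16, idct32 and idct64 |
| ; partial results into the 64 outputs; only the rounding bias (m15/m7, |
| ; pd_1 or pd_2 depending on the caller) is added here, the shift itself |
| ; happens later in .shift_transpose. |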
| .main_end_loop: |
| mova m0, [r3+16*28] ; idct8 0 + n |
| .main_end_loop_start: |
| mova m2, [r3+16*12] ; idct32 16 + n |
| mova m3, [r4+16*12] ; idct32 31 - n |
| %if ARCH_X86_64 |
| mova m1, [r4+16*28] ; idct16 15 - n |
| mova m4, [r4-16* 4] ; idct64 63 - n |
| mova m5, [r3-16* 4] ; idct64 48 + n |
| mova m6, [r4-16*20] ; idct64 47 - n |
| mova m7, [r3-16*20] ; idct64 32 + n |
| pmaxsd m0, m12 |
| pminsd m0, m13 |
| paddd m8, m0, m1 ; idct16 out0 + n |
| psubd m0, m1 ; idct16 out15 - n |
| REPX {pmaxsd x, m12}, m8, m0 |
| REPX {pminsd x, m13}, m8, m0 |
| paddd m1, m8, m3 ; idct32 out0 + n |
| psubd m8, m3 ; idct32 out31 - n |
| paddd m3, m0, m2 ; idct32 out15 - n |
| psubd m0, m2 ; idct32 out16 + n |
| REPX {pmaxsd x, m12}, m1, m8, m3, m0 |
| REPX {pminsd x, m13}, m1, m3, m8, m0 |
| REPX {paddd x, m15}, m1, m3, m0, m8 |
| paddd m2, m1, m4 ; idct64 out0 + n (unshifted) |
| psubd m1, m4 ; idct64 out63 - n (unshifted) |
| paddd m4, m3, m5 ; idct64 out15 - n (unshifted) |
| psubd m3, m5 ; idct64 out48 + n (unshifted) |
| paddd m5, m0, m6 ; idct64 out16 + n (unshifted) |
| psubd m0, m6 ; idct64 out47 - n (unshifted) |
| paddd m6, m8, m7 ; idct64 out31 - n (unshifted) |
| psubd m8, m7 ; idct64 out32 + n (unshifted) |
| mova [r3-16*20], m2 |
| mova [r4+16*28], m1 |
| mova [r4-16*20], m4 |
| mova [r3+16*28], m3 |
| mova [r3-16* 4], m5 |
| mova [r4+16*12], m0 |
| mova [r4-16* 4], m6 |
| mova [r3+16*12], m8 |
| %else |
| mova m5, [o(clip_18b_min)] |
| mova m6, [o(clip_18b_max)] |
| mova m1, [r3+16*44] ; idct16 15 - n |
| pmaxsd m0, m5 |
| pminsd m0, m6 |
| paddd m4, m0, m1 ; idct16 out0 + n |
| psubd m0, m1 ; idct16 out15 - n |
| REPX {pmaxsd x, m5}, m4, m0 |
| REPX {pminsd x, m6}, m4, m0 |
| paddd m1, m4, m3 ; idct32 out0 + n |
| psubd m4, m3 ; idct32 out31 - n |
| paddd m3, m0, m2 ; idct32 out15 - n |
| psubd m0, m2 ; idct32 out16 + n |
| REPX {pmaxsd x, m5}, m1, m4, m3, m0 |
| REPX {pminsd x, m6}, m1, m3, m4, m0 |
| REPX {paddd x, m7}, m1, m3, m0, m4 |
| mova m5, [r4-16* 4] ; idct64 63 - n |
| mova m6, [r3-16* 4] ; idct64 48 + n |
| paddd m2, m1, m5 ; idct64 out0 + n (unshifted) |
| psubd m1, m5 ; idct64 out63 - n (unshifted) |
| paddd m5, m3, m6 ; idct64 out15 - n (unshifted) |
| psubd m3, m6 ; idct64 out48 + n (unshifted) |
| mova [r4+16*28], m1 |
| mova [r3+16*28], m3 |
| mova m6, [r4-16*20] ; idct64 47 - n |
| mova m1, [r3-16*20] ; idct64 32 + n |
| mova [r3-16*20], m2 |
| mova [r4-16*20], m5 |
| paddd m5, m0, m6 ; idct64 out16 + n (unshifted) |
| psubd m0, m6 ; idct64 out47 - n (unshifted) |
| paddd m6, m4, m1 ; idct64 out31 - n (unshifted) |
| psubd m4, m1 ; idct64 out32 + n (unshifted) |
| mova [r3-16* 4], m5 |
| mova [r4+16*12], m0 |
| mova [r4-16* 4], m6 |
| mova [r3+16*12], m4 |
| %endif |
| sub r4, 16 |
| add r3, 16 |
| cmp r3, r4 |
| jl .main_end_loop |
| ret |
| |
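| ; apply the pass-1 shift of 2, pack to words, transpose in 4x8 blocks |
| ; and store back into cq, which doubles as the pass-2 input buffer for |
| ; this transform size. |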
| .shift_transpose: |
| mova m0, [r3+0*16] |
| mova m1, [r3+1*16] |
| mova m2, [r3+2*16] |
| mova m3, [r3+3*16] |
| mova m4, [r3+4*16] |
| mova m5, [r3+5*16] |
| mova m6, [r3+6*16] |
| mova m7, [r3+7*16] |
| REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [r4+0*64], m0 |
| mova [r4+1*64], m1 |
| mova [r4+2*64], m2 |
| mova [r4+3*64], m3 |
| sub r4, 4*64 |
| sub r3, 8*16 |
| cmp r3, rsp |
| jg .shift_transpose |
| ret |
| |
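| ; dc-only: .dconly1 computes (dc*181 + 640) >> 10 ~= dc/(4*sqrt(2)); |
| ; .dconly2 then broadcasts (dc*2896 + 34816) >> 16 as the per-pixel |
| ; offset (pshuflw q1111 picks the high word of the product). |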
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 16 |
| .dconly1: |
| add r5d, 640 |
| sar r5d, 10 |
| .dconly2: |
| imul r5d, 2896 |
| add r5d, 34816 |
| movd m0, r5d |
| pshuflw m0, m0, q1111 |
| punpcklqdq m0, m0 |
| mova m6, [o(pixel_10bpc_max)] |
| pxor m5, m5 |
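| ; each iteration adds the offset to 32 pixels; btc toggles bit 16 of |
| ; r3d so two iterations cover one 64-pixel row before stepping dstq to |
| ; the next row and decrementing the row count in the low bits. |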
| .dconly_loop: |
| paddw m1, m0, [dstq+16*0] |
| paddw m2, m0, [dstq+16*1] |
| paddw m3, m0, [dstq+16*2] |
| paddw m4, m0, [dstq+16*3] |
| REPX {pmaxsw x, m5}, m1, m2, m3, m4 |
| REPX {pminsw x, m6}, m1, m2, m3, m4 |
| mova [dstq+16*0], m1 |
| mova [dstq+16*1], m2 |
| mova [dstq+16*2], m3 |
| mova [dstq+16*3], m4 |
| add dstq, 64 |
| btc r3d, 16 |
| jnc .dconly_loop |
| lea dstq, [dstq+strideq-128] |
| dec r3d |
| jg .dconly_loop |
| RET |
| |
| cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \ |
| 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \ |
| dst, stride, c, eob |
| LEA r6, base |
| test eobd, eobd |
| jz .dconly |
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 0, 4, 1 |
| mov [rsp+(8*32+64+8)*16+1*gprsize], dstq |
| mov [rsp+(8*32+64+8)*16+2*gprsize], strideq |
| %else |
| DECLARE_REG_TMP 4, 7, 8 |
| %if WIN64 |
| mov [rsp+(8*32+64+1)*16+1*gprsize], r7 |
| mov [rsp+64*16+0*gprsize], r8 |
| %endif |
| %endif |
| %undef cmp |
| ; remove entirely-zero iterations |
| mov r5d, 14 |
| cmp eobw, word [o2(tbl_32x32_2d)+r5] |
| jge .end_zero_loop |
| pxor m0, m0 |
| .zero_loop: |
| movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] |
| movzx t1d, t0b |
| shr t0d, 8 |
| lea t2, [rsp+7*32*16] |
| .zero_loop_inner: |
| mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 |
| mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 |
| mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0 |
| mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0 |
| sub t2, 32*16 |
| cmp t2, rsp |
| jge .zero_loop_inner |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_32x32_2d)+r5] |
| jl .zero_loop |
| .end_zero_loop: |
| mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd |
| ; actual first pass after skipping all-zero data |
| .loop_pass1: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| |
| mov r3, rsp |
| lea r4, [o(idct64_mul_16bpc)] |
| mova m0, [cq+128* 1+r5*8] |
| mova m1, [cq+128*31+r5*8] |
| mova m2, [cq+128*17+r5*8] |
| mova m3, [cq+128*15+r5*8] |
| call .rect2_mul_fast |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 |
| mova m0, [cq+128* 7+r5*8] |
| mova m1, [cq+128*25+r5*8] |
| mova m2, [cq+128*23+r5*8] |
| mova m3, [cq+128* 9+r5*8] |
| call .rect2_mul_fast |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 |
| mova m0, [cq+128* 5+r5*8] |
| mova m1, [cq+128*27+r5*8] |
| mova m2, [cq+128*21+r5*8] |
| mova m3, [cq+128*11+r5*8] |
| call .rect2_mul_fast |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 |
| mova m0, [cq+128* 3+r5*8] |
| mova m1, [cq+128*29+r5*8] |
| mova m2, [cq+128*19+r5*8] |
| mova m3, [cq+128*13+r5*8] |
| call .rect2_mul_fast |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 |
| |
| mova m0, [cq+128* 2+r5*8] |
| mova m1, [cq+128*14+r5*8] |
| mova m2, [cq+128*18+r5*8] |
| mova m3, [cq+128*30+r5*8] |
| call .rect2_mul_fast |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast |
| |
| mova m0, [cq+128* 6+r5*8] |
| mova m1, [cq+128*10+r5*8] |
| mova m2, [cq+128*22+r5*8] |
| mova m3, [cq+128*26+r5*8] |
| call .rect2_mul_fast |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast |
| add r3, 16*(24+4*ARCH_X86_32) |
| |
| mova m0, [cq+128* 4+r5*8] |
| mova m1, [cq+128*12+r5*8] |
| mova m2, [cq+128*20+r5*8] |
| mova m3, [cq+128*28+r5*8] |
| call .rect2_mul_fast |
| call m(idct_16x4_internal_16bpc).main_oddhalf_fast |
| |
| mova m0, [cq+128* 0+r5*8] |
| mova m1, [cq+128* 8+r5*8] |
| mova m2, [cq+128*16+r5*8] |
| mova m3, [cq+128*24+r5*8] |
| call .rect2_mul_fast |
| call m(idct_8x4_internal_16bpc).main_pass1_fast |
| call m(idct_8x4_internal_16bpc).round |
| mova [r3-(7+4*ARCH_X86_32)*16], m1 |
| mova [r3-(6+4*ARCH_X86_32)*16], m2 |
| mova [r3-(5+4*ARCH_X86_32)*16], m3 |
| mova [r3-(4+4*ARCH_X86_32)*16], m4 |
| mova [r3-(3+4*ARCH_X86_32)*16], m5 |
| mova [r3-(2+4*ARCH_X86_32)*16], m6 |
| mova [r3-(1+4*ARCH_X86_32)*16], m7 |
| sub r3, 16*(40+4*ARCH_X86_32-4) |
| |
| %if ARCH_X86_64 |
| psrld m15, m11, 11 ; pd_1 |
| %else |
| mova m7, [o(pd_1)] |
| %endif |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start |
| |
| lea r3, [rsp+56*16] |
| lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16] |
| movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] |
| movzx t1d, t0b |
| shr t0d, 8 |
| call .shift_transpose |
| ; zero cq |
| pxor m7, m7 |
| lea r4, [cq+30*128+r5*8] |
| .zero_cq_loop: |
| REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 |
| sub r4, 4*128 |
| cmp r4, cq |
| jg .zero_cq_loop |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass=2 code starts here |
| mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16] |
| %if ARCH_X86_32 |
| mov strideq, [rsp+gprsize*2+(8*32+64+8)*16] |
| %elif WIN64 |
| mov r8, [rsp+gprsize*0+64*16] |
| %endif |
| add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16 |
| cmp eobd, 36 |
| jl .load_veryfast |
| cmp eobd, 136 |
| jl .load_fast |
| ; load normal |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] |
| jmp .run |
| .load_fast: |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] |
| jmp .run |
| .load_veryfast: |
| lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] |
| ; fall-through |
| .run: |
| %if ARCH_X86_64 |
| lea r2, [dstq+128] |
| mov r7, -16 |
| %else |
| lea r2, [rsp+(8*32+3)*16] |
| mov dword [r2+0*gprsize], 8 |
| %endif |
| jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry |
| |
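| ; rectangular 2:1 transforms pre-scale the input by 2896/4096 ~= |
| ; 1/sqrt(2), with pd_2048 providing the rounding before the >>12. |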
| .rect2_mul_fast: |
| %if ARCH_X86_64 |
| REPX {pmulld x, m14}, m0, m1, m2, m3 |
| REPX {paddd x, m11}, m0, m1, m2, m3 |
| %else |
| mova m4, [o(pd_2896)] |
| mova m5, [o(pd_2048)] |
| REPX {pmulld x, m4 }, m0, m1, m2, m3 |
| REPX {paddd x, m5 }, m0, m1, m2, m3 |
| %endif |
| REPX {psrad x, 12 }, m0, m1, m2, m3 |
| ret |
| |
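| ; 64x32 variant: pass-1 shift of 1, output scattered via the Nx32 |
| ; odd-offset table into the strip at t2. |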
| .shift_transpose: |
| mova m0, [r3+0*16] |
| mova m1, [r3+1*16] |
| mova m2, [r3+2*16] |
| mova m3, [r3+3*16] |
| mova m4, [r3+4*16] |
| mova m5, [r3+5*16] |
| mova m6, [r3+6*16] |
| mova m7, [r3+7*16] |
| REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [t2+0*16+r5*8], m0 |
| mova [t2+8*16+r5*8], m2 |
| mova [t2+0*16+t0*8], m3 |
| mova [t2+0*16+t1*8], m1 |
| sub t2, 16*32 |
| sub r3, 8*16 |
| cmp r3, rsp |
| jg .shift_transpose |
| ret |
| |
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 32 |
| add r5d, 128 |
| sar r5d, 8 |
| imul r5d, 181 |
| add r5d, 384 |
| sar r5d, 9 |
| add rsp, (1+8*32+1*WIN64)*16 |
| jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 |
| |
| cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \ |
| 0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \ |
| dst, stride, c, eob |
| LEA r6, base |
| test eobd, eobd |
| jz .dconly |
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 4, 1, 2, 0, 6 |
| mov [rsp+gprsize*1+(64*9+8)*16], r0 |
| mov [rsp+gprsize*2+(64*9+8)*16], r1 |
| mov [rsp+gprsize*3+(64*9+8)*16], r2 |
| mov [rsp+gprsize*4+(64*9+8)*16], r6 |
| %else |
| DECLARE_REG_TMP 8, 9, 4, 7, 0 |
| mov [rsp+gprsize*1+(64*9+1)*16], r9 |
| mov [rsp+gprsize*0+64*16], r0 |
| %if WIN64 |
| mov [rsp+gprsize*2+(64*9+1)*16], r7 |
| mov [rsp+gprsize*3+(64*9+1)*16], r8 |
| %endif |
| %endif |
| %undef cmp |
| |
| ; remove entirely-zero iterations |
| mov r5d, 14 |
| cmp eobw, word [o2(tbl_32x32_2d)+r5] |
| jge .end_zero_loop |
| pxor m0, m0 |
| .zero_loop: |
| movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] |
| movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] |
| movzx t0d, t1b |
| movzx t2d, t3b |
| shr t1d, 8 |
| shr t3d, 8 |
| lea t4, [rsp+7*64*16] |
| .zero_loop_inner: |
| mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0 |
| mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0 |
| mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0 |
| mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0 |
| sub t4, 64*16 |
| cmp t4, rsp |
| jge .zero_loop_inner |
| %if ARCH_X86_32 |
| mov r6, [rsp+gprsize*4+(64*9+8)*16] |
| %endif |
| sub r5d, 2 |
| cmp eobw, word [o2(tbl_32x32_2d)+r5] |
| jl .zero_loop |
| .end_zero_loop: |
| mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd |
| %if ARCH_X86_32 |
| mov cq, [rsp+gprsize*3+(64*9+8)*16] |
| %endif |
| ; actual first pass after skipping all-zero data |
| .loop_pass1: |
| %if ARCH_X86_64 |
| mova m11, [o(pd_2048)] |
| mova m12, [o(clip_18b_min)] |
| mova m13, [o(clip_18b_max)] |
| mova m14, [o(pd_2896)] |
| %endif |
| |
| mov r3, rsp |
| lea r4, [o(idct64_mul_16bpc)] |
| mova m0, [cq+128* 1+r5*8] |
| mova m1, [cq+128*31+r5*8] |
| mova m2, [cq+128*17+r5*8] |
| mova m3, [cq+128*15+r5*8] |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 |
| mova m0, [cq+128* 7+r5*8] |
| mova m1, [cq+128*25+r5*8] |
| mova m2, [cq+128*23+r5*8] |
| mova m3, [cq+128* 9+r5*8] |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 |
| mova m0, [cq+128* 5+r5*8] |
| mova m1, [cq+128*27+r5*8] |
| mova m2, [cq+128*21+r5*8] |
| mova m3, [cq+128*11+r5*8] |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 |
| mova m0, [cq+128* 3+r5*8] |
| mova m1, [cq+128*29+r5*8] |
| mova m2, [cq+128*19+r5*8] |
| mova m3, [cq+128*13+r5*8] |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 |
| |
| mova m0, [cq+128* 2+r5*8] |
| mova m1, [cq+128*14+r5*8] |
| mova m2, [cq+128*18+r5*8] |
| mova m3, [cq+128*30+r5*8] |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast |
| |
| mova m0, [cq+128* 6+r5*8] |
| mova m1, [cq+128*10+r5*8] |
| mova m2, [cq+128*22+r5*8] |
| mova m3, [cq+128*26+r5*8] |
| call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast |
| add r3, 16*(24+4*ARCH_X86_32) |
| |
| mova m0, [cq+128* 4+r5*8] |
| mova m1, [cq+128*12+r5*8] |
| mova m2, [cq+128*20+r5*8] |
| mova m3, [cq+128*28+r5*8] |
| call m(idct_16x4_internal_16bpc).main_oddhalf_fast |
| |
| mova m0, [cq+128* 0+r5*8] |
| mova m1, [cq+128* 8+r5*8] |
| mova m2, [cq+128*16+r5*8] |
| mova m3, [cq+128*24+r5*8] |
| call m(idct_8x4_internal_16bpc).main_pass1_fast |
| call m(idct_8x4_internal_16bpc).round |
| mova [r3-(7+4*ARCH_X86_32)*16], m1 |
| mova [r3-(6+4*ARCH_X86_32)*16], m2 |
| mova [r3-(5+4*ARCH_X86_32)*16], m3 |
| mova [r3-(4+4*ARCH_X86_32)*16], m4 |
| mova [r3-(3+4*ARCH_X86_32)*16], m5 |
| mova [r3-(2+4*ARCH_X86_32)*16], m6 |
| mova [r3-(1+4*ARCH_X86_32)*16], m7 |
| sub r3, 16*(40+4*ARCH_X86_32-4) |
| |
| %if ARCH_X86_64 |
| psrld m15, m11, 10 ; pd_2 |
| %else |
| mova m7, [o(pd_2)] |
| %endif |
| call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start |
| |
| lea r3, [rsp+56*16] |
| movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] |
| movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] |
| movzx t0d, t1b |
| movzx t2d, t3b |
| shr t1d, 8 |
| shr t3d, 8 |
| lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16] |
| call .shift_transpose |
| ; zero cq |
| pxor m7, m7 |
| %if ARCH_X86_32 |
| mov cq, [rsp+gprsize*3+(64*9+8)*16] |
| %endif |
| lea r4, [cq+30*128+r5*8] |
| .zero_cq_loop: |
| REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 |
| sub r4, 4*128 |
| cmp r4, cq |
| jg .zero_cq_loop |
| %if ARCH_X86_32 |
| mov r6, [rsp+gprsize*4+(64*9+8)*16] |
| %endif |
| sub r5d, 2 |
| jge .loop_pass1 |
| |
| ; pass=2 code starts here |
| mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16] |
| %if ARCH_X86_32 |
| mov strideq, [rsp+gprsize*2+(9*64+8)*16] |
| %else |
| mov r0, [rsp+gprsize*0+64*16] |
| %endif |
| add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16 |
| cmp eobd, 151 |
| jl .fast |
| ; fall-through |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 8, 9 |
| %else |
| DECLARE_REG_TMP 1, 5 |
| %endif |
| lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] |
| lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] |
| jmp .run |
| .fast: |
| lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] |
| lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] |
| .run: |
| |
| %if ARCH_X86_64 |
| lea r2, [dstq+128] |
| mov r7, -16 |
| %else |
| lea r2, [rsp+(64*8+3)*16] |
| mov [r2+4*gprsize], t0 |
| mov [r2+5*gprsize], t1 |
| mov r1, [r2+2*gprsize] |
| mov dword [r2+0*gprsize], 8 |
| %endif |
| jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 |
| |
| ; restore the pass=1 tmp-reg mapping (redeclared above for pass 2) so |
| ; .shift_transpose assembles with the pass-1 registers |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 4, 1, 2, 0, 6 |
| %else |
| DECLARE_REG_TMP 8, 9, 4, 7, 0 |
| %endif |
| |
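| ; 64x64 variant of .shift_transpose: shift of 2, with the four |
| ; transposed vectors scattered via the Nx64 offset pairs into the |
| ; strip at t4. |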
| .shift_transpose: |
| mova m0, [r3+0*16] |
| mova m1, [r3+1*16] |
| mova m2, [r3+2*16] |
| mova m3, [r3+3*16] |
| mova m4, [r3+4*16] |
| mova m5, [r3+5*16] |
| mova m6, [r3+6*16] |
| mova m7, [r3+7*16] |
| REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 |
| packssdw m0, m1 |
| packssdw m2, m3 |
| packssdw m4, m5 |
| packssdw m6, m7 |
| call m(idct_8x4_internal_16bpc).transpose4x8packed |
| mova [t4+t0*8], m0 |
| mova [t4+t1*8], m1 |
| mova [t4+t2*8], m2 |
| mova [t4+t3*8], m3 |
| sub t4, 16*64 |
| sub r3, 8*16 |
| cmp r3, rsp |
| jg .shift_transpose |
| ret |
| |
| .dconly: |
| imul r5d, [cq], 181 |
| mov [cq], eobd ; 0 |
| mov r3d, 64 |
| add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \ |
| (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16 |
| jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1 |