| ; Copyright © 2021, VideoLAN and dav1d authors |
| ; Copyright © 2021, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| SECTION_RODATA |
| |
| filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 |
| pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 |
| |
| pb_0_1: times 4 db 0, 1 |
| pb_2_3: times 4 db 2, 3 |
| pw_1: times 4 dw 1 |
| pw_2: times 4 dw 2 |
| pw_4: times 4 dw 4 |
| pw_512: times 4 dw 512 |
| pw_2048: times 4 dw 2048 |
| |
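; The jump tables store 32-bit offsets relative to (table - 2*4). Biasing
; the base by -2*4 lets a tzcnt of the block dimension (always >= 2, since
; the smallest size is 4) index the table directly.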
| %macro JMP_TABLE 3-* |
| %xdefine %1_%2_table (%%table - 2*4) |
| %xdefine %%base mangle(private_prefix %+ _%1_%2) |
| %%table: |
| %rep %0 - 2 |
| dd %%base %+ .%3 - (%%table - 2*4) |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
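; The splat (.s*) and dc_128 entry points live inside the dc table at
; +10*4 and +15*4, so the aliases below let other functions reuse those
; rows without emitting separate tables.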
| %define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) |
| %define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) |
| %define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) |
| |
| JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 |
| JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ |
| s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ |
| s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 |
| JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ |
| s4-8*4, s8-8*4, s16-8*4, s32-8*4 |
| JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 |
| JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 |
| JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 |
| |
| cextern smooth_weights_1d_16bpc |
| cextern smooth_weights_2d_16bpc |
| cextern filter_intra_taps |
| |
| SECTION .text |
| |
| INIT_XMM ssse3 |
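; dc_top: fill the block with the average of the top edge. m4 = w/2
; (rounding bias), m5 = log2(w) (shift). The summation reuses the .h*
; labels of ipred_dc_left (indexed by w), and wq lands on the matching
; dc-splat store loop via the dc_128 table alias.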
| cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h |
| LEA r5, ipred_dc_left_16bpc_ssse3_table |
| movd m4, wm |
| tzcnt wd, wm |
| add tlq, 2 |
| movifnidn hd, hm |
| pxor m3, m3 |
| pavgw m4, m3 |
| movd m5, wd |
| movu m0, [tlq] |
| movsxd r6, [r5+wq*4] |
| add r6, r5 |
| add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| |
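; dc_left: as dc_top but averaging the h left-edge pixels below tl;
; m4 = h/2, m5 = log2(h).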
| cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| LEA r5, ipred_dc_left_16bpc_ssse3_table |
| mov hd, hm |
| movd m4, hm |
| tzcnt r6d, hd |
| sub tlq, hq |
| tzcnt wd, wm |
| pxor m3, m3 |
| sub tlq, hq |
| pavgw m4, m3 |
| movd m5, r6d |
| movu m0, [tlq] |
| movsxd r6, [r5+r6*4] |
| add r6, r5 |
| add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
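; Edge sums, shared with dc_top: each label adds 16 more pixels and falls
; through, .h8 folds the high half onto the low half, and .h4 widens to
; dwords, adds the bias from m4, reduces horizontally, shifts by m5 and
; broadcasts the dc value for the splat loops.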
| .h64: |
| movu m2, [tlq+112] |
| movu m1, [tlq+ 96] |
| paddw m0, m2 |
| movu m2, [tlq+ 80] |
| paddw m1, m2 |
| movu m2, [tlq+ 64] |
| paddw m0, m2 |
| paddw m0, m1 |
| .h32: |
| movu m1, [tlq+ 48] |
| movu m2, [tlq+ 32] |
| paddw m1, m2 |
| paddw m0, m1 |
| .h16: |
| movu m1, [tlq+ 16] |
| paddw m0, m1 |
| .h8: |
| movhlps m1, m0 |
| paddw m0, m1 |
| .h4: |
| punpcklwd m0, m3 |
| paddd m4, m0 |
| punpckhqdq m0, m0 |
| paddd m0, m4 |
| pshuflw m4, m0, q1032 |
| paddd m0, m4 |
| psrld m0, m5 |
| lea stride3q, [strideq*3] |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| jmp wq |
| |
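; dc: average of the top and left edges combined. m4 = (w+h)/2, m5 =
; tzcnt(w+h); r6 jumps to the left-edge sum (.h*), which then jumps via wq
; to the top-edge sum (.w*) and on to the store loop (.s*).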
| cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 |
| movifnidn hd, hm |
| tzcnt r6d, hd |
| lea r5d, [wq+hq] |
| movd m4, r5d |
| tzcnt r5d, r5d |
| movd m5, r5d |
| LEA r5, ipred_dc_16bpc_ssse3_table |
| tzcnt wd, wd |
| movsxd r6, [r5+r6*4] |
| movsxd wq, [r5+wq*4+5*4] |
| pxor m3, m3 |
| psrlw m4, 1 |
| add r6, r5 |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp r6 |
| .h4: |
| movq m0, [tlq-8] |
| jmp wq |
| .w4: |
| movq m1, [tlq+2] |
| paddw m1, m0 |
| punpckhwd m0, m3 |
| punpcklwd m1, m3 |
| paddd m0, m1 |
| paddd m4, m0 |
| punpckhqdq m0, m0 |
| paddd m0, m4 |
| pshuflw m1, m0, q1032 |
| paddd m0, m1 |
| cmp hd, 4 |
| jg .w4_mul |
| psrlw m0, 3 |
| jmp .w4_end |
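; w+h is not a power of two here, so dividing the sum is done as a shift
; by tzcnt(w+h) followed by a fixed-point multiply: 0xAAAB ~= (1<<17)/3
; and 0x6667 ~= (1<<17)/5, each combined with a final shift after pmulhuw.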
| .w4_mul: |
| mov r2d, 0xAAAB |
| mov r3d, 0x6667 |
| cmp hd, 16 |
| cmove r2d, r3d |
| psrld m0, 2 |
| movd m1, r2d |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w4_end: |
| pshuflw m0, m0, q0000 |
| .s4: |
| movq [dstq+strideq*0], m0 |
| movq [dstq+strideq*1], m0 |
| movq [dstq+strideq*2], m0 |
| movq [dstq+stride3q ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s4 |
| RET |
| .h8: |
| mova m0, [tlq-16] |
| jmp wq |
| .w8: |
| movu m1, [tlq+2] |
| paddw m0, m1 |
| punpcklwd m1, m0, m3 |
| punpckhwd m0, m3 |
| paddd m0, m1 |
| paddd m4, m0 |
| punpckhqdq m0, m0 |
| paddd m0, m4 |
| pshuflw m1, m0, q1032 |
| paddd m0, m1 |
| psrld m0, m5 |
| cmp hd, 8 |
| je .w8_end |
| mov r2d, 0xAAAB |
| mov r3d, 0x6667 |
| cmp hd, 32 |
| cmove r2d, r3d |
| movd m1, r2d |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w8_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s8: |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m0 |
| mova [dstq+strideq*2], m0 |
| mova [dstq+stride3q ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s8 |
| RET |
| .h16: |
| mova m0, [tlq-32] |
| paddw m0, [tlq-16] |
| jmp wq |
| .w16: |
| movu m1, [tlq+ 2] |
| movu m2, [tlq+18] |
| paddw m1, m2 |
| paddw m0, m1 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| paddd m0, m1 |
| paddd m4, m0 |
| punpckhqdq m0, m0 |
| paddd m0, m4 |
| pshuflw m1, m0, q1032 |
| paddd m0, m1 |
| psrld m0, m5 |
| cmp hd, 16 |
| je .w16_end |
| mov r2d, 0xAAAB |
| mov r3d, 0x6667 |
| test hd, 8|32 |
| cmovz r2d, r3d |
| movd m1, r2d |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w16_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s16c: |
| mova m1, m0 |
| .s16: |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*0+16*1], m1 |
| mova [dstq+strideq*1+16*0], m0 |
| mova [dstq+strideq*1+16*1], m1 |
| mova [dstq+strideq*2+16*0], m0 |
| mova [dstq+strideq*2+16*1], m1 |
| mova [dstq+stride3q +16*0], m0 |
| mova [dstq+stride3q +16*1], m1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s16 |
| RET |
| .h32: |
| mova m0, [tlq-64] |
| paddw m0, [tlq-48] |
| paddw m0, [tlq-32] |
| paddw m0, [tlq-16] |
| jmp wq |
| .w32: |
| movu m1, [tlq+ 2] |
| movu m2, [tlq+18] |
| paddw m1, m2 |
| movu m2, [tlq+34] |
| paddw m0, m2 |
| movu m2, [tlq+50] |
| paddw m1, m2 |
| paddw m0, m1 |
| punpcklwd m1, m0, m3 |
| punpckhwd m0, m3 |
| paddd m0, m1 |
| paddd m4, m0 |
| punpckhqdq m0, m0 |
| paddd m0, m4 |
| pshuflw m1, m0, q1032 |
| paddd m0, m1 |
| psrld m0, m5 |
| cmp hd, 32 |
| je .w32_end |
| mov r2d, 0xAAAB |
| mov r3d, 0x6667 |
| cmp hd, 8 |
| cmove r2d, r3d |
| movd m1, r2d |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w32_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s32c: |
| mova m1, m0 |
| mova m2, m0 |
| mova m3, m0 |
| .s32: |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*0+16*1], m1 |
| mova [dstq+strideq*0+16*2], m2 |
| mova [dstq+strideq*0+16*3], m3 |
| mova [dstq+strideq*1+16*0], m0 |
| mova [dstq+strideq*1+16*1], m1 |
| mova [dstq+strideq*1+16*2], m2 |
| mova [dstq+strideq*1+16*3], m3 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .s32 |
| RET |
| .h64: |
| mova m0, [tlq-128] |
| mova m1, [tlq-112] |
| paddw m0, [tlq- 96] |
| paddw m1, [tlq- 80] |
| paddw m0, [tlq- 64] |
| paddw m1, [tlq- 48] |
| paddw m0, [tlq- 32] |
| paddw m1, [tlq- 16] |
| paddw m0, m1 |
| jmp wq |
| .w64: |
| movu m1, [tlq+ 2] |
| movu m2, [tlq+ 18] |
| paddw m1, m2 |
| movu m2, [tlq+ 34] |
| paddw m0, m2 |
| movu m2, [tlq+ 50] |
| paddw m1, m2 |
| movu m2, [tlq+ 66] |
| paddw m0, m2 |
| movu m2, [tlq+ 82] |
| paddw m1, m2 |
| movu m2, [tlq+ 98] |
| paddw m0, m2 |
| movu m2, [tlq+114] |
| paddw m1, m2 |
| paddw m0, m1 |
| punpcklwd m1, m0, m3 |
| punpckhwd m0, m3 |
| paddd m0, m1 |
| paddd m4, m0 |
| punpckhqdq m0, m0 |
| paddd m0, m4 |
| pshuflw m1, m0, q1032 |
| paddd m0, m1 |
| psrld m0, m5 |
| cmp hd, 64 |
| je .w64_end |
| mov r2d, 0xAAAB |
| mov r3d, 0x6667 |
| cmp hd, 16 |
| cmove r2d, r3d |
| movd m1, r2d |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w64_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s64: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m0 |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m0 |
| mova [dstq+16*4], m0 |
| mova [dstq+16*5], m0 |
| mova [dstq+16*6], m0 |
| mova [dstq+16*7], m0 |
| add dstq, strideq |
| dec hd |
| jg .s64 |
| RET |
| |
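; dc_128: fill with the midpoint of the pixel range. bitdepth_max (r8m)
; is 0x3ff or 0xfff, so >> 11 gives 0 or 1 and selects pw_512 or pw_2048
; (adjacent in the data section).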
| cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 |
| mov r6d, r8m |
| LEA r5, ipred_dc_128_16bpc_ssse3_table |
| tzcnt wd, wm |
| shr r6d, 11 |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
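; v: copy the top edge to every row; w4-w32 reuse the dc-splat store
; loops, w64 gets its own 8-register loop.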
| cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 |
| LEA r5, ipred_dc_splat_16bpc_ssse3_table |
| movifnidn hd, hm |
| movu m0, [tlq+ 2] |
| movu m1, [tlq+ 18] |
| movu m2, [tlq+ 34] |
| movu m3, [tlq+ 50] |
| cmp wd, 64 |
| je .w64 |
| tzcnt wd, wd |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w64: |
| WIN64_SPILL_XMM 8 |
| movu m4, [tlq+ 66] |
| movu m5, [tlq+ 82] |
| movu m6, [tlq+ 98] |
| movu m7, [tlq+114] |
| .w64_loop: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| mova [dstq+16*2], m2 |
| mova [dstq+16*3], m3 |
| mova [dstq+16*4], m4 |
| mova [dstq+16*5], m5 |
| mova [dstq+16*6], m6 |
| mova [dstq+16*7], m7 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| |
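; h: broadcast one left-edge pixel per row. pb_0_1 and pb_2_3 are pshufb
; masks that splat the first or the second word of a register.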
| cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 |
| %define base r5-ipred_h_16bpc_ssse3_table |
| tzcnt wd, wm |
| LEA r5, ipred_h_16bpc_ssse3_table |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| movddup m2, [base+pb_0_1] |
| movddup m3, [base+pb_2_3] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4: |
| sub tlq, 8 |
| movq m3, [tlq] |
| pshuflw m0, m3, q3333 |
| pshuflw m1, m3, q2222 |
| pshuflw m2, m3, q1111 |
| pshuflw m3, m3, q0000 |
| movq [dstq+strideq*0], m0 |
| movq [dstq+strideq*1], m1 |
| movq [dstq+strideq*2], m2 |
| movq [dstq+stride3q ], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4 |
| RET |
| .w8: |
| sub tlq, 8 |
| movq m3, [tlq] |
| punpcklwd m3, m3 |
| pshufd m0, m3, q3333 |
| pshufd m1, m3, q2222 |
| pshufd m2, m3, q1111 |
| pshufd m3, m3, q0000 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+stride3q ], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8 |
| RET |
| .w16: |
| sub tlq, 4 |
| movd m1, [tlq] |
| pshufb m0, m1, m3 |
| pshufb m1, m2 |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*0+16*1], m0 |
| mova [dstq+strideq*1+16*0], m1 |
| mova [dstq+strideq*1+16*1], m1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16 |
| RET |
| .w32: |
| sub tlq, 4 |
| movd m1, [tlq] |
| pshufb m0, m1, m3 |
| pshufb m1, m2 |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*0+16*1], m0 |
| mova [dstq+strideq*0+16*2], m0 |
| mova [dstq+strideq*0+16*3], m0 |
| mova [dstq+strideq*1+16*0], m1 |
| mova [dstq+strideq*1+16*1], m1 |
| mova [dstq+strideq*1+16*2], m1 |
| mova [dstq+strideq*1+16*3], m1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w32 |
| RET |
| .w64: |
| sub tlq, 2 |
| movd m0, [tlq] |
| pshufb m0, m2 |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m0 |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m0 |
| mova [dstq+16*4], m0 |
| mova [dstq+16*5], m0 |
| mova [dstq+16*6], m0 |
| mova [dstq+16*7], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64 |
| RET |
| |
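; paeth: per pixel, pick whichever of left/top/topleft is closest to
; base = left + top - topleft.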
| cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left |
| movifnidn hd, hm |
| pshuflw m4, [tlq], q0000 |
| mov leftq, tlq |
| add hd, hd |
| punpcklqdq m4, m4 ; topleft |
| sub leftq, hq |
and wd, ~7 ; wd == 0 iff w == 4
| jnz .w8 |
| movddup m5, [tlq+2] ; top |
| psubw m6, m5, m4 |
| pabsw m7, m6 |
| .w4_loop: |
| movd m1, [leftq+hq-4] |
| punpcklwd m1, m1 |
| punpckldq m1, m1 ; left |
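; PAETH expects m1 = left, m4 = topleft, m5 = top, m6 = top - topleft and
; m7 = |top - topleft| (which equals |base - left|).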
| %macro PAETH 0 |
| paddw m0, m6, m1 |
| psubw m2, m4, m0 ; tldiff |
| psubw m0, m5 ; tdiff |
| pabsw m2, m2 |
| pabsw m0, m0 |
| pminsw m2, m0 |
| pcmpeqw m0, m2 |
| pand m3, m5, m0 |
| pandn m0, m4 |
| por m0, m3 |
| pcmpgtw m3, m7, m2 |
| pand m0, m3 |
| pandn m3, m1 |
| por m0, m3 |
| %endmacro |
| PAETH |
| movhps [dstq+strideq*0], m0 |
| movq [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2*2 |
| jg .w4_loop |
| RET |
| .w8: |
| %if ARCH_X86_32 |
| PUSH r6 |
| %define r7d hm |
| %assign regs_used 7 |
| %elif WIN64 |
| movaps r4m, m8 |
| PUSH r7 |
| %assign regs_used 8 |
| %endif |
| %if ARCH_X86_64 |
| movddup m8, [pb_0_1] |
| %endif |
| lea tlq, [tlq+wq*2+2] |
| neg wq |
| mov r7d, hd |
| .w8_loop0: |
| movu m5, [tlq+wq*2] |
| mov r6, dstq |
| add dstq, 16 |
| psubw m6, m5, m4 |
| pabsw m7, m6 |
| .w8_loop: |
| movd m1, [leftq+hq-2] |
| %if ARCH_X86_64 |
| pshufb m1, m8 |
| %else |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| %endif |
| PAETH |
| mova [r6], m0 |
| add r6, strideq |
| sub hd, 1*2 |
| jg .w8_loop |
| mov hd, r7d |
| add wq, 8 |
| jl .w8_loop0 |
| %if WIN64 |
| movaps m8, r4m |
| %endif |
| RET |
| |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 7 |
| %else |
| DECLARE_REG_TMP 4 |
| %endif |
| |
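; smooth_v: pred = bottom + weight*(top - bottom). The 1d weight table is
; presumably stored pre-scaled so that pmulhrsw implements the /256 from
; the spec; the addressing below relies on the weights for size n starting
; at byte offset n*2.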
| cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights |
| LEA weightsq, smooth_weights_1d_16bpc |
| mov hd, hm |
| lea weightsq, [weightsq+hq*4] |
| neg hq |
| movd m5, [tlq+hq*2] ; bottom |
| pshuflw m5, m5, q0000 |
| punpcklqdq m5, m5 |
| cmp wd, 4 |
| jne .w8 |
| movddup m4, [tlq+2] ; top |
| lea r3, [strideq*3] |
| psubw m4, m5 ; top - bottom |
| .w4_loop: |
| movq m1, [weightsq+hq*2] |
| punpcklwd m1, m1 |
| pshufd m0, m1, q1100 |
| punpckhdq m1, m1 |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| paddw m0, m5 |
| paddw m1, m5 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| movq [dstq+strideq*2], m1 |
| movhps [dstq+r3 ], m1 |
| lea dstq, [dstq+strideq*4] |
| add hq, 4 |
| jl .w4_loop |
| RET |
| .w8: |
| %if ARCH_X86_32 |
| PUSH r6 |
| %assign regs_used 7 |
| mov hm, hq |
| %define hq hm |
| %elif WIN64 |
| PUSH r7 |
| %assign regs_used 8 |
| %endif |
| .w8_loop0: |
| mov t0, hq |
| movu m4, [tlq+2] |
| add tlq, 16 |
| mov r6, dstq |
| add dstq, 16 |
| psubw m4, m5 |
| .w8_loop: |
| movq m3, [weightsq+t0*2] |
| punpcklwd m3, m3 |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| REPX {paddw x, m5}, m0, m1, m2, m3 |
| mova [r6+strideq*0], m0 |
| mova [r6+strideq*1], m1 |
| lea r6, [r6+strideq*2] |
| mova [r6+strideq*0], m2 |
| mova [r6+strideq*1], m3 |
| lea r6, [r6+strideq*2] |
| add t0, 4 |
| jl .w8_loop |
| sub wd, 8 |
| jg .w8_loop0 |
| RET |
| |
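; smooth_h: the same blend horizontally, between each left-edge pixel and
; the top-right pixel ("right"), with per-column weights; left pixels are
; read downwards from tl.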
| cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights |
| LEA weightsq, smooth_weights_1d_16bpc |
| mov wd, wm |
| movifnidn hd, hm |
| movd m5, [tlq+wq*2] ; right |
| sub tlq, 8 |
| add hd, hd |
| pshuflw m5, m5, q0000 |
| sub tlq, hq |
| punpcklqdq m5, m5 |
| cmp wd, 4 |
| jne .w8 |
| movddup m4, [weightsq+4*2] |
| lea r3, [strideq*3] |
| .w4_loop: |
| movq m1, [tlq+hq] ; left |
| punpcklwd m1, m1 |
| psubw m1, m5 ; left - right |
| pshufd m0, m1, q3322 |
| punpckldq m1, m1 |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| paddw m0, m5 |
| paddw m1, m5 |
| movhps [dstq+strideq*0], m0 |
| movq [dstq+strideq*1], m0 |
| movhps [dstq+strideq*2], m1 |
| movq [dstq+r3 ], m1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4*2 |
| jg .w4_loop |
| RET |
| .w8: |
| lea weightsq, [weightsq+wq*4] |
| neg wq |
| %if ARCH_X86_32 |
| PUSH r6 |
| %assign regs_used 7 |
| %define hd hm |
| %elif WIN64 |
| PUSH r7 |
| %assign regs_used 8 |
| %endif |
| .w8_loop0: |
| mov t0d, hd |
| mova m4, [weightsq+wq*2] |
| mov r6, dstq |
| add dstq, 16 |
| .w8_loop: |
| movq m3, [tlq+t0*(1+ARCH_X86_32)] |
| punpcklwd m3, m3 |
| psubw m3, m5 |
| pshufd m0, m3, q3333 |
| pshufd m1, m3, q2222 |
| pshufd m2, m3, q1111 |
| pshufd m3, m3, q0000 |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| REPX {paddw x, m5}, m0, m1, m2, m3 |
| mova [r6+strideq*0], m0 |
| mova [r6+strideq*1], m1 |
| lea r6, [r6+strideq*2] |
| mova [r6+strideq*0], m2 |
| mova [r6+strideq*1], m3 |
| lea r6, [r6+strideq*2] |
| sub t0d, 4*(1+ARCH_X86_64) |
| jg .w8_loop |
| add wq, 8 |
| jl .w8_loop0 |
| RET |
| |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 10 |
| %else |
| DECLARE_REG_TMP 3 |
| %endif |
| |
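; smooth: 2d blend. The 2d weight table stores (w, 256-w) pairs, so a
; single pmaddwd against interleaved (top, bottom) or (left, right) does a
; whole linear interpolation; the two halves are summed, shifted right by
; 8, and a final pavgw provides the rounded /2.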
| cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ |
| h_weights, v_weights, top |
| LEA h_weightsq, smooth_weights_2d_16bpc |
| mov wd, wm |
| mov hd, hm |
| movd m7, [tlq+wq*2] ; right |
| lea v_weightsq, [h_weightsq+hq*8] |
| neg hq |
| movd m6, [tlq+hq*2] ; bottom |
| pshuflw m7, m7, q0000 |
| pshuflw m6, m6, q0000 |
| cmp wd, 4 |
| jne .w8 |
| movq m4, [tlq+2] ; top |
| mova m5, [h_weightsq+4*4] |
| punpcklwd m4, m6 ; top, bottom |
| pxor m6, m6 |
| .w4_loop: |
| movq m1, [v_weightsq+hq*4] |
| sub tlq, 4 |
| movd m3, [tlq] ; left |
| pshufd m0, m1, q0000 |
| pshufd m1, m1, q1111 |
| pmaddwd m0, m4 |
| punpcklwd m3, m7 ; left, right |
| pmaddwd m1, m4 |
| pshufd m2, m3, q1111 |
| pshufd m3, m3, q0000 |
| pmaddwd m2, m5 |
| pmaddwd m3, m5 |
| paddd m0, m2 |
| paddd m1, m3 |
| psrld m0, 8 |
| psrld m1, 8 |
| packssdw m0, m1 |
| pavgw m0, m6 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| add hq, 2 |
| jl .w4_loop |
| RET |
| .w8: |
| %if ARCH_X86_32 |
| lea h_weightsq, [h_weightsq+wq*4] |
| mov t0, tlq |
| mov r1m, tlq |
| mov r2m, hq |
| %define m8 [h_weightsq+16*0] |
| %define m9 [h_weightsq+16*1] |
| %else |
| %if WIN64 |
| movaps r4m, m8 |
| movaps r6m, m9 |
| PUSH r7 |
| PUSH r8 |
| %endif |
| PUSH r9 |
| PUSH r10 |
| %assign regs_used 11 |
| lea h_weightsq, [h_weightsq+wq*8] |
| lea topq, [tlq+wq*2] |
| neg wq |
| mov r8, tlq |
| mov r9, hq |
| %endif |
| punpcklqdq m6, m6 |
| .w8_loop0: |
| %if ARCH_X86_32 |
| movu m5, [t0+2] |
| add t0, 16 |
| mov r0m, t0 |
| %else |
| movu m5, [topq+wq*2+2] |
| mova m8, [h_weightsq+wq*4+16*0] |
| mova m9, [h_weightsq+wq*4+16*1] |
| %endif |
| mov t0, dstq |
| add dstq, 16 |
| punpcklwd m4, m5, m6 |
| punpckhwd m5, m6 |
| .w8_loop: |
| movd m1, [v_weightsq+hq*4] |
| sub tlq, 2 |
| movd m3, [tlq] ; left |
| pshufd m1, m1, q0000 |
| pmaddwd m0, m4, m1 |
| pshuflw m3, m3, q0000 |
| pmaddwd m1, m5 |
| punpcklwd m3, m7 ; left, right |
| pmaddwd m2, m8, m3 |
| pmaddwd m3, m9 |
| paddd m0, m2 |
| paddd m1, m3 |
| psrld m0, 8 |
| psrld m1, 8 |
| packssdw m0, m1 |
| pxor m1, m1 |
| pavgw m0, m1 |
| mova [t0], m0 |
| add t0, strideq |
| inc hq |
| jl .w8_loop |
| %if ARCH_X86_32 |
| mov t0, r0m |
| mov tlq, r1m |
| add h_weightsq, 16*2 |
| mov hq, r2m |
| sub dword wm, 8 |
| jg .w8_loop0 |
| %else |
| mov tlq, r8 |
| mov hq, r9 |
| add wq, 8 |
| jl .w8_loop0 |
| %endif |
| %if WIN64 |
| movaps m8, r4m |
| movaps m9, r6m |
| %endif |
| RET |
| |
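; filter: recursive filter intra. Each 4x2 block is predicted from 7
; neighbors (tl, t0-t3, l0, l1) with per-mode taps; the left column is
; processed top to bottom first, then the remaining 4-pixel columns left
; to right, each block reading neighbors from already-written output.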
| %if ARCH_X86_64 |
| cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter |
| %else |
| cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter |
| %define m8 [esp+16*0] |
| %define m9 [esp+16*1] |
| %define m10 [esp+16*2] |
| %define m11 [esp+16*3] |
| %define m12 [esp+16*4] |
| %define m13 [esp+16*5] |
| %define m14 [esp+16*6] |
| %define m15 [esp+16*7] |
| %endif |
| %define base r6-$$ |
| movifnidn hd, hm |
| movd m6, r8m ; bitdepth_max |
| %ifidn filterd, filterm |
| movzx filterd, filterb |
| %else |
| movzx filterd, byte filterm |
| %endif |
| LEA r6, $$ |
| shl filterd, 6 |
| movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 |
| mova m1, [base+filter_intra_taps+filterq+16*0] |
| mova m2, [base+filter_intra_taps+filterq+16*1] |
| mova m3, [base+filter_intra_taps+filterq+16*2] |
| mova m4, [base+filter_intra_taps+filterq+16*3] |
| pxor m5, m5 |
| %if ARCH_X86_64 |
| punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper |
| punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid |
| punpcklbw m10, m5, m2 ; having to perform sign-extension. |
| punpckhbw m11, m5, m2 |
| punpcklbw m12, m5, m3 |
| punpckhbw m13, m5, m3 |
| punpcklbw m14, m5, m4 |
| punpckhbw m15, m5, m4 |
| %else |
| punpcklbw m7, m5, m1 |
| mova m8, m7 |
| punpckhbw m7, m5, m1 |
| mova m9, m7 |
| punpcklbw m7, m5, m2 |
| mova m10, m7 |
| punpckhbw m7, m5, m2 |
| mova m11, m7 |
| punpcklbw m7, m5, m3 |
| mova m12, m7 |
| punpckhbw m7, m5, m3 |
| mova m13, m7 |
| punpcklbw m7, m5, m4 |
| mova m14, m7 |
| punpckhbw m7, m5, m4 |
| mova m15, m7 |
| %endif |
| mova m7, [base+filter_shuf] |
| add hd, hd |
| mov r5, dstq |
| pshuflw m6, m6, q0000 |
| mov r6, tlq |
| punpcklqdq m6, m6 |
| sub tlq, hq |
| .left_loop: |
| pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ |
| pshufd m1, m0, q0000 |
| pmaddwd m2, m8, m1 |
| pmaddwd m1, m9 |
| pshufd m4, m0, q1111 |
| pmaddwd m3, m10, m4 |
| pmaddwd m4, m11 |
| paddd m2, m3 |
| paddd m1, m4 |
| pshufd m4, m0, q2222 |
| pmaddwd m3, m12, m4 |
| pmaddwd m4, m13 |
| paddd m2, m3 |
| paddd m1, m4 |
| pshufd m3, m0, q3333 |
| pmaddwd m0, m14, m3 |
| pmaddwd m3, m15 |
| paddd m0, m2 |
| paddd m1, m3 |
psrad m0, 11 ; x >> 3 (taps are scaled by 1<<8)
| psrad m1, 11 |
| packssdw m0, m1 |
| pmaxsw m0, m5 |
| pavgw m0, m5 ; (x + 8) >> 4 |
| pminsw m0, m6 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| movlps m0, [tlq+hq-10] |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2*2 |
| jg .left_loop |
| sub wd, 4 |
| jz .end |
sub tld, r6d ; tld = -h*2 (r6 still holds the original tl)
| sub r6, r5 ; tl-dst |
| .right_loop0: |
| add r5, 8 |
| mov hd, tld |
| movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ |
| mov dstq, r5 |
| .right_loop: |
| pshufd m2, m0, q0000 |
| pmaddwd m1, m8, m2 |
| pmaddwd m2, m9 |
| pshufd m4, m0, q1111 |
| pmaddwd m3, m10, m4 |
| pmaddwd m4, m11 |
| pinsrw m0, [dstq+strideq*0-2], 5 |
| paddd m1, m3 |
| paddd m2, m4 |
| pshufd m0, m0, q2222 |
| movddup m4, [dstq+strideq*1-8] |
| pmaddwd m3, m12, m0 |
| pmaddwd m0, m13 |
| paddd m1, m3 |
| paddd m0, m2 |
| pshuflw m2, m4, q3333 |
| punpcklwd m2, m5 |
| pmaddwd m3, m14, m2 |
| pmaddwd m2, m15 |
| paddd m1, m3 |
| paddd m0, m2 |
| psrad m1, 11 |
| psrad m0, 11 |
| packssdw m0, m1 |
| pmaxsw m0, m5 |
| pavgw m0, m5 |
| pminsw m0, m6 |
| movhps [dstq+strideq*0], m0 |
| movq [dstq+strideq*1], m0 |
| palignr m0, m4, 14 |
| lea dstq, [dstq+strideq*2] |
| add hd, 2*2 |
| jl .right_loop |
| sub wd, 4 |
| jg .right_loop0 |
| .end: |
| RET |
| |
| %if UNIX64 |
| DECLARE_REG_TMP 7 |
| %else |
| DECLARE_REG_TMP 5 |
| %endif |
| |
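; cfl_top/cfl_left: dc from a single edge, then the shared .start path
; sets up m7 = bitdepth_max as the clamp limit and falls into the
; cfl-splat store loops.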
| cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac |
| LEA t0, ipred_cfl_left_16bpc_ssse3_table |
| movd m4, wd |
| tzcnt wd, wd |
| movifnidn hd, hm |
| add tlq, 2 |
| movsxd r6, [t0+wq*4] |
| movd m5, wd |
| jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) |
| |
| cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha |
| movifnidn hd, hm |
| LEA t0, ipred_cfl_left_16bpc_ssse3_table |
| tzcnt wd, wm |
| lea r6d, [hq*2] |
| movd m4, hd |
| sub tlq, r6 |
| tzcnt r6d, hd |
| movd m5, r6d |
| movsxd r6, [t0+r6*4] |
| .start: |
| movd m7, r7m |
| movu m0, [tlq] |
| add r6, t0 |
| add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table |
| movsxd wq, [t0+wq*4] |
| pxor m6, m6 |
| pshuflw m7, m7, q0000 |
pcmpeqw m3, m3 ; all words -1: pmaddwd by m3 negates the pair sums
| add wq, t0 |
| movifnidn acq, acmp |
| pavgw m4, m6 |
| punpcklqdq m7, m7 |
| jmp r6 |
| .h32: |
| movu m1, [tlq+48] |
| movu m2, [tlq+32] |
| paddw m0, m1 |
| paddw m0, m2 |
| .h16: |
| movu m1, [tlq+16] |
| paddw m0, m1 |
| .h8: |
| pshufd m1, m0, q1032 |
| paddw m0, m1 |
| .h4: |
| pmaddwd m0, m3 |
| psubd m4, m0 |
| pshuflw m0, m4, q1032 |
| paddd m0, m4 |
| psrld m0, m5 |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| jmp wq |
| |
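; IPRED_CFL: dst = clamp(dc + alpha*ac >> 6, 0, bitdepth_max). m2 holds
; |alpha| << 9 so pmulhrsw computes (|ac|*|alpha| + 32) >> 6; the two
; psignw apply sign(alpha)*sign(ac) to the product. m0 = dc, m6 = 0,
; m7 = max.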
| %macro IPRED_CFL 2 ; dst, src |
| pabsw m%1, m%2 |
| pmulhrsw m%1, m2 |
| psignw m%2, m1 |
| psignw m%1, m%2 |
| paddw m%1, m0 |
| pmaxsw m%1, m6 |
| pminsw m%1, m7 |
| %endmacro |
| |
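; cfl: the same edge averaging as ipred_dc, with the alpha*ac term applied
; by IPRED_CFL in the .s* loops.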
| cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha |
| movifnidn hd, hm |
| tzcnt r6d, hd |
| lea t0d, [wq+hq] |
| movd m4, t0d |
| tzcnt t0d, t0d |
| movd m5, t0d |
| LEA t0, ipred_cfl_16bpc_ssse3_table |
| tzcnt wd, wd |
| movd m7, r7m |
| movsxd r6, [t0+r6*4] |
| movsxd wq, [t0+wq*4+4*4] |
| psrlw m4, 1 |
| pxor m6, m6 |
| pshuflw m7, m7, q0000 |
| add r6, t0 |
| add wq, t0 |
| movifnidn acq, acmp |
pcmpeqw m3, m3 ; -1 per word, for pmaddwd negation
| punpcklqdq m7, m7 |
| jmp r6 |
| .h4: |
| movq m0, [tlq-8] |
| jmp wq |
| .w4: |
| movq m1, [tlq+2] |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psubd m4, m0 |
| pshufd m0, m4, q1032 |
| paddd m0, m4 |
| pshuflw m4, m0, q1032 |
| paddd m0, m4 |
| cmp hd, 4 |
| jg .w4_mul |
| psrld m0, 3 |
| jmp .w4_end |
| .w4_mul: |
| mov r6d, 0xAAAB |
| mov r2d, 0x6667 |
| cmp hd, 16 |
| cmove r6d, r2d |
| movd m1, r6d |
| psrld m0, 2 |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w4_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s4: |
| movd m1, alpham |
| lea r6, [strideq*3] |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s4_loop: |
| mova m4, [acq+16*0] |
| mova m5, [acq+16*1] |
| add acq, 16*2 |
| IPRED_CFL 3, 4 |
| IPRED_CFL 4, 5 |
| movq [dstq+strideq*0], m3 |
| movhps [dstq+strideq*1], m3 |
| movq [dstq+strideq*2], m4 |
| movhps [dstq+r6 ], m4 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s4_loop |
| RET |
| .h8: |
| mova m0, [tlq-16] |
| jmp wq |
| .w8: |
| movu m1, [tlq+2] |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psubd m4, m0 |
| pshufd m0, m4, q1032 |
| paddd m0, m4 |
| pshuflw m4, m0, q1032 |
| paddd m0, m4 |
| psrld m0, m5 |
| cmp hd, 8 |
| je .w8_end |
| mov r6d, 0xAAAB |
| mov r2d, 0x6667 |
| cmp hd, 32 |
| cmove r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w8_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s8: |
| movd m1, alpham |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s8_loop: |
| mova m4, [acq+16*0] |
| mova m5, [acq+16*1] |
| add acq, 16*2 |
| IPRED_CFL 3, 4 |
| IPRED_CFL 4, 5 |
| mova [dstq+strideq*0], m3 |
| mova [dstq+strideq*1], m4 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .s8_loop |
| RET |
| .h16: |
| mova m0, [tlq-32] |
| paddw m0, [tlq-16] |
| jmp wq |
| .w16: |
| movu m1, [tlq+ 2] |
| movu m2, [tlq+18] |
| paddw m1, m2 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psubd m4, m0 |
| pshufd m0, m4, q1032 |
| paddd m0, m4 |
| pshuflw m4, m0, q1032 |
| paddd m0, m4 |
| psrld m0, m5 |
| cmp hd, 16 |
| je .w16_end |
| mov r6d, 0xAAAB |
| mov r2d, 0x6667 |
| test hd, 8|32 |
| cmovz r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w16_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s16: |
| movd m1, alpham |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s16_loop: |
| mova m4, [acq+16*0] |
| mova m5, [acq+16*1] |
| add acq, 16*2 |
| IPRED_CFL 3, 4 |
| IPRED_CFL 4, 5 |
| mova [dstq+16*0], m3 |
| mova [dstq+16*1], m4 |
| add dstq, strideq |
| dec hd |
| jg .s16_loop |
| RET |
| .h32: |
| mova m0, [tlq-64] |
| paddw m0, [tlq-48] |
| paddw m0, [tlq-32] |
| paddw m0, [tlq-16] |
| jmp wq |
| .w32: |
| movu m1, [tlq+ 2] |
| movu m2, [tlq+18] |
| paddw m1, m2 |
| movu m2, [tlq+34] |
| paddw m1, m2 |
| movu m2, [tlq+50] |
| paddw m1, m2 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psubd m4, m0 |
| pshufd m0, m4, q1032 |
| paddd m0, m4 |
| pshuflw m4, m0, q1032 |
| paddd m0, m4 |
| psrld m0, m5 |
| cmp hd, 32 |
| je .w32_end |
| mov r6d, 0xAAAB |
| mov r2d, 0x6667 |
| cmp hd, 8 |
| cmove r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| psrlw m0, 1 |
| .w32_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s32: |
| movd m1, alpham |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s32_loop: |
| mova m4, [acq+16*0] |
| mova m5, [acq+16*1] |
| IPRED_CFL 3, 4 |
| IPRED_CFL 4, 5 |
| mova [dstq+16*0], m3 |
| mova [dstq+16*1], m4 |
| mova m4, [acq+16*2] |
| mova m5, [acq+16*3] |
| add acq, 16*4 |
| IPRED_CFL 3, 4 |
| IPRED_CFL 4, 5 |
| mova [dstq+16*2], m3 |
| mova [dstq+16*3], m4 |
| add dstq, strideq |
| dec hd |
| jg .s32_loop |
| RET |
| |
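; cfl_128: dc is the midpoint constant, otherwise the same splat path.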
| cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac |
| tzcnt wd, wm |
| LEA t0, ipred_cfl_splat_16bpc_ssse3_table |
| mov r6d, r7m |
| movifnidn hd, hm |
| shr r6d, 11 |
| movd m7, r7m |
| movsxd wq, [t0+wq*4] |
| movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] |
| pshuflw m7, m7, q0000 |
| pxor m6, m6 |
| add wq, t0 |
| movifnidn acq, acmp |
| punpcklqdq m7, m7 |
| jmp wq |
| |
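; cfl_ac_420: sum 2x2 luma blocks into the ac buffer; pmaddwd by pw_2 plus
; the vertical add makes each output word 2*(2x2 sum), i.e. the average
; << 3. Right/bottom padding replicates the last column/row, and the
; shared .dc tail subtracts the mean.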
| cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h |
| movifnidn hpadd, hpadm |
| %if ARCH_X86_32 && PIC |
| pcmpeqw m5, m5 |
| pabsw m5, m5 |
| paddw m5, m5 |
| %else |
| movddup m5, [pw_2] |
| %endif |
| mov hd, hm |
| shl hpadd, 2 |
| pxor m4, m4 |
| sub hd, hpadd |
| cmp dword wm, 8 |
| mov r5, acq |
| jg .w16 |
| je .w8 |
| lea r3, [strideq*3] |
| .w4_loop: |
| pmaddwd m0, m5, [ypxq+strideq*0] |
| pmaddwd m1, m5, [ypxq+strideq*1] |
| pmaddwd m2, m5, [ypxq+strideq*2] |
| pmaddwd m3, m5, [ypxq+r3 ] |
| lea ypxq, [ypxq+strideq*4] |
| paddd m0, m1 |
| paddd m2, m3 |
| paddd m4, m0 |
| packssdw m0, m2 |
| paddd m4, m2 |
| mova [acq], m0 |
| add acq, 16 |
| sub hd, 2 |
| jg .w4_loop |
| test hpadd, hpadd |
| jz .dc |
| punpckhqdq m0, m0 |
| pslld m2, 2 |
| .w4_hpad: |
| mova [acq+16*0], m0 |
| paddd m4, m2 |
| mova [acq+16*1], m0 |
| add acq, 16*2 |
| sub hpadd, 4 |
| jg .w4_hpad |
| jmp .dc |
| .w8: |
| %if ARCH_X86_32 |
| cmp dword wpadm, 0 |
| %else |
| test wpadd, wpadd |
| %endif |
| jnz .w8_wpad1 |
| .w8_loop: |
| pmaddwd m0, m5, [ypxq+strideq*0+16*0] |
| pmaddwd m2, m5, [ypxq+strideq*1+16*0] |
| pmaddwd m1, m5, [ypxq+strideq*0+16*1] |
| pmaddwd m3, m5, [ypxq+strideq*1+16*1] |
| lea ypxq, [ypxq+strideq*2] |
| paddd m0, m2 |
| paddd m1, m3 |
| paddd m2, m0, m1 |
| packssdw m0, m1 |
| paddd m4, m2 |
| mova [acq], m0 |
| add acq, 16 |
| dec hd |
| jg .w8_loop |
| .w8_hpad: |
| test hpadd, hpadd |
| jz .dc |
| pslld m2, 2 |
| mova m1, m0 |
| jmp .hpad |
| .w8_wpad1: |
| pmaddwd m0, m5, [ypxq+strideq*0] |
| pmaddwd m1, m5, [ypxq+strideq*1] |
| lea ypxq, [ypxq+strideq*2] |
| paddd m0, m1 |
| pshufd m1, m0, q3333 |
| paddd m2, m0, m1 |
| packssdw m0, m1 |
| paddd m4, m2 |
| mova [acq], m0 |
| add acq, 16 |
| dec hd |
| jg .w8_wpad1 |
| jmp .w8_hpad |
| .w16_wpad3: |
| pshufd m3, m0, q3333 |
| mova m1, m3 |
| mova m2, m3 |
| jmp .w16_wpad_end |
| .w16_wpad2: |
| pshufd m1, m3, q3333 |
| mova m2, m1 |
| jmp .w16_wpad_end |
| .w16_wpad1: |
| pshufd m2, m1, q3333 |
| jmp .w16_wpad_end |
| .w16: |
| movifnidn wpadd, wpadm |
| WIN64_SPILL_XMM 7 |
| .w16_loop: |
| pmaddwd m0, m5, [ypxq+strideq*0+16*0] |
| pmaddwd m6, m5, [ypxq+strideq*1+16*0] |
| paddd m0, m6 |
| cmp wpadd, 2 |
| jg .w16_wpad3 |
| pmaddwd m3, m5, [ypxq+strideq*0+16*1] |
| pmaddwd m6, m5, [ypxq+strideq*1+16*1] |
| paddd m3, m6 |
| je .w16_wpad2 |
| pmaddwd m1, m5, [ypxq+strideq*0+16*2] |
| pmaddwd m6, m5, [ypxq+strideq*1+16*2] |
| paddd m1, m6 |
jp .w16_wpad1 ; wpad == 1 (parity flag from the cmp with 2 above)
| pmaddwd m2, m5, [ypxq+strideq*0+16*3] |
| pmaddwd m6, m5, [ypxq+strideq*1+16*3] |
| paddd m2, m6 |
| .w16_wpad_end: |
| lea ypxq, [ypxq+strideq*2] |
| paddd m6, m0, m3 |
| packssdw m0, m3 |
| paddd m6, m1 |
| mova [acq+16*0], m0 |
| packssdw m1, m2 |
| paddd m2, m6 |
| mova [acq+16*1], m1 |
| add acq, 16*2 |
| paddd m4, m2 |
| dec hd |
| jg .w16_loop |
| WIN64_RESTORE_XMM |
| add hpadd, hpadd |
| jz .dc |
| paddd m2, m2 |
| .hpad: |
| mova [acq+16*0], m0 |
| mova [acq+16*1], m1 |
| paddd m4, m2 |
| mova [acq+16*2], m0 |
| mova [acq+16*3], m1 |
| add acq, 16*4 |
| sub hpadd, 4 |
| jg .hpad |
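; Shared mean-subtraction tail: r5 = -(ac buffer size in bytes) = -w*h*2,
; a power of two, so tzcnt gives its log2; the -2, the shift and the final
; pavgw together form a rounded sum/(w*h).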
| .dc: |
| sub r5, acq ; -w*h*2 |
| pshufd m2, m4, q1032 |
| tzcnt r1d, r5d |
| paddd m2, m4 |
| sub r1d, 2 |
| pshufd m4, m2, q2301 |
| movd m0, r1d |
| paddd m2, m4 |
| psrld m2, m0 |
| pxor m0, m0 |
| pavgw m2, m0 |
| packssdw m2, m2 |
| .dc_loop: |
| mova m0, [acq+r5+16*0] |
| mova m1, [acq+r5+16*1] |
| psubw m0, m2 |
| psubw m1, m2 |
| mova [acq+r5+16*0], m0 |
| mova [acq+r5+16*1], m1 |
| add r5, 16*2 |
| jl .dc_loop |
| RET |
| |
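; cfl_ac_422: only horizontal subsampling, so single rows are summed in
; pairs with pw_4 (again average << 3), keeping the .dc scaling identical.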
| cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h |
| movifnidn hpadd, hpadm |
| %if ARCH_X86_32 && PIC |
| pcmpeqw m5, m5 |
| pabsw m5, m5 |
| psllw m5, 2 |
| %else |
| movddup m5, [pw_4] |
| %endif |
| mov hd, hm |
| shl hpadd, 2 |
| pxor m4, m4 |
| sub hd, hpadd |
| cmp dword wm, 8 |
| mov r5, acq |
| jg .w16 |
| je .w8 |
| lea r3, [strideq*3] |
| .w4_loop: |
| pmaddwd m0, m5, [ypxq+strideq*0] |
| pmaddwd m3, m5, [ypxq+strideq*1] |
| pmaddwd m1, m5, [ypxq+strideq*2] |
| pmaddwd m2, m5, [ypxq+r3 ] |
| lea ypxq, [ypxq+strideq*4] |
| paddd m4, m0 |
| packssdw m0, m3 |
| paddd m3, m1 |
| packssdw m1, m2 |
| paddd m4, m2 |
| paddd m4, m3 |
| mova [acq+16*0], m0 |
| mova [acq+16*1], m1 |
| add acq, 16*2 |
| sub hd, 4 |
| jg .w4_loop |
| test hpadd, hpadd |
| jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| punpckhqdq m1, m1 |
| pslld m2, 3 |
| mova [acq+16*0], m1 |
| mova [acq+16*1], m1 |
| paddd m4, m2 |
| mova [acq+16*2], m1 |
| mova [acq+16*3], m1 |
| add acq, 16*4 |
| jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| .w8: |
| %if ARCH_X86_32 |
| cmp dword wpadm, 0 |
| %else |
| test wpadd, wpadd |
| %endif |
| jnz .w8_wpad1 |
| .w8_loop: |
| pmaddwd m0, m5, [ypxq+strideq*0+16*0] |
| pmaddwd m2, m5, [ypxq+strideq*0+16*1] |
| pmaddwd m1, m5, [ypxq+strideq*1+16*0] |
| pmaddwd m3, m5, [ypxq+strideq*1+16*1] |
| lea ypxq, [ypxq+strideq*2] |
| paddd m4, m0 |
| packssdw m0, m2 |
| paddd m4, m2 |
| mova [acq+16*0], m0 |
| paddd m2, m1, m3 |
| packssdw m1, m3 |
| paddd m4, m2 |
| mova [acq+16*1], m1 |
| add acq, 16*2 |
| sub hd, 2 |
| jg .w8_loop |
| .w8_hpad: |
| test hpadd, hpadd |
| jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| pslld m2, 2 |
| mova m0, m1 |
| jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad |
| .w8_wpad1: |
| pmaddwd m0, m5, [ypxq+strideq*0] |
| pmaddwd m1, m5, [ypxq+strideq*1] |
| lea ypxq, [ypxq+strideq*2] |
| pshufd m2, m0, q3333 |
| pshufd m3, m1, q3333 |
| paddd m4, m0 |
| packssdw m0, m2 |
| paddd m4, m2 |
| paddd m2, m1, m3 |
| packssdw m1, m3 |
| paddd m4, m2 |
| mova [acq+16*0], m0 |
| mova [acq+16*1], m1 |
| add acq, 16*2 |
| sub hd, 2 |
| jg .w8_wpad1 |
| jmp .w8_hpad |
| .w16_wpad3: |
| pshufd m3, m0, q3333 |
| mova m1, m3 |
| mova m2, m3 |
| jmp .w16_wpad_end |
| .w16_wpad2: |
| pshufd m1, m3, q3333 |
| mova m2, m1 |
| jmp .w16_wpad_end |
| .w16_wpad1: |
| pshufd m2, m1, q3333 |
| jmp .w16_wpad_end |
| .w16: |
| movifnidn wpadd, wpadm |
| WIN64_SPILL_XMM 7 |
| .w16_loop: |
| pmaddwd m0, m5, [ypxq+16*0] |
| cmp wpadd, 2 |
| jg .w16_wpad3 |
| pmaddwd m3, m5, [ypxq+16*1] |
| je .w16_wpad2 |
| pmaddwd m1, m5, [ypxq+16*2] |
jp .w16_wpad1 ; wpad == 1 (parity flag from the cmp above)
| pmaddwd m2, m5, [ypxq+16*3] |
| .w16_wpad_end: |
| add ypxq, strideq |
| paddd m6, m0, m3 |
| packssdw m0, m3 |
| mova [acq+16*0], m0 |
| paddd m6, m1 |
| packssdw m1, m2 |
| paddd m2, m6 |
| mova [acq+16*1], m1 |
| add acq, 16*2 |
| paddd m4, m2 |
| dec hd |
| jg .w16_loop |
| WIN64_RESTORE_XMM |
| add hpadd, hpadd |
| jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| paddd m2, m2 |
| jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad |
| |
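; cfl_ac_444: no subsampling; samples are copied << 3 and pw_1 pmaddwd
; accumulates the running sum for the mean.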
| cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h |
| %define base r6-ipred_cfl_ac_444_16bpc_ssse3_table |
| LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table |
| tzcnt wd, wm |
| movifnidn hpadd, hpadm |
| pxor m4, m4 |
| movsxd wq, [r6+wq*4] |
| movddup m5, [base+pw_1] |
| add wq, r6 |
| mov hd, hm |
| shl hpadd, 2 |
| sub hd, hpadd |
| jmp wq |
| .w4: |
| lea r3, [strideq*3] |
| mov r5, acq |
| .w4_loop: |
| movq m0, [ypxq+strideq*0] |
| movhps m0, [ypxq+strideq*1] |
| movq m1, [ypxq+strideq*2] |
| movhps m1, [ypxq+r3 ] |
| lea ypxq, [ypxq+strideq*4] |
| psllw m0, 3 |
| psllw m1, 3 |
| mova [acq+16*0], m0 |
| pmaddwd m0, m5 |
| mova [acq+16*1], m1 |
| pmaddwd m2, m5, m1 |
| add acq, 16*2 |
| paddd m4, m0 |
| paddd m4, m2 |
| sub hd, 4 |
| jg .w4_loop |
| test hpadd, hpadd |
| jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| punpckhqdq m1, m1 |
| mova [acq+16*0], m1 |
| pslld m2, 2 |
| mova [acq+16*1], m1 |
| punpckhqdq m2, m2 |
| mova [acq+16*2], m1 |
| paddd m4, m2 |
| mova [acq+16*3], m1 |
| add acq, 16*4 |
| jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| .w8: |
| mov r5, acq |
| .w8_loop: |
| mova m0, [ypxq+strideq*0] |
| mova m1, [ypxq+strideq*1] |
| lea ypxq, [ypxq+strideq*2] |
| psllw m0, 3 |
| psllw m1, 3 |
| mova [acq+16*0], m0 |
| pmaddwd m0, m5 |
| mova [acq+16*1], m1 |
| pmaddwd m2, m5, m1 |
| add acq, 16*2 |
| paddd m4, m0 |
| paddd m4, m2 |
| sub hd, 2 |
| jg .w8_loop |
| .w8_hpad: |
| test hpadd, hpadd |
| jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| pslld m2, 2 |
| mova m0, m1 |
| jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad |
| .w16_wpad2: |
| pshufhw m3, m2, q3333 |
| pshufhw m1, m0, q3333 |
| punpckhqdq m3, m3 |
| punpckhqdq m1, m1 |
| jmp .w16_wpad_end |
| .w16: |
| movifnidn wpadd, wpadm |
| mov r5, acq |
| .w16_loop: |
| mova m2, [ypxq+strideq*0+16*0] |
| mova m0, [ypxq+strideq*1+16*0] |
| psllw m2, 3 |
| psllw m0, 3 |
| test wpadd, wpadd |
| jnz .w16_wpad2 |
| mova m3, [ypxq+strideq*0+16*1] |
| mova m1, [ypxq+strideq*1+16*1] |
| psllw m3, 3 |
| psllw m1, 3 |
| .w16_wpad_end: |
| lea ypxq, [ypxq+strideq*2] |
| mova [acq+16*0], m2 |
| pmaddwd m2, m5 |
| mova [acq+16*1], m3 |
| pmaddwd m3, m5 |
| paddd m4, m2 |
| pmaddwd m2, m5, m0 |
| mova [acq+16*2], m0 |
| paddd m4, m3 |
| pmaddwd m3, m5, m1 |
| mova [acq+16*3], m1 |
| add acq, 16*4 |
| paddd m2, m3 |
| paddd m4, m2 |
| sub hd, 2 |
| jg .w16_loop |
| add hpadd, hpadd |
| jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| paddd m2, m2 |
| jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad |
| .w32_wpad6: |
| pshufhw m1, m0, q3333 |
| punpckhqdq m1, m1 |
| mova m2, m1 |
| mova m3, m1 |
| jmp .w32_wpad_end |
| .w32_wpad4: |
| pshufhw m2, m1, q3333 |
| punpckhqdq m2, m2 |
| mova m3, m2 |
| jmp .w32_wpad_end |
| .w32_wpad2: |
| pshufhw m3, m2, q3333 |
| punpckhqdq m3, m3 |
| jmp .w32_wpad_end |
| .w32: |
| movifnidn wpadd, wpadm |
| mov r5, acq |
| WIN64_SPILL_XMM 8 |
| .w32_loop: |
| mova m0, [ypxq+16*0] |
| psllw m0, 3 |
| cmp wpadd, 4 |
| jg .w32_wpad6 |
| mova m1, [ypxq+16*1] |
| psllw m1, 3 |
| je .w32_wpad4 |
| mova m2, [ypxq+16*2] |
| psllw m2, 3 |
jnp .w32_wpad2 ; wpad == 2 (parity flag from the cmp with 4 above)
| mova m3, [ypxq+16*3] |
| psllw m3, 3 |
| .w32_wpad_end: |
| add ypxq, strideq |
| pmaddwd m6, m5, m0 |
| mova [acq+16*0], m0 |
| pmaddwd m7, m5, m1 |
| mova [acq+16*1], m1 |
| paddd m6, m7 |
| pmaddwd m7, m5, m2 |
| mova [acq+16*2], m2 |
| paddd m6, m7 |
| pmaddwd m7, m5, m3 |
| mova [acq+16*3], m3 |
| add acq, 16*4 |
| paddd m6, m7 |
| paddd m4, m6 |
| dec hd |
| jg .w32_loop |
%if WIN64
mova m5, m6 ; WIN64_RESTORE_XMM reloads xmm6/xmm7, so stash the row sum
WIN64_RESTORE_XMM
SWAP 5, 6 ; and rename the stash back to m6 for the hpad loop below
%endif
| test hpadd, hpadd |
| jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| .w32_hpad_loop: |
| mova [acq+16*0], m0 |
| mova [acq+16*1], m1 |
| paddd m4, m6 |
| mova [acq+16*2], m2 |
| mova [acq+16*3], m3 |
| add acq, 16*4 |
| dec hpadd |
| jg .w32_hpad_loop |
| jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc |
| |
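; pal_pred: pal_pred_shuf splits the 8-entry 16-bit palette into a
; low-byte plane (m3) and a high-byte plane (m4, via the qword swap), so
; each index byte is resolved with two pshufbs and the results are
; re-interleaved into 16-bit pixels.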
| cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h |
| %define base r2-pal_pred_16bpc_ssse3_table |
| %if ARCH_X86_32 |
| %define hd r2d |
| %endif |
| mova m3, [palq] |
| LEA r2, pal_pred_16bpc_ssse3_table |
| tzcnt wd, wm |
| pshufb m3, [base+pal_pred_shuf] |
| movsxd wq, [r2+wq*4] |
| pshufd m4, m3, q1032 |
| add wq, r2 |
| movifnidn hd, hm |
| jmp wq |
| .w4: |
| mova m0, [idxq] |
| add idxq, 16 |
| pshufb m1, m3, m0 |
| pshufb m2, m4, m0 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| movq [dstq+strideq*0], m1 |
| movhps [dstq+strideq*1], m1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 4 |
| jg .w4 |
| RET |
| .w8: |
| mova m0, [idxq] |
| add idxq, 16 |
| pshufb m1, m3, m0 |
| pshufb m2, m4, m0 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8 |
| RET |
| .w16: |
| mova m0, [idxq] |
| add idxq, 16 |
| pshufb m1, m3, m0 |
| pshufb m2, m4, m0 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| add dstq, strideq |
| dec hd |
| jg .w16 |
| RET |
| .w32: |
| mova m0, [idxq+16*0] |
| pshufb m1, m3, m0 |
| pshufb m2, m4, m0 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| mova m2, [idxq+16*1] |
| add idxq, 16*2 |
| mova [dstq+16*0], m0 |
| pshufb m0, m3, m2 |
| mova [dstq+16*1], m1 |
| pshufb m1, m4, m2 |
| punpcklbw m2, m0, m1 |
| punpckhbw m0, m1 |
| mova [dstq+16*2], m2 |
| mova [dstq+16*3], m0 |
| add dstq, strideq |
| dec hd |
| jg .w32 |
| RET |
| .w64: |
| mova m0, [idxq+16*0] |
| pshufb m1, m3, m0 |
| pshufb m2, m4, m0 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| mova m2, [idxq+16*1] |
| mova [dstq+16*0], m0 |
| pshufb m0, m3, m2 |
| mova [dstq+16*1], m1 |
| pshufb m1, m4, m2 |
| punpcklbw m2, m0, m1 |
| punpckhbw m0, m1 |
| mova m1, [idxq+16*2] |
| mova [dstq+16*2], m2 |
| pshufb m2, m3, m1 |
| mova [dstq+16*3], m0 |
| pshufb m0, m4, m1 |
| punpcklbw m1, m2, m0 |
| punpckhbw m2, m0 |
| mova m0, [idxq+16*3] |
| add idxq, 16*4 |
| mova [dstq+16*4], m1 |
| pshufb m1, m3, m0 |
| mova [dstq+16*5], m2 |
| pshufb m2, m4, m0 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| mova [dstq+16*6], m0 |
| mova [dstq+16*7], m1 |
| add dstq, strideq |
| dec hd |
| jg .w64 |
| RET |