| ; Copyright © 2018-2021, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| SECTION_RODATA 16 |
| |
| %macro SMOOTH_WEIGHT_TABLE 1-* |
| %rep %0 |
| db %1-128, 127-%1 |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| ; sm_weights[], but modified to precalculate x and 256-x with offsets to |
| ; enable efficient use of pmaddubsw (which requires signed values) |
| smooth_weights: SMOOTH_WEIGHT_TABLE \ |
| 0, 0, 255, 128, 255, 149, 85, 64, \ |
| 255, 197, 146, 105, 73, 50, 37, 32, \ |
| 255, 225, 196, 170, 145, 123, 102, 84, \ |
| 68, 54, 43, 33, 26, 20, 17, 16, \ |
| 255, 240, 225, 210, 196, 182, 169, 157, \ |
| 145, 133, 122, 111, 101, 92, 83, 74, \ |
| 66, 59, 52, 45, 39, 34, 29, 25, \ |
| 21, 17, 14, 12, 10, 9, 8, 8, \ |
| 255, 248, 240, 233, 225, 218, 210, 203, \ |
| 196, 189, 182, 176, 169, 163, 156, 150, \ |
| 144, 138, 133, 127, 121, 116, 111, 106, \ |
| 101, 96, 91, 86, 82, 77, 73, 69, \ |
| 65, 61, 57, 54, 50, 47, 44, 41, \ |
| 38, 35, 32, 29, 27, 25, 22, 20, \ |
| 18, 16, 15, 13, 12, 10, 9, 8, \ |
| 7, 6, 6, 5, 5, 4, 4, 4 |
| |
| ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 |
| ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 |
| ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 |
| filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 |
| filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 |
| z_filter_wh4: db 7, 7, 19, 7, |
| z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 |
| pb_8: times 8 db 8 |
| pd_32768: dd 32768 |
| z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 |
| z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 |
| db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 |
| z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 |
| z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 |
| z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 |
| z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 |
| pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 |
| z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 |
| db 7, 8, 8, 9, 9, 10, 10, 11 |
| z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64 |
| z_filter_k: times 4 db 0, 16 |
| times 4 db 0, 20 |
| times 4 db 8, 16 |
| times 4 db 32, 16 |
| times 4 db 24, 20 |
| times 4 db 16, 16 |
| times 4 db 0, 0 |
| times 4 db 0, 0 |
| pw_8: times 8 db 8, 0 |
| pb_3: times 16 db 3 |
| pw_62: times 8 dw 62 |
| pw_64: times 8 dw 64 |
| pw_256: times 8 dw 256 |
| pw_512: times 8 dw 512 |
| pw_m256: times 8 dw -256 |
| pb_2: times 8 db 2 |
| pb_4: times 8 db 4 |
| pb_128: times 8 db 128 |
| pb_m16: times 8 db -16 |
| pw_128: times 4 dw 128 |
| pw_255: times 4 dw 255 |
| pb_36_m4: times 4 db 36, -4 |
| pb_127_m127: times 4 db 127, -127 |
| |
| %macro JMP_TABLE 3-* |
| %xdefine %1_%2_table (%%table - 2*4) |
| %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) |
| %%table: |
| %rep %0 - 2 |
| dd %%base %+ .%3 - (%%table - 2*4) |
| %rotate 1 |
| %endrep |
| %endmacro |
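| ; Each jump table stores 32-bit offsets of the size-specific entry points, |
| ; biased by 2*4 bytes so that the tables can be indexed directly with |
| ; tzcnt(size) (the smallest block size is 4, i.e. tzcnt >= 2): |
| ; |
| ;   movsxd wq, [r5+wq*4] ; r5 = table, wq = tzcnt(width) |
| ;   add    wq, r5 |
| ;   jmp    wq |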
| |
| %define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) |
| %define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4) |
| |
| JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ |
| s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 |
| JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 |
| JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64 |
| JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ |
| s4-8*4, s8-8*4, s16-8*4, s32-8*4 |
| JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 |
| JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 |
| |
| cextern dr_intra_derivative |
| cextern filter_intra_taps |
| |
| SECTION .text |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
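| ; Scalar equivalent (C-style pseudocode, a sketch rather than the dav1d C |
| ; reference verbatim): every row is filled with its left-edge neighbour. |
| ; |
| ;   for (int y = 0; y < height; y++) { |
| ;       memset(dst, topleft[-(1 + y)], width); |
| ;       dst += stride; |
| ;   } |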
| %macro IPRED_SET 3 ; width, stride offset, pshuflw imm8 |
| pshuflw m1, m0, %3 ; broadcast one pixel to 8 bytes |
| punpcklqdq m1, m1 |
| mova [dstq + %2], m1 |
| %if %1 > 16 |
| mova [dstq + 16 + %2], m1 |
| %endif |
| %if %1 > 32 |
| mova [dstq + 32 + %2], m1 |
| mova [dstq + 48 + %2], m1 |
| %endif |
| %endmacro |
| |
| %macro IPRED_H 1 ; width |
| sub tlq, 4 |
| movd m0, [tlq] ; get 4 bytes of topleft data |
| punpcklbw m0, m0 ; duplicate each byte |
| %if %1 == 4 |
| pshuflw m1, m0, q2233 |
| movd [dstq+strideq*0], m1 |
| psrlq m1, 32 |
| movd [dstq+strideq*1], m1 |
| pshuflw m0, m0, q0011 |
| movd [dstq+strideq*2], m0 |
| psrlq m0, 32 |
| movd [dstq+stride3q ], m0 |
| |
| %elif %1 == 8 |
| punpcklwd m0, m0 |
| punpckhdq m1, m0, m0 |
| punpckldq m0, m0 |
| movq [dstq+strideq*1], m1 |
| movhps [dstq+strideq*0], m1 |
| movq [dstq+stride3q ], m0 |
| movhps [dstq+strideq*2], m0 |
| %else |
| IPRED_SET %1, 0, q3333 |
| IPRED_SET %1, strideq, q2222 |
| IPRED_SET %1, strideq*2, q1111 |
| IPRED_SET %1, stride3q, q0000 |
| %endif |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w%1 |
| RET |
| %endmacro |
| |
| INIT_XMM ssse3 |
| cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 |
| LEA r5, ipred_h_ssse3_table |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4: |
| IPRED_H 4 |
| .w8: |
| IPRED_H 8 |
| .w16: |
| IPRED_H 16 |
| .w32: |
| IPRED_H 32 |
| .w64: |
| IPRED_H 64 |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
| cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| LEA r5, ipred_dc_splat_ssse3_table |
| tzcnt wd, wm |
| movu m0, [tlq+ 1] |
| movu m1, [tlq+17] |
| movu m2, [tlq+33] |
| movu m3, [tlq+49] |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
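| ; DC prediction averages the top and left neighbours. width and height are |
| ; powers of two, so width+height = 2^n * f with f in {1, 3, 5}: the 2^n part |
| ; is a shift, and /3 or /5 is a fixed-point reciprocal via pmulhuw, with |
| ; 0x5556 ~= (1 << 16) / 3 and 0x3334 ~= (1 << 16) / 5. Scalar sketch |
| ; (C-style pseudocode, assuming a ctz() helper): |
| ; |
| ;   int dc = (width + height) >> 1; // rounding bias |
| ;   for (int x = 0; x < width;  x++) dc += topleft[1 + x]; |
| ;   for (int y = 0; y < height; y++) dc += topleft[-(1 + y)]; |
| ;   dc >>= ctz(width + height);     // power-of-two factor |
| ;   if (width != height)            // remaining factor is 3 or 5 |
| ;       dc = dc * ((width + height) % 3 == 0 ? 0x5556 : 0x3334) >> 16; |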
| cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| movifnidn hd, hm |
| movifnidn wd, wm |
| tzcnt r6d, hd |
| lea r5d, [wq+hq] |
| movd m4, r5d |
| tzcnt r5d, r5d |
| movd m5, r5d |
| LEA r5, ipred_dc_ssse3_table |
| tzcnt wd, wd |
| movsxd r6, [r5+r6*4] |
| movsxd wq, [r5+wq*4+20] |
| pcmpeqd m3, m3 |
| psrlw m4, 1 ; dc = (width + height) >> 1; |
| add r6, r5 |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp r6 |
| .h4: |
| movd m0, [tlq-4] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w4: |
| movd m1, [tlq+1] |
| pmaddubsw m1, m3 |
| psubw m0, m4 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| cmp hd, 4 |
| jg .w4_mul |
| psrlw m0, 3 ; dc >>= ctz(width + height); |
| jmp .w4_end |
| .w4_mul: |
| punpckhqdq m1, m0, m0 |
| paddw m0, m1 |
| psrlq m1, m0, 32 |
| paddw m0, m1 |
| psrlw m0, 2 |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hd, 8 |
| cmovz r6d, r2d |
| movd m5, r6d |
| pmulhuw m0, m5 |
| .w4_end: |
| pxor m1, m1 |
| pshufb m0, m1 |
| .s4: |
| movd [dstq+strideq*0], m0 |
| movd [dstq+strideq*1], m0 |
| movd [dstq+strideq*2], m0 |
| movd [dstq+stride3q ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s4 |
| RET |
| ALIGN function_align |
| .h8: |
| movq m0, [tlq-8] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w8: |
| movq m1, [tlq+1] |
| pmaddubsw m1, m3 |
| psubw m4, m0 |
| punpckhqdq m0, m0 |
| psubw m0, m4 |
| paddw m0, m1 |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psrlw m0, m5 |
| cmp hd, 8 |
| je .w8_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| cmp hd, 32 |
| cmovz r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| .w8_end: |
| pxor m1, m1 |
| pshufb m0, m1 |
| .s8: |
| movq [dstq+strideq*0], m0 |
| movq [dstq+strideq*1], m0 |
| movq [dstq+strideq*2], m0 |
| movq [dstq+stride3q ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s8 |
| RET |
| ALIGN function_align |
| .h16: |
| mova m0, [tlq-16] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w16: |
| movu m1, [tlq+1] |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| psubw m4, m0 |
| punpckhqdq m0, m0 |
| psubw m0, m4 |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psrlw m0, m5 |
| cmp hd, 16 |
| je .w16_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hd, 8|32 |
| cmovz r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| .w16_end: |
| pxor m1, m1 |
| pshufb m0, m1 |
| .s16: |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m0 |
| mova [dstq+strideq*2], m0 |
| mova [dstq+stride3q ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s16 |
| RET |
| ALIGN function_align |
| .h32: |
| mova m0, [tlq-32] |
| pmaddubsw m0, m3 |
| mova m2, [tlq-16] |
| pmaddubsw m2, m3 |
| paddw m0, m2 |
| jmp wq |
| .w32: |
| movu m1, [tlq+1] |
| pmaddubsw m1, m3 |
| movu m2, [tlq+17] |
| pmaddubsw m2, m3 |
| paddw m1, m2 |
| paddw m0, m1 |
| psubw m4, m0 |
| punpckhqdq m0, m0 |
| psubw m0, m4 |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psrlw m0, m5 |
| cmp hd, 32 |
| je .w32_end |
| lea r2d, [hq*2] |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hd, 64|16 |
| cmovz r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| .w32_end: |
| pxor m1, m1 |
| pshufb m0, m1 |
| mova m1, m0 |
| .s32: |
| mova [dstq], m0 |
| mova [dstq+16], m1 |
| mova [dstq+strideq], m0 |
| mova [dstq+strideq+16], m1 |
| mova [dstq+strideq*2], m0 |
| mova [dstq+strideq*2+16], m1 |
| mova [dstq+stride3q], m0 |
| mova [dstq+stride3q+16], m1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s32 |
| RET |
| ALIGN function_align |
| .h64: |
| mova m0, [tlq-64] |
| mova m1, [tlq-48] |
| pmaddubsw m0, m3 |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| mova m1, [tlq-32] |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| mova m1, [tlq-16] |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| jmp wq |
| .w64: |
| movu m1, [tlq+ 1] |
| movu m2, [tlq+17] |
| pmaddubsw m1, m3 |
| pmaddubsw m2, m3 |
| paddw m1, m2 |
| movu m2, [tlq+33] |
| pmaddubsw m2, m3 |
| paddw m1, m2 |
| movu m2, [tlq+49] |
| pmaddubsw m2, m3 |
| paddw m1, m2 |
| paddw m0, m1 |
| psubw m4, m0 |
| punpckhqdq m0, m0 |
| psubw m0, m4 |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psrlw m0, m5 |
| cmp hd, 64 |
| je .w64_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hd, 32 |
| cmovz r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| .w64_end: |
| pxor m1, m1 |
| pshufb m0, m1 |
| mova m1, m0 |
| mova m2, m0 |
| mova m3, m0 |
| .s64: |
| mova [dstq], m0 |
| mova [dstq+16], m1 |
| mova [dstq+32], m2 |
| mova [dstq+48], m3 |
| mova [dstq+strideq], m0 |
| mova [dstq+strideq+16], m1 |
| mova [dstq+strideq+32], m2 |
| mova [dstq+strideq+48], m3 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .s64 |
| RET |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
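| ; The division by height is a rounded right-shift implemented with pmulhrsw: |
| ;   pmulhrsw(x, 32768 >> k) == (x + (1 << (k - 1))) >> k |
| ; where k = ctz(height) and the shifted constant is derived from pd_32768. |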
| cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| LEA r5, ipred_dc_left_ssse3_table |
| mov hd, hm ; zero upper half |
| tzcnt r6d, hd |
| sub tlq, hq |
| tzcnt wd, wm |
| movu m0, [tlq] |
| movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] |
| movd m2, r6d |
| psrld m3, m2 |
| movsxd r6, [r5+r6*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, r5 |
| add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| .h64: |
| movu m1, [tlq+48] ; unaligned when jumping here from dc_top |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| movu m1, [tlq+32] ; unaligned when jumping here from dc_top |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| .h32: |
| movu m1, [tlq+16] ; unaligned when jumping here from dc_top |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| .h16: |
| pshufd m1, m0, q3232 ; psrldq m1, m0, 8 |
| paddw m0, m1 |
| .h8: |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| .h4: |
| pmaddwd m0, m2 |
| pmulhrsw m0, m3 |
| lea stride3q, [strideq*3] |
| pxor m1, m1 |
| pshufb m0, m1 |
| mova m1, m0 |
| mova m2, m0 |
| mova m3, m0 |
| jmp wq |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
| cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 |
| LEA r5, ipred_dc_splat_ssse3_table |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] |
| mova m1, m0 |
| mova m2, m0 |
| mova m3, m0 |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
| cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h |
| LEA r5, ipred_dc_left_ssse3_table |
| tzcnt wd, wm |
| inc tlq |
| movu m0, [tlq] |
| movifnidn hd, hm |
| movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] |
| movd m2, wd |
| psrld m3, m2 |
| movsxd r6, [r5+wq*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, r5 |
| add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
| %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] |
| ; w * a = (w - 128) * a + 128 * a |
| ; (256 - w) * b = (127 - w) * b + 129 * b |
| ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] |
| pmaddubsw m6, m%3, m%1 |
| pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b |
| paddw m6, m%5 |
| paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] |
| psrlw m6, 8 |
| psrlw m0, 8 |
| packuswb m6, m0 |
| %endmacro |
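| ; Per pixel the macro evaluates (scalar sketch): |
| ; |
| ;   pred = (w * a + (256 - w) * b + 128) >> 8 |
| ; |
| ; where the (w - 128, 127 - w) byte pair comes from smooth_weights and the |
| ; constant 128 * a + 129 * b + 128 term is precalculated into the add |
| ; operands outside the loop. |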
| |
| cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights |
| %define base r6-ipred_smooth_v_ssse3_table |
| LEA r6, ipred_smooth_v_ssse3_table |
| tzcnt wd, wm |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| movddup m0, [base+pb_127_m127] |
| movddup m1, [base+pw_128] |
| lea weightsq, [base+smooth_weights+hq*4] |
| neg hq |
| movd m5, [tlq+hq] |
| pxor m2, m2 |
| pshufb m5, m2 |
| add wq, r6 |
| jmp wq |
| .w4: |
| movd m2, [tlq+1] |
| punpckldq m2, m2 |
| punpcklbw m2, m5 ; top, bottom |
| lea r3, [strideq*3] |
| mova m4, [base+ipred_v_shuf] |
| mova m5, m4 |
| punpckldq m4, m4 |
| punpckhdq m5, m5 |
| pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom |
| paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok |
| paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 |
| .w4_loop: |
| movu m1, [weightsq+hq*2] |
| pshufb m0, m1, m4 ; m2-m5 must stay unchanged across the loop |
| pshufb m1, m5 |
| SMOOTH 0, 1, 2, 2, 3, 3 |
| movd [dstq+strideq*0], m6 |
| pshuflw m1, m6, q1032 |
| movd [dstq+strideq*1], m1 |
| punpckhqdq m6, m6 |
| movd [dstq+strideq*2], m6 |
| psrlq m6, 32 |
| movd [dstq+r3 ], m6 |
| lea dstq, [dstq+strideq*4] |
| add hq, 4 |
| jl .w4_loop |
| RET |
| ALIGN function_align |
| .w8: |
| movq m2, [tlq+1] |
| punpcklbw m2, m5 |
| mova m5, [base+ipred_v_shuf] |
| lea r3, [strideq*3] |
| pshufd m4, m5, q0000 |
| pshufd m5, m5, q1111 |
| pmaddubsw m3, m2, m0 |
| paddw m1, m2 |
| paddw m3, m1 ; m3 is reused across the loop |
| .w8_loop: |
| movq m1, [weightsq+hq*2] |
| pshufb m0, m1, m4 |
| pshufb m1, m5 |
| SMOOTH 0, 1, 2, 2, 3, 3 |
| movq [dstq+strideq*0], m6 |
| movhps [dstq+strideq*1], m6 |
| lea dstq, [dstq+strideq*2] |
| add hq, 2 |
| jl .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| movu m3, [tlq+1] |
| punpcklbw m2, m3, m5 |
| punpckhbw m3, m5 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 ; m4 and m5 are reused across the loop |
| .w16_loop: |
| movd m1, [weightsq+hq*2] |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| SMOOTH 1, 1, 2, 3, 4, 5 |
| mova [dstq], m6 |
| add dstq, strideq |
| add hq, 1 |
| jl .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| %if WIN64 |
| movaps [rsp+24], xmm7 |
| %define xmm_regs_used 8 |
| %endif |
| mova m7, m5 |
| .w32_loop_init: |
| mov r3d, 2 |
| .w32_loop: |
| movddup m0, [base+pb_127_m127] |
| movddup m1, [base+pw_128] |
| movu m3, [tlq+1] |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| movd m1, [weightsq+hq*2] |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| SMOOTH 1, 1, 2, 3, 4, 5 |
| mova [dstq], m6 |
| add tlq, 16 |
| add dstq, 16 |
| dec r3d |
| jg .w32_loop |
| lea dstq, [dstq-32+strideq] |
| sub tlq, 32 |
| add hq, 1 |
| jl .w32_loop_init |
| RET |
| ALIGN function_align |
| .w64: |
| %if WIN64 |
| movaps [rsp+24], xmm7 |
| %define xmm_regs_used 8 |
| %endif |
| mova m7, m5 |
| .w64_loop_init: |
| mov r3d, 4 |
| .w64_loop: |
| movddup m0, [base+pb_127_m127] |
| movddup m1, [base+pw_128] |
| movu m3, [tlq+1] |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| movd m1, [weightsq+hq*2] |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| SMOOTH 1, 1, 2, 3, 4, 5 |
| mova [dstq], m6 |
| add tlq, 16 |
| add dstq, 16 |
| dec r3d |
| jg .w64_loop |
| lea dstq, [dstq-64+strideq] |
| sub tlq, 64 |
| add hq, 1 |
| jl .w64_loop_init |
| RET |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
| cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h |
| %define base r6-ipred_smooth_h_ssse3_table |
| LEA r6, ipred_smooth_h_ssse3_table |
| mov wd, wm |
| movd m3, [tlq+wq] |
| pxor m1, m1 |
| pshufb m3, m1 ; right |
| tzcnt wd, wd |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| movddup m4, [base+pb_127_m127] |
| movddup m5, [base+pw_128] |
| add wq, r6 |
| jmp wq |
| .w4: |
| movddup m6, [base+smooth_weights+4*2] |
| mova m7, [base+ipred_h_shuf] |
| sub tlq, 4 |
| sub tlq, hq |
| lea r3, [strideq*3] |
| .w4_loop: |
| movd m2, [tlq+hq] ; left |
| pshufb m2, m7 |
| punpcklbw m1, m2, m3 ; left, right |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m6 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| pmaddubsw m2, m6 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| movd [dstq+strideq*0], m0 |
| pshuflw m1, m0, q1032 |
| movd [dstq+strideq*1], m1 |
| punpckhqdq m0, m0 |
| movd [dstq+strideq*2], m0 |
| psrlq m0, 32 |
| movd [dstq+r3 ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_loop |
| RET |
| ALIGN function_align |
| .w8: |
| mova m6, [base+smooth_weights+8*2] |
| mova m7, [base+ipred_h_shuf] |
| sub tlq, 4 |
| sub tlq, hq |
| punpckldq m7, m7 |
| .w8_loop: |
| movd m2, [tlq+hq] ; left |
| pshufb m2, m7 |
| punpcklbw m1, m2, m3 ; left, right |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m6 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| pmaddubsw m2, m6 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| mova m6, [base+smooth_weights+16*2] |
| mova m7, [base+smooth_weights+16*3] |
| sub tlq, 1 |
| sub tlq, hq |
| .w16_loop: |
| pxor m1, m1 |
| movd m2, [tlq+hq] ; left |
| pshufb m2, m1 |
| punpcklbw m1, m2, m3 ; left, right |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m6 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| pmaddubsw m2, m7 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| mova [dstq], m0 |
| lea dstq, [dstq+strideq] |
| sub hd, 1 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| sub tlq, 1 |
| sub tlq, hq |
| pxor m6, m6 |
| .w32_loop_init: |
| mov r5, 2 |
| lea r3, [base+smooth_weights+16*4] |
| .w32_loop: |
| mova m7, [r3] |
| add r3, 16 |
| movd m2, [tlq+hq] ; left |
| pshufb m2, m6 |
| punpcklbw m1, m2, m3 ; left, right |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m7 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| mova m7, [r3] |
| add r3, 16 |
| pmaddubsw m2, m7 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| mova [dstq], m0 |
| add dstq, 16 |
| dec r5 |
| jg .w32_loop |
| lea dstq, [dstq-32+strideq] |
| sub hd, 1 |
| jg .w32_loop_init |
| RET |
| ALIGN function_align |
| .w64: |
| sub tlq, 1 |
| sub tlq, hq |
| pxor m6, m6 |
| .w64_loop_init: |
| mov r5, 4 |
| lea r3, [base+smooth_weights+16*8] |
| .w64_loop: |
| mova m7, [r3] |
| add r3, 16 |
| movd m2, [tlq+hq] ; left |
| pshufb m2, m6 |
| punpcklbw m1, m2, m3 ; left, right |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m7 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| mova m7, [r3] |
| add r3, 16 |
| pmaddubsw m2, m7 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| mova [dstq], m0 |
| add dstq, 16 |
| dec r5 |
| jg .w64_loop |
| lea dstq, [dstq-64+strideq] |
| sub hd, 1 |
| jg .w64_loop_init |
| RET |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int a); |
| ;--------------------------------------------------------------------------------------- |
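| ; Full 2D smooth prediction; per pixel (a sketch of the C reference): |
| ; |
| ;   pred[x] = (weights_ver[y] * top[x]  + (256 - weights_ver[y]) * bottom + |
| ;              weights_hor[x] * left[y] + (256 - weights_hor[x]) * right + |
| ;              256) >> 9 |
| ; |
| ; The vertical and horizontal halves are blended separately and combined |
| ; with pavgw, which supplies the final +1 >> 1 of the rounding. |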
| %macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 |
| pmaddubsw m6, m%3, m%1 |
| mova m0, m6 |
| pmaddubsw m6, m%4, m%2 |
| mova m1, m6 |
| %ifnum %5 |
| paddw m0, m%5 |
| %else |
| paddw m0, %5 |
| %endif |
| %ifnum %6 |
| paddw m1, m%6 |
| %else |
| paddw m1, %6 |
| %endif |
| %ifnum %7 |
| %else |
| mova m3, %7 |
| %endif |
| pavgw m0, m2 |
| pavgw m1, m3 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| %endmacro |
| |
| %macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] |
| mova m1, [rsp+16*%1] ; top |
| punpckhbw m6, m1, m0 ; top, bottom |
| punpcklbw m1, m0 ; top, bottom |
| pmaddubsw m2, m1, m5 |
| mova [rsp+16*%2], m1 |
| paddw m1, m3 ; 1 * top + 256 * bottom + 255 |
| paddw m2, m1 ; 128 * top + 129 * bottom + 255 |
| mova [rsp+16*%3], m2 |
| pmaddubsw m2, m6, m5 |
| mova [rsp+16*%4], m6 |
| paddw m6, m3 ; 1 * top + 256 * bottom + 255 |
| paddw m2, m6 ; 128 * top + 129 * bottom + 255 |
| mova [rsp+16*%5], m2 |
| movd m1, [tlq+hq] ; left |
| pshufb m1, [base+pb_3] ; topleft[-(1 + y)] |
| punpcklbw m1, m4 ; left, right |
| pmaddubsw m2, m1, m5 ; 127 * left - 127 * right |
| paddw m2, m1 ; 128 * left + 129 * right |
| mova m3, m2 |
| pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; |
| pmaddubsw m1, %7 |
| paddw m2, m3, m0 |
| paddw m3, m1 |
| movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; |
| mova m7, [rsp+16*%9] |
| pshufb m1, m7 |
| mova [rsp+16*%8], m3 |
| mova m4, [rsp+16*%2] |
| mova m5, [rsp+16*%3] |
| mova m3, [rsp+16*%4] |
| mova m7, [rsp+16*%5] |
| SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] |
| mova [dstq], m0 |
| movddup m3, [base+pw_255] ; restore pw_255 |
| mova m0, [rsp+16*%10] ; restore bottom |
| mova m4, [rsp+16*%11] ; restore right |
| mova m5, [rsp+16*%12] ; restore pb_127_m127 |
| %endmacro |
| |
| cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights |
| %define base r6-ipred_smooth_ssse3_table |
| mov wd, wm |
| mov hd, hm |
| LEA r6, ipred_smooth_ssse3_table |
| movd m4, [tlq+wq] ; right |
| pxor m2, m2 |
| pshufb m4, m2 |
| tzcnt wd, wd |
| mov r5, tlq |
| sub r5, hq |
| movsxd wq, [r6+wq*4] |
| movddup m5, [base+pb_127_m127] |
| movd m0, [r5] |
| pshufb m0, m2 ; bottom |
| movddup m3, [base+pw_255] |
| add wq, r6 |
| lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] |
| jmp wq |
| .w4: |
| mova m7, [base+ipred_v_shuf] |
| movd m1, [tlq+1] ; top |
| pshufd m1, m1, q0000 |
| sub tlq, 4 |
| lea r3, [strideq*3] |
| sub tlq, hq |
| punpcklbw m1, m0 ; top, bottom |
| pshufd m6, m7, q1100 |
| pshufd m7, m7, q3322 |
| pmaddubsw m2, m1, m5 |
| paddw m3, m1 ; 1 * top + 256 * bottom + 255 |
| paddw m2, m3 ; 128 * top + 129 * bottom + 255 |
| mova [rsp+16*0], m1 |
| mova [rsp+16*1], m2 |
| movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; |
| punpcklqdq m1, m1 |
| mova [rsp+16*2], m1 |
| mova [rsp+16*3], m4 |
| mova [rsp+16*4], m6 |
| mova [rsp+16*5], m5 |
| .w4_loop: |
| movd m1, [tlq+hq] ; left |
| pshufb m1, [base+ipred_h_shuf] |
| punpcklbw m0, m1, m4 ; left, right |
| punpckhbw m1, m4 |
| pmaddubsw m2, m0, m5 ; 127 * left - 127 * right |
| pmaddubsw m3, m1, m5 |
| paddw m2, m0 ; 128 * left + 129 * right |
| paddw m3, m1 |
| mova m4, [rsp+16*2] |
| pmaddubsw m0, m4 |
| pmaddubsw m1, m4 |
| paddw m2, m0 |
| paddw m3, m1 |
| movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; |
| add v_weightsq, 8 |
| pshufb m0, m1, m6 |
| pshufb m1, m7 |
| mova m4, [rsp+16*0] |
| mova m5, [rsp+16*1] |
| SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 |
| mova m4, [rsp+16*3] |
| mova m6, [rsp+16*4] |
| mova m5, [rsp+16*5] |
| movd [dstq+strideq*0], m0 |
| pshuflw m1, m0, q1032 |
| movd [dstq+strideq*1], m1 |
| punpckhqdq m0, m0 |
| movd [dstq+strideq*2], m0 |
| psrlq m0, 32 |
| movd [dstq+r3 ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_loop |
| RET |
| ALIGN function_align |
| .w8: |
| mova m7, [base+ipred_v_shuf] |
| movq m1, [tlq+1] ; top |
| punpcklqdq m1, m1 |
| sub tlq, 4 |
| sub tlq, hq |
| punpcklbw m1, m0 |
| pshufd m6, m7, q0000 |
| pshufd m7, m7, q1111 |
| pmaddubsw m2, m1, m5 |
| paddw m3, m1 |
| paddw m2, m3 |
| mova [rsp+16*0], m1 |
| mova [rsp+16*1], m2 |
| mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; |
| mova [rsp+16*2], m1 |
| mova [rsp+16*3], m4 |
| mova [rsp+16*4], m6 |
| mova [rsp+16*5], m5 |
| .w8_loop: |
| movd m1, [tlq+hq] ; left |
| pshufb m1, [base+ipred_h_shuf] |
| pshufd m1, m1, q1100 |
| punpcklbw m0, m1, m4 |
| punpckhbw m1, m4 |
| pmaddubsw m2, m0, m5 |
| pmaddubsw m3, m1, m5 |
| paddw m2, m0 |
| paddw m3, m1 |
| mova m4, [rsp+16*2] |
| pmaddubsw m0, m4 |
| pmaddubsw m1, m4 |
| paddw m2, m0 |
| paddw m3, m1 |
| movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; |
| add v_weightsq, 4 |
| pshufb m0, m1, m6 |
| pshufb m1, m7 |
| mova m4, [rsp+16*0] |
| mova m5, [rsp+16*1] |
| SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 |
| mova m4, [rsp+16*3] |
| mova m6, [rsp+16*4] |
| mova m5, [rsp+16*5] |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| mova m7, [base+ipred_v_shuf] |
| movu m1, [tlq+1] ; top |
| sub tlq, 4 |
| sub tlq, hq |
| punpckhbw m6, m1, m0 ; top, bottom |
| punpcklbw m1, m0 ; top, bottom |
| pshufd m7, m7, q0000 |
| mova [rsp+16*2], m7 |
| pmaddubsw m2, m6, m5 |
| mova [rsp+16*5], m6 |
| paddw m6, m3 ; 1 * top + 256 * bottom + 255 |
| paddw m2, m6 ; 128 * top + 129 * bottom + 255 |
| mova [rsp+16*6], m2 |
| pmaddubsw m2, m1, m5 |
| paddw m3, m1 ; 1 * top + 256 * bottom + 255 |
| mova [rsp+16*0], m1 |
| paddw m2, m3 ; 128 * top + 129 * bottom + 255 |
| mova [rsp+16*1], m2 |
| mova [rsp+16*3], m4 |
| mova [rsp+16*4], m5 |
| .w16_loop: |
| movd m1, [tlq+hq] ; left |
| pshufb m1, [base+pb_3] ; topleft[-(1 + y)] |
| punpcklbw m1, m4 ; left, right |
| pmaddubsw m2, m1, m5 ; 127 * left - 127 * right |
| paddw m2, m1 ; 128 * left + 129 * right |
| mova m0, m1 |
| mova m3, m2 |
| pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; |
| pmaddubsw m1, [base+smooth_weights+16*3] |
| paddw m2, m0 |
| paddw m3, m1 |
| movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; |
| add v_weightsq, 2 |
| mova m7, [rsp+16*2] |
| pshufb m1, m7 |
| mova [rsp+16*7], m3 |
| mova m4, [rsp+16*0] |
| mova m5, [rsp+16*1] |
| mova m3, [rsp+16*5] |
| mova m7, [rsp+16*6] |
| SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] |
| mova m4, [rsp+16*3] |
| mova m5, [rsp+16*4] |
| mova [dstq], m0 |
| lea dstq, [dstq+strideq] |
| sub hd, 1 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| movu m1, [tlq+1] ; top: topleft[1 + x] |
| movu m2, [tlq+17] ; top |
| mova [rsp+16*0], m1 |
| mova [rsp+16*1], m2 |
| sub tlq, 4 |
| sub tlq, hq |
| mova m7, [base+ipred_v_shuf] |
| pshufd m7, m7, q0000 |
| mova [rsp+16*2], m7 |
| mova [rsp+16*3], m0 |
| mova [rsp+16*4], m4 |
| mova [rsp+16*5], m5 |
| .w32_loop: |
| SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 |
| add dstq, 16 |
| SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5 |
| lea dstq, [dstq-16+strideq] |
| add v_weightsq, 2 |
| sub hd, 1 |
| jg .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| movu m1, [tlq+1] ; top: topleft[1 + x] |
| movu m2, [tlq+17] ; top |
| mova [rsp+16*0], m1 |
| mova [rsp+16*1], m2 |
| movu m1, [tlq+33] ; top |
| movu m2, [tlq+49] ; top |
| mova [rsp+16*11], m1 |
| mova [rsp+16*12], m2 |
| sub tlq, 4 |
| sub tlq, hq |
| mova m7, [base+ipred_v_shuf] |
| pshufd m7, m7, q0000 |
| mova [rsp+16*2], m7 |
| mova [rsp+16*3], m0 |
| mova [rsp+16*4], m4 |
| mova [rsp+16*5], m5 |
| .w64_loop: |
| SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5 |
| add dstq, 16 |
| SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5 |
| add dstq, 16 |
| SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5 |
| add dstq, 16 |
| SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5 |
| lea dstq, [dstq-48+strideq] |
| add v_weightsq, 2 |
| sub hd, 1 |
| jg .w64_loop |
| RET |
| |
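| ;--------------------------------------------------------------------------------------- |
| ; ipred_z1: directional intra prediction from the top edge. Arguments follow |
| ; the other predictors (dst, stride, topleft, width, height, angle); dx is |
| ; looked up from the angle via dr_intra_derivative[]. |
| ;--------------------------------------------------------------------------------------- |
| ; Scalar sketch of the non-upsampled case (C-style pseudocode): |
| ; |
| ;   for (int y = 0, xpos = dx; y < height; y++, xpos += dx, dst += stride) { |
| ;       const int frac = xpos & 0x3e; // 6.6 fixed point |
| ;       for (int x = 0, base = xpos >> 6; x < width; x++, base++) |
| ;           dst[x] = base < max_base_x |
| ;               ? (top[base] * (64 - frac) + top[base + 1] * frac + 32) >> 6 |
| ;               : top[max_base_x]; |
| ;   } |
| ; |
| ; pw_62/pw_64 implement the frac arithmetic and the pmulhrsw by pw_512 |
| ; performs the (+32) >> 6. |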
| %if ARCH_X86_64 |
| cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx |
| %define base r7-$$ |
| lea r7, [$$] |
| movifnidn hd, hm |
| mova m8, [base+pw_62] |
| mova m9, [base+pw_64] |
| mova m10, [base+pw_512] |
| %else |
| cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, stride, tl, w, h, angle, dx |
| %define base r4-$$ |
| %define m8 [base+pw_62] |
| %define m9 [base+pw_64] |
| %define m10 [base+pw_512] |
| %define hd dword [rsp+16*12] |
| %define hb byte [rsp+16*12] |
| mov r3, hm |
| LEA r4, $$ |
| mov hd, r3 |
| %endif |
| tzcnt wd, wm |
| movifnidn angled, anglem |
| inc tlq |
| movsxd wq, [base+ipred_z1_ssse3_table+wq*4] |
| lea wq, [base+wq+ipred_z1_ssse3_table] |
| mov dxd, angled |
| and dxd, 0x7e |
| add angled, 165 ; ~90 |
| movzx dxd, word [base+dr_intra_derivative+dxq] |
| xor angled, 0x4ff ; d = 90 - angle |
| jmp wq |
| .w4: |
| %if ARCH_X86_64 |
| cmp angleb, 40 |
| %else |
| mov r3d, angled ; rNb only valid for r0-r3 on x86-32 |
| cmp r3b, 40 |
| %endif |
| jae .w4_no_upsample |
| lea r3d, [angleq-1024] |
| sar r3d, 7 |
| add r3d, hd |
| jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) |
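| ; Edge upsampling doubles the top edge before prediction (scalar sketch): |
| ; |
| ;   out[2*x]     = in[x]; |
| ;   out[2*x + 1] = clip_pixel((-in[x-1] + 9*in[x] + 9*in[x+1] - in[x+2] + 8) >> 4); |
| ; |
| ; pb_36_m4 holds the inner/outer taps scaled by 4, (36, -4) = 4 * (9, -1), |
| ; and the pmulhrsw by pw_512 performs the matching (+32) >> 6. |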
| mova m1, [tlq-1] |
| pshufb m0, m1, [base+z_upsample1] |
| pshufb m1, [base+z_upsample2] |
| movddup m2, [base+pb_36_m4] |
| add dxd, dxd |
| pmaddubsw m0, m2 |
| pshufd m7, m1, q3333 |
| movd [rsp+16], m7 ; top[max_base_x] |
| pmaddubsw m1, m2 |
| movd m6, dxd |
| mov r3d, dxd ; xpos |
| pshufb m6, [base+pw_256] |
| paddw m1, m0 |
| movq m0, [tlq] |
| pmulhrsw m1, m10 |
| paddw m7, m6, m6 |
| punpcklqdq m6, m7 ; xpos0 xpos1 |
| packuswb m1, m1 |
| punpcklbw m0, m1 |
| mova [rsp], m0 |
| .w4_upsample_loop: |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| movq m0, [rsp+r3] |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base1 |
| movhps m0, [rsp+r2] |
| pand m2, m8, m6 ; frac |
| psubw m1, m9, m2 ; 64-frac |
| psllw m2, 8 |
| por m1, m2 ; 64-frac, frac |
| pmaddubsw m0, m1 |
| paddw m6, m7 ; xpos += dx |
| pmulhrsw m0, m10 |
| packuswb m0, m0 |
| movd [dstq+strideq*0], m0 |
| pshuflw m0, m0, q1032 |
| movd [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w4_upsample_loop |
| RET |
| .w4_no_upsample: |
| mov r3d, 7 ; max_base |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w4_main |
| %if ARCH_X86_64 |
| lea r3d, [hq+3] |
| %else |
| mov r3d, hd |
| add r3d, 3 |
| %endif |
| movd m0, r3d |
| movd m2, angled |
| shr angled, 8 ; is_sm << 1 |
| pxor m1, m1 |
| pshufb m0, m1 |
| pshufb m2, m1 |
| pcmpeqb m1, m0, [base+z_filter_wh4] |
| pand m1, m2 |
| pcmpgtb m1, [base+z_filter_t_w48+angleq*8] |
| pmovmskb r5d, m1 |
| mov r3d, 7 |
| test r5d, r5d |
| jz .w4_main ; filter_strength == 0 |
| mova m3, [tlq-1] |
| imul r5d, 0x55555555 |
| movu m7, [base+z_filter_s+8] |
| shr r5d, 30 ; filter_strength |
| movddup m0, [base+pb_8] |
| pminub m7, m0 |
| pshufb m0, m3, [base+z_filter_s] |
| movddup m4, [base+z_filter_k-8+r5*8+24*0] |
| pshufb m3, m7 |
| movddup m5, [base+z_filter_k-8+r5*8+24*1] |
| shufps m2, m0, m3, q2121 |
| movddup m6, [base+z_filter_k-8+r5*8+24*2] |
| pmaddubsw m0, m4 |
| pmaddubsw m1, m2, m4 |
| pmaddubsw m2, m5 |
| paddd m5, m6 |
| pmaddubsw m4, m3, m5 |
| pmaddubsw m3, m6 |
| paddw m0, m2 |
| paddw m1, m4 |
| paddw m0, m3 |
| pshufd m1, m1, q3333 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| mov r5d, 9 |
| mov tlq, rsp |
| cmp hd, 4 |
| cmovne r3d, r5d |
| packuswb m0, m1 |
| mova [tlq], m0 |
| .w4_main: |
| add tlq, r3 |
| movd m5, dxd |
| movddup m0, [base+z_base_inc] ; base_inc << 6 |
| movd m7, [tlq] ; top[max_base_x] |
| shl r3d, 6 |
| movd m4, r3d |
| pshufb m5, [base+pw_256] |
| mov r5d, dxd ; xpos |
| pshufb m7, [base+pw_m256] |
| sub r5, r3 |
| pshufb m4, [base+pw_256] |
| mova m3, [base+z1_shuf_w4] |
| paddw m6, m5, m5 |
| psubw m4, m0 ; max_base_x |
| punpcklqdq m5, m6 ; xpos0 xpos1 |
| .w4_loop: |
| lea r3, [r5+dxq] |
| sar r5, 6 ; base0 |
| movq m0, [tlq+r5] |
| lea r5, [r3+dxq] |
| sar r3, 6 ; base1 |
| movhps m0, [tlq+r3] |
| pand m2, m8, m5 ; frac |
| psubw m1, m9, m2 ; 64-frac |
| psllw m2, 8 |
| pshufb m0, m3 |
| por m1, m2 ; 64-frac, frac |
| pmaddubsw m0, m1 |
| pcmpgtw m1, m4, m5 ; base < max_base_x |
| pmulhrsw m0, m10 |
| paddw m5, m6 ; xpos += dx |
| pand m0, m1 |
| pandn m1, m7 |
| por m0, m1 |
| packuswb m0, m0 |
| movd [dstq+strideq*0], m0 |
| pshuflw m0, m0, q1032 |
| movd [dstq+strideq*1], m0 |
| sub hd, 2 |
| jz .w4_end |
| lea dstq, [dstq+strideq*2] |
| test r5d, r5d |
| jl .w4_loop |
| packuswb m7, m7 |
| .w4_end_loop: |
| movd [dstq+strideq*0], m7 |
| movd [dstq+strideq*1], m7 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w4_end_loop |
| .w4_end: |
| RET |
| .w8: |
| lea r3d, [angleq+216] |
| mov r3b, hb |
| cmp r3d, 8 |
| ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 |
| mova m5, [base+z_upsample1] |
| movu m3, [base+z_filter_s+6] |
| movd m4, hd |
| mova m0, [tlq-1] |
| movu m1, [tlq+7] |
| pxor m7, m7 |
| pshufb m4, m7 |
| movddup m7, [base+pb_36_m4] |
| pminub m4, m3 |
| add dxd, dxd |
| pshufb m2, m0, m5 |
| pmaddubsw m2, m7 |
| pshufb m0, m3 |
| pmaddubsw m0, m7 |
| movd m6, dxd |
| pshufb m3, m1, m5 |
| pmaddubsw m3, m7 |
| pshufb m1, m4 |
| pmaddubsw m1, m7 |
| pshufb m6, [base+pw_256] |
| mov r3d, dxd |
| paddw m2, m0 |
| paddw m7, m6, m6 |
| paddw m3, m1 |
| punpcklqdq m6, m7 ; xpos0 xpos1 |
| movu m1, [tlq] |
| pmulhrsw m2, m10 |
| pmulhrsw m3, m10 |
| packuswb m2, m3 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| mova [rsp+16*0], m0 |
| mova [rsp+16*1], m1 |
| .w8_upsample_loop: |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| movu m0, [rsp+r3] |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base1 |
| movu m1, [rsp+r2] |
| pand m2, m8, m6 |
| psubw m3, m9, m2 |
| psllw m2, 8 |
| por m3, m2 |
| punpcklqdq m2, m3, m3 ; frac0 |
| pmaddubsw m0, m2 |
| punpckhqdq m3, m3 ; frac1 |
| pmaddubsw m1, m3 |
| paddw m6, m7 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| packuswb m0, m1 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_upsample_loop |
| RET |
| .w8_no_upsample: |
| %if ARCH_X86_64 |
| lea r3d, [hq+7] |
| %else |
| mov r3d, hd |
| add r3d, 7 |
| %endif |
| movd m0, r3d |
| and r3d, 7 |
| or r3d, 8 ; imin(h+7, 15) |
| test angled, 0x400 |
| jnz .w8_main |
| movd m2, angled |
| shr angled, 8 ; is_sm << 1 |
| pxor m1, m1 |
| pshufb m0, m1 |
| pshufb m2, m1 |
| movu m1, [base+z_filter_wh8] |
| psrldq m3, [base+z_filter_t_w48+angleq*8], 4 |
| pcmpeqb m1, m0 |
| pand m1, m2 |
| pcmpgtb m1, m3 |
| pmovmskb r5d, m1 |
| test r5d, r5d |
| jz .w8_main ; filter_strength == 0 |
| mova m0, [tlq- 1] |
| imul r5d, 0x55555555 |
| mova m1, [tlq+15] |
| shr r5d, 30 ; filter_strength |
| movd m2, [tlq+r3] |
| lea tlq, [rsp+16*4] |
| sub r5, 3 |
| mova [tlq-16*1], m0 |
| pxor m3, m3 |
| mova [tlq+16*0], m1 |
| pshufb m0, m3 |
| pshufb m2, m3 |
| mova [tlq-16*2], m0 |
| movq [tlq+r3-15], m2 |
| call .filter_edge |
| sar r5d, 1 |
| add r5d, 17 |
| cmp hd, 8 |
| cmova r3d, r5d |
| .w8_main: |
| add tlq, r3 |
| movd m5, dxd |
| movd m7, [tlq] |
| shl r3d, 6 |
| movu m3, [base+z_filter_s+2] |
| movd m4, r3d |
| pshufb m5, [base+pw_256] |
| mov r5d, dxd |
| pshufb m7, [base+pw_m256] |
| sub r5, r3 |
| pshufb m4, [base+pw_256] |
| psubw m4, [base+z_base_inc] |
| mova m6, m5 |
| .w8_loop: |
| mov r3, r5 |
| sar r3, 6 |
| movu m0, [tlq+r3] |
| pand m1, m8, m5 |
| psubw m2, m9, m1 |
| psllw m1, 8 |
| pshufb m0, m3 |
| por m1, m2 |
| pmaddubsw m0, m1 |
| pcmpgtw m1, m4, m5 |
| paddw m5, m6 |
| pmulhrsw m0, m10 |
| pand m0, m1 |
| pandn m1, m7 |
| por m0, m1 |
| packuswb m0, m0 |
| movq [dstq], m0 |
| dec hd |
| jz .w8_end |
| add dstq, strideq |
| add r5, dxq |
| jl .w8_loop |
| packuswb m7, m7 |
| .w8_end_loop: |
| movq [dstq], m7 |
| add dstq, strideq |
| dec hd |
| jg .w8_end_loop |
| .w8_end: |
| RET |
| .w16: |
| %if ARCH_X86_64 |
| lea r3d, [hq+15] |
| %else |
| mov r3d, hd |
| add r3d, 15 |
| %endif |
| movd m0, r3d |
| and r3d, 15 |
| or r3d, 16 ; imin(h+15, 31) |
| test angled, 0x400 |
| jnz .w16_main |
| movd m2, angled |
| shr angled, 8 ; is_sm << 1 |
| pxor m1, m1 |
| pshufb m0, m1 |
| pshufb m2, m1 |
| movq m3, [base+z_filter_t_w16+angleq*4] |
| pcmpeqb m1, m0, [base+z_filter_wh16] |
| pand m1, m2 |
| pcmpgtb m1, m3 |
| pmovmskb r5d, m1 |
| test r5d, r5d |
| jz .w16_main ; filter_strength == 0 |
| mova m0, [tlq- 1] |
| imul r5d, 0x24924924 |
| mova m1, [tlq+15] |
| shr r5d, 30 |
| movd m2, [tlq+30] |
| adc r5, -4 ; filter_strength-3 |
| movd m3, [tlq+r3] |
| lea tlq, [rsp+16*4] |
| mova [tlq-16*1], m0 |
| pxor m4, m4 |
| mova [tlq+16*0], m1 |
| pshufb m0, m4 |
| movd [rsp], m2 |
| pshufb m3, m4 |
| mova [tlq-16*2], m0 |
| movd [tlq+r3-15], m3 |
| call .filter_edge |
| cmp hd, 16 |
| jle .w16_main |
| pshuflw m0, [rsp], q0000 |
| sar r5, 1 |
| movd m1, [base+z_filter_k_tail+4+r5*4] |
| lea r3d, [r5+33] |
| pmaddubsw m0, m1 |
| %if ARCH_X86_64 |
| pmulhrsw m0, m10 |
| %else |
| pmulhrsw m0, m4 |
| %endif |
| packuswb m0, m0 |
| movd [tlq+32], m0 |
| .w16_main: |
| add tlq, r3 |
| movd m5, dxd |
| movd m7, [tlq] |
| movd m4, r3d |
| shl r3d, 6 |
| pshufb m5, [base+pw_256] |
| pxor m6, m6 |
| pshufb m7, m6 |
| mov r5d, dxd |
| pshufb m4, m6 |
| sub r5, r3 |
| psubb m4, [base+pb_0to15] |
| mova m6, m5 |
| .w16_loop: |
| mov r3, r5 |
| sar r3, 6 |
| movu m1, [tlq+r3+0] |
| pand m0, m8, m5 |
| movu m2, [tlq+r3+1] |
| psubw m3, m9, m0 |
| psllw m0, 8 |
| por m3, m0 |
| punpcklbw m0, m1, m2 |
| pmaddubsw m0, m3 |
| punpckhbw m1, m2 |
| pmaddubsw m1, m3 |
| psrlw m3, m5, 6 |
| packsswb m3, m3 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| paddw m5, m6 |
| pcmpgtb m2, m4, m3 |
| packuswb m0, m1 |
| pand m0, m2 |
| pandn m2, m7 |
| por m0, m2 |
| mova [dstq], m0 |
| dec hd |
| jz .w16_end |
| add dstq, strideq |
| add r5, dxq |
| jl .w16_loop |
| .w16_end_loop: |
| mova [dstq], m7 |
| add dstq, strideq |
| dec hd |
| jg .w16_end_loop |
| .w16_end: |
| RET |
| .w32: |
| %if ARCH_X86_64 |
| lea r3d, [hq+31] |
| %else |
| mov r3d, hd |
| add r3d, 31 |
| %endif |
| and r3d, 31 |
| or r3d, 32 ; imin(h+31, 63) |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w32_main |
| mova m0, [tlq- 1] |
| mova m1, [tlq+15] |
| mova m2, [tlq+31] |
| mova m3, [tlq+47] |
| movd m4, [tlq+62] |
| movd m5, [tlq+r3] |
| lea tlq, [rsp+16*6] |
| mova [tlq-16*3], m0 |
| pxor m6, m6 |
| mova [tlq-16*2], m1 |
| pshufb m0, m6 |
| mova [tlq-16*1], m2 |
| xor r5d, r5d ; filter_strength = 3 |
| mova [tlq+16*0], m3 |
| movd [rsp], m4 |
| pshufb m5, m6 |
| mova [tlq-16*4], m0 |
| movd [tlq+r3-47], m5 |
| call .filter_edge |
| sub tlq, 16*2 |
| call .filter_edge |
| cmp hd, 32 |
| jle .w32_main |
| pshuflw m0, [rsp], q0000 |
| movd m1, [base+z_filter_k_tail+4] |
| add r3d, 2 |
| pmaddubsw m0, m1 |
| %if ARCH_X86_64 |
| pmulhrsw m0, m10 |
| %else |
| pmulhrsw m0, m4 |
| %endif |
| packuswb m0, m0 |
| movd [tlq+64], m0 |
| .w32_main: |
| add tlq, r3 |
| movd m0, r3d |
| movd m7, [tlq] |
| shl r3d, 6 |
| movd m5, dxd |
| pxor m6, m6 |
| mov r5d, dxd |
| pshufb m0, m6 |
| pshufb m5, [base+pw_256] |
| sub r5, r3 |
| pshufb m7, m6 |
| psubb m0, [base+pb_0to15] |
| movddup m1, [base+pb_m16] |
| mova [rsp+16*0], m0 |
| paddb m0, m1 |
| mova [rsp+16*1], m0 |
| mova m6, m5 |
| .w32_loop: |
| mov r3, r5 |
| sar r3, 6 |
| movu m1, [tlq+r3+16*0+0] |
| pand m0, m8, m5 |
| movu m2, [tlq+r3+16*0+1] |
| psubw m3, m9, m0 |
| psllw m0, 8 |
| por m3, m0 |
| punpcklbw m0, m1, m2 |
| pmaddubsw m0, m3 |
| punpckhbw m1, m2 |
| pmaddubsw m1, m3 |
| psrlw m4, m5, 6 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| packsswb m4, m4 |
| pcmpgtb m2, [rsp+16*0], m4 |
| packuswb m0, m1 |
| pand m0, m2 |
| pandn m2, m7 |
| por m0, m2 |
| movu m1, [tlq+r3+16*1+0] |
| movu m2, [tlq+r3+16*1+1] |
| mova [dstq+16*0], m0 |
| punpcklbw m0, m1, m2 |
| pmaddubsw m0, m3 |
| punpckhbw m1, m2 |
| pmaddubsw m1, m3 |
| paddw m5, m6 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| pcmpgtb m2, [rsp+16*1], m4 |
| packuswb m0, m1 |
| pand m0, m2 |
| pandn m2, m7 |
| por m0, m2 |
| mova [dstq+16*1], m0 |
| dec hd |
| jz .w32_end |
| add dstq, strideq |
| add r5, dxq |
| jl .w32_loop |
| .w32_end_loop: |
| mova [dstq+16*0], m7 |
| mova [dstq+16*1], m7 |
| add dstq, strideq |
| dec hd |
| jg .w32_end_loop |
| .w32_end: |
| RET |
| .w64: |
| %if ARCH_X86_64 |
| lea r3d, [hq+63] |
| %else |
| mov r3d, hd |
| add r3d, 63 |
| %endif |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w64_main |
| mova m0, [tlq- 1] |
| mova m1, [tlq+ 15] |
| mova m2, [tlq+ 31] |
| mova m3, [tlq+ 47] |
| mova [rsp+16*3], m0 |
| pxor m5, m5 |
| mova [rsp+16*4], m1 |
| pshufb m0, m5 |
| mova [rsp+16*5], m2 |
| mova [rsp+16*6], m3 |
| mova [rsp+16*2], m0 |
| mova m0, [tlq+ 63] |
| mova m1, [tlq+ 79] |
| mova m2, [tlq+ 95] |
| mova m3, [tlq+111] |
| movd m4, [tlq+r3] |
| lea tlq, [rsp+16*10] |
| mova [tlq-16*3], m0 |
| xor r5d, r5d ; filter_strength = 3 |
| mova [tlq-16*2], m1 |
| pshufb m4, m5 |
| mova [tlq-16*1], m2 |
| mova [tlq+16*0], m3 |
| movd [tlq+r3-111], m4 |
| cmp hd, 64 |
| jl .w64_filter96 ; skip one call if the last 32 bytes aren't used |
| call .filter_edge |
| .w64_filter96: |
| sub tlq, 16*2 |
| call .filter_edge |
| sub tlq, 16*2 |
| call .filter_edge |
| sub tlq, 16*2 |
| call .filter_edge |
| .w64_main: |
| add tlq, r3 |
| movd m0, r3d |
| movd m7, [tlq] |
| shl r3d, 6 |
| movd m5, dxd |
| pxor m6, m6 |
| mov r5d, dxd |
| pshufb m0, m6 |
| sub r5, r3 |
| pshufb m5, [base+pw_256] |
| pshufb m7, m6 |
| psubb m0, [base+pb_0to15] |
| movddup m1, [base+pb_m16] |
| mova [rsp+16*0], m0 |
| paddb m0, m1 |
| mova [rsp+16*1], m0 |
| paddb m0, m1 |
| mova [rsp+16*2], m0 |
| paddb m0, m1 |
| mova [rsp+16*3], m0 |
| mova m6, m5 |
| .w64_loop: |
| mov r3, r5 |
| sar r3, 6 |
| movu m1, [tlq+r3+16*0+0] |
| pand m0, m8, m5 |
| movu m2, [tlq+r3+16*0+1] |
| psubw m3, m9, m0 |
| psllw m0, 8 |
| por m3, m0 |
| punpcklbw m0, m1, m2 |
| pmaddubsw m0, m3 |
| punpckhbw m1, m2 |
| pmaddubsw m1, m3 |
| psrlw m4, m5, 6 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| packsswb m4, m4 |
| pcmpgtb m2, [rsp+16*0], m4 |
| packuswb m0, m1 |
| pand m0, m2 |
| pandn m2, m7 |
| por m0, m2 |
| movu m1, [tlq+r3+16*1+0] |
| movu m2, [tlq+r3+16*1+1] |
| mova [dstq+16*0], m0 |
| punpcklbw m0, m1, m2 |
| pmaddubsw m0, m3 |
| punpckhbw m1, m2 |
| pmaddubsw m1, m3 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| pcmpgtb m2, [rsp+16*1], m4 |
| packuswb m0, m1 |
| pand m0, m2 |
| pandn m2, m7 |
| por m0, m2 |
| movu m1, [tlq+r3+16*2+0] |
| movu m2, [tlq+r3+16*2+1] |
| mova [dstq+16*1], m0 |
| punpcklbw m0, m1, m2 |
| pmaddubsw m0, m3 |
| punpckhbw m1, m2 |
| pmaddubsw m1, m3 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| pcmpgtb m2, [rsp+16*2], m4 |
| packuswb m0, m1 |
| pand m0, m2 |
| pandn m2, m7 |
| por m0, m2 |
| movu m1, [tlq+r3+16*3+0] |
| movu m2, [tlq+r3+16*3+1] |
| mova [dstq+16*2], m0 |
| punpcklbw m0, m1, m2 |
| pmaddubsw m0, m3 |
| punpckhbw m1, m2 |
| pmaddubsw m1, m3 |
| paddw m5, m6 |
| pmulhrsw m0, m10 |
| pmulhrsw m1, m10 |
| pcmpgtb m2, [rsp+16*3], m4 |
| packuswb m0, m1 |
| pand m0, m2 |
| pandn m2, m7 |
| por m0, m2 |
| mova [dstq+16*3], m0 |
| dec hd |
| jz .w64_end |
| add dstq, strideq |
| add r5, dxq |
| jl .w64_loop |
| .w64_end_loop: |
| mova [dstq+16*0], m7 |
| mova [dstq+16*1], m7 |
| mova [dstq+16*2], m7 |
| mova [dstq+16*3], m7 |
| add dstq, strideq |
| dec hd |
| jg .w64_end_loop |
| .w64_end: |
| RET |
| ALIGN function_align |
| .filter_edge: ; 32 pixels/iteration |
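| ; On entry r5 = filter_strength - 3. The AV1 intra edge filter kernels are |
| ; {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0} and {2, 4, 4, 4, 2} for strengths 1-3; |
| ; z_filter_k stores them scaled by 4, and the pmulhrsw by pw_512 at the end |
| ; performs the matching (+8) >> 4. Strengths 1 and 2 have zero outer taps, |
| ; so the 5-tap tail below only runs for strength 3 (r5 == 0). |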
| movddup m7, [base+z_filter_k+8*2+r5*8+24*0] |
| movu m2, [tlq-17] |
| mova m1, [tlq-16] |
| movu m3, [tlq- 1] |
| mova m4, [tlq+ 0] |
| punpcklbw m0, m2, m1 |
| pmaddubsw m0, m7 |
| punpckhbw m2, m1 |
| pmaddubsw m2, m7 |
| punpcklbw m1, m3, m4 |
| pmaddubsw m1, m7 |
| punpckhbw m3, m4 |
| pmaddubsw m3, m7 |
| movddup m7, [base+z_filter_k+8*2+r5*8+24*1] |
| movu m5, [tlq-15] |
| movu m6, [tlq-14] |
| punpcklbw m4, m5, m6 |
| pmaddubsw m4, m7 |
| punpckhbw m5, m6 |
| pmaddubsw m5, m7 |
| paddw m0, m4 |
| paddw m2, m5 |
| movu m5, [tlq+ 1] |
| movu m6, [tlq+ 2] |
| punpcklbw m4, m5, m6 |
| pmaddubsw m4, m7 |
| punpckhbw m5, m6 |
| pmaddubsw m5, m7 |
| paddw m1, m4 |
| paddw m3, m5 |
| test r5d, r5d |
| jnz .filter_end ; 3-tap |
| movddup m7, [base+z_filter_k+8*8] |
| movu m5, [tlq-13] |
| movu m6, [tlq+ 3] |
| punpcklbw m4, m5, m5 |
| pmaddubsw m4, m7 |
| punpckhbw m5, m5 |
| pmaddubsw m5, m7 |
| paddw m0, m4 |
| paddw m2, m5 |
| punpcklbw m5, m6, m6 |
| pmaddubsw m5, m7 |
| punpckhbw m6, m6 |
| pmaddubsw m6, m7 |
| paddw m1, m5 |
| paddw m3, m6 |
| .filter_end: |
| %if ARCH_X86_64 |
| REPX {pmulhrsw x, m10}, m0, m2, m1, m3 |
| %else |
| mova m4, m10 |
| REPX {pmulhrsw x, m4 }, m0, m2, m1, m3 |
| %endif |
| packuswb m0, m2 |
| packuswb m1, m3 |
| mova [tlq+16*0], m0 |
| mova [tlq+16*1], m1 |
| ret |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, |
| ; const uint8_t *idx, const int w, const int h); |
| ;--------------------------------------------------------------------------------------- |
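| ; Palette prediction is a byte-level table lookup: the 8 palette entries are |
| ; packed to bytes once, then pshufb maps 16 indices per iteration. Scalar |
| ; sketch (C-style pseudocode): |
| ; |
| ;   for (int y = 0; y < h; y++, dst += stride, idx += w) |
| ;       for (int x = 0; x < w; x++) |
| ;           dst[x] = pal[idx[x]]; |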
| cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h |
| mova m4, [palq] |
| LEA r2, pal_pred_ssse3_table |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r2+wq*4] |
| packuswb m4, m4 |
| add wq, r2 |
| lea r2, [strideq*3] |
| jmp wq |
| .w4: |
| pshufb m0, m4, [idxq] |
| add idxq, 16 |
| movd [dstq ], m0 |
| pshuflw m1, m0, q1032 |
| movd [dstq+strideq ], m1 |
| punpckhqdq m0, m0 |
| movd [dstq+strideq*2], m0 |
| psrlq m0, 32 |
| movd [dstq+r2 ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4 |
| RET |
| ALIGN function_align |
| .w8: |
| pshufb m0, m4, [idxq] |
| pshufb m1, m4, [idxq+16] |
| add idxq, 32 |
| movq [dstq ], m0 |
| movhps [dstq+strideq ], m0 |
| movq [dstq+strideq*2], m1 |
| movhps [dstq+r2 ], m1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8 |
| RET |
| ALIGN function_align |
| .w16: |
| pshufb m0, m4, [idxq] |
| pshufb m1, m4, [idxq+16] |
| pshufb m2, m4, [idxq+32] |
| pshufb m3, m4, [idxq+48] |
| add idxq, 64 |
| mova [dstq ], m0 |
| mova [dstq+strideq ], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+r2 ], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w16 |
| RET |
| ALIGN function_align |
| .w32: |
| pshufb m0, m4, [idxq] |
| pshufb m1, m4, [idxq+16] |
| pshufb m2, m4, [idxq+32] |
| pshufb m3, m4, [idxq+48] |
| add idxq, 64 |
| mova [dstq ], m0 |
| mova [dstq+16 ], m1 |
| mova [dstq+strideq ], m2 |
| mova [dstq+strideq+16], m3 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w32 |
| RET |
| ALIGN function_align |
| .w64: |
| pshufb m0, m4, [idxq] |
| pshufb m1, m4, [idxq+16] |
| pshufb m2, m4, [idxq+32] |
| pshufb m3, m4, [idxq+48] |
| add idxq, 64 |
| mova [dstq ], m0 |
| mova [dstq+16], m1 |
| mova [dstq+32], m2 |
| mova [dstq+48], m3 |
| add dstq, strideq |
| sub hd, 1 |
| jg .w64 |
| RET |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int16_t *ac, const int alpha); |
| ;--------------------------------------------------------------------------------------- |
| %macro IPRED_CFL 1 ; ac in, unpacked pixels out |
| psignw m3, m%1, m1 |
| pabsw m%1, m%1 |
| pmulhrsw m%1, m2 |
| psignw m%1, m3 |
| paddw m%1, m0 |
| %endmacro |
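| ; m0 = dc, m1 = alpha (signed), m2 = abs(alpha) << 9. Per ac coefficient |
| ; this computes (a sketch of the C reference): |
| ; |
| ;   diff  = alpha * ac[x]; |
| ;   pixel = dc + apply_sign((abs(diff) + 32) >> 6, diff); |
| ; |
| ; pmulhrsw with abs(alpha) << 9 yields (abs(ac) * abs(alpha) + 32) >> 6, and |
| ; psignw restores the sign so the magnitude is rounded as in the C code |
| ; (pmulhrsw alone would round negative products toward +inf). |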
| |
| %if UNIX64 |
| DECLARE_REG_TMP 7 |
| %else |
| DECLARE_REG_TMP 5 |
| %endif |
| |
| cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha |
| movifnidn wd, wm |
| movifnidn hd, hm |
| tzcnt r6d, hd |
| lea t0d, [wq+hq] |
| movd m4, t0d |
| tzcnt t0d, t0d |
| movd m5, t0d |
| LEA t0, ipred_cfl_ssse3_table |
| tzcnt wd, wd |
| movsxd r6, [t0+r6*4] |
| movsxd wq, [t0+wq*4+16] |
| pcmpeqd m3, m3 |
| psrlw m4, 1 |
| add r6, t0 |
| add wq, t0 |
| movifnidn acq, acmp |
| jmp r6 |
| .h4: |
| movd m0, [tlq-4] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w4: |
| movd m1, [tlq+1] |
| pmaddubsw m1, m3 |
| psubw m0, m4 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| cmp hd, 4 |
| jg .w4_mul |
| psrlw m0, 3 ; dc >>= ctz(width + height); |
| jmp .w4_end |
| .w4_mul: |
| punpckhqdq m1, m0, m0 |
| paddw m0, m1 |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| psrlw m0, 2 |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hd, 8 |
| cmovz r6d, r2d |
| movd m5, r6d |
| pmulhuw m0, m5 |
| .w4_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s4: |
| movd m1, alpham |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| lea r6, [strideq*3] |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s4_loop: |
| mova m4, [acq] |
| mova m5, [acq+16] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| movd [dstq+strideq*0], m4 |
| pshuflw m4, m4, q1032 |
| movd [dstq+strideq*1], m4 |
| punpckhqdq m4, m4 |
| movd [dstq+strideq*2], m4 |
| psrlq m4, 32 |
| movd [dstq+r6 ], m4 |
| lea dstq, [dstq+strideq*4] |
| add acq, 32 |
| sub hd, 4 |
| jg .s4_loop |
| RET |
| ALIGN function_align |
| .h8: |
| movq m0, [tlq-8] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w8: |
| movq m1, [tlq+1] |
| pmaddubsw m1, m3 |
| psubw m4, m0 |
| punpckhqdq m0, m0 |
| psubw m0, m4 |
| paddw m0, m1 |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psrlw m0, m5 |
| cmp hd, 8 |
| je .w8_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| cmp hd, 32 |
| cmovz r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| .w8_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s8: |
| movd m1, alpham |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| lea r6, [strideq*3] |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s8_loop: |
| mova m4, [acq] |
| mova m5, [acq+16] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| movq [dstq ], m4 |
| movhps [dstq+strideq ], m4 |
| mova m4, [acq+32] |
| mova m5, [acq+48] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| movq [dstq+strideq*2], m4 |
| movhps [dstq+r6 ], m4 |
| lea dstq, [dstq+strideq*4] |
| add acq, 64 |
| sub hd, 4 |
| jg .s8_loop |
| RET |
| ALIGN function_align |
| .h16: |
| mova m0, [tlq-16] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w16: |
| movu m1, [tlq+1] |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| psubw m4, m0 |
| punpckhqdq m0, m0 |
| psubw m0, m4 |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psrlw m0, m5 |
| cmp hd, 16 |
| je .w16_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hd, 8|32 |
| cmovz r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| .w16_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s16: |
| movd m1, alpham |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s16_loop: |
| mova m4, [acq] |
| mova m5, [acq+16] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| mova [dstq], m4 |
| mova m4, [acq+32] |
| mova m5, [acq+48] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| mova [dstq+strideq], m4 |
| lea dstq, [dstq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .s16_loop |
| RET |
| ALIGN function_align |
| .h32: |
| mova m0, [tlq-32] |
| pmaddubsw m0, m3 |
| mova m2, [tlq-16] |
| pmaddubsw m2, m3 |
| paddw m0, m2 |
| jmp wq |
| .w32: |
| movu m1, [tlq+1] |
| pmaddubsw m1, m3 |
| movu m2, [tlq+17] |
| pmaddubsw m2, m3 |
| paddw m1, m2 |
| paddw m0, m1 |
| psubw m4, m0 |
| punpckhqdq m0, m0 |
| psubw m0, m4 |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| pmaddwd m0, m3 |
| psrlw m0, m5 ; dc >>= ctz(width + height); |
| cmp hd, 32 |
| je .w32_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hd, 64|16 |
| cmovz r6d, r2d |
| movd m1, r6d |
| pmulhuw m0, m1 |
| .w32_end: |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| .s32: |
| movd m1, alpham |
| pshuflw m1, m1, q0000 |
| punpcklqdq m1, m1 |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s32_loop: |
| mova m4, [acq] |
| mova m5, [acq+16] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| mova [dstq], m4 |
| mova m4, [acq+32] |
| mova m5, [acq+48] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| mova [dstq+16], m4 |
| add dstq, strideq |
| add acq, 64 |
| dec hd |
| jg .s32_loop |
| RET |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int16_t *ac, const int alpha); |
| ;--------------------------------------------------------------------------------------- |
| cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha |
| mov hd, hm ; zero upper half |
| tzcnt r6d, hd |
| sub tlq, hq |
| tzcnt wd, wm |
| movu m0, [tlq] |
| mov t0d, 0x8000 |
| movd m3, t0d |
| movd m2, r6d |
| psrld m3, m2 |
| LEA t0, ipred_cfl_left_ssse3_table |
| movsxd r6, [t0+r6*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, t0 |
| add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table |
| movsxd wq, [t0+wq*4] |
| add wq, t0 |
| movifnidn acq, acmp |
| jmp r6 |
| .h32: |
| movu m1, [tlq+16] ; unaligned when jumping here from cfl_top |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| .h16: |
| pshufd m1, m0, q3232 ; psrldq m1, m0, 8 |
| paddw m0, m1 |
| .h8: |
| pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 |
| paddw m0, m1 |
| .h4: |
| pmaddwd m0, m2 |
| pmulhrsw m0, m3 |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| jmp wq |
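| ; The rounded division by height is done with pmulhrsw against |
| ; m3 = 0x8000 >> ctz(height), using the identity |
| ; pmulhrsw(x, 0x8000 >> k) == (x + (1 << (k - 1))) >> k, |
| ; i.e. dc = (sum + (height >> 1)) >> ctz(height) in scalar terms. |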
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int16_t *ac, const int alpha); |
| ;--------------------------------------------------------------------------------------- |
| cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha |
| LEA t0, ipred_cfl_left_ssse3_table |
| tzcnt wd, wm |
| inc tlq |
| movu m0, [tlq] |
| movifnidn hd, hm |
| mov r6d, 0x8000 |
| movd m3, r6d |
| movd m2, wd |
| psrld m3, m2 |
| movsxd r6, [t0+wq*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, t0 |
| add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table |
| movsxd wq, [t0+wq*4] |
| add wq, t0 |
| movifnidn acq, acmp |
| jmp r6 |
| |
| ;--------------------------------------------------------------------------------------- |
| ;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, |
| ; const int width, const int height, const int16_t *ac, const int alpha); |
| ;--------------------------------------------------------------------------------------- |
| cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha |
| tzcnt wd, wm |
| movifnidn hd, hm |
| LEA r6, ipred_cfl_splat_ssse3_table |
| movsxd wq, [r6+wq*4] |
| movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] |
| add wq, r6 |
| movifnidn acq, acmp |
| jmp wq |
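| ; cfl_128 skips the DC computation entirely: the prediction base is the |
| ; bitdepth midpoint, 1 << (8 - 1) = 128, splatted from pw_128 before |
| ; joining the shared .s* store loops via the splat table. |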
| |
| %macro RELOAD_ACQ_32 1 |
| mov %1, ac_bakq ; restore acq |
| %endmacro |
| |
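| ; The cfl_ac functions fill ac[] with scaled, chroma-subsampled luma, |
| ; pad it, and subtract the block average. A scalar sketch of the 4:2:0 |
| ; path below (illustrative, not dav1d's exact C code): |
| ; |
| ; ac[x] = (y[2*x] + y[2*x + 1] + y[2*x + stride] + y[2*x + 1 + stride]) << 1; |
| ; |
| ; pmaddubsw with the pb_2 weights yields the doubled horizontal pair |
| ; sums and paddw of two rows completes the 2x2 box. wpad/hpad give the |
| ; right/bottom padding in units of 4 chroma pixels; padded positions |
| ; replicate the last real column/row of sums. |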
| %if ARCH_X86_64 |
| cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak |
| DECLARE_REG_TMP 7 |
| movddup m2, [pb_2] |
| %else |
| cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h |
| DECLARE_REG_TMP 4 |
| %define ac_bakq acmp |
| mov t0d, 0x02020202 |
| movd m2, t0d |
| pshufd m2, m2, q0000 |
| %endif |
| movifnidn wd, wm |
| mov t0d, hm |
| mov hd, t0d |
| imul t0d, wd |
| movd m5, t0d |
| movifnidn hpadd, hpadm |
| %if ARCH_X86_64 |
| mov ac_bakq, acq |
| %endif |
| shl hpadd, 2 |
| sub hd, hpadd |
| pxor m4, m4 |
| cmp wd, 8 |
| jg .w16 |
| je .w8 |
| ; fall-through |
| %if ARCH_X86_64 |
| DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak |
| %else |
| DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h |
| %endif |
| .w4: |
| lea stride3q, [strideq*3] |
| .w4_loop: |
| movq m0, [yq] |
| movq m1, [yq+strideq] |
| movhps m0, [yq+strideq*2] |
| movhps m1, [yq+stride3q] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| mova [acq], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*4] |
| add acq, 16 |
| sub hd, 2 |
| jg .w4_loop |
| test hpadd, hpadd |
| jz .calc_avg_4_8 |
| punpckhqdq m0, m0 |
| .w4_hpad_loop: |
| mova [acq], m0 |
| paddw m4, m0 |
| add acq, 16 |
| sub hpadd, 2 |
| jg .w4_hpad_loop |
| jmp .calc_avg_4_8 |
| .w8: |
| lea stride3q, [strideq*3] |
| test wpadd, wpadd |
| jnz .w8_wpad |
| .w8_loop: |
| mova m0, [yq] |
| mova m1, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| mova [acq], m0 |
| paddw m4, m0 |
| mova m0, [yq+strideq*2] |
| mova m1, [yq+stride3q] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| mova [acq+16], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*4] |
| add acq, 32 |
| sub hd, 2 |
| jg .w8_loop |
| test hpadd, hpadd |
| jz .calc_avg_4_8 |
| jmp .w8_hpad |
| .w8_wpad: ; wpadd=1 |
| movddup m0, [yq] |
| movddup m1, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| pshufhw m0, m0, q3333 |
| mova [acq], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 16 |
| sub hd, 1 |
| jg .w8_wpad |
| test hpadd, hpadd |
| jz .calc_avg_4_8 |
| .w8_hpad: |
| mova [acq], m0 |
| paddw m4, m0 |
| add acq, 16 |
| sub hpadd, 1 |
| jg .w8_hpad |
| jmp .calc_avg_4_8 |
| .w16: |
| test wpadd, wpadd |
| jnz .w16_wpad |
| .w16_loop: |
| mova m0, [yq] |
| mova m1, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| mova [acq], m0 |
| paddw m4, m0 |
| mova m6, [yq+16] |
| mova m1, [yq+strideq+16] |
| pmaddubsw m6, m2 |
| pmaddubsw m1, m2 |
| paddw m6, m1 |
| mova [acq+16], m6 |
| paddw m4, m6 |
| lea yq, [yq+strideq*2] |
| add acq, 32 |
| dec hd |
| jg .w16_loop |
| test hpadd, hpadd |
| jz .calc_avg16 |
| jmp .w16_hpad_loop |
| .w16_wpad: |
| cmp wpadd, 2 |
| jl .w16_pad1 |
| je .w16_pad2 |
| .w16_pad3: |
| movddup m0, [yq] |
| movddup m1, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| pshufhw m0, m0, q3333 |
| mova [acq], m0 |
| paddw m4, m0 |
| punpckhqdq m6, m0, m0 |
| mova [acq+16], m6 |
| paddw m4, m6 |
| lea yq, [yq+strideq*2] |
| add acq, 32 |
| dec hd |
| jg .w16_pad3 |
| jmp .w16_wpad_done |
| .w16_pad2: |
| mova m0, [yq] |
| mova m1, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| mova [acq], m0 |
| paddw m4, m0 |
| pshufhw m6, m0, q3333 |
| punpckhqdq m6, m6 |
| mova [acq+16], m6 |
| paddw m4, m6 |
| lea yq, [yq+strideq*2] |
| add acq, 32 |
| dec hd |
| jg .w16_pad2 |
| jmp .w16_wpad_done |
| .w16_pad1: |
| mova m0, [yq] |
| mova m1, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| mova [acq], m0 |
| paddw m4, m0 |
| movddup m6, [yq+16] |
| movddup m1, [yq+strideq+16] |
| pmaddubsw m6, m2 |
| pmaddubsw m1, m2 |
| paddw m6, m1 |
| pshufhw m6, m6, q3333 |
| mova [acq+16], m6 |
| paddw m4, m6 |
| lea yq, [yq+strideq*2] |
| add acq, 32 |
| dec hd |
| jg .w16_pad1 |
| .w16_wpad_done: |
| test hpadd, hpadd |
| jz .calc_avg16 |
| .w16_hpad_loop: |
| mova [acq], m0 |
| paddw m4, m0 |
| mova [acq+16], m6 |
| paddw m4, m6 |
| add acq, 32 |
| dec hpadd |
| jg .w16_hpad_loop |
| jmp .calc_avg16 |
| |
| %if ARCH_X86_64 |
| DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak |
| %else |
| DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h |
| %endif |
| .calc_avg_4_8: |
| psrlw m2, 9 |
| pmaddwd m4, m2 |
| jmp .calc_avg |
| .calc_avg16: |
| psrld m0, m4, 16 |
| pslld m4, 16 |
| psrld m4, 16 |
| paddd m4, m0 |
| .calc_avg: |
| movd szd, m5 |
| psrad m5, 1 |
| tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); |
| paddd m4, m5 |
| movd m1, r1d |
| pshufd m0, m4, q2301 |
| paddd m0, m4 |
| pshufd m4, m0, q1032 |
| paddd m0, m4 |
| psrad m0, m1 ; sum >>= log2sz; |
| packssdw m0, m0 |
| RELOAD_ACQ_32 acq |
| .sub_loop: |
| mova m1, [acq] |
| psubw m1, m0 ; ac[x] -= sum; |
| mova [acq], m1 |
| add acq, 16 |
| sub szd, 8 |
| jg .sub_loop |
| RET |
| |
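| ; Scalar sketch of the shared averaging step (.calc_avg*): |
| ; |
| ; int sum = (width * height) >> 1; // rounding bias |
| ; for (int i = 0; i < width * height; i++) sum += ac[i]; |
| ; sum >>= log2sz; // log2sz = ctz(width) + ctz(height) |
| ; for (int i = 0; i < width * height; i++) ac[i] -= sum; |
| ; |
| ; The 4:2:2 variant below subsamples horizontally only, so each entry |
| ; is ac[x] = (y[2*x] + y[2*x + 1]) << 2, via pmaddubsw with pb_4. |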
| %if ARCH_X86_64 |
| cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak |
| movddup m2, [pb_4] |
| %else |
| cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h |
| mov t0d, 0x04040404 |
| movd m2, t0d |
| pshufd m2, m2, q0000 |
| %endif |
| movifnidn wd, wm |
| mov t0d, hm |
| mov hd, t0d |
| imul t0d, wd |
| movd m6, t0d |
| movifnidn hpadd, hpadm |
| %if ARCH_X86_64 |
| mov ac_bakq, acq |
| %endif |
| shl hpadd, 2 |
| sub hd, hpadd |
| pxor m4, m4 |
| pxor m5, m5 |
| cmp wd, 8 |
| jg .w16 |
| je .w8 |
| ; fall-through |
| |
| %if ARCH_X86_64 |
| DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak |
| %else |
| DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h |
| %endif |
| .w4: |
| lea stride3q, [strideq*3] |
| .w4_loop: |
| movq m1, [yq] |
| movhps m1, [yq+strideq] |
| movq m0, [yq+strideq*2] |
| movhps m0, [yq+stride3q] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| mova [acq+16], m0 |
| paddw m4, m0 |
| paddw m5, m1 |
| lea yq, [yq+strideq*4] |
| add acq, 32 |
| sub hd, 4 |
| jg .w4_loop |
| test hpadd, hpadd |
| jz .calc_avg_4 |
| punpckhqdq m0, m0 |
| .w4_hpad_loop: |
| mova [acq], m0 |
| paddw m4, m0 |
| add acq, 16 |
| sub hpadd, 2 |
| jg .w4_hpad_loop |
| jmp .calc_avg_4 |
| .w8: |
| lea stride3q, [strideq*3] |
| test wpadd, wpadd |
| jnz .w8_wpad |
| .w8_loop: |
| mova m1, [yq] |
| mova m0, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| mova [acq+16], m0 |
| paddw m4, m0 |
| paddw m5, m1 |
| mova m1, [yq+strideq*2] |
| mova m0, [yq+stride3q] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| paddw m5, m1 |
| lea yq, [yq+strideq*4] |
| add acq, 64 |
| sub hd, 4 |
| jg .w8_loop |
| test hpadd, hpadd |
| jz .calc_avg_8_16 |
| jmp .w8_hpad |
| .w8_wpad: |
| movddup m1, [yq] |
| pmaddubsw m1, m2 |
| pshufhw m1, m1, q3333 |
| mova [acq], m1 |
| paddw m5, m1 |
| movddup m0, [yq+strideq] |
| pmaddubsw m0, m2 |
| pshufhw m0, m0, q3333 |
| mova [acq+16], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 32 |
| sub hd, 2 |
| jg .w8_wpad |
| test hpadd, hpadd |
| jz .calc_avg_8_16 |
| .w8_hpad: |
| mova [acq], m0 |
| paddw m4, m0 |
| mova [acq+16], m0 |
| paddw m4, m0 |
| add acq, 32 |
| sub hpadd, 2 |
| jg .w8_hpad |
| jmp .calc_avg_8_16 |
| .w16: |
| test wpadd, wpadd |
| jnz .w16_wpad |
| .w16_loop: |
| mova m1, [yq] |
| mova m0, [yq+16] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| mova [acq+16], m0 |
| paddw m5, m0 |
| paddw m5, m1 |
| mova m1, [yq+strideq] |
| mova m0, [yq+strideq+16] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| paddw m4, m1 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_loop |
| test hpadd, hpadd |
| jz .calc_avg_8_16 |
| jmp .w16_hpad_loop |
| .w16_wpad: |
| cmp wpadd, 2 |
| jl .w16_pad1 |
| je .w16_pad2 |
| .w16_pad3: |
| movddup m1, [yq] |
| pmaddubsw m1, m2 |
| pshufhw m1, m1, q3333 |
| mova [acq], m1 |
| paddw m5, m1 |
| punpckhqdq m1, m1 |
| mova [acq+16], m1 |
| paddw m5, m1 |
| movddup m1, [yq+strideq] |
| pmaddubsw m1, m2 |
| pshufhw m1, m1, q3333 |
| mova [acq+32], m1 |
| paddw m4, m1 |
| punpckhqdq m0, m1, m1 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_pad3 |
| jmp .w16_wpad_done |
| .w16_pad2: |
| mova m1, [yq] |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1 |
| pshufhw m1, m1, q3333 |
| punpckhqdq m1, m1 |
| mova [acq+16], m1 |
| paddw m5, m1 |
| mova m1, [yq+strideq] |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| paddw m4, m1 |
| pshufhw m0, m1, q3333 |
| punpckhqdq m0, m0 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_pad2 |
| jmp .w16_wpad_done |
| .w16_pad1: |
| mova m1, [yq] |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1 |
| movddup m0, [yq+16] |
| pmaddubsw m0, m2 |
| pshufhw m0, m0, q3333 |
| mova [acq+16], m0 |
| paddw m5, m0 |
| mova m1, [yq+strideq] |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| paddw m4, m1 |
| movddup m0, [yq+strideq+16] |
| pmaddubsw m0, m2 |
| pshufhw m0, m0, q3333 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_pad1 |
| .w16_wpad_done: |
| test hpadd, hpadd |
| jz .calc_avg_8_16 |
| .w16_hpad_loop: |
| mova [acq], m1 |
| mova [acq+16], m0 |
| paddw m4, m1 |
| paddw m5, m0 |
| mova [acq+32], m1 |
| mova [acq+48], m0 |
| paddw m4, m1 |
| paddw m5, m0 |
| add acq, 64 |
| sub hpadd, 2 |
| jg .w16_hpad_loop |
| jmp .calc_avg_8_16 |
| |
| %if ARCH_X86_64 |
| DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak |
| %else |
| DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h |
| %endif |
| .calc_avg_4: |
| psrlw m2, 10 |
| pmaddwd m5, m2 |
| pmaddwd m0, m4, m2 |
| jmp .calc_avg |
| .calc_avg_8_16: |
| mova m0, m5 |
| psrld m5, 16 |
| pslld m0, 16 |
| psrld m0, 16 |
| paddd m5, m0 |
| mova m0, m4 |
| psrld m0, 16 |
| pslld m4, 16 |
| psrld m4, 16 |
| paddd m0, m4 |
| .calc_avg: |
| paddd m5, m0 |
| movd szd, m6 |
| psrad m6, 1 |
| tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); |
| paddd m5, m6 |
| movd m1, r1d |
| pshufd m0, m5, q2301 |
| paddd m0, m5 |
| pshufd m5, m0, q1032 |
| paddd m0, m5 |
| psrad m0, m1 ; sum >>= log2sz; |
| packssdw m0, m0 |
| RELOAD_ACQ_32 acq ; ac = ac_orig |
| .sub_loop: |
| mova m1, [acq] |
| psubw m1, m0 |
| mova [acq], m1 |
| add acq, 16 |
| sub szd, 8 |
| jg .sub_loop |
| RET |
| |
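| ; The 4:4:4 variant below has no subsampling: ac[x] = y[x] << 3, |
| ; implemented by duplicating each byte (punpcklbw with itself) so that |
| ; the same pb_4 pmaddubsw weights produce y*4 + y*4 = y << 3. |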
| %if ARCH_X86_64 |
| cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak |
| movddup m2, [pb_4] |
| %else |
| cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h |
| %define ac_bakq [rsp+16*4] |
| mov t0d, 0x04040404 |
| movd m2, t0d |
| pshufd m2, m2, q0000 |
| %endif |
| movifnidn wd, wm |
| movifnidn hpadd, hpadm |
| movd m0, hpadd ; preserve hpadd; t0 aliases hpad on x86-32 |
| mov t0d, hm |
| mov hd, t0d |
| imul t0d, wd |
| movd m6, t0d |
| movd hpadd, m0 ; restore hpadd |
| mov ac_bakq, acq |
| shl hpadd, 2 |
| sub hd, hpadd |
| pxor m5, m5 |
| pxor m4, m4 |
| cmp wd, 16 |
| jg .w32 |
| cmp wd, 8 |
| jg .w16 |
| je .w8 |
| ; fall-through |
| |
| %if ARCH_X86_64 |
| DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak |
| %else |
| DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h |
| %endif |
| .w4: |
| lea stride3q, [strideq*3] |
| .w4_loop: |
| movd m1, [yq] |
| movd m3, [yq+strideq] |
| punpckldq m1, m3 |
| punpcklbw m1, m1 |
| movd m0, [yq+strideq*2] |
| movd m3, [yq+stride3q] |
| punpckldq m0, m3 |
| punpcklbw m0, m0 |
| pmaddubsw m1, m2 |
| pmaddubsw m0, m2 |
| mova [acq], m1 |
| mova [acq+16], m0 |
| paddw m5, m0 |
| paddw m5, m1 |
| lea yq, [yq+strideq*4] |
| add acq, 32 |
| sub hd, 4 |
| jg .w4_loop |
| test hpadd, hpadd |
| jz .calc_avg_4 |
| punpckhqdq m0, m0 |
| .w4_hpad_loop: |
| mova [acq], m0 |
| paddw m5, m0 |
| add acq, 16 |
| sub hpadd, 2 |
| jg .w4_hpad_loop |
| .calc_avg_4: |
| psrlw m2, 10 |
| pmaddwd m5, m2 |
| jmp .calc_avg |
| |
| .w8: |
| lea stride3q, [strideq*3] |
| test wpadd, wpadd |
| jnz .w8_wpad |
| .w8_loop: |
| movq m1, [yq] |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1 |
| movq m0, [yq+strideq] |
| punpcklbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0 |
| movq m1, [yq+strideq*2] |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| paddw m4, m1 |
| movq m0, [yq+stride3q] |
| punpcklbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*4] |
| add acq, 64 |
| sub hd, 4 |
| jg .w8_loop |
| test hpadd, hpadd |
| jz .calc_avg_8_16 |
| jmp .w8_hpad |
| .w8_wpad: |
| movd m1, [yq] |
| punpcklbw m1, m1 |
| punpcklqdq m1, m1 |
| pmaddubsw m1, m2 |
| pshufhw m1, m1, q3333 |
| mova [acq], m1 |
| paddw m5, m1 |
| movd m0, [yq+strideq] |
| punpcklbw m0, m0 |
| punpcklqdq m0, m0 |
| pmaddubsw m0, m2 |
| pshufhw m0, m0, q3333 |
| mova [acq+16], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 32 |
| sub hd, 2 |
| jg .w8_wpad |
| test hpadd, hpadd |
| jz .calc_avg_8_16 |
| .w8_hpad: |
| mova [acq], m0 |
| paddw m5, m0 |
| mova [acq+16], m0 |
| paddw m4, m0 |
| add acq, 32 |
| sub hpadd, 2 |
| jg .w8_hpad |
| jmp .calc_avg_8_16 |
| |
| .w16: |
| test wpadd, wpadd |
| jnz .w16_wpad |
| .w16_loop: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1 |
| punpckhbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0 |
| mova m0, [yq+strideq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| paddw m4, m1 |
| punpckhbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_loop |
| test hpadd, hpadd |
| jz .calc_avg_8_16 |
| jmp .w16_hpad_loop |
| .w16_wpad: |
| cmp wpadd, 2 |
| jl .w16_pad1 |
| je .w16_pad2 |
| .w16_pad3: |
| movd m1, [yq] |
| punpcklbw m1, m1 |
| punpcklqdq m1, m1 |
| pshufhw m1, m1, q3333 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1 |
| punpckhqdq m1, m1 |
| mova [acq+16], m1 |
| paddw m5, m1 |
| movd m1, [yq+strideq] |
| punpcklbw m1, m1 |
| punpcklqdq m1, m1 |
| pshufhw m1, m1, q3333 |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| paddw m4, m1 |
| punpckhqdq m0, m1, m1 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_pad3 |
| jmp .w16_wpad_done |
| .w16_pad2: |
| movq m1, [yq] |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1 |
| pshufhw m1, m1, q3333 |
| punpckhqdq m1, m1 |
| mova [acq+16], m1 |
| paddw m5, m1 |
| movq m1, [yq+strideq] |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| paddw m4, m1 |
| pshufhw m0, m1, q3333 |
| punpckhqdq m0, m0 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_pad2 |
| jmp .w16_wpad_done |
| .w16_pad1: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1 |
| punpckhbw m0, m0 |
| punpcklqdq m0, m0 |
| pshufhw m0, m0, q3333 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0 |
| mova m0, [yq+strideq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq+32], m1 |
| paddw m4, m1 |
| punpckhbw m0, m0 |
| punpcklqdq m0, m0 |
| pshufhw m0, m0, q3333 |
| pmaddubsw m0, m2 |
| mova [acq+48], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_pad1 |
| .w16_wpad_done: |
| test hpadd, hpadd |
| jz .calc_avg_8_16 |
| .w16_hpad_loop: |
| mova [acq], m1 |
| mova [acq+16], m0 |
| paddw m4, m1 |
| paddw m5, m0 |
| mova [acq+32], m1 |
| mova [acq+48], m0 |
| paddw m4, m1 |
| paddw m5, m0 |
| add acq, 64 |
| sub hpadd, 2 |
| jg .w16_hpad_loop |
| .calc_avg_8_16: |
| mova m0, m5 |
| psrld m5, 16 |
| pslld m0, 16 |
| psrld m0, 16 |
| paddd m5, m0 |
| mova m0, m4 |
| psrld m0, 16 |
| pslld m4, 16 |
| psrld m4, 16 |
| paddd m0, m4 |
| paddd m5, m0 |
| jmp .calc_avg |
| |
| .w32: |
| pxor m0, m0 |
| mova [rsp ], m0 |
| mova [rsp+16], m0 |
| mova [rsp+32], m0 |
| mova [rsp+48], m0 |
| test wpadd, wpadd |
| jnz .w32_wpad |
| .w32_loop: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| punpckhbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| mova m4, [yq+16] |
| mova m3, m4 |
| punpcklbw m3, m3 |
| pmaddubsw m3, m2 |
| mova [acq+32], m3 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| punpckhbw m4, m4 |
| pmaddubsw m4, m2 |
| mova [acq+48], m4 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| lea yq, [yq+strideq] |
| add acq, 64 |
| sub hd, 1 |
| jg .w32_loop |
| test hpadd, hpadd |
| jz .calc_avg_32 |
| jmp .w32_hpad_loop |
| .w32_wpad: |
| cmp wpadd, 2 |
| jl .w32_pad1 |
| je .w32_pad2 |
| cmp wpadd, 4 |
| jl .w32_pad3 |
| je .w32_pad4 |
| cmp wpadd, 6 |
| jl .w32_pad5 |
| je .w32_pad6 |
| .w32_pad7: |
| movd m1, [yq] |
| punpcklbw m1, m1 |
| punpcklqdq m1, m1 |
| pshufhw m1, m1, q3333 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| mova m0, m1 |
| punpckhqdq m0, m0 |
| mova [acq+16], m0 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| mova m3, m0 |
| mova [acq+32], m3 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| mova m4, m3 |
| mova [acq+48], m4 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| lea yq, [yq+strideq] |
| add acq, 64 |
| sub hd, 1 |
| jg .w32_pad7 |
| jmp .w32_wpad_done |
| .w32_pad6: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| pshufhw m0, m1, q3333 |
| punpckhqdq m0, m0 |
| mova [acq+16], m0 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| mova m3, m0 |
| mova [acq+32], m3 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| mova m4, m3 |
| mova [acq+48], m4 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| lea yq, [yq+strideq] |
| add acq, 64 |
| sub hd, 1 |
| jg .w32_pad6 |
| jmp .w32_wpad_done |
| .w32_pad5: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| punpckhbw m0, m0 |
| punpcklqdq m0, m0 |
| pshufhw m0, m0, q3333 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| mova m3, m0 |
| punpckhqdq m3, m3 |
| mova [acq+32], m3 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| mova m4, m3 |
| mova [acq+48], m4 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| lea yq, [yq+strideq] |
| add acq, 64 |
| sub hd, 1 |
| jg .w32_pad5 |
| jmp .w32_wpad_done |
| .w32_pad4: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| punpckhbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| mova m3, m0 |
| pshufhw m3, m3, q3333 |
| punpckhqdq m3, m3 |
| mova [acq+32], m3 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| mova m4, m3 |
| mova [acq+48], m4 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| lea yq, [yq+strideq] |
| add acq, 64 |
| sub hd, 1 |
| jg .w32_pad4 |
| jmp .w32_wpad_done |
| .w32_pad3: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| punpckhbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| movd m3, [yq+16] |
| punpcklbw m3, m3 |
| punpcklqdq m3, m3 |
| pshufhw m3, m3, q3333 |
| pmaddubsw m3, m2 |
| mova [acq+32], m3 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| mova m4, m3 |
| punpckhqdq m4, m4 |
| mova [acq+48], m4 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| lea yq, [yq+strideq] |
| add acq, 64 |
| sub hd, 1 |
| jg .w32_pad3 |
| jmp .w32_wpad_done |
| .w32_pad2: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| punpckhbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| mova m3, [yq+16] |
| punpcklbw m3, m3 |
| pmaddubsw m3, m2 |
| mova [acq+32], m3 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| pshufhw m4, m3, q3333 |
| punpckhqdq m4, m4 |
| mova [acq+48], m4 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| lea yq, [yq+strideq] |
| add acq, 64 |
| sub hd, 1 |
| jg .w32_pad2 |
| jmp .w32_wpad_done |
| .w32_pad1: |
| mova m0, [yq] |
| mova m1, m0 |
| punpcklbw m1, m1 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| punpckhbw m0, m0 |
| pmaddubsw m0, m2 |
| mova [acq+16], m0 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| mova m4, [yq+16] |
| mova m3, m4 |
| punpcklbw m3, m3 |
| pmaddubsw m3, m2 |
| mova [acq+32], m3 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| punpckhbw m4, m4 |
| punpcklqdq m4, m4 |
| pshufhw m4, m4, q3333 |
| pmaddubsw m4, m2 |
| mova [acq+48], m4 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| lea yq, [yq+strideq] |
| add acq, 64 |
| sub hd, 1 |
| jg .w32_pad1 |
| .w32_wpad_done: |
| test hpadd, hpadd |
| jz .calc_avg_32 |
| .w32_hpad_loop: |
| mova [acq], m1 |
| mova [acq+16], m0 |
| paddw m5, m1, [rsp] |
| mova [rsp ], m5 |
| paddw m5, m0, [rsp+16] |
| mova [rsp+16], m5 |
| mova [acq+32], m3 |
| mova [acq+48], m4 |
| paddw m5, m3, [rsp+32] |
| mova [rsp+32], m5 |
| paddw m5, m4, [rsp+48] |
| mova [rsp+48], m5 |
| add acq, 64 |
| sub hpadd, 1 |
| jg .w32_hpad_loop |
| |
| %if ARCH_X86_64 |
| DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak |
| %else |
| DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h |
| %endif |
| |
| .calc_avg_32: |
| mova m5, [rsp] |
| mova m0, m5 |
| psrld m5, 16 |
| pslld m0, 16 |
| psrld m0, 16 |
| paddd m5, m0 |
| mova m0, [rsp+16] |
| mova m3, m0 |
| psrld m0, 16 |
| pslld m3, 16 |
| psrld m3, 16 |
| paddd m0, m3 |
| paddd m5, m0 |
| mova m0, [rsp+32] |
| mova m3, m0 |
| psrld m0, 16 |
| pslld m3, 16 |
| psrld m3, 16 |
| paddd m0, m3 |
| mova m1, [rsp+48] |
| mova m3, m1 |
| psrld m1, 16 |
| pslld m3, 16 |
| psrld m3, 16 |
| paddd m1, m3 |
| paddd m1, m0 |
| paddd m5, m1 |
| .calc_avg: |
| movd szd, m6 |
| psrad m6, 1 |
| tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); |
| paddd m5, m6 |
| movd m1, r1d |
| pshufd m0, m5, q2301 |
| paddd m0, m5 |
| pshufd m5, m0, q1032 |
| paddd m0, m5 |
| psrad m0, m1 ; sum >>= log2sz; |
| packssdw m0, m0 |
| RELOAD_ACQ_32 acq ; ac = ac_orig |
| .sub_loop: |
| mova m1, [acq] |
| psubw m1, m0 |
| mova [acq], m1 |
| add acq, 16 |
| sub szd, 8 |
| jg .sub_loop |
| RET |
| |
| ; %1 SIMD register that holds the mask and will hold the result |
| ; %2 SIMD register that holds the "true" values |
| ; %3 location of the "false" values (SIMD register/memory) |
| %macro BLEND 3 ; mask, true, false |
| pand %2, %1 |
| pandn %1, %3 |
| por %1, %2 |
| %endmacro |
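| ; i.e. a bitwise select, the SSE2 stand-in for a per-byte conditional |
| ; move: %1 = (%1 & %2) | (~%1 & %3). |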
| |
| %macro PAETH 2 ; top, ldiff |
| pavgb m1, m%1, m3 |
| pxor m0, m%1, m3 |
| pand m0, m4 |
| psubusb m2, m5, m1 |
| psubb m1, m0 |
| psubusb m1, m5 |
| por m1, m2 |
| paddusb m1, m1 |
| por m1, m0 ; min(tldiff, 255) |
| psubusb m2, m5, m3 |
| psubusb m0, m3, m5 |
| por m2, m0 ; tdiff |
| %ifnum %2 |
| pminub m2, m%2 |
| pcmpeqb m0, m%2, m2 ; ldiff <= tdiff |
| %else |
| mova m0, %2 |
| pminub m2, m0 |
| pcmpeqb m0, m2 |
| %endif |
| pminub m1, m2 |
| pcmpeqb m1, m2 ; min(ldiff, tdiff) <= tldiff |
| mova m2, m3 |
| BLEND m0, m2, m%1 |
| BLEND m1, m0, m5 |
| %endmacro |
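| ; Scalar Paeth for reference (a sketch, not dav1d's exact C code): |
| ; |
| ; int base = left + top - topleft; |
| ; int ldiff = abs(base - left); // == abs(top - topleft) |
| ; int tdiff = abs(base - top); // == abs(left - topleft) |
| ; int tldiff = abs(base - topleft); |
| ; pred = ldiff <= tdiff && ldiff <= tldiff ? left |
| ; : tdiff <= tldiff ? top : topleft; |
| ; |
| ; tldiff is computed without widening to 16-bit: pavgb rounds up, so |
| ; subtracting (top ^ left) & 1 gives floor((top + left) / 2); saturating |
| ; subtracts against topleft, a saturating doubling and restoring the |
| ; dropped low bit then yield min(tldiff, 255). |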
| |
| cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h |
| %define base r5-ipred_paeth_ssse3_table |
| tzcnt wd, wm |
| movifnidn hd, hm |
| pxor m0, m0 |
| movd m5, [tlq] |
| pshufb m5, m0 |
| LEA r5, ipred_paeth_ssse3_table |
| movsxd wq, [r5+wq*4] |
| movddup m4, [base+ipred_paeth_shuf] |
| add wq, r5 |
| jmp wq |
| .w4: |
| movd m6, [tlq+1] ; top |
| pshufd m6, m6, q0000 |
| lea r3, [strideq*3] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 ; ldiff |
| .w4_loop: |
| sub tlq, 4 |
| movd m3, [tlq] |
| mova m1, [base+ipred_h_shuf] |
| pshufb m3, m1 ; left |
| PAETH 6, 7 |
| movd [dstq ], m1 |
| pshuflw m0, m1, q1032 |
| movd [dstq+strideq ], m0 |
| punpckhqdq m1, m1 |
| movd [dstq+strideq*2], m1 |
| psrlq m1, 32 |
| movd [dstq+r3 ], m1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_loop |
| RET |
| ALIGN function_align |
| .w8: |
| movddup m6, [tlq+1] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w8_loop: |
| sub tlq, 2 |
| movd m3, [tlq] |
| pshufb m3, [base+ipred_paeth_shuf] |
| PAETH 6, 7 |
| movq [dstq ], m1 |
| movhps [dstq+strideq], m1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| movu m6, [tlq+1] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w16_loop: |
| sub tlq, 1 |
| movd m3, [tlq] |
| pxor m1, m1 |
| pshufb m3, m1 |
| PAETH 6, 7 |
| mova [dstq], m1 |
| add dstq, strideq |
| sub hd, 1 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| movu m6, [tlq+1] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| mova [rsp ], m6 |
| mova [rsp+16], m7 |
| movu m6, [tlq+17] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| mova [rsp+32], m6 |
| .w32_loop: |
| dec tlq |
| movd m3, [tlq] |
| pxor m1, m1 |
| pshufb m3, m1 |
| mova m6, [rsp] |
| PAETH 6, [rsp+16] |
| mova [dstq ], m1 |
| mova m6, [rsp+32] |
| PAETH 6, 7 |
| mova [dstq+16], m1 |
| add dstq, strideq |
| dec hd |
| jg .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| movu m6, [tlq+1] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| mova [rsp ], m6 |
| mova [rsp+16], m7 |
| movu m6, [tlq+17] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| mova [rsp+32], m6 |
| mova [rsp+48], m7 |
| movu m6, [tlq+33] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| mova [rsp+64], m6 |
| mova [rsp+80], m7 |
| movu m6, [tlq+49] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| mova [rsp+96], m6 |
| .w64_loop: |
| dec tlq |
| movd m3, [tlq] |
| pxor m1, m1 |
| pshufb m3, m1 |
| mova m6, [rsp] |
| PAETH 6, [rsp+16] |
| mova [dstq ], m1 |
| mova m6, [rsp+32] |
| PAETH 6, [rsp+48] |
| mova [dstq+16], m1 |
| mova m6, [rsp+64] |
| PAETH 6, [rsp+80] |
| mova [dstq+32], m1 |
| mova m6, [rsp+96] |
| PAETH 6, 7 |
| mova [dstq+48], m1 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| |
| |
| %macro FILTER 4 ;dst, src, tmp, shuf |
| %ifnum %4 |
| pshufb m%2, m%4 |
| %else |
| pshufb m%2, %4 |
| %endif |
| pshufd m%1, m%2, q0000 ;p0 p1 |
| pmaddubsw m%1, m2 |
| pshufd m%3, m%2, q1111 ;p2 p3 |
| pmaddubsw m%3, m3 |
| paddw m%1, [base+pw_8] |
| paddw m%1, m%3 |
| pshufd m%3, m%2, q2222 ;p4 p5 |
| pmaddubsw m%3, m4 |
| paddw m%1, m%3 |
| pshufd m%3, m%2, q3333 ;p6 __ |
| pmaddubsw m%3, m5 |
| paddw m%1, m%3 |
| psraw m%1, 4 |
| packuswb m%1, m%1 |
| %endmacro |
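| ; Each FILTER invocation produces a 4x2 output block from the seven |
| ; neighbouring pixels p0..p6 gathered by the shuffle. In scalar terms |
| ; (a sketch of filter intra, not dav1d's exact C code): |
| ; |
| ; out[i] = clip_pixel((8 + sum_{j = 0..6} filter_intra_taps[mode][i][j] * p[j]) >> 4); |
| ; |
| ; The taps are stored as signed byte pairs, so four pmaddubsw against |
| ; m2..m5 accumulate all seven products; pw_8 is the rounding bias, |
| ; psraw by 4 scales, and packuswb clips. |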
| |
| cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter |
| %define base r6-$$ |
| LEA r6, $$ |
| tzcnt wd, wm |
| %ifidn filterd, filterm |
| movzx filterd, filterb |
| %else |
| movzx filterd, byte filterm |
| %endif |
| shl filterd, 6 |
| lea filterq, [base+filter_intra_taps+filterq] |
| movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 |
| movsxd wq, [base+ipred_filter_ssse3_table+wq*4] |
| mova m2, [filterq+16*0] |
| mova m3, [filterq+16*1] |
| mova m4, [filterq+16*2] |
| mova m5, [filterq+16*3] |
| lea wq, [base+ipred_filter_ssse3_table+wq] |
| mov hd, hm |
| jmp wq |
| .w4: |
| mova m1, [base+filter_shuf1] |
| sub tlq, 3 |
| sub tlq, hq |
| jmp .w4_loop_start |
| .w4_loop: |
| movd m0, [tlq+hq] |
| punpckldq m0, m6 |
| lea dstq, [dstq+strideq*2] |
| .w4_loop_start: |
| FILTER 6, 0, 7, 1 |
| movd [dstq+strideq*0], m6 |
| pshuflw m6, m6, q1032 |
| movd [dstq+strideq*1], m6 |
| sub hd, 2 |
| jg .w4_loop |
| RET |
| |
| ALIGN function_align |
| .w8: |
| movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 |
| sub tlq, 5 |
| sub tlq, hq |
| |
| .w8_loop: |
| FILTER 7, 0, 1, [base+filter_shuf1] |
| punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| FILTER 0, 6, 1, [base+filter_shuf2] |
| |
| punpckldq m6, m7, m0 |
| movq [dstq+strideq*0], m6 |
| punpckhqdq m6, m6 |
| movq [dstq+strideq*1], m6 |
| |
| movd m0, [tlq+hq] ;_ 6 5 0 |
| punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 |
| |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| |
| ALIGN function_align |
| .w16: |
| movu m6, [tlq+1] ;top row |
| sub tlq, 5 |
| sub tlq, hq |
| |
| .w16_loop: |
| FILTER 7, 0, 1, [base+filter_shuf1] |
| punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+strideq*0], m7 |
| psrlq m7, 32 |
| palignr m7, m6, 4 |
| |
| FILTER 6, 0, 1, [base+filter_shuf2] |
| punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+4+strideq*0], m6 |
| psrlq m6, 32 |
| palignr m6, m7, 4 |
| |
| FILTER 7, 0, 1, [base+filter_shuf2] |
| punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+8+strideq*0], m7 |
| psrlq m7, 32 |
| palignr m7, m6, 4 |
| |
| FILTER 6, 0, 1, [base+filter_shuf2] |
| movd [dstq+12+strideq*0], m6 |
| psrlq m6, 32 |
| palignr m6, m7, 4 |
| mova [dstq+strideq*1], m6 |
| |
| movd m0, [tlq+hq] ;_ 6 5 0 |
| punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 |
| |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| |
| ALIGN function_align |
| .w32: |
| movu m6, [tlq+1] ;top row |
| lea filterq, [tlq+17] ; taps already loaded, reuse filterq for the right half of the top row |
| sub tlq, 5 |
| sub tlq, hq |
| |
| .w32_loop: |
| FILTER 7, 0, 1, [base+filter_shuf1] |
| punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+strideq*0], m7 |
| psrlq m7, 32 |
| palignr m7, m6, 4 |
| |
| FILTER 6, 0, 1, [base+filter_shuf2] |
| punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+4+strideq*0], m6 |
| psrlq m6, 32 |
| palignr m6, m7, 4 |
| |
| FILTER 7, 0, 1, [base+filter_shuf2] |
| punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+8+strideq*0], m7 |
| psrlq m7, 32 |
| palignr m7, m6, 4 |
| |
| FILTER 6, 0, 1, [base+filter_shuf2] |
| movu m1, [filterq] |
| punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ |
| punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+12+strideq*0], m6 |
| psrlq m6, 32 |
| palignr m6, m7, 4 |
| mova [dstq+strideq*1], m6 |
| |
| mova m6, m1 |
| |
| FILTER 7, 0, 6, [base+filter_shuf2] |
| punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+16+strideq*0], m7 |
| psrlq m7, 32 |
| palignr m7, m1, 4 |
| |
| FILTER 6, 0, 1, [base+filter_shuf2] |
| punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+20+strideq*0], m6 |
| psrlq m6, 32 |
| palignr m6, m7, 4 |
| |
| FILTER 7, 0, 1, [base+filter_shuf2] |
| punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| movd [dstq+24+strideq*0], m7 |
| psrlq m7, 32 |
| palignr m7, m6, 4 |
| |
| FILTER 6, 0, 1, [base+filter_shuf2] |
| movd [dstq+28+strideq*0], m6 |
| psrlq m6, 32 |
| palignr m6, m7, 4 |
| mova [dstq+16+strideq*1], m6 |
| |
| mova m6, [dstq+strideq*1] |
| movd m0, [tlq+hq] ;_ 6 5 0 |
| punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 |
| lea filterq, [dstq+16+strideq*1] |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w32_loop |
| RET |