| ; Copyright © 2018, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "ext/x86/x86inc.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 64 |
| |
| %macro SMOOTH_WEIGHT_TABLE 1-* |
| %rep %0 |
| db %1-128, 127-%1 |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| ; sm_weights[], but modified to precalculate x and 256-x with offsets to |
| ; enable efficient use of pmaddubsw (which requires signed values) |
| smooth_weights: SMOOTH_WEIGHT_TABLE \ |
| 0, 0, 255, 128, 255, 149, 85, 64, \ |
| 255, 197, 146, 105, 73, 50, 37, 32, \ |
| 255, 225, 196, 170, 145, 123, 102, 84, \ |
| 68, 54, 43, 33, 26, 20, 17, 16, \ |
| 255, 240, 225, 210, 196, 182, 169, 157, \ |
| 145, 133, 122, 111, 101, 92, 83, 74, \ |
| 66, 59, 52, 45, 39, 34, 29, 25, \ |
| 21, 17, 14, 12, 10, 9, 8, 8, \ |
| 255, 248, 240, 233, 225, 218, 210, 203, \ |
| 196, 189, 182, 176, 169, 163, 156, 150, \ |
| 144, 138, 133, 127, 121, 116, 111, 106, \ |
| 101, 96, 91, 86, 82, 77, 73, 69, \ |
| 65, 61, 57, 54, 50, 47, 44, 41, \ |
| 38, 35, 32, 29, 27, 25, 22, 20, \ |
| 18, 16, 15, 13, 12, 10, 9, 8, \ |
| 7, 6, 6, 5, 5, 4, 4, 4 |
| |
| pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 |
| pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 |
| pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 |
| z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 |
| db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 |
| z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 |
| db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 |
| db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 |
| z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 |
| db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 |
| db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line |
| pb_128: times 4 db 128 ; those are just placed here for alignment. |
| pb_36_m4: times 2 db 36, -4 |
| z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 |
| z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 |
| z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 |
| z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 |
| z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 |
| z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 |
| z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 |
| z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 |
| z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 |
| z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 |
| z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 |
| z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 |
| dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 |
| z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 |
| dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 |
| z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 |
| db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 |
| ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 |
| filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 |
| db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 |
| filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 |
| filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1 |
| pb_127_m127: times 2 db 127, -127 |
| ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 |
| db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 |
| ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 |
| db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 |
| pw_64: times 2 dw 64 |
| |
| cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 |
| db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| ; w=8, w_pad=1 as well as second half of previous one |
| cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 |
| times 5 db 6, 7 |
| ; w=16,w_pad=2 |
| db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| times 8 db 14, 15 |
| ; w=16,w_pad=3 |
| db 0, 1, 2, 3, 4, 5 |
| times 13 db 6, 7 |
| pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
| |
| %define pb_0to15 cfl_ac_w16_pad_shuffle |
| %define pb_1 (ipred_h_shuf+12) |
| %define pb_2 (ipred_h_shuf+20) |
| %define pb_3 (ipred_h_shuf+ 4) |
| %define pb_4 (ipred_h_shuf+24) |
| %define pb_5 (ipred_h_shuf+ 8) |
| %define pb_7 (ipred_h_shuf+ 0) |
| %define pb_8 (z_upsample2 +12) |
| %define pb_12 (z2_y_shuf_h4+20) |
| %define pb_14 (z2_y_shuf_h4+ 4) |
| %define pb_15 (z_filter_s +32) |
| %define pb_27 (z2_y_shuf_h4+ 8) |
| %define pb_31 (z2_y_shuf_h4+12) |
| %define pb_32 (z2_y_shuf_h4+16) |
| %define pb_90 (z2_y_shuf_h4+ 0) |
| %define pw_1 (z2_y_shuf_h4+24) |
| %define pw_8 (z_filter_k +32) |
| |
| pw_62: times 2 dw 62 |
| pw_128: times 2 dw 128 |
| pw_255: times 2 dw 255 |
| pw_512: times 2 dw 512 |
| |
| %macro JMP_TABLE 3-* |
| %xdefine %1_%2_table (%%table - 2*4) |
| %xdefine %%base mangle(private_prefix %+ _%1_%2) |
| %%table: |
| %rep %0 - 2 |
| dd %%base %+ .%3 - (%%table - 2*4) |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| %define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) |
| %define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) |
| |
| JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 |
| JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ |
| s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 |
| JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 |
| JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 |
| JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ |
| s4-8*4, s8-8*4, s16-8*4, s32-8*4 |
| JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 |
| JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 |
| JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 |
| JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 |
| |
| cextern dr_intra_derivative |
| cextern filter_intra_taps |
| |
| SECTION .text |
| |
| INIT_YMM avx2 |
| cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h |
| lea r5, [ipred_dc_left_avx2_table] |
| tzcnt wd, wm |
| inc tlq |
| movu m0, [tlq] |
| movifnidn hd, hm |
| mov r6d, 0x8000 |
| shrx r6d, r6d, wd |
| movd xm3, r6d |
| movsxd r6, [r5+wq*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, r5 |
| add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| |
| cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| mov hd, hm ; zero upper half |
| tzcnt r6d, hd |
| sub tlq, hq |
| tzcnt wd, wm |
| movu m0, [tlq] |
| mov r5d, 0x8000 |
| shrx r5d, r5d, r6d |
| movd xm3, r5d |
| lea r5, [ipred_dc_left_avx2_table] |
| movsxd r6, [r5+r6*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, r5 |
| add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| .h64: |
| movu m1, [tlq+32] ; unaligned when jumping here from dc_top |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| .h32: |
| vextracti128 xm1, m0, 1 |
| paddw xm0, xm1 |
| .h16: |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| .h8: |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| .h4: |
| pmaddwd xm0, xm2 |
| pmulhrsw xm0, xm3 |
| lea stride3q, [strideq*3] |
| vpbroadcastb m0, xm0 |
| mova m1, m0 |
| jmp wq |
| |
| cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| movifnidn hd, hm |
| movifnidn wd, wm |
| tzcnt r6d, hd |
| lea r5d, [wq+hq] |
| movd xm4, r5d |
| tzcnt r5d, r5d |
| movd xm5, r5d |
| lea r5, [ipred_dc_avx2_table] |
| tzcnt wd, wd |
| movsxd r6, [r5+r6*4] |
| movsxd wq, [r5+wq*4+5*4] |
| pcmpeqd m3, m3 |
| psrlw xm4, 1 |
| add r6, r5 |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp r6 |
| .h4: |
| movd xm0, [tlq-4] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w4: |
| movd xm1, [tlq+1] |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| cmp hd, 4 |
| jg .w4_mul |
| psrlw xm0, 3 |
| jmp .w4_end |
| .w4_mul: |
| punpckhqdq xm1, xm0, xm0 |
| lea r2d, [hq*2] |
| mov r6d, 0x55563334 |
| paddw xm0, xm1 |
| shrx r6d, r6d, r2d |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| movd xm1, r6d |
| psrlw xm0, 2 |
| pmulhuw xm0, xm1 |
| .w4_end: |
| vpbroadcastb xm0, xm0 |
| .s4: |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm0 |
| movd [dstq+strideq*2], xm0 |
| movd [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s4 |
| RET |
| ALIGN function_align |
| .h8: |
| movq xm0, [tlq-8] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w8: |
| movq xm1, [tlq+1] |
| vextracti128 xm2, m0, 1 |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm2 |
| punpckhqdq xm2, xm0, xm0 |
| paddw xm0, xm2 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 8 |
| je .w8_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| cmp hd, 32 |
| cmovz r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w8_end: |
| vpbroadcastb xm0, xm0 |
| .s8: |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm0 |
| movq [dstq+strideq*2], xm0 |
| movq [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s8 |
| RET |
| ALIGN function_align |
| .h16: |
| mova xm0, [tlq-16] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w16: |
| movu xm1, [tlq+1] |
| vextracti128 xm2, m0, 1 |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm2 |
| paddw xm0, xm1 |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 16 |
| je .w16_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hb, 8|32 |
| cmovz r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w16_end: |
| vpbroadcastb xm0, xm0 |
| .s16: |
| mova [dstq+strideq*0], xm0 |
| mova [dstq+strideq*1], xm0 |
| mova [dstq+strideq*2], xm0 |
| mova [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s16 |
| RET |
| ALIGN function_align |
| .h32: |
| mova m0, [tlq-32] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w32: |
| movu m1, [tlq+1] |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| vextracti128 xm1, m0, 1 |
| psubw xm0, xm4 |
| paddw xm0, xm1 |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 32 |
| je .w32_end |
| lea r2d, [hq*2] |
| mov r6d, 0x33345556 |
| shrx r6d, r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w32_end: |
| vpbroadcastb m0, xm0 |
| .s32: |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m0 |
| mova [dstq+strideq*2], m0 |
| mova [dstq+stride3q ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s32 |
| RET |
| ALIGN function_align |
| .h64: |
| mova m0, [tlq-64] |
| mova m1, [tlq-32] |
| pmaddubsw m0, m3 |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| jmp wq |
| .w64: |
| movu m1, [tlq+ 1] |
| movu m2, [tlq+33] |
| pmaddubsw m1, m3 |
| pmaddubsw m2, m3 |
| paddw m0, m1 |
| paddw m0, m2 |
| vextracti128 xm1, m0, 1 |
| psubw xm0, xm4 |
| paddw xm0, xm1 |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 64 |
| je .w64_end |
| mov r6d, 0x33345556 |
| shrx r6d, r6d, hd |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w64_end: |
| vpbroadcastb m0, xm0 |
| mova m1, m0 |
| .s64: |
| mova [dstq+strideq*0+32*0], m0 |
| mova [dstq+strideq*0+32*1], m1 |
| mova [dstq+strideq*1+32*0], m0 |
| mova [dstq+strideq*1+32*1], m1 |
| mova [dstq+strideq*2+32*0], m0 |
| mova [dstq+strideq*2+32*1], m1 |
| mova [dstq+stride3q +32*0], m0 |
| mova [dstq+stride3q +32*1], m1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s64 |
| RET |
| |
| cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_dc_splat_avx2_table] |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] |
| mova m1, m0 |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
| cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_dc_splat_avx2_table] |
| tzcnt wd, wm |
| movu m0, [tlq+ 1] |
| movu m1, [tlq+33] |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
| %macro IPRED_H 2 ; w, store_type |
| vpbroadcastb m0, [tlq-1] |
| vpbroadcastb m1, [tlq-2] |
| vpbroadcastb m2, [tlq-3] |
| sub tlq, 4 |
| vpbroadcastb m3, [tlq+0] |
| mov%2 [dstq+strideq*0], m0 |
| mov%2 [dstq+strideq*1], m1 |
| mov%2 [dstq+strideq*2], m2 |
| mov%2 [dstq+stride3q ], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w%1 |
| RET |
| ALIGN function_align |
| %endmacro |
| |
| INIT_XMM avx2 |
| cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_h_avx2_table] |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4: |
| IPRED_H 4, d |
| .w8: |
| IPRED_H 8, q |
| .w16: |
| IPRED_H 16, a |
| INIT_YMM avx2 |
| .w32: |
| IPRED_H 32, a |
| .w64: |
| vpbroadcastb m0, [tlq-1] |
| vpbroadcastb m1, [tlq-2] |
| vpbroadcastb m2, [tlq-3] |
| sub tlq, 4 |
| vpbroadcastb m3, [tlq+0] |
| mova [dstq+strideq*0+32*0], m0 |
| mova [dstq+strideq*0+32*1], m0 |
| mova [dstq+strideq*1+32*0], m1 |
| mova [dstq+strideq*1+32*1], m1 |
| mova [dstq+strideq*2+32*0], m2 |
| mova [dstq+strideq*2+32*1], m2 |
| mova [dstq+stride3q +32*0], m3 |
| mova [dstq+stride3q +32*1], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w64 |
| RET |
| |
| %macro PAETH 2 ; top, ldiff |
| pavgb m1, m%1, m3 ; Calculating tldiff normally requires |
| pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it |
| pand m0, m4 ; in 8-bit with some tricks which avoids |
| psubusb m2, m5, m1 ; having to unpack everything to 16-bit. |
| psubb m1, m0 |
| psubusb m1, m5 |
| por m1, m2 |
| paddusb m1, m1 |
| por m1, m0 ; min(tldiff, 255) |
| psubusb m2, m5, m3 |
| psubusb m0, m3, m5 |
| por m2, m0 ; tdiff |
| pminub m2, m%2 |
| pcmpeqb m0, m%2, m2 ; ldiff <= tdiff |
| vpblendvb m0, m%1, m3, m0 |
| pminub m1, m2 |
| pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff |
| vpblendvb m0, m5, m0, m1 |
| %endmacro |
| |
| cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h |
| %define base r5-ipred_paeth_avx2_table |
| lea r5, [ipred_paeth_avx2_table] |
| tzcnt wd, wm |
| vpbroadcastb m5, [tlq] ; topleft |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| vpbroadcastd m4, [base+pb_1] |
| add wq, r5 |
| jmp wq |
| .w4: |
| vpbroadcastd m6, [tlq+1] ; top |
| mova m8, [base+ipred_h_shuf] |
| lea r3, [strideq*3] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 ; ldiff |
| .w4_loop: |
| sub tlq, 8 |
| vpbroadcastq m3, [tlq] |
| pshufb m3, m8 ; left |
| PAETH 6, 7 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r3 ], xm1, 2 |
| cmp hd, 4 |
| je .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+r3 ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 8 |
| jg .w4_loop |
| .ret: |
| RET |
| ALIGN function_align |
| .w8: |
| vpbroadcastq m6, [tlq+1] |
| mova m8, [base+ipred_h_shuf] |
| lea r3, [strideq*3] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w8_loop: |
| sub tlq, 4 |
| vpbroadcastd m3, [tlq] |
| pshufb m3, m8 |
| PAETH 6, 7 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+r3 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| vbroadcasti128 m6, [tlq+1] |
| mova xm8, xm4 ; lower half = 1, upper half = 0 |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w16_loop: |
| sub tlq, 2 |
| vpbroadcastd m3, [tlq] |
| pshufb m3, m8 |
| PAETH 6, 7 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| movu m6, [tlq+1] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w32_loop: |
| dec tlq |
| vpbroadcastb m3, [tlq] |
| PAETH 6, 7 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| movu m6, [tlq+ 1] |
| movu m7, [tlq+33] |
| %if WIN64 |
| movaps r4m, xmm9 |
| %endif |
| psubusb m8, m5, m6 |
| psubusb m0, m6, m5 |
| psubusb m9, m5, m7 |
| psubusb m1, m7, m5 |
| por m8, m0 |
| por m9, m1 |
| .w64_loop: |
| dec tlq |
| vpbroadcastb m3, [tlq] |
| PAETH 6, 8 |
| mova [dstq+32*0], m0 |
| PAETH 7, 9 |
| mova [dstq+32*1], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| %if WIN64 |
| movaps xmm9, r4m |
| %endif |
| RET |
| |
| %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] |
| ; w * a = (w - 128) * a + 128 * a |
| ; (256 - w) * b = (127 - w) * b + 129 * b |
| pmaddubsw m0, m%3, m%1 |
| pmaddubsw m1, m%4, m%2 |
| paddw m0, m%5 |
| paddw m1, m%6 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| %endmacro |
| |
| cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights |
| %define base r6-ipred_smooth_v_avx2_table |
| lea r6, [ipred_smooth_v_avx2_table] |
| tzcnt wd, wm |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m0, [base+pb_127_m127] |
| vpbroadcastd m1, [base+pw_128] |
| lea weightsq, [base+smooth_weights+hq*4] |
| neg hq |
| vpbroadcastb m5, [tlq+hq] ; bottom |
| add wq, r6 |
| jmp wq |
| .w4: |
| vpbroadcastd m2, [tlq+1] |
| punpcklbw m2, m5 ; top, bottom |
| mova m5, [base+ipred_v_shuf] |
| lea r3, [strideq*3] |
| punpckldq m4, m5, m5 |
| punpckhdq m5, m5 |
| pmaddubsw m3, m2, m0 |
| paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok |
| paddw m3, m1 ; 128 * top + 129 * bottom + 128 |
| .w4_loop: |
| vbroadcasti128 m1, [weightsq+hq*2] |
| pshufb m0, m1, m4 |
| pshufb m1, m5 |
| SMOOTH 0, 1, 2, 2, 3, 3 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 1 |
| pextrd [dstq+r3 ], xm1, 1 |
| cmp hd, -4 |
| je .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 2 |
| pextrd [dstq+strideq*1], xm1, 2 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+r3 ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| add hq, 8 |
| jl .w4_loop |
| .ret: |
| RET |
| ALIGN function_align |
| .w8: |
| vpbroadcastq m2, [tlq+1] |
| punpcklbw m2, m5 |
| mova m5, [base+ipred_v_shuf] |
| lea r3, [strideq*3] |
| pshufd m4, m5, q0000 |
| pshufd m5, m5, q1111 |
| pmaddubsw m3, m2, m0 |
| paddw m1, m2 |
| paddw m3, m1 |
| .w8_loop: |
| vpbroadcastq m1, [weightsq+hq*2] |
| pshufb m0, m1, m4 |
| pshufb m1, m5 |
| SMOOTH 0, 1, 2, 2, 3, 3 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+r3 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| add hq, 4 |
| jl .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| WIN64_SPILL_XMM 7 |
| vbroadcasti128 m3, [tlq+1] |
| mova m6, [base+ipred_v_shuf] |
| punpcklbw m2, m3, m5 |
| punpckhbw m3, m5 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| .w16_loop: |
| vpbroadcastd m1, [weightsq+hq*2] |
| pshufb m1, m6 |
| SMOOTH 1, 1, 2, 3, 4, 5 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| add hq, 2 |
| jl .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 6 |
| movu m3, [tlq+1] |
| punpcklbw m2, m3, m5 |
| punpckhbw m3, m5 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| .w32_loop: |
| vpbroadcastw m1, [weightsq+hq*2] |
| SMOOTH 1, 1, 2, 3, 4, 5 |
| mova [dstq], m0 |
| add dstq, strideq |
| inc hq |
| jl .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| WIN64_SPILL_XMM 11 |
| movu m4, [tlq+ 1] |
| movu m8, [tlq+33] |
| punpcklbw m3, m4, m5 |
| punpckhbw m4, m5 |
| punpcklbw m7, m8, m5 |
| punpckhbw m8, m5 |
| pmaddubsw m5, m3, m0 |
| pmaddubsw m6, m4, m0 |
| pmaddubsw m9, m7, m0 |
| pmaddubsw m10, m8, m0 |
| paddw m2, m1, m3 |
| paddw m5, m2 |
| paddw m2, m1, m4 |
| paddw m6, m2 |
| paddw m0, m1, m7 |
| paddw m9, m0 |
| paddw m1, m8 |
| paddw m10, m1 |
| .w64_loop: |
| vpbroadcastw m2, [weightsq+hq*2] |
| SMOOTH 2, 2, 3, 4, 5, 6 |
| mova [dstq+32*0], m0 |
| SMOOTH 2, 2, 7, 8, 9, 10 |
| mova [dstq+32*1], m0 |
| add dstq, strideq |
| inc hq |
| jl .w64_loop |
| RET |
| |
| %macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used |
| %assign stack_offset 0 |
| %assign stack_size_padded 0 |
| %assign regs_used %2 |
| %xdefine rstk rsp |
| SETUP_STACK_POINTER %1 |
| %if regs_used != %2 && WIN64 |
| PUSH r%2 |
| %endif |
| ALLOC_STACK %1, %3 |
| %endmacro |
| |
| cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h |
| %define base r6-ipred_smooth_h_avx2_table |
| lea r6, [ipred_smooth_h_avx2_table] |
| mov wd, wm |
| vpbroadcastb m3, [tlq+wq] ; right |
| tzcnt wd, wd |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m4, [base+pb_127_m127] |
| vpbroadcastd m5, [base+pw_128] |
| add wq, r6 |
| jmp wq |
| .w4: |
| WIN64_SPILL_XMM 8 |
| vpbroadcastq m6, [base+smooth_weights+4*2] |
| mova m7, [base+ipred_h_shuf] |
| sub tlq, 8 |
| sub tlq, hq |
| lea r3, [strideq*3] |
| .w4_loop: |
| vpbroadcastq m2, [tlq+hq] |
| pshufb m2, m7 |
| punpcklbw m1, m2, m3 ; left, right |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m6 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| pmaddubsw m2, m6 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r3 ], xm1, 2 |
| cmp hd, 4 |
| je .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+r3 ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 8 |
| jg .w4_loop |
| .ret: |
| RET |
| ALIGN function_align |
| .w8: |
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 8 |
| vbroadcasti128 m6, [base+smooth_weights+8*2] |
| mova m7, [base+ipred_h_shuf] |
| sub tlq, 4 |
| lea r3, [strideq*3] |
| sub tlq, hq |
| .w8_loop: |
| vpbroadcastd m2, [tlq+hq] |
| pshufb m2, m7 |
| punpcklbw m1, m2, m3 |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 |
| paddw m0, m1 |
| pmaddubsw m1, m6 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| pmaddubsw m2, m6 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+r3 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| SETUP_STACK_FRAME 32*4, 7, 8 |
| lea r3, [rsp+64*2-4] |
| call .prep ; only worthwhile for for w16 and above |
| sub tlq, 2 |
| vpbroadcastd xm6, [base+pb_1] |
| mova xm7, [base+ipred_v_shuf+16] |
| vinserti128 m7, [base+ipred_v_shuf+ 0], 1 |
| vbroadcasti128 m4, [base+smooth_weights+16*2] |
| vbroadcasti128 m5, [base+smooth_weights+16*3] |
| .w16_loop: |
| vpbroadcastd m1, [tlq+hq] |
| vpbroadcastd m2, [r3+hq*2] |
| pshufb m1, m6 |
| punpcklbw m1, m3 |
| pshufb m2, m7 |
| SMOOTH 4, 5, 1, 1, 2, 2 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| SETUP_STACK_FRAME 32*4, 7, 6 |
| lea r3, [rsp+64*2-2] |
| call .prep |
| dec tlq |
| mova xm4, [base+smooth_weights+16*4] |
| vinserti128 m4, [base+smooth_weights+16*6], 1 |
| mova xm5, [base+smooth_weights+16*5] |
| vinserti128 m5, [base+smooth_weights+16*7], 1 |
| .w32_loop: |
| vpbroadcastb m1, [tlq+hq] |
| punpcklbw m1, m3 |
| vpbroadcastw m2, [r3+hq*2] |
| SMOOTH 4, 5, 1, 1, 2, 2 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| SETUP_STACK_FRAME 32*4, 7, 9 |
| lea r3, [rsp+64*2-2] |
| call .prep |
| add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table |
| dec tlq |
| mova xm5, [r6-16*7] |
| vinserti128 m5, [r6-16*5], 1 |
| mova xm6, [r6-16*6] |
| vinserti128 m6, [r6-16*4], 1 |
| mova xm7, [r6-16*3] |
| vinserti128 m7, [r6-16*1], 1 |
| mova xm8, [r6-16*2] |
| vinserti128 m8, [r6-16*0], 1 |
| .w64_loop: |
| vpbroadcastb m2, [tlq+hq] |
| punpcklbw m2, m3 |
| vpbroadcastw m4, [r3+hq*2] |
| SMOOTH 5, 6, 2, 2, 4, 4 |
| mova [dstq+32*0], m0 |
| SMOOTH 7, 8, 2, 2, 4, 4 |
| mova [dstq+32*1], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| ALIGN function_align |
| .prep: |
| vpermq m2, [tlq-32*1], q3120 |
| punpckhbw m1, m2, m3 |
| punpcklbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m1, m5 ; 1 * left + 256 * right + 128 |
| paddw m0, m1 ; 128 * left + 129 * right + 128 |
| pmaddubsw m1, m2, m4 |
| paddw m2, m5 |
| paddw m1, m2 |
| vpermq m2, [tlq-32*2], q3120 |
| mova [rsp+gprsize+32*3], m0 |
| mova [rsp+gprsize+32*2], m1 |
| punpckhbw m1, m2, m3 |
| punpcklbw m2, m3 |
| pmaddubsw m0, m1, m4 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m2, m5 |
| paddw m1, m2 |
| mova [rsp+gprsize+32*1], m0 |
| mova [rsp+gprsize+32*0], m1 |
| sub r3, hq |
| sub tlq, hq |
| sub r3, hq |
| ret |
| |
| %macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] |
| pmaddubsw m0, m%3, m%1 |
| pmaddubsw m1, m%4, m%2 |
| %ifnum %5 |
| paddw m0, m%5 |
| %else |
| paddw m0, %5 |
| %endif |
| %ifnum %6 |
| paddw m1, m%6 |
| %else |
| paddw m1, %6 |
| %endif |
| pavgw m0, m2 |
| pavgw m1, m3 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| %endmacro |
| |
| cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights |
| %define base r6-ipred_smooth_avx2_table |
| lea r6, [ipred_smooth_avx2_table] |
| mov wd, wm |
| vpbroadcastb m4, [tlq+wq] ; right |
| tzcnt wd, wd |
| mov hd, hm |
| mov r5, tlq |
| sub r5, hq |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m5, [base+pb_127_m127] |
| vpbroadcastb m0, [r5] ; bottom |
| vpbroadcastd m3, [base+pw_255] |
| add wq, r6 |
| lea v_weightsq, [base+smooth_weights+hq*2] |
| jmp wq |
| .w4: |
| WIN64_SPILL_XMM 12 |
| mova m10, [base+ipred_h_shuf] |
| vpbroadcastq m11, [base+smooth_weights+4*2] |
| mova m7, [base+ipred_v_shuf] |
| vpbroadcastd m8, [tlq+1] |
| sub tlq, 8 |
| lea r3, [strideq*3] |
| sub tlq, hq |
| punpcklbw m8, m0 ; top, bottom |
| pshufd m6, m7, q2200 |
| pshufd m7, m7, q3311 |
| pmaddubsw m9, m8, m5 |
| paddw m3, m8 ; 1 * top + 255 * bottom + 255 |
| paddw m9, m3 ; 128 * top + 129 * bottom + 255 |
| .w4_loop: |
| vpbroadcastq m1, [tlq+hq] |
| pshufb m1, m10 |
| punpcklbw m0, m1, m4 ; left, right |
| punpckhbw m1, m4 |
| pmaddubsw m2, m0, m5 ; 127 * left - 127 * right |
| pmaddubsw m3, m1, m5 |
| paddw m2, m0 ; 128 * left + 129 * right |
| paddw m3, m1 |
| pmaddubsw m0, m11 |
| pmaddubsw m1, m11 |
| paddw m2, m0 |
| paddw m3, m1 |
| vbroadcasti128 m1, [v_weightsq] |
| add v_weightsq, 16 |
| pshufb m0, m1, m6 |
| pshufb m1, m7 |
| SMOOTH_2D_END 0, 1, 8, 8, 9, 9 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r3 ], xm1, 2 |
| cmp hd, 4 |
| je .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+r3 ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 8 |
| jg .w4_loop |
| .ret: |
| RET |
| ALIGN function_align |
| .w8: |
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 12 |
| mova m10, [base+ipred_h_shuf] |
| vbroadcasti128 m11, [base+smooth_weights+8*2] |
| mova m7, [base+ipred_v_shuf] |
| vpbroadcastq m8, [tlq+1] |
| sub tlq, 4 |
| lea r3, [strideq*3] |
| sub tlq, hq |
| punpcklbw m8, m0 |
| pshufd m6, m7, q0000 |
| pshufd m7, m7, q1111 |
| pmaddubsw m9, m8, m5 |
| paddw m3, m8 |
| paddw m9, m3 |
| .w8_loop: |
| vpbroadcastd m1, [tlq+hq] |
| pshufb m1, m10 |
| punpcklbw m0, m1, m4 |
| punpckhbw m1, m4 |
| pmaddubsw m2, m0, m5 |
| pmaddubsw m3, m1, m5 |
| paddw m2, m0 |
| paddw m3, m1 |
| pmaddubsw m0, m11 |
| pmaddubsw m1, m11 |
| paddw m2, m0 |
| paddw m3, m1 |
| vpbroadcastq m1, [v_weightsq] |
| add v_weightsq, 8 |
| pshufb m0, m1, m6 |
| pshufb m1, m7 |
| SMOOTH_2D_END 0, 1, 8, 8, 9, 9 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+r3 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| SETUP_STACK_FRAME 32*4, 7, 14 |
| vbroadcasti128 m11, [tlq+1] |
| lea r3, [rsp+64*2-4] |
| punpcklbw m10, m11, m0 ; top, bottom |
| punpckhbw m11, m0 |
| call .prep_v |
| sub tlq, 2 |
| pmaddubsw m12, m10, m5 |
| pmaddubsw m13, m11, m5 |
| vpbroadcastd xm5, [base+pb_1] |
| mova m9, [base+ipred_v_shuf] |
| vbroadcasti128 m6, [base+smooth_weights+16*2] |
| vbroadcasti128 m7, [base+smooth_weights+16*3] |
| vpermq m8, m9, q1032 |
| paddw m0, m10, m3 |
| paddw m3, m11 |
| paddw m12, m0 |
| paddw m13, m3 |
| .w16_loop: |
| vpbroadcastd m3, [tlq+hq] |
| vpbroadcastd m0, [r3+hq*2] |
| vpbroadcastd m1, [v_weightsq] |
| add v_weightsq, 4 |
| pshufb m3, m5 |
| punpcklbw m3, m4 ; left, right |
| pmaddubsw m2, m3, m6 |
| pmaddubsw m3, m7 |
| pshufb m0, m8 |
| pshufb m1, m9 |
| paddw m2, m0 |
| paddw m3, m0 |
| SMOOTH_2D_END 1, 1, 10, 11, 12, 13 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| SETUP_STACK_FRAME 32*4, 7, 11 |
| movu m8, [tlq+1] |
| lea r3, [rsp+64*2-2] |
| punpcklbw m7, m8, m0 |
| punpckhbw m8, m0 |
| call .prep_v |
| dec tlq |
| pmaddubsw m9, m7, m5 |
| pmaddubsw m10, m8, m5 |
| mova xm5, [base+smooth_weights+16*4] |
| vinserti128 m5, [base+smooth_weights+16*6], 1 |
| mova xm6, [base+smooth_weights+16*5] |
| vinserti128 m6, [base+smooth_weights+16*7], 1 |
| paddw m0, m7, m3 |
| paddw m3, m8 |
| paddw m9, m0 |
| paddw m10, m3 |
| .w32_loop: |
| vpbroadcastb m3, [tlq+hq] |
| punpcklbw m3, m4 |
| vpbroadcastw m0, [r3+hq*2] |
| vpbroadcastw m1, [v_weightsq] |
| add v_weightsq, 2 |
| pmaddubsw m2, m3, m5 |
| pmaddubsw m3, m6 |
| paddw m2, m0 |
| paddw m3, m0 |
| SMOOTH_2D_END 1, 1, 7, 8, 9, 10 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| SETUP_STACK_FRAME 32*8, 7, 16 |
| movu m13, [tlq+1 ] |
| movu m15, [tlq+33] |
| add r6, smooth_weights+16*15-ipred_smooth_avx2_table |
| lea r3, [rsp+64*2-2] |
| punpcklbw m12, m13, m0 |
| punpckhbw m13, m0 |
| punpcklbw m14, m15, m0 |
| punpckhbw m15, m0 |
| call .prep_v |
| dec tlq |
| pmaddubsw m0, m12, m5 |
| pmaddubsw m1, m13, m5 |
| pmaddubsw m2, m14, m5 |
| pmaddubsw m5, m15, m5 |
| mova xm8, [r6-16*7] |
| vinserti128 m8, [r6-16*5], 1 |
| mova xm9, [r6-16*6] |
| vinserti128 m9, [r6-16*4], 1 |
| mova xm10, [r6-16*3] |
| vinserti128 m10, [r6-16*1], 1 |
| mova xm11, [r6-16*2] |
| vinserti128 m11, [r6-16*0], 1 |
| lea r6, [rsp+32*4] |
| paddw m0, m3 |
| paddw m1, m3 |
| paddw m2, m3 |
| paddw m3, m5 |
| paddw m0, m12 |
| paddw m1, m13 |
| paddw m2, m14 |
| paddw m3, m15 |
| mova [r6+32*0], m0 |
| mova [r6+32*1], m1 |
| mova [r6+32*2], m2 |
| mova [r6+32*3], m3 |
| .w64_loop: |
| vpbroadcastb m5, [tlq+hq] |
| punpcklbw m5, m4 |
| vpbroadcastw m6, [r3+hq*2] |
| vpbroadcastw m7, [v_weightsq] |
| add v_weightsq, 2 |
| pmaddubsw m2, m5, m8 |
| pmaddubsw m3, m5, m9 |
| paddw m2, m6 |
| paddw m3, m6 |
| SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1] |
| mova [dstq+32*0], m0 |
| pmaddubsw m2, m5, m10 |
| pmaddubsw m3, m5, m11 |
| paddw m2, m6 |
| paddw m3, m6 |
| SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3] |
| mova [dstq+32*1], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| ALIGN function_align |
| .prep_v: |
| vpermq m2, [tlq-32*1], q3120 |
| punpckhbw m1, m2, m4 |
| punpcklbw m2, m4 |
| pmaddubsw m0, m1, m5 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m2, m5 |
| paddw m1, m2 |
| vpermq m2, [tlq-32*2], q3120 |
| mova [rsp+gprsize+32*3], m0 |
| mova [rsp+gprsize+32*2], m1 |
| punpckhbw m1, m2, m4 |
| punpcklbw m2, m4 |
| pmaddubsw m0, m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m5 |
| paddw m1, m2 |
| mova [rsp+gprsize+32*1], m0 |
| mova [rsp+gprsize+32*0], m1 |
| sub r3, hq |
| sub tlq, hq |
| sub r3, hq |
| ret |
| |
| cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase |
| %assign org_stack_offset stack_offset |
| lea r6, [ipred_z1_avx2_table] |
| tzcnt wd, wm |
| movifnidn angled, anglem |
| movifnidn hd, hm |
| lea r7, [dr_intra_derivative] |
| inc tlq |
| movsxd wq, [r6+wq*4] |
| add wq, r6 |
| mov dxd, angled |
| and dxd, 0x7e |
| add angled, 165 ; ~90 |
| movzx dxd, word [r7+dxq] |
| xor angled, 0x4ff ; d = 90 - angle |
| vpbroadcastd m3, [pw_512] |
| vpbroadcastd m4, [pw_62] |
| vpbroadcastd m5, [pw_64] |
| jmp wq |
| .w4: |
| cmp angleb, 40 |
| jae .w4_no_upsample |
| lea r3d, [angleq-1024] |
| sar r3d, 7 |
| add r3d, hd |
| jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) |
| ALLOC_STACK -32, 8 |
| mova xm1, [tlq-1] |
| pshufb xm0, xm1, [z_upsample1] |
| pshufb xm1, [z_upsample2] |
| vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse |
| add dxd, dxd ; pw_512 (which is already in m3) |
| pmaddubsw xm0, xm2 ; for rounding instead of pw_2048 |
| pextrd [rsp+16], xm1, 3 ; top[max_base_x] |
| pmaddubsw xm1, xm2 |
| movd xm7, dxd |
| mov r3d, dxd ; xpos |
| vpbroadcastw m7, xm7 |
| paddw xm1, xm0 |
| movq xm0, [tlq] |
| pmulhrsw xm1, xm3 |
| pslldq m6, m7, 8 |
| paddw xm2, xm7, xm7 |
| lea r2, [strideq*3] |
| paddw m6, m7 |
| packuswb xm1, xm1 |
| paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1 |
| punpcklbw xm0, xm1 |
| psllw m7, 2 |
| mova [rsp], xm0 |
| .w4_upsample_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| vpbroadcastq m1, [rsp+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vpbroadcastq m2, [rsp+r5] |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base2 |
| movq xm0, [rsp+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base3 |
| movhps xm0, [rsp+r5] |
| vpblendd m1, m2, 0xc0 |
| pand m2, m4, m6 ; frac |
| vpblendd m0, m1, 0xf0 |
| psubw m1, m5, m2 ; 64-frac |
| psllw m2, 8 |
| por m1, m2 ; 64-frac, frac |
| pmaddubsw m0, m1 |
| paddw m6, m7 ; xpos += dx |
| pmulhrsw m0, m3 |
| packuswb m0, m0 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*2], xm0 |
| pextrd [dstq+r2 ], xm0, 1 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_upsample_loop |
| RET |
| ALIGN function_align |
| .filter_strength: ; w4/w8/w16 |
| ; The C version uses a lot of branches, but we can do all the comparisons |
| ; in parallel and use popcnt to get the final filter strength value. |
| %define base r3-z_filter_t0 |
| lea r3, [z_filter_t0] |
| movd xm0, maxbased |
| movd xm2, angled |
| shr angled, 8 ; is_sm << 1 |
| vpbroadcastb m0, xm0 |
| vpbroadcastb m2, xm2 |
| pcmpeqb m1, m0, [base+z_filter_wh] |
| pand m1, m2 |
| mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases |
| pcmpgtb m1, m2 |
| pmovmskb r5d, m1 |
| popcnt r5d, r5d ; sets ZF which can be used by caller |
| ret |
| .w4_no_upsample: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -16, 11 |
| mov maxbased, 7 |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w4_main |
| lea maxbased, [hq+3] |
| call .filter_strength |
| mov maxbased, 7 |
| jz .w4_main ; filter_strength == 0 |
| vpbroadcastd m7, [base+pb_8] |
| vbroadcasti128 m2, [tlq-1] |
| pminub m1, m7, [base+z_filter_s] |
| vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] |
| pminub m7, [base+z_filter_s+8] |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] |
| vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] |
| pshufb m0, m2, m1 |
| shufps m1, m7, q2121 |
| pmaddubsw m0, m8 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m9 |
| pshufb m2, m7 |
| pmaddubsw m2, m10 |
| paddw m0, m1 |
| paddw m0, m2 |
| pmulhrsw m0, m3 |
| mov r3d, 9 |
| mov tlq, rsp |
| cmp hd, 4 |
| cmova maxbased, r3d |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| mova [tlq], xm0 |
| .w4_main: |
| movd xm6, dxd |
| vpbroadcastq m0, [z_base_inc] ; base_inc << 6 |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| vpbroadcastw m6, xm6 |
| mov r3d, dxd ; xpos |
| movd xm9, maxbased |
| vpbroadcastw m9, xm9 |
| vbroadcasti128 m8, [z1_shuf_w4] |
| psrlw m7, 8 ; top[max_base_x] |
| paddw m10, m6, m6 |
| psubw m9, m0 ; max_base_x |
| vpblendd m6, m10, 0xcc |
| mova xm0, xm10 |
| paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1 |
| paddw m10, m10 |
| .w4_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| vpbroadcastq m1, [tlq+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vpbroadcastq m2, [tlq+r5] |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base2 |
| movq xm0, [tlq+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base3 |
| movhps xm0, [tlq+r5] |
| vpblendd m1, m2, 0xc0 |
| pand m2, m4, m6 ; frac |
| vpblendd m0, m1, 0xf0 |
| psubw m1, m5, m2 ; 64-frac |
| psllw m2, 8 |
| pshufb m0, m8 |
| por m1, m2 ; 64-frac, frac |
| pmaddubsw m0, m1 |
| pcmpgtw m1, m9, m6 ; base < max_base_x |
| pmulhrsw m0, m3 |
| paddw m6, m10 ; xpos += dx |
| lea r5, [dstq+strideq*2] |
| vpblendvb m0, m7, m0, m1 |
| packuswb m0, m0 |
| vextracti128 xm1, m0, 1 |
| movd [r5 +strideq*0], xm0 |
| pextrd [r5 +strideq*1], xm0, 1 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| sub hd, 4 |
| jz .w4_end |
| lea dstq, [dstq+strideq*4] |
| cmp r3d, maxbased |
| jb .w4_loop |
| packuswb xm7, xm7 |
| lea r6, [strideq*3] |
| .w4_end_loop: |
| movd [dstq+strideq*0], xm7 |
| movd [dstq+strideq*1], xm7 |
| movd [dstq+strideq*2], xm7 |
| movd [dstq+r6 ], xm7 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_end_loop |
| .w4_end: |
| RET |
| ALIGN function_align |
| .w8: |
| lea r3d, [angleq+216] |
| mov r3b, hb |
| cmp r3d, 8 |
| ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -32, 8 |
| movu xm2, [z_filter_s+6] |
| mova xm0, [tlq-1] |
| movd xm6, hd |
| vinserti128 m0, [tlq+7], 1 |
| vpbroadcastb xm6, xm6 |
| vbroadcasti128 m1, [z_upsample1] |
| pminub xm6, xm2 |
| vpbroadcastd m7, [pb_36_m4] |
| vinserti128 m2, xm6, 1 |
| add dxd, dxd |
| pshufb m1, m0, m1 |
| pshufb m2, m0, m2 |
| movd xm6, dxd |
| pmaddubsw m1, m7 |
| pmaddubsw m2, m7 |
| vpbroadcastw m6, xm6 |
| mov r3d, dxd |
| psrldq m0, 1 |
| lea r2, [strideq*3] |
| paddw m7, m6, m6 |
| paddw m1, m2 |
| vpblendd m6, m7, 0xf0 |
| pmulhrsw m1, m3 |
| pslldq m2, m7, 8 |
| paddw m7, m7 |
| paddw m6, m2 |
| packuswb m1, m1 |
| punpcklbw m0, m1 |
| mova [rsp], m0 |
| .w8_upsample_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| movu xm0, [rsp+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vinserti128 m0, [rsp+r5], 1 |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base2 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| punpcklqdq m1, m2, m2 ; frac0 frac1 |
| pmaddubsw m0, m1 |
| movu xm1, [rsp+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base3 |
| vinserti128 m1, [rsp+r5], 1 |
| punpckhqdq m2, m2 ; frac2 frac3 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| paddw m6, m7 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*2], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+r2 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_upsample_loop |
| RET |
| .w8_no_intra_edge_filter: |
| and maxbased, 7 |
| or maxbased, 8 ; imin(h+7, 15) |
| jmp .w8_main |
| .w8_no_upsample: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -32, 10 |
| lea maxbased, [hq+7] |
| test angled, 0x400 |
| jnz .w8_no_intra_edge_filter |
| call .filter_strength |
| jz .w8_main ; filter_strength == 0 |
| movu xm2, [tlq] |
| pminub xm1, xm0, [base+z_filter_s+14] |
| vinserti128 m2, [tlq-1], 1 |
| vinserti128 m1, [base+z_filter_s+ 0], 1 |
| vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] |
| pminub xm0, [base+z_filter_s+22] |
| vinserti128 m0, [base+z_filter_s+ 8], 1 |
| pshufb m6, m2, m1 |
| pmaddubsw m6, m7 |
| vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] |
| movzx r3d, byte [tlq+15] |
| shufps m1, m0, q2121 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m7 |
| paddw m1, m6 |
| sub r5d, 3 |
| jnz .w8_3tap |
| ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one, |
| ; which also results in an awkward edge case where out[w*2] is |
| ; slightly different from out[max_base_x] when h > w. |
| vpbroadcastd m7, [z_filter_k+4*8] |
| movzx r2d, byte [tlq+14] |
| pshufb m2, m0 |
| pmaddubsw m2, m7 |
| sub r2d, r3d |
| lea r2d, [r2+r3*8+4] |
| shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 |
| mov [rsp+16], r2b |
| paddw m1, m2 |
| .w8_3tap: |
| pmulhrsw m1, m3 |
| sar r5d, 1 |
| mov tlq, rsp |
| add r5d, 17 ; w*2 + (filter_strength == 3) |
| cmp hd, 8 |
| cmova maxbased, r5d |
| mov [tlq+r5], r3b |
| vextracti128 xm0, m1, 1 |
| packuswb xm0, xm1 |
| mova [tlq], xm0 |
| .w8_main: |
| movd xm2, dxd |
| vbroadcasti128 m0, [z_base_inc] |
| vpbroadcastw m2, xm2 |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| movd xm9, maxbased |
| vbroadcasti128 m8, [z_filter_s+2] |
| vpbroadcastw m9, xm9 |
| psrlw m7, 8 |
| psubw m9, m0 |
| mov r3d, dxd |
| paddw m6, m2, m2 |
| vpblendd m2, m6, 0xf0 |
| .w8_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 |
| pand m0, m4, m2 |
| psubw m1, m5, m0 |
| psllw m0, 8 |
| por m1, m0 |
| movu xm0, [tlq+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vinserti128 m0, [tlq+r5], 1 |
| pshufb m0, m8 |
| pmaddubsw m0, m1 |
| pcmpgtw m1, m9, m2 |
| paddw m2, m6 |
| pmulhrsw m0, m3 |
| vpblendvb m0, m7, m0, m1 |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| sub hd, 2 |
| jz .w8_end |
| lea dstq, [dstq+strideq*2] |
| cmp r3d, maxbased |
| jb .w8_loop |
| packuswb xm7, xm7 |
| .w8_end_loop: |
| movq [dstq+strideq*0], xm7 |
| movq [dstq+strideq*1], xm7 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_end_loop |
| .w8_end: |
| RET |
| .w16_no_intra_edge_filter: |
| and maxbased, 15 |
| or maxbased, 16 ; imin(h+15, 31) |
| jmp .w16_main |
| ALIGN function_align |
| .w16: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -64, 12 |
| lea maxbased, [hq+15] |
| test angled, 0x400 |
| jnz .w16_no_intra_edge_filter |
| call .filter_strength |
| jz .w16_main ; filter_strength == 0 |
| vpbroadcastd m1, [base+pb_12] |
| vbroadcasti128 m6, [base+z_filter_s+8] |
| vinserti128 m2, m6, [base+z_filter_s], 0 |
| vinserti128 m6, [base+z_filter_s+16], 1 |
| mova xm10, [tlq-1] |
| vinserti128 m10, [tlq+3], 1 |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] |
| vbroadcasti128 m7, [base+z_filter_s+14] |
| vinserti128 m8, m7, [base+z_filter_s+6], 0 |
| vinserti128 m7, [base+z_filter_s+22], 1 |
| psubw m0, m1 |
| movu xm11, [tlq+12] |
| vinserti128 m11, [tlq+16], 1 |
| pminub m8, m0 |
| pminub m7, m0 |
| pshufb m0, m10, m2 |
| shufps m2, m6, q2121 |
| pmaddubsw m0, m9 |
| pshufb m1, m11, m8 |
| shufps m8, m7, q2121 |
| pmaddubsw m1, m9 |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] |
| movzx r3d, byte [tlq+31] |
| pshufb m2, m10, m2 |
| pmaddubsw m2, m9 |
| pshufb m8, m11, m8 |
| pmaddubsw m8, m9 |
| paddw m0, m2 |
| paddw m1, m8 |
| sub r5d, 3 |
| jnz .w16_3tap |
| vpbroadcastd m9, [z_filter_k+4*8] |
| movzx r2d, byte [tlq+30] |
| pshufb m10, m6 |
| pmaddubsw m10, m9 |
| pshufb m11, m7 |
| pmaddubsw m11, m9 |
| sub r2d, r3d |
| lea r2d, [r2+r3*8+4] |
| shr r2d, 3 |
| mov [rsp+32], r2b |
| paddw m0, m10 |
| paddw m1, m11 |
| .w16_3tap: |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| sar r5d, 1 |
| mov tlq, rsp |
| add r5d, 33 |
| cmp hd, 16 |
| cmova maxbased, r5d |
| mov [tlq+r5], r3b |
| packuswb m0, m1 |
| vpermq m0, m0, q3120 |
| mova [tlq], m0 |
| .w16_main: |
| movd xm6, dxd |
| vbroadcasti128 m0, [z_base_inc] |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| vpbroadcastw m6, xm6 |
| movd xm9, maxbased |
| vbroadcasti128 m8, [z_filter_s+2] |
| vpbroadcastw m9, xm9 |
| mov r3d, dxd |
| psubw m9, m0 |
| paddw m11, m6, m6 |
| psubw m10, m9, m3 ; 64*8 |
| vpblendd m6, m11, 0xf0 |
| .w16_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| movu xm0, [tlq+r3+0] |
| movu xm1, [tlq+r3+8] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vinserti128 m0, [tlq+r5+0], 1 |
| vinserti128 m1, [tlq+r5+8], 1 |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| pcmpgtw m1, m9, m6 |
| pcmpgtw m2, m10, m6 |
| packsswb m1, m2 |
| paddw m6, m11 |
| vpblendvb m0, m7, m0, m1 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| sub hd, 2 |
| jz .w16_end |
| lea dstq, [dstq+strideq*2] |
| cmp r3d, maxbased |
| jb .w16_loop |
| .w16_end_loop: |
| mova [dstq+strideq*0], xm7 |
| mova [dstq+strideq*1], xm7 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_end_loop |
| .w16_end: |
| RET |
| ALIGN function_align |
| .w32: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -96, 15 |
| lea r3d, [hq+31] |
| mov maxbased, 63 |
| cmp hd, 32 |
| cmovb maxbased, r3d |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w32_main |
| vbroadcasti128 m0, [pb_0to15] |
| sub r3d, 29 ; h+2 |
| movu xm13, [tlq+29] ; 32-39 |
| movd xm1, r3d |
| movu xm14, [tlq+37] ; 40-47 |
| sub r3d, 8 ; h-6 |
| vinserti128 m14, [tlq+51], 1 ; 56-63 |
| vpbroadcastb xm1, xm1 |
| mova xm11, [tlq- 1] ; 0- 7 |
| vinserti128 m11, [tlq+13], 1 ; 16-23 |
| movd xm2, r3d |
| movu xm12, [tlq+ 5] ; 8-15 |
| vinserti128 m12, [tlq+19], 1 ; 24-31 |
| pminub xm1, xm0 ; clip 32x8 |
| mova m7, [z_filter_s+0] |
| pshufb xm13, xm1 |
| vpbroadcastd m1, [pb_12] |
| vpbroadcastb xm2, xm2 |
| vinserti128 m13, [tlq+43], 1 ; 48-55 |
| vinserti128 m8, m7, [z_filter_s+4], 1 |
| vpblendd m2, m1, 0xf0 |
| vinserti128 m7, [z_filter_s+12], 0 |
| pminub m2, m0 ; clip 32x16 and 32x(32|64) |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| pshufb m14, m2 |
| pshufb m0, m11, m8 |
| shufps m8, m7, q1021 |
| pmaddubsw m0, m9 |
| pshufb m2, m12, m8 |
| pmaddubsw m2, m9 |
| pshufb m1, m13, m8 |
| pmaddubsw m1, m9 |
| pshufb m6, m14, m8 |
| pmaddubsw m6, m9 |
| vpbroadcastd m9, [z_filter_k+4*2+12*1] |
| pshufb m10, m11, m8 |
| shufps m8, m7, q2121 |
| pmaddubsw m10, m9 |
| paddw m0, m10 |
| pshufb m10, m12, m8 |
| pmaddubsw m10, m9 |
| paddw m2, m10 |
| pshufb m10, m13, m8 |
| pmaddubsw m10, m9 |
| paddw m1, m10 |
| pshufb m10, m14, m8 |
| pmaddubsw m10, m9 |
| paddw m6, m10 |
| vpbroadcastd m9, [z_filter_k+4*2+12*2] |
| pshufb m11, m8 |
| pmaddubsw m11, m9 |
| pshufb m12, m7 |
| pmaddubsw m12, m9 |
| movzx r3d, byte [tlq+63] |
| movzx r2d, byte [tlq+62] |
| paddw m0, m11 |
| paddw m2, m12 |
| pshufb m13, m7 |
| pmaddubsw m13, m9 |
| pshufb m14, m7 |
| pmaddubsw m14, m9 |
| paddw m1, m13 |
| paddw m6, m14 |
| sub r2d, r3d |
| lea r2d, [r2+r3*8+4] ; edge case for 32x64 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| shr r2d, 3 |
| mov [rsp+64], r2b |
| mov tlq, rsp |
| mov [tlq+65], r3b |
| mov r3d, 65 |
| cmp hd, 32 |
| cmova maxbased, r3d |
| packuswb m0, m2 |
| packuswb m1, m6 |
| mova [tlq+ 0], m0 |
| mova [tlq+32], m1 |
| .w32_main: |
| movd xm6, dxd |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| vpbroadcastw m6, xm6 |
| movd xm9, maxbased |
| vbroadcasti128 m8, [z_filter_s+2] |
| vpbroadcastw m9, xm9 |
| mov r5d, dxd |
| psubw m9, [z_base_inc] |
| mova m11, m6 |
| psubw m10, m9, m3 ; 64*8 |
| .w32_loop: |
| mov r3d, r5d |
| shr r3d, 6 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| movu m0, [tlq+r3+0] |
| movu m1, [tlq+r3+8] |
| add r5d, dxd |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| pcmpgtw m1, m9, m6 |
| pcmpgtw m2, m10, m6 |
| packsswb m1, m2 |
| paddw m6, m11 |
| vpblendvb m0, m7, m0, m1 |
| mova [dstq], m0 |
| dec hd |
| jz .w32_end |
| add dstq, strideq |
| cmp r5d, maxbased |
| jb .w32_loop |
| test hb, 1 |
| jz .w32_end_loop |
| mova [dstq], m7 |
| add dstq, strideq |
| dec hd |
| jz .w32_end |
| .w32_end_loop: |
| mova [dstq+strideq*0], m7 |
| mova [dstq+strideq*1], m7 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w32_end_loop |
| .w32_end: |
| RET |
| ALIGN function_align |
| .w64: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -128, 16 |
| lea maxbased, [hq+63] |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w64_main |
| mova xm11, [tlq- 1] ; 0- 7 |
| vinserti128 m11, [tlq+13], 1 ; 16-23 |
| movu xm12, [tlq+ 5] ; 8-15 |
| vinserti128 m12, [tlq+19], 1 ; 24-31 |
| mova m7, [z_filter_s+0] |
| vinserti128 m8, m7, [z_filter_s+4], 1 |
| vinserti128 m7, [z_filter_s+12], 0 |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| movu xm13, [tlq+29] ; 32-39 |
| vinserti128 m13, [tlq+43], 1 ; 48-55 |
| movu xm14, [tlq+37] ; 40-47 |
| vinserti128 m14, [tlq+51], 1 ; 56-63 |
| pshufb m0, m11, m8 |
| shufps m8, m7, q1021 |
| pmaddubsw m0, m9 |
| pshufb m2, m12, m8 |
| pmaddubsw m2, m9 |
| pshufb m1, m13, m8 |
| pmaddubsw m1, m9 |
| pshufb m6, m14, m8 |
| pmaddubsw m6, m9 |
| vpbroadcastd m9, [z_filter_k+4*2+12*1] |
| pshufb m10, m11, m8 |
| shufps m15, m8, m7, q2121 |
| pmaddubsw m10, m9 |
| paddw m0, m10 |
| pshufb m10, m12, m15 |
| pmaddubsw m10, m9 |
| paddw m2, m10 |
| pshufb m10, m13, m15 |
| pmaddubsw m10, m9 |
| paddw m1, m10 |
| pshufb m10, m14, m15 |
| pmaddubsw m10, m9 |
| paddw m6, m10 |
| vpbroadcastd m10, [z_filter_k+4*2+12*2] |
| pshufb m11, m15 |
| pmaddubsw m11, m10 |
| pshufb m12, m7 |
| pmaddubsw m12, m10 |
| pshufb m13, m7 |
| pmaddubsw m13, m10 |
| pshufb m14, m7 |
| pmaddubsw m14, m10 |
| paddw m0, m11 |
| paddw m2, m12 |
| paddw m1, m13 |
| paddw m6, m14 |
| movu xm11, [tlq+ 61] ; 64- 71 |
| vinserti128 m11, [tlq+ 75], 1 ; 80- 87 |
| movu xm12, [tlq+ 69] ; 72- 79 |
| vinserti128 m12, [tlq+ 83], 1 ; 88- 95 |
| movu xm13, [tlq+ 93] ; 96-103 |
| vinserti128 m13, [tlq+107], 1 ; 112-119 |
| movu xm14, [tlq+101] ; 104-111 |
| vinserti128 m14, [tlq+115], 1 ; 120-127 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| lea r3d, [hq-20] |
| mov tlq, rsp |
| packuswb m0, m2 |
| packuswb m1, m6 |
| vpbroadcastd xm2, [pb_14] |
| vbroadcasti128 m6, [pb_0to15] |
| mova [tlq+32*0], m0 |
| mova [tlq+32*1], m1 |
| movd xm0, r3d |
| vpbroadcastd m1, [pb_12] |
| vpbroadcastb m0, xm0 |
| paddb m0, m2 |
| pminub m0, m6 ; clip 64x16 and 64x32 |
| pshufb m12, m0 |
| pminub m1, m6 ; clip 64x64 |
| pshufb m14, m1 |
| pshufb m0, m11, m7 |
| pmaddubsw m0, m10 |
| pshufb m2, m12, m7 |
| pmaddubsw m2, m10 |
| pshufb m1, m13, m7 |
| pmaddubsw m1, m10 |
| pshufb m6, m14, m7 |
| pmaddubsw m6, m10 |
| pshufb m7, m11, m15 |
| pmaddubsw m7, m9 |
| pshufb m10, m12, m15 |
| pmaddubsw m10, m9 |
| paddw m0, m7 |
| pshufb m7, m13, m15 |
| pmaddubsw m7, m9 |
| paddw m2, m10 |
| pshufb m10, m14, m15 |
| pmaddubsw m10, m9 |
| paddw m1, m7 |
| paddw m6, m10 |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| pshufb m11, m8 |
| pmaddubsw m11, m9 |
| pshufb m12, m8 |
| pmaddubsw m12, m9 |
| pshufb m13, m8 |
| pmaddubsw m13, m9 |
| pshufb m14, m8 |
| pmaddubsw m14, m9 |
| paddw m0, m11 |
| paddw m2, m12 |
| paddw m1, m13 |
| paddw m6, m14 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| packuswb m0, m2 |
| packuswb m1, m6 |
| mova [tlq+32*2], m0 |
| mova [tlq+32*3], m1 |
| .w64_main: |
| movd xm12, dxd |
| vpbroadcastb m7, [tlq+maxbaseq] |
| lea r3d, [dxq-64] |
| shl maxbased, 6 |
| vpbroadcastw m12, xm12 |
| sub r3d, maxbased |
| vbroadcasti128 m8, [z_filter_s+2] |
| movd xm6, r3d |
| mov r5d, dxd |
| mova m10, [pb_1to32] |
| vpbroadcastd m11, [pb_32] |
| vpbroadcastw m6, xm6 |
| .w64_loop: |
| mov r3d, r5d |
| shr r3d, 6 |
| movu m0, [tlq+r3+ 0] |
| movu m1, [tlq+r3+ 8] |
| pand m2, m4, m6 |
| psubw m9, m5, m2 |
| psllw m2, 8 |
| por m9, m2 |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m9 |
| pmaddubsw m1, m9 |
| psraw m2, m6, 6 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packsswb m2, m2 |
| paddb m2, m10 |
| packuswb m0, m1 |
| vpblendvb m0, m7, m0, m2 |
| mova [dstq+ 0], m0 |
| movu m0, [tlq+r3+32] |
| movu m1, [tlq+r3+40] |
| add r5d, dxd |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m9 |
| pmaddubsw m1, m9 |
| paddb m2, m11 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| paddw m6, m12 |
| packuswb m0, m1 |
| vpblendvb m0, m7, m0, m2 |
| mova [dstq+32], m0 |
| dec hd |
| jz .w64_end |
| add dstq, strideq |
| cmp r5d, maxbased |
| jb .w64_loop |
| .w64_end_loop: |
| mova [dstq+ 0], m7 |
| mova [dstq+32], m7 |
| add dstq, strideq |
| dec hd |
| jg .w64_end_loop |
| .w64_end: |
| RET |
| |
| cglobal ipred_z2, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy |
| %define base r9-z_filter_t0 |
| lea r9, [ipred_z2_avx2_table] |
| tzcnt wd, wm |
| movifnidn angled, anglem |
| movifnidn hd, hm |
| lea dxq, [dr_intra_derivative-90] |
| movsxd wq, [r9+wq*4] |
| movzx dyd, angleb |
| xor angled, 0x400 |
| mov r8, dxq |
| sub dxq, dyq |
| add wq, r9 |
| add r9, z_filter_t0-ipred_z2_avx2_table |
| mova m2, [tlq-64] |
| mova m0, [tlq-32] |
| mova m1, [tlq] |
| and dyd, ~1 |
| and dxq, ~1 |
| movzx dyd, word [r8+dyq] ; angle - 90 |
| movzx dxd, word [dxq+270] ; 180 - angle |
| vpbroadcastd m13, [base+pw_512] |
| vpbroadcastd m14, [base+pw_62] |
| vpbroadcastd m15, [base+pw_64] |
| mova [rsp+ 0], m2 |
| mova [rsp+32], m0 |
| mova [rsp+64], m1 |
| neg dxd |
| neg dyd |
| jmp wq |
| .w4: |
| vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 |
| vbroadcasti128 m10, [base+z1_shuf_w4] |
| vbroadcasti128 m11, [base+z2_shuf_h4] |
| lea r2d, [dxq+(65<<6)] ; xpos |
| movd xm5, dyd |
| mov r8d, (63-4)<<6 |
| mov dyq, -4 |
| pshuflw xm5, xm5, q0000 |
| pmullw xm5, [base+z2_ymul] |
| test angled, 0x400 |
| jnz .w4_main ; !enable_intra_edge_filter |
| lea r3d, [hq+2] |
| add angled, 1022 |
| shl r3d, 6 |
| test r3d, angled |
| jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) |
| vpbroadcastd xm3, [base+pb_4] |
| call .upsample_above |
| sub angled, 1075 ; angle - 53 |
| lea r3d, [hq+3] |
| xor angled, 0x7f ; 180 - angle |
| call .filter_strength |
| jmp .w4_filter_left |
| ALIGN function_align |
| .filter_strength: |
| movd xm8, r3d |
| mov r3d, angled |
| movd xm7, angled |
| vpbroadcastb m8, xm8 |
| shr r3d, 8 ; is_sm << 1 |
| vpbroadcastb m7, xm7 |
| pcmpeqb m8, [base+z_filter_wh] |
| mova xm9, [r9+r3*8] |
| pand m0, m8, m7 |
| pcmpgtb m0, m9 |
| pmovmskb r3d, m0 |
| popcnt r3d, r3d |
| ret |
| ALIGN function_align |
| .upsample_above: ; w4/w8 |
| pshufb xm2, xm1, [base+z_upsample1-2] |
| pminub xm3, [base+z_filter_s+4] |
| vpbroadcastd xm4, [base+pb_36_m4] |
| vbroadcasti128 m10, [base+pb_0to15] |
| pshufb xm3, xm1, xm3 |
| pmaddubsw xm2, xm4 |
| pmaddubsw xm3, xm4 |
| lea r2d, [r2+dxq+(1<<6)] |
| add dxd, dxd |
| paddw xm2, xm3 |
| pmulhrsw xm2, xm13 |
| sub r8d, 3<<6 |
| paddw m6, m6 |
| packuswb xm2, xm2 |
| punpcklbw xm1, xm2 |
| mova [rsp+gprsize+64], xm1 |
| ret |
| ALIGN function_align |
| .upsample_left: ; h4/h8 |
| mov r3d, hd |
| and r3d, 4 |
| movd xm2, [rsp+gprsize+64] |
| movddup xm0, [rsp+gprsize+56] |
| movd xm1, r3d |
| palignr xm2, xm0, 1 |
| vpbroadcastb xm1, xm1 |
| pshufb xm2, [base+z_filter_s+18] |
| vpbroadcastd xm3, [base+pb_36_m4] |
| pmaxub xm1, [base+z_upsample1-2] |
| pshufb xm1, xm0, xm1 |
| pmaddubsw xm2, xm3 |
| pmaddubsw xm1, xm3 |
| paddw xm5, xm5 |
| add dyq, dyq |
| paddw xm1, xm2 |
| pmulhrsw xm1, xm13 |
| vbroadcasti128 m11, [base+z2_upsample] |
| paddw xm5, xm15 |
| packuswb xm1, xm1 |
| punpcklbw xm0, xm1 |
| mova [rsp+gprsize+48], xm0 |
| ret |
| .w4_no_upsample_above: |
| lea r3d, [hq+3] |
| sub angled, 1112 ; angle - 90 |
| call .filter_strength |
| jz .w4_no_filter_above |
| vpbroadcastd xm2, [base+pb_4] |
| pminub xm2, [base+z_filter_s] |
| vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] |
| vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] |
| pshufb xm3, xm1, xm2 ; 00 01 12 23 |
| pshufd xm2, xm2, q0321 |
| pmaddubsw xm0, xm3, xm0 |
| pshufb xm2, xm1, xm2 ; 12 23 34 44 |
| pmaddubsw xm2, xm4 |
| vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] |
| punpckhqdq xm3, xm3 ; 34 44 44 44 |
| pmaddubsw xm3, xm4 |
| movd xm4, r6m ; max_width |
| pminsw xm4, xm15 |
| vpbroadcastb xm4, xm4 |
| paddw xm0, xm2 |
| paddw xm0, xm3 |
| pmulhrsw xm0, xm13 |
| psubb xm4, [base+pb_1to32] |
| psrlq xm1, 8 |
| packuswb xm0, xm0 |
| vpblendvb xm0, xm1, xm4 |
| movd [rsp+65], xm0 |
| .w4_no_filter_above: |
| lea r3d, [hq+2] |
| add angled, 973 ; angle + 883 |
| shl r3d, 6 |
| test r3d, angled |
| jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) |
| vpbroadcastd xm0, [base+pb_90] |
| psubb xm0, xm7 ; 180 - angle |
| pand xm0, xm8 ; reuse from previous filter_strength call |
| pcmpgtb xm0, xm9 |
| pmovmskb r3d, xm0 |
| popcnt r3d, r3d |
| .w4_filter_left: |
| jz .w4_main |
| mov r5d, 10 |
| cmp hd, 16 |
| movu xm2, [rsp+49] |
| vinserti128 m2, [rsp+43], 1 |
| cmovl r5d, hd |
| xor r5d, 15 ; h == 16 ? 5 : 15 - h |
| movd xm0, r5d |
| vbroadcasti128 m1, [base+z_filter_s+12] |
| vbroadcasti128 m4, [base+z_filter_s+16] |
| vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab |
| vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd |
| vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef |
| vpbroadcastb m0, xm0 |
| pmaxub m0, m3 |
| vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] |
| pshufb m0, m2, m0 |
| pmaddubsw m0, m3 |
| vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m3 |
| vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] |
| pshufb m2, m4 |
| pmaddubsw m2, m3 |
| movd xm4, r7m ; max_height |
| pminsw xm4, xm15 |
| vpbroadcastb xm4, xm4 |
| psubb xm4, [base+pb_16to1] |
| paddw m1, m0 |
| paddw m1, m2 |
| pmulhrsw m1, m13 |
| vextracti128 xm0, m1, 1 |
| packuswb xm0, xm1 |
| vpblendvb xm0, [rsp+48], xm4 |
| mova [rsp+48], xm0 |
| jmp .w4_main |
| .w4_upsample_left: |
| call .upsample_left |
| .w4_main: |
| movd xm0, dxd |
| mova m12, [base+z2_y_shuf_h4] |
| lea r5, [rsp+56] ; left-7 |
| vpbroadcastw m0, xm0 |
| lea r9, [strideq*3] |
| psraw xm1, xm5, 6 |
| pand xm5, xm14 ; frac_y |
| pxor xm2, xm2 |
| paddw m7, m0, m0 |
| psubw xm4, xm2, xm1 ; base_y |
| vpblendd m0, m7, 0xcc |
| mova xm1, xm7 |
| punpcklwd xm4, xm2 |
| paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 |
| psubw xm1, xm15, xm5 ; 64-frac_y |
| psllw xm5, 8 |
| paddw m7, m7 |
| paddw m6, m0 |
| por xm5, xm1 ; 64-frac_y, frac_y |
| vpbroadcastq m5, xm5 |
| .w4_loop: |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x0 |
| vpbroadcastq m1, [rsp+r2] |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x1 |
| vpbroadcastq m2, [rsp+r3] |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x2 |
| movq xm0, [rsp+r2] |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x3 |
| movhps xm0, [rsp+r3] |
| vpblendd m1, m2, 0xc0 |
| pand m2, m14, m6 ; frac_x |
| vpblendd m0, m1, 0xf0 |
| psubw m1, m15, m2 ; 64-frac_x |
| psllw m2, 8 |
| pshufb m0, m10 |
| por m1, m2 ; 64-frac_x, frac_x |
| pmaddubsw m0, m1 |
| cmp r3d, 64 |
| jge .w4_toponly |
| mova m1, m7 ; arbitrary negative value |
| vpgatherdq m3, [r5+xm4], m1 |
| pshufb m1, m3, m11 |
| vpermd m1, m12, m1 |
| pmaddubsw m1, m5 |
| psraw m2, m6, 15 ; base_x < topleft |
| vpblendvb m0, m1, m2 |
| .w4_toponly: |
| pmulhrsw m0, m13 |
| paddw m6, m7 ; xpos += dx |
| add r5, dyq |
| packuswb m0, m0 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*2], xm0 |
| pextrd [dstq+r9 ], xm0, 1 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| sub hd, 4 |
| jz .w4_end |
| lea dstq, [dstq+strideq*4] |
| cmp r2d, r8d |
| jge .w4_loop |
| .w4_leftonly_loop: |
| mova m1, m7 |
| vpgatherdq m2, [r5+xm4], m1 |
| add r5, dyq |
| pshufb m0, m2, m11 |
| vpermd m0, m12, m0 |
| pmaddubsw m0, m5 |
| pmulhrsw m0, m13 |
| packuswb m0, m0 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*2], xm0 |
| pextrd [dstq+r9 ], xm0, 1 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_leftonly_loop |
| .w4_end: |
| RET |
| .w8: |
| vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 |
| movd xm5, dyd |
| vbroadcasti128 m10, [base+z_filter_s+2] |
| vbroadcasti128 m11, [base+z2_shuf_h4] |
| lea r2d, [dxq+(65<<6)] ; xpos |
| vpbroadcastw xm5, xm5 |
| mov r8d, (63-8)<<6 |
| mov dyq, -4 |
| pmullw xm5, [base+z2_ymul] |
| test angled, 0x400 |
| jnz .w8_main |
| lea r3d, [angleq+126] |
| mov r3b, hb |
| cmp r3d, 8 |
| ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm |
| vpbroadcastd xm3, [base+pb_8] |
| movhps [rsp+80], xm1 |
| call .upsample_above |
| sub angled, 53 ; angle - 53 |
| lea r3d, [hq+7] |
| xor angled, 0x7f ; 180 - angle |
| call .filter_strength |
| jmp .w8_filter_left |
| .w8_no_upsample_above: |
| lea r3d, [hq+7] |
| sub angled, 90 ; angle - 90 |
| call .filter_strength |
| jz .w8_no_filter_above |
| vpbroadcastd xm3, [base+pb_8] |
| pminub xm3, [base+z_filter_s+8] |
| vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] |
| vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] |
| pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 |
| pmaddubsw xm0, xm2, xm0 |
| pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 |
| shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 |
| pmaddubsw xm2, xm4 |
| vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] |
| pmaddubsw xm3, xm4 |
| movd xm4, r6m ; max_width |
| pminuw xm4, xm15 |
| vpbroadcastb xm4, xm4 |
| paddw xm0, xm2 |
| paddw xm0, xm3 |
| pmulhrsw xm0, xm13 |
| psubb xm4, [base+pb_1to32] |
| psrldq xm1, 1 |
| packuswb xm0, xm0 |
| vpblendvb xm0, xm1, xm4 |
| movq [rsp+65], xm0 |
| .w8_no_filter_above: |
| lea r3d, [angleq-51] |
| mov r3b, hb |
| cmp r3d, 8 |
| jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm |
| vpbroadcastd m0, [base+pb_90] |
| psubb m0, m7 |
| pand m0, m8 |
| pcmpgtb m0, m9 |
| pmovmskb r3d, m0 |
| popcnt r3d, r3d |
| .w8_filter_left: |
| jz .w8_main |
| vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] |
| vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] |
| vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] |
| cmp hd, 32 |
| jne .w8_filter_left_h16 |
| movu xm2, [rsp+27] |
| vinserti128 m2, [rsp+35], 1 |
| vpbroadcastd xm0, [base+pb_5] |
| vbroadcasti128 m3, [base+z_filter_s+ 8] |
| vbroadcasti128 m1, [base+z_filter_s+12] |
| vbroadcasti128 m4, [base+z_filter_s+16] |
| pmaxub m3, m0 |
| pshufb m3, m2, m3 |
| pmaddubsw m3, m7 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m8 |
| pshufb m2, m4 |
| pmaddubsw m2, m9 |
| paddw m3, m1 |
| paddw m3, m2 |
| pmulhrsw m3, m13 |
| jmp .w8_filter_left_top16 |
| .w8_filter_left_h16: |
| mov r5d, 10 |
| cmp hd, 16 |
| cmovl r5d, hd |
| xor r5d, 15 ; h == 16 ? 5 : 15 - h |
| movd xm0, r5d |
| vpbroadcastb m0, xm0 |
| .w8_filter_left_top16: |
| vbroadcasti128 m1, [base+z_filter_s+12] |
| vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab |
| vbroadcasti128 m4, [base+z_filter_s+16] |
| vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd |
| vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef |
| pmaxub m0, m2 |
| movu xm2, [rsp+49] |
| vinserti128 m2, [rsp+43], 1 |
| pshufb m0, m2, m0 |
| pmaddubsw m0, m7 |
| movd xm7, r7m ; max_height |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m8 |
| pshufb m2, m4 |
| pmaddubsw m2, m9 |
| pminsw xm7, xm15 |
| paddw m1, m0 |
| vpbroadcastb m7, xm7 |
| paddw m1, m2 |
| pmulhrsw m1, m13 |
| psubb m7, [base+pb_32to1] |
| packuswb m3, m1 |
| vpermq m3, m3, q1320 |
| vpblendvb m3, [rsp+32], m7 |
| mova [rsp+32], m3 |
| jmp .w8_main |
| .w8_upsample_left: |
| call .upsample_left |
| .w8_main: |
| movd xm3, dxd |
| lea r5, [rsp+56] ; left-7 |
| pshufd xm1, xm5, q3120 |
| pand xm5, xm14 |
| vpbroadcastw m3, xm3 |
| pxor xm0, xm0 |
| psubw xm2, xm15, xm5 |
| psraw xm1, 6 |
| lea r9, [strideq*3] |
| paddw m7, m3, m3 |
| psubw xm9, xm0, xm1 ; base_y |
| psllw xm5, 8 |
| punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 |
| vpblendd m3, m7, 0xf0 ; xpos0 xpos1 |
| por xm5, xm2 ; 64-frac_y, frac_y |
| punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 |
| paddw m6, m3 |
| vinserti128 m12, m5, xm5, 1 |
| .w8_loop: |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x0 |
| movu xm0, [rsp+r2] |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x1 |
| vinserti128 m0, [rsp+r3], 1 |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x2 |
| movu xm1, [rsp+r2] |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x3 |
| vinserti128 m1, [rsp+r3], 1 |
| pand m2, m14, m6 |
| paddsw m4, m6, m7 |
| psubw m5, m15, m2 |
| psllw m2, 8 |
| pshufb m0, m10 |
| por m2, m5 |
| pmaddubsw m0, m2 |
| pand m2, m14, m4 |
| psubw m5, m15, m2 |
| psllw m2, 8 |
| pshufb m1, m10 |
| por m2, m5 |
| pmaddubsw m1, m2 |
| cmp r3d, 64 |
| jge .w8_toponly |
| mova m5, m7 |
| vpgatherdq m3, [r5+xm9], m7 |
| mova m7, m5 |
| vpgatherdq m2, [r5+xm8], m5 |
| pshufb m3, m11 |
| pshufb m2, m11 |
| punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 |
| punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 |
| vpermq m5, m5, q3120 ; y0 y1 |
| vpermq m2, m2, q3120 ; y2 y3 |
| pmaddubsw m5, m12 |
| pmaddubsw m2, m12 |
| psraw m6, 15 ; base_x < topleft |
| vpblendvb m0, m5, m6 |
| psraw m3, m4, 15 |
| vpblendvb m1, m2, m3 |
| .w8_toponly: |
| pmulhrsw m0, m13 |
| pmulhrsw m1, m13 |
| paddw m6, m4, m7 ; xpos += dx |
| add r5, dyq |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*2], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+r9 ], xm1 |
| sub hd, 4 |
| jz .w8_end |
| lea dstq, [dstq+strideq*4] |
| cmp r2d, r8d |
| jge .w8_loop |
| .w8_leftonly_loop: |
| mova m0, m7 |
| vpgatherdq m5, [r5+xm9], m7 |
| mova m7, m0 |
| vpgatherdq m3, [r5+xm8], m0 |
| add r5, dyq |
| pshufb m2, m5, m11 |
| pshufb m1, m3, m11 |
| punpckldq m0, m1, m2 |
| punpckhdq m1, m2 |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q3120 |
| pmaddubsw m0, m12 |
| pmaddubsw m1, m12 |
| pmulhrsw m0, m13 |
| pmulhrsw m1, m13 |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*2], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+r9 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_leftonly_loop |
| .w8_end: |
| RET |
| .w16: |
| mov r8d, hd |
| test angled, 0x400 |
| jnz .w16_main |
| lea r3d, [hq+15] |
| sub angled, 90 |
| call .filter_strength |
| jz .w16_no_filter_above |
| vbroadcasti128 m6, [tlq+1] |
| mova xm2, [base+z_filter_s] |
| vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de |
| movu xm3, [base+z_filter_s+8] |
| vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff |
| vpblendd m1, m6, 0xf0 |
| vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] |
| vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] |
| vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] |
| pshufb m2, m1, m2 |
| pshufb m1, m3 |
| pmaddubsw m0, m2, m0 |
| shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff |
| pmaddubsw m2, m4 |
| pmaddubsw m1, m5 |
| movd xm4, r6m ; max_width |
| pminsw xm4, xm15 |
| vpbroadcastb xm4, xm4 |
| paddw m0, m2 |
| paddw m0, m1 |
| pmulhrsw m0, m13 |
| psubb xm4, [base+pb_1to32] |
| vextracti128 xm2, m0, 1 |
| packuswb xm0, xm2 |
| vpblendvb xm0, xm6, xm4 |
| movu [rsp+65], xm0 |
| .w16_no_filter_above: |
| vpbroadcastd m0, [base+pb_90] |
| psubb m0, m7 |
| pand m0, m8 |
| pcmpgtb m0, m9 |
| pmovmskb r3d, m0 |
| popcnt r3d, r3d |
| jz .w16_main |
| vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] |
| vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] |
| vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] |
| .w16_filter_left: |
| movd xm6, r7m ; max_height |
| pminsw xm6, xm15 |
| vpbroadcastb m6, xm6 |
| cmp hd, 32 |
| jl .w16_filter_left_h16 |
| vpbroadcastd xm0, [base+pb_5] |
| vbroadcasti128 m10, [base+z_filter_s+ 8] |
| vbroadcasti128 m11, [base+z_filter_s+12] |
| vbroadcasti128 m12, [base+z_filter_s+16] |
| je .w16_filter_left_h32 |
| movu m3, [tlq-69] |
| movu m5, [tlq-61] |
| pmaxub m1, m10, m0 |
| pshufb m1, m3, m1 |
| pmaddubsw m1, m7 |
| pshufb m2, m3, m11 |
| pmaddubsw m2, m8 |
| pshufb m3, m12 |
| pmaddubsw m3, m9 |
| paddw m1, m2 |
| pshufb m2, m5, m10 |
| pmaddubsw m2, m7 |
| pshufb m4, m5, m11 |
| pmaddubsw m4, m8 |
| pshufb m5, m12 |
| pmaddubsw m5, m9 |
| paddw m1, m3 |
| vpbroadcastd m3, [base+pb_32] |
| paddb m3, [base+pb_32to1] |
| paddw m2, m4 |
| paddw m2, m5 |
| pmulhrsw m1, m13 |
| pmulhrsw m2, m13 |
| psubb m3, m6, m3 |
| packuswb m1, m2 |
| vpblendvb m1, [tlq-64], m3 |
| mova [rsp], m1 |
| jmp .w16_filter_left_top32 |
| .w16_filter_left_h32: |
| pmaxub m10, m0 |
| .w16_filter_left_top32: |
| movu xm2, [tlq-37] |
| vinserti128 m2, [tlq-29], 1 |
| pshufb m3, m2, m10 |
| pshufb m1, m2, m11 |
| pshufb m2, m12 |
| pmaddubsw m3, m7 |
| pmaddubsw m1, m8 |
| pmaddubsw m2, m9 |
| paddw m3, m1 |
| paddw m3, m2 |
| pmulhrsw m3, m13 |
| jmp .w16_filter_left_top16 |
| .w16_filter_left_h16: |
| mov r5d, 10 |
| cmp hd, 16 |
| cmovl r5d, hd |
| xor r5d, 15 ; h == 16 ? 5 : 15 - h |
| movd xm0, r5d |
| vpbroadcastb m0, xm0 |
| .w16_filter_left_top16: |
| movu xm2, [tlq-15] |
| vinserti128 m2, [tlq-21], 1 |
| vbroadcasti128 m1, [base+z_filter_s+12] |
| vbroadcasti128 m4, [base+z_filter_s+16] |
| vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab |
| vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd |
| vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef |
| pmaxub m0, m5 |
| pshufb m0, m2, m0 |
| pmaddubsw m0, m7 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m8 |
| pshufb m2, m4 |
| pmaddubsw m2, m9 |
| psubb m6, [base+pb_32to1] |
| paddw m1, m0 |
| paddw m1, m2 |
| pmulhrsw m1, m13 |
| packuswb m3, m1 |
| vpermq m3, m3, q1320 |
| vpblendvb m3, [tlq-32], m6 |
| mova [rsp+32], m3 |
| .w16_main: |
| movd xm1, dyd |
| vbroadcasti128 m10, [base+z_filter_s+2] |
| movd xm7, dxd |
| vbroadcasti128 m11, [base+z2_shuf_h2] |
| vpbroadcastw m1, xm1 |
| vpbroadcastw m7, xm7 |
| mov r7, dstq |
| pmullw m0, m1, [base+z2_ymul] |
| psllw xm1, 4 |
| paddw m6, m7, [base+z2_base_inc] |
| lea r9d, [dxq+(65<<6)] ; xpos |
| movd [rsp+156], xm1 |
| .w16_loop0: |
| mov r2d, r9d |
| mova [rsp+160], m0 |
| lea r5, [rsp+60] ; left-3 |
| mova [rsp+192], m6 |
| pxor m1, m1 |
| psraw m2, m0, 6 |
| pand m0, m14 |
| psubw m9, m1, m2 ; base_y |
| psubw m12, m15, m0 |
| punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 |
| psllw m0, 8 |
| punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 |
| por m12, m0 ; 64-frac_y, frac_y |
| .w16_loop: |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x0 |
| movu xm0, [rsp+r2] |
| vinserti128 m0, [rsp+r2+8], 1 |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x1 |
| movu xm1, [rsp+r3] |
| vinserti128 m1, [rsp+r3+8], 1 |
| pand m2, m14, m6 |
| paddsw m5, m6, m7 |
| psubw m3, m15, m2 |
| psllw m2, 8 |
| pshufb m0, m10 |
| por m2, m3 |
| pmaddubsw m0, m2 |
| pand m2, m14, m5 |
| psubw m3, m15, m2 |
| psllw m2, 8 |
| pshufb m1, m10 |
| por m2, m3 |
| pmaddubsw m1, m2 |
| cmp r3d, 64 |
| jge .w16_toponly |
| punpckhwd m2, m5, m5 ; mask out unnecessary loads |
| vpgatherdd m4, [r5+m9], m2 |
| punpcklwd m2, m5, m5 |
| vpgatherdd m3, [r5+m8], m2 |
| pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 |
| pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1 |
| punpcklqdq m2, m3, m4 ; y0 |
| punpckhqdq m3, m4 ; y1 |
| pmaddubsw m2, m12 |
| pmaddubsw m3, m12 |
| psraw m6, 15 ; base_x < topleft |
| vpblendvb m0, m2, m6 |
| psraw m6, m5, 15 |
| vpblendvb m1, m3, m6 |
| .w16_toponly: |
| pmulhrsw m0, m13 |
| pmulhrsw m1, m13 |
| paddw m6, m5, m7 ; xpos += dx |
| sub r5, 2 |
| packuswb m0, m1 |
| vpermq m0, m0, q3120 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| sub hd, 2 |
| jz .w16_end |
| lea dstq, [dstq+strideq*2] |
| cmp r2d, (63-16)<<6 |
| jge .w16_loop |
| .w16_leftonly_loop: |
| mova m0, m7 |
| vpgatherdd m4, [r5+m9], m7 |
| mova m7, m0 |
| vpgatherdd m3, [r5+m8], m0 |
| sub r5, 2 |
| pshufb m2, m4, m11 |
| pshufb m1, m3, m11 |
| punpcklqdq m0, m1, m2 |
| punpckhqdq m1, m2 |
| pmaddubsw m0, m12 |
| pmaddubsw m1, m12 |
| pmulhrsw m0, m13 |
| pmulhrsw m1, m13 |
| packuswb m0, m1 |
| vpermq m0, m0, q3120 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_leftonly_loop |
| .w16_end: |
| sub r8d, 1<<8 |
| jl .w16_ret |
| vpbroadcastd m0, [rsp+156] |
| paddw m0, [rsp+160] ; base_y += 16*dy |
| paddw m6, m13, [rsp+192] |
| add r7, 16 |
| add r9d, 16<<6 |
| movzx hd, r8b |
| mov dstq, r7 |
| paddw m6, m13 ; base_x += 16*64 |
| jmp .w16_loop0 |
| .w16_ret: |
| RET |
| .w32: |
| mova m2, [tlq+32] |
| lea r8d, [hq+(1<<8)] |
| mova [rsp+96], m2 |
| test angled, 0x400 |
| jnz .w16_main |
| vpbroadcastd m7, [base+z_filter_k+4*2+12*0] |
| vpbroadcastd m8, [base+z_filter_k+4*2+12*1] |
| vpbroadcastd m9, [base+z_filter_k+4*2+12*2] |
| mova xm5, [base+z_filter_s] |
| vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc |
| vinserti128 m1, [tlq+11], 1 |
| movu xm6, [base+z_filter_s+12] |
| vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff |
| movu xm3, [tlq+ 6] |
| vinserti128 m3, [tlq+17], 1 |
| movd xm0, r6m ; max_width |
| pminsw xm0, xm15 |
| vpbroadcastb m10, xm0 |
| .w32_filter_above: |
| pshufb m0, m1, m5 |
| shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de |
| pmaddubsw m0, m7 |
| pshufb m2, m1, m4 |
| shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff |
| pmaddubsw m2, m8 |
| pshufb m1, m5 |
| pmaddubsw m1, m9 |
| paddw m0, m2 |
| paddw m0, m1 |
| pshufb m1, m3, m4 |
| pmaddubsw m1, m7 |
| pshufb m2, m3, m5 |
| pmaddubsw m2, m8 |
| pshufb m3, m6 |
| pmaddubsw m3, m9 |
| paddw m1, m2 |
| paddw m1, m3 |
| pmulhrsw m0, m13 |
| pmulhrsw m1, m13 |
| psubb m10, [base+pb_1to32] |
| packuswb m0, m1 |
| vpblendvb m0, [tlq+1], m10 |
| movu [rsp+65], m0 |
| jmp .w16_filter_left |
| .w64: |
| mova m2, [tlq+32] |
| mov r3d, [tlq+64] |
| lea r8d, [hq+(3<<8)] |
| mova [rsp+ 96], m2 |
| mov [rsp+128], r3d |
| test angled, 0x400 |
| jnz .w16_main |
| vpbroadcastd m7, [base+z_filter_k+4*2+12*0] |
| vpbroadcastd m8, [base+z_filter_k+4*2+12*1] |
| vpbroadcastd m9, [base+z_filter_k+4*2+12*2] |
| movu xm6, [base+z_filter_s+ 4] |
| vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc |
| movu xm3, [tlq+30] |
| vinserti128 m3, [tlq+43], 1 |
| movu xm5, [base+z_filter_s+16] |
| vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff |
| pshufb m0, m3, m6 |
| shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de |
| pmaddubsw m0, m7 |
| pshufb m2, m3, m4 |
| shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff |
| pmaddubsw m2, m8 |
| pshufb m3, m6 |
| pmaddubsw m3, m9 |
| paddw m0, m2 |
| paddw m0, m3 |
| movu xm2, [tlq+36] |
| vinserti128 m2, [tlq+49], 1 |
| pshufb m4, m2, m4 |
| pmaddubsw m4, m7 |
| pshufb m3, m2, m6 |
| pmaddubsw m3, m8 |
| pshufb m2, m5 |
| pmaddubsw m2, m9 |
| movd xm5, r6m ; max_width |
| pminsw xm5, xm15 |
| vpbroadcastb m10, xm5 |
| paddw m3, m4 |
| paddw m2, m3 |
| vpbroadcastd m3, [base+pb_32] |
| pmulhrsw m0, m13 |
| pmulhrsw m2, m13 |
| mova xm5, [base+z_filter_s] |
| vinserti128 m5, [base+z_filter_s+6], 1 |
| psubb m3, m10, m3 |
| psubb m3, [base+pb_1to32] |
| vinserti128 m1, [tlq+13], 1 |
| packuswb m0, m2 |
| vpblendvb m0, [tlq+33], m3 |
| movu xm3, [tlq+ 6] |
| vinserti128 m3, [tlq+19], 1 |
| movu [rsp+97], m0 |
| jmp .w32_filter_above |
| |
| cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase |
| %assign org_stack_offset stack_offset |
| lea r6, [ipred_z3_avx2_table] |
| tzcnt hd, hm |
| movifnidn angled, anglem |
| lea r7, [dr_intra_derivative+45*2-1] |
| dec tlq |
| movsxd hq, [r6+hq*4] |
| sub angled, 180 |
| add hq, r6 |
| mov dyd, angled |
| neg dyd |
| xor angled, 0x400 |
| or dyq, ~0x7e |
| movzx dyd, word [r7+dyq] |
| vpbroadcastd m3, [pw_512] |
| vpbroadcastd m4, [pw_62] |
| vpbroadcastd m5, [pw_64] |
| mov org_wd, wd |
| jmp hq |
| .h4: |
| lea r7, [strideq*3] |
| cmp angleb, 40 |
| jae .h4_no_upsample |
| lea r4d, [angleq-1024] |
| sar r4d, 7 |
| add r4d, wd |
| jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) |
| ALLOC_STACK -32, 9 |
| movu xm8, [tlq-7] |
| pshufb xm0, xm8, [z_upsample1-4] |
| vpbroadcastb xm2, xm8 |
| pshufb xm1, xm8, [z_filter_s+2] |
| mova [rsp+16], xm2 ; top[max_base_y] |
| vpbroadcastd xm2, [pb_36_m4] |
| add dyd, dyd |
| pmaddubsw xm0, xm2 |
| pmaddubsw xm1, xm2 |
| movd xm7, dyd |
| mov r2d, dyd |
| vpbroadcastw m7, xm7 |
| paddw xm1, xm0 |
| pmulhrsw xm1, xm3 |
| pslldq m6, m7, 8 |
| paddw xm2, xm7, xm7 |
| paddw m6, m7 |
| packuswb xm1, xm1 |
| paddw m6, m2 |
| punpcklbw xm1, xm8 |
| mova xm8, [z_transpose4] |
| psllw m7, 2 |
| pshufb xm1, [pb_15to0] |
| mova [rsp], xm1 |
| .h4_upsample_loop: |
| lea r4d, [r2+dyq] |
| shr r2d, 6 |
| vpbroadcastq m1, [rsp+r2] |
| lea r2d, [r4+dyq] |
| shr r4d, 6 |
| vpbroadcastq m2, [rsp+r4] |
| lea r4d, [r2+dyq] |
| shr r2d, 6 |
| movq xm0, [rsp+r2] |
| lea r2d, [r4+dyq] |
| shr r4d, 6 |
| movhps xm0, [rsp+r4] |
| vpblendd m1, m2, 0xc0 |
| pand m2, m4, m6 |
| vpblendd m0, m1, 0xf0 |
| psubw m1, m5, m2 |
| psllw m2, 8 |
| por m1, m2 |
| pmaddubsw m0, m1 |
| paddw m6, m7 |
| pmulhrsw m0, m3 |
| vextracti128 xm1, m0, 1 |
| packuswb xm1, xm0 |
| pshufb xm1, xm8 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm1, 2 |
| pextrd [dstq+r7 ], xm1, 3 |
| add dstq, 4 |
| sub wd, 4 |
| jg .h4_upsample_loop |
| RET |
| ALIGN function_align |
| .filter_strength: ; h4/h8/h16 |
| %define base r4-z_filter_t0 |
| lea r4, [z_filter_t0] |
| movd xm0, maxbased |
| movd xm2, angled |
| shr angled, 8 ; is_sm << 1 |
| vpbroadcastb m0, xm0 |
| vpbroadcastb m2, xm2 |
| pcmpeqb m1, m0, [base+z_filter_wh] |
| pand m1, m2 |
| mova xm2, [r4+angleq*8] |
| pcmpgtb m1, m2 |
| pmovmskb r5d, m1 |
| popcnt r5d, r5d |
| ret |
| .h4_no_upsample: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -16, 12 |
| mov maxbased, 7 |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .h4_main |
| lea maxbased, [wq+3] |
| call .filter_strength |
| mov maxbased, 7 |
| jz .h4_main ; filter_strength == 0 |
| vpbroadcastd m7, [base+pb_7] |
| vbroadcasti128 m2, [tlq-14] |
| pmaxub m1, m7, [base+z_filter_s-4] |
| vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] |
| pmaxub m7, [base+z_filter_s+4] |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] |
| vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] |
| pshufb m0, m2, m1 |
| shufps m1, m7, q2121 |
| pmaddubsw m0, m8 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m9 |
| pshufb m2, m7 |
| pmaddubsw m2, m10 |
| paddw m0, m1 |
| paddw m0, m2 |
| pmulhrsw m0, m3 |
| mov r4d, 9 |
| lea tlq, [rsp+15] |
| cmp wd, 4 |
| cmova maxbased, r4d |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| mova [rsp], xm0 |
| .h4_main: |
| movd xm6, dyd |
| vpbroadcastq m0, [z_base_inc] ; base_inc << 6 |
| mov r4, tlq |
| sub tlq, 4 |
| neg dyq |
| vpbroadcastw m6, xm6 |
| sub r4, maxbaseq |
| shl maxbased, 6 |
| vpbroadcastb m7, [r4] |
| lea r4, [dyq+63] ; ypos |
| movd xm9, maxbased |
| not maxbased |
| vbroadcasti128 m8, [z3_shuf_w4] |
| add maxbased, 64 |
| vpbroadcastw m9, xm9 |
| psrlw m7, 8 ; top[max_base_y] |
| paddw m10, m6, m6 |
| psubw m9, m0 ; max_base_y |
| vpblendd m6, m10, 0xcc |
| mova xm0, xm10 |
| paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 |
| paddw m10, m10 |
| mova xm11, [z_transpose4] |
| .h4_loop: |
| lea r5, [r4+dyq] |
| sar r4, 6 ; base0 |
| vpbroadcastq m1, [tlq+r4] |
| lea r4, [r5+dyq] |
| sar r5, 6 ; base1 |
| vpbroadcastq m2, [tlq+r5] |
| lea r5, [r4+dyq] |
| sar r4, 6 ; base2 |
| movq xm0, [tlq+r4] |
| lea r4, [r5+dyq] |
| sar r5, 6 ; base3 |
| movhps xm0, [tlq+r5] |
| vpblendd m1, m2, 0xc0 |
| pand m2, m4, m6 ; frac |
| vpblendd m0, m1, 0xf0 |
| psubw m1, m5, m2 ; 64-frac |
| psllw m2, 8 |
| pshufb m0, m8 |
| por m1, m2 ; 64-frac, frac |
| pmaddubsw m0, m1 |
| pcmpgtw m1, m9, m6 ; base < max_base_y |
| pmulhrsw m0, m3 |
| paddw m6, m10 ; ypos += dy |
| vpblendvb m0, m7, m0, m1 |
| vextracti128 xm1, m0, 1 |
| packuswb xm1, xm0 |
| pshufb xm1, xm11 ; transpose |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm1, 2 |
| pextrd [dstq+r7 ], xm1, 3 |
| sub wd, 4 |
| jz .h4_end |
| add dstq, 4 |
| cmp r4d, maxbased |
| jg .h4_loop |
| packuswb xm7, xm7 |
| .h4_end_loop: |
| movd [dstq+strideq*0], xm7 |
| movd [dstq+strideq*1], xm7 |
| movd [dstq+strideq*2], xm7 |
| movd [dstq+r7 ], xm7 |
| add dstq, 4 |
| sub wd, 4 |
| jg .h4_end_loop |
| .h4_end: |
| RET |
| ALIGN function_align |
| .h8: |
| lea r4d, [angleq+216] |
| mov r4b, wb |
| cmp r4d, 8 |
| ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -32, 8 |
| and r4d, 4 |
| mova xm0, [tlq-15] |
| vinserti128 m0, [tlq- 9], 1 |
| movd xm1, r4d |
| movu xm2, [z_filter_s+2] |
| vinserti128 m2, [z_filter_s+6], 1 |
| vpbroadcastb xm1, xm1 ; w & 4 |
| vpbroadcastd m7, [pb_36_m4] |
| pmaxub xm1, [z_upsample1-4] ; clip 4x8 |
| vinserti128 m1, [z_upsample1], 1 |
| add dyd, dyd |
| pshufb m1, m0, m1 |
| pshufb m2, m0, m2 |
| vinserti128 m0, [tlq-7], 1 |
| movd xm6, dyd |
| pmaddubsw m1, m7 |
| pmaddubsw m2, m7 |
| vpbroadcastw m6, xm6 |
| mov r2d, dyd |
| lea r5, [strideq*3] |
| paddw m7, m6, m6 |
| paddw m1, m2 |
| vpblendd m6, m7, 0xf0 |
| pmulhrsw m1, m3 |
| pslldq m2, m7, 8 |
| paddw m7, m7 |
| paddw m6, m2 |
| vbroadcasti128 m2, [pb_15to0] |
| packuswb m1, m1 |
| punpcklbw m1, m0 |
| pshufb m1, m2 |
| vextracti128 [rsp+ 0], m1, 1 |
| mova [rsp+16], xm1 |
| .h8_upsample_loop: |
| lea r4d, [r2+dyq] |
| shr r2d, 6 ; base0 |
| movu xm0, [rsp+r2] |
| lea r2d, [r4+dyq] |
| shr r4d, 6 ; base1 |
| vinserti128 m0, [rsp+r4], 1 |
| lea r4d, [r2+dyq] |
| shr r2d, 6 ; base2 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| punpcklqdq m1, m2, m2 ; frac0 frac1 |
| pmaddubsw m0, m1 |
| movu xm1, [rsp+r2] |
| lea r2d, [r4+dyq] |
| shr r4d, 6 ; base3 |
| vinserti128 m1, [rsp+r4], 1 |
| punpckhqdq m2, m2 ; frac2 frac3 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| paddw m6, m7 |
| pmulhrsw m1, m3 |
| lea r4, [dstq+strideq*4] |
| psllw m1, 8 |
| por m0, m1 |
| vextracti128 xm1, m0, 1 |
| punpcklbw xm2, xm0, xm1 |
| punpckhbw xm0, xm1 |
| movd [dstq+strideq*0], xm2 |
| pextrd [dstq+strideq*1], xm2, 1 |
| pextrd [dstq+strideq*2], xm2, 2 |
| pextrd [dstq+r5 ], xm2, 3 |
| movd [r4 +strideq*0], xm0 |
| pextrd [r4 +strideq*1], xm0, 1 |
| pextrd [r4 +strideq*2], xm0, 2 |
| pextrd [r4 +r5 ], xm0, 3 |
| add dstq, 4 |
| sub wd, 4 |
| jg .h8_upsample_loop |
| RET |
| .h8_no_intra_edge_filter: |
| and maxbased, 7 |
| or maxbased, 8 ; imin(w+7, 15) |
| jmp .h8_main |
| .h8_no_upsample: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -32, 10 |
| lea maxbased, [wq+7] |
| test angled, 0x400 |
| jnz .h8_no_intra_edge_filter |
| call .filter_strength |
| jz .h8_main ; filter_strength == 0 |
| vpbroadcastd xm6, [base+pb_15] |
| pcmpeqb xm1, xm1 |
| psubusb xm6, xm0 |
| psubb xm6, xm1 ; w == 4 ? 5 : 1 |
| movu xm2, [tlq-16] |
| pmaxub xm1, xm6, [base+z_filter_s] |
| vinserti128 m2, [tlq-14], 1 |
| vinserti128 m1, [base+z_filter_s+12], 1 |
| vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] |
| pmaxub xm6, [base+z_filter_s+ 8] |
| vinserti128 m6, [base+z_filter_s+20], 1 |
| pshufb m0, m2, m1 |
| pmaddubsw m0, m7 |
| vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] |
| movzx r4d, byte [tlq-15] |
| shufps m1, m6, q2121 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m7 |
| paddw m0, m1 |
| sub r5d, 3 |
| jnz .h8_3tap |
| vpbroadcastd m7, [z_filter_k+4*8] |
| movzx r2d, byte [tlq-14] |
| pshufb m2, m6 |
| pmaddubsw m2, m7 |
| sub r2d, r4d |
| lea r2d, [r2+r4*8+4] |
| shr r2d, 3 |
| mov [rsp+15], r2b |
| paddw m0, m2 |
| .h8_3tap: |
| pmulhrsw m0, m3 |
| sar r5d, 1 |
| lea tlq, [rsp+31] |
| add r5d, 17 |
| cmp wd, 8 |
| cmova maxbased, r5d |
| neg r5 |
| mov [tlq+r5], r4b |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| mova [tlq-15], xm0 |
| .h8_main: |
| movd xm2, dyd |
| vbroadcasti128 m0, [z_base_inc] |
| mov r4, tlq |
| sub tlq, 8 |
| neg dyq |
| vpbroadcastw m2, xm2 |
| sub r4, maxbaseq |
| shl maxbased, 6 |
| vpbroadcastb m7, [r4] |
| lea r4, [dyq+63] |
| movd xm9, maxbased |
| not maxbased |
| vbroadcasti128 m8, [z3_shuf] |
| add maxbased, 64 |
| vpbroadcastw m9, xm9 |
| psrlw m7, 8 |
| psubw m9, m0 |
| paddw m6, m2, m2 |
| vpblendd m2, m6, 0x0f |
| .h8_loop: |
| lea r5, [r4+dyq] |
| sar r4, 6 |
| pand m0, m4, m2 |
| psubw m1, m5, m0 |
| psllw m0, 8 |
| por m1, m0 |
| vbroadcasti128 m0, [tlq+r4] |
| lea r4, [r5+dyq] |
| sar r5, 6 |
| vinserti128 m0, [tlq+r5], 0 |
| sub rsp, 8*2 |
| pshufb m0, m8 |
| pmaddubsw m0, m1 |
| pcmpgtw m1, m9, m2 |
| paddw m2, m6 |
| pmulhrsw m0, m3 |
| vpblendvb m0, m7, m0, m1 |
| vextracti128 xm1, m0, 1 |
| psllw xm0, 8 |
| por xm0, xm1 ; interleave rows (partial transpose) |
| mova [rsp], xm0 |
| sub wd, 2 |
| jz .h8_transpose |
| cmp r4d, maxbased |
| jg .h8_loop |
| packuswb xm0, xm7, xm7 |
| .h8_end_loop: |
| sub rsp, 8*2 |
| mova [rsp], xm0 |
| sub wd, 2 |
| jg .h8_end_loop |
| .h8_transpose: |
| mova xm2, [rsp+16*1] |
| sub org_wd, 8 |
| lea r2, [strideq*3] |
| lea r6, [dstq+org_wq] |
| cmovg dstq, r6 |
| punpcklwd xm1, xm2, xm0 |
| punpckhwd xm2, xm0 |
| lea r6, [dstq+strideq*4] |
| jge .h8_w8 |
| add rsp, 16*2 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm1, 2 |
| pextrd [dstq+r2 ], xm1, 3 |
| movd [r6 +strideq*0], xm2 |
| pextrd [r6 +strideq*1], xm2, 1 |
| pextrd [r6 +strideq*2], xm2, 2 |
| pextrd [r6 +r2 ], xm2, 3 |
| jmp .h8_end |
| .h8_w8_loop: |
| mova xm0, [rsp+16*0] |
| mova xm2, [rsp+16*1] |
| punpcklwd xm1, xm2, xm0 |
| punpckhwd xm2, xm0 |
| .h8_w8: ; w8/w16/w32 |
| mova xm0, [rsp+16*2] |
| mova xm4, [rsp+16*3] |
| add rsp, 16*4 |
| punpcklwd xm3, xm4, xm0 |
| punpckhwd xm4, xm0 |
| punpckldq xm0, xm3, xm1 |
| punpckhdq xm3, xm1 |
| punpckldq xm1, xm4, xm2 |
| punpckhdq xm4, xm2 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movq [dstq+strideq*2], xm3 |
| movhps [dstq+r2 ], xm3 |
| movq [r6 +strideq*0], xm1 |
| movhps [r6 +strideq*1], xm1 |
| movq [r6 +strideq*2], xm4 |
| movhps [r6 +r2 ], xm4 |
| sub dstq, 8 |
| sub r6, 8 |
| sub org_wd, 8 |
| jge .h8_w8_loop |
| .h8_end: |
| RET |
| .h16_no_intra_edge_filter: |
| and maxbased, 15 |
| or maxbased, 16 ; imin(w+15, 31) |
| jmp .h16_main |
| ALIGN function_align |
| .h16: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -64, 12 |
| lea maxbased, [wq+15] |
| test angled, 0x400 |
| jnz .h16_no_intra_edge_filter |
| call .filter_strength |
| jz .h16_main ; filter_strength == 0 |
| vpbroadcastd m11, [base+pb_27] |
| vpbroadcastd m1, [base+pb_1] |
| vbroadcasti128 m6, [base+z_filter_s+12] |
| vinserti128 m2, m6, [base+z_filter_s+4], 0 |
| vinserti128 m6, [base+z_filter_s+20], 1 |
| movu xm10, [tlq-18] |
| vinserti128 m10, [tlq-14], 1 |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] |
| vbroadcasti128 m7, [base+z_filter_s+8] |
| vinserti128 m8, m7, [base+z_filter_s+0], 0 |
| vinserti128 m7, [base+z_filter_s+16], 1 |
| psubusb m11, m0 |
| por m1, m11 |
| movu xm11, [tlq-32] |
| vinserti128 m11, [tlq-28], 1 |
| pmaxub m8, m1 |
| pmaxub m7, m1 |
| pshufb m0, m10, m2 |
| shufps m2, m6, q2121 |
| pmaddubsw m0, m9 |
| pshufb m1, m11, m8 |
| shufps m8, m7, q2121 |
| pmaddubsw m1, m9 |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] |
| movzx r4d, byte [tlq-31] |
| pshufb m2, m10, m2 |
| pmaddubsw m2, m9 |
| pshufb m8, m11, m8 |
| pmaddubsw m8, m9 |
| paddw m0, m2 |
| paddw m1, m8 |
| sub r5d, 3 |
| jnz .h16_3tap |
| vpbroadcastd m9, [z_filter_k+4*8] |
| movzx r2d, byte [tlq-30] |
| pshufb m10, m6 |
| pmaddubsw m10, m9 |
| pshufb m11, m7 |
| pmaddubsw m11, m9 |
| sub r2d, r4d |
| lea r2d, [r2+r4*8+4] |
| shr r2d, 3 |
| mov [rsp+31], r2b |
| paddw m0, m10 |
| paddw m1, m11 |
| .h16_3tap: |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| sar r5d, 1 |
| lea tlq, [rsp+63] |
| add r5d, 33 |
| cmp wd, 16 |
| cmova maxbased, r5d |
| neg r5 |
| mov [tlq+r5], r4b |
| packuswb m0, m1 |
| vpermq m0, m0, q2031 |
| mova [tlq-31], m0 |
| .h16_main: |
| movd xm6, dyd |
| vbroadcasti128 m0, [z_base_inc] |
| mov r4, tlq |
| sub tlq, 8 |
| neg dyq |
| vpbroadcastw m6, xm6 |
| sub r4, maxbaseq |
| shl maxbased, 6 |
| vpbroadcastb m7, [r4] |
| lea r4, [dyq+63] |
| movd xm9, maxbased |
| not maxbased |
| vbroadcasti128 m8, [z3_shuf] |
| add maxbased, 64 |
| vpbroadcastw m9, xm9 |
| psubw m9, m0 |
| paddw m11, m6, m6 |
| psubw m10, m9, m3 ; 64*8 |
| vpblendd m6, m11, 0xf0 |
| .h16_loop: |
| lea r5, [r4+dyq] |
| sar r4, 6 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| movu xm0, [tlq+r4-0] |
| movu xm1, [tlq+r4-8] |
| lea r4, [r5+dyq] |
| sar r5, 6 |
| vinserti128 m0, [tlq+r5-0], 1 |
| vinserti128 m1, [tlq+r5-8], 1 |
| sub rsp, 32 |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| pcmpgtw m1, m9, m6 |
| pcmpgtw m2, m10, m6 |
| packsswb m1, m2 |
| paddw m6, m11 |
| vpblendvb m0, m7, m0, m1 |
| vpermq m0, m0, q3120 |
| mova [rsp], m0 |
| sub wd, 2 |
| jz .h16_transpose |
| cmp r4d, maxbased |
| jg .h16_loop |
| mova m0, m7 |
| .h16_end_loop: |
| sub rsp, 32 |
| mova [rsp], m7 |
| sub wd, 2 |
| jg .h16_end_loop |
| .h16_transpose: |
| mova m2, [rsp+32*1] |
| sub org_wd, 8 |
| lea r2, [strideq*3] |
| lea r6, [dstq+org_wq] |
| cmovg dstq, r6 |
| punpcklbw m1, m2, m0 |
| punpckhbw m2, m0 |
| lea r3, [strideq*5] |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| lea r4, [strideq+r2*2] ; stride*7 |
| jge .h16_w8 |
| add rsp, 32*2 |
| movd [dstq+strideq*0], xm0 |
| pextrd [dstq+strideq*1], xm0, 1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r2 ], xm0, 3 |
| vextracti128 xm0, m0, 1 |
| movd [dstq+strideq*4], xm1 |
| pextrd [dstq+r3 ], xm1, 1 |
| pextrd [dstq+r2*2 ], xm1, 2 |
| pextrd [dstq+r4 ], xm1, 3 |
| lea dstq, [dstq+strideq*8] |
| vextracti128 xm1, m1, 1 |
| movd [dstq+strideq*0], xm0 |
| pextrd [dstq+strideq*1], xm0, 1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r2 ], xm0, 3 |
| movd [dstq+strideq*4], xm1 |
| pextrd [dstq+r3 ], xm1, 1 |
| pextrd [dstq+r2*2 ], xm1, 2 |
| pextrd [dstq+r4 ], xm1, 3 |
| jmp .h16_end |
| .h16_w8_loop: |
| mova m0, [rsp+32*0] |
| mova m2, [rsp+32*1] |
| punpcklbw m1, m2, m0 |
| punpckhbw m2, m0 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| .h16_w8: |
| mova m2, [rsp+32*2] |
| mova m4, [rsp+32*3] |
| lea r6, [dstq+strideq*8] |
| add rsp, 32*4 |
| punpcklbw m3, m4, m2 |
| punpckhbw m4, m2 |
| punpcklbw m2, m3, m4 |
| punpckhbw m3, m4 |
| punpckldq m4, m2, m0 |
| punpckhdq m2, m0 |
| punpckldq m0, m3, m1 |
| punpckhdq m3, m1 |
| movq [dstq+strideq*0], xm4 |
| movhps [dstq+strideq*1], xm4 |
| vextracti128 xm4, m4, 1 |
| movq [dstq+strideq*2], xm2 |
| movhps [dstq+r2 ], xm2 |
| vextracti128 xm2, m2, 1 |
| movq [dstq+strideq*4], xm0 |
| movhps [dstq+r3 ], xm0 |
| vextracti128 xm0, m0, 1 |
| movq [dstq+r2*2 ], xm3 |
| movhps [dstq+r4 ], xm3 |
| vextracti128 xm3, m3, 1 |
| movq [r6+strideq*0], xm4 |
| movhps [r6+strideq*1], xm4 |
| movq [r6+strideq*2], xm2 |
| movhps [r6+r2 ], xm2 |
| movq [r6+strideq*4], xm0 |
| movhps [r6+r3 ], xm0 |
| movq [r6+r2*2 ], xm3 |
| movhps [r6+r4 ], xm3 |
| sub dstq, 8 |
| sub org_wd, 8 |
| jge .h16_w8_loop |
| .h16_end: |
| RET |
| ALIGN function_align |
| .h32: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -96, 15 |
| lea maxbased, [wq+31] |
| and maxbased, 31 |
| or maxbased, 32 ; imin(w+31, 63) |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .h32_main |
| vbroadcasti128 m0, [pb_0to15] |
| mov r4d, 21 |
| mov r5d, 3 |
| movu xm11, [tlq-66] ; 56-63 |
| vinserti128 m11, [tlq-52], 1 ; 40-47 |
| sub r4d, wd ; 21-w |
| cmovg r5d, r4d |
| movu xm12, [tlq-58] ; 48-55 |
| vinserti128 m12, [tlq-44], 1 ; 32-39 |
| sub r4d, 8 ; 13-w |
| movd xm1, r5d |
| movu xm13, [tlq-34] ; 24-31 |
| vinserti128 m13, [tlq-20], 1 ; 8-15 |
| movd xm2, r4d |
| vpbroadcastb m1, xm1 |
| movu xm14, [tlq-28] ; 16-23 |
| vinserti128 m14, [tlq-14], 1 ; 0- 7 |
| vpbroadcastb m2, xm2 |
| pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 |
| movu m7, [z_filter_s+4] |
| pshufb m11, m1 |
| vinserti128 m8, m7, [z_filter_s+8], 1 |
| vinserti128 m7, [z_filter_s+16], 0 |
| pmaxsb m2, m0 ; clip 8x32 |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| pshufb m12, m2 |
| pshufb m0, m11, m8 |
| pmaddubsw m0, m9 |
| pshufb m2, m12, m8 |
| pmaddubsw m2, m9 |
| pshufb m1, m13, m8 |
| pmaddubsw m1, m9 |
| shufps m8, m7, q1021 |
| pshufb m6, m14, m8 |
| pmaddubsw m6, m9 |
| vpbroadcastd m9, [z_filter_k+4*2+12*1] |
| pshufb m10, m11, m8 |
| pmaddubsw m10, m9 |
| paddw m0, m10 |
| pshufb m10, m12, m8 |
| pmaddubsw m10, m9 |
| paddw m2, m10 |
| pshufb m10, m13, m8 |
| pmaddubsw m10, m9 |
| shufps m8, m7, q2121 |
| paddw m1, m10 |
| pshufb m10, m14, m8 |
| pmaddubsw m10, m9 |
| paddw m6, m10 |
| vpbroadcastd m9, [z_filter_k+4*2+12*2] |
| pshufb m11, m8 |
| pmaddubsw m11, m9 |
| pshufb m12, m8 |
| pmaddubsw m12, m9 |
| movzx r4d, byte [tlq-63] |
| movzx r2d, byte [tlq-62] |
| paddw m0, m11 |
| paddw m2, m12 |
| pshufb m13, m8 |
| pmaddubsw m13, m9 |
| pshufb m14, m7 |
| pmaddubsw m14, m9 |
| paddw m1, m13 |
| paddw m6, m14 |
| sub r2d, r4d |
| lea r2d, [r2+r4*8+4] ; edge case for 64x32 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| shr r2d, 3 |
| mov [rsp+31], r2b |
| lea tlq, [rsp+95] |
| mov [tlq-65], r4b |
| mov r4d, 65 |
| cmp wd, 32 |
| cmova maxbased, r4d |
| packuswb m0, m2 |
| packuswb m1, m6 |
| mova [tlq-63], m0 |
| mova [tlq-31], m1 |
| .h32_main: |
| movd xm6, dyd |
| mov r4, tlq |
| sub tlq, 8 |
| neg dyq |
| vpbroadcastw m6, xm6 |
| sub r4, maxbaseq |
| shl maxbased, 6 |
| vpbroadcastb m7, [r4] |
| lea r4, [dyq+63] |
| movd xm9, maxbased |
| not maxbased |
| vbroadcasti128 m8, [z3_shuf] |
| add maxbased, 64 |
| vpbroadcastw m9, xm9 |
| psubw m9, [z_base_inc] |
| mova m11, m6 |
| psubw m10, m9, m3 ; 64*8 |
| .h32_loop: |
| mov r5, r4 |
| sar r5, 6 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| movu xm0, [tlq+r5- 0] |
| vinserti128 m0, [tlq+r5-16], 1 |
| movu xm1, [tlq+r5- 8] |
| vinserti128 m1, [tlq+r5-24], 1 |
| sub rsp, 32 |
| add r4, dyq |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| pcmpgtw m1, m9, m6 |
| pcmpgtw m2, m10, m6 |
| packsswb m1, m2 |
| paddw m6, m11 |
| vpblendvb m0, m7, m0, m1 |
| mova [rsp], m0 |
| dec wd |
| jz .h32_transpose |
| cmp r4d, maxbased |
| jg .h32_loop |
| .h32_end_loop: |
| sub rsp, 32 |
| mova [rsp], m7 |
| dec wd |
| jg .h32_end_loop |
| .h32_transpose: |
| lea dstq, [dstq+org_wq-8] |
| lea r2, [strideq*3] |
| lea r3, [strideq*5] |
| lea r4, [strideq+r2*2] ; stride*7 |
| .h32_w8_loop: |
| mova m7, [rsp+32*0] |
| mova m6, [rsp+32*1] |
| mova m5, [rsp+32*2] |
| mova m4, [rsp+32*3] |
| mova m3, [rsp+32*4] |
| mova m2, [rsp+32*5] |
| mova m1, [rsp+32*6] |
| mova m0, [rsp+32*7] |
| lea r6, [dstq+strideq*8] |
| add rsp, 32*8 |
| punpcklbw m8, m0, m1 |
| punpckhbw m0, m1 |
| punpcklbw m1, m2, m3 |
| punpckhbw m2, m3 |
| punpcklbw m3, m4, m5 |
| punpckhbw m4, m5 |
| punpcklbw m5, m6, m7 |
| punpckhbw m6, m7 |
| punpcklwd m7, m8, m1 |
| punpckhwd m8, m1 |
| punpcklwd m1, m0, m2 |
| punpckhwd m0, m2 |
| punpcklwd m2, m3, m5 |
| punpckhwd m3, m5 |
| punpcklwd m5, m4, m6 |
| punpckhwd m4, m6 |
| punpckldq m6, m7, m2 |
| punpckhdq m7, m2 |
| punpckldq m2, m8, m3 |
| punpckhdq m8, m3 |
| punpckldq m3, m1, m5 |
| punpckhdq m1, m5 |
| punpckldq m5, m0, m4 |
| punpckhdq m0, m4 |
| movq [dstq+strideq*0], xm6 |
| movhps [dstq+strideq*1], xm6 |
| vextracti128 xm6, m6, 1 |
| movq [dstq+strideq*2], xm7 |
| movhps [dstq+r2 ], xm7 |
| vextracti128 xm7, m7, 1 |
| movq [dstq+strideq*4], xm2 |
| movhps [dstq+r3 ], xm2 |
| vextracti128 xm2, m2, 1 |
| movq [dstq+r2*2 ], xm8 |
| movhps [dstq+r4 ], xm8 |
| vextracti128 xm8, m8, 1 |
| movq [r6+strideq*0], xm3 |
| movhps [r6+strideq*1], xm3 |
| vextracti128 xm3, m3, 1 |
| movq [r6+strideq*2], xm1 |
| movhps [r6+r2 ], xm1 |
| vextracti128 xm1, m1, 1 |
| movq [r6+strideq*4], xm5 |
| movhps [r6+r3 ], xm5 |
| vextracti128 xm5, m5, 1 |
| movq [r6+r2*2 ], xm0 |
| movhps [r6+r4 ], xm0 |
| lea r6, [r6+strideq*8] |
| vextracti128 xm0, m0, 1 |
| movq [r6+strideq*0], xm6 |
| movhps [r6+strideq*1], xm6 |
| movq [r6+strideq*2], xm7 |
| movhps [r6+r2 ], xm7 |
| movq [r6+strideq*4], xm2 |
| movhps [r6+r3 ], xm2 |
| movq [r6+r2*2 ], xm8 |
| movhps [r6+r4 ], xm8 |
| lea r6, [r6+strideq*8] |
| movq [r6+strideq*0], xm3 |
| movhps [r6+strideq*1], xm3 |
| movq [r6+strideq*2], xm1 |
| movhps [r6+r2 ], xm1 |
| movq [r6+strideq*4], xm5 |
| movhps [r6+r3 ], xm5 |
| movq [r6+r2*2 ], xm0 |
| movhps [r6+r4 ], xm0 |
| sub dstq, 8 |
| sub org_wd, 8 |
| jg .h32_w8_loop |
| RET |
| ALIGN function_align |
| .h64: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -128, 16 |
| lea maxbased, [wq+63] |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .h64_main |
| mov r4d, 21 |
| vpbroadcastb xm11, [tlq-127] |
| vpblendd xm11, [tlq-130], 0x0e ; 120-127 |
| sub r4d, wd ; 21-w |
| mov r5d, 3 |
| vinserti128 m11, [tlq-116], 1 ; 104-111 |
| movu m7, [z_filter_s+4] |
| cmp wd, 32 |
| cmove r4d, r5d |
| vinserti128 m8, m7, [z_filter_s+8], 1 |
| vbroadcasti128 m6, [pb_0to15] |
| movd xm1, r4d |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| movu xm12, [tlq-122] ; 112-119 |
| vinserti128 m12, [tlq-108], 1 ; 96-103 |
| vpbroadcastb m1, xm1 |
| movu xm13, [tlq- 98] ; 88- 95 |
| vinserti128 m13, [tlq- 84], 1 ; 72- 79 |
| movu xm14, [tlq- 90] ; 80- 87 |
| vinserti128 m14, [tlq- 76], 1 ; 64- 71 |
| vinserti128 m7, [z_filter_s+16], 0 |
| pshufb m0, m11, m8 |
| pmaddubsw m0, m9 |
| pshufb m2, m12, m8 |
| pmaddubsw m2, m9 |
| pmaxsb m1, m6 ; clip (16|32)x64 |
| pshufb m13, m1 |
| pshufb m1, m13, m8 |
| pmaddubsw m1, m9 |
| pshufb m6, m14, m8 |
| pmaddubsw m6, m9 |
| vpbroadcastd m9, [z_filter_k+4*2+12*1] |
| shufps m15, m8, m7, q1021 |
| pshufb m10, m11, m15 |
| pmaddubsw m10, m9 |
| paddw m0, m10 |
| pshufb m10, m12, m15 |
| pmaddubsw m10, m9 |
| paddw m2, m10 |
| pshufb m10, m13, m15 |
| pmaddubsw m10, m9 |
| paddw m1, m10 |
| pshufb m10, m14, m15 |
| pmaddubsw m10, m9 |
| paddw m6, m10 |
| vpbroadcastd m9, [z_filter_k+4*2+12*2] |
| shufps m10, m8, m7, q2132 |
| pshufb m11, m10 |
| pmaddubsw m11, m9 |
| pshufb m12, m10 |
| pmaddubsw m12, m9 |
| pshufb m13, m10 |
| pmaddubsw m13, m9 |
| pshufb m14, m10 |
| pmaddubsw m14, m9 |
| paddw m0, m11 |
| paddw m2, m12 |
| paddw m1, m13 |
| paddw m6, m14 |
| movu xm11, [tlq-66] ; 56-63 |
| vinserti128 m11, [tlq-52], 1 ; 40-47 |
| movu xm12, [tlq-58] ; 48-55 |
| vinserti128 m12, [tlq-44], 1 ; 32-39 |
| movu xm13, [tlq-34] ; 24-31 |
| vinserti128 m13, [tlq-20], 1 ; 8-15 |
| movu xm14, [tlq-28] ; 16-23 |
| vinserti128 m14, [tlq-14], 1 ; 0- 7 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| lea tlq, [rsp+127] |
| packuswb m0, m2 |
| packuswb m1, m6 |
| mova [tlq-127], m0 |
| mova [tlq- 95], m1 |
| pshufb m0, m11, m10 |
| pmaddubsw m0, m9 |
| pshufb m2, m12, m10 |
| pmaddubsw m2, m9 |
| pshufb m1, m13, m10 |
| pmaddubsw m1, m9 |
| pshufb m6, m14, m7 |
| pmaddubsw m6, m9 |
| vpbroadcastd m9, [z_filter_k+4*2+12*1] |
| pshufb m7, m11, m15 |
| pmaddubsw m7, m9 |
| paddw m0, m7 |
| pshufb m7, m12, m15 |
| pmaddubsw m7, m9 |
| paddw m2, m7 |
| pshufb m7, m13, m15 |
| pmaddubsw m7, m9 |
| paddw m1, m7 |
| pshufb m7, m14, m10 |
| pmaddubsw m7, m9 |
| paddw m6, m7 |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| pshufb m11, m8 |
| pmaddubsw m11, m9 |
| pshufb m12, m8 |
| pmaddubsw m12, m9 |
| pshufb m13, m8 |
| pmaddubsw m13, m9 |
| pshufb m14, m15 |
| pmaddubsw m14, m9 |
| paddw m0, m11 |
| paddw m2, m12 |
| paddw m1, m13 |
| paddw m6, m14 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| packuswb m0, m2 |
| packuswb m1, m6 |
| mova [tlq-63], m0 |
| mova [tlq-31], m1 |
| .h64_main: |
| movd xm12, dyd |
| neg maxbaseq |
| vbroadcasti128 m8, [z3_shuf] |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| vpbroadcastw m12, xm12 |
| lea r5d, [dyq+maxbaseq-64] |
| neg dyq |
| or maxbased, 63 |
| lea r4, [dyq+63] |
| movd xm6, r5d |
| mova xm10, [pb_1to32+16] |
| vinserti128 m10, [pb_1to32], 1 |
| vpbroadcastd m11, [pb_32] |
| vpbroadcastw m6, xm6 |
| .h64_loop: |
| mov r5, r4 |
| sar r5, 6 |
| movu m0, [tlq+r5-24] |
| movu m1, [tlq+r5-32] |
| pand m2, m4, m6 |
| psubw m9, m5, m2 |
| psllw m2, 8 |
| por m9, m2 |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m9 |
| pmaddubsw m1, m9 |
| psraw m2, m6, 6 |
| sub rsp, 64 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packsswb m2, m2 |
| paddb m2, m10 |
| packuswb m0, m1 |
| vpblendvb m0, m7, m0, m2 |
| mova [rsp+32], m0 |
| movu m0, [tlq+r5-56] |
| movu m1, [tlq+r5-64] |
| add r4, dyq |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m9 |
| pmaddubsw m1, m9 |
| paddb m2, m11 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| paddw m6, m12 |
| packuswb m0, m1 |
| vpblendvb m0, m7, m0, m2 |
| mova [rsp], m0 |
| dec wd |
| jz .h64_transpose |
| cmp r4d, maxbased |
| jg .h64_loop |
| .h64_end_loop: |
| sub rsp, 64 |
| mova [rsp+32], m7 |
| mova [rsp+ 0], m7 |
| dec wd |
| jg .h64_end_loop |
| .h64_transpose: |
| lea r2, [strideq*3] |
| lea r3, [strideq*5] |
| imul r5, strideq, -8 |
| lea dstq, [dstq+org_wq-16] |
| lea r4, [strideq+r2*2] ; stride*7 |
| .h64_transpose_loop0: |
| lea r6, [rsp+16*3] |
| .h64_transpose_loop: |
| mova xm0, [r6+64*15] |
| vinserti128 m0, [r6+64* 7], 1 |
| mova xm1, [r6+64*14] |
| vinserti128 m1, [r6+64* 6], 1 |
| mova xm2, [r6+64*13] |
| vinserti128 m2, [r6+64* 5], 1 |
| mova xm3, [r6+64*12] |
| vinserti128 m3, [r6+64* 4], 1 |
| mova xm4, [r6+64*11] |
| vinserti128 m4, [r6+64* 3], 1 |
| mova xm5, [r6+64*10] |
| vinserti128 m5, [r6+64* 2], 1 |
| mova xm6, [r6+64* 9] |
| vinserti128 m6, [r6+64* 1], 1 |
| mova xm7, [r6+64* 8] |
| vinserti128 m7, [r6+64* 0], 1 |
| sub r6, 16 |
| punpcklbw m8, m0, m1 |
| punpckhbw m0, m1 |
| punpcklbw m1, m2, m3 |
| punpckhbw m2, m3 |
| punpcklbw m3, m4, m5 |
| punpckhbw m4, m5 |
| punpcklbw m5, m6, m7 |
| punpckhbw m6, m7 |
| punpcklwd m7, m8, m1 |
| punpckhwd m8, m1 |
| punpcklwd m1, m0, m2 |
| punpckhwd m0, m2 |
| punpcklwd m2, m3, m5 |
| punpckhwd m3, m5 |
| punpcklwd m5, m4, m6 |
| punpckhwd m4, m6 |
| punpckldq m6, m7, m2 |
| punpckhdq m7, m2 |
| punpckldq m2, m8, m3 |
| punpckhdq m8, m3 |
| punpckldq m3, m1, m5 |
| punpckhdq m1, m5 |
| punpckldq m5, m0, m4 |
| punpckhdq m0, m4 |
| vpermq m6, m6, q3120 |
| vpermq m7, m7, q3120 |
| vpermq m2, m2, q3120 |
| vpermq m8, m8, q3120 |
| vpermq m3, m3, q3120 |
| vpermq m1, m1, q3120 |
| vpermq m5, m5, q3120 |
| vpermq m0, m0, q3120 |
| mova [dstq+strideq*0], xm6 |
| vextracti128 [dstq+strideq*1], m6, 1 |
| mova [dstq+strideq*2], xm7 |
| vextracti128 [dstq+r2 ], m7, 1 |
| mova [dstq+strideq*4], xm2 |
| vextracti128 [dstq+r3 ], m2, 1 |
| mova [dstq+r2*2 ], xm8 |
| vextracti128 [dstq+r4 ], m8, 1 |
| sub dstq, r5 |
| mova [dstq+strideq*0], xm3 |
| vextracti128 [dstq+strideq*1], m3, 1 |
| mova [dstq+strideq*2], xm1 |
| vextracti128 [dstq+r2 ], m1, 1 |
| mova [dstq+strideq*4], xm5 |
| vextracti128 [dstq+r3 ], m5, 1 |
| mova [dstq+r2*2 ], xm0 |
| vextracti128 [dstq+r4 ], m0, 1 |
| sub dstq, r5 |
| cmp r6, rsp |
| jae .h64_transpose_loop |
| add rsp, 64*16 |
| lea dstq, [dstq+r5*8-16] |
| sub org_wd, 16 |
| jg .h64_transpose_loop0 |
| .h64_end: |
| RET |
| |
| %macro FILTER_XMM 4 ; dst, src, tmp, shuf |
| %ifnum %4 |
| pshufb xm%2, xm%4 |
| %else |
| pshufb xm%2, %4 |
| %endif |
| pshufd xm%1, xm%2, q0000 ; p0 p1 |
| pmaddubsw xm%1, xm2 |
| pshufd xm%3, xm%2, q1111 ; p2 p3 |
| pmaddubsw xm%3, xm3 |
| paddw xm%1, xm1 |
| paddw xm%1, xm%3 |
| pshufd xm%3, xm%2, q2222 ; p4 p5 |
| pmaddubsw xm%3, xm4 |
| paddw xm%1, xm%3 |
| pshufd xm%3, xm%2, q3333 ; p6 __ |
| pmaddubsw xm%3, xm5 |
| paddw xm%1, xm%3 |
| psraw xm%1, 4 |
| packuswb xm%1, xm%1 |
| %endmacro |
| |
| %macro FILTER_YMM 4 ; dst, src, tmp, shuf |
| pshufb m%2, m%4 |
| pshufd m%1, m%2, q0000 |
| pmaddubsw m%1, m2 |
| pshufd m%3, m%2, q1111 |
| pmaddubsw m%3, m3 |
| paddw m%1, m1 |
| paddw m%1, m%3 |
| pshufd m%3, m%2, q2222 |
| pmaddubsw m%3, m4 |
| paddw m%1, m%3 |
| pshufd m%3, m%2, q3333 |
| pmaddubsw m%3, m5 |
| paddw m%1, m%3 |
| psraw m%1, 4 |
| vpermq m%3, m%1, q1032 |
| packuswb m%1, m%3 |
| %endmacro |
| |
| ; The ipred_filter SIMD processes 4x2 blocks in the following order which |
| ; increases parallelism compared to doing things row by row. One redundant |
| ; block is calculated for w8 and w16, two for w32. |
| ; w4 w8 w16 w32 |
| ; 1 1 2 1 2 3 5 1 2 3 5 b c d f |
| ; 2 2 3 2 4 5 7 2 4 5 7 c e f h |
| ; 3 3 4 4 6 7 9 4 6 7 9 e g h j |
| ; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ |
| ; 5 8 8 i |
| |
| cglobal ipred_filter, 3, 7, 0, dst, stride, tl, w, h, filter |
| %define base r6-ipred_filter_avx2_table |
| lea r6, [filter_intra_taps] |
| tzcnt wd, wm |
| %ifidn filterd, filterm |
| movzx filterd, filterb |
| %else |
| movzx filterd, byte filterm |
| %endif |
| shl filterd, 6 |
| add filterq, r6 |
| lea r6, [ipred_filter_avx2_table] |
| movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m1, [base+pw_8] |
| vbroadcasti128 m2, [filterq+16*0] |
| vbroadcasti128 m3, [filterq+16*1] |
| vbroadcasti128 m4, [filterq+16*2] |
| vbroadcasti128 m5, [filterq+16*3] |
| add wq, r6 |
| mov hd, hm |
| jmp wq |
| .w4: |
| WIN64_SPILL_XMM 9 |
| mova xm8, [base+filter_shuf2] |
| sub tlq, 3 |
| sub tlq, hq |
| jmp .w4_loop_start |
| .w4_loop: |
| pinsrd xm0, xm6, [tlq+hq], 0 |
| lea dstq, [dstq+strideq*2] |
| .w4_loop_start: |
| FILTER_XMM 6, 0, 7, 8 |
| movd [dstq+strideq*0], xm6 |
| pextrd [dstq+strideq*1], xm6, 1 |
| sub hd, 2 |
| jg .w4_loop |
| RET |
| ALIGN function_align |
| .w8: |
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 10 |
| mova m8, [base+filter_shuf1] |
| FILTER_XMM 7, 0, 6, [base+filter_shuf2] |
| vpbroadcastd m0, [tlq+4] |
| vpbroadcastd m6, [tlq+5] |
| sub tlq, 4 |
| sub tlq, hq |
| vpbroadcastq m7, xm7 |
| vpblendd m7, m6, 0x20 |
| .w8_loop: |
| vpbroadcastd xm6, [tlq+hq] |
| palignr m6, m0, 12 |
| vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ |
| ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| mova xm6, xm7 |
| call .main |
| vpblendd xm6, xm7, 0x0c |
| pshufd xm6, xm6, q3120 |
| movq [dstq+strideq*0], xm6 |
| movhps [dstq+strideq*1], xm6 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| %if WIN64 |
| %assign stack_offset stack_offset - stack_size_padded |
| %assign xmm_regs_used 15 |
| %assign stack_size_padded 0x98 |
| SUB rsp, stack_size_padded |
| %endif |
| sub hd, 2 |
| TAIL_CALL .w16_main, 0 |
| .w16_main: |
| %if WIN64 |
| movaps [rsp+0xa8], xmm6 |
| movaps [rsp+0xb8], xmm7 |
| movaps [rsp+0x28], xmm8 |
| movaps [rsp+0x38], xmm9 |
| movaps [rsp+0x48], xmm10 |
| movaps [rsp+0x58], xmm11 |
| movaps [rsp+0x68], xmm12 |
| movaps [rsp+0x78], xmm13 |
| movaps [rsp+0x88], xmm14 |
| %endif |
| FILTER_XMM 12, 0, 7, [base+filter_shuf2] |
| vpbroadcastd m0, [tlq+5] |
| vpblendd m0, [tlq-12], 0x14 |
| mova m8, [base+filter_shuf1] |
| vpbroadcastq m7, xm12 |
| vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ |
| ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| call .main ; c0 d0 a1 b1 a1 b1 c0 d0 |
| movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| vinserti128 m14, m8, [base+filter_shuf3], 0 |
| vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 |
| FILTER_XMM 6, 9, 10, 14 |
| vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 |
| vpbroadcastd m9, [tlq+13] |
| vpbroadcastd m10, [tlq+12] |
| psrld m11, m8, 4 |
| vpblendd m6, m9, 0x20 ; top |
| sub tlq, 6 |
| sub tlq, hq |
| .w16_loop: |
| vpbroadcastd xm9, [tlq+hq] |
| palignr m9, m0, 12 |
| vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ |
| ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| mova xm13, xm7 |
| call .main ; e0 f0 c1 d1 c1 d1 e0 f0 |
| vpblendd m9, m12, m10, 0xf0 |
| vpblendd m12, m6, 0xc0 |
| pshufd m9, m9, q3333 |
| vpblendd m9, m6, 0xee |
| vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 |
| vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 |
| vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 |
| vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 |
| mova [dstq+strideq*0], xm9 |
| vextracti128 [dstq+strideq*1], m9, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 |
| pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] |
| vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 |
| shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 |
| shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 |
| mova [dstq+strideq*0], xm0 |
| mova [dstq+strideq*1], xm6 |
| ret |
| ALIGN function_align |
| .w32: |
| sub rsp, stack_size_padded |
| sub hd, 2 |
| lea r3, [dstq+16] |
| lea r5d, [hq-2] |
| call .w16_main |
| add tlq, r5 |
| mov dstq, r3 |
| lea r3, [strideq-4] |
| lea r4, [r3+strideq*2] |
| movq xm0, [tlq+21] |
| pinsrd xm0, [dstq-4], 2 |
| pinsrd xm0, [dstq+r3*1], 3 |
| FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0 |
| movq xm7, [dstq+r3*2] |
| pinsrd xm7, [dstq+r4], 2 |
| palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6 |
| vpbroadcastd m0, [tlq+28] |
| vpbroadcastd m9, [tlq+29] |
| vbroadcasti128 m8, [base+filter_shuf1+16] |
| vpblendd m0, m9, 0x20 |
| vpblendd m0, m7, 0x0f |
| vpbroadcastq m7, xm12 |
| vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| call .main ; c0 d0 a1 b1 a1 b1 c0 d0 |
| add r3, 2 |
| lea r4, [r4+strideq*2] |
| movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 |
| FILTER_XMM 6, 9, 10, 14 |
| vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 |
| vpbroadcastd m9, [tlq+37] |
| vpbroadcastd m10, [tlq+36] |
| vpblendd m6, m9, 0x20 ; top |
| .w32_loop: |
| movq xm9, [dstq+r3*4] |
| pinsrd xm9, [dstq+r4], 2 |
| .w32_loop_last: |
| palignr m9, m0, 12 |
| vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| mova xm13, xm7 ; c0 d0 |
| call .main ; e0 f0 c1 d1 c1 d1 e0 f0 |
| vpblendd m9, m12, m10, 0xf0 |
| vpblendd m12, m6, 0xc0 |
| pshufd m9, m9, q3333 |
| vpblendd m9, m6, 0xee |
| vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 |
| vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 |
| vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 |
| vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 |
| mova [dstq+strideq*0], xm9 |
| vextracti128 [dstq+strideq*1], m9, 1 |
| lea dstq, [dstq+strideq*2] |
| sub r5d, 2 |
| jg .w32_loop |
| jz .w32_loop_last |
| vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 |
| pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 |
| FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] |
| vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 |
| shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 |
| shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 |
| mova [dstq+strideq*0], xm0 |
| mova [dstq+strideq*1], xm6 |
| RET |
| ALIGN function_align |
| .main: |
| FILTER_YMM 7, 0, 9, 8 |
| ret |
| |
| %if WIN64 |
| DECLARE_REG_TMP 5 |
| %else |
| DECLARE_REG_TMP 7 |
| %endif |
| |
| %macro IPRED_CFL 1 ; ac in, unpacked pixels out |
| psignw m3, m%1, m1 |
| pabsw m%1, m%1 |
| pmulhrsw m%1, m2 |
| psignw m%1, m3 |
| paddw m%1, m0 |
| %endmacro |
| |
| cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha |
| lea t0, [ipred_cfl_left_avx2_table] |
| tzcnt wd, wm |
| inc tlq |
| movu m0, [tlq] |
| movifnidn hd, hm |
| mov r6d, 0x8000 |
| shrx r6d, r6d, wd |
| movd xm3, r6d |
| movsxd r6, [t0+wq*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, t0 |
| add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table |
| movsxd wq, [t0+wq*4] |
| add wq, t0 |
| movifnidn acq, acmp |
| jmp r6 |
| |
| cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha |
| mov hd, hm ; zero upper half |
| tzcnt r6d, hd |
| sub tlq, hq |
| tzcnt wd, wm |
| movu m0, [tlq] |
| mov t0d, 0x8000 |
| shrx t0d, t0d, r6d |
| movd xm3, t0d |
| lea t0, [ipred_cfl_left_avx2_table] |
| movsxd r6, [t0+r6*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, t0 |
| add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table |
| movsxd wq, [t0+wq*4] |
| add wq, t0 |
| movifnidn acq, acmp |
| jmp r6 |
| .h32: |
| vextracti128 xm1, m0, 1 |
| paddw xm0, xm1 |
| .h16: |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| .h8: |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| .h4: |
| pmaddwd xm0, xm2 |
| pmulhrsw xm0, xm3 |
| vpbroadcastw m0, xm0 |
| jmp wq |
| |
| cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha |
| movifnidn hd, hm |
| movifnidn wd, wm |
| tzcnt r6d, hd |
| lea t0d, [wq+hq] |
| movd xm4, t0d |
| tzcnt t0d, t0d |
| movd xm5, t0d |
| lea t0, [ipred_cfl_avx2_table] |
| tzcnt wd, wd |
| movsxd r6, [t0+r6*4] |
| movsxd wq, [t0+wq*4+4*4] |
| pcmpeqd m3, m3 |
| psrlw xm4, 1 |
| add r6, t0 |
| add wq, t0 |
| movifnidn acq, acmp |
| jmp r6 |
| .h4: |
| movd xm0, [tlq-4] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w4: |
| movd xm1, [tlq+1] |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| cmp hd, 4 |
| jg .w4_mul |
| psrlw xm0, 3 |
| jmp .w4_end |
| .w4_mul: |
| punpckhqdq xm1, xm0, xm0 |
| lea r2d, [hq*2] |
| mov r6d, 0x55563334 |
| paddw xm0, xm1 |
| shrx r6d, r6d, r2d |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| movd xm1, r6d |
| psrlw xm0, 2 |
| pmulhuw xm0, xm1 |
| .w4_end: |
| vpbroadcastw m0, xm0 |
| .s4: |
| vpbroadcastw m1, alpham |
| lea r6, [strideq*3] |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s4_loop: |
| mova m4, [acq] |
| IPRED_CFL 4 |
| packuswb m4, m4 |
| vextracti128 xm5, m4, 1 |
| movd [dstq+strideq*0], xm4 |
| pextrd [dstq+strideq*1], xm4, 1 |
| movd [dstq+strideq*2], xm5 |
| pextrd [dstq+r6 ], xm5, 1 |
| lea dstq, [dstq+strideq*4] |
| add acq, 32 |
| sub hd, 4 |
| jg .s4_loop |
| RET |
| ALIGN function_align |
| .h8: |
| movq xm0, [tlq-8] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w8: |
| movq xm1, [tlq+1] |
| vextracti128 xm2, m0, 1 |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm2 |
| punpckhqdq xm2, xm0, xm0 |
| paddw xm0, xm2 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 8 |
| je .w8_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| cmp hd, 32 |
| cmovz r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w8_end: |
| vpbroadcastw m0, xm0 |
| .s8: |
| vpbroadcastw m1, alpham |
| lea r6, [strideq*3] |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s8_loop: |
| mova m4, [acq] |
| mova m5, [acq+32] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| vextracti128 xm5, m4, 1 |
| movq [dstq+strideq*0], xm4 |
| movq [dstq+strideq*1], xm5 |
| movhps [dstq+strideq*2], xm4 |
| movhps [dstq+r6 ], xm5 |
| lea dstq, [dstq+strideq*4] |
| add acq, 64 |
| sub hd, 4 |
| jg .s8_loop |
| RET |
| ALIGN function_align |
| .h16: |
| mova xm0, [tlq-16] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w16: |
| movu xm1, [tlq+1] |
| vextracti128 xm2, m0, 1 |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm2 |
| paddw xm0, xm1 |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 16 |
| je .w16_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hb, 8|32 |
| cmovz r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w16_end: |
| vpbroadcastw m0, xm0 |
| .s16: |
| vpbroadcastw m1, alpham |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s16_loop: |
| mova m4, [acq] |
| mova m5, [acq+32] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| vpermq m4, m4, q3120 |
| mova [dstq+strideq*0], xm4 |
| vextracti128 [dstq+strideq*1], m4, 1 |
| lea dstq, [dstq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .s16_loop |
| RET |
| ALIGN function_align |
| .h32: |
| mova m0, [tlq-32] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w32: |
| movu m1, [tlq+1] |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| vextracti128 xm1, m0, 1 |
| psubw xm0, xm4 |
| paddw xm0, xm1 |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 32 |
| je .w32_end |
| lea r2d, [hq*2] |
| mov r6d, 0x33345556 |
| shrx r6d, r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w32_end: |
| vpbroadcastw m0, xm0 |
| .s32: |
| vpbroadcastw m1, alpham |
| pabsw m2, m1 |
| psllw m2, 9 |
| .s32_loop: |
| mova m4, [acq] |
| mova m5, [acq+32] |
| IPRED_CFL 4 |
| IPRED_CFL 5 |
| packuswb m4, m5 |
| vpermq m4, m4, q3120 |
| mova [dstq], m4 |
| add dstq, strideq |
| add acq, 64 |
| dec hd |
| jg .s32_loop |
| RET |
| |
| cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha |
| lea t0, [ipred_cfl_splat_avx2_table] |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [t0+wq*4] |
| vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128] |
| add wq, t0 |
| movifnidn acq, acmp |
| jmp wq |
| |
| cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak |
| movifnidn hpadd, hpadm |
| movifnidn wd, wm |
| mov hd, hm |
| mov szd, wd |
| mov ac_bakq, acq |
| imul szd, hd |
| shl hpadd, 2 |
| sub hd, hpadd |
| vpbroadcastd m2, [pb_2] |
| pxor m4, m4 |
| cmp wd, 8 |
| jg .w16 |
| je .w8 |
| ; fall-through |
| |
| DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak |
| .w4: |
| lea stride3q, [strideq*3] |
| .w4_loop: |
| movq xm0, [yq] |
| movq xm1, [yq+strideq] |
| movhps xm0, [yq+strideq*2] |
| movhps xm1, [yq+stride3q] |
| pmaddubsw xm0, xm2 |
| pmaddubsw xm1, xm2 |
| paddw xm0, xm1 |
| mova [acq], xm0 |
| paddw xm4, xm0 |
| lea yq, [yq+strideq*4] |
| add acq, 16 |
| sub hd, 2 |
| jg .w4_loop |
| test hpadd, hpadd |
| jz .calc_avg |
| vpermq m0, m0, q1111 |
| .w4_hpad_loop: |
| mova [acq], m0 |
| paddw m4, m0 |
| add acq, 32 |
| sub hpadd, 4 |
| jg .w4_hpad_loop |
| jmp .calc_avg |
| |
| .w8: |
| lea stride3q, [strideq*3] |
| test wpadd, wpadd |
| jnz .w8_wpad |
| .w8_loop: |
| mova xm0, [yq] |
| mova xm1, [yq+strideq] |
| vinserti128 m0, [yq+strideq*2], 1 |
| vinserti128 m1, [yq+stride3q], 1 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| mova [acq], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*4] |
| add acq, 32 |
| sub hd, 2 |
| jg .w8_loop |
| test hpadd, hpadd |
| jz .calc_avg |
| jmp .w8_hpad |
| .w8_wpad: |
| vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] |
| .w8_wpad_loop: |
| movq xm0, [yq] |
| movq xm1, [yq+strideq] |
| vinserti128 m0, [yq+strideq*2], 1 |
| vinserti128 m1, [yq+stride3q], 1 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| pshufb m0, m3 |
| mova [acq], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*4] |
| add acq, 32 |
| sub hd, 2 |
| jg .w8_wpad_loop |
| test hpadd, hpadd |
| jz .calc_avg |
| .w8_hpad: |
| vpermq m0, m0, q3232 |
| .w8_hpad_loop: |
| mova [acq], m0 |
| paddw m4, m0 |
| add acq, 32 |
| sub hpadd, 2 |
| jg .w8_hpad_loop |
| jmp .calc_avg |
| |
| .w16: |
| test wpadd, wpadd |
| jnz .w16_wpad |
| .w16_loop: |
| mova m0, [yq] |
| mova m1, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| mova [acq], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 32 |
| dec hd |
| jg .w16_loop |
| test hpadd, hpadd |
| jz .calc_avg |
| jmp .w16_hpad_loop |
| .w16_wpad: |
| DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak |
| lea iptrq, [ipred_cfl_ac_420_avx2_table] |
| shl wpadd, 2 |
| mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ |
| ipred_cfl_ac_420_avx2_table+wpadq*8-32] |
| movsxd wpadq, [iptrq+wpadq+4] |
| add iptrq, wpadq |
| jmp iptrq |
| .w16_pad3: |
| vpbroadcastq m0, [yq] |
| vpbroadcastq m1, [yq+strideq] |
| jmp .w16_wpad_end |
| .w16_pad2: |
| vbroadcasti128 m0, [yq] |
| vbroadcasti128 m1, [yq+strideq] |
| jmp .w16_wpad_end |
| .w16_pad1: |
| mova m0, [yq] |
| mova m1, [yq+strideq] |
| ; fall-through |
| .w16_wpad_end: |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| pshufb m0, m3 |
| mova [acq], m0 |
| paddw m4, m0 |
| lea yq, [yq+strideq*2] |
| add acq, 32 |
| dec hd |
| jz .w16_wpad_done |
| jmp iptrq |
| .w16_wpad_done: |
| test hpadd, hpadd |
| jz .calc_avg |
| .w16_hpad_loop: |
| mova [acq], m0 |
| paddw m4, m0 |
| add acq, 32 |
| dec hpadd |
| jg .w16_hpad_loop |
| ; fall-through |
| |
| .calc_avg: |
| vpbroadcastd m2, [pw_1] |
| pmaddwd m0, m4, m2 |
| vextracti128 xm1, m0, 1 |
| tzcnt r1d, szd |
| paddd xm0, xm1 |
| movd xm2, r1d |
| movd xm3, szd |
| punpckhqdq xm1, xm0, xm0 |
| paddd xm0, xm1 |
| psrad xm3, 1 |
| psrlq xm1, xm0, 32 |
| paddd xm0, xm3 |
| paddd xm0, xm1 |
| psrad xm0, xm2 |
| vpbroadcastw m0, xm0 |
| .sub_loop: |
| mova m1, [ac_bakq] |
| psubw m1, m0 |
| mova [ac_bakq], m1 |
| add ac_bakq, 32 |
| sub szd, 16 |
| jg .sub_loop |
| RET |
| |
| cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak |
| movifnidn hpadd, hpadm |
| movifnidn wd, wm |
| mov hd, hm |
| mov szd, wd |
| mov ac_bakq, acq |
| imul szd, hd |
| shl hpadd, 2 |
| sub hd, hpadd |
| vpbroadcastd m2, [pb_4] |
| pxor m4, m4 |
| pxor m5, m5 |
| cmp wd, 8 |
| jg .w16 |
| je .w8 |
| ; fall-through |
| |
| DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak |
| .w4: |
| lea stride3q, [strideq*3] |
| .w4_loop: |
| movq xm1, [yq] |
| movhps xm1, [yq+strideq] |
| movq xm0, [yq+strideq*2] |
| movhps xm0, [yq+stride3q] |
| pmaddubsw xm0, xm2 |
| pmaddubsw xm1, xm2 |
| mova [acq], xm1 |
| mova [acq+16], xm0 |
| paddw xm4, xm0 |
| paddw xm5, xm1 |
| lea yq, [yq+strideq*4] |
| add acq, 32 |
| sub hd, 4 |
| jg .w4_loop |
| test hpadd, hpadd |
| jz .calc_avg |
| vpermq m0, m0, q1111 |
| .w4_hpad_loop: |
| mova [acq], m0 |
| paddw m4, m0 |
| add acq, 32 |
| sub hpadd, 4 |
| jg .w4_hpad_loop |
| jmp .calc_avg |
| |
| .w8: |
| lea stride3q, [strideq*3] |
| test wpadd, wpadd |
| jnz .w8_wpad |
| .w8_loop: |
| mova xm1, [yq] |
| vinserti128 m1, [yq+strideq], 1 |
| mova xm0, [yq+strideq*2] |
| vinserti128 m0, [yq+stride3q], 1 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| mova [acq+32], m0 |
| paddw m4, m0 |
| paddw m5, m1 |
| lea yq, [yq+strideq*4] |
| add acq, 64 |
| sub hd, 4 |
| jg .w8_loop |
| test hpadd, hpadd |
| jz .calc_avg |
| jmp .w8_hpad |
| .w8_wpad: |
| vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] |
| .w8_wpad_loop: |
| movq xm1, [yq] |
| vinserti128 m1, [yq+strideq], 1 |
| movq xm0, [yq+strideq*2] |
| vinserti128 m0, [yq+stride3q], 1 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| pshufb m0, m3 |
| pshufb m1, m3 |
| mova [acq], m1 |
| mova [acq+32], m0 |
| paddw m4, m0 |
| paddw m5, m1 |
| lea yq, [yq+strideq*4] |
| add acq, 64 |
| sub hd, 4 |
| jg .w8_wpad_loop |
| test hpadd, hpadd |
| jz .calc_avg |
| .w8_hpad: |
| vpermq m0, m0, q3232 |
| .w8_hpad_loop: |
| mova [acq], m0 |
| paddw m4, m0 |
| add acq, 32 |
| sub hpadd, 2 |
| jg .w8_hpad_loop |
| jmp .calc_avg |
| |
| .w16: |
| test wpadd, wpadd |
| jnz .w16_wpad |
| .w16_loop: |
| mova m1, [yq] |
| mova m0, [yq+strideq] |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| mova [acq], m1 |
| mova [acq+32], m0 |
| paddw m4, m0 |
| paddw m5, m1 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jg .w16_loop |
| test hpadd, hpadd |
| jz .calc_avg |
| jmp .w16_hpad_loop |
| .w16_wpad: |
| DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak |
| lea iptrq, [ipred_cfl_ac_422_avx2_table] |
| shl wpadd, 2 |
| mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ |
| ipred_cfl_ac_422_avx2_table+wpadq*8-32] |
| movsxd wpadq, [iptrq+wpadq+4] |
| add iptrq, wpadq |
| jmp iptrq |
| .w16_pad3: |
| vpbroadcastq m1, [yq] |
| vpbroadcastq m0, [yq+strideq] |
| jmp .w16_wpad_end |
| .w16_pad2: |
| vbroadcasti128 m1, [yq] |
| vbroadcasti128 m0, [yq+strideq] |
| jmp .w16_wpad_end |
| .w16_pad1: |
| mova m1, [yq] |
| mova m0, [yq+strideq] |
| ; fall-through |
| .w16_wpad_end: |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| pshufb m0, m3 |
| pshufb m1, m3 |
| mova [acq], m1 |
| mova [acq+32], m0 |
| paddw m4, m0 |
| paddw m5, m1 |
| lea yq, [yq+strideq*2] |
| add acq, 64 |
| sub hd, 2 |
| jz .w16_wpad_done |
| jmp iptrq |
| .w16_wpad_done: |
| test hpadd, hpadd |
| jz .calc_avg |
| .w16_hpad_loop: |
| mova [acq], m0 |
| mova [acq+32], m0 |
| paddw m4, m0 |
| paddw m5, m0 |
| add acq, 64 |
| sub hpadd, 2 |
| jg .w16_hpad_loop |
| ; fall-through |
| |
| .calc_avg: |
| vpbroadcastd m2, [pw_1] |
| pmaddwd m5, m5, m2 |
| pmaddwd m0, m4, m2 |
| paddd m0, m5 |
| vextracti128 xm1, m0, 1 |
| tzcnt r1d, szd |
| paddd xm0, xm1 |
| movd xm2, r1d |
| movd xm3, szd |
| punpckhqdq xm1, xm0, xm0 |
| paddd xm0, xm1 |
| psrad xm3, 1 |
| psrlq xm1, xm0, 32 |
| paddd xm0, xm3 |
| paddd xm0, xm1 |
| psrad xm0, xm2 |
| vpbroadcastw m0, xm0 |
| .sub_loop: |
| mova m1, [ac_bakq] |
| psubw m1, m0 |
| mova [ac_bakq], m1 |
| add ac_bakq, 32 |
| sub szd, 16 |
| jg .sub_loop |
| RET |
| |
| cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h |
| vbroadcasti128 m4, [palq] |
| lea r2, [pal_pred_avx2_table] |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r2+wq*4] |
| packuswb m4, m4 |
| add wq, r2 |
| lea r2, [strideq*3] |
| jmp wq |
| .w4: |
| pshufb xm0, xm4, [idxq] |
| add idxq, 16 |
| movd [dstq+strideq*0], xm0 |
| pextrd [dstq+strideq*1], xm0, 1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r2 ], xm0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4 |
| RET |
| ALIGN function_align |
| .w8: |
| pshufb xm0, xm4, [idxq+16*0] |
| pshufb xm1, xm4, [idxq+16*1] |
| add idxq, 16*2 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| movq [dstq+strideq*2], xm1 |
| movhps [dstq+r2 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8 |
| RET |
| ALIGN function_align |
| .w16: |
| pshufb m0, m4, [idxq+32*0] |
| pshufb m1, m4, [idxq+32*1] |
| add idxq, 32*2 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| mova [dstq+strideq*2], xm1 |
| vextracti128 [dstq+r2 ], m1, 1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w16 |
| RET |
| ALIGN function_align |
| .w32: |
| pshufb m0, m4, [idxq+32*0] |
| pshufb m1, m4, [idxq+32*1] |
| pshufb m2, m4, [idxq+32*2] |
| pshufb m3, m4, [idxq+32*3] |
| add idxq, 32*4 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+r2 ], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w32 |
| RET |
| ALIGN function_align |
| .w64: |
| pshufb m0, m4, [idxq+32*0] |
| pshufb m1, m4, [idxq+32*1] |
| pshufb m2, m4, [idxq+32*2] |
| pshufb m3, m4, [idxq+32*3] |
| add idxq, 32*4 |
| mova [dstq+strideq*0+32*0], m0 |
| mova [dstq+strideq*0+32*1], m1 |
| mova [dstq+strideq*1+32*0], m2 |
| mova [dstq+strideq*1+32*1], m3 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w64 |
| RET |
| |
| %endif |