| ; Copyright © 2018-2021, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 64 |
| |
| %macro SMOOTH_WEIGHT_TABLE 1-* |
| %rep %0 |
| db %1-128, 127-%1 |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| ; sm_weights[], but modified to precalculate x and 256-x with offsets to |
| ; enable efficient use of pmaddubsw (which requires signed values) |
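; Illustrative arithmetic (not stored in the table itself): for a weight w and
; two edge pixels a and b,
;   w*a + (256-w)*b = (w-128)*a + (127-w)*b + 128*a + 129*b
; so each entry stores the signed byte pair (w-128, 127-w), letting a single
; pmaddubsw against interleaved (a, b) bytes produce the weight-dependent part,
; while the pixel-only remainder 128*a + 129*b is added separately (see the
; SMOOTH macro and the pb_127_m127/pw_128 constants below).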
| smooth_weights: SMOOTH_WEIGHT_TABLE \ |
| 0, 0, 255, 128, 255, 149, 85, 64, \ |
| 255, 197, 146, 105, 73, 50, 37, 32, \ |
| 255, 225, 196, 170, 145, 123, 102, 84, \ |
| 68, 54, 43, 33, 26, 20, 17, 16, \ |
| 255, 240, 225, 210, 196, 182, 169, 157, \ |
| 145, 133, 122, 111, 101, 92, 83, 74, \ |
| 66, 59, 52, 45, 39, 34, 29, 25, \ |
| 21, 17, 14, 12, 10, 9, 8, 8, \ |
| 255, 248, 240, 233, 225, 218, 210, 203, \ |
| 196, 189, 182, 176, 169, 163, 156, 150, \ |
| 144, 138, 133, 127, 121, 116, 111, 106, \ |
| 101, 96, 91, 86, 82, 77, 73, 69, \ |
| 65, 61, 57, 54, 50, 47, 44, 41, \ |
| 38, 35, 32, 29, 27, 25, 22, 20, \ |
| 18, 16, 15, 13, 12, 10, 9, 8, \ |
| 7, 6, 6, 5, 5, 4, 4, 4 |
| |
| pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 |
| pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 |
| pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 |
| z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 |
| db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 |
| z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 |
| db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 |
| db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 |
| z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 |
| db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 |
| db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line |
pb_128: times 4 db 128 ; these are just placed here for alignment.
| pb_36_m4: times 2 db 36, -4 |
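; pb_36_m4 is the (-1, 9, 9, -1)/16 intra edge upsampling kernel scaled by 4,
; so the shared pmulhrsw with pw_512 ((x+32)>>6) gives the same result as the
; spec's (x+8)>>4 rounding.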
| z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 |
| z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 |
| z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 |
| z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 |
| z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 |
| z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 |
| z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 |
| z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 |
| z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 |
| z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 |
| z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 |
| z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 |
| dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 |
| z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 |
| dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 |
| z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 |
| db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 |
| ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 |
| filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 |
| db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 |
| filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 |
filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
| pb_127_m127: times 2 db 127, -127 |
| ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 |
| db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 |
| ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 |
db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0
| pw_64: times 2 dw 64 |
| |
| cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 |
| times 9 db 7, -1 |
| cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 |
| db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| ; w=8, w_pad=1 as well as second half of previous one |
| cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 |
| times 5 db 6, 7 |
| ; w=16,w_pad=2 |
| db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| times 8 db 14, 15 |
| ; w=16,w_pad=3 |
| db 0, 1, 2, 3, 4, 5 |
| times 13 db 6, 7 |
| pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
| |
| %define pb_0to15 cfl_ac_w16_pad_shuffle |
| %define pb_1 (ipred_h_shuf+12) |
| %define pb_2 (ipred_h_shuf+20) |
| %define pb_3 (ipred_h_shuf+ 4) |
| %define pb_4 (ipred_h_shuf+24) |
| %define pb_5 (ipred_h_shuf+ 8) |
| %define pb_7 (ipred_h_shuf+ 0) |
| %define pb_8 (z_upsample2 +12) |
| %define pb_12 (z2_y_shuf_h4+20) |
| %define pb_14 (z2_y_shuf_h4+ 4) |
| %define pb_15 (z_filter_s +32) |
| %define pb_27 (z2_y_shuf_h4+ 8) |
| %define pb_31 (z2_y_shuf_h4+12) |
| %define pb_32 (z2_y_shuf_h4+16) |
| %define pb_90 (z2_y_shuf_h4+ 0) |
| %define pw_1 (z2_y_shuf_h4+24) |
| %define pw_8 (z_filter_k +32) |
| |
| pw_62: times 2 dw 62 |
| pw_128: times 2 dw 128 |
| pw_255: times 2 dw 255 |
| pw_512: times 2 dw 512 |
| |
| %macro JMP_TABLE 3-* |
| %xdefine %1_%2_table (%%table - 2*4) |
| %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) |
| %%table: |
| %rep %0 - 2 |
| dd %%base %+ .%3 - (%%table - 2*4) |
| %rotate 1 |
| %endrep |
| %endmacro |
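; Each entry is the 32-bit offset of a branch label relative to the exported
; table symbol, which deliberately points 2*4 bytes before the first entry so
; that tzcnt of the block dimension (4 -> 2, ..., 64 -> 6) indexes it directly.
; Dispatch then reduces to:
;   movsxd wq, [table+wq*4]
;   add    wq, table
;   jmp    wq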
| |
| %define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) |
| %define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) |
| |
| JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 |
| JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ |
| s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 |
| JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 |
| JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 |
| JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ |
| s4-8*4, s8-8*4, s16-8*4, s32-8*4 |
| JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 |
| JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 |
| JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 |
| JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 |
| JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 |
| |
| cextern dr_intra_derivative |
| cextern filter_intra_taps |
| |
| SECTION .text |
| |
| INIT_YMM avx2 |
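; ipred_dc_top and ipred_dc_left share their code: the first jump table picks
; one of the .h* reduction paths (indexed by the width for dc_top, the height
; for dc_left), which sum the edge pixels and divide by the edge length with
; rounding (pmulhrsw by xm3 = 0x8000>>log2(n)); the second table then reuses
; ipred_dc's .s* store loops to splat the result.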
| cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h |
| lea r5, [ipred_dc_left_avx2_table] |
| tzcnt wd, wm |
| inc tlq |
| movu m0, [tlq] |
| movifnidn hd, hm |
| mov r6d, 0x8000 |
| shrx r6d, r6d, wd |
| movd xm3, r6d |
| movsxd r6, [r5+wq*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, r5 |
| add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| |
| cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| mov hd, hm ; zero upper half |
| tzcnt r6d, hd |
| sub tlq, hq |
| tzcnt wd, wm |
| movu m0, [tlq] |
| mov r5d, 0x8000 |
| shrx r5d, r5d, r6d |
| movd xm3, r5d |
| lea r5, [ipred_dc_left_avx2_table] |
| movsxd r6, [r5+r6*4] |
| pcmpeqd m2, m2 |
| pmaddubsw m0, m2 |
| add r6, r5 |
| add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| .h64: |
| movu m1, [tlq+32] ; unaligned when jumping here from dc_top |
| pmaddubsw m1, m2 |
| paddw m0, m1 |
| .h32: |
| vextracti128 xm1, m0, 1 |
| paddw xm0, xm1 |
| .h16: |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| .h8: |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| .h4: |
| pmaddwd xm0, xm2 |
| pmulhrsw xm0, xm3 |
| lea stride3q, [strideq*3] |
| vpbroadcastb m0, xm0 |
| mova m1, m0 |
| jmp wq |
| |
| cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 |
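; The DC value is (sum of the w top and h left neighbours) / (w+h). For square
; blocks this is a plain shift; for rectangular blocks w+h is 3*2^k or 5*2^k,
; so the sum is shifted by tzcnt(w+h) and then scaled by a fixed-point
; reciprocal with pmulhuw: 0x5556 ~ 65536/3, 0x3334 ~ 65536/5 (the packed
; 0x55563334/0x33345556 constants below pick one of the two via shrx by the
; block height).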
| movifnidn hd, hm |
| movifnidn wd, wm |
| tzcnt r6d, hd |
| lea r5d, [wq+hq] |
| movd xm4, r5d |
| tzcnt r5d, r5d |
| movd xm5, r5d |
| lea r5, [ipred_dc_avx2_table] |
| tzcnt wd, wd |
| movsxd r6, [r5+r6*4] |
| movsxd wq, [r5+wq*4+5*4] |
| pcmpeqd m3, m3 |
| psrlw xm4, 1 |
| add r6, r5 |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp r6 |
| .h4: |
| movd xm0, [tlq-4] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w4: |
| movd xm1, [tlq+1] |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| cmp hd, 4 |
| jg .w4_mul |
| psrlw xm0, 3 |
| jmp .w4_end |
| .w4_mul: |
| punpckhqdq xm1, xm0, xm0 |
| lea r2d, [hq*2] |
| mov r6d, 0x55563334 |
| paddw xm0, xm1 |
| shrx r6d, r6d, r2d |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| movd xm1, r6d |
| psrlw xm0, 2 |
| pmulhuw xm0, xm1 |
| .w4_end: |
| vpbroadcastb xm0, xm0 |
| .s4: |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm0 |
| movd [dstq+strideq*2], xm0 |
| movd [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s4 |
| RET |
| ALIGN function_align |
| .h8: |
| movq xm0, [tlq-8] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w8: |
| movq xm1, [tlq+1] |
| vextracti128 xm2, m0, 1 |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm2 |
| punpckhqdq xm2, xm0, xm0 |
| paddw xm0, xm2 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 8 |
| je .w8_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| cmp hd, 32 |
| cmove r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w8_end: |
| vpbroadcastb xm0, xm0 |
| .s8: |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm0 |
| movq [dstq+strideq*2], xm0 |
| movq [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s8 |
| RET |
| ALIGN function_align |
| .h16: |
| mova xm0, [tlq-16] |
| pmaddubsw xm0, xm3 |
| jmp wq |
| .w16: |
| movu xm1, [tlq+1] |
| vextracti128 xm2, m0, 1 |
| pmaddubsw xm1, xm3 |
| psubw xm0, xm4 |
| paddw xm0, xm2 |
| paddw xm0, xm1 |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 16 |
| je .w16_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hb, 8|32 |
| cmovz r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w16_end: |
| vpbroadcastb xm0, xm0 |
| .s16: |
| mova [dstq+strideq*0], xm0 |
| mova [dstq+strideq*1], xm0 |
| mova [dstq+strideq*2], xm0 |
| mova [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s16 |
| RET |
| ALIGN function_align |
| .h32: |
| mova m0, [tlq-32] |
| pmaddubsw m0, m3 |
| jmp wq |
| .w32: |
| movu m1, [tlq+1] |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| vextracti128 xm1, m0, 1 |
| psubw xm0, xm4 |
| paddw xm0, xm1 |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 32 |
| je .w32_end |
| lea r2d, [hq*2] |
| mov r6d, 0x33345556 |
| shrx r6d, r6d, r2d |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w32_end: |
| vpbroadcastb m0, xm0 |
| .s32: |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m0 |
| mova [dstq+strideq*2], m0 |
| mova [dstq+stride3q ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s32 |
| RET |
| ALIGN function_align |
| .h64: |
| mova m0, [tlq-64] |
| mova m1, [tlq-32] |
| pmaddubsw m0, m3 |
| pmaddubsw m1, m3 |
| paddw m0, m1 |
| jmp wq |
| .w64: |
| movu m1, [tlq+ 1] |
| movu m2, [tlq+33] |
| pmaddubsw m1, m3 |
| pmaddubsw m2, m3 |
| paddw m0, m1 |
| paddw m0, m2 |
| vextracti128 xm1, m0, 1 |
| psubw xm0, xm4 |
| paddw xm0, xm1 |
| punpckhqdq xm1, xm0, xm0 |
| paddw xm0, xm1 |
| psrlq xm1, xm0, 32 |
| paddw xm0, xm1 |
| pmaddwd xm0, xm3 |
| psrlw xm0, xm5 |
| cmp hd, 64 |
| je .w64_end |
| mov r6d, 0x33345556 |
| shrx r6d, r6d, hd |
| movd xm1, r6d |
| pmulhuw xm0, xm1 |
| .w64_end: |
| vpbroadcastb m0, xm0 |
| mova m1, m0 |
| .s64: |
| mova [dstq+strideq*0+32*0], m0 |
| mova [dstq+strideq*0+32*1], m1 |
| mova [dstq+strideq*1+32*0], m0 |
| mova [dstq+strideq*1+32*1], m1 |
| mova [dstq+strideq*2+32*0], m0 |
| mova [dstq+strideq*2+32*1], m1 |
| mova [dstq+stride3q +32*0], m0 |
| mova [dstq+stride3q +32*1], m1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s64 |
| RET |
| |
| cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_dc_splat_avx2_table] |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] |
| mova m1, m0 |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
| cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_dc_splat_avx2_table] |
| tzcnt wd, wm |
| movu m0, [tlq+ 1] |
| movu m1, [tlq+33] |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
| %macro IPRED_H 2 ; w, store_type |
| vpbroadcastb m0, [tlq-1] |
| vpbroadcastb m1, [tlq-2] |
| vpbroadcastb m2, [tlq-3] |
| sub tlq, 4 |
| vpbroadcastb m3, [tlq+0] |
| mov%2 [dstq+strideq*0], m0 |
| mov%2 [dstq+strideq*1], m1 |
| mov%2 [dstq+strideq*2], m2 |
| mov%2 [dstq+stride3q ], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w%1 |
| RET |
| ALIGN function_align |
| %endmacro |
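; Horizontal prediction fills row y with the left neighbour tl[-(y+1)]; each
; IPRED_H invocation emits one 4-row iteration, broadcasting four left pixels
; and storing them with the width-appropriate store (movd/movq/mova).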
| |
| INIT_XMM avx2 |
| cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_h_avx2_table] |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4: |
| IPRED_H 4, d |
| .w8: |
| IPRED_H 8, q |
| .w16: |
| IPRED_H 16, a |
| INIT_YMM avx2 |
| .w32: |
| IPRED_H 32, a |
| .w64: |
| vpbroadcastb m0, [tlq-1] |
| vpbroadcastb m1, [tlq-2] |
| vpbroadcastb m2, [tlq-3] |
| sub tlq, 4 |
| vpbroadcastb m3, [tlq+0] |
| mova [dstq+strideq*0+32*0], m0 |
| mova [dstq+strideq*0+32*1], m0 |
| mova [dstq+strideq*1+32*0], m1 |
| mova [dstq+strideq*1+32*1], m1 |
| mova [dstq+strideq*2+32*0], m2 |
| mova [dstq+strideq*2+32*1], m2 |
| mova [dstq+stride3q +32*0], m3 |
| mova [dstq+stride3q +32*1], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w64 |
| RET |
| |
| %macro PAETH 2 ; top, ldiff |
| pavgb m1, m%1, m3 ; Calculating tldiff normally requires |
| pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it |
| pand m0, m4 ; in 8-bit with some tricks which avoids |
| psubusb m2, m5, m1 ; having to unpack everything to 16-bit. |
| psubb m1, m0 |
| psubusb m1, m5 |
| por m1, m2 |
| paddusb m1, m1 |
| por m1, m0 ; min(tldiff, 255) |
| psubusb m2, m5, m3 |
| psubusb m0, m3, m5 |
| por m2, m0 ; tdiff |
| pminub m2, m%2 |
| pcmpeqb m0, m%2, m2 ; ldiff <= tdiff |
| vpblendvb m0, m%1, m3, m0 |
| pminub m1, m2 |
| pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff |
| vpblendvb m0, m5, m0, m1 |
| %endmacro |
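; For reference, the per-pixel selection implemented above is roughly:
;   base   = left + top - topleft
;   ldiff  = abs(base - left)    ; == abs(top  - topleft)
;   tdiff  = abs(base - top)     ; == abs(left - topleft)
;   tldiff = abs(base - topleft) ; == abs(left + top - 2*topleft)
;   pred   = ldiff <= tdiff && ldiff <= tldiff ? left :
;            tdiff <= tldiff ? top : topleft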
| |
| cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h |
| %define base r5-ipred_paeth_avx2_table |
| lea r5, [ipred_paeth_avx2_table] |
| tzcnt wd, wm |
| vpbroadcastb m5, [tlq] ; topleft |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| vpbroadcastd m4, [base+pb_1] |
| add wq, r5 |
| jmp wq |
| .w4: |
| vpbroadcastd m6, [tlq+1] ; top |
| mova m8, [base+ipred_h_shuf] |
| lea r3, [strideq*3] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 ; ldiff |
| .w4_loop: |
| sub tlq, 8 |
| vpbroadcastq m3, [tlq] |
| pshufb m3, m8 ; left |
| PAETH 6, 7 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r3 ], xm1, 2 |
| cmp hd, 4 |
| je .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+r3 ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 8 |
| jg .w4_loop |
| .ret: |
| RET |
| ALIGN function_align |
| .w8: |
| vpbroadcastq m6, [tlq+1] |
| mova m8, [base+ipred_h_shuf] |
| lea r3, [strideq*3] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w8_loop: |
| sub tlq, 4 |
| vpbroadcastd m3, [tlq] |
| pshufb m3, m8 |
| PAETH 6, 7 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+r3 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| vbroadcasti128 m6, [tlq+1] |
| mova xm8, xm4 ; lower half = 1, upper half = 0 |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w16_loop: |
| sub tlq, 2 |
| vpbroadcastd m3, [tlq] |
| pshufb m3, m8 |
| PAETH 6, 7 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| movu m6, [tlq+1] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w32_loop: |
| dec tlq |
| vpbroadcastb m3, [tlq] |
| PAETH 6, 7 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| movu m6, [tlq+ 1] |
| movu m7, [tlq+33] |
| %if WIN64 |
| movaps r4m, xmm9 |
| %endif |
| psubusb m8, m5, m6 |
| psubusb m0, m6, m5 |
| psubusb m9, m5, m7 |
| psubusb m1, m7, m5 |
| por m8, m0 |
| por m9, m1 |
| .w64_loop: |
| dec tlq |
| vpbroadcastb m3, [tlq] |
| PAETH 6, 8 |
| mova [dstq+32*0], m0 |
| PAETH 7, 9 |
| mova [dstq+32*1], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| %if WIN64 |
| movaps xmm9, r4m |
| %endif |
| RET |
| |
| %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] |
| ; w * a = (w - 128) * a + 128 * a |
| ; (256 - w) * b = (127 - w) * b + 129 * b |
| pmaddubsw m0, m%3, m%1 |
| pmaddubsw m1, m%4, m%2 |
| paddw m0, m%5 |
| paddw m1, m%6 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| %endmacro |
| |
| cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights |
| %define base r6-ipred_smooth_v_avx2_table |
| lea r6, [ipred_smooth_v_avx2_table] |
| tzcnt wd, wm |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m0, [base+pb_127_m127] |
| vpbroadcastd m1, [base+pw_128] |
| lea weightsq, [base+smooth_weights+hq*4] |
| neg hq |
| vpbroadcastb m5, [tlq+hq] ; bottom |
| add wq, r6 |
| jmp wq |
| .w4: |
| vpbroadcastd m2, [tlq+1] |
| punpcklbw m2, m5 ; top, bottom |
| mova m5, [base+ipred_v_shuf] |
| lea r3, [strideq*3] |
| punpckldq m4, m5, m5 |
| punpckhdq m5, m5 |
| pmaddubsw m3, m2, m0 |
| paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok |
| paddw m3, m1 ; 128 * top + 129 * bottom + 128 |
| .w4_loop: |
| vbroadcasti128 m1, [weightsq+hq*2] |
| pshufb m0, m1, m4 |
| pshufb m1, m5 |
| SMOOTH 0, 1, 2, 2, 3, 3 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 1 |
| pextrd [dstq+r3 ], xm1, 1 |
| cmp hd, -4 |
| je .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 2 |
| pextrd [dstq+strideq*1], xm1, 2 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+r3 ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| add hq, 8 |
| jl .w4_loop |
| .ret: |
| RET |
| ALIGN function_align |
| .w8: |
| vpbroadcastq m2, [tlq+1] |
| punpcklbw m2, m5 |
| mova m5, [base+ipred_v_shuf] |
| lea r3, [strideq*3] |
| pshufd m4, m5, q0000 |
| pshufd m5, m5, q1111 |
| pmaddubsw m3, m2, m0 |
| paddw m1, m2 |
| paddw m3, m1 |
| .w8_loop: |
| vpbroadcastq m1, [weightsq+hq*2] |
| pshufb m0, m1, m4 |
| pshufb m1, m5 |
| SMOOTH 0, 1, 2, 2, 3, 3 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+r3 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| add hq, 4 |
| jl .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| WIN64_SPILL_XMM 7 |
| vbroadcasti128 m3, [tlq+1] |
| mova m6, [base+ipred_v_shuf] |
| punpcklbw m2, m3, m5 |
| punpckhbw m3, m5 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| .w16_loop: |
| vpbroadcastd m1, [weightsq+hq*2] |
| pshufb m1, m6 |
| SMOOTH 1, 1, 2, 3, 4, 5 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| add hq, 2 |
| jl .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 6 |
| movu m3, [tlq+1] |
| punpcklbw m2, m3, m5 |
| punpckhbw m3, m5 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| .w32_loop: |
| vpbroadcastw m1, [weightsq+hq*2] |
| SMOOTH 1, 1, 2, 3, 4, 5 |
| mova [dstq], m0 |
| add dstq, strideq |
| inc hq |
| jl .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| WIN64_SPILL_XMM 11 |
| movu m4, [tlq+ 1] |
| movu m8, [tlq+33] |
| punpcklbw m3, m4, m5 |
| punpckhbw m4, m5 |
| punpcklbw m7, m8, m5 |
| punpckhbw m8, m5 |
| pmaddubsw m5, m3, m0 |
| pmaddubsw m6, m4, m0 |
| pmaddubsw m9, m7, m0 |
| pmaddubsw m10, m8, m0 |
| paddw m2, m1, m3 |
| paddw m5, m2 |
| paddw m2, m1, m4 |
| paddw m6, m2 |
| paddw m0, m1, m7 |
| paddw m9, m0 |
| paddw m1, m8 |
| paddw m10, m1 |
| .w64_loop: |
| vpbroadcastw m2, [weightsq+hq*2] |
| SMOOTH 2, 2, 3, 4, 5, 6 |
| mova [dstq+32*0], m0 |
| SMOOTH 2, 2, 7, 8, 9, 10 |
| mova [dstq+32*1], m0 |
| add dstq, strideq |
| inc hq |
| jl .w64_loop |
| RET |
| |
| %macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used |
| %assign stack_offset 0 |
| %assign stack_size_padded 0 |
| %assign regs_used %2 |
| %xdefine rstk rsp |
| SETUP_STACK_POINTER %1 |
| %if regs_used != %2 && WIN64 |
| PUSH r%2 |
| %endif |
| ALLOC_STACK %1, %3 |
| %endmacro |
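; Resets x86inc's stack bookkeeping and allocates %1 bytes of scratch space
; (plus any WIN64 xmm spill area) with the given gpr/xmm register counts;
; used by paths that prefilter an edge into a stack buffer, e.g. the
; smooth_h/smooth w16+ cases below.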
| |
| cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h |
| %define base r6-ipred_smooth_h_avx2_table |
| lea r6, [ipred_smooth_h_avx2_table] |
| mov wd, wm |
| vpbroadcastb m3, [tlq+wq] ; right |
| tzcnt wd, wd |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m4, [base+pb_127_m127] |
| vpbroadcastd m5, [base+pw_128] |
| add wq, r6 |
| jmp wq |
| .w4: |
| WIN64_SPILL_XMM 8 |
| vpbroadcastq m6, [base+smooth_weights+4*2] |
| mova m7, [base+ipred_h_shuf] |
| sub tlq, 8 |
| sub tlq, hq |
| lea r3, [strideq*3] |
| .w4_loop: |
| vpbroadcastq m2, [tlq+hq] |
| pshufb m2, m7 |
| punpcklbw m1, m2, m3 ; left, right |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m6 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| pmaddubsw m2, m6 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r3 ], xm1, 2 |
| cmp hd, 4 |
| je .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+r3 ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 8 |
| jg .w4_loop |
| .ret: |
| RET |
| ALIGN function_align |
| .w8: |
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 8 |
| vbroadcasti128 m6, [base+smooth_weights+8*2] |
| mova m7, [base+ipred_h_shuf] |
| sub tlq, 4 |
| lea r3, [strideq*3] |
| sub tlq, hq |
| .w8_loop: |
| vpbroadcastd m2, [tlq+hq] |
| pshufb m2, m7 |
| punpcklbw m1, m2, m3 |
| punpckhbw m2, m3 |
| pmaddubsw m0, m1, m4 |
| paddw m0, m1 |
| pmaddubsw m1, m6 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m1, m2 |
| pmaddubsw m2, m6 |
| paddw m2, m5 |
| paddw m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+r3 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| SETUP_STACK_FRAME 32*4, 7, 8 |
| lea r3, [rsp+64*2-4] |
call .prep ; only worthwhile for w16 and above
| sub tlq, 2 |
| vpbroadcastd xm6, [base+pb_1] |
| mova xm7, [base+ipred_v_shuf+16] |
| vinserti128 m7, [base+ipred_v_shuf+ 0], 1 |
| vbroadcasti128 m4, [base+smooth_weights+16*2] |
| vbroadcasti128 m5, [base+smooth_weights+16*3] |
| .w16_loop: |
| vpbroadcastd m1, [tlq+hq] |
| vpbroadcastd m2, [r3+hq*2] |
| pshufb m1, m6 |
| punpcklbw m1, m3 |
| pshufb m2, m7 |
| SMOOTH 4, 5, 1, 1, 2, 2 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| SETUP_STACK_FRAME 32*4, 7, 6 |
| lea r3, [rsp+64*2-2] |
| call .prep |
| dec tlq |
| mova xm4, [base+smooth_weights+16*4] |
| vinserti128 m4, [base+smooth_weights+16*6], 1 |
| mova xm5, [base+smooth_weights+16*5] |
| vinserti128 m5, [base+smooth_weights+16*7], 1 |
| .w32_loop: |
| vpbroadcastb m1, [tlq+hq] |
| punpcklbw m1, m3 |
| vpbroadcastw m2, [r3+hq*2] |
| SMOOTH 4, 5, 1, 1, 2, 2 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| SETUP_STACK_FRAME 32*4, 7, 9 |
| lea r3, [rsp+64*2-2] |
| call .prep |
| add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table |
| dec tlq |
| mova xm5, [r6-16*7] |
| vinserti128 m5, [r6-16*5], 1 |
| mova xm6, [r6-16*6] |
| vinserti128 m6, [r6-16*4], 1 |
| mova xm7, [r6-16*3] |
| vinserti128 m7, [r6-16*1], 1 |
| mova xm8, [r6-16*2] |
| vinserti128 m8, [r6-16*0], 1 |
| .w64_loop: |
| vpbroadcastb m2, [tlq+hq] |
| punpcklbw m2, m3 |
| vpbroadcastw m4, [r3+hq*2] |
| SMOOTH 5, 6, 2, 2, 4, 4 |
| mova [dstq+32*0], m0 |
| SMOOTH 7, 8, 2, 2, 4, 4 |
| mova [dstq+32*1], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| ALIGN function_align |
| .prep: |
| vpermq m2, [tlq-32*1], q3120 |
| punpckhbw m1, m2, m3 |
| punpcklbw m2, m3 |
| pmaddubsw m0, m1, m4 ; 127 * left - 127 * right |
| paddw m1, m5 ; 1 * left + 256 * right + 128 |
| paddw m0, m1 ; 128 * left + 129 * right + 128 |
| pmaddubsw m1, m2, m4 |
| paddw m2, m5 |
| paddw m1, m2 |
| vpermq m2, [tlq-32*2], q3120 |
| mova [rsp+gprsize+32*3], m0 |
| mova [rsp+gprsize+32*2], m1 |
| punpckhbw m1, m2, m3 |
| punpcklbw m2, m3 |
| pmaddubsw m0, m1, m4 |
| paddw m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m4 |
| paddw m2, m5 |
| paddw m1, m2 |
| mova [rsp+gprsize+32*1], m0 |
| mova [rsp+gprsize+32*0], m1 |
| sub r3, hq |
| sub tlq, hq |
| sub r3, hq |
| ret |
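; .prep precomputes, for each pixel of the left edge, the 16-bit term
; 128*left + 129*right + 128 into a stack buffer (up to 64 rows) and rebiases
; r3 so the row loops can fetch the current row's term as [r3+hq*2].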
| |
| %macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] |
| pmaddubsw m0, m%3, m%1 |
| pmaddubsw m1, m%4, m%2 |
| %ifnum %5 |
| paddw m0, m%5 |
| %else |
| paddw m0, %5 |
| %endif |
| %ifnum %6 |
| paddw m1, m%6 |
| %else |
| paddw m1, %6 |
| %endif |
| pavgw m0, m2 |
| pavgw m1, m3 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| %endmacro |
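; Used by the 2D smooth predictor: the callers leave the horizontal sums
; w_x*left + (256-w_x)*right in m2/m3 and pass the vertical sums (biased by
; +255) as the add terms, so pavgw followed by psrlw 8 computes
; (w_y*top + (256-w_y)*bottom + w_x*left + (256-w_x)*right + 256) >> 9.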
| |
| cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights |
| %define base r6-ipred_smooth_avx2_table |
| lea r6, [ipred_smooth_avx2_table] |
| mov wd, wm |
| vpbroadcastb m4, [tlq+wq] ; right |
| tzcnt wd, wd |
| mov hd, hm |
| mov r5, tlq |
| sub r5, hq |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m5, [base+pb_127_m127] |
| vpbroadcastb m0, [r5] ; bottom |
| vpbroadcastd m3, [base+pw_255] |
| add wq, r6 |
| lea v_weightsq, [base+smooth_weights+hq*2] |
| jmp wq |
| .w4: |
| WIN64_SPILL_XMM 12 |
| mova m10, [base+ipred_h_shuf] |
| vpbroadcastq m11, [base+smooth_weights+4*2] |
| mova m7, [base+ipred_v_shuf] |
| vpbroadcastd m8, [tlq+1] |
| sub tlq, 8 |
| lea r3, [strideq*3] |
| sub tlq, hq |
| punpcklbw m8, m0 ; top, bottom |
| pshufd m6, m7, q2200 |
| pshufd m7, m7, q3311 |
| pmaddubsw m9, m8, m5 |
| paddw m3, m8 ; 1 * top + 255 * bottom + 255 |
| paddw m9, m3 ; 128 * top + 129 * bottom + 255 |
| .w4_loop: |
| vpbroadcastq m1, [tlq+hq] |
| pshufb m1, m10 |
| punpcklbw m0, m1, m4 ; left, right |
| punpckhbw m1, m4 |
| pmaddubsw m2, m0, m5 ; 127 * left - 127 * right |
| pmaddubsw m3, m1, m5 |
| paddw m2, m0 ; 128 * left + 129 * right |
| paddw m3, m1 |
| pmaddubsw m0, m11 |
| pmaddubsw m1, m11 |
| paddw m2, m0 |
| paddw m3, m1 |
| vbroadcasti128 m1, [v_weightsq] |
| add v_weightsq, 16 |
| pshufb m0, m1, m6 |
| pshufb m1, m7 |
| SMOOTH_2D_END 0, 1, 8, 8, 9, 9 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+r3 ], xm1, 2 |
| cmp hd, 4 |
| je .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+r3 ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 8 |
| jg .w4_loop |
| .ret: |
| RET |
| ALIGN function_align |
| .w8: |
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 12 |
| mova m10, [base+ipred_h_shuf] |
| vbroadcasti128 m11, [base+smooth_weights+8*2] |
| mova m7, [base+ipred_v_shuf] |
| vpbroadcastq m8, [tlq+1] |
| sub tlq, 4 |
| lea r3, [strideq*3] |
| sub tlq, hq |
| punpcklbw m8, m0 |
| pshufd m6, m7, q0000 |
| pshufd m7, m7, q1111 |
| pmaddubsw m9, m8, m5 |
| paddw m3, m8 |
| paddw m9, m3 |
| .w8_loop: |
| vpbroadcastd m1, [tlq+hq] |
| pshufb m1, m10 |
| punpcklbw m0, m1, m4 |
| punpckhbw m1, m4 |
| pmaddubsw m2, m0, m5 |
| pmaddubsw m3, m1, m5 |
| paddw m2, m0 |
| paddw m3, m1 |
| pmaddubsw m0, m11 |
| pmaddubsw m1, m11 |
| paddw m2, m0 |
| paddw m3, m1 |
| vpbroadcastq m1, [v_weightsq] |
| add v_weightsq, 8 |
| pshufb m0, m1, m6 |
| pshufb m1, m7 |
| SMOOTH_2D_END 0, 1, 8, 8, 9, 9 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+r3 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| ALIGN function_align |
| .w16: |
| SETUP_STACK_FRAME 32*4, 7, 14 |
| vbroadcasti128 m11, [tlq+1] |
| lea r3, [rsp+64*2-4] |
| punpcklbw m10, m11, m0 ; top, bottom |
| punpckhbw m11, m0 |
| call .prep_v |
| sub tlq, 2 |
| pmaddubsw m12, m10, m5 |
| pmaddubsw m13, m11, m5 |
| vpbroadcastd xm5, [base+pb_1] |
| mova m9, [base+ipred_v_shuf] |
| vbroadcasti128 m6, [base+smooth_weights+16*2] |
| vbroadcasti128 m7, [base+smooth_weights+16*3] |
| vperm2i128 m8, m9, m9, 0x01 |
| paddw m0, m10, m3 |
| paddw m3, m11 |
| paddw m12, m0 |
| paddw m13, m3 |
| .w16_loop: |
| vpbroadcastd m3, [tlq+hq] |
| vpbroadcastd m0, [r3+hq*2] |
| vpbroadcastd m1, [v_weightsq] |
| add v_weightsq, 4 |
| pshufb m3, m5 |
| punpcklbw m3, m4 ; left, right |
| pmaddubsw m2, m3, m6 |
| pmaddubsw m3, m7 |
| pshufb m0, m8 |
| pshufb m1, m9 |
| paddw m2, m0 |
| paddw m3, m0 |
| SMOOTH_2D_END 1, 1, 10, 11, 12, 13 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| ALIGN function_align |
| .w32: |
| SETUP_STACK_FRAME 32*4, 7, 11 |
| movu m8, [tlq+1] |
| lea r3, [rsp+64*2-2] |
| punpcklbw m7, m8, m0 |
| punpckhbw m8, m0 |
| call .prep_v |
| dec tlq |
| pmaddubsw m9, m7, m5 |
| pmaddubsw m10, m8, m5 |
| mova xm5, [base+smooth_weights+16*4] |
| vinserti128 m5, [base+smooth_weights+16*6], 1 |
| mova xm6, [base+smooth_weights+16*5] |
| vinserti128 m6, [base+smooth_weights+16*7], 1 |
| paddw m0, m7, m3 |
| paddw m3, m8 |
| paddw m9, m0 |
| paddw m10, m3 |
| .w32_loop: |
| vpbroadcastb m3, [tlq+hq] |
| punpcklbw m3, m4 |
| vpbroadcastw m0, [r3+hq*2] |
| vpbroadcastw m1, [v_weightsq] |
| add v_weightsq, 2 |
| pmaddubsw m2, m3, m5 |
| pmaddubsw m3, m6 |
| paddw m2, m0 |
| paddw m3, m0 |
| SMOOTH_2D_END 1, 1, 7, 8, 9, 10 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w32_loop |
| RET |
| ALIGN function_align |
| .w64: |
| SETUP_STACK_FRAME 32*8, 7, 16 |
| movu m13, [tlq+1 ] |
| movu m15, [tlq+33] |
| add r6, smooth_weights+16*15-ipred_smooth_avx2_table |
| lea r3, [rsp+64*2-2] |
| punpcklbw m12, m13, m0 |
| punpckhbw m13, m0 |
| punpcklbw m14, m15, m0 |
| punpckhbw m15, m0 |
| call .prep_v |
| dec tlq |
| pmaddubsw m0, m12, m5 |
| pmaddubsw m1, m13, m5 |
| pmaddubsw m2, m14, m5 |
| pmaddubsw m5, m15, m5 |
| mova xm8, [r6-16*7] |
| vinserti128 m8, [r6-16*5], 1 |
| mova xm9, [r6-16*6] |
| vinserti128 m9, [r6-16*4], 1 |
| mova xm10, [r6-16*3] |
| vinserti128 m10, [r6-16*1], 1 |
| mova xm11, [r6-16*2] |
| vinserti128 m11, [r6-16*0], 1 |
| lea r6, [rsp+32*4] |
| paddw m0, m3 |
| paddw m1, m3 |
| paddw m2, m3 |
| paddw m3, m5 |
| paddw m0, m12 |
| paddw m1, m13 |
| paddw m2, m14 |
| paddw m3, m15 |
| mova [r6+32*0], m0 |
| mova [r6+32*1], m1 |
| mova [r6+32*2], m2 |
| mova [r6+32*3], m3 |
| .w64_loop: |
| vpbroadcastb m5, [tlq+hq] |
| punpcklbw m5, m4 |
| vpbroadcastw m6, [r3+hq*2] |
| vpbroadcastw m7, [v_weightsq] |
| add v_weightsq, 2 |
| pmaddubsw m2, m5, m8 |
| pmaddubsw m3, m5, m9 |
| paddw m2, m6 |
| paddw m3, m6 |
| SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1] |
| mova [dstq+32*0], m0 |
| pmaddubsw m2, m5, m10 |
| pmaddubsw m3, m5, m11 |
| paddw m2, m6 |
| paddw m3, m6 |
| SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3] |
| mova [dstq+32*1], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| ALIGN function_align |
| .prep_v: |
| vpermq m2, [tlq-32*1], q3120 |
| punpckhbw m1, m2, m4 |
| punpcklbw m2, m4 |
| pmaddubsw m0, m1, m5 ; 127 * left - 127 * right |
| paddw m0, m1 ; 128 * left + 129 * right |
| pmaddubsw m1, m2, m5 |
| paddw m1, m2 |
| vpermq m2, [tlq-32*2], q3120 |
| mova [rsp+gprsize+32*3], m0 |
| mova [rsp+gprsize+32*2], m1 |
| punpckhbw m1, m2, m4 |
| punpcklbw m2, m4 |
| pmaddubsw m0, m1, m5 |
| paddw m0, m1 |
| pmaddubsw m1, m2, m5 |
| paddw m1, m2 |
| mova [rsp+gprsize+32*1], m0 |
| mova [rsp+gprsize+32*0], m1 |
| sub r3, hq |
| sub tlq, hq |
| sub r3, hq |
| ret |
| |
| cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase |
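; Directional prediction, zone 1 (predicts from the top edge only). xpos
; advances by dx (1/64-pel units) per output row; base = xpos>>6 indexes the
; (possibly filtered/upsampled) edge, frac = xpos & 0x3e selects the weights,
; and column x is (top[base+x]*(64-frac) + top[base+x+1]*frac + 32) >> 6
; (pmaddubsw with the packed (64-frac, frac) pair, pmulhrsw with pw_512).
; Positions at or beyond max_base_x are clamped to top[max_base_x].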
| %assign org_stack_offset stack_offset |
| lea r6, [ipred_z1_avx2_table] |
| tzcnt wd, wm |
| movifnidn angled, anglem |
| movifnidn hd, hm |
| lea r7, [dr_intra_derivative] |
| inc tlq |
| movsxd wq, [r6+wq*4] |
| add wq, r6 |
| mov dxd, angled |
| and dxd, 0x7e |
| add angled, 165 ; ~90 |
| movzx dxd, word [r7+dxq] |
| xor angled, 0x4ff ; d = 90 - angle |
| vpbroadcastd m3, [pw_512] |
| vpbroadcastd m4, [pw_62] |
| vpbroadcastd m5, [pw_64] |
| jmp wq |
| .w4: |
| cmp angleb, 40 |
| jae .w4_no_upsample |
| lea r3d, [angleq-1024] |
| sar r3d, 7 |
| add r3d, hd |
| jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) |
| ALLOC_STACK -32, 8 |
| mova xm1, [tlq-1] |
| pshufb xm0, xm1, [z_upsample1] |
| pshufb xm1, [z_upsample2] |
| vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse |
| add dxd, dxd ; pw_512 (which is already in m3) |
| pmaddubsw xm0, xm2 ; for rounding instead of pw_2048 |
| pextrd [rsp+16], xm1, 3 ; top[max_base_x] |
| pmaddubsw xm1, xm2 |
| movd xm7, dxd |
| mov r3d, dxd ; xpos |
| vpbroadcastw m7, xm7 |
| paddw xm1, xm0 |
| movq xm0, [tlq] |
| pmulhrsw xm1, xm3 |
| pslldq m6, m7, 8 |
| paddw xm2, xm7, xm7 |
| lea r2, [strideq*3] |
| paddw m6, m7 |
| packuswb xm1, xm1 |
| paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1 |
| punpcklbw xm0, xm1 |
| psllw m7, 2 |
| mova [rsp], xm0 |
| .w4_upsample_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| vpbroadcastq m1, [rsp+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vpbroadcastq m2, [rsp+r5] |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base2 |
| movq xm0, [rsp+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base3 |
| movhps xm0, [rsp+r5] |
| vpblendd m1, m2, 0xc0 |
| pand m2, m4, m6 ; frac |
| vpblendd m0, m1, 0xf0 |
| psubw m1, m5, m2 ; 64-frac |
| psllw m2, 8 |
| por m1, m2 ; 64-frac, frac |
| pmaddubsw m0, m1 |
| paddw m6, m7 ; xpos += dx |
| pmulhrsw m0, m3 |
| packuswb m0, m0 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*2], xm0 |
| pextrd [dstq+r2 ], xm0, 1 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_upsample_loop |
| RET |
| ALIGN function_align |
| .filter_strength: ; w4/w8/w16 |
| ; The C version uses a lot of branches, but we can do all the comparisons |
| ; in parallel and use popcnt to get the final filter strength value. |
| %define base r3-z_filter_t0 |
| lea r3, [z_filter_t0] |
| movd xm0, maxbased |
| movd xm2, angled |
| shr angled, 8 ; is_sm << 1 |
| vpbroadcastb m0, xm0 |
| vpbroadcastb m2, xm2 |
| pcmpeqb m1, m0, [base+z_filter_wh] |
| pand m1, m2 |
| mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases |
| pcmpgtb m1, m2 |
| pmovmskb r5d, m1 |
| ret |
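; On return r5d holds a byte mask whose popcount is the edge filter strength;
; callers skip filtering when it is zero and otherwise popcnt it to pick the
; z_filter_k kernel.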
| .w4_no_upsample: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -16, 11 |
| mov maxbased, 7 |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w4_main |
| lea maxbased, [hq+3] |
| call .filter_strength |
| mov maxbased, 7 |
| test r5d, r5d |
| jz .w4_main ; filter_strength == 0 |
| popcnt r5d, r5d |
| vpbroadcastd m7, [base+pb_8] |
| vbroadcasti128 m2, [tlq-1] |
| pminub m1, m7, [base+z_filter_s] |
| vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] |
| pminub m7, [base+z_filter_s+8] |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] |
| vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] |
| pshufb m0, m2, m1 |
| shufps m1, m7, q2121 |
| pmaddubsw m0, m8 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m9 |
| pshufb m2, m7 |
| pmaddubsw m2, m10 |
| paddw m0, m1 |
| paddw m0, m2 |
| pmulhrsw m0, m3 |
| mov r3d, 9 |
| mov tlq, rsp |
| cmp hd, 4 |
| cmovne maxbased, r3d |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| mova [tlq], xm0 |
| .w4_main: |
| movd xm6, dxd |
| vpbroadcastq m0, [z_base_inc] ; base_inc << 6 |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| vpbroadcastw m6, xm6 |
| mov r3d, dxd ; xpos |
| movd xm9, maxbased |
| vpbroadcastw m9, xm9 |
| vbroadcasti128 m8, [z1_shuf_w4] |
| psrlw m7, 8 ; top[max_base_x] |
| paddw m10, m6, m6 |
| psubw m9, m0 ; max_base_x |
| vpblendd m6, m10, 0xcc |
| mova xm0, xm10 |
| paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1 |
| paddw m10, m10 |
| .w4_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| vpbroadcastq m1, [tlq+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vpbroadcastq m2, [tlq+r5] |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base2 |
| movq xm0, [tlq+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base3 |
| movhps xm0, [tlq+r5] |
| vpblendd m1, m2, 0xc0 |
| pand m2, m4, m6 ; frac |
| vpblendd m0, m1, 0xf0 |
| psubw m1, m5, m2 ; 64-frac |
| psllw m2, 8 |
| pshufb m0, m8 |
| por m1, m2 ; 64-frac, frac |
| pmaddubsw m0, m1 |
| pcmpgtw m1, m9, m6 ; base < max_base_x |
| pmulhrsw m0, m3 |
| paddw m6, m10 ; xpos += dx |
| lea r5, [dstq+strideq*2] |
| vpblendvb m0, m7, m0, m1 |
| packuswb m0, m0 |
| vextracti128 xm1, m0, 1 |
| movd [r5 +strideq*0], xm0 |
| pextrd [r5 +strideq*1], xm0, 1 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| sub hd, 4 |
| jz .w4_end |
| lea dstq, [dstq+strideq*4] |
| cmp r3d, maxbased |
| jb .w4_loop |
| packuswb xm7, xm7 |
| lea r6, [strideq*3] |
| .w4_end_loop: |
| movd [dstq+strideq*0], xm7 |
| movd [dstq+strideq*1], xm7 |
| movd [dstq+strideq*2], xm7 |
| movd [dstq+r6 ], xm7 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_end_loop |
| .w4_end: |
| RET |
| ALIGN function_align |
| .w8: |
| lea r3d, [angleq+216] |
| mov r3b, hb |
| cmp r3d, 8 |
| ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -32, 8 |
| movu xm2, [z_filter_s+6] |
| mova xm0, [tlq-1] |
| movd xm6, hd |
| vinserti128 m0, [tlq+7], 1 |
| vpbroadcastb xm6, xm6 |
| vbroadcasti128 m1, [z_upsample1] |
| pminub xm6, xm2 |
| vpbroadcastd m7, [pb_36_m4] |
| vinserti128 m2, xm6, 1 |
| add dxd, dxd |
| pshufb m1, m0, m1 |
| pshufb m2, m0, m2 |
| movd xm6, dxd |
| pmaddubsw m1, m7 |
| pmaddubsw m2, m7 |
| vpbroadcastw m6, xm6 |
| mov r3d, dxd |
| psrldq m0, 1 |
| lea r2, [strideq*3] |
| paddw m7, m6, m6 |
| paddw m1, m2 |
| vpblendd m6, m7, 0xf0 |
| pmulhrsw m1, m3 |
| pslldq m2, m7, 8 |
| paddw m7, m7 |
| paddw m6, m2 |
| packuswb m1, m1 |
| punpcklbw m0, m1 |
| mova [rsp], m0 |
| .w8_upsample_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| movu xm0, [rsp+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vinserti128 m0, [rsp+r5], 1 |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base2 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| punpcklqdq m1, m2, m2 ; frac0 frac1 |
| pmaddubsw m0, m1 |
| movu xm1, [rsp+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base3 |
| vinserti128 m1, [rsp+r5], 1 |
| punpckhqdq m2, m2 ; frac2 frac3 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| paddw m6, m7 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*2], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+r2 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_upsample_loop |
| RET |
| .w8_no_intra_edge_filter: |
| and maxbased, 7 |
| or maxbased, 8 ; imin(h+7, 15) |
| jmp .w8_main |
| .w8_no_upsample: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -32, 10 |
| lea maxbased, [hq+7] |
| test angled, 0x400 |
| jnz .w8_no_intra_edge_filter |
| call .filter_strength |
| test r5d, r5d |
| jz .w8_main ; filter_strength == 0 |
| popcnt r5d, r5d |
| movu xm2, [tlq] |
| pminub xm1, xm0, [base+z_filter_s+14] |
| vinserti128 m2, [tlq-1], 1 |
| vinserti128 m1, [base+z_filter_s+ 0], 1 |
| vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] |
| pminub xm0, [base+z_filter_s+22] |
| vinserti128 m0, [base+z_filter_s+ 8], 1 |
| pshufb m6, m2, m1 |
| pmaddubsw m6, m7 |
| vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] |
| movzx r3d, byte [tlq+15] |
| shufps m1, m0, q2121 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m7 |
| paddw m1, m6 |
| sub r5d, 3 |
| jnz .w8_3tap |
| ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one, |
| ; which also results in an awkward edge case where out[w*2] is |
| ; slightly different from out[max_base_x] when h > w. |
| vpbroadcastd m7, [z_filter_k+4*8] |
| movzx r2d, byte [tlq+14] |
| pshufb m2, m0 |
| pmaddubsw m2, m7 |
| sub r2d, r3d |
| lea r2d, [r2+r3*8+4] |
| shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 |
| mov [rsp+16], r2b |
| paddw m1, m2 |
| .w8_3tap: |
| pmulhrsw m1, m3 |
| sar r5d, 1 |
| mov tlq, rsp |
| add r5d, 17 ; w*2 + (filter_strength == 3) |
| cmp hd, 16 |
| cmovns maxbased, r5d |
| mov [tlq+r5], r3b |
| vextracti128 xm0, m1, 1 |
| packuswb xm0, xm1 |
| mova [tlq], xm0 |
| .w8_main: |
| movd xm2, dxd |
| vbroadcasti128 m0, [z_base_inc] |
| vpbroadcastw m2, xm2 |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| movd xm9, maxbased |
| vbroadcasti128 m8, [z_filter_s+2] |
| vpbroadcastw m9, xm9 |
| psrlw m7, 8 |
| psubw m9, m0 |
| mov r3d, dxd |
| paddw m6, m2, m2 |
| vpblendd m2, m6, 0xf0 |
| .w8_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 |
| pand m0, m4, m2 |
| psubw m1, m5, m0 |
| psllw m0, 8 |
| por m1, m0 |
| movu xm0, [tlq+r3] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vinserti128 m0, [tlq+r5], 1 |
| pshufb m0, m8 |
| pmaddubsw m0, m1 |
| pcmpgtw m1, m9, m2 |
| paddw m2, m6 |
| pmulhrsw m0, m3 |
| vpblendvb m0, m7, m0, m1 |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm0 |
| sub hd, 2 |
| jz .w8_end |
| lea dstq, [dstq+strideq*2] |
| cmp r3d, maxbased |
| jb .w8_loop |
| packuswb xm7, xm7 |
| .w8_end_loop: |
| movq [dstq+strideq*0], xm7 |
| movq [dstq+strideq*1], xm7 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_end_loop |
| .w8_end: |
| RET |
| .w16_no_intra_edge_filter: |
| and maxbased, 15 |
| or maxbased, 16 ; imin(h+15, 31) |
| jmp .w16_main |
| ALIGN function_align |
| .w16: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -64, 12 |
| lea maxbased, [hq+15] |
| test angled, 0x400 |
| jnz .w16_no_intra_edge_filter |
| call .filter_strength |
| test r5d, r5d |
| jz .w16_main ; filter_strength == 0 |
| popcnt r5d, r5d |
| vpbroadcastd m1, [base+pb_12] |
| vbroadcasti128 m6, [base+z_filter_s+8] |
| vinserti128 m2, m6, [base+z_filter_s], 0 |
| vinserti128 m6, [base+z_filter_s+16], 1 |
| mova xm10, [tlq-1] |
| vinserti128 m10, [tlq+3], 1 |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] |
| vbroadcasti128 m7, [base+z_filter_s+14] |
| vinserti128 m8, m7, [base+z_filter_s+6], 0 |
| vinserti128 m7, [base+z_filter_s+22], 1 |
| psubw m0, m1 |
| movu xm11, [tlq+12] |
| vinserti128 m11, [tlq+16], 1 |
| pminub m8, m0 |
| pminub m7, m0 |
| pshufb m0, m10, m2 |
| shufps m2, m6, q2121 |
| pmaddubsw m0, m9 |
| pshufb m1, m11, m8 |
| shufps m8, m7, q2121 |
| pmaddubsw m1, m9 |
| vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] |
| movzx r3d, byte [tlq+31] |
| pshufb m2, m10, m2 |
| pmaddubsw m2, m9 |
| pshufb m8, m11, m8 |
| pmaddubsw m8, m9 |
| paddw m0, m2 |
| paddw m1, m8 |
| sub r5d, 3 |
| jnz .w16_3tap |
| vpbroadcastd m9, [z_filter_k+4*8] |
| movzx r2d, byte [tlq+30] |
| pshufb m10, m6 |
| pmaddubsw m10, m9 |
| pshufb m11, m7 |
| pmaddubsw m11, m9 |
| sub r2d, r3d |
| lea r2d, [r2+r3*8+4] |
| shr r2d, 3 |
| mov [rsp+32], r2b |
| paddw m0, m10 |
| paddw m1, m11 |
| .w16_3tap: |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| sar r5d, 1 |
| mov tlq, rsp |
| add r5d, 33 |
| cmp hd, 32 |
| cmovns maxbased, r5d |
| mov [tlq+r5], r3b |
| packuswb m0, m1 |
| vpermq m0, m0, q3120 |
| mova [tlq], m0 |
| .w16_main: |
| movd xm6, dxd |
| vbroadcasti128 m0, [z_base_inc] |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| vpbroadcastw m6, xm6 |
| movd xm9, maxbased |
| vbroadcasti128 m8, [z_filter_s+2] |
| vpbroadcastw m9, xm9 |
| mov r3d, dxd |
| psubw m9, m0 |
| paddw m11, m6, m6 |
| psubw m10, m9, m3 ; 64*8 |
| vpblendd m6, m11, 0xf0 |
| .w16_loop: |
| lea r5d, [r3+dxq] |
| shr r3d, 6 ; base0 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| movu xm0, [tlq+r3+0] |
| movu xm1, [tlq+r3+8] |
| lea r3d, [r5+dxq] |
| shr r5d, 6 ; base1 |
| vinserti128 m0, [tlq+r5+0], 1 |
| vinserti128 m1, [tlq+r5+8], 1 |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| pcmpgtw m1, m9, m6 |
| pcmpgtw m2, m10, m6 |
| packsswb m1, m2 |
| paddw m6, m11 |
| vpblendvb m0, m7, m0, m1 |
| mova [dstq+strideq*0], xm0 |
| vextracti128 [dstq+strideq*1], m0, 1 |
| sub hd, 2 |
| jz .w16_end |
| lea dstq, [dstq+strideq*2] |
| cmp r3d, maxbased |
| jb .w16_loop |
| .w16_end_loop: |
| mova [dstq+strideq*0], xm7 |
| mova [dstq+strideq*1], xm7 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_end_loop |
| .w16_end: |
| RET |
| ALIGN function_align |
| .w32: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -96, 15 |
| lea r3d, [hq+31] |
| mov maxbased, 63 |
| cmp hd, 32 |
| cmovs maxbased, r3d |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w32_main |
| vbroadcasti128 m0, [pb_0to15] |
| sub r3d, 29 ; h+2 |
| movu xm13, [tlq+29] ; 32-39 |
| movd xm1, r3d |
| movu xm14, [tlq+37] ; 40-47 |
| sub r3d, 8 ; h-6 |
| vinserti128 m14, [tlq+51], 1 ; 56-63 |
| vpbroadcastb xm1, xm1 |
| mova xm11, [tlq- 1] ; 0- 7 |
| vinserti128 m11, [tlq+13], 1 ; 16-23 |
| movd xm2, r3d |
| movu xm12, [tlq+ 5] ; 8-15 |
| vinserti128 m12, [tlq+19], 1 ; 24-31 |
| pminub xm1, xm0 ; clip 32x8 |
| mova m7, [z_filter_s+0] |
| pshufb xm13, xm1 |
| vpbroadcastd m1, [pb_12] |
| vpbroadcastb xm2, xm2 |
| vinserti128 m13, [tlq+43], 1 ; 48-55 |
| vinserti128 m8, m7, [z_filter_s+4], 1 |
| vpblendd m2, m1, 0xf0 |
| vinserti128 m7, [z_filter_s+12], 0 |
| pminub m2, m0 ; clip 32x16 and 32x(32|64) |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| pshufb m14, m2 |
| pshufb m0, m11, m8 |
| shufps m8, m7, q1021 |
| pmaddubsw m0, m9 |
| pshufb m2, m12, m8 |
| pmaddubsw m2, m9 |
| pshufb m1, m13, m8 |
| pmaddubsw m1, m9 |
| pshufb m6, m14, m8 |
| pmaddubsw m6, m9 |
| vpbroadcastd m9, [z_filter_k+4*2+12*1] |
| pshufb m10, m11, m8 |
| shufps m8, m7, q2121 |
| pmaddubsw m10, m9 |
| paddw m0, m10 |
| pshufb m10, m12, m8 |
| pmaddubsw m10, m9 |
| paddw m2, m10 |
| pshufb m10, m13, m8 |
| pmaddubsw m10, m9 |
| paddw m1, m10 |
| pshufb m10, m14, m8 |
| pmaddubsw m10, m9 |
| paddw m6, m10 |
| vpbroadcastd m9, [z_filter_k+4*2+12*2] |
| pshufb m11, m8 |
| pmaddubsw m11, m9 |
| pshufb m12, m7 |
| pmaddubsw m12, m9 |
| movzx r3d, byte [tlq+63] |
| movzx r2d, byte [tlq+62] |
| paddw m0, m11 |
| paddw m2, m12 |
| pshufb m13, m7 |
| pmaddubsw m13, m9 |
| pshufb m14, m7 |
| pmaddubsw m14, m9 |
| paddw m1, m13 |
| paddw m6, m14 |
| sub r2d, r3d |
| lea r2d, [r2+r3*8+4] ; edge case for 32x64 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| shr r2d, 3 |
| mov [rsp+64], r2b |
| mov tlq, rsp |
| mov [tlq+65], r3b |
| mov r3d, 65 |
| cmp hd, 64 |
| cmove maxbased, r3d |
| packuswb m0, m2 |
| packuswb m1, m6 |
| mova [tlq+ 0], m0 |
| mova [tlq+32], m1 |
| .w32_main: |
| movd xm6, dxd |
| vpbroadcastb m7, [tlq+maxbaseq] |
| shl maxbased, 6 |
| vpbroadcastw m6, xm6 |
| movd xm9, maxbased |
| vbroadcasti128 m8, [z_filter_s+2] |
| vpbroadcastw m9, xm9 |
| mov r5d, dxd |
| psubw m9, [z_base_inc] |
| mova m11, m6 |
| psubw m10, m9, m3 ; 64*8 |
| .w32_loop: |
| mov r3d, r5d |
| shr r3d, 6 |
| pand m1, m4, m6 |
| psubw m2, m5, m1 |
| psllw m1, 8 |
| por m2, m1 |
| movu m0, [tlq+r3+0] |
| movu m1, [tlq+r3+8] |
| add r5d, dxd |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m2 |
| pmaddubsw m1, m2 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| pcmpgtw m1, m9, m6 |
| pcmpgtw m2, m10, m6 |
| packsswb m1, m2 |
| paddw m6, m11 |
| vpblendvb m0, m7, m0, m1 |
| mova [dstq], m0 |
| dec hd |
| jz .w32_end |
| add dstq, strideq |
| cmp r5d, maxbased |
| jb .w32_loop |
| test hb, 1 |
| jz .w32_end_loop |
| mova [dstq], m7 |
| add dstq, strideq |
| dec hd |
| jz .w32_end |
| .w32_end_loop: |
| mova [dstq+strideq*0], m7 |
| mova [dstq+strideq*1], m7 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w32_end_loop |
| .w32_end: |
| RET |
| ALIGN function_align |
| .w64: |
| %assign stack_offset org_stack_offset |
| ALLOC_STACK -128, 16 |
| lea maxbased, [hq+63] |
| test angled, 0x400 ; !enable_intra_edge_filter |
| jnz .w64_main |
| mova xm11, [tlq- 1] ; 0- 7 |
| vinserti128 m11, [tlq+13], 1 ; 16-23 |
| movu xm12, [tlq+ 5] ; 8-15 |
| vinserti128 m12, [tlq+19], 1 ; 24-31 |
| mova m7, [z_filter_s+0] |
| vinserti128 m8, m7, [z_filter_s+4], 1 |
| vinserti128 m7, [z_filter_s+12], 0 |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| movu xm13, [tlq+29] ; 32-39 |
| vinserti128 m13, [tlq+43], 1 ; 48-55 |
| movu xm14, [tlq+37] ; 40-47 |
| vinserti128 m14, [tlq+51], 1 ; 56-63 |
| pshufb m0, m11, m8 |
| shufps m8, m7, q1021 |
| pmaddubsw m0, m9 |
| pshufb m2, m12, m8 |
| pmaddubsw m2, m9 |
| pshufb m1, m13, m8 |
| pmaddubsw m1, m9 |
| pshufb m6, m14, m8 |
| pmaddubsw m6, m9 |
| vpbroadcastd m9, [z_filter_k+4*2+12*1] |
| pshufb m10, m11, m8 |
| shufps m15, m8, m7, q2121 |
| pmaddubsw m10, m9 |
| paddw m0, m10 |
| pshufb m10, m12, m15 |
| pmaddubsw m10, m9 |
| paddw m2, m10 |
| pshufb m10, m13, m15 |
| pmaddubsw m10, m9 |
| paddw m1, m10 |
| pshufb m10, m14, m15 |
| pmaddubsw m10, m9 |
| paddw m6, m10 |
| vpbroadcastd m10, [z_filter_k+4*2+12*2] |
| pshufb m11, m15 |
| pmaddubsw m11, m10 |
| pshufb m12, m7 |
| pmaddubsw m12, m10 |
| pshufb m13, m7 |
| pmaddubsw m13, m10 |
| pshufb m14, m7 |
| pmaddubsw m14, m10 |
| paddw m0, m11 |
| paddw m2, m12 |
| paddw m1, m13 |
| paddw m6, m14 |
| movu xm11, [tlq+ 61] ; 64- 71 |
| vinserti128 m11, [tlq+ 75], 1 ; 80- 87 |
| movu xm12, [tlq+ 69] ; 72- 79 |
| vinserti128 m12, [tlq+ 83], 1 ; 88- 95 |
| movu xm13, [tlq+ 93] ; 96-103 |
| vinserti128 m13, [tlq+107], 1 ; 112-119 |
| movu xm14, [tlq+101] ; 104-111 |
| vinserti128 m14, [tlq+115], 1 ; 120-127 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| lea r3d, [hq-20] |
| mov tlq, rsp |
| packuswb m0, m2 |
| packuswb m1, m6 |
| vpbroadcastd xm2, [pb_14] |
| vbroadcasti128 m6, [pb_0to15] |
| mova [tlq+32*0], m0 |
| mova [tlq+32*1], m1 |
| movd xm0, r3d |
| vpbroadcastd m1, [pb_12] |
| vpbroadcastb m0, xm0 |
| paddb m0, m2 |
| pminub m0, m6 ; clip 64x16 and 64x32 |
| pshufb m12, m0 |
| pminub m1, m6 ; clip 64x64 |
| pshufb m14, m1 |
| pshufb m0, m11, m7 |
| pmaddubsw m0, m10 |
| pshufb m2, m12, m7 |
| pmaddubsw m2, m10 |
| pshufb m1, m13, m7 |
| pmaddubsw m1, m10 |
| pshufb m6, m14, m7 |
| pmaddubsw m6, m10 |
| pshufb m7, m11, m15 |
| pmaddubsw m7, m9 |
| pshufb m10, m12, m15 |
| pmaddubsw m10, m9 |
| paddw m0, m7 |
| pshufb m7, m13, m15 |
| pmaddubsw m7, m9 |
| paddw m2, m10 |
| pshufb m10, m14, m15 |
| pmaddubsw m10, m9 |
| paddw m1, m7 |
| paddw m6, m10 |
| vpbroadcastd m9, [z_filter_k+4*2+12*0] |
| pshufb m11, m8 |
| pmaddubsw m11, m9 |
| pshufb m12, m8 |
| pmaddubsw m12, m9 |
| pshufb m13, m8 |
| pmaddubsw m13, m9 |
| pshufb m14, m8 |
| pmaddubsw m14, m9 |
| paddw m0, m11 |
| paddw m2, m12 |
| paddw m1, m13 |
| paddw m6, m14 |
| pmulhrsw m0, m3 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| pmulhrsw m6, m3 |
| packuswb m0, m2 |
| packuswb m1, m6 |
| mova [tlq+32*2], m0 |
| mova [tlq+32*3], m1 |
| .w64_main: |
| movd xm12, dxd |
| vpbroadcastb m7, [tlq+maxbaseq] |
| lea r3d, [dxq-64] |
| shl maxbased, 6 |
| vpbroadcastw m12, xm12 |
| sub r3d, maxbased |
| vbroadcasti128 m8, [z_filter_s+2] |
| movd xm6, r3d |
| mov r5d, dxd |
| mova m10, [pb_1to32] |
| vpbroadcastd m11, [pb_32] |
| vpbroadcastw m6, xm6 |
| .w64_loop: |
| mov r3d, r5d |
| shr r3d, 6 |
| movu m0, [tlq+r3+ 0] |
| movu m1, [tlq+r3+ 8] |
| pand m2, m4, m6 |
| psubw m9, m5, m2 |
| psllw m2, 8 |
| por m9, m2 |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m9 |
| pmaddubsw m1, m9 |
| psraw m2, m6, 6 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packsswb m2, m2 |
| paddb m2, m10 |
| packuswb m0, m1 |
| vpblendvb m0, m7, m0, m2 |
| mova [dstq+ 0], m0 |
| movu m0, [tlq+r3+32] |
| movu m1, [tlq+r3+40] |
| add r5d, dxd |
| pshufb m0, m8 |
| pshufb m1, m8 |
| pmaddubsw m0, m9 |
| pmaddubsw m1, m9 |
| paddb m2, m11 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| paddw m6, m12 |
| packuswb m0, m1 |
| vpblendvb m0, m7, m0, m2 |
| mova [dstq+32], m0 |
| dec hd |
| jz .w64_end |
| add dstq, strideq |
| cmp r5d, maxbased |
| jb .w64_loop |
| .w64_end_loop: |
| mova [dstq+ 0], m7 |
| mova [dstq+32], m7 |
| add dstq, strideq |
| dec hd |
| jg .w64_end_loop |
| .w64_end: |
| RET |
| |
| cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy |
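; Directional prediction, zone 2 (angles between 90 and 180 degrees): pixels
; whose projected position still falls on the top edge are interpolated from
; it using dx, the remaining pixels are interpolated from the left edge using
; dy; both edges may first be filtered or upsampled, as in zones 1 and 3.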
| %define base r9-z_filter_t0 |
| lea r9, [ipred_z2_avx2_table] |
| tzcnt wd, wm |
| movifnidn angled, anglem |
| movifnidn hd, hm |
| lea dxq, [dr_intra_derivative-90] |
| movsxd wq, [r9+wq*4] |
| movzx dyd, angleb |
| xor angled, 0x400 |
| mov r8, dxq |
| sub dxq, dyq |
| add wq, r9 |
| add r9, z_filter_t0-ipred_z2_avx2_table |
| mova m2, [tlq-64] |
| mova m0, [tlq-32] |
| mova m1, [tlq] |
| and dyd, ~1 |
| and dxq, ~1 |
| movzx dyd, word [r8+dyq] ; angle - 90 |
| movzx dxd, word [dxq+270] ; 180 - angle |
| vpbroadcastd m13, [base+pw_512] |
| vpbroadcastd m14, [base+pw_62] |
| vpbroadcastd m15, [base+pw_64] |
| mova [rsp+ 0], m2 |
| mova [rsp+32], m0 |
| mova [rsp+64], m1 |
| neg dxd |
| neg dyd |
| jmp wq |
| .w4: |
| vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 |
| vbroadcasti128 m10, [base+z1_shuf_w4] |
| vbroadcasti128 m11, [base+z2_shuf_h4] |
| lea r2d, [dxq+(65<<6)] ; xpos |
| movd xm5, dyd |
| mov r8d, (63-4)<<6 |
| mov dyq, -4 |
| pshuflw xm5, xm5, q0000 |
| pmullw xm5, [base+z2_ymul] |
| test angled, 0x400 |
| jnz .w4_main ; !enable_intra_edge_filter |
| lea r3d, [hq+2] |
| add angled, 1022 |
| shl r3d, 6 |
| test r3d, angled |
| jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) |
| vpbroadcastd xm3, [base+pb_4] |
| call .upsample_above |
| sub angled, 1075 ; angle - 53 |
| lea r3d, [hq+3] |
| xor angled, 0x7f ; 180 - angle |
| call .filter_strength |
| jmp .w4_filter_left |
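| ; Shared helper: selects the intra edge filter strength. On entry r3d holds |
| ; w+h-1 and angled the angle relative to the edge being filtered; on return |
| ; r3d is a mask whose popcount (1-3) picks the 3-tap kernel from z_filter_k, |
| ; 0 meaning no filtering. |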
| ALIGN function_align |
| .filter_strength: |
| movd xm8, r3d |
| mov r3d, angled |
| movd xm7, angled |
| vpbroadcastb m8, xm8 |
| shr r3d, 8 ; is_sm << 1 |
| vpbroadcastb m7, xm7 |
| pcmpeqb m8, [base+z_filter_wh] |
| mova xm9, [r9+r3*8] |
| pand m0, m8, m7 |
| pcmpgtb m0, m9 |
| pmovmskb r3d, m0 |
| ret |
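| ; Doubles the resolution of the top edge with the (-1 9 9 -1)/16 upsampling |
| ; filter (pb_36_m4 pairs + pmulhrsw by pw_512 = (-4 36 36 -4)/64); dx, xpos |
| ; and the base increment are adjusted to step in half-sample units. |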
| ALIGN function_align |
| .upsample_above: ; w4/w8 |
| pshufb xm2, xm1, [base+z_upsample1-2] |
| pminub xm3, [base+z_filter_s+4] |
| vpbroadcastd xm4, [base+pb_36_m4] |
| vbroadcasti128 m10, [base+pb_0to15] |
| pshufb xm3, xm1, xm3 |
| pmaddubsw xm2, xm4 |
| pmaddubsw xm3, xm4 |
| lea r2d, [r2+dxq+(1<<6)] |
| add dxd, dxd |
| paddw xm2, xm3 |
| pmulhrsw xm2, xm13 |
| sub r8d, 3<<6 |
| paddw m6, m6 |
| packuswb xm2, xm2 |
| punpcklbw xm1, xm2 |
| mova [rsp+gprsize+64], xm1 |
| ret |
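| ; Same upsampling filter applied to the left edge on the stack; dy and the |
| ; accumulated y coordinates in xm5 are doubled to match. |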
| ALIGN function_align |
| .upsample_left: ; h4/h8 |
| mov r3d, hd |
| and r3d, 4 |
| movd xm2, [rsp+gprsize+64] |
| movddup xm0, [rsp+gprsize+56] |
| movd xm1, r3d |
| palignr xm2, xm0, 1 |
| vpbroadcastb xm1, xm1 |
| pshufb xm2, [base+z_filter_s+18] |
| vpbroadcastd xm3, [base+pb_36_m4] |
| pmaxub xm1, [base+z_upsample1-2] |
| pshufb xm1, xm0, xm1 |
| pmaddubsw xm2, xm3 |
| pmaddubsw xm1, xm3 |
| paddw xm5, xm5 |
| add dyq, dyq |
| paddw xm1, xm2 |
| pmulhrsw xm1, xm13 |
| vbroadcasti128 m11, [base+z2_upsample] |
| paddw xm5, xm15 |
| packuswb xm1, xm1 |
| punpcklbw xm0, xm1 |
| mova [rsp+gprsize+48], xm0 |
| ret |
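| ; No upsampling: if .filter_strength returned a nonzero mask, smooth the |
| ; top edge with the selected 3-tap kernel, leaving pixels beyond max_width |
| ; (r6m) unfiltered. |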
| .w4_no_upsample_above: |
| lea r3d, [hq+3] |
| sub angled, 1112 ; angle - 90 |
| call .filter_strength |
| test r3d, r3d |
| jz .w4_no_filter_above |
| popcnt r3d, r3d |
| vpbroadcastd xm2, [base+pb_4] |
| pminub xm2, [base+z_filter_s] |
| vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] |
| vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] |
| pshufb xm3, xm1, xm2 ; 00 01 12 23 |
| pshufd xm2, xm2, q0321 |
| pmaddubsw xm0, xm3, xm0 |
| pshufb xm2, xm1, xm2 ; 12 23 34 44 |
| pmaddubsw xm2, xm4 |
| vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] |
| punpckhqdq xm3, xm3 ; 34 44 44 44 |
| pmaddubsw xm3, xm4 |
| movd xm4, r6m ; max_width |
| pminsw xm4, xm15 |
| vpbroadcastb xm4, xm4 |
| paddw xm0, xm2 |
| paddw xm0, xm3 |
| pmulhrsw xm0, xm13 |
| psubb xm4, [base+pb_1to32] |
| psrlq xm1, 8 |
| packuswb xm0, xm0 |
| vpblendvb xm0, xm1, xm4 |
| movd [rsp+65], xm0 |
| .w4_no_filter_above: |
| lea r3d, [hq+2] |
| add angled, 973 ; angle + 883 |
| shl r3d, 6 |
| test r3d, angled |
| jz .w4_upsample_left ; angle > 140 && h <= 8 && !(is_sm && h == 8) |
| vpbroadcastd xm0, [base+pb_90] |
| psubb xm0, xm7 ; 180 - angle |
| pand xm0, xm8 ; reuse from previous filter_strength call |
| pcmpgtb xm0, xm9 |
| pmovmskb r3d, xm0 |
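| ; Filter up to 16 left-edge pixels with the selected 3-tap kernel; pixels |
| ; beyond max_height (r7m) keep their original values. |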
| .w4_filter_left: |
| test r3d, r3d |
| jz .w4_main |
| popcnt r3d, r3d |
| mov r5d, 10 |
| cmp hd, 16 |
| movu xm2, [rsp+49] |
| vinserti128 m2, [rsp+43], 1 |
| cmovs r5d, hd |
| xor r5d, 15 ; h == 16 ? 5 : 15 - h |
| movd xm0, r5d |
| vbroadcasti128 m1, [base+z_filter_s+12] |
| vbroadcasti128 m4, [base+z_filter_s+16] |
| vinserti128 m3, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab |
| vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd |
| vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef |
| vpbroadcastb m0, xm0 |
| pmaxub m0, m3 |
| vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] |
| pshufb m0, m2, m0 |
| pmaddubsw m0, m3 |
| vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m3 |
| vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] |
| pshufb m2, m4 |
| pmaddubsw m2, m3 |
| movd xm4, r7m ; max_height |
| pminsw xm4, xm15 |
| vpbroadcastb xm4, xm4 |
| psubb xm4, [base+pb_16to1] |
| paddw m1, m0 |
| paddw m1, m2 |
| pmulhrsw m1, m13 |
| vextracti128 xm0, m1, 1 |
| packuswb xm0, xm1 |
| vpblendvb xm0, [rsp+48], xm4 |
| mova [rsp+48], xm0 |
| jmp .w4_main |
| .w4_upsample_left: |
| call .upsample_left |
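| ; Main w4 loop: r2d tracks xpos for the top edge in 1/64-pixel steps while |
| ; base_y offsets in xm4 gather left-edge pixels via vpgatherdq; both |
| ; interpolations use pmaddubsw and are merged per pixel with vpblendvb, |
| ; taking the left-edge result wherever base_x has passed the top-left. |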
| .w4_main: |
| movd xm0, dxd |
| mova m12, [base+z2_y_shuf_h4] |
| lea r5, [rsp+56] ; left-7 |
| vpbroadcastw m0, xm0 |
| lea r9, [strideq*3] |
| psraw xm1, xm5, 6 |
| pand xm5, xm14 ; frac_y |
| pxor xm2, xm2 |
| paddw m7, m0, m0 |
| psubw xm4, xm2, xm1 ; base_y |
| vpblendd m0, m7, 0xcc |
| mova xm1, xm7 |
| punpcklwd xm4, xm2 |
| paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 |
| psubw xm1, xm15, xm5 ; 64-frac_y |
| psllw xm5, 8 |
| paddw m7, m7 |
| paddw m6, m0 |
| por xm5, xm1 ; 64-frac_y, frac_y |
| vpbroadcastq m5, xm5 |
| .w4_loop: |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x0 |
| vpbroadcastq m1, [rsp+r2] |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x1 |
| vpbroadcastq m2, [rsp+r3] |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x2 |
| movq xm0, [rsp+r2] |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x3 |
| movhps xm0, [rsp+r3] |
| vpblendd m1, m2, 0xc0 |
| pand m2, m14, m6 ; frac_x |
| vpblendd m0, m1, 0xf0 |
| psubw m1, m15, m2 ; 64-frac_x |
| psllw m2, 8 |
| pshufb m0, m10 |
| por m1, m2 ; 64-frac_x, frac_x |
| pmaddubsw m0, m1 |
| cmp r3d, 64 |
| jge .w4_toponly |
| mova m1, m7 ; arbitrary negative value |
| vpgatherdq m3, [r5+xm4], m1 |
| pshufb m1, m3, m11 |
| vpermd m1, m12, m1 |
| pmaddubsw m1, m5 |
| psraw m2, m6, 15 ; base_x < topleft |
| vpblendvb m0, m1, m2 |
| .w4_toponly: |
| pmulhrsw m0, m13 |
| paddw m6, m7 ; xpos += dx |
| add r5, dyq |
| packuswb m0, m0 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*2], xm0 |
| pextrd [dstq+r9 ], xm0, 1 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| sub hd, 4 |
| jz .w4_end |
| lea dstq, [dstq+strideq*4] |
| cmp r2d, r8d |
| jge .w4_loop |
| .w4_leftonly_loop: |
| mova m1, m7 |
| vpgatherdq m2, [r5+xm4], m1 |
| add r5, dyq |
| pshufb m0, m2, m11 |
| vpermd m0, m12, m0 |
| pmaddubsw m0, m5 |
| pmulhrsw m0, m13 |
| packuswb m0, m0 |
| vextracti128 xm1, m0, 1 |
| movd [dstq+strideq*2], xm0 |
| pextrd [dstq+r9 ], xm0, 1 |
| movd [dstq+strideq*0], xm1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_leftonly_loop |
| .w4_end: |
| RET |
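| ; w8: same structure as w4, but each ymm holds two 8-pixel rows and the |
| ; left-edge gathers are split across xm8 (rows 0,1,4,5) and xm9 (rows |
| ; 2,3,6,7). |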
| .w8: |
| vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 |
| movd xm5, dyd |
| vbroadcasti128 m10, [base+z_filter_s+2] |
| vbroadcasti128 m11, [base+z2_shuf_h4] |
| lea r2d, [dxq+(65<<6)] ; xpos |
| vpbroadcastw xm5, xm5 |
| mov r8d, (63-8)<<6 |
| mov dyq, -4 |
| pmullw xm5, [base+z2_ymul] |
| test angled, 0x400 |
| jnz .w8_main |
| lea r3d, [angleq+126] |
| mov r3b, hb |
| cmp r3d, 8 |
| ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm |
| vpbroadcastd xm3, [base+pb_8] |
| movhps [rsp+80], xm1 |
| call .upsample_above |
| sub angled, 53 ; angle - 53 |
| lea r3d, [hq+7] |
| xor angled, 0x7f ; 180 - angle |
| call .filter_strength |
| jmp .w8_filter_left |
| .w8_no_upsample_above: |
| lea r3d, [hq+7] |
| sub angled, 90 ; angle - 90 |
| call .filter_strength |
| test r3d, r3d |
| jz .w8_no_filter_above |
| popcnt r3d, r3d |
| vpbroadcastd xm3, [base+pb_8] |
| pminub xm3, [base+z_filter_s+8] |
| vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] |
| vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] |
| pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 |
| pmaddubsw xm0, xm2, xm0 |
| pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 |
| shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 |
| pmaddubsw xm2, xm4 |
| vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] |
| pmaddubsw xm3, xm4 |
| movd xm4, r6m ; max_width |
| pminuw xm4, xm15 |
| vpbroadcastb xm4, xm4 |
| paddw xm0, xm2 |
| paddw xm0, xm3 |
| pmulhrsw xm0, xm13 |
| psubb xm4, [base+pb_1to32] |
| psrldq xm1, 1 |
| packuswb xm0, xm0 |
| vpblendvb xm0, xm1, xm4 |
| movq [rsp+65], xm0 |
| .w8_no_filter_above: |
| lea r3d, [angleq-51] |
| mov r3b, hb |
| cmp r3d, 8 |
| jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm |
| vpbroadcastd m0, [base+pb_90] |
| psubb m0, m7 |
| pand m0, m8 |
| pcmpgtb m0, m9 |
| pmovmskb r3d, m0 |
| .w8_filter_left: |
| test r3d, r3d |
| jz .w8_main |
| popcnt r3d, r3d |
| vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] |
| vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] |
| vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] |
| cmp hd, 32 |
| jne .w8_filter_left_h16 |
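| ; h == 32 filters the left edge in two 16-pixel halves: one here into m3, |
| ; the other by the shared path below; both are then packed, clipped against |
| ; max_height and written back to the stack. |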
| movu xm2, [rsp+27] |
| vinserti128 m2, [rsp+35], 1 |
| vpbroadcastd xm0, [base+pb_5] |
| vbroadcasti128 m3, [base+z_filter_s+ 8] |
| vbroadcasti128 m1, [base+z_filter_s+12] |
| vbroadcasti128 m4, [base+z_filter_s+16] |
| pmaxub m3, m0 |
| pshufb m3, m2, m3 |
| pmaddubsw m3, m7 |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m8 |
| pshufb m2, m4 |
| pmaddubsw m2, m9 |
| paddw m3, m1 |
| paddw m3, m2 |
| pmulhrsw m3, m13 |
| jmp .w8_filter_left_top16 |
| .w8_filter_left_h16: |
| mov r5d, 10 |
| cmp hd, 16 |
| cmovs r5d, hd |
| xor r5d, 15 ; h == 16 ? 5 : 15 - h |
| movd xm0, r5d |
| vpbroadcastb m0, xm0 |
| .w8_filter_left_top16: |
| vbroadcasti128 m1, [base+z_filter_s+12] |
| vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab |
| vbroadcasti128 m4, [base+z_filter_s+16] |
| vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd |
| vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef |
| pmaxub m0, m2 |
| movu xm2, [rsp+49] |
| vinserti128 m2, [rsp+43], 1 |
| pshufb m0, m2, m0 |
| pmaddubsw m0, m7 |
| movd xm7, r7m ; max_height |
| pshufb m1, m2, m1 |
| pmaddubsw m1, m8 |
| pshufb m2, m4 |
| pmaddubsw m2, m9 |
| pminsw xm7, xm15 |
| paddw m1, m0 |
| vpbroadcastb m7, xm7 |
| paddw m1, m2 |
| pmulhrsw m1, m13 |
| psubb m7, [base+pb_32to1] |
| packuswb m3, m1 |
| vpermq m3, m3, q1320 |
| vpblendvb m3, [rsp+32], m7 |
| mova [rsp+32], m3 |
| jmp .w8_main |
| .w8_upsample_left: |
| call .upsample_left |
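| ; Main w8 loop: four rows per iteration; m6 holds xpos for one row pair and |
| ; m4 = m6 + 2*dx for the next, with left-edge rows gathered in two |
| ; vpgatherdq batches and blended in where base_x passes the top-left. |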
| .w8_main: |
| movd xm3, dxd |
| lea r5, [rsp+56] ; left-7 |
| pshufd xm1, xm5, q3120 |
| pand xm5, xm14 |
| vpbroadcastw m3, xm3 |
| pxor xm0, xm0 |
| psubw xm2, xm15, xm5 |
| psraw xm1, 6 |
| lea r9, [strideq*3] |
| paddw m7, m3, m3 |
| psubw xm9, xm0, xm1 ; base_y |
| psllw xm5, 8 |
| punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 |
| vpblendd m3, m7, 0xf0 ; xpos0 xpos1 |
| por xm5, xm2 ; 64-frac_y, frac_y |
| punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 |
| paddw m6, m3 |
| vinserti128 m12, m5, xm5, 1 |
| .w8_loop: |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x0 |
| movu xm0, [rsp+r2] |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x1 |
| vinserti128 m0, [rsp+r3], 1 |
| lea r3d, [r2+dxq] |
| shr r2d, 6 ; base_x2 |
| movu xm1, [rsp+r2] |
| lea r2d, [r3+dxq] |
| shr r3d, 6 ; base_x3 |
| vinserti128 m1, [rsp+r3], 1 |
| pand m2, m14, m6 |
| paddsw m4, m6, m7 |
| psubw m5, m15, m2 |
| psllw m2, 8 |
| pshufb m0, m10 |
| por m2, m5 |
| pmaddubsw m0, m2 |
| pand m2, m14, m4 |
| psubw m5, m15, m2 |
| psllw m2, 8 |
| pshufb m1, m10 |
| por m2, m5 |
| pmaddubsw m1, m2 |
| cmp r3d, 64 |
| jge .w8_toponly |
| mova m5, m7 |
| vpgatherdq m3, [r5+xm9], m7 |
| mova m7, m5 |
| vpgatherdq m2, [r5+xm8], m5 |
| pshufb m3, m11 |
| pshufb m2, m11 |
| punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 |
| punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 |
| vpermq m5, m5, q3120 ; y0 y1 |
| vpermq m2, m2, q3120 ; y2 y3 |
| pmaddubsw m5, m12 |
| pmaddubsw m2, m12 |
| psraw m6, 15 ; base_x < topleft |
| vpblendvb m0, m5, m6 |
| psraw m3, m4, 15 |
| vpblendvb m1, m2, m3 |
| .w8_toponly: |
| pmulhrsw m0, m13 |
| pmulhrsw m1, m13 |
| paddw m6, m4, m7 ; xpos += dx |
| add r5, dyq |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*2], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+r9 ], xm1 |
| sub hd, 4 |
| jz .w8_end |
| lea dstq, [dstq+strideq*4] |
| cmp r2d, r8d |
| jge .w8_loop |
| .w8_leftonly_loop: |
| mova m0, m7 |
| vpgatherdq m5, [r5+xm9], m7 |
| mova m7, m0 |
| vpgatherdq m3, [r5+xm8], m0 |
| add r5, dyq |
| pshufb m2, m5, m11 |
| pshufb m1, m3, m11 |
| punpckldq m0, m1, m2 |
| punpckhdq m1, m2 |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q3120 |
| pmaddubsw m0, m12 |
| pmaddubsw m1, m12 |
| pmulhrsw m0, m13 |
| pmulhrsw m1, m13 |
| packuswb m0, m1 |
| vextracti128 xm1, m0, 1 |
| movq [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*2], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+r9 ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_leftonly_loop |
| .w8_end: |
| RET |
| .w16: |
| mov r8d, hd |
| test angled, 0x400 |
| jnz .w16_main |
| lea r3d, [hq+15] |
| sub angled, 90 |
| call .filter_strength |
| test r3d, r3d |
|