| ; Copyright © 2020, VideoLAN and dav1d authors |
| ; Copyright © 2020, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 64 |
| |
| %macro SMOOTH_WEIGHT_TABLE 1-* |
| %rep %0 |
| db %1-128, 127-%1 |
| %rotate 1 |
| %endrep |
| %endmacro |
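| ; Each weight w is stored as the signed byte pair (w-128, 127-w), so a |
| ; single pmaddubsw against an unsigned (a, b) byte pair computes |
| ; a*(w-128) + b*(127-w). Adding the precomputed 128*a + 129*b + 128 term |
| ; (built from pb_127_m127 and pw_128) yields w*a + (256-w)*b + 128, and |
| ; the smooth_end permutes keep the high byte of each word, i.e. >> 8. |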
| |
| smooth_weights: SMOOTH_WEIGHT_TABLE \ |
| 0, 0, 255, 128, 255, 149, 85, 64, \ |
| 255, 197, 146, 105, 73, 50, 37, 32, \ |
| 255, 225, 196, 170, 145, 123, 102, 84, \ |
| 68, 54, 43, 33, 26, 20, 17, 16, \ |
| 255, 240, 225, 210, 196, 182, 169, 157, \ |
| 145, 133, 122, 111, 101, 92, 83, 74, \ |
| 66, 59, 52, 45, 39, 34, 29, 25, \ |
| 21, 17, 14, 12, 10, 9, 8, 8, \ |
| 255, 248, 240, 233, 225, 218, 210, 203, \ |
| 196, 189, 182, 176, 169, 163, 156, 150, \ |
| 144, 138, 133, 127, 121, 116, 111, 106, \ |
| 101, 96, 91, 86, 82, 77, 73, 69, \ |
| 65, 61, 57, 54, 50, 47, 44, 41, \ |
| 38, 35, 32, 29, 27, 25, 22, 20, \ |
| 18, 16, 15, 13, 12, 10, 9, 8, \ |
| 7, 6, 6, 5, 5, 4, 4, 4 |
| |
| ; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __ |
| filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10 |
| db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6 |
| db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0 |
| db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0 |
| db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 |
| db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 |
| db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0 |
| db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0 |
| db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8 |
| db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4 |
| db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0 |
| db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0 |
| db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8 |
| db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4 |
| db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0 |
| db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0 |
| db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14 |
| db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12 |
| db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0 |
| db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0 |
| filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31 |
| db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131 |
| db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147 |
| db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163 |
| filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31 |
| smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9 |
| db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13 |
| db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11 |
| db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15 |
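| ; smooth_endA/B select the odd byte of every word when used as vpermb/ |
| ; vpermt2b indices, i.e. a final >> 8; endB interleaves its two sources. |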
| smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 |
| db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 |
| db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 |
| db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 |
| smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 |
| db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95 |
| db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111 |
| db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 |
| ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 |
| db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 |
| |
| pb_127_m127: times 2 db 127, -127 |
| pb_128: times 4 db 128 |
| pw_128: times 2 dw 128 |
| pw_255: times 2 dw 255 |
| |
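| ; Byte patterns that already exist inside the tables above are aliased |
| ; instead of duplicated: pb_1/pb_2/pb_3 point at runs of 1s/2s/3s within |
| ; ipred_h_shuf, and pd_8 points at a dword with the value 8 in filter_taps. |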
| %define pb_1 (ipred_h_shuf+24) |
| %define pb_2 (ipred_h_shuf+20) |
| %define pb_3 (ipred_h_shuf+16) |
| %define pd_8 (filter_taps+128) |
| |
| %macro JMP_TABLE 3-* |
| %xdefine %1_%2_table (%%table - 2*4) |
| %xdefine %%base mangle(private_prefix %+ _%1_%2) |
| %%table: |
| %rep %0 - 2 |
| dd %%base %+ .%3 - (%%table - 2*4) |
| %rotate 1 |
| %endrep |
| %endmacro |
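| ; The table base is biased by -2*4 because the smallest block width is 4, |
| ; so tzcnt(w) starts at 2 and [table+wq*4] hits the first entry for w4. |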
| |
| %define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4) |
| |
| JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 |
| JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ |
| s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 |
| JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 |
| JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64 |
| |
| SECTION .text |
| |
| INIT_ZMM avx512icl |
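| ; dc_top/dc_left sum the edge pixels with a single vpdpbusd against pb_1 |
| ; (a dot product with 1s), then form dc = (sum + n/2) >> log2(n), with the |
| ; shift amount (n = w or h) held in xmm3 for the vpsrlvd at the end. |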
| cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h |
| lea r5, [ipred_dc_left_8bpc_avx512icl_table] |
| movd xm0, wm |
| tzcnt wd, wm |
| inc tlq |
| movifnidn hd, hm |
| movu ym1, [tlq] |
| movd xmm3, wd |
| movsxd r6, [r5+wq*4] |
| vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] |
| psrld xm0, 1 |
| vpdpbusd ym0, ym1, ym2 |
| add r6, r5 |
| add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| |
| cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_dc_left_8bpc_avx512icl_table] |
| mov hd, hm |
| tzcnt r6d, hd |
| sub tlq, hq |
| tzcnt wd, wm |
| movd xm0, hm |
| movu ym1, [tlq] |
| movd xmm3, r6d |
| movsxd r6, [r5+r6*4] |
| vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] |
| psrld xm0, 1 |
| vpdpbusd ym0, ym1, ym2 |
| add r6, r5 |
| add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp r6 |
| .h64: |
| movu ym1, [tlq+32] ; unaligned when jumping here from dc_top |
| vpdpbusd ym0, ym1, ym2 |
| .h32: |
| vextracti32x4 xm1, ym0, 1 |
| paddd xm0, xm1 |
| .h16: |
| punpckhqdq xm1, xm0, xm0 |
| paddd xm0, xm1 |
| .h8: |
| psrlq xm1, xm0, 32 |
| paddd xm0, xm1 |
| .h4: |
| vpsrlvd xm0, xmm3 |
| lea stride3q, [strideq*3] |
| vpbroadcastb m0, xm0 |
| jmp wq |
| |
| cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 |
| movifnidn hd, hm |
| movifnidn wd, wm |
| tzcnt r6d, hd |
| lea r5d, [wq+hq] |
| movd xm0, r5d |
| tzcnt r5d, r5d |
| movd xmm4, r5d |
| lea r5, [ipred_dc_8bpc_avx512icl_table] |
| tzcnt wd, wd |
| movsxd r6, [r5+r6*4] |
| movsxd wq, [r5+wq*4+5*4] |
| vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1] |
| psrld xm0, 1 |
| add r6, r5 |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp r6 |
| .h4: |
| movd xmm1, [tlq-4] |
| vpdpbusd xm0, xmm1, xm3 |
| jmp wq |
| .w4: |
| movd xmm1, [tlq+1] |
| vpdpbusd xm0, xmm1, xm3 |
| cmp hd, 4 |
| jg .w4_mul |
| psrlw xmm0, xm0, 3 |
| jmp .w4_end |
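| ; Rectangular blocks divide by w+h, which is not a power of two. psrld |
| ; removes the power-of-two factor and pmulhuw then multiplies by a 16-bit |
| ; reciprocal: 0x5556 = ceil(2^16/3) and 0x3334 = ceil(2^16/5), with shrx |
| ; picking the word for the current height (shift counts wrap mod 32). |
| ; The same constants are used by the w8-w64 paths below. |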
| .w4_mul: |
| punpckhqdq xmm1, xm0, xm0 |
| lea r2d, [hq*2] |
| mov r6d, 0x55563334 |
| paddd xmm1, xm0 |
| shrx r6d, r6d, r2d |
| psrlq xmm0, xmm1, 32 |
| paddd xmm0, xmm1 |
| movd xmm1, r6d |
| psrld xmm0, 2 |
| pmulhuw xmm0, xmm1 |
| .w4_end: |
| vpbroadcastb xm0, xmm0 |
| .s4: |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm0 |
| movd [dstq+strideq*2], xm0 |
| movd [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s4 |
| RET |
| .h8: |
| movq xmm1, [tlq-8] |
| vpdpbusd xm0, xmm1, xm3 |
| jmp wq |
| .w8: |
| movq xmm1, [tlq+1] |
| vextracti32x4 xm2, ym0, 1 |
| vpdpbusd xm0, xmm1, xm3 |
| paddd xmm2, xm2, xm0 |
| punpckhqdq xmm0, xmm2, xmm2 |
| paddd xmm0, xmm2 |
| psrlq xmm1, xmm0, 32 |
| paddd xmm0, xmm1 |
| vpsrlvd xmm0, xmm4 |
| cmp hd, 8 |
| je .w8_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| cmp hd, 32 |
| cmove r6d, r2d |
| movd xmm1, r6d |
| pmulhuw xmm0, xmm1 |
| .w8_end: |
| vpbroadcastb xm0, xmm0 |
| .s8: |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm0 |
| movq [dstq+strideq*2], xm0 |
| movq [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s8 |
| RET |
| .h16: |
| mova xmm1, [tlq-16] |
| vpdpbusd xm0, xmm1, xm3 |
| jmp wq |
| .w16: |
| movu xmm1, [tlq+1] |
| vextracti32x4 xm2, ym0, 1 |
| vpdpbusd xm0, xmm1, xm3 |
| paddd xmm2, xm2, xm0 |
| punpckhqdq xmm0, xmm2, xmm2 |
| paddd xmm0, xmm2 |
| psrlq xmm1, xmm0, 32 |
| paddd xmm0, xmm1 |
| vpsrlvd xmm0, xmm4 |
| cmp hd, 16 |
| je .w16_end |
| mov r6d, 0x5556 |
| mov r2d, 0x3334 |
| test hb, 8|32 |
| cmovz r6d, r2d |
| movd xmm1, r6d |
| pmulhuw xmm0, xmm1 |
| .w16_end: |
| vpbroadcastb xm0, xmm0 |
| .s16: |
| mova [dstq+strideq*0], xm0 |
| mova [dstq+strideq*1], xm0 |
| mova [dstq+strideq*2], xm0 |
| mova [dstq+stride3q ], xm0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s16 |
| RET |
| .h32: |
| mova ym1, [tlq-32] |
| vpdpbusd ym0, ym1, ym3 |
| jmp wq |
| .w32: |
| movu ym1, [tlq+1] |
| vpdpbusd ym0, ym1, ym3 |
| vextracti32x4 xm1, ym0, 1 |
| paddd xmm1, xm1, xm0 |
| punpckhqdq xmm0, xmm1, xmm1 |
| paddd xmm0, xmm1 |
| psrlq xmm1, xmm0, 32 |
| paddd xmm0, xmm1 |
| vpsrlvd xmm0, xmm4 |
| cmp hd, 32 |
| je .w32_end |
| lea r2d, [hq*2] |
| mov r6d, 0x33345556 |
| shrx r6d, r6d, r2d |
| movd xmm1, r6d |
| pmulhuw xmm0, xmm1 |
| .w32_end: |
| vpbroadcastb ym0, xmm0 |
| .s32: |
| mova [dstq+strideq*0], ym0 |
| mova [dstq+strideq*1], ym0 |
| mova [dstq+strideq*2], ym0 |
| mova [dstq+stride3q ], ym0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s32 |
| RET |
| .h64: |
| mova ym1, [tlq-64] |
| mova ym2, [tlq-32] |
| vpdpbusd ym0, ym1, ym3 |
| vpdpbusd ym0, ym2, ym3 |
| jmp wq |
| .w64: |
| movu ym1, [tlq+ 1] |
| movu ym2, [tlq+33] |
| vpdpbusd ym0, ym1, ym3 |
| vpdpbusd ym0, ym2, ym3 |
| vextracti32x4 xm1, ym0, 1 |
| paddd xmm1, xm1, xm0 |
| punpckhqdq xmm0, xmm1, xmm1 |
| paddd xmm0, xmm1 |
| psrlq xmm1, xmm0, 32 |
| paddd xmm0, xmm1 |
| vpsrlvd xmm0, xmm4 |
| cmp hd, 64 |
| je .w64_end |
| mov r6d, 0x33345556 |
| shrx r6d, r6d, hd |
| movd xmm1, r6d |
| pmulhuw xmm0, xmm1 |
| .w64_end: |
| vpbroadcastb m0, xmm0 |
| .s64: |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m0 |
| mova [dstq+strideq*2], m0 |
| mova [dstq+stride3q ], m0 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .s64 |
| RET |
| |
| cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_dc_splat_8bpc_avx512icl_table] |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
| cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 |
| lea r5, [ipred_dc_splat_8bpc_avx512icl_table] |
| tzcnt wd, wm |
| movu m0, [tlq+1] |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| lea stride3q, [strideq*3] |
| jmp wq |
| |
| cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3 |
| %define base r6-ipred_h_8bpc_avx512icl_table |
| lea r6, [ipred_h_8bpc_avx512icl_table] |
| tzcnt wd, wm |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| lea stride3q, [strideq*3] |
| sub tlq, hq |
| add wq, r6 |
| jmp wq |
| .w4: |
| mova xmm1, [base+ipred_h_shuf+16] |
| .w4_loop: |
| movd xmm0, [tlq+hq-4] |
| pshufb xmm0, xmm1 |
| movd [dstq+strideq*0], xmm0 |
| pextrd [dstq+strideq*1], xmm0, 1 |
| pextrd [dstq+strideq*2], xmm0, 2 |
| pextrd [dstq+stride3q ], xmm0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4_loop |
| RET |
| .w8: |
| movsldup xmm2, [base+ipred_h_shuf+16] |
| movshdup xmm3, [base+ipred_h_shuf+16] |
| .w8_loop: |
| movd xmm1, [tlq+hq-4] |
| pshufb xmm0, xmm1, xmm2 |
| pshufb xmm1, xmm3 |
| movq [dstq+strideq*0], xmm0 |
| movq [dstq+strideq*1], xmm1 |
| movhps [dstq+strideq*2], xmm0 |
| movhps [dstq+stride3q ], xmm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| .w16: |
| movsldup m1, [base+smooth_shuf] |
| .w16_loop: |
| vpbroadcastd m0, [tlq+hq-4] |
| pshufb m0, m1 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], m0, 2 |
| vextracti32x4 [dstq+strideq*2], ym0, 1 |
| vextracti32x4 [dstq+stride3q ], m0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w16_loop |
| RET |
| .w32: |
| vpbroadcastd ym3, [base+pb_1] |
| vpord m2, m3, [base+pb_2] {1to16} |
| .w32_loop: |
| vpbroadcastd m1, [tlq+hq-4] |
| pshufb m0, m1, m2 |
| pshufb m1, m3 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| mova [dstq+strideq*2], ym1 |
| vextracti32x8 [dstq+stride3q ], m1, 1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w32_loop |
| RET |
| .w64: |
| vpbroadcastd m4, [base+pb_3] |
| vpbroadcastd m5, [base+pb_2] |
| vpbroadcastd m6, [base+pb_1] |
| pxor m7, m7 |
| .w64_loop: |
| vpbroadcastd m3, [tlq+hq-4] |
| pshufb m0, m3, m4 |
| pshufb m1, m3, m5 |
| pshufb m2, m3, m6 |
| pshufb m3, m7 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+stride3q ], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w64_loop |
| RET |
| |
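| ; Paeth: base = left + top - topleft; predict with whichever of left, top |
| ; and topleft is closest to base. On entry m4 = left, m5 = topleft, |
| ; m6 = top and m7 = ldiff = abs(top - topleft), precomputed per set of |
| ; top pixels. tldiff = abs(left + top - 2*topleft) is derived from pavgb: |
| ; the saturated difference against avg(top, left) is doubled and the |
| ; rounding bit (left ^ top) & 1 restored, giving min(tldiff, 255). |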
| %macro PAETH 0 |
| psubusb m1, m5, m4 |
| psubusb m0, m4, m5 |
| por m1, m0 ; tdiff |
| pavgb m2, m6, m4 |
| vpcmpub k1, m1, m7, 1 ; tdiff < ldiff |
| vpblendmb m0{k1}, m4, m6 |
| vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 |
| psubusb m3, m5, m2 |
| psubb m2, m4 |
| psubusb m2, m5 |
| por m2, m3 |
| pminub m1, m7 |
| paddusb m2, m2 |
| por m2, m4 ; min(tldiff, 255) |
| vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff |
| vmovdqu8 m0{k1}, m5 |
| %endmacro |
| |
| cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 |
| lea r6, [ipred_paeth_8bpc_avx512icl_table] |
| tzcnt wd, wm |
| vpbroadcastb m5, [tlq] ; topleft |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1] |
| lea topq, [tlq+1] |
| sub tlq, hq |
| add wq, r6 |
| lea stride3q, [strideq*3] |
| jmp wq |
| INIT_YMM avx512icl |
| .w4: |
| vpbroadcastd m6, [topq] |
| mova m9, [ipred_h_shuf] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 ; ldiff |
| .w4_loop: |
| vpbroadcastq m4, [tlq+hq-8] |
| pshufb m4, m9 ; left |
| PAETH |
| movd [dstq+strideq*0], xm0 |
| pextrd [dstq+strideq*1], xm0, 1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+stride3q ], xm0, 3 |
| sub hd, 8 |
| jl .w4_ret |
| vextracti32x4 xm0, m0, 1 |
| lea dstq, [dstq+strideq*4] |
| movd [dstq+strideq*0], xm0 |
| pextrd [dstq+strideq*1], xm0, 1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+stride3q ], xm0, 3 |
| lea dstq, [dstq+strideq*4] |
| jg .w4_loop |
| .w4_ret: |
| RET |
| INIT_ZMM avx512icl |
| .w8: |
| vpbroadcastq m6, [topq] |
| movsldup m9, [smooth_shuf] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w8_loop: |
| vpbroadcastq m4, [tlq+hq-8] |
| pshufb m4, m9 |
| PAETH |
| vextracti32x4 xm1, m0, 2 |
| vextracti32x4 xm2, ym0, 1 |
| vextracti32x4 xm3, m0, 3 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movq [dstq+strideq*2], xm2 |
| movq [dstq+stride3q ], xm3 |
| sub hd, 8 |
| jl .w8_ret |
| lea dstq, [dstq+strideq*4] |
| movhps [dstq+strideq*0], xm0 |
| movhps [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm2 |
| movhps [dstq+stride3q ], xm3 |
| lea dstq, [dstq+strideq*4] |
| jg .w8_loop |
| .w8_ret: |
| RET |
| .w16: |
| vbroadcasti32x4 m6, [topq] |
| movsldup m9, [smooth_shuf] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w16_loop: |
| vpbroadcastd m4, [tlq+hq-4] |
| pshufb m4, m9 |
| PAETH |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], m0, 2 |
| vextracti32x4 [dstq+strideq*2], ym0, 1 |
| vextracti32x4 [dstq+stride3q ], m0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w16_loop |
| RET |
| .w32: |
| vbroadcasti32x8 m6, [topq] |
| mova ym9, ym8 |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w32_loop: |
| vpbroadcastd m4, [tlq+hq-2] |
| pshufb m4, m9 |
| PAETH |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w32_loop |
| RET |
| .w64: |
| movu m6, [topq] |
| psubusb m7, m5, m6 |
| psubusb m0, m6, m5 |
| por m7, m0 |
| .w64_loop: |
| vpbroadcastb m4, [tlq+hq-1] |
| PAETH |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| |
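| ; smooth_v: pred(x, y) = (w[y]*top[x] + (256-w[y])*bottom + 128) >> 8, |
| ; with w[] taken from the smooth_weights row for the block height. |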
| cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 |
| %define base r6-ipred_smooth_v_8bpc_avx512icl_table |
| lea r6, [ipred_smooth_v_8bpc_avx512icl_table] |
| tzcnt wd, wm |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| vpbroadcastd m0, [base+pb_127_m127] |
| vpbroadcastd m1, [base+pw_128] |
| lea weightsq, [base+smooth_weights+hq*4] |
| neg hq |
| vpbroadcastb m4, [tlq+hq] ; bottom |
| add wq, r6 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4: |
| vpbroadcastd m2, [tlq+1] |
| movshdup m5, [smooth_shuf] |
| mova ym6, [smooth_endA] |
| punpcklbw m2, m4 ; top, bottom |
| pmaddubsw m3, m2, m0 |
| paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok |
| paddw m3, m1 ; 128 * top + 129 * bottom + 128 |
| .w4_loop: |
| vbroadcasti32x4 m0, [weightsq+hq*2] |
| pshufb m0, m5 |
| pmaddubsw m0, m2, m0 |
| paddw m0, m3 |
| vpermb m0, m6, m0 |
| vextracti32x4 xm1, ym0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+stride3q ], xm1, 2 |
| add hq, 8 |
| jg .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+stride3q ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| jl .w4_loop |
| .ret: |
| RET |
| .w8: |
| vpbroadcastq m2, [tlq+1] |
| movshdup m5, [smooth_shuf] |
| mova ym6, [smooth_endA] |
| punpcklbw m2, m4 |
| pmaddubsw m3, m2, m0 |
| paddw m1, m2 |
| paddw m3, m1 |
| .w8_loop: |
| vpbroadcastq m0, [weightsq+hq*2] |
| pshufb m0, m5 |
| pmaddubsw m0, m2, m0 |
| paddw m0, m3 |
| vpermb m0, m6, m0 |
| vextracti32x4 xm1, ym0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+stride3q ], xm1 |
| lea dstq, [dstq+strideq*4] |
| add hq, 4 |
| jl .w8_loop |
| RET |
| .w16: |
| vbroadcasti32x4 m3, [tlq+1] |
| movshdup m6, [smooth_shuf] |
| mova m7, [smooth_endB] |
| punpcklbw m2, m3, m4 |
| punpckhbw m3, m4 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| .w16_loop: |
| vpbroadcastq m1, [weightsq+hq*2] |
| pshufb m1, m6 |
| pmaddubsw m0, m2, m1 |
| pmaddubsw m1, m3, m1 |
| paddw m0, m4 |
| paddw m1, m5 |
| vpermt2b m0, m7, m1 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], m0, 2 |
| vextracti32x4 [dstq+strideq*2], ym0, 1 |
| vextracti32x4 [dstq+stride3q ], m0, 3 |
| lea dstq, [dstq+strideq*4] |
| add hq, 4 |
| jl .w16_loop |
| RET |
| .w32: |
| vbroadcasti32x8 m3, [tlq+1] |
| movshdup m6, [smooth_shuf] |
| mova m7, [smooth_endB] |
| punpcklbw m2, m3, m4 |
| punpckhbw m3, m4 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| .w32_loop: |
| vpbroadcastd m1, [weightsq+hq*2] |
| pshufb m1, m6 |
| pmaddubsw m0, m2, m1 |
| pmaddubsw m1, m3, m1 |
| paddw m0, m4 |
| paddw m1, m5 |
| vpermt2b m0, m7, m1 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| add hq, 2 |
| jl .w32_loop |
| RET |
| .w64: |
| movu m3, [tlq+1] |
| mova m6, [smooth_endB] |
| punpcklbw m2, m3, m4 |
| punpckhbw m3, m4 |
| pmaddubsw m4, m2, m0 |
| pmaddubsw m5, m3, m0 |
| paddw m0, m1, m2 |
| paddw m1, m3 |
| paddw m4, m0 |
| paddw m5, m1 |
| .w64_loop: |
| vpbroadcastw m1, [weightsq+hq*2] |
| pmaddubsw m0, m2, m1 |
| pmaddubsw m1, m3, m1 |
| paddw m0, m4 |
| paddw m1, m5 |
| vpermt2b m0, m6, m1 |
| mova [dstq], m0 |
| add dstq, strideq |
| inc hq |
| jl .w64_loop |
| RET |
| |
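| ; smooth_h: pred(x, y) = (w[x]*left[y] + (256-w[x])*right + 128) >> 8. |
| ; k1 (every other byte, from pw_128) makes the masked vpshufb merge the |
| ; left pixels into the even bytes of the broadcast right-pixel vector. |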
| cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 |
| %define base r5-ipred_smooth_h_8bpc_avx512icl_table |
| lea r5, [ipred_smooth_h_8bpc_avx512icl_table] |
| mov r6d, wd |
| tzcnt wd, wd |
| vpbroadcastb m4, [tlq+r6] ; right |
| mov hd, hm |
| movsxd wq, [r5+wq*4] |
| vpbroadcastd m5, [base+pb_127_m127] |
| vpbroadcastd m6, [base+pw_128] |
| sub tlq, hq |
| add wq, r5 |
| vpmovb2m k1, m6 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4: |
| movsldup m3, [smooth_shuf] |
| vpbroadcastq m7, [smooth_weights+4*2] |
| mova ym8, [smooth_endA] |
| .w4_loop: |
| vpbroadcastq m0, [tlq+hq-8] |
| mova m2, m4 |
| vpshufb m2{k1}, m0, m3 ; left, right |
| pmaddubsw m0, m2, m5 |
| pmaddubsw m1, m2, m7 |
| paddw m2, m6 |
| paddw m0, m2 |
| paddw m0, m1 |
| vpermb m0, m8, m0 |
| vextracti32x4 xm1, ym0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+stride3q ], xm1, 2 |
| sub hd, 8 |
| jl .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+stride3q ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| jg .w4_loop |
| .ret: |
| RET |
| .w8: |
| movsldup m3, [smooth_shuf] |
| vbroadcasti32x4 m7, [smooth_weights+8*2] |
| mova ym8, [smooth_endA] |
| .w8_loop: |
| vpbroadcastd m0, [tlq+hq-4] |
| mova m2, m4 |
| vpshufb m2{k1}, m0, m3 |
| pmaddubsw m0, m2, m5 |
| pmaddubsw m1, m2, m7 |
| paddw m2, m6 |
| paddw m0, m2 |
| paddw m0, m1 |
| vpermb m0, m8, m0 |
| vextracti32x4 xm1, ym0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+stride3q ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| .w16: |
| movsldup m7, [smooth_shuf] |
| vbroadcasti32x4 m8, [smooth_weights+16*2] |
| vbroadcasti32x4 m9, [smooth_weights+16*3] |
| mova m10, [smooth_endB] |
| .w16_loop: |
| vpbroadcastd m0, [tlq+hq-4] |
| mova m3, m4 |
| vpshufb m3{k1}, m0, m7 |
| pmaddubsw m2, m3, m5 |
| pmaddubsw m0, m3, m8 |
| pmaddubsw m1, m3, m9 |
| paddw m3, m6 |
| paddw m2, m3 |
| paddw m0, m2 |
| paddw m1, m2 |
| vpermt2b m0, m10, m1 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], m0, 2 |
| vextracti32x4 [dstq+strideq*2], ym0, 1 |
| vextracti32x4 [dstq+stride3q ], m0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w16_loop |
| RET |
| .w32: |
| mova m10, [smooth_endA] |
| vpbroadcastd ym7, [pb_1] |
| vbroadcasti32x8 m8, [smooth_weights+32*2] |
| vbroadcasti32x8 m9, [smooth_weights+32*3] |
| vshufi32x4 m10, m10, q3120 |
| .w32_loop: |
| vpbroadcastd m0, [tlq+hq-2] |
| mova m3, m4 |
| vpshufb m3{k1}, m0, m7 |
| pmaddubsw m2, m3, m5 |
| pmaddubsw m0, m3, m8 |
| pmaddubsw m1, m3, m9 |
| paddw m3, m6 |
| paddw m2, m3 |
| paddw m0, m2 |
| paddw m1, m2 |
| vpermt2b m0, m10, m1 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w32_loop |
| RET |
| .w64: |
| mova m7, [smooth_weights+64*2] |
| mova m8, [smooth_weights+64*3] |
| mova m9, [smooth_endA] |
| .w64_loop: |
| mova m3, m4 |
| vpbroadcastb m3{k1}, [tlq+hq-1] |
| pmaddubsw m2, m3, m5 |
| pmaddubsw m0, m3, m7 |
| pmaddubsw m1, m3, m8 |
| paddw m3, m6 |
| paddw m2, m3 |
| paddw m0, m2 |
| paddw m1, m2 |
| vpermt2b m0, m9, m1 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| |
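| ; smooth: the two directions are combined as (pred_v + pred_h + 256) >> 9. |
| ; The vertical term carries a 255 bias (pw_255 instead of pw_128), so that |
| ; pavgw contributes the remaining +1 and the >> 1, and the high-byte |
| ; permute supplies the final >> 8. |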
| cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 |
| %define base r5-ipred_smooth_8bpc_avx512icl_table |
| lea r5, [ipred_smooth_8bpc_avx512icl_table] |
| mov r6d, wd |
| tzcnt wd, wd |
| mov hd, hm |
| vpbroadcastb m6, [tlq+r6] ; right |
| sub tlq, hq |
| movsxd wq, [r5+wq*4] |
| vpbroadcastd m7, [base+pb_127_m127] |
| vpbroadcastb m0, [tlq] ; bottom |
| vpbroadcastd m1, [base+pw_255] |
| add wq, r5 |
| lea v_weightsq, [base+smooth_weights+hq*2] |
| vpmovb2m k1, m1 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4: |
| vpbroadcastd m8, [tlq+hq+1] |
| movsldup m4, [smooth_shuf] |
| movshdup m5, [smooth_shuf] |
| vpbroadcastq m9, [smooth_weights+4*2] |
| mova ym11, [smooth_endA] |
| punpcklbw m8, m0 ; top, bottom |
| pmaddubsw m10, m8, m7 |
| paddw m1, m8 ; 1 * top + 256 * bottom + 255 |
| paddw m10, m1 ; 128 * top + 129 * bottom + 255 |
| .w4_loop: |
| vpbroadcastq m1, [tlq+hq-8] |
| vbroadcasti32x4 m0, [v_weightsq] |
| add v_weightsq, 16 |
| mova m2, m6 |
| vpshufb m2{k1}, m1, m4 ; left, right |
| pmaddubsw m1, m2, m7 ; 127 * left - 127 * right |
| pshufb m0, m5 |
| pmaddubsw m0, m8, m0 |
| paddw m1, m2 ; 128 * left + 129 * right |
| pmaddubsw m2, m9 |
| paddw m0, m10 |
| paddw m1, m2 |
| pavgw m0, m1 |
| vpermb m0, m11, m0 |
| vextracti32x4 xm1, ym0, 1 |
| movd [dstq+strideq*0], xm0 |
| movd [dstq+strideq*1], xm1 |
| pextrd [dstq+strideq*2], xm0, 2 |
| pextrd [dstq+stride3q ], xm1, 2 |
| sub hd, 8 |
| jl .ret |
| lea dstq, [dstq+strideq*4] |
| pextrd [dstq+strideq*0], xm0, 1 |
| pextrd [dstq+strideq*1], xm1, 1 |
| pextrd [dstq+strideq*2], xm0, 3 |
| pextrd [dstq+stride3q ], xm1, 3 |
| lea dstq, [dstq+strideq*4] |
| jg .w4_loop |
| .ret: |
| RET |
| .w8: |
| vpbroadcastq m8, [tlq+hq+1] |
| movsldup m4, [smooth_shuf] |
| movshdup m5, [smooth_shuf] |
| vbroadcasti32x4 m9, [smooth_weights+8*2] |
| mova ym11, [smooth_endA] |
| punpcklbw m8, m0 |
| pmaddubsw m10, m8, m7 |
| paddw m1, m8 |
| paddw m10, m1 |
| .w8_loop: |
| vpbroadcastd m1, [tlq+hq-4] |
| vpbroadcastq m0, [v_weightsq] |
| add v_weightsq, 8 |
| mova m2, m6 |
| vpshufb m2{k1}, m1, m4 |
| pmaddubsw m1, m2, m7 |
| pshufb m0, m5 |
| pmaddubsw m0, m8, m0 |
| paddw m1, m2 |
| pmaddubsw m2, m9 |
| paddw m0, m10 |
| paddw m1, m2 |
| pavgw m0, m1 |
| vpermb m0, m11, m0 |
| vextracti32x4 xm1, ym0, 1 |
| movq [dstq+strideq*0], xm0 |
| movq [dstq+strideq*1], xm1 |
| movhps [dstq+strideq*2], xm0 |
| movhps [dstq+stride3q ], xm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8_loop |
| RET |
| .w16: |
| vbroadcasti32x4 m9, [tlq+hq+1] |
| movsldup m5, [smooth_shuf] |
| movshdup m10, [smooth_shuf] |
| vbroadcasti32x4 m11, [smooth_weights+16*2] |
| vbroadcasti32x4 m12, [smooth_weights+16*3] |
| mova m15, [smooth_endB] |
| punpcklbw m8, m9, m0 |
| punpckhbw m9, m0 |
| pmaddubsw m13, m8, m7 |
| pmaddubsw m14, m9, m7 |
| paddw m0, m1, m8 |
| paddw m1, m9 |
| paddw m13, m0 |
| paddw m14, m1 |
| .w16_loop: |
| vpbroadcastd m0, [tlq+hq-4] |
| vpbroadcastq m1, [v_weightsq] |
| add v_weightsq, 8 |
| mova m4, m6 |
| vpshufb m4{k1}, m0, m5 |
| pmaddubsw m2, m4, m7 |
| pshufb m1, m10 |
| pmaddubsw m0, m8, m1 |
| pmaddubsw m1, m9, m1 |
| paddw m2, m4 |
| pmaddubsw m3, m4, m11 |
| pmaddubsw m4, m12 |
| paddw m0, m13 |
| paddw m1, m14 |
| paddw m3, m2 |
| paddw m4, m2 |
| pavgw m0, m3 |
| pavgw m1, m4 |
| vpermt2b m0, m15, m1 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], m0, 2 |
| vextracti32x4 [dstq+strideq*2], ym0, 1 |
| vextracti32x4 [dstq+stride3q ], m0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w16_loop |
| RET |
| .w32: |
| vbroadcasti32x8 m9, [tlq+hq+1] |
| movshdup m10, [smooth_shuf] |
| mova m12, [smooth_weights+32*2] |
| vpbroadcastd ym5, [pb_1] |
| mova m15, [smooth_endB] |
| punpcklbw m8, m9, m0 |
| punpckhbw m9, m0 |
| pmaddubsw m13, m8, m7 |
| pmaddubsw m14, m9, m7 |
| vshufi32x4 m11, m12, m12, q2020 |
| vshufi32x4 m12, m12, q3131 |
| paddw m0, m1, m8 |
| paddw m1, m9 |
| paddw m13, m0 |
| paddw m14, m1 |
| .w32_loop: |
| vpbroadcastd m0, [tlq+hq-2] |
| vpbroadcastd m1, [v_weightsq] |
| add v_weightsq, 4 |
| mova m4, m6 |
| vpshufb m4{k1}, m0, m5 |
| pmaddubsw m2, m4, m7 |
| pshufb m1, m10 |
| pmaddubsw m0, m8, m1 |
| pmaddubsw m1, m9, m1 |
| paddw m2, m4 |
| pmaddubsw m3, m4, m11 |
| pmaddubsw m4, m12 |
| paddw m0, m13 |
| paddw m1, m14 |
| paddw m3, m2 |
| paddw m4, m2 |
| pavgw m0, m3 |
| pavgw m1, m4 |
| vpermt2b m0, m15, m1 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w32_loop |
| RET |
| .w64: |
| movu m9, [tlq+hq+1] |
| mova m11, [smooth_weights+64*2] |
| mova m2, [smooth_weights+64*3] |
| mova m14, [smooth_endB] |
| punpcklbw m8, m9, m0 |
| punpckhbw m9, m0 |
| pmaddubsw m12, m8, m7 |
| pmaddubsw m13, m9, m7 |
| vshufi32x4 m10, m11, m2, q2020 |
| vshufi32x4 m11, m2, q3131 |
| paddw m0, m1, m8 |
| paddw m1, m9 |
| paddw m12, m0 |
| paddw m13, m1 |
| .w64_loop: |
| mova m4, m6 |
| vpbroadcastb m4{k1}, [tlq+hq-1] |
| vpbroadcastw m1, [v_weightsq] |
| add v_weightsq, 2 |
| pmaddubsw m2, m4, m7 |
| pmaddubsw m0, m8, m1 |
| pmaddubsw m1, m9, m1 |
| paddw m2, m4 |
| pmaddubsw m3, m4, m10 |
| pmaddubsw m4, m11 |
| paddw m0, m12 |
| paddw m1, m13 |
| paddw m3, m2 |
| paddw m4, m2 |
| pavgw m0, m3 |
| pavgw m1, m4 |
| vpermt2b m0, m14, m1 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec hd |
| jg .w64_loop |
| RET |
| |
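| ; pal_pred: the palette is passed as 16-bit entries, so packuswb narrows |
| ; it to bytes in both halves of each 16-byte lane, letting vpshufb map the |
| ; 3-bit palette indices straight to pixels. |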
| cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3 |
| lea r6, [pal_pred_8bpc_avx512icl_table] |
| tzcnt wd, wm |
| vbroadcasti32x4 m4, [palq] |
| movifnidn hd, hm |
| movsxd wq, [r6+wq*4] |
| packuswb m4, m4 |
| add wq, r6 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4: |
| pshufb xmm0, xm4, [idxq] |
| add idxq, 16 |
| movd [dstq+strideq*0], xmm0 |
| pextrd [dstq+strideq*1], xmm0, 1 |
| pextrd [dstq+strideq*2], xmm0, 2 |
| pextrd [dstq+stride3q ], xmm0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4 |
| RET |
| .w8: |
| pshufb xmm0, xm4, [idxq+16*0] |
| pshufb xmm1, xm4, [idxq+16*1] |
| add idxq, 16*2 |
| movq [dstq+strideq*0], xmm0 |
| movhps [dstq+strideq*1], xmm0 |
| movq [dstq+strideq*2], xmm1 |
| movhps [dstq+stride3q ], xmm1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w8 |
| RET |
| .w16: |
| pshufb m0, m4, [idxq] |
| add idxq, 64 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], ym0, 1 |
| vextracti32x4 [dstq+strideq*2], m0, 2 |
| vextracti32x4 [dstq+stride3q ], m0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w16 |
| RET |
| .w32: |
| pshufb m0, m4, [idxq+64*0] |
| pshufb m1, m4, [idxq+64*1] |
| add idxq, 64*2 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| mova [dstq+strideq*2], ym1 |
| vextracti32x8 [dstq+stride3q ], m1, 1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w32 |
| RET |
| .w64: |
| pshufb m0, m4, [idxq+64*0] |
| pshufb m1, m4, [idxq+64*1] |
| pshufb m2, m4, [idxq+64*2] |
| pshufb m3, m4, [idxq+64*3] |
| add idxq, 64*4 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| mova [dstq+strideq*2], m2 |
| mova [dstq+stride3q ], m3 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w64 |
| RET |
| |
| ; The ipred_filter code processes 4x2 blocks in the following order, which |
| ; increases parallelism compared to doing things row by row. The numbers |
| ; and letters indicate the step at which each block is computed. |
| ; Some redundant blocks are calculated for w > 4. |
| ; w4 w8 w16 w32 |
| ; 1 1 2 1 2 3 4 1 2 3 4 9 a b c |
| ; 2 2 3 2 3 4 5 2 3 4 5 a b c d |
| ; 3 3 4 3 4 5 6 3 4 5 6 b c d e |
| ; 4 4 5 4 5 6 7 4 5 6 7 c d e f |
| ; 5 5 6 5 6 7 8 5 6 7 8 d e f g |
| ; 6 6 7 6 7 8 9 6 7 8 9 e f g h |
| ; 7 7 8 7 8 9 a 7 8 9 a f g h i |
| ; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___ |
| ; 9 9 a b h i j |
| ; a b i j |
| ; b j |
| |
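| ; Each output pixel is clip_u8((sum of taps * neighbors + 8) >> 4): the |
| ; vpdpbusd accumulators start at pd_8 for rounding, and packssdw/psraw/ |
| ; packuswb perform the shift and clip. |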
| cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt |
| %define base r6-filter_taps |
| lea r6, [filter_taps] |
| %ifidn fltd, fltm |
| movzx fltd, fltb |
| %else |
| movzx fltd, byte fltm |
| %endif |
| vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0 |
| movifnidn hd, hm |
| shl fltd, 6 |
| vpbroadcastd m6, [base+pd_8] |
| vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __ |
| vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4 |
| vbroadcasti32x4 m8, [r6+fltq+16*1] |
| vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __ |
| vbroadcasti32x4 m10, [r6+fltq+16*3] |
| mova xmm0, xm6 |
| vpdpbusd xmm0, xmm2, xm7 |
| mova xmm1, xm6 |
| vpdpbusd xmm1, xmm2, xm8 |
| vpdpbusd xmm0, xmm3, xm9 |
| vpdpbusd xmm1, xmm3, xm10 |
| packssdw xmm0, xmm1 |
| cmp wd, 8 |
| jb .w4 |
| vpbroadcastd ym2, [tlq+5] |
| mova m11, [base+filter_perm] |
| mov r5, 0xffffffffffff000f |
| psrldq xmm2, 1 ; __ t0 |
| kmovq k1, r5 ; 0x000f |
| psraw xm5, xmm0, 4 |
| packuswb xmm2, xm5 ; __ t0 a0 b0 |
| pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1 |
| je .w8 |
| kxnorb k3, k3, k3 ; 0x00ff |
| vpbroadcastd xm3, [tlq-4] |
| kandnq k2, k3, k1 ; 0xffffffffffff0000 |
| vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __ |
| mova ym0, ym6 |
| vpdpbusd ym0, ym2, ym7 |
| mova ym1, ym6 |
| vpdpbusd ym1, ym2, ym8 |
| pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0 |
| vpbroadcastd m2, [tlq+9] |
| vpdpbusd ym0, ym3, ym9 |
| vpdpbusd ym1, ym3, ym10 |
| vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __ |
| kunpckbw k4, k1, k3 ; 0x0fff |
| packssdw ym0, ym1 |
| psraw ym0, 4 ; c0 d0 a1 b1 |
| packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1 |
| pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2 |
| vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __ |
| mova m4, m6 |
| vpdpbusd m4, m2, m7 |
| mova m1, m6 |
| vpdpbusd m1, m2, m8 |
| psrldq m0, m2, 1 ; __ d0 __ b1 __ t2 |
| vpbroadcastd m2, [tlq+13] |
| vpdpbusd m4, m3, m9 |
| vpdpbusd m1, m3, m10 |
| mova m12, [base+filter_end] |
| lea r5d, [hq-6] |
| mov r6, dstq |
| cmovp hd, r5d ; w == 16 ? h : h - 6 |
| packssdw m4, m1 |
| psraw m4, 4 ; e0 f0 c1 d1 a2 b2 |
| packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2 |
| pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3 |
| .w16_loop: |
| vpbroadcastd xm3, [tlq-8] |
| vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __ |
| mova m1, m6 |
| vpdpbusd m1, m2, m7 |
| mova m0, m6 |
| vpdpbusd m0, m2, m8 |
| sub tlq, 2 |
| vpdpbusd m1, m3, m9 |
| vpdpbusd m0, m3, m10 |
| packssdw m1, m0 |
| mova m0, m4 |
| psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3 |
| packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3 |
| pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 |
| vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3 |
| vextracti32x4 [dstq+strideq*0], m5, 2 |
| vextracti32x4 [dstq+strideq*1], m5, 3 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w16_loop |
| cmp wd, 16 |
| je .ret |
| mova xm13, [filter_perm+16] |
| mova xmm3, [r6+strideq*0] |
| punpckhdq xmm3, [r6+strideq*1] |
| vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 |
| pinsrb xm3, xmm3, [tlq+r5+16], 7 |
| pshufb xm3, xm13 |
| vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __ |
| mova m0, m6 |
| vpdpbusd m0, m2, m7 |
| mova m1, m6 |
| vpdpbusd m1, m2, m8 |
| kunpckbw k5, k3, k1 ; 0xff0f |
| lea r3, [strideq*3] |
| vpdpbusd m0, m3, m9 |
| vpdpbusd m1, m3, m10 |
| packssdw m0, m1 |
| psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3 |
| packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 |
| vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 |
| vpbroadcastd ym2, [tlq+r5+21] |
| pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3 |
| vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3 |
| vextracti32x4 [dstq+strideq*0], m5, 2 |
| vextracti32x4 [dstq+strideq*1], m5, 3 |
| punpckhqdq xmm3, [r6+r3] |
| pinsrb xmm3, [r6+strideq*2+15], 11 |
| pshufb xm3, xmm3, xm13 |
| vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __ |
| mova m4, m6 |
| vpdpbusd m4, m2, m7 |
| mova m1, m6 |
| vpdpbusd m1, m2, m8 |
| kxnord k3, k3, k4 ; 0xfffff0ff |
| lea r4, [strideq*5] |
| vpdpbusd m4, m3, m9 |
| vpdpbusd m1, m3, m10 |
| packssdw m4, m1 |
| psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3 |
| packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3 |
| vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3 |
| vpbroadcastd m2, [tlq+r5+25] |
| pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3 |
| vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3 |
| vextracti32x4 [dstq+strideq*2], m5, 2 |
| vextracti32x4 [dstq+r3 ], m5, 3 |
| punpckhqdq xmm3, [r6+r4] |
| pinsrb xmm3, [r6+strideq*4+15], 11 |
| pshufb xm3, xmm3, xm13 |
| vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb gb fb __ |
| mova m0, m6 |
| vpdpbusd m0, m2, m7 |
| mova m1, m6 |
| vpdpbusd m1, m2, m8 |
| kunpckwd k1, k1, k2 ; 0x000f0000 |
| vpdpbusd m0, m3, m9 |
| vpdpbusd m1, m3, m10 |
| packssdw m0, m1 |
| psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3 |
| packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3 |
| vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3 |
| vpbroadcastd m2, [tlq+r5+29] |
| pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7 |
| vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3 |
| vextracti32x4 [dstq+strideq*4], m5, 2 |
| vextracti32x4 [dstq+r4 ], m5, 3 |
| lea r0, [strideq+r3*2] |
| .w32_loop: |
| punpckhqdq xmm3, [r6+r0] |
| pinsrb xmm3, [r6+r3*2+15], 11 |
| pshufb xm3, xmm3, xm13 |
| vpermb m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __ |
| .w32_loop_tail: |
| mova m4, m6 |
| vpdpbusd m4, m2, m7 |
| mova m1, m6 |
| vpdpbusd m1, m2, m8 |
| vpdpbusd m4, m3, m9 |
| vpdpbusd m1, m3, m10 |
| packssdw m4, m1 |
| mova m1, m0 |
| psraw m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7 |
| packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7 |
| pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7 |
| vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7 |
| vextracti32x4 [r6+strideq*0+16], m5, 2 |
| vextracti32x4 [r6+strideq*1+16], m5, 3 |
| lea r6, [r6+strideq*2] |
| sub r5d, 2 |
| jg .w32_loop |
| vpermb m3, m11, m1 |
| cmp r5d, -6 |
| jg .w32_loop_tail |
| .ret: |
| RET |
| .w8: |
| vpermb ym3, ym11, ymm2 |
| .w8_loop: |
| vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __ |
| mova ym0, ym6 |
| vpdpbusd ym0, ym2, ym7 |
| mova ym1, ym6 |
| vpdpbusd ym1, ym2, ym8 |
| sub tlq, 2 |
| vpdpbusd ym0, ym3, ym9 |
| vpdpbusd ym1, ym3, ym10 |
| mova ym3, ym5 |
| packssdw ym0, ym1 |
| psraw ym5, ym0, 4 ; c0 d0 a1 b1 |
| packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1 |
| pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 |
| vpermb ym3, ym11, ym3 ; a0 a1 b0 b1 |
| movq [dstq+strideq*0], xm3 |
| movhps [dstq+strideq*1], xm3 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| .w4_loop: |
| vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __ |
| mova xmm0, xm6 |
| vpdpbusd xmm0, xmm2, xm7 |
| mova xmm1, xm6 |
| vpdpbusd xmm1, xmm2, xm8 |
| sub tlq, 2 |
| vpdpbusd xmm0, xmm3, xm9 |
| vpdpbusd xmm1, xmm3, xm10 |
| packssdw xmm0, xmm1 |
| .w4: |
| psraw xmm0, 4 ; a0 b0 |
| packuswb xmm0, xmm0 |
| movd [dstq+strideq*0], xmm0 |
| pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0 |
| movd [dstq+strideq*1], xmm2 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w4_loop |
| RET |
| |
| %endif ; ARCH_X86_64 |