| ; Copyright © 2021-2022, VideoLAN and dav1d authors |
| ; Copyright © 2021-2022, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| %include "x86/filmgrain_common.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 16 |
| pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 |
| gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 |
| gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 |
| next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 |
| pw_27_17_17_27: dw 27, 17, 17, 27 |
| pw_23_22: dw 23, 22, 0, 32 |
| pw_seed_xor: times 2 dw 0xb524 |
| times 2 dw 0x49d8 |
| gen_ar0_shift: times 4 db 128 |
| times 4 db 64 |
| times 4 db 32 |
| times 4 db 16 |
| pd_16: dd 16 |
| pd_m65536: dd -65536 |
| pb_1: times 4 db 1 |
| grain_max: times 2 dw 511 |
| times 2 dw 2047 |
| grain_min: times 2 dw -512 |
| times 2 dw -2048 |
| fg_max: times 2 dw 1023 |
| times 2 dw 4095 |
| times 2 dw 960 |
| times 2 dw 3840 |
| times 2 dw 940 |
| times 2 dw 3760 |
| fg_min: times 2 dw 0 |
| times 2 dw 64 |
| times 2 dw 256 |
| uv_offset_mul: dd 256 |
| dd 1024 |
| hmul_bits: dw 32768, 16384, 8192, 4096 |
| round: dw 2048, 1024, 512 |
| mul_bits: dw 256, 128, 64, 32, 16, 8 |
| round_vals: dw 32, 64, 128, 256, 512, 1024 |
| pb_8_9_0_1: db 8, 9, 0, 1 |
| |
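| ; Builds a table of 32-bit offsets, relative to the table base, to the |
| ; .ar0-.ar3 entry points of the named function; the trailing arguments |
| ; give the label suffixes. The generate_grain functions index this table |
| ; with FGData.ar_coeff_lag to dispatch to the matching AR filter. |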
| %macro JMP_TABLE 1-* |
| %xdefine %1_table %%table |
| %xdefine %%base %1_table |
| %xdefine %%prefix mangle(private_prefix %+ _%1) |
| %%table: |
| %rep %0 - 1 |
| dd %%prefix %+ .ar%2 - %%base |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 |
| JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 |
| JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 |
| JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 |
| |
| SECTION .text |
| |
| %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) |
| |
| INIT_YMM avx2 |
| cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax |
| %define base r4-generate_grain_y_16bpc_avx2_table |
| lea r4, [generate_grain_y_16bpc_avx2_table] |
| vpbroadcastw xm0, [fg_dataq+FGData.seed] |
| mov r6d, [fg_dataq+FGData.grain_scale_shift] |
| movq xm1, [base+next_upperbit_mask] |
| mov r3, -73*82*2 |
| movsxd r5, [fg_dataq+FGData.ar_coeff_lag] |
| lea r7d, [bdmaxq+1] |
| movq xm4, [base+mul_bits] |
| shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc |
| movq xm5, [base+hmul_bits] |
| sub r6, r7 |
| mova xm6, [base+pb_mask] |
| sub bufq, r3 |
| vpbroadcastw xm7, [base+round+r6*2-2] |
| lea r6, [gaussian_sequence] |
| movsxd r5, [r4+r5*4] |
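| ; white-noise pass: each iteration advances the 16-bit LFSR seed twice, |
| ; computing four successive states per update in parallel (via the |
| ; next_upperbit_mask/pb_mask tables), and uses the top 11 bits of each of |
| ; the 8 resulting states to index gaussian_sequence. grain_scale_shift is |
| ; applied through the doubling + pmulhrsw rounding at the end of the loop. |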
| .loop: |
| pand xm2, xm0, xm1 |
| psrlw xm3, xm2, 10 |
| por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set |
| pmullw xm2, xm4 ; bits 0x0f00 are set |
| pmulhuw xm0, xm5 |
| pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds |
| psllq xm2, xm3, 30 |
| por xm2, xm3 |
| psllq xm3, xm2, 15 |
| por xm2, xm0 ; aggregate each bit into next seed's high bit |
| por xm3, xm2 ; 4 next output seeds |
| pshuflw xm0, xm3, q3333 |
| psrlw xm3, 5 |
| pand xm2, xm0, xm1 |
| movq r7, xm3 |
| psrlw xm3, xm2, 10 |
| por xm2, xm3 |
| pmullw xm2, xm4 |
| pmulhuw xm0, xm5 |
| movzx r8d, r7w |
| pshufb xm3, xm6, xm2 |
| psllq xm2, xm3, 30 |
| por xm2, xm3 |
| psllq xm3, xm2, 15 |
| por xm0, xm2 |
| movd xm2, [r6+r8*2] |
| rorx r8, r7, 32 |
| por xm3, xm0 |
| shr r7d, 16 |
| pinsrw xm2, [r6+r7*2], 1 |
| pshuflw xm0, xm3, q3333 |
| movzx r7d, r8w |
| psrlw xm3, 5 |
| pinsrw xm2, [r6+r7*2], 2 |
| shr r8d, 16 |
| movq r7, xm3 |
| pinsrw xm2, [r6+r8*2], 3 |
| movzx r8d, r7w |
| pinsrw xm2, [r6+r8*2], 4 |
| rorx r8, r7, 32 |
| shr r7d, 16 |
| pinsrw xm2, [r6+r7*2], 5 |
| movzx r7d, r8w |
| pinsrw xm2, [r6+r7*2], 6 |
| shr r8d, 16 |
| pinsrw xm2, [r6+r8*2], 7 |
| paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 |
| pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support |
| mova [bufq+r3], xm2 |
| add r3, 8*2 |
| jl .loop |
| |
| ; auto-regression code |
| add r5, r4 |
| jmp r5 |
| |
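| ; AR(1) filter; reference model (AV1 grain synthesis): |
| ; sum = cf[0]*buf[y-1][x-1] + cf[1]*buf[y-1][x] |
| ; + cf[2]*buf[y-1][x+1] + cf[3]*buf[y][x-1] |
| ; buf[y][x] = clip(buf[y][x] + round2(sum, ar_coeff_shift), min, max) |
| ; the three top-row taps (plus the rounding constant) are evaluated four |
| ; pixels at a time with pmaddwd; the left tap is applied in the scalar |
| ; inner loop since each output feeds the next pixel's left neighbor. |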
| .ar1: |
| DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] |
| movd xm4, [fg_dataq+FGData.ar_coeffs_y] |
| DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 |
| pinsrb xm4, [base+pb_1], 3 |
| pmovsxbw xm4, xm4 |
| pshufd xm5, xm4, q1111 |
| pshufd xm4, xm4, q0000 |
| vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd |
| sub bufq, 2*(82*73-(82*3+79)) |
| mov hd, 70 |
| sar maxd, 1 |
| mov mind, maxd |
| xor mind, -1 |
| .y_loop_ar1: |
| mov xq, -76 |
| movsx val3d, word [bufq+xq*2-2] |
| .x_loop_ar1: |
| movu xm0, [bufq+xq*2-82*2-2] ; top/left |
| psrldq xm2, xm0, 2 ; top |
| psrldq xm1, xm0, 4 ; top/right |
| punpcklwd xm0, xm2 |
| punpcklwd xm1, xm3 |
| pmaddwd xm0, xm4 |
| pmaddwd xm1, xm5 |
| paddd xm0, xm1 |
| .x_loop_ar1_inner: |
| movd val0d, xm0 |
| psrldq xm0, 4 |
| imul val3d, cf3d |
| add val3d, val0d |
| sarx val3d, val3d, shiftd |
| movsx val0d, word [bufq+xq*2] |
| add val3d, val0d |
| cmp val3d, maxd |
| cmovg val3d, maxd |
| cmp val3d, mind |
| cmovl val3d, mind |
| mov word [bufq+xq*2], val3w |
| ; keep val3d in-place as left for next x iteration |
| inc xq |
| jz .x_loop_ar1_end |
| test xb, 3 |
| jnz .x_loop_ar1_inner |
| jmp .x_loop_ar1 |
| .x_loop_ar1_end: |
| add bufq, 82*2 |
| dec hd |
| jg .y_loop_ar1 |
| .ar0: |
| RET |
| |
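| ; AR(2) filter: 12 coefficients covering rows y-2 and y-1 at x-2..x+2 plus |
| ; the two left neighbors on the current row. The two-row taps are |
| ; vectorized; the serial inner loop applies cf[10]/cf[11] to the left |
| ; neighbors, adds the noise value after the shift and clamps to the grain |
| ; range. |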
| .ar2: |
| DEFINE_ARGS buf, fg_data, bdmax, shift |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 |
| vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 |
| vpbroadcastw xm10, [base+round_vals-12+shiftq*2] |
| pxor m1, m1 |
| punpcklwd xm10, xm1 |
| pcmpgtb m1, m0 |
| punpcklbw m0, m1 ; cf5-11,0-4 |
| vpermq m1, m0, q3333 ; cf4 |
| vbroadcasti128 m11, [base+gen_shufA] |
| pshufd m6, m0, q0000 ; cf[5,6], cf[0-1] |
| vbroadcasti128 m12, [base+gen_shufB] |
| pshufd m7, m0, q1111 ; cf[7,8], cf[2-3] |
| punpckhwd xm1, xm0 |
| pshufhw xm9, xm0, q2121 |
| pshufd xm8, xm1, q0000 ; cf[4,9] |
| sar bdmaxd, 1 |
| punpckhqdq xm9, xm9 ; cf[10,11] |
| movd xm4, bdmaxd ; max_grain |
| pcmpeqd xm5, xm5 |
| sub bufq, 2*(82*73-(82*3+79)) |
| pxor xm5, xm4 ; min_grain |
| DEFINE_ARGS buf, fg_data, h, x |
| mov hd, 70 |
| .y_loop_ar2: |
| mov xq, -76 |
| .x_loop_ar2: |
| vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] |
| vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5] |
| pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] |
| pmaddwd m0, m6 |
| punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5] |
| pshufb m1, m12 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] |
| pmaddwd m1, m7 |
| pmaddwd xm2, xm8 |
| paddd m0, m1 |
| vextracti128 xm1, m0, 1 |
| paddd xm0, xm10 |
| paddd xm2, xm0 |
| movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] |
| paddd xm2, xm1 |
| pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3] |
| .x_loop_ar2_inner: |
| pmaddwd xm3, xm9, xm0 |
| psrldq xm0, 2 |
| paddd xm3, xm2 |
| psrldq xm2, 4 ; shift top to next pixel |
| psrad xm3, [fg_dataq+FGData.ar_coeff_shift] |
| ; skip packssdw because we only care about one value |
| paddd xm3, xm1 |
| pminsd xm3, xm4 |
| psrldq xm1, 4 |
| pmaxsd xm3, xm5 |
| pextrw [bufq+xq*2], xm3, 0 |
| punpcklwd xm3, xm3 |
| pblendw xm0, xm3, 0010b |
| inc xq |
| jz .x_loop_ar2_end |
| test xb, 3 |
| jnz .x_loop_ar2_inner |
| jmp .x_loop_ar2 |
| .x_loop_ar2_end: |
| add bufq, 82*2 |
| dec hd |
| jg .y_loop_ar2 |
| RET |
| |
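| ; AR(3) filter: 24 coefficients covering rows y-3..y-1 at x-3..x+3 plus |
| ; the three left neighbors on the current row. Row taps are vectorized as |
| ; in .ar2; the serial inner loop applies cf[21..23] to the left neighbors |
| ; and folds in the current noise value pre-scaled by 1<<ar_coeff_shift, so |
| ; a single arithmetic shift finishes the filter. |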
| .ar3: |
| DEFINE_ARGS buf, fg_data, bdmax, shift |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| sar bdmaxd, 1 |
| movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 |
| movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 |
| pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 |
| pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1 |
| movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 |
| vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 |
| vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 |
| vpbroadcastw xm11, [base+round_vals+shiftq*2-12] |
| movd xm12, bdmaxd ; max_grain |
| punpcklbw m7, m7 ; sign-extension |
| punpcklbw m0, m0 ; sign-extension |
| punpcklbw xm1, xm1 |
| REPX {psraw x, 8}, m7, m0, xm1 |
| pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8] |
| pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10] |
| pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12] |
| pshufd xm7, xm7, q3333 ; cf[6,13] |
| pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18] |
| pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20] |
| paddw xm0, xm11, xm11 |
| pcmpeqd xm13, xm13 |
| pblendw xm10, xm1, xm0, 00001000b |
| pxor xm13, xm12 ; min_grain |
| DEFINE_ARGS buf, fg_data, h, x |
| sub bufq, 2*(82*73-(82*3+79)) |
| mov hd, 70 |
| .y_loop_ar3: |
| mov xq, -76 |
| .x_loop_ar3: |
| movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] |
| vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] |
| movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] |
| vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] |
| palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5] |
| palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] |
| punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] |
| punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] |
| shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] |
| pmaddwd m0, m4 |
| pmaddwd m2, m6 |
| pmaddwd m3, m5 |
| paddd m0, m2 |
| movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] |
| vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] |
| paddd m0, m3 |
| psrldq m3, m2, 2 |
| punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] |
| pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] |
| paddd m0, m3 |
| psrldq m3, m2, 4 |
| psrldq m2, 6 |
| vpblendd m2, m11, 0x0f ; rounding constant |
| punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] |
| pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5/+6] |
| vextracti128 xm2, m1, 1 |
| punpcklwd xm1, xm2 |
| pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] |
| paddd m0, m3 |
| vextracti128 xm2, m0, 1 |
| paddd xm0, xm1 |
| movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] |
| paddd xm0, xm2 |
| .x_loop_ar3_inner: |
| pmaddwd xm2, xm1, xm10 |
| pshuflw xm3, xm2, q1032 |
| paddd xm2, xm0 ; add top |
| paddd xm2, xm3 ; left+cur |
| psrldq xm0, 4 |
| psrad xm2, [fg_dataq+FGData.ar_coeff_shift] |
| ; skip packssdw because we only care about one value |
| pminsd xm2, xm12 |
| pmaxsd xm2, xm13 |
| pextrw [bufq+xq*2], xm2, 0 |
| pslldq xm2, 4 |
| psrldq xm1, 2 |
| pblendw xm1, xm2, 0100b |
| inc xq |
| jz .x_loop_ar3_end |
| test xb, 3 |
| jnz .x_loop_ar3_inner |
| jmp .x_loop_ar3 |
| .x_loop_ar3_end: |
| add bufq, 82*2 |
| dec hd |
| jg .y_loop_ar3 |
| RET |
| |
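| ; Chroma grain generation. %2/%3 select horizontal/vertical subsampling, |
| ; giving a 44x38 (420), 44x73 (422) or 82x73 (444) grain template. The |
| ; white-noise pass matches the luma one except that the seed is xored with |
| ; a per-plane constant (pw_seed_xor); the AR filters additionally take the |
| ; co-located luma grain (averaged over the 2 or 4 covered luma samples |
| ; when subsampled), weighted by the extra luma coefficient of the chroma |
| ; coefficient set. |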
| %macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y |
| INIT_XMM avx2 |
| cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax |
| %define base r8-generate_grain_uv_%1_16bpc_avx2_table |
| lea r8, [generate_grain_uv_%1_16bpc_avx2_table] |
| movifnidn bdmaxd, bdmaxm |
| vpbroadcastw xm0, [fg_dataq+FGData.seed] |
| mov r5d, [fg_dataq+FGData.grain_scale_shift] |
| movq xm1, [base+next_upperbit_mask] |
| lea r6d, [bdmaxq+1] |
| movq xm4, [base+mul_bits] |
| shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc |
| movq xm5, [base+hmul_bits] |
| sub r5, r6 |
| mova xm6, [base+pb_mask] |
| vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] |
| vpbroadcastw xm7, [base+round+r5*2-2] |
| pxor xm0, xm2 |
| lea r6, [gaussian_sequence] |
| %if %2 |
| mov r7d, 73-35*%3 |
| add bufq, 44*2 |
| .loop_y: |
| mov r5, -44*2 |
| %else |
| mov r5, -82*73*2 |
| sub bufq, r5 |
| %endif |
| .loop_x: |
| pand xm2, xm0, xm1 |
| psrlw xm3, xm2, 10 |
| por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set |
| pmullw xm2, xm4 ; bits 0x0f00 are set |
| pmulhuw xm0, xm5 |
| pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds |
| psllq xm2, xm3, 30 |
| por xm2, xm3 |
| psllq xm3, xm2, 15 |
| por xm2, xm0 ; aggregate each bit into next seed's high bit |
| por xm2, xm3 ; 4 next output seeds |
| pshuflw xm0, xm2, q3333 |
| psrlw xm2, 5 |
| movq r10, xm2 |
| movzx r9d, r10w |
| movd xm2, [r6+r9*2] |
| rorx r9, r10, 32 |
| shr r10d, 16 |
| pinsrw xm2, [r6+r10*2], 1 |
| movzx r10d, r9w |
| pinsrw xm2, [r6+r10*2], 2 |
| shr r9d, 16 |
| pinsrw xm2, [r6+r9*2], 3 |
| paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 |
| pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support |
| movq [bufq+r5], xm2 |
| add r5, 8 |
| jl .loop_x |
| %if %2 |
| add bufq, 82*2 |
| dec r7d |
| jg .loop_y |
| %endif |
| |
| ; auto-regression code |
| movsxd r6, [fg_dataq+FGData.ar_coeff_lag] |
| movsxd r6, [r8+r6*4] |
| add r6, r8 |
| jmp r6 |
| |
| INIT_YMM avx2 |
| .ar0: |
| DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift |
| imul uvd, 28 |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq] |
| sar bdmaxd, 1 |
| vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4] |
| movd xm6, bdmaxd |
| pcmpeqw m7, m7 |
| pmaddubsw m4, m0 ; ar_coeff << (14 - shift) |
| vpbroadcastw m6, xm6 ; max_grain |
| pxor m7, m6 ; min_grain |
| DEFINE_ARGS buf, bufy, h, x |
| %if %2 |
| vpbroadcastw m5, [base+hmul_bits+2+%3*2] |
| sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) |
| %else |
| sub bufq, 2*(82*70-3) |
| %endif |
| add bufyq, 2*(3+82*3) |
| mov hd, 70-35*%3 |
| .y_loop_ar0: |
| %if %2 |
| ; first 32 pixels |
| movu xm0, [bufyq+16*0] |
| vinserti128 m0, [bufyq+16*2], 1 |
| movu xm1, [bufyq+16*1] |
| vinserti128 m1, [bufyq+16*3], 1 |
| %if %3 |
| movu xm2, [bufyq+82*2+16*0] |
| vinserti128 m2, [bufyq+82*2+16*2], 1 |
| movu xm3, [bufyq+82*2+16*1] |
| vinserti128 m3, [bufyq+82*2+16*3], 1 |
| paddw m0, m2 |
| paddw m1, m3 |
| %endif |
| phaddw m0, m1 |
| movu xm1, [bufyq+16*4] |
| vinserti128 m1, [bufyq+16*6], 1 |
| movu xm2, [bufyq+16*5] |
| vinserti128 m2, [bufyq+16*7], 1 |
| %if %3 |
| movu xm3, [bufyq+82*2+16*4] |
| vinserti128 m3, [bufyq+82*2+16*6], 1 |
| paddw m1, m3 |
| movu xm3, [bufyq+82*2+16*5] |
| vinserti128 m3, [bufyq+82*2+16*7], 1 |
| paddw m2, m3 |
| %endif |
| phaddw m1, m2 |
| pmulhrsw m0, m5 |
| pmulhrsw m1, m5 |
| %else |
| xor xd, xd |
| .x_loop_ar0: |
| movu m0, [bufyq+xq*2] |
| movu m1, [bufyq+xq*2+32] |
| %endif |
| paddw m0, m0 |
| paddw m1, m1 |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| %if %2 |
| paddw m0, [bufq+ 0] |
| paddw m1, [bufq+32] |
| %else |
| paddw m0, [bufq+xq*2+ 0] |
| paddw m1, [bufq+xq*2+32] |
| %endif |
| pminsw m0, m6 |
| pminsw m1, m6 |
| pmaxsw m0, m7 |
| pmaxsw m1, m7 |
| %if %2 |
| movu [bufq+ 0], m0 |
| movu [bufq+32], m1 |
| |
| ; last 6 pixels |
| movu xm0, [bufyq+32*4] |
| movu xm1, [bufyq+32*4+16] |
| %if %3 |
| paddw xm0, [bufyq+32*4+82*2] |
| paddw xm1, [bufyq+32*4+82*2+16] |
| %endif |
| phaddw xm0, xm1 |
| movu xm1, [bufq+32*2] |
| pmulhrsw xm0, xm5 |
| paddw xm0, xm0 |
| pmulhrsw xm0, xm4 |
| paddw xm0, xm1 |
| pminsw xm0, xm6 |
| pmaxsw xm0, xm7 |
| vpblendd xm0, xm1, 0x08 |
| movu [bufq+32*2], xm0 |
| %else |
| movu [bufq+xq*2+ 0], m0 |
| movu [bufq+xq*2+32], m1 |
| add xd, 32 |
| cmp xd, 64 |
| jl .x_loop_ar0 |
| |
| ; last 12 pixels |
| movu m0, [bufyq+64*2] |
| movu m1, [bufq+64*2] |
| paddw m0, m0 |
| pmulhrsw m0, m4 |
| paddw m0, m1 |
| pminsw m0, m6 |
| pmaxsw m0, m7 |
| vpblendd m0, m1, 0xc0 |
| movu [bufq+64*2], m0 |
| %endif |
| add bufq, 82*2 |
| add bufyq, 82*2<<%3 |
| dec hd |
| jg .y_loop_ar0 |
| RET |
| |
| INIT_XMM avx2 |
| .ar1: |
| DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift |
| imul uvd, 28 |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] |
| movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] |
| pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 |
| DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift |
| pmovsxbw xm4, xm4 |
| pshufd xm5, xm4, q1111 |
| pshufd xm4, xm4, q0000 |
| pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd |
| vpbroadcastw xm6, [base+hmul_bits+2+%3*2] |
| vpbroadcastd xm3, xm3 |
| %if %2 |
| sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) |
| %else |
| sub bufq, 2*(82*69+3) |
| %endif |
| add bufyq, 2*(79+82*3) |
| mov hd, 70-35*%3 |
| sar maxd, 1 |
| mov mind, maxd |
| xor mind, -1 |
| .y_loop_ar1: |
| mov xq, -(76>>%2) |
| movsx val3d, word [bufq+xq*2-2] |
| .x_loop_ar1: |
| movu xm0, [bufq+xq*2-82*2-2] ; top/left |
| %if %2 |
| movu xm2, [bufyq+xq*4] |
| %else |
| movq xm2, [bufyq+xq*2] |
| %endif |
| %if %2 |
| %if %3 |
| phaddw xm2, [bufyq+xq*4+82*2] |
| punpckhqdq xm1, xm2, xm2 |
| paddw xm2, xm1 |
| %else |
| phaddw xm2, xm2 |
| %endif |
| pmulhrsw xm2, xm6 |
| %endif |
| psrldq xm1, xm0, 4 ; top/right |
| punpcklwd xm1, xm2 |
| psrldq xm2, xm0, 2 ; top |
| punpcklwd xm0, xm2 |
| pmaddwd xm1, xm5 |
| pmaddwd xm0, xm4 |
| paddd xm1, xm3 |
| paddd xm0, xm1 |
| .x_loop_ar1_inner: |
| movd val0d, xm0 |
| psrldq xm0, 4 |
| imul val3d, cf3d |
| add val3d, val0d |
| sarx val3d, val3d, shiftd |
| movsx val0d, word [bufq+xq*2] |
| add val3d, val0d |
| cmp val3d, maxd |
| cmovg val3d, maxd |
| cmp val3d, mind |
| cmovl val3d, mind |
| mov word [bufq+xq*2], val3w |
| ; keep val3d in-place as left for next x iteration |
| inc xq |
| jz .x_loop_ar1_end |
| test xb, 3 |
| jnz .x_loop_ar1_inner |
| jmp .x_loop_ar1 |
| .x_loop_ar1_end: |
| add bufq, 82*2 |
| add bufyq, 82*2<<%3 |
| dec hd |
| jg .y_loop_ar1 |
| RET |
| |
| INIT_YMM avx2 |
| .ar2: |
| %if WIN64 |
| ; xmm6 and xmm7 already saved |
| %assign xmm_regs_used 13 + %2 |
| %assign stack_size_padded 136 |
| SUB rsp, stack_size_padded |
| movaps [rsp+16*2], xmm8 |
| movaps [rsp+16*3], xmm9 |
| movaps [rsp+16*4], xmm10 |
| movaps [rsp+16*5], xmm11 |
| movaps [rsp+16*6], xmm12 |
| %if %2 |
| movaps [rsp+16*7], xmm13 |
| %endif |
| %endif |
| DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| imul uvd, 28 |
| vbroadcasti128 m10, [base+gen_shufA] |
| sar bdmaxd, 1 |
| vbroadcasti128 m11, [base+gen_shufB] |
| movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5] |
| pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 |
| pinsrb xm7, [base+pb_1], 5 |
| pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 |
| movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] |
| pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13 |
| pmovsxbw m7, xm7 |
| movd xm8, bdmaxd ; max_grain |
| pshufd m4, m7, q0000 |
| vpbroadcastw xm12, [base+round_vals-12+shiftq*2] |
| pshufd m5, m7, q1111 |
| pcmpeqd xm9, xm9 |
| pshufd m6, m7, q2222 |
| pxor xm9, xm8 ; min_grain |
| pshufd xm7, xm7, q3333 |
| DEFINE_ARGS buf, bufy, fg_data, h, x |
| %if %2 |
| vpbroadcastw xm13, [base+hmul_bits+2+%3*2] |
| sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) |
| %else |
| sub bufq, 2*(82*69+3) |
| %endif |
| add bufyq, 2*(79+82*3) |
| mov hd, 70-35*%3 |
| .y_loop_ar2: |
| mov xq, -(76>>%2) |
| .x_loop_ar2: |
| vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] |
| vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] |
| pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] |
| pmaddwd m0, m4 |
| pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] |
| pmaddwd m1, m5 |
| punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5] |
| %if %2 |
| movu xm3, [bufyq+xq*4] |
| %if %3 |
| paddw xm3, [bufyq+xq*4+82*2] |
| %endif |
| phaddw xm3, xm3 |
| pmulhrsw xm3, xm13 |
| %else |
| movq xm3, [bufyq+xq*2] |
| %endif |
| punpcklwd xm3, xm12 ; luma, round interleaved |
| vpblendd m2, m3, 0x0f |
| pmaddwd m2, m6 |
| paddd m1, m0 |
| movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] |
| paddd m2, m1 |
| vextracti128 xm1, m2, 1 |
| paddd xm2, xm1 |
| pshufd xm1, xm0, q3321 |
| pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword |
| .x_loop_ar2_inner: |
| pmaddwd xm3, xm7, xm0 |
| paddd xm3, xm2 |
| psrldq xm2, 4 ; shift top to next pixel |
| psrad xm3, [fg_dataq+FGData.ar_coeff_shift] |
| ; we do not need to packssdw since we only care about one value |
| paddd xm3, xm1 |
| psrldq xm1, 4 |
| pminsd xm3, xm8 |
| pmaxsd xm3, xm9 |
| pextrw [bufq+xq*2], xm3, 0 |
| psrldq xm0, 2 |
| pslldq xm3, 2 |
| pblendw xm0, xm3, 00000010b |
| inc xq |
| jz .x_loop_ar2_end |
| test xb, 3 |
| jnz .x_loop_ar2_inner |
| jmp .x_loop_ar2 |
| .x_loop_ar2_end: |
| add bufq, 82*2 |
| add bufyq, 82*2<<%3 |
| dec hd |
| jg .y_loop_ar2 |
| RET |
| |
| .ar3: |
| %if WIN64 |
| ; xmm6 and xmm7 already saved |
| %assign stack_offset 32 |
| %assign xmm_regs_used 14 + %2 |
| %assign stack_size_padded 152 |
| SUB rsp, stack_size_padded |
| movaps [rsp+16*2], xmm8 |
| movaps [rsp+16*3], xmm9 |
| movaps [rsp+16*4], xmm10 |
| movaps [rsp+16*5], xmm11 |
| movaps [rsp+16*6], xmm12 |
| movaps [rsp+16*7], xmm13 |
| %if %2 |
| movaps [rsp+16*8], xmm14 |
| %endif |
| %endif |
| DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| imul uvd, 28 |
| vpbroadcastw xm11, [base+round_vals-12+shiftq*2] |
| sar bdmaxd, 1 |
| movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] |
| pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma |
| movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] |
| pmovsxbw m7, xm7 |
| %if %2 |
| vpbroadcastw xm14, [base+hmul_bits+2+%3*2] |
| %endif |
| pshufd m4, m7, q0000 |
| pshufd m5, m7, q1111 |
| pshufd m6, m7, q2222 |
| pshufd m7, m7, q3333 |
| movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] |
| pinsrb xm0, [base+pb_1], 3 |
| pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 |
| pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 |
| pmovsxbw m0, xm0 |
| movd xm12, bdmaxd ; max_grain |
| pshufd m8, m0, q0000 |
| pshufd m9, m0, q1111 |
| pcmpeqd xm13, xm13 |
| punpckhqdq xm10, xm0, xm0 |
| pxor xm13, xm12 ; min_grain |
| pinsrw xm10, [base+round_vals-10+shiftq*2], 3 |
| DEFINE_ARGS buf, bufy, fg_data, h, unused, x |
| %if %2 |
| sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) |
| %else |
| sub bufq, 2*(82*69+3) |
| %endif |
| add bufyq, 2*(79+82*3) |
| mov hd, 70-35*%3 |
| .y_loop_ar3: |
| mov xq, -(76>>%2) |
| .x_loop_ar3: |
| movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] |
| vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] |
| movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] |
| vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] |
| palignr m3, m1, m2, 2 ; y=-3/-2,x=[-2,+5] |
| palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6] |
| punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] |
| punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] |
| shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] |
| pmaddwd m0, m4 |
| pmaddwd m2, m6 |
| pmaddwd m3, m5 |
| paddd m0, m2 |
| paddd m0, m3 |
| movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] |
| vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] |
| %if %2 |
| movu xm3, [bufyq+xq*4] |
| %if %3 |
| paddw xm3, [bufyq+xq*4+82*2] |
| %endif |
| phaddw xm3, xm3 |
| pmulhrsw xm3, xm14 |
| %else |
| movq xm3, [bufyq+xq*2] |
| %endif |
| punpcklwd m1, m3 |
| pmaddwd m1, m7 |
| paddd m0, m1 |
| psrldq m1, m2, 4 |
| psrldq m3, m2, 6 |
| vpblendd m3, m11, 0x0f ; rounding constant |
| punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] |
| pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5/+6] |
| psrldq m3, m2, 2 |
| punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] |
| pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] |
| paddd m0, m1 |
| movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] |
| paddd m0, m2 |
| vextracti128 xm2, m0, 1 |
| paddd xm0, xm2 |
| .x_loop_ar3_inner: |
| pmaddwd xm2, xm1, xm10 |
| pshuflw xm3, xm2, q1032 |
| paddd xm2, xm0 ; add top |
| paddd xm2, xm3 ; left+cur |
| psrldq xm0, 4 |
| psrad xm2, [fg_dataq+FGData.ar_coeff_shift] |
| psrldq xm1, 2 |
| ; no need to packssdw since we only care about one value |
| pminsd xm2, xm12 |
| pmaxsd xm2, xm13 |
| pextrw [bufq+xq*2], xm2, 0 |
| pslldq xm2, 4 |
| pblendw xm1, xm2, 00000100b |
| inc xq |
| jz .x_loop_ar3_end |
| test xb, 3 |
| jnz .x_loop_ar3_inner |
| jmp .x_loop_ar3 |
| .x_loop_ar3_end: |
| add bufq, 82*2 |
| add bufyq, 82*2<<%3 |
| dec hd |
| jg .y_loop_ar3 |
| RET |
| %endmacro |
| |
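| ; Luma grain application. Each 32-pixel-wide block draws a pseudo-random |
| ; (offx, offy) into the 82x73 grain LUT from the block seed, then every |
| ; pixel gets noise = round2(scaling[src] * grain, scaling_shift) added, |
| ; clamped to [fg_min, fg_max]. With overlap_flag set, the two grain |
| ; samples at a block edge are first blended with the neighboring block's |
| ; grain using the 27/17 weights. |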
| cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ |
| grain_lut, unused, sby, see |
| %define base r11-grain_min |
| lea r11, [grain_min] |
| mov r6d, r9m ; bdmax |
| mov r9d, [fg_dataq+FGData.clip_to_restricted_range] |
| mov r7d, [fg_dataq+FGData.scaling_shift] |
| mov sbyd, sbym |
| vpbroadcastd m8, r9m |
| shr r6d, 11 ; is_12bpc |
| vpbroadcastd m9, [base+grain_min+r6*4] |
| shlx r10d, r9d, r6d |
| vpbroadcastd m10, [base+grain_max+r6*4] |
| lea r9d, [r6+r9*4] |
| vpbroadcastw m11, [base+mul_bits+r7*2-12] |
| vpbroadcastd m12, [base+fg_min+r10*4] |
| vpbroadcastd m13, [base+fg_max+r9*4] |
| test sbyd, sbyd |
| setnz r7b |
| vpbroadcastd m14, [base+pd_16] |
| test r7b, [fg_dataq+FGData.overlap_flag] |
| jnz .vertical_overlap |
| |
| imul seed, sbyd, (173 << 24) | 37 |
| add seed, (105 << 24) | 178 |
| rorx seed, seed, 24 |
| movzx seed, seew |
| xor seed, [fg_dataq+FGData.seed] |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, src_bak |
| |
| lea src_bakq, [srcq+wq*2] |
| neg wq |
| sub dstq, srcq |
| |
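| ; each 32-wide block draws a new grain offset: the or/test/cmovp sequence |
| ; below advances the 16-bit LFSR seed one step (the parity flag yields the |
| ; xor of the tap bits), and offx/offy come from bits 12-15 and 8-11 of the |
| ; updated seed. |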
| .loop_x: |
| rorx r6, seeq, 1 |
| or seed, 0xEFF4 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| rorx offyd, seed, 8 |
| rorx offxq, seeq, 12 |
| and offyd, 0xf |
| imul offyd, 164 |
| lea offyd, [offyq+offxq*2+747] ; offy*stride+offx |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, src_bak |
| |
| mov grain_lutq, grain_lutmp |
| mov hd, hm |
| .loop_y: |
| ; scaling[src] |
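| ; the scaling LUT holds one byte per source value; two dword gathers per |
| ; half (at byte offsets -0 and -2) place the entries for even and odd |
| ; pixels in alternating word lanes so a single pblendw merges them. |
| ; vpgatherdd zeroes its mask register, so m9 (all dword sign bits set, |
| ; since grain_min is negative) doubles as the gather mask and is |
| ; saved/restored through m6 around each gather. |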
| mova m0, [srcq+ 0] |
| mova m1, [srcq+32] |
| pand m4, m8, m0 |
| psrld m3, m0, 16 |
| mova m6, m9 |
| vpgatherdd m2, [scalingq+m4-0], m9 |
| pand m3, m8 |
| mova m9, m6 |
| vpgatherdd m4, [scalingq+m3-2], m6 |
| pand m5, m8, m1 |
| mova m6, m9 |
| vpgatherdd m3, [scalingq+m5-0], m9 |
| pblendw m4, m2, 0x55 |
| psrld m2, m1, 16 |
| mova m9, m6 |
| pand m2, m8 |
| vpgatherdd m5, [scalingq+m2-2], m6 |
| pblendw m5, m3, 0x55 |
| |
| ; noise = round2(scaling[src] * grain, scaling_shift) |
| pmaddubsw m4, m11 |
| pmaddubsw m5, m11 |
| paddw m4, m4 |
| paddw m5, m5 |
| pmulhrsw m4, [grain_lutq+offxyq*2] |
| pmulhrsw m5, [grain_lutq+offxyq*2+32] |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m4 |
| paddw m1, m5 |
| pmaxsw m0, m12 |
| pmaxsw m1, m12 |
| pminsw m0, m13 |
| pminsw m1, m13 |
| mova [dstq+srcq+ 0], m0 |
| mova [dstq+srcq+32], m1 |
| |
| add srcq, strideq |
| add grain_lutq, 82*2 |
| dec hd |
| jg .loop_y |
| add wq, 32 |
| jge .end |
| lea srcq, [src_bakq+wq*2] |
| cmp byte [fg_dataq+FGData.overlap_flag], 0 |
| je .loop_x |
| movq xm7, [pw_27_17_17_27] |
| cmp dword r8m, 0 ; sby |
| jne .loop_x_hv_overlap |
| |
| ; horizontal overlap (without vertical overlap) |
| .loop_x_h_overlap: |
| rorx r6, seeq, 1 |
| or seed, 0xEFF4 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, src_bak, left_offxy |
| |
| lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx |
| rorx offyd, seed, 8 |
| rorx offxq, seeq, 12 |
| and offyd, 0xf |
| imul offyd, 164 |
| lea offyd, [offyq+offxq*2+747] ; offy*stride+offx |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, src_bak, left_offxy |
| |
| mov grain_lutq, grain_lutmp |
| mov hd, hm |
| .loop_y_h_overlap: |
| ; scaling[src] |
| mova m0, [srcq+ 0] |
| mova m1, [srcq+32] |
| pand m4, m8, m0 |
| psrld m3, m0, 16 |
| mova m6, m9 |
| vpgatherdd m2, [scalingq+m4-0], m9 |
| pand m3, m8 |
| mova m9, m6 |
| vpgatherdd m4, [scalingq+m3-2], m6 |
| pand m5, m8, m1 |
| mova m6, m9 |
| vpgatherdd m3, [scalingq+m5-0], m9 |
| pblendw m4, m2, 0x55 |
| psrld m2, m1, 16 |
| mova m9, m6 |
| pand m2, m8 |
| vpgatherdd m5, [scalingq+m2-2], m6 |
| pblendw m5, m3, 0x55 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
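| ; horizontal overlap: the first two grain samples of this block are |
| ; blended with the last two of the previous block (left_offxy) using |
| ; weights 27/17 and 17/27, rounded by 16 and shifted right by 5, then |
| ; clamped to the grain range before scaling. |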
| movu m3, [grain_lutq+offxyq*2] |
| movd xm6, [grain_lutq+left_offxyq*2] |
| punpcklwd xm6, xm3 |
| pmaddwd xm6, xm7 |
| paddd xm6, xm14 |
| psrad xm6, 5 |
| packssdw xm6, xm6 |
| pmaxsw xm6, xm9 |
| pminsw xm6, xm10 |
| vpblendd m3, m6, 0x01 |
| |
| ; noise = round2(scaling[src] * grain, scaling_shift) |
| pmaddubsw m4, m11 |
| pmaddubsw m5, m11 |
| paddw m4, m4 |
| paddw m5, m5 |
| pmulhrsw m4, m3 |
| pmulhrsw m5, [grain_lutq+offxyq*2+32] |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m4 |
| paddw m1, m5 |
| pmaxsw m0, m12 |
| pmaxsw m1, m12 |
| pminsw m0, m13 |
| pminsw m1, m13 |
| mova [dstq+srcq+ 0], m0 |
| mova [dstq+srcq+32], m1 |
| |
| add srcq, strideq |
| add grain_lutq, 82*2 |
| dec hd |
| jg .loop_y_h_overlap |
| add wq, 32 |
| jge .end |
| lea srcq, [src_bakq+wq*2] |
| cmp dword r8m, 0 ; sby |
| jne .loop_x_hv_overlap |
| jmp .loop_x_h_overlap |
| |
| .vertical_overlap: |
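| ; vertical overlap needs the grain offsets of the superblock row above, so |
| ; the top row's seed is kept alongside the current one, packed as |
| ; (cur_seed << 16) | top_seed; both halves are advanced in lockstep by the |
| ; packed LFSR update in the loops below. |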
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ |
| sby, see, src_bak |
| |
| movzx sbyd, sbyb |
| imul seed, [fg_dataq+FGData.seed], 0x00010001 |
| imul r7d, sbyd, 173 * 0x00010001 |
| imul sbyd, 37 * 0x01000100 |
| add r7d, (105 << 16) | 188 |
| add sbyd, (178 << 24) | (141 << 8) |
| and r7d, 0x00ff00ff |
| and sbyd, 0xff00ff00 |
| xor seed, r7d |
| xor seed, sbyd ; (cur_seed << 16) | top_seed |
| |
| lea src_bakq, [srcq+wq*2] |
| neg wq |
| sub dstq, srcq |
| |
| .loop_x_v_overlap: |
| vpbroadcastd m15, [pw_27_17_17_27] |
| |
| ; we assume from the block above that bits 8-15 of r7d are zeroed |
| mov r6d, seed |
| or seed, 0xeff4eff4 |
| test seeb, seeh |
| setp r7b ; parity of top_seed |
| shr seed, 16 |
| shl r7d, 16 |
| test seeb, seeh |
| setp r7b ; parity of cur_seed |
| or r6d, 0x00010001 |
| xor r7d, r6d |
| rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, src_bak, unused, top_offxy |
| |
| rorx offyd, seed, 8 |
| rorx offxd, seed, 12 |
| and offyd, 0xf000f |
| and offxd, 0xf000f |
| imul offyd, 164 |
| ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy |
| lea offyd, [offyq+offxq*2+0x10001*747+32*82] |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, src_bak, unused, top_offxy |
| |
| mov grain_lutq, grain_lutmp |
| mov hd, hm |
| movzx top_offxyd, offxyw |
| shr offxyd, 16 |
| .loop_y_v_overlap: |
| ; scaling[src] |
| mova m0, [srcq+ 0] |
| mova m1, [srcq+32] |
| pand m4, m8, m0 |
| psrld m3, m0, 16 |
| mova m6, m9 |
| vpgatherdd m2, [scalingq+m4-0], m9 |
| pand m3, m8 |
| mova m9, m6 |
| vpgatherdd m4, [scalingq+m3-2], m6 |
| pand m5, m8, m1 |
| mova m6, m9 |
| vpgatherdd m3, [scalingq+m5-0], m9 |
| pblendw m2, m4, 0xaa |
| psrld m4, m1, 16 |
| mova m9, m6 |
| pand m4, m8 |
| vpgatherdd m5, [scalingq+m4-2], m6 |
| pblendw m3, m5, 0xaa |
| |
| ; grain = grain_lut[offy+y][offx+x] |
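| ; vertical overlap: every sample of the row is blended with the grain from |
| ; the block above (top_offxy) as top*27 + cur*17 on the first overlap line |
| ; and top*17 + cur*27 on the second (after m15 is reloaded below), rounded |
| ; by 16, shifted right by 5 and clamped to the grain range. |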
| movu m6, [grain_lutq+offxyq*2] |
| movu m5, [grain_lutq+top_offxyq*2] |
| punpcklwd m4, m5, m6 |
| punpckhwd m5, m6 |
| pmaddwd m4, m15 |
| pmaddwd m5, m15 |
| movu m7, [grain_lutq+offxyq*2+32] |
| movu m6, [grain_lutq+top_offxyq*2+32] |
| paddd m4, m14 |
| paddd m5, m14 |
| psrad m4, 5 |
| psrad m5, 5 |
| packssdw m4, m5 |
| punpcklwd m5, m6, m7 |
| punpckhwd m6, m7 |
| pmaddwd m5, m15 |
| pmaddwd m6, m15 |
| paddd m5, m14 |
| paddd m6, m14 |
| psrad m5, 5 |
| psrad m6, 5 |
| packssdw m5, m6 |
| pmaxsw m4, m9 |
| pmaxsw m5, m9 |
| pminsw m4, m10 |
| pminsw m5, m10 |
| |
| ; noise = round2(scaling[src] * grain, scaling_shift) |
| pmaddubsw m2, m11 |
| pmaddubsw m3, m11 |
| paddw m2, m2 |
| paddw m3, m3 |
| pmulhrsw m4, m2 |
| pmulhrsw m5, m3 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m4 |
| paddw m1, m5 |
| pmaxsw m0, m12 |
| pmaxsw m1, m12 |
| pminsw m0, m13 |
| pminsw m1, m13 |
| mova [dstq+srcq+ 0], m0 |
| mova [dstq+srcq+32], m1 |
| |
| add srcq, strideq |
| add grain_lutq, 82*2 |
| dec hb |
| jz .end_y_v_overlap |
| vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line |
| ; 2 lines get vertical overlap, then fall back to non-overlap code for |
| ; remaining (up to) 30 lines |
| add hd, 0x80000000 |
| jnc .loop_y_v_overlap |
| jmp .loop_y |
| .end_y_v_overlap: |
| add wq, 32 |
| jge .end |
| lea srcq, [src_bakq+wq*2] |
| |
| ; since fg_dataq.overlap is guaranteed to be set, we never jump |
| ; back to .loop_x_v_overlap, and instead always fall-through to |
| ; h+v overlap |
| |
| .loop_x_hv_overlap: |
| vpbroadcastd m15, [pw_27_17_17_27] |
| |
| ; we assume from the block above that bits 8-15 of r7d are zeroed |
| mov r6d, seed |
| or seed, 0xeff4eff4 |
| test seeb, seeh |
| setp r7b ; parity of top_seed |
| shr seed, 16 |
| shl r7d, 16 |
| test seeb, seeh |
| setp r7b ; parity of cur_seed |
| or r6d, 0x00010001 |
| xor r7d, r6d |
| rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy |
| |
| lea topleft_offxyd, [top_offxyq+32] |
| lea left_offxyd, [offyq+32] |
| rorx offyd, seed, 8 |
| rorx offxd, seed, 12 |
| and offyd, 0xf000f |
| and offxd, 0xf000f |
| imul offyd, 164 |
| ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy |
| lea offyd, [offyq+offxq*2+0x10001*747+32*82] |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy |
| |
| mov grain_lutq, grain_lutmp |
| mov hd, hm |
| movzx top_offxyd, offxyw |
| shr offxyd, 16 |
| .loop_y_hv_overlap: |
| ; scaling[src] |
| mova m0, [srcq+ 0] |
| mova m1, [srcq+32] |
| pand m4, m8, m0 |
| psrld m3, m0, 16 |
| mova m6, m9 |
| vpgatherdd m2, [scalingq+m4-0], m9 |
| pand m3, m8 |
| mova m9, m6 |
| vpgatherdd m4, [scalingq+m3-2], m6 |
| pand m5, m8, m1 |
| mova m6, m9 |
| vpgatherdd m3, [scalingq+m5-0], m9 |
| pblendw m2, m4, 0xaa |
| psrld m4, m1, 16 |
| mova m9, m6 |
| pand m4, m8 |
| vpgatherdd m5, [scalingq+m4-2], m6 |
| pblendw m3, m5, 0xaa |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m7, [grain_lutq+offxyq*2] |
| movd xm6, [grain_lutq+left_offxyq*2] |
| movu m5, [grain_lutq+top_offxyq*2] |
| movd xm4, [grain_lutq+topleft_offxyq*2] |
| ; do h interpolation first (so top | top/left -> top, left | cur -> cur) |
| punpcklwd xm6, xm7 |
| punpcklwd xm4, xm5 |
| punpcklqdq xm6, xm4 |
| movddup xm4, [pw_27_17_17_27] |
| pmaddwd xm6, xm4 |
| paddd xm6, xm14 |
| psrad xm6, 5 |
| packssdw xm6, xm6 |
| pmaxsw xm6, xm9 |
| pminsw xm6, xm10 |
| pshuflw xm4, xm6, q1032 |
| vpblendd m6, m7, 0xfe |
| vpblendd m4, m5, 0xfe |
| ; followed by v interpolation (top | cur -> cur) |
| punpckhwd m5, m7 |
| pmaddwd m5, m15 |
| punpcklwd m4, m6 |
| pmaddwd m4, m15 |
| movu m7, [grain_lutq+offxyq*2+32] |
| movu m6, [grain_lutq+top_offxyq*2+32] |
| paddd m5, m14 |
| paddd m4, m14 |
| psrad m5, 5 |
| psrad m4, 5 |
| packssdw m4, m5 |
| punpcklwd m5, m6, m7 |
| punpckhwd m6, m7 |
| pmaddwd m5, m15 |
| pmaddwd m6, m15 |
| paddd m5, m14 |
| paddd m6, m14 |
| psrad m5, 5 |
| psrad m6, 5 |
| packssdw m5, m6 |
| pmaxsw m4, m9 |
| pmaxsw m5, m9 |
| pminsw m4, m10 |
| pminsw m5, m10 |
| |
| ; noise = round2(scaling[src] * grain, scaling_shift) |
| pmaddubsw m2, m11 |
| pmaddubsw m3, m11 |
| paddw m2, m2 |
| paddw m3, m3 |
| pmulhrsw m4, m2 |
| pmulhrsw m5, m3 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m4 |
| paddw m1, m5 |
| pmaxsw m0, m12 |
| pmaxsw m1, m12 |
| pminsw m0, m13 |
| pminsw m1, m13 |
| mova [dstq+srcq+ 0], m0 |
| mova [dstq+srcq+32], m1 |
| |
| add srcq, strideq |
| add grain_lutq, 82*2 |
| dec hb |
| jz .end_y_hv_overlap |
| vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line |
| ; 2 lines get vertical overlap, then fall back to non-overlap code for |
| ; remaining (up to) 30 lines |
| add hd, 0x80000000 |
| jnc .loop_y_hv_overlap |
| movq xm7, [pw_27_17_17_27] |
| jmp .loop_y_h_overlap |
| .end_y_hv_overlap: |
| add wq, 32 |
| lea srcq, [src_bakq+wq*2] |
| jl .loop_x_hv_overlap |
| .end: |
| RET |
| |
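| ; Chroma grain application. The scaling index is either the (averaged) |
| ; luma value itself (chroma_scaling_from_luma) or a uv_luma_mult/uv_mult |
| ; weighted blend of luma and chroma plus the bit-depth-scaled uv_offset; |
| ; grain offsets are drawn from the same per-block seeds as luma. Overlap |
| ; blending uses the single-sample 23/22 weights along a subsampled |
| ; dimension and the two-sample 27/17 weights otherwise. |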
| %macro FGUV_FN 3 ; name, ss_hor, ss_ver |
| cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ |
| grain_lut, h, sby, luma, lstride, uv_pl, is_id |
| %define base r12-grain_min |
| lea r12, [grain_min] |
| mov r9d, r13m ; bdmax |
| mov r7d, [fg_dataq+FGData.scaling_shift] |
| mov r11d, is_idm |
| mov sbyd, sbym |
| vpbroadcastw m11, [base+mul_bits+r7*2-12] |
| mov r6d, [fg_dataq+FGData.clip_to_restricted_range] |
| shr r9d, 11 ; is_12bpc |
| vpbroadcastd m8, [base+grain_min+r9*4] |
| shlx r10d, r6d, r9d |
| vpbroadcastd m9, [base+grain_max+r9*4] |
| vpbroadcastw m10, r13m |
| shlx r6d, r6d, r11d |
| vpbroadcastd m12, [base+fg_min+r10*4] |
| lea r6d, [r9+r6*2] |
| vpbroadcastd m13, [base+fg_max+r6*4] |
| test sbyd, sbyd |
| setnz r7b |
| cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 |
| jne .csfl |
| |
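| ; %1 selects the not-csfl variant (scaling index built from the |
| ; uv_luma_mult/uv_mult/uv_offset blend) vs. the chroma_scaling_from_luma |
| ; variant (scaling index taken from luma directly); %2/%3 are the |
| ; horizontal/vertical subsampling flags. |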
| %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| unused, sby, see, overlap |
| |
| %if %1 |
| mov r6d, r11m |
| vpbroadcastd m0, [base+pb_8_9_0_1] |
| vpbroadcastd m1, [base+uv_offset_mul+r9*4] |
| vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] |
| vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4] |
| pshufb m14, m0 ; { uv_luma_mult, uv_mult } |
| pmaddwd m15, m1 |
| %else |
| %if %2 |
| vpbroadcastq m15, [base+pw_23_22] |
| %else |
| vpbroadcastq m15, [base+pw_27_17_17_27] |
| %endif |
| vpbroadcastd m14, [base+pd_16] |
| %endif |
| test r7b, [fg_dataq+FGData.overlap_flag] |
| jnz %%vertical_overlap |
| |
| imul seed, sbyd, (173 << 24) | 37 |
| add seed, (105 << 24) | 178 |
| rorx seed, seed, 24 |
| movzx seed, seew |
| xor seed, [fg_dataq+FGData.seed] |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| unused2, unused3, see, unused4, unused5, unused6, luma, lstride |
| |
| mov lumaq, r9mp |
| mov lstrideq, r10mp |
| lea r10, [srcq+wq*2] |
| lea r11, [dstq+wq*2] |
| lea r12, [lumaq+wq*(2<<%2)] |
| mov r9mp, r10 |
| mov r11mp, r11 |
| mov r12mp, r12 |
| neg wq |
| |
| %%loop_x: |
| rorx r6, seeq, 1 |
| or seed, 0xEFF4 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, unused1, unused2, unused3, luma, lstride |
| |
| rorx offyd, seed, 8 |
| rorx offxq, seeq, 12 |
| and offyd, 0xf |
| imul offyd, 164>>%3 |
| lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, unused1, unused2, unused3, luma, lstride |
| |
| mov grain_lutq, grain_lutmp |
| mov hd, hm |
| %%loop_y: |
| ; luma_src |
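| ; for subsampled chroma the co-located luma is reduced before the scaling |
| ; lookup: phaddw sums horizontal pairs and pavgw with zero halves with |
| ; rounding, i.e. (l0 + l1 + 1) >> 1; vertical subsampling just picks every |
| ; other luma row (no vertical averaging). |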
| %if %2 |
| mova xm2, [lumaq+lstrideq*0+ 0] |
| vinserti128 m2, [lumaq+lstrideq*0+32], 1 |
| mova xm4, [lumaq+lstrideq*0+16] |
| vinserti128 m4, [lumaq+lstrideq*0+48], 1 |
| mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] |
| vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 |
| mova xm5, [lumaq+lstrideq*(1<<%3)+16] |
| vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 |
| phaddw m2, m4 |
| phaddw m3, m5 |
| pxor m4, m4 |
| pavgw m2, m4 |
| pavgw m3, m4 |
| %elif %1 |
| mova m2, [lumaq+ 0] |
| mova m3, [lumaq+32] |
| %endif |
| %if %1 |
| mova m0, [srcq] |
| %if %2 |
| mova m1, [srcq+strideq] |
| %else |
| mova m1, [srcq+32] |
| %endif |
| punpckhwd m4, m2, m0 |
| punpcklwd m2, m0 |
| punpckhwd m5, m3, m1 |
| punpcklwd m3, m1 ; { luma, chroma } |
| REPX {pmaddwd x, m14}, m4, m2, m5, m3 |
| REPX {paddd x, m15}, m4, m2, m5, m3 |
| REPX {psrad x, 6 }, m4, m2, m5, m3 |
| packusdw m2, m4 |
| packusdw m3, m5 |
| pminuw m2, m10 |
| pminuw m3, m10 ; clip_pixel() |
| %elif %2 |
| pand m2, m10 |
| pand m3, m10 |
| %else |
| pand m2, m10, [lumaq+ 0] |
| pand m3, m10, [lumaq+32] |
| %endif |
| |
| ; scaling[luma_src] |
| vpbroadcastd m7, [pd_m65536] |
| pandn m4, m7, m2 |
| mova m6, m7 |
| vpgatherdd m5, [scalingq+m4-0], m7 |
| psrld m2, 16 |
| mova m7, m6 |
| vpgatherdd m4, [scalingq+m2-2], m6 |
| pblendw m4, m5, 0x55 |
| pandn m5, m7, m3 |
| mova m6, m7 |
| vpgatherdd m2, [scalingq+m5-0], m7 |
| psrld m3, 16 |
| vpgatherdd m5, [scalingq+m3-2], m6 |
| pblendw m5, m2, 0x55 |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| pmaddubsw m4, m11 |
| pmaddubsw m5, m11 |
| paddw m4, m4 |
| paddw m5, m5 |
| pmulhrsw m4, [grain_lutq+offxyq*2] |
| %if %2 |
| pmulhrsw m5, [grain_lutq+offxyq*2+82*2] |
| %else |
| pmulhrsw m5, [grain_lutq+offxyq*2+32] |
| %endif |
| |
| ; dst = clip_pixel(src, noise) |
| %if %1 |
| paddw m0, m4 |
| paddw m1, m5 |
| %else |
| paddw m0, m4, [srcq] |
| %if %2 |
| paddw m1, m5, [srcq+strideq] |
| %else |
| paddw m1, m5, [srcq+32] |
| %endif |
| %endif |
| pmaxsw m0, m12 |
| pmaxsw m1, m12 |
| pminsw m0, m13 |
| pminsw m1, m13 |
| mova [dstq], m0 |
| %if %2 |
| mova [dstq+strideq], m1 |
| lea srcq, [srcq+strideq*2] |
| lea dstq, [dstq+strideq*2] |
| lea lumaq, [lumaq+lstrideq*(2<<%3)] |
| %else |
| mova [dstq+32], m1 |
| add srcq, strideq |
| add dstq, strideq |
| add lumaq, lstrideq |
| %endif |
| add grain_lutq, 82*(2<<%2) |
| %if %2 |
| sub hb, 2 |
| %else |
| dec hb |
| %endif |
| jg %%loop_y |
| add wq, 32>>%2 |
| jge .end |
| mov srcq, r9mp |
| mov dstq, r11mp |
| mov lumaq, r12mp |
| lea srcq, [srcq+wq*2] |
| lea dstq, [dstq+wq*2] |
| lea lumaq, [lumaq+wq*(2<<%2)] |
| cmp byte [fg_dataq+FGData.overlap_flag], 0 |
| je %%loop_x |
| cmp dword r8m, 0 ; sby |
| jne %%loop_x_hv_overlap |
| |
| ; horizontal overlap (without vertical overlap) |
| %%loop_x_h_overlap: |
| rorx r6, seeq, 1 |
| or seed, 0xEFF4 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, left_offxy, unused1, unused2, luma, lstride |
| |
| lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx |
| rorx offyd, seed, 8 |
| rorx offxq, seeq, 12 |
| and offyd, 0xf |
| imul offyd, 164>>%3 |
| lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, left_offxy, unused1, unused2, luma, lstride |
| |
| mov grain_lutq, grain_lutmp |
| mov hd, hm |
| %%loop_y_h_overlap: |
| ; luma_src |
| %if %2 |
| mova xm2, [lumaq+lstrideq*0+ 0] |
| vinserti128 m2, [lumaq+lstrideq*0+32], 1 |
| mova xm4, [lumaq+lstrideq*0+16] |
| vinserti128 m4, [lumaq+lstrideq*0+48], 1 |
| mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] |
| vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 |
| mova xm5, [lumaq+lstrideq*(1<<%3)+16] |
| vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 |
| phaddw m2, m4 |
| phaddw m3, m5 |
| pxor m4, m4 |
| pavgw m2, m4 |
| pavgw m3, m4 |
| %elif %1 |
| mova m2, [lumaq] |
| mova m3, [lumaq+32] |
| %endif |
| %if %1 |
| mova m0, [srcq] |
| %if %2 |
| mova m1, [srcq+strideq] |
| %else |
| mova m1, [srcq+32] |
| %endif |
| punpckhwd m4, m2, m0 |
| punpcklwd m2, m0 |
| punpckhwd m5, m3, m1 |
| punpcklwd m3, m1 ; { luma, chroma } |
| REPX {pmaddwd x, m14}, m4, m2, m5, m3 |
| REPX {paddd x, m15}, m4, m2, m5, m3 |
| REPX {psrad x, 6 }, m4, m2, m5, m3 |
| packusdw m2, m4 |
| packusdw m3, m5 |
| pminuw m2, m10 ; clip_pixel() |
| pminuw m3, m10 |
| %elif %2 |
| pand m2, m10 |
| pand m3, m10 |
| %else |
| pand m2, m10, [lumaq+ 0] |
| pand m3, m10, [lumaq+32] |
| %endif |
| |
| ; scaling[luma_src] |
| vpbroadcastd m7, [pd_m65536] |
| pandn m4, m7, m2 |
| mova m6, m7 |
| vpgatherdd m5, [scalingq+m4-0], m7 |
| psrld m2, 16 |
| mova m7, m6 |
| vpgatherdd m4, [scalingq+m2-2], m6 |
| pblendw m4, m5, 0x55 |
| pandn m5, m7, m3 |
| mova m6, m7 |
| vpgatherdd m2, [scalingq+m5-0], m7 |
| psrld m3, 16 |
| vpgatherdd m5, [scalingq+m3-2], m6 |
| pblendw m5, m2, 0x55 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m2, [grain_lutq+offxyq*2] |
| %if %2 |
| movu m3, [grain_lutq+offxyq*2+82*2] |
| %else |
| movu m3, [grain_lutq+offxyq*2+32] |
| %endif |
| movd xm6, [grain_lutq+left_offxyq*2] |
| %if %2 |
| pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} |
| punpckldq xm7, xm2, xm3 ; {cur0, cur1} |
| punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1} |
| %else |
| punpcklwd xm6, xm2 |
| %endif |
| %if %1 |
| %if %2 |
| vpbroadcastq xm7, [pw_23_22] |
| %else |
| movq xm7, [pw_27_17_17_27] |
| %endif |
| pmaddwd xm6, xm7 |
| vpbroadcastd xm7, [pd_16] |
| paddd xm6, xm7 |
| %else |
| pmaddwd xm6, xm15 |
| paddd xm6, xm14 |
| %endif |
| psrad xm6, 5 |
| packssdw xm6, xm6 |
| pmaxsw xm6, xm8 |
| pminsw xm6, xm9 |
| vpblendd m2, m6, 0x01 |
| %if %2 |
| pshuflw xm6, xm6, q1032 |
| vpblendd m3, m6, 0x01 |
| %endif |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| pmaddubsw m4, m11 |
| pmaddubsw m5, m11 |
| paddw m4, m4 |
| paddw m5, m5 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| |
| ; dst = clip_pixel(src, noise) |
| %if %1 |
| paddw m0, m2 |
| paddw m1, m3 |
| %else |
| paddw m0, m2, [srcq] |
| %if %2 |
| paddw m1, m3, [srcq+strideq] |
| %else |
| paddw m1, m3, [srcq+32] |
| %endif |
| %endif |
| pmaxsw m0, m12 |
| pmaxsw m1, m12 |
| pminsw m0, m13 |
| pminsw m1, m13 |
| mova [dstq], m0 |
| %if %2 |
| mova [dstq+strideq], m1 |
| lea srcq, [srcq+strideq*2] |
| lea dstq, [dstq+strideq*2] |
| lea lumaq, [lumaq+lstrideq*(2<<%3)] |
| %else |
| mova [dstq+32], m1 |
| add srcq, strideq |
| add dstq, strideq |
| add lumaq, r10mp |
| %endif |
| add grain_lutq, 82*(2<<%2) |
| %if %2 |
| sub hb, 2 |
| %else |
| dec hb |
| %endif |
| jg %%loop_y_h_overlap |
| add wq, 32>>%2 |
| jge .end |
| mov srcq, r9mp |
| mov dstq, r11mp |
| mov lumaq, r12mp |
| lea srcq, [srcq+wq*2] |
| lea dstq, [dstq+wq*2] |
| lea lumaq, [lumaq+wq*(2<<%2)] |
| cmp dword r8m, 0 ; sby |
| jne %%loop_x_hv_overlap |
| jmp %%loop_x_h_overlap |
| |
| %%vertical_overlap: |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ |
| sby, see, unused1, unused2, unused3, lstride |
| |
| movzx sbyd, sbyb |
| imul seed, [fg_dataq+FGData.seed], 0x00010001 |
| imul r7d, sbyd, 173 * 0x00010001 |
| imul sbyd, 37 * 0x01000100 |
| add r7d, (105 << 16) | 188 |
| add sbyd, (178 << 24) | (141 << 8) |
| and r7d, 0x00ff00ff |
| and sbyd, 0xff00ff00 |
| xor seed, r7d |
| xor seed, sbyd ; (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, unused1, top_offxy, unused2, luma, lstride |
| |
| mov lumaq, r9mp |
| mov lstrideq, r10mp |
| lea r10, [srcq+wq*2] |
| lea r11, [dstq+wq*2] |
| lea r12, [lumaq+wq*(2<<%2)] |
| mov r9mp, r10 |
| mov r11mp, r11 |
| mov r12mp, r12 |
| neg wq |
| |
| %%loop_x_v_overlap: |
| ; we assume from the block above that bits 8-15 of r7d are zeroed |
| mov r6d, seed |
| or seed, 0xeff4eff4 |
| test seeb, seeh |
| setp r7b ; parity of top_seed |
| shr seed, 16 |
| shl r7d, 16 |
| test seeb, seeh |
| setp r7b ; parity of cur_seed |
| or r6d, 0x00010001 |
| xor r7d, r6d |
| rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed |
| |
| rorx offyd, seed, 8 |
| rorx offxd, seed, 12 |
| and offyd, 0xf000f |
| and offxd, 0xf000f |
| imul offyd, 164>>%3 |
| ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy |
| lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, unused1, top_offxy, unused2, luma, lstride |
| |
| mov grain_lutq, grain_lutmp |
| mov hd, hm |
| movzx top_offxyd, offxyw |
| shr offxyd, 16 |
| %if %2 == 0 |
| lea r10, [pw_27_17_17_27] |
| %endif |
| %%loop_y_v_overlap: |
| ; luma_src |
| %if %2 |
| mova xm2, [lumaq+lstrideq*0+ 0] |
| vinserti128 m2, [lumaq+lstrideq*0+32], 1 |
| mova xm4, [lumaq+lstrideq*0+16] |
| vinserti128 m4, [lumaq+lstrideq*0+48], 1 |
| mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] |
| vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 |
| mova xm5, [lumaq+lstrideq*(1<<%3)+16] |
| vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 |
| phaddw m2, m4 |
| phaddw m3, m5 |
| pxor m4, m4 |
| pavgw m2, m4 |
| pavgw m3, m4 |
| %elif %1 |
| mova m2, [lumaq] |
| mova m3, [lumaq+32] |
| %endif |
| %if %1 |
| mova m0, [srcq] |
| %if %2 |
| mova m1, [srcq+strideq] |
| %else |
| mova m1, [srcq+32] |
| %endif |
| punpckhwd m4, m2, m0 |
| punpcklwd m2, m0 |
| punpckhwd m5, m3, m1 |
| punpcklwd m3, m1 ; { luma, chroma } |
| REPX {pmaddwd x, m14}, m4, m2, m5, m3 |
| REPX {paddd x, m15}, m4, m2, m5, m3 |
| REPX {psrad x, 6 }, m4, m2, m5, m3 |
| packusdw m2, m4 |
| packusdw m3, m5 |
| pminuw m2, m10 ; clip_pixel() |
| pminuw m3, m10 |
| %elif %2 |
| pand m2, m10 |
| pand m3, m10 |
| %else |
| pand m2, m10, [lumaq+ 0] |
| pand m3, m10, [lumaq+32] |
| %endif |
| |
| ; scaling[luma_src] |
| vpbroadcastd m7, [pd_m65536] |
| pandn m4, m7, m2 |
| mova m6, m7 |
| vpgatherdd m5, [scalingq+m4-0], m7 |
| psrld m2, 16 |
| mova m7, m6 |
| vpgatherdd m4, [scalingq+m2-2], m6 |
| pblendw m4, m5, 0x55 |
| pandn m5, m7, m3 |
| mova m6, m7 |
| vpgatherdd m2, [scalingq+m5-0], m7 |
| psrld m3, 16 |
| vpgatherdd m5, [scalingq+m3-2], m6 |
| pblendw m5, m2, 0x55 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m6, [grain_lutq+offxyq*2] |
| movu m3, [grain_lutq+top_offxyq*2] |
| punpcklwd m2, m3, m6 |
| punpckhwd m3, m6 ; { top, cur } |
| %if %3 |
| vpbroadcastd m0, [pw_23_22] |
| %elif %2 |
| vpbroadcastd m0, [pw_27_17_17_27] |
| %else |
| vpbroadcastd m0, [r10] |
| %endif |
| REPX {pmaddwd x, m0}, m2, m3 |
| %if %1 |
| vpbroadcastd m1, [pd_16] |
| REPX {paddd x, m1}, m2, m3 |
| %else |
| REPX {paddd x, m14}, m2, m3 |
| %endif |
| REPX {psrad x, 5}, m2, m3 |
| packssdw m2, m3 |
| %if %2 |
| movu m3, [grain_lutq+offxyq*2+82*2] |
| %else |
| movu m3, [grain_lutq+offxyq*2+32] |
| %endif |
| %if %3 |
| pmaxsw m2, m8 |
| pminsw m2, m9 |
| %else |
| %if %2 |
| movu m7, [grain_lutq+top_offxyq*2+82*2] |
| punpckhwd m6, m3, m7 ; { cur, top } |
| punpcklwd m3, m7 |
| %else |
| movu m7, [grain_lutq+top_offxyq*2+32] |
| punpckhwd m6, m7, m3 |
| punpcklwd m3, m7, m3 ; { top, cur } |
| %endif |
| pmaddwd m6, m0 |
| pmaddwd m3, m0 |
| %if %1 |
| paddd m6, m1 |
| paddd m3, m1 |
| %else |
| paddd m6, m14 |
| paddd m3, m14 |
| %endif |
| psrad m6, 5 |
| psrad m3, 5 |
| packssdw m3, m6 |
| pmaxsw m2, m8 |
| pmaxsw m3, m8 |
| pminsw m2, m9 |
| pminsw m3, m9 |
| %endif |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| pmaddubsw m4, m11 |
| pmaddubsw m5, m11 |
| paddw m4, m4 |
| paddw m5, m5 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2, [srcq] |
| %if %2 |
| paddw m1, m3, [srcq+strideq] |
| %else |
| paddw m1, m3, [srcq+32] |
| %endif |
| pmaxsw m0, m12 |
| pmaxsw m1, m12 |
| pminsw m0, m13 |
| pminsw m1, m13 |
| mova [dstq], m0 |
| %if %2 |
| mova [dstq+strideq], m1 |
| sub hb, 2 |
| %else |
| mova [dstq+32], m1 |
| dec hb |
| %endif |
| jle %%end_y_v_overlap |
| %if %2 |
| lea srcq, [srcq+strideq*2] |
| lea dstq, [dstq+strideq*2] |
| lea lumaq, [lumaq+lstrideq*(2<<%3)] |
| %else |
| add srcq, strideq |
| add dstq, strideq |
| add lumaq, lstrideq |
| %endif |
| add grain_lutq, 82*(2<<%2) |
| %if %2 |
| jmp %%loop_y |
| %else |
| add hd, 0x80000000 |
| jc %%loop_y |
| add r10, 4 |
| jmp %%loop_y_v_overlap |
| %endif |
| %%end_y_v_overlap: |
| add wq, 32>>%2 |
| jge .end |
| mov srcq, r9mp |
| mov dstq, r11mp |
| mov lumaq, r12mp |
| lea srcq, [srcq+wq*2] |
| lea dstq, [dstq+wq*2] |
| lea lumaq, [lumaq+wq*(2<<%2)] |
| |
| ; since fg_dataq.overlap is guaranteed to be set, we never jump |
| ; back to .loop_x_v_overlap, and instead always fall-through to |
| ; h+v overlap |
| %%loop_x_hv_overlap: |
| ; we assume from the block above that bits 8-15 of r7d are zeroed |
| mov r6d, seed |
| or seed, 0xeff4eff4 |
| test seeb, seeh |
| setp r7b ; parity of top_seed |
| shr seed, 16 |
| shl r7d, 16 |
| test seeb, seeh |
| setp r7b ; parity of cur_seed |
| or r6d, 0x00010001 |
| xor r7d, r6d |
| rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride |
| |
| %if %2 == 0 |
| lea r14, [pw_27_17_17_27] |
| %endif |
| lea topleft_offxyq, [top_offxyq+(32>>%2)] |
| lea left_offxyq, [offyq+(32>>%2)] |
| rorx offyd, seed, 8 |
| rorx offxd, seed, 12 |
| and offyd, 0xf000f |
| and offxd, 0xf000f |
| imul offyd, 164>>%3 |
| ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy |
| lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride |
| |
| mov grain_lutq, grain_lutmp |
| mov hd, hm |
| movzx top_offxyd, offxyw |
| shr offxyd, 16 |
| %%loop_y_hv_overlap: |
| ; luma_src |
| %if %2 |
| mova xm2, [lumaq+lstrideq*0+ 0] |
| vinserti128 m2, [lumaq+lstrideq*0+32], 1 |
| mova xm4, [lumaq+lstrideq*0+16] |
| vinserti128 m4, [lumaq+lstrideq*0+48], 1 |
| mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] |
| vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 |
| mova xm5, [lumaq+lstrideq*(1<<%3)+16] |
| vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 |
| phaddw m2, m4 |
| phaddw m3, m5 |
| pxor m4, m4 |
| pavgw m2, m4 |
| pavgw m3, m4 |
| %elif %1 |
| mova m2, [lumaq] |
| mova m3, [lumaq+32] |
| %endif |
| %if %1 |
| mova m0, [srcq] |
| %if %2 |
| mova m1, [srcq+strideq] |
| %else |
| mova m1, [srcq+32] |
| %endif |
| punpckhwd m4, m2, m0 |
| punpcklwd m2, m0 |
| punpckhwd m5, m3, m1 |
| punpcklwd m3, m1 ; { luma, chroma } |
| REPX {pmaddwd x, m14}, m4, m2, m5, m3 |
| REPX {paddd x, m15}, m4, m2, m5, m3 |
| REPX {psrad x, 6 }, m4, m2, m5, m3 |
| packusdw m2, m4 |
| packusdw m3, m5 |
| pminuw m2, m10 ; clip_pixel() |
| pminuw m3, m10 |
| %elif %2 |
| pand m2, m10 |
| pand m3, m10 |
| %else |
| pand m2, m10, [lumaq+ 0] |
| pand m3, m10, [lumaq+32] |
| %endif |
| |
| ; scaling[luma_src] |
| vpbroadcastd m7, [pd_m65536] |
| pandn m4, m7, m2 |
| mova m6, m7 |
| vpgatherdd m5, [scalingq+m4-0], m7 |
| psrld m2, 16 |
| mova m7, m6 |
| vpgatherdd m4, [scalingq+m2-2], m6 |
| pblendw m4, m5, 0x55 |
| pandn m5, m7, m3 |
| mova m6, m7 |
| vpgatherdd m2, [scalingq+m5-0], m7 |
| psrld m3, 16 |
| vpgatherdd m5, [scalingq+m3-2], m6 |
| pblendw m5, m2, 0x55 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m0, [grain_lutq+offxyq*2] |
| movd xm2, [grain_lutq+left_offxyq*2] |
| movu m6, [grain_lutq+top_offxyq*2] |
| %if %2 |
| pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2 |
| movu m3, [grain_lutq+offxyq*2+82*2] |
| punpckldq xm1, xm0, xm3 ; { cur0, cur1 } |
| %if %3 |
| vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } |
| vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 } |
| %else |
| vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1 |
| vpbroadcastd m7, [grain_lutq+topleft_offxyq*2] |
| vpblendd m2, m7, 0x20 |
| movd xm7, [grain_lutq+top_offxyq*2+82*2] |
| punpckldq xm7, xm6 |
| vinserti128 m1, xm7, 1 |
| movu m7, [grain_lutq+top_offxyq*2+82*2] |
| %endif |
| punpcklwd m2, m1 ; { left, cur } |
| %if %1 |
| vpbroadcastq m1, [pw_23_22] |
| pmaddwd m2, m1 |
| vpbroadcastd m1, [pd_16] |
| paddd m2, m1 |
| psrad m2, 5 |
| packssdw m2, m2 |
| vpermq m2, m2, q3120 |
| %else |
| pmaddwd m2, m15 |
| paddd m2, m14 |
| psrad m2, 5 |
| vextracti128 xm1, m2, 1 |
| packssdw xm2, xm1 |
| %endif |
| %else |
| pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1 |
| movu m3, [grain_lutq+offxyq*2+32] |
| movu m7, [grain_lutq+top_offxyq*2+32] |
| punpckldq xm1, xm0, xm6 |
| punpcklwd xm2, xm1 ; { left, cur } |
| %if %1 |
| movddup xm1, [pw_27_17_17_27] |
| pmaddwd xm2, xm1 |
| vpbroadcastd m1, [pd_16] |
| paddd xm2, xm1 |
| %else |
| pmaddwd xm2, xm15 |
| paddd xm2, xm14 |
| %endif |
| psrad xm2, 5 |
| packssdw xm2, xm2 |
| %endif |
| pmaxsw xm2, xm8 |
| pminsw xm2, xm9 |
| vpblendd m0, m2, 0x01 |
| %if %2 |
| pshufd xm2, xm2, q0321 |
| vpblendd m3, m2, 0x01 |
| %if %3 == 0 |
| pshufd xm2, xm2, q0321 |
| vpblendd m7, m2, 0x01 |
| %endif |
| %endif |
| pshuflw xm2, xm2, q1032 |
| vpblendd m2, m6, 0xfe |
| punpckhwd m6, m0 ; { top, cur } |
| punpcklwd m2, m0 |
| %if %3 |
| vpbroadcastd m0, [pw_23_22] |
| %elif %2 |
| vpbroadcastd m0, [pw_27_17_17_27] |
| %else |
| vpbroadcastd m0, [r14] |
| %endif |
| pmaddwd m6, m0 |
| pmaddwd m2, m0 |
| %if %1 |
| paddd m6, m1 |
| paddd m2, m1 |
| %else |
| paddd m6, m14 |
| paddd m2, m14 |
| %endif |
| psrad m6, 5 |
| psrad m2, 5 |
| packssdw m2, m6 |
| |
| %if %3 |
| pmaxsw m2, m8 |
| pminsw m2, m9 |
| %else |
| %if %2 |
| punpckhwd m6, m3, m7 |
| punpcklwd m3, m7 ; { cur, top } |
| %else |
| punpckhwd m6, m7, m3 |
| punpcklwd m3, m7, m3 ; { top, cur } |
| %endif |
| REPX {pmaddwd x, m0}, m6, m3 |
| %if %1 |
| REPX {paddd x, m1}, m6, m3 |
| %else |
| REPX {paddd x, m14}, m6, m3 |
| %endif |
| REPX {psrad x, 5}, m6, m3 |
| packssdw m3, m6 |
| pmaxsw m2, m8 |
| pmaxsw m3, m8 |
| pminsw m2, m9 |
| pminsw m3, m9 |
| %endif |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| pmaddubsw m4, m11 |
| pmaddubsw m5, m11 |
| paddw m4, m4 |
| paddw m5, m5 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2, [srcq] |
| %if %2 |
| paddw m1, m3, [srcq+strideq] |
| %else |
| paddw m1, m3, [srcq+32] |
| %endif |
| pmaxsw m0, m12 |
| pmaxsw m1, m12 |
| pminsw m0, m13 |
| pminsw m1, m13 |
| mova [dstq], m0 |
| %if %2 |
| mova [dstq+strideq], m1 |
| lea srcq, [srcq+strideq*2] |
| lea dstq, [dstq+strideq*2] |
| lea lumaq, [lumaq+lstrideq*(2<<%3)] |
| %else |
| mova [dstq+32], m1 |
| add srcq, strideq |
| add dstq, strideq |
| add lumaq, r10mp |
| %endif |
| add grain_lutq, 82*(2<<%2) |
| %if %2 |
| sub hb, 2 |
| jg %%loop_y_h_overlap |
| %else |
| dec hb |
| jle %%end_y_hv_overlap |
| add hd, 0x80000000 |
| jc %%loop_y_h_overlap |
| add r14, 4 |
| jmp %%loop_y_hv_overlap |
| %endif |
| %%end_y_hv_overlap: |
| add wq, 32>>%2 |
| jge .end |
| mov srcq, r9mp |
| mov dstq, r11mp |
| mov lumaq, r12mp |
| lea srcq, [srcq+wq*2] |
| lea dstq, [dstq+wq*2] |
| lea lumaq, [lumaq+wq*(2<<%2)] |
| jmp %%loop_x_hv_overlap |
| %endmacro |
| |
| %%FGUV_32x32xN_LOOP 1, %2, %3 |
| .csfl: |
| %%FGUV_32x32xN_LOOP 0, %2, %3 |
| .end: |
| RET |
| %endmacro |
| |
| GEN_GRAIN_UV_FN 420, 1, 1 |
| FGUV_FN 420, 1, 1 |
| GEN_GRAIN_UV_FN 422, 1, 0 |
| FGUV_FN 422, 1, 0 |
| GEN_GRAIN_UV_FN 444, 0, 0 |
| FGUV_FN 444, 0, 0 |
| |
| %endif ; ARCH_X86_64 |