| ; Copyright © 2021, VideoLAN and dav1d authors |
| ; Copyright © 2021, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| %include "x86/filmgrain_common.asm" |
| |
SECTION_RODATA 16
; Constant pool for the 16bpc film grain routines (16-byte aligned).
pd_16:          times 4 dd 16
pw_1:           times 8 dw 1
pw_16384:       times 8 dw 16384
pw_8192:        times 8 dw 8192
pw_23_22:       dw 23, 22
                times 3 dw 0, 32    ; padding pairs following the 23/22 weights
pb_mask:        db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
pw_27_17_17_27: dw 27, 17, 17, 27
                times 2 dw 0, 32    ; padding pairs following the 27/17 weights
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
; per-plane seed xor values (u then v), indexed by uv*4
pw_seed_xor:    times 2 dw 0xb524
                times 2 dw 0x49d8
pb_1:           times 4 db 1
hmul_bits:      dw 32768, 16384, 8192, 4096
round:          dw 2048, 1024, 512          ; indexed by grain_scale_shift
mul_bits:       dw 256, 128, 64, 32, 16
round_vals:     dw 32, 64, 128, 256, 512, 1024  ; AR rounding, indexed by ar_coeff_shift
max:            dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
min:            dw 0, 16*4, 16*16
; these two should be next to each other
pw_4:           times 2 dw 4
pw_16:          times 2 dw 16
| |
; JMP_TABLE name, lag0, lag1, ...
; Emits a table of 32-bit offsets (relative to the table itself) pointing at
; the .ar<lag> labels of the named function; used to dispatch on
; FGData.ar_coeff_lag after grain generation.
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
| |
SECTION .text

; On x86-32 all static data is addressed relative to a PIC base register
; (loaded with LEA r?, $$); on x86-64 RIP-relative addressing is used directly.
%if ARCH_X86_32
%undef base
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif

; m(x): mangled, cpu-suffixed symbol name for function x
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
| |
; vpgatherdw dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
; Emulated word gather for pre-AVX2 CPUs: extracts %6 (default 8) word
; indices from %2 two at a time via GPRs %4/%5, loads words from
; [%3 + index*%7] and packs them into %1. Clobbers %4, %5 and (unless %8 is
; given) shifts indices out of %2 in place; with %8, %2 is preserved.
%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
    %assign %%idx 0
    %define %%tmp %2
    %if %0 == 8
        %define %%tmp %8
    %endif
    %rep (%6/2)
        %if %%idx == 0
            movd        %5 %+ d, %2
            pshuflw     %%tmp, %2, q3232        ; move indices 2-3 into the low qword
        %else
            movd        %5 %+ d, %%tmp
            %if %6 == 8
                %if %%idx == 2
                    punpckhqdq  %%tmp, %%tmp    ; expose indices 4-7
                %elif %%idx == 4
                    psrlq       %%tmp, 32       ; expose indices 6-7
                %endif
            %endif
        %endif
        movzx           %4 %+ d, %5 %+ w        ; low word index
        shr             %5 %+ d, 16             ; high word index

        %if %%idx == 0
            movd        %1, [%3+%4*%7]
        %else
            pinsrw      %1, [%3+%4*%7], %%idx + 0
        %endif
        pinsrw          %1, [%3+%5*%7], %%idx + 1
        %assign %%idx %%idx+2
    %endrep
%endmacro
| |
; SPLATD dst, src: broadcast the low dword of src to all 4 dword lanes of dst.
%macro SPLATD 2 ; dst, src
    %ifnidn %1, %2
        movd        %1, %2
    %endif
    pshufd          %1, %1, q0000
%endmacro
| |
; SPLATW dst, src: broadcast the low word of src to all 8 word lanes of dst.
%macro SPLATW 2 ; dst, src
    %ifnidn %1, %2
        movd        %1, %2
    %endif
    pshuflw         %1, %1, q0000
    punpcklqdq      %1, %1
%endmacro
| |
| |
INIT_XMM ssse3
; generate_grain_y_16bpc(buf, fg_data, bdmax)
; Fills the 82x73-entry luma grain buffer with pseudo-random values gathered
; from gaussian_sequence (4 seeds advanced per iteration), scaled by
; grain_scale_shift, then tail-jumps to the .ar<lag> filter selected by
; FGData.ar_coeff_lag.
%if ARCH_X86_64
cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
    lea              r4, [pb_mask]
%define base r4-pb_mask
%else
cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
    LEA              r4, $$
%define base r4-$$
%endif
    movq             m1, [base+rnd_next_upperbit_mask]
    movq             m4, [base+mul_bits]
    movq             m7, [base+hmul_bits]
    mov             r3d, [fg_dataq+FGData.grain_scale_shift]
    lea             r5d, [bdmaxq+1]
    shr             r5d, 11                 ; 0 for 10bpc, 2 for 12bpc
    sub              r3, r5
    SPLATW           m6, [base+round+r3*2-2]
    mova             m5, [base+pb_mask]
    SPLATW           m0, [fg_dataq+FGData.seed]
    mov              r3, -73*82*2           ; negative byte offset, counts up to 0
    sub            bufq, r3                 ; bufq points past the end of the buffer
%if ARCH_X86_64
    lea              r6, [gaussian_sequence]
%endif
.loop:
    ; advance the PRNG by 4 steps in parallel, producing 4 seeds at once
    pand             m2, m0, m1
    psrlw            m3, m2, 10
    por              m2, m3                 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw           m2, m4                 ; bits 0x0f00 are set
    pshufb           m3, m5, m2             ; set 15th bit for next 4 seeds
    psllq            m2, m3, 30
    por              m2, m3
    psllq            m3, m2, 15
    por              m2, m3                 ; aggregate each bit into next seed's high bit
    pmulhuw          m3, m0, m7
    por              m2, m3                 ; 4 next output seeds
    pshuflw          m0, m2, q3333          ; carry the last seed into the next iteration
    psrlw            m2, 5                  ; indices into gaussian_sequence
%if ARCH_X86_64
    vpgatherdw       m3, m2, r6, r5, r7, 4, 2
%else
    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r2, 4, 2
%endif
    paddw            m3, m3                 ; otherwise bpc=12 w/ grain_scale_shift=0
                                            ; shifts by 0, which pmulhrsw does not support
    pmulhrsw         m3, m6
    movq      [bufq+r3], m3
    add              r3, 4*2
    jl .loop

    ; auto-regression code
    movsxd           r3, [fg_dataq+FGData.ar_coeff_lag]
    movsxd           r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
    lea              r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
    jmp              r3
| |
; Lag-1 AR filter for luma: each pixel is predicted from top-left/top/
; top-right (SIMD pmaddwd, 4 columns at a time) plus the scalar left
; neighbour (serial dependency handled in the GPR inner loop), then shifted
; and clamped to [min_grain, max_grain].
.ar1:
%if WIN64
    DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
    lea            bufq, [r0-2*(82*73-(82*3+79))]   ; skip 3 rows + 79 cols of padding
    PUSH             r8
%else
%if ARCH_X86_64
    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
%else ; x86-32
    DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
    PUSH             r6
%define shiftd r1d
%endif
    sub            bufq, 2*(82*73-(82*3+79))        ; skip 3 rows + 79 cols of padding
%endif
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]   ; left coefficient
    movd             m4, [fg_dataq+FGData.ar_coeffs_y]          ; cf0-3 as bytes
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
%if WIN64
    DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
%elif ARCH_X86_64
    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
%else ; x86-32
%undef shiftd
    DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
%define hd dword r0m
%define maxd dword minm
%endif
%if cpuflag(sse4)
    pmovsxbw         m4, m4
%else
    ; sign-extend the coefficient bytes to words without sse4
    pxor             m3, m3
    pcmpgtb          m3, m4
    punpcklbw        m4, m3
%endif
    pinsrw           m4, [base+pw_1], 3
    pshufd           m5, m4, q1111
    pshufd           m4, m4, q0000
    SPLATW           m3, [base+round_vals+shiftq*2-12]  ; rnd
    mov              hd, 70
    sar            maxd, 1
    mov            mind, maxd
    xor            mind, -1                ; mind = ~maxd = -maxd-1
.y_loop_ar1:
    mov              xq, -76
    movsx         val3d, word [bufq+xq*2-2]            ; left neighbour seed
.x_loop_ar1:
    movu             m0, [bufq+xq*2-82*2-2]            ; top/left
    psrldq           m2, m0, 2                         ; top
    psrldq           m1, m0, 4                         ; top/right
    punpcklwd        m0, m2
    punpcklwd        m1, m3                            ; pair top/right with rnd
    pmaddwd          m0, m4
    pmaddwd          m1, m5
    paddd            m0, m1                            ; 4 partial sums (w/o left term)
.x_loop_ar1_inner:
    movd          val0d, m0
    psrldq           m0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    sar           val3d, shiftb
    movsx         val0d, word [bufq+xq*2]
    add           val3d, val0d
    cmp           val3d, maxd
    cmovg         val3d, maxd                          ; clamp to max_grain
    cmp           val3d, mind
    cmovl         val3d, mind                          ; clamp to min_grain
    mov word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xq, 3
    jnz .x_loop_ar1_inner                              ; 4 pixels per SIMD batch
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add            bufq, 82*2
    dec              hd
    jg .y_loop_ar1
%if WIN64
    POP              r8
%elif ARCH_X86_32
    POP              r6
%undef maxd
%undef hd
%endif
.ar0:                                                  ; lag 0: no filtering needed
    RET
| |
; Lag-2 AR filter for luma: 12 coefficients covering the two rows above
; (computed 4 columns at a time with pmaddwd) plus the two left neighbours
; of the current row (serial inner loop, one pixel at a time).
.ar2:
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -16*8                      ; spill space for m8-m15 equivalents
%endif
    DEFINE_ARGS buf, fg_data, bdmax, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd             m0, [base+round_vals-12+shiftq*2]
    pshuflw          m0, m0, q0000
    movu             m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11
    pxor             m2, m2
    punpcklwd        m0, m2                ; rounding constant as dwords
    pcmpgtb          m2, m6
    punpckhbw        m3, m6, m2            ; sign-extend cf8-11 to words
    punpcklbw        m6, m2                ; sign-extend cf0-7 to words
    pshufd           m2, m6, q3333
    pshufd           m1, m6, q2222
    pshufd           m7, m6, q1111
    pshufd           m6, m6, q0000
    pshufd           m4, m3, q1111
    pshufd           m3, m3, q0000
%if ARCH_X86_64
    SWAP 0, 12
    SWAP 1, 8
    SWAP 2, 9
    SWAP 3, 10
    SWAP 4, 11
%else
    ; x86-32 only has 8 xmm regs: keep m8-m12 on the stack
%define m12 [rsp+0*16]
%define m8 [rsp+1*16]
%define m9 [rsp+2*16]
%define m10 [rsp+3*16]
%define m11 [rsp+4*16]
    mova            m12, m0
    mova             m8, m1
    mova             m9, m2
    mova            m10, m3
    mova            m11, m4
    mov          bdmaxd, bdmaxm
%endif
    sar          bdmaxd, 1
    SPLATW           m0, bdmaxd            ; max_grain
    pcmpeqw          m1, m1
%if !cpuflag(sse4)
    ; build a word mask selecting lane 1 (pblendw replacement)
    pcmpeqw          m2, m2
    psrldq           m2, 14
    pslldq           m2, 2
    pxor             m2, m1
%endif
    pxor             m1, m0                ; min_grain
%if ARCH_X86_64
    SWAP 0, 13
    SWAP 1, 14
    SWAP 2, 15
%else
%define m13 [rsp+5*16]
%define m14 [rsp+6*16]
    mova            m13, m0
    mova            m14, m1
%if !cpuflag(sse4)
%define m15 [rsp+7*16]
    mova            m15, m2
%endif
%endif
    sub            bufq, 2*(82*73-(82*3+79))
    DEFINE_ARGS buf, fg_data, h, x
    mov              hd, 70
.y_loop_ar2:
    mov              xq, -76

.x_loop_ar2:
    movu             m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
    movu             m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
    psrldq           m2, m0, 2
    psrldq           m3, m0, 4
    psrldq           m4, m0, 6
    psrldq           m5, m0, 8
    punpcklwd        m0, m2
    punpcklwd        m3, m4
    punpcklwd        m5, m1
    psrldq           m2, m1, 2
    psrldq           m4, m1, 4
    punpcklwd        m2, m4
    psrldq           m4, m1, 6
    psrldq           m1, 8
    punpcklwd        m4, m1
    pmaddwd          m0, m6
    pmaddwd          m3, m7
    pmaddwd          m5, m8
    pmaddwd          m2, m9
    pmaddwd          m4, m10
    paddd            m0, m3
    paddd            m5, m2
    paddd            m0, m4
    paddd            m0, m5                ; accumulated top 2 rows
    paddd            m0, m12               ; += rounding constant

    movu             m1, [bufq+xq*2-4]     ; y=0,x=[-2,+5]
    pshufd           m4, m1, q3321
    pxor             m2, m2
    pcmpgtw          m2, m4
    punpcklwd        m4, m2                ; in dwords, y=0,x=[0,3]
.x_loop_ar2_inner:
    pmaddwd          m2, m1, m11           ; left x 2 coefficients
    paddd            m2, m0
    psrldq           m0, 4                 ; shift top to next pixel
    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    paddd            m2, m4                ; add current unfiltered pixel
    packssdw         m2, m2
    pminsw           m2, m13               ; clamp to max_grain
    pmaxsw           m2, m14               ; clamp to min_grain
    psrldq           m4, 4
    pslldq           m2, 2
    psrldq           m1, 2
%if cpuflag(sse4)
    pblendw          m1, m2, 00000010b
%else
    pand             m1, m15
    pandn            m3, m15, m2
    por              m1, m3
%endif
    ; overwrite previous pixel, this should be ok
    movd [bufq+xq*2-2], m1
    inc              xq
    jz .x_loop_ar2_end
    test             xq, 3
    jnz .x_loop_ar2_inner                  ; 4 pixels per SIMD batch
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82*2
    dec              hd
    jg .y_loop_ar2
%if ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
    RET
| |
; Lag-3 AR filter for luma: 24 coefficients covering the three rows above
; (SIMD, 4 columns at a time; overflow coefficients spilled to tmp stack
; slots) plus the three left neighbours (serial inner loop).
.ar3:
    DEFINE_ARGS buf, fg_data, bdmax, shift
%if WIN64
    mov              r6, rsp
    and             rsp, ~15
    sub             rsp, 64
%define tmp rsp
%elif ARCH_X86_64
%define tmp rsp+stack_offset-72            ; reuse the red zone / home space
%else
%assign stack_offset stack_offset_old
    ALLOC_STACK -16*12
%define tmp rsp
    mov          bdmaxd, bdmaxm
%endif
    sar          bdmaxd, 1
    SPLATW           m7, bdmaxd            ; max_grain
    pcmpeqw          m6, m6
%if !cpuflag(sse4)
    ; build a word mask selecting lane 2 (pblendw replacement)
    pcmpeqw          m4, m4
    psrldq           m4, 14
    pslldq           m4, 4
    pxor             m4, m6
%endif
    pxor             m6, m7                ; min_grain
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]

%if ARCH_X86_64
    SWAP 6, 14
    SWAP 7, 15
%else
%define m14 [rsp+10*16]
%define m15 [esp+11*16]
    mova            m14, m6
    mova            m15, m7
%endif

    ; build cf0-1 until 18-19 in m5-12 and r0/1
    pxor             m1, m1
    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
    pcmpgtb          m1, m0
    punpckhbw        m2, m0, m1            ; sign-extend cf8-15 to words
    punpcklbw        m0, m1                ; sign-extend cf0-7 to words

%if cpuflag(sse4)
    pshufd           m4, m2, q3333
%else
    pshufd           m5, m2, q3333
    mova      [tmp+48], m5                 ; spill cf14-15 (m12 is the blend mask)
%endif
    pshufd           m3, m2, q2222
    pshufd           m1, m2, q0000
    pshufd           m2, m2, q1111
    pshufd           m7, m0, q2222
    pshufd           m6, m0, q1111
    pshufd           m5, m0, q0000
    pshufd           m0, m0, q3333

%if ARCH_X86_64
    SWAP 0, 8
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
%else
    ; x86-32: spill m8-m12 equivalents to the stack
%define m8 [rsp+4*16]
%define m9 [esp+5*16]
%define m10 [rsp+6*16]
%define m11 [esp+7*16]
%define m12 [rsp+8*16]
    mova             m8, m0
    mova             m9, m1
    mova            m10, m2
    mova            m11, m3
    mova            m12, m4
%endif

    ; build cf20,round in r2
    ; build cf21-23,round*2 in m13
    pxor             m1, m1
    movq             m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
    pcmpgtb          m1, m0
    punpcklbw        m0, m1                ; sign-extend cf16-23 to words
    pshufd           m1, m0, q0000
    pshufd           m2, m0, q1111
    mova       [tmp+ 0], m1
    mova       [tmp+16], m2
    psrldq           m3, m0, 10
    pinsrw           m3, [base+round_vals+shiftq*2-10], 3

%if ARCH_X86_64
    SWAP 3, 13
%else
%define m13 [esp+9*16]
    mova            m13, m3
%endif

    pinsrw           m0, [base+round_vals+shiftq*2-12], 5
    pshufd           m3, m0, q2222
    mova       [tmp+32], m3

    DEFINE_ARGS buf, fg_data, h, x
    sub            bufq, 2*(82*73-(82*3+79))
    mov              hd, 70
.y_loop_ar3:
    mov              xq, -76

.x_loop_ar3:
    movu             m0, [bufq+xq*2-82*6-6+ 0]  ; y=-3,x=[-3,+4]
    movd             m1, [bufq+xq*2-82*6-6+16]  ; y=-3,x=[+5,+6]
    palignr          m2, m1, m0, 2              ; y=-3,x=[-2,+5]
    palignr          m1, m1, m0, 12             ; y=-3,x=[+3,+6]
    punpckhwd        m3, m0, m2                 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd        m0, m2                     ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps           m2, m0, m3, q1032          ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]

    pmaddwd          m0, m5
    pmaddwd          m2, m6
    pmaddwd          m3, m7
    paddd            m0, m2
    paddd            m0, m3
    ; m0 = top line first 6 multiplied by cf, m1 = top line last entry

    movu             m2, [bufq+xq*2-82*4-6+ 0]  ; y=-2,x=[-3,+4]
    movd             m3, [bufq+xq*2-82*4-6+16]  ; y=-2,x=[+5,+6]
    punpcklwd        m1, m2                     ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
    palignr          m4, m3, m2, 2              ; y=-2,x=[-2,+5]
    palignr          m3, m3, m2, 4              ; y=-2,x=[-1,+6]
    punpckhwd        m2, m4, m3                 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
    punpcklwd        m4, m3                     ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    shufps           m3, m4, m2, q1032          ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]

    pmaddwd          m1, m8
    pmaddwd          m4, m9
    pmaddwd          m3, m10
    pmaddwd          m2, m11
    paddd            m1, m4
    paddd            m3, m2
    paddd            m0, m1
    paddd            m0, m3
    ; m0 = top 2 lines multiplied by cf

    movu             m1, [bufq+xq*2-82*2-6+ 0]  ; y=-1,x=[-3,+4]
    movd             m2, [bufq+xq*2-82*2-6+16]  ; y=-1,x=[+5,+6]
    palignr          m3, m2, m1, 2              ; y=-1,x=[-2,+5]
    palignr          m2, m2, m1, 12             ; y=-1,x=[+3,+6]
    punpckhwd        m4, m1, m3                 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd        m1, m3                     ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps           m3, m1, m4, q1032          ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    punpcklwd        m2, [base+pw_1]

%if cpuflag(sse4)
    pmaddwd          m1, m12
%else
    pmaddwd          m1, [tmp+48]
%endif
    pmaddwd          m3, [tmp+ 0]
    pmaddwd          m4, [tmp+16]
    pmaddwd          m2, [tmp+32]
    paddd            m1, m3
    paddd            m4, m2
    paddd            m0, m1
    paddd            m0, m4
    ; m0 = top 3 lines multiplied by cf plus rounding for downshift

    movu             m1, [bufq+xq*2-6]          ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    pmaddwd          m2, m1, m13
    pshufd           m3, m2, q1111
    paddd            m2, m3                     ; left+cur
    paddd            m2, m0                     ; add top
    psrldq           m0, 4
    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    packssdw         m2, m2
    pminsw           m2, m15                    ; clamp to max_grain
    pmaxsw           m2, m14                    ; clamp to min_grain
    pslldq           m2, 4
    psrldq           m1, 2
%if cpuflag(sse4)
    pblendw          m1, m2, 00000100b
%else
    pand             m1, m12
    pandn            m3, m12, m2
    por              m1, m3
%endif
    ; overwrite a couple of pixels, should be ok
    movq [bufq+xq*2-4], m1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3
    jnz .x_loop_ar3_inner                       ; 4 pixels per SIMD batch
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82*2
    dec              hd
    jg .y_loop_ar3
%if WIN64
    mov             rsp, r6
%elif ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
    RET
| |
; generate_grain_uv_fn ss_name, ss_x, ss_y
; Instantiates generate_grain_uv_<ss_name>_16bpc for one chroma subsampling
; mode. Seeds the chroma PRNG (seed xor'd with a per-plane constant), fills
; the (possibly subsampled) grain buffer from gaussian_sequence, then
; dispatches to the per-lag AR filter below.
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
%if ARCH_X86_64
cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
%define base r8-pb_mask
    lea              r8, [pb_mask]
    movifnidn    bdmaxd, bdmaxm
    lea             r6d, [bdmaxq+1]
%else
cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
%define base r2-$$
    LEA              r2, $$
    mov        fg_dataq, r2m
    mov             r6d, r4m                ; bdmax from the stack
    inc             r6d
%endif
    movq             m1, [base+rnd_next_upperbit_mask]
    movq             m4, [base+mul_bits]
    movq             m7, [base+hmul_bits]
    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
    shr             r6d, 11                 ; 0 for 10bpc, 2 for 12bpc
    sub              r5, r6
    SPLATW           m6, [base+round+r5*2-2]
    mova             m5, [base+pb_mask]
    SPLATW           m0, [fg_dataq+FGData.seed]
%if ARCH_X86_64
    SPLATW           m2, [base+pw_seed_xor+uvq*4]
%else
    mov             r5d, r3m                ; uv plane index
    SPLATW           m2, [base+pw_seed_xor+r5*4]
%endif
    pxor             m0, m2                 ; per-plane seed
%if ARCH_X86_64
    lea              r6, [gaussian_sequence]
%endif
%if %2
    ; subsampled horizontally: 44-entry rows, row count depends on ss_y
    mov              hd, 73-35*%3
    add            bufq, 44*2
.loop_y:
    mov              xq, -44
%else
    ; 4:4:4: fill the whole buffer with a single flat loop
    mov              xq, -82*73
    add            bufq, 82*73*2
%endif
.loop_x:
    ; same 4-seeds-per-iteration PRNG step as the luma function
    pand             m2, m0, m1
    psrlw            m3, m2, 10
    por              m2, m3                 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw           m2, m4                 ; bits 0x0f00 are set
    pshufb           m3, m5, m2             ; set 15th bit for next 4 seeds
    psllq            m2, m3, 30
    por              m2, m3
    psllq            m3, m2, 15
    por              m2, m3                 ; aggregate each bit into next seed's high bit
    pmulhuw          m3, m0, m7
    por              m2, m3                 ; 4 next output seeds
    pshuflw          m0, m2, q3333
    psrlw            m2, 5
%if ARCH_X86_64
    vpgatherdw       m3, m2, r6, r9, r10, 4, 2
%else
    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r6, 4, 2
%endif
    paddw            m3, m3                 ; otherwise bpc=12 w/ grain_scale_shift=0
                                            ; shifts by 0, which pmulhrsw does not support
    pmulhrsw         m3, m6
    movq    [bufq+xq*2], m3
    add              xq, 4
    jl .loop_x
%if %2
    add            bufq, 82*2               ; rows are still 82 entries apart
    dec              hd
    jg .loop_y
%endif

    ; auto-regression code
    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
    movsxd           r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
    lea              r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
    jmp              r5
| |
; Lag-0 chroma filter: chroma grain = clamped (luma grain * ar_coeffs_uv[0]
; + existing chroma grain), with the luma source box-filtered according to
; the subsampling mode. 8 pixels per iteration; the final partial vector is
; merged so padding columns are preserved.
.ar0:
%if ARCH_X86_64
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
%else
    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
%assign stack_offset_old stack_offset
    ALLOC_STACK -16*2
    mov           bufyq, r1m
    mov             uvd, r3m
%endif
    imul            uvd, 28                ; sizeof per-plane coeff struct offset
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    SPLATW           m3, [base+hmul_bits+shiftq*2-10]
%if ARCH_X86_64
    sar          bdmaxd, 1
    SPLATW           m1, bdmaxd            ; max_grain
%else
    SPLATW           m1, r4m
    psraw            m1, 1                 ; max_grain
%endif
    pcmpeqw          m7, m7
    pxor             m7, m1                ; min_grain
%if ARCH_X86_64
    SWAP 1, 14
    DEFINE_ARGS buf, bufy, h, x
%else
%define m14 [rsp+0*16]
    mova            m14, m1
    DEFINE_ARGS buf, bufy, pic_reg, h, x
%endif
    pxor             m5, m5
    pcmpgtb          m5, m4
    punpcklbw        m4, m5                ; sign-extend coefficient to word
%if %2
    SPLATW           m6, [base+hmul_bits+2+%3*2]  ; luma averaging factor
%endif
    SPLATW           m4, m4
    pxor             m5, m5
%if %2
%if !cpuflag(sse4)
    ; mask selecting the top 2 words (pblendw replacement for the tail merge)
    pcmpeqw          m2, m2
    pslldq           m2, 12
%if ARCH_X86_64
    SWAP 2, 12
%else
%define m12 [rsp+1*16]
    mova            m12, m2
%endif
%endif
%endif
%if %2
    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
%else
    sub            bufq, 2*(82*70-3)
%endif
    add           bufyq, 2*(3+82*3)        ; skip luma padding
    mov              hd, 70-35*%3
.y_loop_ar0:
    ; first 32 pixels
    xor              xd, xd
.x_loop_ar0:
    movu             m0, [bufyq+xq*(2<<%2)]
%if %2
%if %3
    movu             m2, [bufyq+xq*4+82*2] ; second luma row for 4:2:0
    paddw            m0, m2
%endif
    movu             m1, [bufyq+xq*4 +16]
%if %3
    movu             m2, [bufyq+xq*4+82*2+16]
    paddw            m1, m2
%endif
    phaddw           m0, m1                ; horizontal luma pairs
    pmulhrsw         m0, m6                ; average (/2 or /4)
%endif
    punpckhwd        m1, m0, m5
    punpcklwd        m0, m5
    REPX {pmaddwd x, m4}, m0, m1           ; luma * coefficient, in dwords
    REPX {psrad  x, 5}, m0, m1
    packssdw         m0, m1
    pmulhrsw         m0, m3                ; >> ar_coeff_shift (rounded)
    movu             m1, [bufq+xq*2]
    paddw            m0, m1
    pminsw           m0, m14               ; clamp to max_grain
    pmaxsw           m0, m7                ; clamp to min_grain
    cmp              xd, 72-40*%2
    je .end                                ; final partial vector
    movu    [bufq+xq*2], m0
    add              xd, 8
    jmp .x_loop_ar0

    ; last 6/4 pixels
.end:
%if %2
%if cpuflag(sse4)
    pblendw          m0, m1, 11000000b     ; keep original last 2 words
%else
    pand             m1, m12
    pandn            m2, m12, m0
    por              m0, m1, m2
%endif
    movu    [bufq+xq*2], m0
%else
    movq    [bufq+xq*2], m0                ; only 4 words valid in 4:4:4 tail
%endif

    add            bufq, 82*2
    add           bufyq, 82*(2<<%3)
    dec              hd
    jg .y_loop_ar0
%if ARCH_X86_32
%undef m12
%undef m14
%endif
    RET
| |
; Lag-1 chroma filter: like luma .ar1 but the 4th tap is the (averaged)
; co-located luma grain instead of a rounding constant, and the rounding
; constant is added separately via m3.
.ar1:
%if ARCH_X86_64
    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
%else
%assign stack_offset stack_offset_old
%xdefine rstk rsp
%assign stack_size_padded 0
    DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
    mov           bufyq, r1m
    mov             uvd, r3m
%endif
    imul            uvd, 28
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]  ; left coefficient
    movq             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
%if WIN64
    DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
%if %2
    lea            bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
%else
    lea            bufq, [r0-2*(82*69+3)]
%endif
%else
%if ARCH_X86_64
    DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
%else
    DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
%define hd dword r1m
%define mind dword r3m
%define maxd dword r4m
%endif
%if %2
    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub            bufq, 2*(82*69+3)
%endif
%endif
%if ARCH_X86_64
    mov          shiftd, [r2+FGData.ar_coeff_shift]
%else
    mov          shiftd, [r3+FGData.ar_coeff_shift]
%endif
    pxor             m5, m5
    pcmpgtb          m5, m4
    punpcklbw        m4, m5                 ; cf0-4 in words
    pshuflw          m4, m4, q2100
    psrldq           m4, 2                  ; cf0-3,4 in words
    pshufd           m5, m4, q1111
    pshufd           m4, m4, q0000
    movd             m3, [base+round_vals+shiftq*2-12] ; rnd
    pxor             m6, m6
    punpcklwd        m3, m6
%if %2
    SPLATW           m6, [base+hmul_bits+2+%3*2]  ; luma averaging factor
%endif
    SPLATD           m3, m3
    add           bufyq, 2*(79+82*3)
    mov              hd, 70-35*%3
    sar            maxd, 1
%if ARCH_X86_64
    mov            mind, maxd
    xor            mind, -1                 ; mind = -maxd-1
%else
    DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
    mov              r2, maxd
    xor              r2, -1
    mov            mind, r2
%endif
.y_loop_ar1:
    mov              xq, -(76>>%2)
    movsx         val3d, word [bufq+xq*2-2]
.x_loop_ar1:
    movu             m0, [bufq+xq*2-82*2-2] ; top/left
%if %2
    movu             m7, [bufyq+xq*4]       ; co-located luma
%if %3
    movu             m1, [bufyq+xq*4+82*2]
    phaddw           m7, m1
%else
    phaddw           m7, m7
%endif
%else
    movq             m7, [bufyq+xq*2]
%endif
    psrldq           m2, m0, 2              ; top
    psrldq           m1, m0, 4              ; top/right
    punpcklwd        m0, m2
%if %2
%if %3
    pshufd           m2, m7, q3232
    paddw            m7, m2                 ; sum the two luma rows
%endif
    pmulhrsw         m7, m6                 ; average
%endif
    punpcklwd        m1, m7                 ; pair top/right with luma
    pmaddwd          m0, m4
    pmaddwd          m1, m5
    paddd            m0, m1
    paddd            m0, m3                 ; += rnd
.x_loop_ar1_inner:
    movd          val0d, m0
    psrldq           m0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    sar           val3d, shiftb
    movsx         val0d, word [bufq+xq*2]
    add           val3d, val0d
    cmp           val3d, maxd
    cmovg         val3d, maxd               ; clamp to max_grain
    cmp           val3d, mind
    cmovl         val3d, mind               ; clamp to min_grain
    mov word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add            bufq, 82*2
    add           bufyq, 82*2<<%3
    dec              hd
    jg .y_loop_ar1
%if ARCH_X86_32
%undef maxd
%undef mind
%undef hd
%endif
    RET
| |
; Lag-2 chroma filter: 10 spatial coefficients over the two rows above plus
; the (averaged) luma term paired with pw_1 for rounding, then the serial
; two-left-neighbours inner loop as in luma .ar2.
.ar2:
%if ARCH_X86_64
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
%else
    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
    ALLOC_STACK -16*8
    mov           bufyq, r1m
    mov             uvd, r3m
%endif
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
%if ARCH_X86_64
    sar          bdmaxd, 1
    SPLATW           m5, bdmaxd            ; max_grain
%else
    SPLATW           m5, r4m
    psraw            m5, 1                 ; max_grain
%endif
    pcmpeqw          m6, m6
%if !cpuflag(sse4)
    ; word mask selecting lane 1 (pblendw replacement)
    pcmpeqw          m7, m7
    psrldq           m7, 14
    pslldq           m7, 2
    pxor             m7, m6
%endif
    pxor             m6, m5                ; min_grain
%if %2 && cpuflag(sse4)
    SPLATW           m7, [base+hmul_bits+2+%3*2]  ; luma averaging factor
%endif

%if ARCH_X86_64
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
%else
%define m13 [rsp+5*16]
%define m14 [rsp+6*16]
%define m15 [rsp+7*16]
    mova            m13, m5
    mova            m14, m6
    mova            m15, m7
%endif

    ; coef values
    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
    pxor             m1, m1
    pcmpgtb          m1, m0
    punpckhbw        m2, m0, m1            ; sign-extend cf8-12 to words
    punpcklbw        m0, m1                ; sign-extend cf0-7 to words
    pinsrw           m2, [base+round_vals-12+shiftq*2], 5

    pshufd           m6, m0, q0000
    pshufd           m7, m0, q1111
    pshufd           m1, m0, q3333
    pshufd           m0, m0, q2222
    pshufd           m3, m2, q1111
    pshufd           m4, m2, q2222
    pshufd           m2, m2, q0000

%if ARCH_X86_64
    SWAP 0, 8
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
%else
    ; x86-32: spill m8-m12 equivalents to the stack
%define m8 [rsp+0*16]
%define m9 [rsp+1*16]
%define m10 [rsp+2*16]
%define m11 [rsp+3*16]
%define m12 [rsp+4*16]
    mova             m8, m0
    mova             m9, m1
    mova            m10, m2
    mova            m11, m3
    mova            m12, m4
%endif

%if ARCH_X86_64
    DEFINE_ARGS buf, bufy, fg_data, h, x
%else
    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
%endif
%if %2
    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub            bufq, 2*(82*69+3)
%endif
    add           bufyq, 2*(79+82*3)
    mov              hd, 70-35*%3
.y_loop_ar2:
    mov              xq, -(76>>%2)

.x_loop_ar2:
    movu             m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
    movu             m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
    psrldq           m4, m0, 2              ; y=-2,x=[-1,+5]
    psrldq           m1, m0, 4              ; y=-2,x=[+0,+5]
    psrldq           m3, m0, 6              ; y=-2,x=[+1,+5]
    psrldq           m2, m0, 8              ; y=-2,x=[+2,+5]
    punpcklwd        m0, m4                 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    punpcklwd        m1, m3                 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    punpcklwd        m2, m5                 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
    pmaddwd          m0, m6
    pmaddwd          m1, m7
    pmaddwd          m2, m8
    paddd            m0, m1
    paddd            m0, m2
    psrldq           m3, m5, 2              ; y=-1,x=[-1,+5]
    psrldq           m1, m5, 4              ; y=-1,x=[+0,+5]
    psrldq           m4, m5, 6              ; y=-1,x=[+1,+5]
    psrldq           m2, m5, 8              ; y=-1,x=[+2,+5]
    punpcklwd        m3, m1
    punpcklwd        m4, m2
    pmaddwd          m3, m9
    pmaddwd          m4, m10
    paddd            m3, m4
    paddd            m0, m3

    ; luma component & rounding
%if %2
    movu             m1, [bufyq+xq*4]
%if %3
    movu             m2, [bufyq+xq*4+82*2]
    phaddw           m1, m2
    pshufd           m2, m1, q3232
    paddw            m1, m2                 ; sum the two luma rows
%else
    phaddw           m1, m1
%endif
%if cpuflag(sse4)
    pmulhrsw         m1, m15
%elif %3
    pmulhrsw         m1, [base+pw_8192]     ; /4 for 4:2:0
%else
    pmulhrsw         m1, [base+pw_16384]    ; /2 for 4:2:2
%endif
%else
    movq             m1, [bufyq+xq*2]
%endif
    punpcklwd        m1, [base+pw_1]        ; pair luma with 1 for the rnd term
    pmaddwd          m1, m12
    paddd            m0, m1

    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
    pshufd           m2, m1, q3321
    pxor             m3, m3
    pcmpgtw          m3, m2
    punpcklwd        m2, m3                 ; y=0,x=[0,3] in dword
.x_loop_ar2_inner:
    pmaddwd          m3, m1, m11            ; left x 2 coefficients
    paddd            m3, m0
    psrldq           m0, 4                  ; shift top to next pixel
    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
    ; we do not need to packssdw since we only care about one value
    paddd            m3, m2
    packssdw         m3, m3
    pminsw           m3, m13                ; clamp to max_grain
    pmaxsw           m3, m14                ; clamp to min_grain
    psrldq           m1, 2
    pslldq           m3, 2
    psrldq           m2, 4
%if cpuflag(sse4)
    pblendw          m1, m3, 00000010b
%else
    pand             m1, m15
    pandn            m4, m15, m3
    por              m1, m4
%endif
    ; overwrite previous pixel, should be ok
    movd [bufq+xq*2-2], m1
    inc              xq
    jz .x_loop_ar2_end
    test             xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82*2
    add           bufyq, 82*2<<%3
    dec              hd
    jg .y_loop_ar2
%if ARCH_X86_32
%undef m13
%undef m14
%undef m15
%endif
    RET
| |
; Lag-3 chroma filter: three rows of spatial coefficients (spilled to tmp
; stack slots) plus the (averaged) luma term, then the serial
; three-left-neighbours inner loop as in luma .ar3.
.ar3:
%if ARCH_X86_64
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
%if WIN64
    mov              r6, rsp
    and             rsp, ~15
    sub             rsp, 96
%define tmp rsp
%else
%define tmp rsp+stack_offset-120
%endif
%else
    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
%assign stack_offset stack_offset_old
    ALLOC_STACK -16*14
    mov           bufyq, r1m
    mov             uvd, r3m
%define tmp rsp
%endif
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    SPLATW           m4, [base+round_vals-12+shiftq*2]
    pxor             m5, m5
    pcmpgtw          m5, m4
    punpcklwd        m4, m5                 ; rounding constant as dwords
%if ARCH_X86_64
    sar          bdmaxd, 1
    SPLATW           m6, bdmaxd             ; max_grain
%else
    SPLATW           m6, r4m
    psraw            m6, 1                  ; max_grain
%endif
    pcmpeqw          m7, m7
%if !cpuflag(sse4)
    ; word mask selecting lane 2 (pblendw replacement)
    pcmpeqw          m3, m3
    psrldq           m3, 14
    pslldq           m3, 4
    pxor             m3, m7
%endif
    pxor             m7, m6                 ; min_grain
%if %2 && cpuflag(sse4)
    SPLATW           m3, [base+hmul_bits+2+%3*2]  ; luma averaging factor
%endif

%if ARCH_X86_64
    SWAP 3, 11
    SWAP 4, 12
    SWAP 6, 14
    SWAP 7, 15
%else
%define m11 [rsp+ 9*16]
%define m12 [rsp+10*16]
%define m14 [rsp+12*16]
%define m15 [rsp+13*16]
    mova            m11, m3
    mova            m12, m4
    mova            m14, m6
    mova            m15, m7
%endif

    ; cf from y=-3,x=-3 until y=-3,x=-2
    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    pxor             m1, m1
    pcmpgtb          m1, m0
    punpckhbw        m2, m0, m1             ; sign-extend cf8-15 to words
    punpcklbw        m0, m1                 ; sign-extend cf0-7 to words
    pshufd           m1, m0, q0000
    pshufd           m3, m0, q1111
    pshufd           m4, m0, q2222
    pshufd           m0, m0, q3333
    pshufd           m5, m2, q0000
    pshufd           m6, m2, q1111
    mova     [tmp+16*0], m1
    mova     [tmp+16*1], m3
    mova     [tmp+16*2], m4
    mova     [tmp+16*3], m0
    mova     [tmp+16*4], m5
    mova     [tmp+16*5], m6
    pshufd           m6, m2, q2222
    pshufd           m7, m2, q3333

    ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
    pxor             m1, m1
    pcmpgtb          m1, m0
    punpckhbw        m2, m0, m1             ; luma
    punpcklbw        m0, m1
    pshufd           m3, m0, q3232
    psrldq           m5, m0, 10
    ; y=0,x=[-3 to -1] + "1.0" for current pixel
    pinsrw           m5, [base+round_vals-10+shiftq*2], 3
    ; y=-1,x=[-1 to +2]
    pshufd           m1, m0, q0000
    pshufd           m0, m0, q1111
    ; y=-1,x=+3 + luma
    punpcklwd        m3, m2
    pshufd           m3, m3, q0000

%if ARCH_X86_64
    SWAP 1, 8
    SWAP 0, 9
    SWAP 3, 10
    SWAP 5, 13
    DEFINE_ARGS buf, bufy, fg_data, h, x
%else
    ; x86-32: spill m8-m10/m13 equivalents to the stack
%define m8 [rsp+ 6*16]
%define m9 [rsp+ 7*16]
%define m10 [rsp+ 8*16]
%define m13 [rsp+11*16]
    mova             m8, m1
    mova             m9, m0
    mova            m10, m3
    mova            m13, m5
    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
%endif
%if %2
    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub            bufq, 2*(82*69+3)
%endif
    add           bufyq, 2*(79+82*3)
    mov              hd, 70-35*%3
.y_loop_ar3:
    mov              xq, -(76>>%2)

.x_loop_ar3:
    ; first line
    movu             m0, [bufq+xq*2-82*6-6+ 0]  ; y=-3,x=[-3,+4]
    movd             m1, [bufq+xq*2-82*6-6+16]  ; y=-3,x=[+5,+6]
    palignr          m2, m1, m0, 2              ; y=-3,x=[-2,+5]
    palignr          m1, m1, m0, 12             ; y=-3,x=[+3,+6]
    punpckhwd        m3, m0, m2                 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd        m0, m2                     ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps           m2, m0, m3, q1032          ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]

    pmaddwd          m0, [tmp+0*16]
    pmaddwd          m2, [tmp+1*16]
    pmaddwd          m3, [tmp+2*16]
    paddd            m0, m2
    paddd            m0, m3                     ; first 6 x of top y

    ; second line [m0/1 are busy]
    movu             m2, [bufq+xq*2-82*4-6+ 0]  ; y=-2,x=[-3,+4]
    movd             m3, [bufq+xq*2-82*4-6+16]  ; y=-2,x=[+5,+6]
    punpcklwd        m1, m2                     ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
    palignr          m4, m3, m2, 2              ; y=-2,x=[-2,+5]
    palignr          m3, m3, m2, 4              ; y=-2,x=[-1,+6]
    punpckhwd        m5, m4, m3                 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
    punpcklwd        m4, m3                     ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    shufps           m3, m4, m5, q1032          ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    pmaddwd          m1, [tmp+3*16]
    pmaddwd          m4, [tmp+4*16]
    pmaddwd          m3, [tmp+5*16]
    pmaddwd          m5, m6
    paddd            m1, m4
    paddd            m3, m5
    paddd            m0, m1
    paddd            m0, m3                     ; top 2 lines

    ; third line [m0 is busy] & luma + round
    movu             m1, [bufq+xq*2-82*2-6+ 0]  ; y=-1,x=[-3,+4]
    movd             m2, [bufq+xq*2-82*2-6+16]  ; y=-1,x=[+5,+6]
%if %2
    movu             m5, [bufyq+xq*4]           ; co-located luma
%if %3
    movu             m4, [bufyq+xq*4+82*2]
    phaddw           m5, m4
%else
    phaddw           m5, m5
%endif
%else
    movq             m5, [bufyq+xq*2]
%endif
    palignr          m3, m2, m1, 2              ; y=-1,x=[-2,+5]
    palignr          m2, m2, m1, 12             ; y=-1,x=[+3,+6]
%if %3
    pshufd           m4, m5, q3232
    paddw            m5, m4                     ; sum the two luma rows
%endif
%if %2
%if cpuflag(sse4)
    pmulhrsw         m5, m11
%elif %3
    pmulhrsw         m5, [base+pw_8192]         ; /4 for 4:2:0
%else
    pmulhrsw         m5, [base+pw_16384]        ; /2 for 4:2:2
%endif
%endif
    punpckhwd        m4, m1, m3                 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd        m1, m3                     ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps           m3, m1, m4, q1032          ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    punpcklwd        m2, m5                     ; pair y=-1,x=+3 with luma
    pmaddwd          m1, m7
    pmaddwd          m3, m8
    pmaddwd          m4, m9
    pmaddwd          m2, m10
    paddd            m1, m3
    paddd            m4, m2
    paddd            m0, m12                    ; += round
    paddd            m1, m4
    paddd            m0, m1

    movu             m1, [bufq+xq*2-6]          ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    pmaddwd          m2, m1, m13
    pshufd           m3, m2, q1111
    paddd            m2, m3                     ; left+cur
    paddd            m2, m0                     ; add top
    psrldq           m0, 4
    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    packssdw         m2, m2
    pminsw           m2, m14                    ; clamp to max_grain
    pmaxsw           m2, m15                    ; clamp to min_grain
    pslldq           m2, 4
    psrldq           m1, 2
%if cpuflag(sse4)
    pblendw          m1, m2, 00000100b
%else
    pand             m1, m11
    pandn            m3, m11, m2
    por              m1, m3
%endif
    ; overwrite previous pixels, should be ok
    movq [bufq+xq*2-4], m1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82*2
    add           bufyq, 82*2<<%3
    dec              hd
    jg .y_loop_ar3
%if WIN64
    mov             rsp, r6
%elif ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
    RET
%endmacro
| |
; instantiate the chroma grain generators, one per layout:
; args are (name, ss_hor, ss_ver) — 4:2:0 subsamples both axes,
; 4:2:2 only horizontally, 4:4:4 not at all
generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0
| |
; SCRATCH %1, %2, %3: make register m%1 available under the alias m%2.
; On x86-32 (only 8 xmm registers) the value is spilled to stack slot %3
; and m%2 becomes a memory operand; on x86-64 the registers are swapped,
; so m%2 stays a real register.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
 %define m%2 [rsp+%3*mmsize]
%else
    SWAP %1, %2
%endif
%endmacro
| |
;---------------------------------------------------------------------------------------
; fgy_32x32xn_16bpc, ssse3: apply luma film grain to a row of 32x32 blocks.
; Per pixel: dst = clip(src + round2(scaling[src] * grain, scaling_shift)),
; where grain is read from grain_lut at a per-block pseudo-random offset.
; Neighbouring grain blocks are optionally blended over a 2-pixel horizontal
; and 2-line vertical overlap region (weights from pw_27_17_17_27).
; NOTE(review): C argument list inferred from the loads below (dst, src, stride,
; fg_data, w, scaling, grain_lut, h=r7m, sby=r8m, bdmax=r9m) — confirm against
; the dav1d prototype.
; On x86-32 with an under-aligned stack, all stack arguments are first copied
; into an aligned scratch area and the rNm macros are redefined to point there.
;---------------------------------------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov          r0, r0m
    mov          r1, r2m
    mov          r2, r4m
    mov          r3, r6m
    mov          r4, r7m
    mov          r5, r8m

%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]

    mov         r0m, r0
    mov         r2m, r1
    mov         r4m, r2
    mov         r6m, r3
    mov         r7m, r4
    mov         r8m, r5
%else
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
        dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
    mov        srcq, srcm
    mov    scalingq, r5m
    mov    fg_dataq, r3m
%if STACK_ALIGNMENT < mmsize
    mov          r6, r9m

%define r9m [rsp+8*mmsize+ 4*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]

    mov         r9m, r6
%endif
    LEA          r5, $$
%define base r5-$$
    mov         r5m, picptrq
%else
cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    lea          r8, [pb_mask]
%define base r8-pb_mask
%endif
    ; derive the scaling multiplier and the clip min/max from scaling_shift,
    ; clip_to_restricted_range and bdmax (bdmax>>11 distinguishes 10 vs 12 bpc)
    mov         r6d, [fg_dataq+FGData.scaling_shift]
    SPLATW       m3, [base+mul_bits+r6*2-14]
    mov         r6d, [fg_dataq+FGData.clip_to_restricted_range]
%if ARCH_X86_32
    DECLARE_REG_TMP 0, 3
%else
    DECLARE_REG_TMP 9, 10
%endif
    mov         t0d, r9m ; bdmax
    sar         t0d, 11 ; is_12bpc
    inc         t0d
    mov         t1d, r6d
    imul        t1d, t0d
    dec         t0d
    SPLATW       m5, [base+min+t1*2]
    lea         t0d, [t0d*3]
    lea         t0d, [r6d*2+t0d]
    SPLATW       m4, [base+max+t0*2]
    SPLATW       m2, r9m

    ; grain clamp range: max_grain = bdmax>>1, min_grain = ~max_grain
    pcmpeqw      m1, m1
    psraw        m7, m2, 1 ; max_grain
    pxor         m1, m7 ; min_grain
    SPLATD       m6, [base+pd_16]

    ; keep the constants live as m9-m15 (spilled to stack on x86-32)
    SCRATCH       1, 9, 0
    SCRATCH       2, 10, 1
    SCRATCH       3, 11, 2
    SCRATCH       4, 12, 3
    SCRATCH       5, 13, 4
    SCRATCH       6, 14, 5
    SCRATCH       7, 15, 6

    mova         m6, [base+pw_27_17_17_27] ; for horizontal filter

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
    DECLARE_REG_TMP 0
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
    DECLARE_REG_TMP 7
%endif

    mov        sbyd, r8m
    movzx       t0d, byte [fg_dataq+FGData.overlap_flag]
    test        t0d, t0d
    jz .no_vertical_overlap
    test       sbyd, sbyd
    jnz .vertical_overlap
.no_vertical_overlap:
    ; from here on r8m is reused as a flag word:
    ; bit 0 = overlap_flag, bit 1 = vertical overlap, bit 2 = odd-column toggle
    mov   dword r8m, t0d

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
    imul       seed, (173 << 24) | 37
%else
    imul       seed, sbyd, (173 << 24) | 37
%endif
    add        seed, (105 << 24) | 178
    rol        seed, 8
    movzx      seed, seew
    xor        seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov         r3m, seed
    mov          wq, r4m
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    ; iterate w from -width up to 0; dst is kept as an offset from src
    lea    src_bakq, [srcq+wq*2]
    mov        r9mp, src_bakq
    neg          wq
    sub       dstmp, srcq
%if ARCH_X86_32
    mov         r4m, wq
%endif

.loop_x:
%if ARCH_X86_32
    mov        seed, r3m
%endif
    ; advance the grain PRNG: shift right one bit and insert a feedback bit
    ; derived from the parity of (seed.b0 & seed.b1)
    mov         r6d, seed
    or         seed, 0xEFF4
    shr         r6d, 1
    test       seeb, seeh
    lea        seed, [r6+0x8000]
    cmovp      seed, r6d                ; updated seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov       offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak

    mov       offyd, seed
    mov       offxd, seed
%endif
    ; extract offx/offy grain_lut coordinates from the seed
    ror       offyd, 8
    shr       offxd, 12
    and       offyd, 0xf
    imul      offyd, 164
    lea       offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak
%endif

.loop_x_odd:
    movzx        hd, word r7m
    mov  grain_lutq, grain_lutmp
.loop_y:
    ; src
    pand         m0, m10, [srcq+ 0]
    pand         m1, m10, [srcq+16]     ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw   m2, m0, scalingq-1, r0, r5, 8, 1, m4
    vpgatherdw   m3, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw   m2, m0, scalingq-1, r11, r13, 8, 1, m4
    vpgatherdw   m3, m1, scalingq-1, r11, r13, 8, 1, m4
%endif
    REPX {psrlw x, 8}, m2, m3

    ; grain = grain_lut[offy+y][offx+x]
    movu         m4, [grain_lutq+offxyq*2]
    movu         m5, [grain_lutq+offxyq*2+16]

    ; noise = round2(scaling[src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m2, m3
    pmulhrsw     m4, m2
    pmulhrsw     m5, m3

    ; dst = clip_pixel(src, noise)
    paddw        m0, m4
    paddw        m1, m5
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    movifnidn  dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add        srcq, r2mp               ; src += stride
    add  grain_lutq, 82*2
    dec          hd
    jg .loop_y

%if ARCH_X86_32
    add        r4mp, 16
%else
    add          wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov        srcq, r9mp
    add        srcq, r4mp
    add        srcq, r4mp
%else
    mov    src_bakq, r9mp
    lea        srcq, [src_bakq+wq*2]
%endif
    ; toggle the odd-column bit; every other 16-pixel column reuses the
    ; same grain block shifted by 16 instead of reseeding
    btc   dword r8m, 2
    jc .next_blk
    add      offxyd, 16
    test  dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add        r12d, 16                 ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.next_blk:
    test  dword r8m, 1
    jz .loop_x

    ; r8m = sbym
    test  dword r8m, 2
    jnz .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
    add      offxyd, 16
    mov [rsp+8*mmsize+0*gprsize], offxyd
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
    mov        seed, r3m
%endif

    ; same PRNG step as in .loop_x
    mov         r6d, seed
    or         seed, 0xEFF4
    shr         r6d, 1
    test       seeb, seeh
    lea        seed, [r6+0x8000]
    cmovp      seed, r6d                ; updated seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx

    mov       offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy

    lea left_offxyd, [offyd+16]         ; previous column's offy*stride+offx

    mov       offyd, seed
    mov       offxd, seed
%endif
    ror       offyd, 8
    shr       offxd, 12
    and       offyd, 0xf
    imul      offyd, 164
    lea       offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy
%endif

    mov          hd, dword r7m
    mov  grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; blend the 2 leftmost grain pixels with the previous column's grain
    ; (pmaddwd against the 27/17 weights in m6), then clamp to grain range
    movu         m5, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov          r5, [rsp+8*mmsize+0*gprsize]
    movd         m4, [grain_lutq+r5*2]
%else
    movd         m4, [grain_lutq+left_offxyq*2]
%endif
    punpcklwd    m4, m5
    pmaddwd      m4, m6
    paddd        m4, m14
    psrad        m4, 5
    packssdw     m4, m4
    pminsw       m4, m15
    pmaxsw       m4, m9
    shufps       m4, m5, q3210

    ; src
    pand         m0, m10, [srcq+ 0]
    pand         m1, m10, [srcq+16]     ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw   m2, m0, scalingq-1, r0, r5, 8, 1, m5
    vpgatherdw   m3, m1, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw   m2, m0, scalingq-1, r13, r14, 8, 1, m5
    vpgatherdw   m3, m1, scalingq-1, r13, r14, 8, 1, m5
%endif
    REPX {psrlw x, 8}, m2, m3

    ; noise = round2(scaling[src] * grain, scaling_shift)
    movu         m5, [grain_lutq+offxyq*2+16]
    REPX {pmullw x, m11}, m2, m3
    pmulhrsw     m4, m2
    pmulhrsw     m5, m3

    ; dst = clip_pixel(src, noise)
    paddw        m0, m4
    paddw        m1, m5
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    movifnidn  dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add        srcq, r2mp
    add  grain_lutq, 82*2
    dec          hd
    jg .loop_y_h_overlap

%if ARCH_X86_32
    add        r4mp, 16
%else
    add          wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov        srcq, r9mp
    add        srcq, r4mp
    add        srcq, r4mp
%else
    mov    src_bakq, r9mp
    lea        srcq, [src_bakq+wq*2]
%endif
    or    dword r8m, 4
    add      offxyd, 16

    ; r8m = sbym
    test  dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add        r12d, 16                 ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.end:
    RET

.vertical_overlap:
    or          t0d, 2
    mov         r8m, t0d

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
%endif

    ; seed both the current and the top row's PRNG in one 32-bit register:
    ; (cur_seed << 16) | top_seed
    movzx      sbyd, sbyb
%if ARCH_X86_32
    imul         r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
%else
    imul       seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul        t0d, sbyd, 173 * 0x00010001
    imul       sbyd, 37 * 0x01000100
    add         t0d, (105 << 16) | 188
    add        sbyd, (178 << 24) | (141 << 8)
    and         t0d, 0x00ff00ff
    and        sbyd, 0xff00ff00
    xor        seed, t0d
%if ARCH_X86_32
    xor        sbyd, seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov         r3m, seed
    mov          wq, r4m
%else
    xor        seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    lea    src_bakq, [srcq+wq*2]
    mov        r9mp, src_bakq
    neg          wq
    sub       dstmp, srcq
%if ARCH_X86_32
    mov         r4m, wq
%endif

.loop_x_v_overlap:
%if ARCH_X86_32
    mov          r5, r5m
    SPLATD       m7, [base+pw_27_17_17_27]
    mov        seed, r3m
%else
    SPLATD       m7, [pw_27_17_17_27]
%endif

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; advance both packed PRNGs (top and current) in lockstep
    mov         r6d, seed
    or         seed, 0xeff4eff4
    test       seeb, seeh
    setp        t0b                     ; parity of top_seed
    shr        seed, 16
    shl         t0d, 16
    test       seeb, seeh
    setp        t0b                     ; parity of cur_seed
    or          r6d, 0x00010001
    xor         t0d, r6d
    mov        seed, t0d
    ror        seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov       offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy

    mov       offyd, seed
    mov       offxd, seed
%endif
    ror       offyd, 8
    ror       offxd, 12
    and       offyd, 0xf000f
    and       offxd, 0xf000f
    imul      offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea       offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy
%endif

    movzx top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr      offxyd, 16

.loop_x_odd_v_overlap:
%if ARCH_X86_32
    mov          r5, r5m
%endif
    SPLATD       m7, [PIC_ptr(pw_27_17_17_27)]
    mov          hd, dword r7m
    mov  grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; vertical blend: interleave top/current grain words and pmaddwd against
    ; the per-line weights in m7, round (+16, >>5), then clamp to grain range
    movu         m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov          r5, [rsp+8*mmsize+1*gprsize]
    movu         m2, [grain_lutq+r5*2]
%else
    movu         m2, [grain_lutq+top_offxyq*2]
%endif
    punpckhwd    m4, m2, m3
    punpcklwd    m2, m3
    REPX {pmaddwd x, m7}, m4, m2
    REPX {paddd x, m14}, m4, m2
    REPX {psrad x, 5}, m4, m2
    packssdw     m2, m4
    pminsw       m2, m15
    pmaxsw       m2, m9
    movu         m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu         m3, [grain_lutq+r5*2+16]
%else
    movu         m3, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd    m5, m3, m4
    punpcklwd    m3, m4
    REPX {pmaddwd x, m7}, m5, m3
    REPX {paddd x, m14}, m5, m3
    REPX {psrad x, 5}, m5, m3
    packssdw     m3, m5
    pminsw       m3, m15
    pmaxsw       m3, m9

    ; src
    pand         m0, m10, [srcq+ 0]     ; m0-1: src as word
    pand         m1, m10, [srcq+16]     ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw   m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw   m4, m0, scalingq-1, r11, r13, 8, 1, m5
%endif
    psrlw        m4, 8
    pmullw       m4, m11
    pmulhrsw     m4, m2
%if ARCH_X86_32
    vpgatherdw   m5, m1, scalingq-1, r0, r5, 8, 1, m2
%else
    vpgatherdw   m5, m1, scalingq-1, r11, r13, 8, 1, m2
%endif
    psrlw        m5, 8
    pmullw       m5, m11
    pmulhrsw     m5, m3

    ; dst = clip_pixel(src, noise)
    paddw        m0, m4
    paddw        m1, m5
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    movifnidn  dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add        srcq, r2mp
    add  grain_lutq, 82*2
    dec          hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov          r5, r5m
%endif
    SPLATD       m7, [PIC_ptr(pw_27_17_17_27)+4]
    ; bit 16 of h toggles between the two overlap weight lines
    xor          hd, 0x10000
    test         hd, 0x10000
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add        r4mp, 16
%else
    add          wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov        srcq, r9mp
    add        srcq, r4mp
    add        srcq, r4mp
%else
    mov    src_bakq, r9mp
    lea        srcq, [src_bakq+wq*2]
%endif
    btc   dword r8m, 2
    jc .next_blk_v
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add  top_offxyd, 16
%endif
    add      offxyd, 16
    jmp .loop_x_odd_v_overlap

.next_blk_v:
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

.loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov          r0, [rsp+8*mmsize+1*gprsize]
    add          r3, 16
    add          r0, 16
    mov [rsp+8*mmsize+0*gprsize], r3    ; left_offxy
    mov [rsp+8*mmsize+2*gprsize], r0    ; topleft_offxy

    mov        seed, r3m
    xor          r0, r0
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    ; same dual-PRNG step as .loop_x_v_overlap
    mov         r6d, seed
    or         seed, 0xeff4eff4
    test       seeb, seeh
    setp        t0b                     ; parity of top_seed
    shr        seed, 16
    shl         t0d, 16
    test       seeb, seeh
    setp        t0b                     ; parity of cur_seed
    or          r6d, 0x00010001
    xor         t0d, r6d
    mov        seed, t0d
    ror        seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov       offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    lea topleft_offxyq, [top_offxyq+16]
    lea left_offxyq, [offyq+16]
    mov       offyd, seed
    mov       offxd, seed
%endif
    ror       offyd, 8
    ror       offxd, 12
    and       offyd, 0xf000f
    and       offxd, 0xf000f
    imul      offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea       offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
%endif

    movzx top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr      offxyd, 16

%if ARCH_X86_32
    mov          r5, r5m
%endif
    SPLATD       m7, [PIC_ptr(pw_27_17_17_27)]

    movzx        hd, word r7m
    mov  grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu         m2, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov          r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
    mov          r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
    movu         m4, [grain_lutq+r0*2]
    movd         m5, [grain_lutq+r5*2]
    mov          r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    movd         m3, [grain_lutq+r5*2]
%else
    movu         m4, [grain_lutq+top_offxyq*2]
    movd         m5, [grain_lutq+left_offxyq*2]
    movd         m3, [grain_lutq+topleft_offxyq*2]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklwd    m5, m2
    punpcklwd    m3, m4
    REPX {pmaddwd x, m6}, m5, m3
    REPX {paddd x, m14}, m5, m3
    REPX {psrad x, 5}, m5, m3
    packssdw     m5, m3
    pminsw       m5, m15
    pmaxsw       m5, m9
    shufps       m3, m5, m2, q3210
    shufps       m5, m4, q3232
    ; followed by v interpolation (top | cur -> cur)
    movu         m0, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu         m1, [grain_lutq+r0*2+16]
%else
    movu         m1, [grain_lutq+top_offxyq*2+16]
%endif
    punpcklwd    m2, m5, m3
    punpckhwd    m5, m3
    punpcklwd    m3, m1, m0
    punpckhwd    m1, m0
    REPX {pmaddwd x, m7}, m2, m5, m3, m1
    REPX {paddd x, m14}, m2, m5, m3, m1
    REPX {psrad x, 5}, m2, m5, m3, m1
    packssdw     m2, m5
    packssdw     m3, m1
    REPX {pminsw x, m15}, m2, m3
    REPX {pmaxsw x, m9}, m2, m3

    ; src
    pand         m0, m10, [srcq+ 0]
    pand         m1, m10, [srcq+16]     ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw   m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw   m4, m0, scalingq-1, r14, r10, 8, 1, m5
%endif
    psrlw        m4, 8
    pmullw       m4, m11
    pmulhrsw     m2, m4
%if ARCH_X86_32
    vpgatherdw   m5, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw   m5, m1, scalingq-1, r14, r10, 8, 1, m4
%endif
    psrlw        m5, 8
    pmullw       m5, m11
    pmulhrsw     m3, m5

    ; dst = clip_pixel(src, noise)
    paddw        m0, m2
    paddw        m1, m3
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    movifnidn  dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add        srcq, r2mp
    add  grain_lutq, 82*2
    dec          hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov          r5, r5m
%endif
    SPLATD       m7, [PIC_ptr(pw_27_17_17_27)+4]
    xor          hd, 0x10000
    test         hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    or    dword r8m, 4
%if ARCH_X86_32
    add        r4mp, 16
%else
    add          wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov          r5, r5m
    add      offxyd, 16
    add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
    mov        srcq, r9mp
    add        srcq, r4mp
    add        srcq, r4mp
%else
    add      offxyd, 16
    add  top_offxyd, 16
    mov    src_bakq, r9mp
    lea        srcq, [src_bakq+wq*2]
%endif
    jmp .loop_x_odd_v_overlap

.end_hv:
    RET
%if ARCH_X86_32
    DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
%endif
| |
| %macro FGUV_FN 3 ; name, ss_hor, ss_ver |
| INIT_XMM ssse3 |
| %if ARCH_X86_32 |
| %if STACK_ALIGNMENT < mmsize |
| cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ |
| tmp, src, scaling, h, fg_data, picptr, unused |
| mov r0, r0m |
| mov r1, r1m |
| mov r2, r2m |
| mov r4, r3m |
| mov r3, r4m |
| mov r5, r5m |
| %define r0m [rsp+8*mmsize+ 3*gprsize] |
| %define r1m [rsp+8*mmsize+ 4*gprsize] |
| %define r2m [rsp+8*mmsize+ 5*gprsize] |
| %define r3m [rsp+8*mmsize+ 6*gprsize] |
| %define r4m [rsp+8*mmsize+ 7*gprsize] |
| %define r5m [rsp+8*mmsize+ 8*gprsize] |
| mov r0m, r0 |
| mov r2m, r2 |
| mov r4m, r3 |
| mov r5m, r5 |
| |
| mov r0, r6m |
| mov r2, r7m |
| mov r3, r8m |
| mov r5, r9m |
| %define r6m [rsp+8*mmsize+ 9*gprsize] |
| %define r7m [rsp+8*mmsize+10*gprsize] |
| %define r8m [rsp+8*mmsize+11*gprsize] |
| %define r9m [rsp+8*mmsize+12*gprsize] |
| mov r6m, r0 |
| mov r7m, r2 |
| mov r8m, r3 |
| mov r9m, r5 |
| |
| mov r2, r10m |
| mov r3, r11m |
| mov r5, r12m |
| mov r0, r13m |
| %define r10m [rsp+8*mmsize+13*gprsize] |
| %define r11m [rsp+8*mmsize+14*gprsize] |
| %define r12m [rsp+8*mmsize+15*gprsize] |
| mov r10m, r2 |
| mov r11m, r3 |
| mov r12m, r5 |
| |
| SPLATW m2, r13m |
| %else |
| cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ |
| tmp, src, scaling, h, fg_data, picptr, unused |
| mov srcq, srcm |
| mov fg_dataq, r3m |
| %endif |
| LEA r5, $$ |
| %define base r5-$$ |
| |
| DECLARE_REG_TMP 0, 2, 3 |
| %else |
| cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ |
| grain_lut, h, sby, luma, lstride, uv_pl, is_id |
| %define base r8-pb_mask |
| lea r8, [pb_mask] |
| |
| DECLARE_REG_TMP 9, 10, 11 |
| %endif |
| mov r6d, [fg_dataq+FGData.scaling_shift] |
| SPLATW m3, [base+mul_bits+r6*2-14] |
| mov r6d, [fg_dataq+FGData.clip_to_restricted_range] |
| %if STACK_ALIGNMENT >= mmsize |
| mov t0d, r13m ; bdmax |
| %endif |
| sar t0d, 11 ; is_12bpc |
| inc t0d |
| mov t1d, r6d |
| imul t1d, t0d |
| dec t0d |
| SPLATW m5, [base+min+t1*2] |
| lea t1d, [t0d*3] |
| mov t2d, r12m |
| inc t2d |
| imul r6d, t2d |
| add t1d, r6d |
| SPLATW m4, [base+max+t1*2] |
| %if STACK_ALIGNMENT >= mmsize |
| SPLATW m2, r13m |
| %endif |
| |
| SCRATCH 2, 10, 2 |
| SCRATCH 3, 11, 3 |
| SCRATCH 4, 12, 4 |
| SCRATCH 5, 13, 5 |
| |
| %define mzero m7 |
| |
| %if %3 |
| SPLATD m2, [base+pw_23_22] |
| %endif |
| |
| %if ARCH_X86_32 |
| mov scalingq, r5m |
| mov r5m, r5 |
| %else |
| mov r13mp, strideq |
| %endif |
| |
| pcmpeqw m0, m0 |
| psraw m1, m10, 1 |
| pxor m0, m1 |
| |
| SCRATCH 0, 8, 0 |
| SCRATCH 1, 9, 1 |
| |
| cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 |
| jne .csfl |
| |
| %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v |
| %if ARCH_X86_32 |
| DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap |
| |
| DECLARE_REG_TMP 0 |
| %else |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap |
| |
| DECLARE_REG_TMP 9 |
| %endif |
| |
| %if %1 |
| mov r6d, r11m |
| SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] |
| SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] |
| punpcklwd m6, m1, m0 |
| SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] |
| SPLATD m7, [base+pw_4+t0*4] |
| pmullw m5, m7 |
| %else |
| SPLATD m6, [base+pd_16] |
| %if %2 |
| mova m5, [base+pw_23_22] |
| %else |
| mova m5, [base+pw_27_17_17_27] |
| %endif |
| %endif |
| |
| SCRATCH 6, 14, 6 |
| SCRATCH 5, 15, 7 |
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 0 |
| %else |
| DECLARE_REG_TMP 7 |
| %endif |
| |
| mov sbyd, r8m |
| mov t0d, [fg_dataq+FGData.overlap_flag] |
| test t0d, t0d |
| jz %%no_vertical_overlap |
| test sbyd, sbyd |
| jnz %%vertical_overlap |
| |
| %%no_vertical_overlap: |
| mov r8m, t0d |
| %if ARCH_X86_32 |
| DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap |
| imul seed, (173 << 24) | 37 |
| %else |
| imul seed, sbyd, (173 << 24) | 37 |
| %endif |
| add seed, (105 << 24) | 178 |
| rol seed, 8 |
| movzx seed, seew |
| xor seed, [fg_dataq+FGData.seed] |
| %if ARCH_X86_32 |
| mov r3m, seed |
| |
| DEFINE_ARGS dst, src, scaling, see, w, picptr, luma |
| |
| mov dstq, r0mp |
| mov lumaq, r9mp |
| mov wq, r4m |
| lea r3, [srcq+wq*2] |
| mov r1mp, r3 |
| lea r3, [dstq+wq*2] |
| mov r11mp, r3 |
| lea r3, [lumaq+wq*(2<<%2)] |
| mov r12mp, r3 |
| %if %3 |
| shl r10mp, 1 |
| %endif |
| %else |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| unused2, unused3, see, unused4, unused5, unused6, luma, lstride |
| |
| mov lstrideq, r10mp |
| %if %3 |
| add lstrideq, lstrideq |
| %endif |
| mov lumaq, r9mp |
| lea r10, [srcq+wq*2] |
| lea r11, [dstq+wq*2] |
| lea r12, [lumaq+wq*(2<<%2)] |
| mov r10mp, r10 |
| mov r11mp, r11 |
| mov r12mp, r12 |
| %endif |
| neg wq |
| %if ARCH_X86_32 |
| mov r4mp, wq |
| %endif |
| |
| %%loop_x: |
| %if ARCH_X86_32 |
| mov seed, r3m |
| %endif |
| |
| mov r6d, seed |
| or seed, 0xEFF4 |
| shr r6d, 1 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| %if ARCH_X86_32 |
| mov r3m, seed |
| |
| DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx |
| |
| mov offxd, offyd |
| %else |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, unused1, unused2, unused3, luma, lstride |
| |
| mov offxd, seed |
| mov offyd, seed |
| %endif |
| ror offyd, 8 |
| shr offxd, 12 |
| and offyd, 0xf |
| imul offyd, 164>>%3 |
| lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx |
| |
| %if ARCH_X86_32 |
| DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut |
| %else |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, unused1, unused2, unused3, luma, lstride |
| %endif |
| |
| %if %2 == 0 |
| %%loop_x_odd: |
| %endif |
| mov hd, r7m |
| mov grain_lutq, grain_lutmp |
| %%loop_y: |
| ; src |
| mova m0, [srcq] |
| mova m1, [srcq+16] ; m0-1: src as word |
| |
| ; luma_src |
| pxor mzero, mzero |
| %if ARCH_X86_32 |
| DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut |
| |
| mov lumaq, r9m |
| %endif |
| mova m4, [lumaq+ 0] |
| mova m6, [lumaq+(16<<%2)] |
| %if %2 |
| phaddw m4, [lumaq+16] |
| phaddw m6, [lumaq+48] |
| %endif |
| %if ARCH_X86_32 |
| add lumaq, r10mp |
| mov r9m, lumaq |
| %endif |
| %if %2 |
| pavgw m4, mzero |
| pavgw m6, mzero |
| %endif |
| |
| %if %1 |
| punpckhwd m3, m4, m0 |
| punpcklwd m4, m0 |
| punpckhwd m5, m6, m1 |
| punpcklwd m6, m1 ; { luma, chroma } |
| REPX {pmaddwd x, m14}, m3, m4, m5, m6 |
| REPX {psrad x, 6}, m3, m4, m5, m6 |
| packssdw m4, m3 |
| packssdw m6, m5 |
| REPX {paddw x, m15}, m4, m6 |
| REPX {pmaxsw x, mzero}, m4, m6 |
| REPX {pminsw x, m10}, m4, m6 ; clip_pixel() |
| %else |
| REPX {pand x, m10}, m4, m6 |
| %endif |
| |
| ; scaling[luma_src] |
| %if ARCH_X86_32 |
| vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 |
| vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 |
| %else |
| vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 |
| vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 |
| %endif |
| REPX {psrlw x, 8}, m3, m5 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m4, [grain_lutq+offxyq*2] |
| movu m6, [grain_lutq+offxyq*2+16] |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| REPX {pmullw x, m11}, m3, m5 |
| pmulhrsw m4, m3 |
| pmulhrsw m6, m5 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m4 |
| paddw m1, m6 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| movifnidn dstq, dstmp |
| mova [dstq+ 0], m0 |
| mova [dstq+16], m1 |
| |
| %if ARCH_X86_32 |
| add srcq, r2mp |
| add dstq, r2mp |
| mov dstmp, dstq |
| %else |
| add srcq, r13mp |
| add dstq, r13mp |
| add lumaq, lstrideq |
| %endif |
| add grain_lutq, 82*2 |
| dec hd |
| jg %%loop_y |
| |
| %if ARCH_X86_32 |
| DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma |
| |
| mov wq, r4mp |
| %endif |
| add wq, 16 |
| jge %%end |
| %if ARCH_X86_32 |
| mov srcq, r1mp |
| %else |
| mov srcq, r10mp |
| %endif |
| mov dstq, r11mp |
| mov lumaq, r12mp |
| lea srcq, [srcq+wq*2] |
| lea dstq, [dstq+wq*2] |
| lea lumaq, [lumaq+wq*(2<<%2)] |
| %if ARCH_X86_32 |
| mov r0m, dstq |
| mov r9m, lumaq |
| mov r4m, wq |
| %endif |
| %if %2 == 0 |
| btc dword r8m, 2 |
| jc %%next_blk |
| add offxyd, 16 |
| test dword r8m, 2 |
| jz %%loop_x_odd |
| %if ARCH_X86_32 |
| add dword [rsp+8*mmsize+1*gprsize], 16 |
| %else |
| add r11d, 16 |
| %endif |
| jmp %%loop_x_odd_v_overlap |
| %%next_blk: |
| %endif |
| test dword r8m, 1 |
| je %%loop_x |
| |
| ; r8m = sbym |
| test dword r8m, 2 |
| jnz %%loop_x_hv_overlap |
| |
| ; horizontal overlap (without vertical overlap) |
| %%loop_x_h_overlap: |
| %if ARCH_X86_32 |
| add offxyd, 16 |
| mov [rsp+8*mmsize+0*gprsize], offxyd |
| |
| DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut |
| |
| mov seed, r3m |
| %endif |
| mov r6d, seed |
| or seed, 0xEFF4 |
| shr r6d, 1 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| %if ARCH_X86_32 |
| mov r3m, seed |
| |
| DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx |
| |
| mov offxd, offyd |
| %else |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| offx, offy, see, left_offxy, unused1, unused2, luma, lstride |
| |
| lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx |
| mov offxd, seed |
| mov offyd, seed |
| %endif |
| ror offyd, 8 |
| shr offxd, 12 |
| and offyd, 0xf |
| imul offyd, 164>>%3 |
| lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx |
| |
| %if ARCH_X86_32 |
| DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut |
| %else |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ |
| h, offxy, see, left_offxy, unused1, unused2, luma, lstride |
| %endif |
| |
| mov hd, r7m |
| mov grain_lutq, grain_lutmp |
; Per-row loop for a column with horizontal (left-edge) grain overlap.
%%loop_y_h_overlap:
    mova m0, [srcq]                 ; 16 chroma source pixels (words) in m0/m1
    mova m1, [srcq+16]

    ; luma_src: load the co-sited luma; when %2 (horizontal subsampling),
    ; phaddw folds horizontal pairs and pavgw with zero halves with rounding
    pxor mzero, mzero
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
    mov  lumaq, r9m
%endif
    mova m4, [lumaq+ 0]
    mova m6, [lumaq+(16<<%2)]
%if %2
    phaddw m4, [lumaq+16]
    phaddw m6, [lumaq+48]
%endif
%if ARCH_X86_32
    add  lumaq, r10mp               ; advance luma pointer by its stride (r10m)
    mov  r9m, lumaq                 ; and spill it back (register shortage)
%endif
%if %2
    pavgw m4, mzero                 ; (a+b+1)>>1 completes the 2:1 downsample
    pavgw m6, mzero
%endif

%if %1
    ; non-csfl entry (%1 != 0): scaling index = linear combination of luma
    ; and chroma: ((pmaddwd with the multipliers in m14) >> 6) + offset in
    ; m15, then clipped (multipliers/offset are set up above this excerpt)
    punpckhwd m3, m4, m0
    punpcklwd m4, m0
    punpckhwd m5, m6, m1
    punpcklwd m6, m1                ; { luma, chroma }
    REPX {pmaddwd x, m14}, m3, m4, m5, m6
    REPX {psrad x, 6}, m3, m4, m5, m6
    packssdw m4, m3
    packssdw m6, m5
    REPX {paddw x, m15}, m4, m6
    REPX {pmaxsw x, mzero}, m4, m6
    REPX {pminsw x, m10}, m4, m6    ; clip_pixel()
%else
    REPX {pand x, m10}, m4, m6      ; csfl entry: luma used directly as the
                                    ; scaling index, masked to valid range (m10)
%endif

    ; grain = grain_lut[offy+y][offx+x]; the leftmost pixel(s) are blended
    ; with the previous column's grain (left_offxy)
    movu m7, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov  r5, [rsp+8*mmsize+0*gprsize] ; left_offxy was spilled on x86-32
    movd m5, [grain_lutq+r5*2]
%else
    movd m5, [grain_lutq+left_offxyq*2+ 0]
%endif
    punpcklwd m5, m7                ; {left0, cur0}
%if %1
%if ARCH_X86_32
    mov  r5, r5m                    ; reload PIC base for PIC_ptr()
%endif
%if %2
    ; weights {23,22} blend pixel 0; the trailing {0,32} lanes pass
    ; pixels 1-3 through unchanged after the >>5 below
    pmaddwd m5, [PIC_ptr(pw_23_22)]
%else
    ; non-subsampled: two blended pixels with weights {27,17} and {17,27}
    pmaddwd m5, [PIC_ptr(pw_27_17_17_27)]
%endif
    paddd m5, [PIC_ptr(pd_16)]
%else
    pmaddwd m5, m15                 ; same weights/rounder, cached in m15/m14
    paddd m5, m14                   ; on this path (m14/m15 are free here)
%endif
    psrad m5, 5                     ; round2(x, 5)
    packssdw m5, m5
    pmaxsw m5, m8                   ; clamp blended grain to [m8, m9]
    pminsw m5, m9
    shufps m5, m7, q3210            ; low half = blended words, high half = cur
    movu m3, [grain_lutq+offxyq*2+16] ; right 8 grain words (no blending)
| |
    ; scaling[luma_src]: gather byte-sized scaling-table entries indexed by
    ; the luma-derived values; vpgatherdw needs two scratch GPRs
%if ARCH_X86_32
    vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1
    vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1
%else
    vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1
    vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1
%endif
    REPX {psrlw x, 8}, m7, m4       ; keep only the gathered byte per word

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    ; (m11 holds the scaling_shift-dependent multiplier, set up above this
    ; excerpt; pmulhrsw provides the rounded shift)
    REPX {pmullw x, m11}, m7, m4
    pmulhrsw m5, m7
    pmulhrsw m3, m4

    ; dst = clip_pixel(src, noise); m13/m12 = clip min/max
    paddw m0, m5
    paddw m1, m3
    pmaxsw m0, m13
    pmaxsw m1, m13
    pminsw m0, m12
    pminsw m1, m12
    movifnidn dstq, dstmp
    mova [dstq+ 0], m0
    mova [dstq+16], m1

%if ARCH_X86_32
    add  srcq, r2mp                 ; advance src/dst by the picture stride
    add  dstq, r2mp
    mov  dstmp, dstq
%else
    add  srcq, r13mp
    add  dstq, r13mp
    add  lumaq, lstrideq
%endif
    add  grain_lutq, 82*2           ; next grain row (stride 82 words)
    dec  hd
    jg %%loop_y_h_overlap
| |
; ---- advance to the next 16-pixel column (h-overlap path) ----
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
    mov  wq, r4mp                   ; reload spilled column counter
%endif
    add  wq, 16                     ; w counts upward toward 0
    jge %%end
%if ARCH_X86_32
    mov  srcq, r1mp                 ; reload row-end base pointers saved during
%else                               ; setup; wq is negative so base+wq*2 indexes
    mov  srcq, r10mp                ; back into the current row
%endif
    mov  dstq, r11mp
    mov  lumaq, r12mp
    lea  srcq, [srcq+wq*2]
    lea  dstq, [dstq+wq*2]
    lea  lumaq, [lumaq+wq*(2<<%2)]  ; luma advances 2x when subsampled (%2)
%if ARCH_X86_32
    mov  r0mp, dstq
    mov  r9mp, lumaq
    mov  r4m, wq
%endif

%if %2
    ; r8m = sbym
    test dword r8m, 2
    jne %%loop_x_hv_overlap         ; not the top sb row: also blend vertically
    jmp %%loop_x_h_overlap
%else
    or   dword r8m, 4               ; flag that we are on an odd 16-pixel column
    add  offxyd, 16

    ; r8m = sbym
    test dword r8m, 2
    jz %%loop_x_odd
%if ARCH_X86_32
    add  dword [rsp+8*mmsize+1*gprsize], 16 ; spilled top_offxy += 16
%else
    add  r11d, 16                   ; top_offxy += 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif

%%end:
    RET
| |
; ---- entry for superblock rows that overlap the row above ----
%%vertical_overlap:
    or   t0d, 2                     ; record vertical-overlap in the flags (r8m)
    mov  r8m, t0d

%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
            sby, see, unused1, unused2, unused3, lstride
%endif

    ; derive two 16-bit seeds from FGData.seed and sby (superblock row),
    ; kept packed as (cur_seed << 16) | top_seed in one 32-bit register
    movzx sbyd, sbyb
%if ARCH_X86_32
    imul r4, [fg_dataq+FGData.seed], 0x00010001 ; broadcast seed to both halves

DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    ; mix in row-dependent constants (applied to both 16-bit halves at once)
    imul t0d, sbyd, 173 * 0x00010001
    imul sbyd, 37 * 0x01000100
    add  t0d, (105 << 16) | 188
    add  sbyd, (178 << 24) | (141 << 8)
    and  t0d, 0x00ff00ff
    and  sbyd, 0xff00ff00
    xor  seed, t0d
%if ARCH_X86_32
    xor  sbyd, seed                 ; (cur_seed << 16) | top_seed

DEFINE_ARGS dst, src, scaling, see, w, picptr, luma

    mov  r3m, seed                  ; spill the packed seeds
    mov  dstq, r0mp
    mov  lumaq, r9mp
    mov  wq, r4m
    lea  r3, [srcq+wq*2]            ; save row-end pointers; they are reloaded
    mov  r1mp, r3                   ; and re-indexed with negative wq later
    lea  r3, [dstq+wq*2]
    mov  r11mp, r3
    lea  r3, [lumaq+wq*(2<<%2)]
    mov  r12mp, r3
%if %3
    shl  r10mp, 1                   ; double luma stride: vertically subsampled
%endif                              ; chroma consumes two luma rows per row
%else
    xor  seed, sbyd                 ; (cur_seed << 16) | top_seed

DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
            unused1, unused2, see, unused3, unused4, unused5, luma, lstride

    mov  lstrideq, r10mp
%if %3
    add  lstrideq, lstrideq         ; same stride doubling as the x86-32 path
%endif
    mov  lumaq, r9mp
    lea  r10, [srcq+wq*2]
    lea  r11, [dstq+wq*2]
    lea  r12, [lumaq+wq*(2<<%2)]
    mov  r10mp, r10
    mov  r11mp, r11
    mov  r12mp, r12
%endif
    neg  wq                         ; loop counts wq upward toward 0
%if ARCH_X86_32
    mov  r4m, wq
%endif
| |
; ---- per-column entry with vertical (top-edge) overlap ----
%%loop_x_v_overlap:
%if ARCH_X86_32
    mov  seed, r3m
    xor  t0d, t0d
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    ; step both packed 16-bit seeds (top and cur) at once: the parity of the
    ; masked seed bits (setp) supplies each new shifted-in bit
    mov  r6d, seed
    or   seed, 0xeff4eff4
    test seeb, seeh
    setp t0b                        ; parity of top_seed
    shr  seed, 16
    shl  t0d, 16
    test seeb, seeh
    setp t0b                        ; parity of cur_seed
    or   r6d, 0x00010001
    xor  t0d, r6d
    mov  seed, t0d
    ror  seed, 1                    ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
    mov  r3m, seed

DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov  offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
            offx, offy, see, unused1, top_offxy, unused2, luma, lstride

    mov  offyd, seed
    mov  offxd, seed
%endif
    ; extract two 4-bit offy / offx fields (one per packed seed) and turn
    ; them into two packed grain_lut offsets
    ror  offyd, 8
    ror  offxd, 12
    and  offyd, 0xf000f
    and  offxd, 0xf000f
    imul offyd, 164>>%3             ; offy *= grain row stride (see above)
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea  offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
            h, offxy, see, unused1, top_offxy, unused2, luma, lstride
%endif
    movzx top_offxyd, offxyw        ; split the packed pair: low 16 = top
%if ARCH_X86_32
    mov  [rsp+8*mmsize+1*gprsize], top_offxyd ; spill top_offxy on x86-32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr  offxyd, 16                 ; high 16 = current block's offxy

%if %2 == 0
%%loop_x_odd_v_overlap:
%endif
%if %3 == 0
%if ARCH_X86_32
    mov  r5, r5m                    ; reload PIC base for PIC_ptr()
%endif
    ; m2 = vertical blend weights {27,17} for the first overlapped row
    ; (switched to {17,27} for the second row further below)
    SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
%endif
| |
    mov  hd, r7m                    ; h = rows to process for this block
    mov  grain_lutq, grain_lutmp
; Per-row loop blending the current grain rows with the bottom rows of the
; grain block above (weights in m2).
%%loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]  (left 8 words)
    movu m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov  r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
    movu m5, [grain_lutq+r0*2]
%else
    movu m5, [grain_lutq+top_offxyq*2]
%endif
    punpckhwd m7, m5, m3
    punpcklwd m5, m3                ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5    ; weight_top*top + weight_cur*cur
%if %1
%if ARCH_X86_32
    mov  r5, r5m                    ; reload PIC base for PIC_ptr()
%endif
    REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
    REPX {paddd x, m14}, m7, m5     ; rounder cached in m14 on this path
%endif
    REPX {psrad x, 5}, m7, m5       ; round2(x, 5)
    packssdw m3, m5, m7
    pmaxsw m3, m8                   ; clamp blended grain to [m8, m9]
    pminsw m3, m9

    ; grain = grain_lut[offy+y][offx+x]  (right 8 words, same blend)
    movu m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu m5, [grain_lutq+r0*2+16]
%else
    movu m5, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd m7, m5, m4
    punpcklwd m5, m4                ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
    REPX {paddd x, m14}, m7, m5
%endif
    REPX {psrad x, 5}, m7, m5
    packssdw m4, m5, m7
    pmaxsw m4, m8
    pminsw m4, m9
| |
    ; src
    mova m0, [srcq]                 ; 16 chroma source pixels (words)
    mova m1, [srcq+16]

    ; luma_src: load + optional 2:1 horizontal downsample (phaddw + pavgw)
    pxor mzero, mzero
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut

    mov  lumaq, r9mp
%endif
    mova m5, [lumaq+ 0]
    mova m6, [lumaq+(16<<%2)]
%if %2
    phaddw m5, [lumaq+16]
    phaddw m6, [lumaq+48]
%endif
%if ARCH_X86_32
    add  lumaq, r10mp               ; advance luma and spill (register shortage)
    mov  r9mp, lumaq
%endif
%if %2
    pavgw m5, mzero                 ; rounding halve completes the downsample
    pavgw m6, mzero
%endif

%if %1
    ; non-csfl: scaling index = ((luma,chroma) . m14) >> 6 + m15, clipped
    punpckhwd m7, m5, m0
    punpcklwd m5, m0
    REPX {pmaddwd x, m14}, m7, m5
    REPX {psrad x, 6}, m7, m5
    packssdw m5, m7
    punpckhwd m7, m6, m1
    punpcklwd m6, m1                ; { luma, chroma }
    REPX {pmaddwd x, m14}, m7, m6
    REPX {psrad x, 6}, m7, m6
    packssdw m6, m7
    pxor mzero, mzero               ; re-zero (m7 reuse clobbered nothing, but
                                    ; mzero itself was reused above)
    REPX {paddw x, m15}, m5, m6
    REPX {pmaxsw x, mzero}, m5, m6
    REPX {pminsw x, m10}, m5, m6    ; clip_pixel()
%else
    REPX {pand x, m10}, m5, m6      ; csfl: mask luma into valid index range
%endif

    ; scaling[luma_src]: gather table bytes (two scratch GPRs needed)
%if ARCH_X86_32
    vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1
    vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
%else
    vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1
    vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
%endif
    REPX {psrlw x, 8}, m7, m5       ; keep only the gathered byte per word

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m7, m5    ; m11 set up above this excerpt
    pmulhrsw m3, m7
    pmulhrsw m4, m5

    ; dst = clip_pixel(src, noise); m13/m12 = clip min/max
    paddw m0, m3
    paddw m1, m4
    pmaxsw m0, m13
    pmaxsw m1, m13
    pminsw m0, m12
    pminsw m1, m12
    movifnidn dstq, dstmp
    mova [dstq+ 0], m0
    mova [dstq+16], m1

    dec  hw
    jle %%end_y_v_overlap
%if ARCH_X86_32
    add  srcq, r2mp                 ; advance pointers by the picture stride
    add  dstq, r2mp
    mov  dstmp, dstq
%else
    add  srcq, r13mp
    add  dstq, r13mp
    add  lumaq, lstrideq
%endif
    add  grain_lutq, 82*2           ; next grain row
%if %3
    jmp %%loop_y                    ; vertically subsampled: only one chroma
%else                               ; row overlaps, rest uses the plain y loop
    btc  hd, 16                     ; bit 16 of h flags "first overlap row done"
    jc %%loop_y                     ; second pass: done with vertical blending
%if ARCH_X86_32
    mov  r5, r5m
%endif
    SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] ; weights {17,27} for row 1
    jmp %%loop_y_v_overlap
%endif
| |
; ---- advance to the next 16-pixel column (v-overlap path) ----
%%end_y_v_overlap:
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut

    mov  wq, r4m                    ; reload spilled column counter
%endif
    add  wq, 16                     ; w counts upward toward 0
    jge %%end_hv
%if ARCH_X86_32
    mov  srcq, r1mp                 ; reload saved row-end pointers and
%else                               ; re-index with the (negative) wq
    mov  srcq, r10mp
%endif
    mov  dstq, r11mp
    mov  lumaq, r12mp
    lea  srcq, [srcq+wq*2]
    lea  dstq, [dstq+wq*2]
    lea  lumaq, [lumaq+wq*(2<<%2)]  ; luma advances 2x when subsampled (%2)
%if ARCH_X86_32
    mov  r0mp, dstq
    mov  r9mp, lumaq
    mov  r4m, wq
%endif

%if %2
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%else
    btc  dword r8m, 2               ; alternate between odd/even columns
    jc %%loop_x_hv_overlap
    add  offxyd, 16
%if ARCH_X86_32
    add  dword [rsp+8*mmsize+1*gprsize], 16 ; spilled top_offxy += 16
%else
    add  r11d, 16                   ; top_offxy += 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif
| |
; ---- per-column entry with BOTH horizontal and vertical overlap ----
%%loop_x_hv_overlap:
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut

    ; left/topleft offsets = previous column's offsets + 16, spilled to stack
    mov  t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
    add  offxyd, 16
    add  t0d, 16
    mov  [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
    mov  [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd

DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut

    mov  seed, r3m
    xor  t0d, t0d
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    ; same packed two-seed update as %%loop_x_v_overlap
    mov  r6d, seed
    or   seed, 0xeff4eff4
    test seeb, seeh
    setp t0b                        ; parity of top_seed
    shr  seed, 16
    shl  t0d, 16
    test seeb, seeh
    setp t0b                        ; parity of cur_seed
    or   r6d, 0x00010001
    xor  t0d, r6d
    mov  seed, t0d
    ror  seed, 1                    ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
    mov  r3m, seed

DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov  offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
            offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride

    lea  topleft_offxyq, [top_offxyq+16] ; previous column's top offset
    lea  left_offxyq, [offyq+16]    ; previous column's cur offset
    mov  offyd, seed
    mov  offxd, seed
%endif
    ; extract packed offy/offx fields and form both grain_lut offsets
    ror  offyd, 8
    ror  offxd, 12
    and  offyd, 0xf000f
    and  offxd, 0xf000f
    imul offyd, 164>>%3             ; offy *= grain row stride (see above)
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea  offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
            h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
%endif
    movzx top_offxyd, offxyw        ; low 16 = top, high 16 = current
%if ARCH_X86_32
    mov  [rsp+8*mmsize+1*gprsize], top_offxyd

DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr  offxyd, 16

%if %3 == 0
%if ARCH_X86_32
    mov  r5, r5m                    ; reload PIC base for PIC_ptr()
%endif
    ; m2 = vertical blend weights {27,17} for the first overlapped row
    SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
%endif
| |
    mov  hd, r7m                    ; h = rows to process for this block
    mov  grain_lutq, grain_lutmp
; Per-row loop: first blend horizontally (cur with left, top with topleft),
; then blend the two results vertically with the weights in m2.
%%loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
    mov  r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
    mov  r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
    movd m5, [grain_lutq+r5*2]
%else
    movd m5, [grain_lutq+left_offxyq*2]
%endif
    movu m7, [grain_lutq+offxyq*2]  ; current grain row (left 8 words)
%if ARCH_X86_32
    mov  r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    movu m4, [grain_lutq+r0*2]
%if %2
    pinsrw m5, [grain_lutq+r5*2], 2
%else
    movd m3, [grain_lutq+r5*2]
%endif
%else
    movu m4, [grain_lutq+top_offxyq*2] ; top block's grain row
%if %2
    pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
%else
    movd m3, [grain_lutq+topleft_offxyq*2]
%endif
%endif
%if %2 == 0
    punpckldq m5, m3                ; gather left + topleft into one register
%endif
    punpckldq m3, m7, m4            ; { cur0/1,top0/1,cur2/3,top2/3 }
    punpcklwd m5, m3                ; { left/cur0,_/cur1,topleft/top0,_/top1 }
    ; m0 = horizontal blend weights (memory when %1, since m14/m15 are taken
    ; by the csfl multipliers; otherwise duplicated from m15)
%if %1
%if ARCH_X86_32
    mov  r5, r5m                    ; reload PIC base for PIC_ptr()
%endif
%if %2
    movddup m0, [PIC_ptr(pw_23_22)]
%else
    movddup m0, [PIC_ptr(pw_27_17_17_27)]
%endif
%else
    pshufd m0, m15, q1010
%endif
    pmaddwd m5, m0
%if %1
    paddd m5, [PIC_ptr(pd_16)]
%else
    paddd m5, m14
%endif
    psrad m5, 5                     ; round2(x, 5)
    packssdw m5, m5
    pmaxsw m5, m8                   ; clamp h-blended grain to [m8, m9]
    pminsw m5, m9
    ; untangle the interleaved results back into full cur/top rows
    shufps m5, m3, q3210            ; cur0/1,top0/1,cur2/3,top2/3
    shufps m3, m5, m7, q3220        ; cur0-7 post-h_filter
    shufps m5, m4, q3231            ; top0-7 post-h_filter

    ; vertical blend of the h-filtered rows (left 8 words)
    punpckhwd m7, m5, m3
    punpcklwd m5, m3                ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
%else
    REPX {paddd x, m14}, m5, m7
%endif
    REPX {psrad x, 5}, m5, m7
    packssdw m3, m5, m7
    pmaxsw m3, m8
    pminsw m3, m9

    ; right half: vertical blend only (no horizontal overlap there)
    movu m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu m0, [grain_lutq+r0*2+16]
%else
    movu m0, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd m1, m0, m4
    punpcklwd m0, m4                ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m1, m0
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
%else
    REPX {paddd x, m14}, m1, m0
%endif
    REPX {psrad x, 5}, m1, m0
    packssdw m4, m0, m1
    pmaxsw m4, m8
    pminsw m4, m9
| |
    ; src
    mova m0, [srcq]                 ; 16 chroma source pixels (words)
    mova m1, [srcq+16]

    ; luma_src: load + optional 2:1 horizontal downsample (phaddw + pavgw)
    pxor mzero, mzero
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut

    mov  lumaq, r9mp
%endif
    mova m6, [lumaq+ 0]
    mova m5, [lumaq+(16<<%2)]
%if %2
    phaddw m6, [lumaq+16]
    phaddw m5, [lumaq+48]
%endif
%if ARCH_X86_32
    add  lumaq, r10mp               ; advance luma and spill (register shortage)
    mov  r9mp, lumaq
%endif
%if %2
    pavgw m6, mzero                 ; rounding halve completes the downsample
    pavgw m5, mzero
%endif

%if %1
    ; non-csfl: scaling index = ((luma,chroma) . m14) >> 6 + m15, clipped
    punpckhwd m7, m6, m0
    punpcklwd m6, m0
    REPX {pmaddwd x, m14}, m7, m6
    REPX {psrad x, 6}, m7, m6
    packssdw m6, m7
    punpckhwd m7, m5, m1
    punpcklwd m5, m1                ; { luma, chroma }
    REPX {pmaddwd x, m14}, m7, m5
    REPX {psrad x, 6}, m7, m5
    packssdw m5, m7
    pxor mzero, mzero
    REPX {paddw x, m15}, m6, m5
    REPX {pmaxsw x, mzero}, m6, m5
    REPX {pminsw x, m10}, m6, m5    ; clip_pixel()
%else
    REPX {pand x, m10}, m6, m5      ; csfl: mask luma into valid index range
%endif

    ; scaling[luma_src]: gather table bytes (two scratch GPRs needed)
%if ARCH_X86_32
    vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1
    vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1
%else
%if %3 == 0
    ; register shortage :)  (r12 holds a saved row pointer on this path,
    ; so preserve it around the gathers)
    push r12
%endif
    vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1
    vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1
%if %3 == 0
    pop  r12
%endif
%endif
    REPX {psrlw x, 8}, m7, m6       ; keep only the gathered byte per word

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m7, m6    ; m11 set up above this excerpt
    pmulhrsw m3, m7
    pmulhrsw m4, m6

    ; dst = clip_pixel(src, noise); m13/m12 = clip min/max
    paddw m0, m3
    paddw m1, m4
    pmaxsw m0, m13
    pmaxsw m1, m13
    pminsw m0, m12
    pminsw m1, m12
    movifnidn dstq, dstmp
    mova [dstq+ 0], m0
    mova [dstq+16], m1
| |
%if ARCH_X86_32
    add  srcq, r2mp                 ; advance pointers by the picture stride
    add  dstq, r2mp
    mov  dstmp, dstq
%else
    add  srcq, r13mp
    add  dstq, r13mp
    add  lumaq, lstrideq
%endif
    add  grain_lutq, 82*2           ; next grain row
    dec  hw
%if %3
    ; vertically subsampled: only the first row needs the vertical blend,
    ; the remaining rows continue with horizontal overlap only
    jg %%loop_y_h_overlap
%else
    jle %%end_y_hv_overlap
    btc  hd, 16                     ; bit 16 of h flags "first overlap row done"
    jc %%loop_y_h_overlap           ; after two blended rows: h overlap only
%if ARCH_X86_32
    mov  r5, r5m                    ; reload PIC base for PIC_ptr()
%endif
    SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] ; weights {17,27} for row 1
    jmp %%loop_y_hv_overlap
%%end_y_hv_overlap:
%endif
; ---- advance to the next 16-pixel column (hv-overlap path) ----
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut

    mov  wq, r4m                    ; reload spilled column counter
%endif
    add  wq, 16                     ; w counts upward toward 0
    jge %%end_hv
%if ARCH_X86_32
    mov  srcq, r1mp                 ; reload saved row-end pointers and
%else                               ; re-index with the (negative) wq
    mov  srcq, r10mp
%endif
    mov  dstq, r11mp
    mov  lumaq, r12mp
    lea  srcq, [srcq+wq*2]
    lea  dstq, [dstq+wq*2]
    lea  lumaq, [lumaq+wq*(2<<%2)]  ; luma advances 2x when subsampled (%2)
%if ARCH_X86_32
    mov  dstmp, dstq
    mov  r9mp, lumaq
    mov  r4m, wq
%endif
%if %2
    jmp %%loop_x_hv_overlap
%else
    or   dword r8m, 4               ; flag odd column; it only needs v overlap
    add  offxyd, 16
%if ARCH_X86_32
    add  dword [rsp+8*mmsize+1*gprsize], 16 ; spilled top_offxy += 16
%else
    add  r11d, 16                   ; top_offxy += 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif

%%end_hv:
    RET
%endmacro                           ; end of %%FGUV_32x32xN_LOOP (opened above
                                    ; this excerpt)
| |
; Instantiate the chroma loop twice: %1=1 applies the luma+chroma linear
; model before the scaling-table lookup; the .csfl entry point uses %1=0,
; where luma indexes the scaling table directly.
%%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
%%FGUV_32x32xN_LOOP 0, %2, %3

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif
%endmacro                           ; end of FGUV_FN (opened above this excerpt)

; FGUV_FN layout, ss_hor (%2), ss_ver (%3): 420 subsamples both directions,
; 422 horizontally only, 444 not at all (inferred from how %2/%3 gate the
; phaddw/pavgw downsampling and the stride doubling above).
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0