| ; Copyright © 2019, VideoLAN and dav1d authors |
| ; Copyright © 2019, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "ext/x86/x86inc.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 32 |
; Read-only constants. Most are addressed relative to pb_mask through the
; per-function "base" define, and several are indexed ACROSS label
; boundaries, so the order and adjacency of these lines is part of the code.

; 1024 = 2^10: as a pmulhrsw multiplier this is a rounded right shift by 5
; (used together with the pb_27_17_17_27 overlap weights).
| pw_1024: times 16 dw 1024 |
; pshufb lookup table used by the seed-update code (index = 4 bits gathered
; from the seed, result = 0x80 or 0 i.e. the next seed's top bit).
| pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 |
; Per-lane masks ANDed with the broadcast seed; one word per each of the
; 4 seeds that are advanced per iteration.
| rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 |
; Byte-select mask with 0xff at byte 3. generate_grain_y.ar2 loads 8 bytes
; from byte_blend+1 (0xff at byte 2); that load also picks up the first byte
; of pw_seed_xor as its byte 7.
| byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 |
; Value xor'ed into the seed for chroma, read as [pw_seed_xor+uv*4]
; (uv=0 -> 0xb524, uv=1 -> 0x49d8).
| pw_seed_xor: times 2 dw 0xb524 |
| times 2 dw 0x49d8 |
; 0xffff0000 per dword: used both as an all-lanes-active vpgatherdd mask
; (sign bit set) and as the pandn mask that keeps the low 16 bits.
| pd_m65536: dd ~0xffff |
; Not referenced in the part of the file visible here.
| pb_23_22: times 2 db 23, 22 |
; All-ones bytes: pmaddubsw with this sums adjacent pairs of signed bytes.
| pb_1: times 4 db 1 |
; hmul_bits, round and mul_bits form ONE contiguous table of descending
; powers of two (2^15 .. 2^4). Code indexes it past the individual labels,
; e.g. [hmul_bits-10+shift*2] and [hmul_bits+shift*2], so these three lines
; must stay adjacent and in this order.
| hmul_bits: dw 32768, 16384, 8192, 4096 |
| round: dw 2048, 1024, 512 |
| mul_bits: dw 256, 128, 64, 32, 16 |
; Ascending powers of two, read as [round_vals+shift*2-12] (1 << (shift-1)
; for shift=6) or [round_vals+shift*2-10] (1 << shift for shift=6).
| round_vals: dw 32, 64, 128, 256, 512 |
; Clip bounds selected by FGData.clip_to_restricted_range (r):
; upper = [max+r*4] (255 or 235), lower = [min+r*2] (0 or 16).
; The 240 entry is not referenced in the part of the file visible here.
| max: dw 255, 240, 235 |
| min: dw 0, 16 |
; Overlap blend weights; read as (27,17) and, at +2, as (17,27).
| pb_27_17_17_27: db 27, 17, 17, 27 |
| |
; JMP_TABLE <function_name_with_suffix>, <lag0>, <lag1>, ...
; Emits a table of 32-bit offsets, one per listed lag value, each being the
; distance from the start of the table to the function's local label
; .ar<lag>. The table start is exported to the rest of the file under the
; name <function_name_with_suffix>_table.
| %macro JMP_TABLE 1-* |
| %xdefine %%func mangle(private_prefix %+ _%1) |
| %xdefine %1_table %%entries |
| %xdefine %%origin %1_table |
| %%entries: |
| %rep %0 - 1 |
| %rotate 1 |
| dd %%func %+ .ar%1 - %%origin |
| %endrep |
| %endmacro |
| |
; One dispatch table per grain generator, with entries for the local labels
; .ar0-.ar3. Each generator indexes its table with FGData.ar_coeff_lag and
; jumps to table_address + entry.
| JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 |
| JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 |
| |
; Field offsets of the film grain parameter block passed as fg_data.
; NOTE(review): presumably this mirrors a C structure defined outside this
; file; the sizes below must match it exactly - confirm against the C side.
| struc FGData |
| .seed: resd 1 |
| .num_y_points: resd 1 |
| .y_points: resb 14 * 2 |
| .chroma_scaling_from_luma: resd 1 |
| .num_uv_points: resd 2 |
| .uv_points: resb 2 * 10 * 2 |
| .scaling_shift: resd 1 |
; selects the .ar0-.ar3 branch through the jump tables
| .ar_coeff_lag: resd 1 |
| .ar_coeffs_y: resb 24 |
; Two coefficient sets; the code addresses set "uv" at offset uv*25
; (imul uvd, 25), so the reserved 2*26 bytes cover 2*25 coefficients plus
; 2 trailing bytes of padding.
| .ar_coeffs_uv: resb 2 * 26 ; includes padding |
| .ar_coeff_shift: resd 1 |
| .grain_scale_shift: resd 1 |
| .uv_mult: resd 2 |
| .uv_luma_mult: resd 2 |
| .uv_offset: resd 2 |
| .overlap_flag: resd 1 |
| .clip_to_restricted_range: resd 1 |
| endstruc |
| |
| cextern gaussian_sequence |
| |
| SECTION .text |
| |
| INIT_XMM avx2 |
; generate_grain_y(buf, fg_data)
;
; 1. Fills 73 rows of 82 signed-byte grain values at buf. Values are taken
;    from gaussian_sequence, indexed by a 16-bit seed that starts at
;    FGData.seed and is advanced 4 steps per .loop iteration, then scaled by
;    a rounded right shift of 4 + FGData.grain_scale_shift.
; 2. Applies, in place, the auto-regressive filter selected by
;    FGData.ar_coeff_lag (.ar0 = none, .ar1/.ar2/.ar3) to rows 3..72,
;    columns 3..78.
;
; 73*82 is not a multiple of 4, so the last 4-byte store of step 1 writes
; 2 bytes past buf+73*82.
; NOTE(review): presumably the caller's buffer is padded for this - confirm.
;
; "base" is r4; blocks below that rename r4 through DEFINE_ARGS (e.g. val3 in
; .ar1) must finish all base-relative loads before writing that register.
| cglobal generate_grain_y, 2, 9, 16, buf, fg_data |
| lea r4, [pb_mask] |
| %define base r4-pb_mask |
| movq xm1, [base+rnd_next_upperbit_mask] |
| movq xm4, [base+mul_bits] |
; hmul_bits as pmulhuw multipliers = seed >> 1, 2, 3, 4 in the four lanes
| movq xm7, [base+hmul_bits] |
| mov r2d, [fg_dataq+FGData.grain_scale_shift] |
; round[grain_scale_shift] as pmulhrsw multiplier = rounded shift by 4+shift
| vpbroadcastw xm8, [base+round+r2*2] |
| mova xm5, [base+pb_mask] |
| vpbroadcastw xm0, [fg_dataq+FGData.seed] |
| vpbroadcastd xm9, [base+pd_m65536] |
; r2 counts from -73*82 up to 0 while bufq points at the end of the buffer
| mov r2, -73*82 |
| sub bufq, r2 |
| lea r3, [gaussian_sequence] |
| .loop: |
| pand xm2, xm0, xm1 |
| psrlw xm3, xm2, 10 |
| por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set |
| pmullw xm2, xm4 ; bits 0x0f00 are set |
| pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds |
| psllq xm6, xm2, 30 |
| por xm2, xm6 |
| psllq xm6, xm2, 15 |
| por xm2, xm6 ; aggregate each bit into next seed's high bit |
| pmulhuw xm3, xm0, xm7 |
| por xm2, xm3 ; 4 next output seeds |
; the 4th new seed becomes the broadcast seed of the next iteration
| pshuflw xm0, xm2, q3333 |
; seed >> 5 is the (word) index into gaussian_sequence
| psrlw xm2, 5 |
| pmovzxwd xm3, xm2 |
; vpgatherdd clears its mask register, so work on a copy of xm9
| mova xm6, xm9 |
| vpgatherdd xm2, [r3+xm3*2], xm6 |
; keep only the low 16 bits of each gathered dword
| pandn xm2, xm9, xm2 |
| packusdw xm2, xm2 |
| pmulhrsw xm2, xm8 |
| packsswb xm2, xm2 |
| movd [bufq+r2], xm2 |
| add r2, 4 |
| jl .loop |
| |
| ; auto-regression code |
; table entries are offsets relative to the table's own address
| movsxd r2, [fg_dataq+FGData.ar_coeff_lag] |
| movsxd r2, [base+generate_grain_y_avx2_table+r2*4] |
| lea r2, [r2+base+generate_grain_y_avx2_table] |
| jmp r2 |
| |
; ---- lag 1: taps cf0..cf2 on the row above, cf3 on the left neighbour ----
; The three top taps plus the rounding constant are computed 4 pixels at a
; time with pmaddwd; the left tap depends on the pixel just written and is
; therefore applied in scalar code (val3 carries the previous output).
| .ar1: |
| DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] |
| movd xm4, [fg_dataq+FGData.ar_coeffs_y] |
| DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 |
; replace the cf3 byte by 1 so that pmaddwd adds the rounding value (xm3)
| pinsrb xm4, [pb_1], 3 |
| pmovsxbw xm4, xm4 |
; xm5 = (cf2, 1) pairs, xm4 = (cf0, cf1) pairs
| pshufd xm5, xm4, q1111 |
| pshufd xm4, xm4, q0000 |
; round_vals[shift-6] = 1 << (shift-1)
; NOTE(review): assumes ar_coeff_shift is in 6..9 - confirm against caller
| vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd |
; bufq was buf+73*82; it now points at row 3, column 79
| sub bufq, 82*73-(82*3+79) |
| mov hd, 70 |
| mov mind, -128 |
| mov maxd, 127 |
| .y_loop_ar1: |
| mov xq, -76 |
| movsx val3d, byte [bufq+xq-1] |
| .x_loop_ar1: |
| pmovsxbw xm0, [bufq+xq-82-1] ; top/left |
| pmovsxbw xm2, [bufq+xq-82+0] ; top |
| pmovsxbw xm1, [bufq+xq-82+1] ; top/right |
| punpcklwd xm0, xm2 |
| punpcklwd xm1, xm3 |
| pmaddwd xm0, xm4 |
| pmaddwd xm1, xm5 |
; xm0 = top-row contribution + rounding for 4 consecutive pixels
| paddd xm0, xm1 |
| .x_loop_ar1_inner: |
| movd val0d, xm0 |
| psrldq xm0, 4 |
| imul val3d, cf3d |
| add val3d, val0d |
; sar takes its count from cl, sarx from any register; which one is usable
; depends on the register that "shift" maps to on each ABI
| %if WIN64 |
| sarx val3d, val3d, shiftd |
| %else |
| sar val3d, shiftb |
| %endif |
| movsx val0d, byte [bufq+xq] |
| add val3d, val0d |
; clamp to [-128, 127]
| cmp val3d, maxd |
| cmovg val3d, maxd |
| cmp val3d, mind |
| cmovl val3d, mind |
| mov byte [bufq+xq], val3b |
| ; keep val3d in-place as left for next x iteration |
| inc xq |
| jz .x_loop_ar1_end |
; recompute the vector part every 4 pixels
| test xq, 3 |
| jnz .x_loop_ar1_inner |
| jmp .x_loop_ar1 |
| |
| .x_loop_ar1_end: |
| add bufq, 82 |
| dec hd |
| jg .y_loop_ar1 |
; lag 0: no luma filtering, the generated grain is returned as is
| .ar0: |
| RET |
| |
; ---- lag 2: 5 taps on rows y-2 and y-1 (cf0..cf9), 2 left taps (cf10-11) --
| .ar2: |
| DEFINE_ARGS buf, fg_data, shift |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
; word (shift-5) of the contiguous hmul_bits/round table; as a pmulhrsw
; multiplier it is a rounded shift by shift-5, which follows the psrad by 5
| movd xm14, [base+hmul_bits-10+shiftq*2] |
; byte mask with 0xff at byte 2 (position of the current pixel in xm0)
| movq xm15, [base+byte_blend+1] |
| pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 |
| movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 |
| pmovsxbw xm9, xm9 |
| DEFINE_ARGS buf, h, x |
; coefficient pairs for pmaddwd:
; xm8=(cf0,cf1) xm9=(cf2,cf3) xm10=(cf4,cf5) xm11=(cf6,cf7)
; xm12=(cf8,cf9) xm13=(cf10,cf11)
| pshufd xm12, xm9, q0000 |
| pshufd xm13, xm9, q1111 |
| pshufd xm11, xm8, q3333 |
| pshufd xm10, xm8, q2222 |
| pshufd xm9, xm8, q1111 |
| pshufd xm8, xm8, q0000 |
| sub bufq, 82*73-(82*3+79) |
| mov hd, 70 |
| .y_loop_ar2: |
| mov xq, -76 |
| |
| .x_loop_ar2: |
| pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] |
| pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] |
| psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] |
| psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] |
| psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] |
| punpcklwd xm2, xm0, xm2 |
| punpcklwd xm3, xm4 |
| pmaddwd xm2, xm8 |
| pmaddwd xm3, xm11 |
| paddd xm2, xm3 |
| |
| psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] |
| psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] |
| psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5] |
| punpcklwd xm4, xm5 |
| punpcklwd xm6, xm1 |
| psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5] |
| psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] |
| punpcklwd xm7, xm1 |
| pmaddwd xm4, xm9 |
| pmaddwd xm6, xm10 |
| pmaddwd xm7, xm12 |
| paddd xm4, xm6 |
| paddd xm2, xm7 |
; xm2 = sum of the 10 top taps for 4 consecutive pixels
| paddd xm2, xm4 |
| |
| movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] |
; per pixel: only lane 0 of the results below is consumed
| .x_loop_ar2_inner: |
| pmovsxbw xm1, xm0 |
| pmaddwd xm3, xm1, xm13 |
| paddd xm3, xm2 |
| psrldq xm1, 4 ; y=0,x=0 |
| psrldq xm2, 4 ; shift top to next pixel |
| psrad xm3, 5 |
| packssdw xm3, xm3 |
| pmulhrsw xm3, xm14 |
| paddw xm3, xm1 |
| packsswb xm3, xm3 |
| pextrb [bufq+xq], xm3, 0 |
; write the new pixel back into byte 2 of the row window, then slide the
; window by one byte so the next pixel sees it as its left neighbour
| pslldq xm3, 2 |
| pand xm3, xm15 |
| pandn xm0, xm15, xm0 |
| por xm0, xm3 |
| psrldq xm0, 1 |
| inc xq |
| jz .x_loop_ar2_end |
| test xq, 3 |
| jnz .x_loop_ar2_inner |
| jmp .x_loop_ar2 |
| |
| .x_loop_ar2_end: |
| add bufq, 82 |
| dec hd |
| jg .y_loop_ar2 |
| RET |
| |
; ---- lag 3: 7 taps on rows y-3..y-1 (cf0..cf20), 3 left taps (cf21-23) ----
; The 12 coefficient-pair vectors do not fit in registers next to the pixel
; data, so they are kept in 12*16 bytes of stack. The stack is allocated
; here, after the RETs of the other branches have been emitted.
| .ar3: |
| DEFINE_ARGS buf, fg_data, shift |
| %if WIN64 |
| SUB rsp, 16*12 |
| %assign stack_size_padded (stack_size_padded+16*12) |
| %assign stack_size (stack_size+16*12) |
| %else |
| ALLOC_STACK 16*12 |
| %endif |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| movd xm14, [base+hmul_bits-10+shiftq*2] |
; byte mask with 0xff at byte 3 (position of the current pixel in xm1)
| movq xm15, [base+byte_blend] |
| pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7 |
| pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15 |
| pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 |
| pshufd xm9, xm0, q1111 |
| pshufd xm10, xm0, q2222 |
| pshufd xm11, xm0, q3333 |
| pshufd xm0, xm0, q0000 |
| pshufd xm6, xm1, q1111 |
| pshufd xm7, xm1, q2222 |
| pshufd xm8, xm1, q3333 |
| pshufd xm1, xm1, q0000 |
| pshufd xm3, xm2, q1111 |
| pshufd xm4, xm2, q2222 |
; xm5 = (cf21, cf22, cf23, round_vals[shift-5]): the 4th word multiplies the
; current grain value, adding it pre-scaled before the final rounded shift
| psrldq xm5, xm2, 10 |
| pshufd xm2, xm2, q0000 |
| pinsrw xm5, [base+round_vals+shiftq*2-10], 3 |
; slots 0..10 hold the pairs (cf0,cf1) .. (cf20,cf21), slot 11 holds xm5
| mova [rsp+ 0*16], xm0 |
| mova [rsp+ 1*16], xm9 |
| mova [rsp+ 2*16], xm10 |
| mova [rsp+ 3*16], xm11 |
| mova [rsp+ 4*16], xm1 |
| mova [rsp+ 5*16], xm6 |
| mova [rsp+ 6*16], xm7 |
| mova [rsp+ 7*16], xm8 |
| mova [rsp+ 8*16], xm2 |
| mova [rsp+ 9*16], xm3 |
| mova [rsp+10*16], xm4 |
| mova [rsp+11*16], xm5 |
; zero partner for the last top tap (cf20), so the cf21 half of slot 10
; contributes nothing
| pxor xm13, xm13 |
| DEFINE_ARGS buf, h, x |
| sub bufq, 82*73-(82*3+79) |
| mov hd, 70 |
| .y_loop_ar3: |
| mov xq, -76 |
| |
| .x_loop_ar3: |
| movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] |
| movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] |
| movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] |
; sign-extend the three rows to words: low halves stay in xm0-2, high
; halves go to xm3-5
| pxor xm3, xm3 |
| pcmpgtb xm6, xm3, xm2 |
| pcmpgtb xm5, xm3, xm1 |
| pcmpgtb xm4, xm3, xm0 |
| punpckhbw xm3, xm0, xm4 |
| punpcklbw xm0, xm4 |
| punpckhbw xm4, xm1, xm5 |
| punpcklbw xm1, xm5 |
| punpckhbw xm5, xm2, xm6 |
| punpcklbw xm2, xm6 |
| |
; row y-3 (cf0..cf6) plus the first tap of row y-2 (cf7)
| psrldq xm6, xm0, 2 |
| psrldq xm7, xm0, 4 |
| psrldq xm8, xm0, 6 |
| psrldq xm9, xm0, 8 |
| palignr xm10, xm3, xm0, 10 |
| palignr xm11, xm3, xm0, 12 |
| |
| punpcklwd xm0, xm6 |
| punpcklwd xm7, xm8 |
| punpcklwd xm9, xm10 |
| punpcklwd xm11, xm1 |
| pmaddwd xm0, [rsp+ 0*16] |
| pmaddwd xm7, [rsp+ 1*16] |
| pmaddwd xm9, [rsp+ 2*16] |
| pmaddwd xm11, [rsp+ 3*16] |
| paddd xm0, xm7 |
| paddd xm9, xm11 |
| paddd xm0, xm9 |
| |
; rest of row y-2 (cf8..cf13) plus the first two taps of row y-1 (cf14-15)
| psrldq xm6, xm1, 2 |
| psrldq xm7, xm1, 4 |
| psrldq xm8, xm1, 6 |
| psrldq xm9, xm1, 8 |
| palignr xm10, xm4, xm1, 10 |
| palignr xm11, xm4, xm1, 12 |
| psrldq xm12, xm2, 2 |
| |
| punpcklwd xm6, xm7 |
| punpcklwd xm8, xm9 |
| punpcklwd xm10, xm11 |
| punpcklwd xm12, xm2, xm12 |
| pmaddwd xm6, [rsp+ 4*16] |
| pmaddwd xm8, [rsp+ 5*16] |
| pmaddwd xm10, [rsp+ 6*16] |
| pmaddwd xm12, [rsp+ 7*16] |
| paddd xm6, xm8 |
| paddd xm10, xm12 |
| paddd xm6, xm10 |
| paddd xm0, xm6 |
| |
; rest of row y-1 (cf16..cf20)
| psrldq xm6, xm2, 4 |
| psrldq xm7, xm2, 6 |
| psrldq xm8, xm2, 8 |
| palignr xm9, xm5, xm2, 10 |
| palignr xm5, xm5, xm2, 12 |
| |
| punpcklwd xm6, xm7 |
| punpcklwd xm8, xm9 |
| punpcklwd xm5, xm13 |
| pmaddwd xm6, [rsp+ 8*16] |
| pmaddwd xm8, [rsp+ 9*16] |
| pmaddwd xm5, [rsp+10*16] |
| paddd xm0, xm6 |
| paddd xm8, xm5 |
; xm0 = sum of the 21 top taps for 4 consecutive pixels
| paddd xm0, xm8 |
| |
| movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] |
; per pixel: only lane 0 of the results below is consumed
| .x_loop_ar3_inner: |
| pmovsxbw xm2, xm1 |
| pmaddwd xm2, [rsp+16*11] |
| pshufd xm3, xm2, q1111 |
| paddd xm2, xm3 ; left+cur |
| paddd xm2, xm0 ; add top |
| psrldq xm0, 4 |
| psrad xm2, 5 |
| packssdw xm2, xm2 |
| pmulhrsw xm2, xm14 |
| packsswb xm2, xm2 |
| pextrb [bufq+xq], xm2, 0 |
; write the new pixel back into byte 3 of the row window, then slide it
| pslldq xm2, 3 |
| pand xm2, xm15 |
| pandn xm1, xm15, xm1 |
| por xm1, xm2 |
| psrldq xm1, 1 |
| inc xq |
| jz .x_loop_ar3_end |
| test xq, 3 |
| jnz .x_loop_ar3_inner |
| jmp .x_loop_ar3 |
| |
| .x_loop_ar3_end: |
| add bufq, 82 |
| dec hd |
| jg .y_loop_ar3 |
| RET |
| |
| INIT_XMM avx2 |
; generate_grain_uv_420(buf, bufy, fg_data, uv)
;
; 1. Fills 38 rows of 44 signed-byte grain values (row stride 82) at buf,
;    using the same seed-update/gather scheme as the luma generator. The
;    starting seed is FGData.seed xor pw_seed_xor[uv].
; 2. Applies, in place, the auto-regressive filter selected by
;    FGData.ar_coeff_lag to rows 3..37, columns 3..40. Unlike luma, every
;    lag (including .ar0) has an extra tap on the luma grain in bufy: the
;    rounded average of the co-located 2x2 luma block.
;
; Coefficients for plane "uv" are read from FGData.ar_coeffs_uv + uv*25.
; NOTE(review): uv is assumed to be 0 or 1 (it indexes the two-entry
; pw_seed_xor table) - confirm against caller.
| cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv |
| lea r4, [pb_mask] |
| %define base r4-pb_mask |
| movq xm1, [base+rnd_next_upperbit_mask] |
| movq xm4, [base+mul_bits] |
| movq xm7, [base+hmul_bits] |
| mov r5d, [fg_dataq+FGData.grain_scale_shift] |
| vpbroadcastw xm8, [base+round+r5*2] |
| mova xm5, [base+pb_mask] |
| vpbroadcastw xm0, [fg_dataq+FGData.seed] |
| vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] |
| pxor xm0, xm9 |
| vpbroadcastd xm9, [base+pd_m65536] |
| lea r6, [gaussian_sequence] |
; r7d = row counter; bufq points at the end of the 44 bytes of each row and
; r5 counts from -44 up to 0
| mov r7d, 38 |
| add bufq, 44 |
| .loop_y: |
| mov r5, -44 |
| .loop_x: |
| pand xm2, xm0, xm1 |
| psrlw xm3, xm2, 10 |
| por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set |
| pmullw xm2, xm4 ; bits 0x0f00 are set |
| pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds |
| psllq xm6, xm2, 30 |
| por xm2, xm6 |
| psllq xm6, xm2, 15 |
| por xm2, xm6 ; aggregate each bit into next seed's high bit |
| pmulhuw xm3, xm0, xm7 |
| por xm2, xm3 ; 4 next output seeds |
| pshuflw xm0, xm2, q3333 |
| psrlw xm2, 5 |
| pmovzxwd xm3, xm2 |
; vpgatherdd clears its mask register, so work on a copy of xm9
| mova xm6, xm9 |
| vpgatherdd xm2, [r6+xm3*2], xm6 |
| pandn xm2, xm9, xm2 |
| packusdw xm2, xm2 |
| pmulhrsw xm2, xm8 |
| packsswb xm2, xm2 |
| movd [bufq+r5], xm2 |
| add r5, 4 |
| jl .loop_x |
| add bufq, 82 |
| dec r7d |
| jg .loop_y |
| |
| ; auto-regression code |
| movsxd r5, [fg_dataq+FGData.ar_coeff_lag] |
| movsxd r5, [base+generate_grain_uv_420_avx2_table+r5*4] |
| lea r5, [r5+base+generate_grain_uv_420_avx2_table] |
| jmp r5 |
| |
; ---- lag 0: grain += round2(cf0 * luma_avg, ar_coeff_shift) ----
; No dependency between neighbouring outputs, so whole rows are processed
; with 256-bit vectors: 32 pixels, then the remaining 6.
| .ar0: |
| INIT_YMM avx2 |
| DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift |
| imul uvd, 25 |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] |
; word "shift" of the contiguous hmul_bits/round/mul_bits table; as a
; pmulhrsw multiplier it is a rounded right shift by "shift"
| movd xm3, [base+hmul_bits+shiftq*2] |
| DEFINE_ARGS buf, bufy, h |
| pmovsxbw xm4, xm4 |
| vpbroadcastd m7, [pb_1] |
; 8192 as pmulhrsw multiplier = rounded shift by 2 (average of 4 samples)
| vpbroadcastw m6, [hmul_bits+4] |
| vpbroadcastw m4, xm4 |
| vpbroadcastw m3, xm3 |
; bufq is buf+44+82*38 after the fill loop; it now points at row 3, column 3
| sub bufq, 82*38+82-(82*3+41) |
| add bufyq, 3+82*3 |
| mov hd, 35 |
| .y_loop_ar0: |
| ; first 32 pixels |
| movu xm8, [bufyq] |
| movu xm9, [bufyq+82] |
| movu xm10, [bufyq+16] |
| movu xm11, [bufyq+82+16] |
| vinserti128 m8, [bufyq+32], 1 |
| vinserti128 m9, [bufyq+82+32], 1 |
| vinserti128 m10, [bufyq+48], 1 |
| vinserti128 m11, [bufyq+82+48], 1 |
; horizontal pair sums of two luma rows, added and averaged
| pmaddubsw m8, m7, m8 |
| pmaddubsw m9, m7, m9 |
| pmaddubsw m10, m7, m10 |
| pmaddubsw m11, m7, m11 |
| paddw m8, m9 |
| paddw m10, m11 |
| pmulhrsw m8, m6 |
| pmulhrsw m10, m6 |
| pmullw m8, m4 |
| pmullw m10, m4 |
| pmulhrsw m8, m3 |
| pmulhrsw m10, m3 |
| packsswb m8, m10 |
; add the luma term to the chroma grain: interleave and sum byte pairs
| movu m0, [bufq] |
| punpckhbw m1, m0, m8 |
| punpcklbw m0, m8 |
| pmaddubsw m1, m7, m1 |
| pmaddubsw m0, m7, m0 |
| packsswb m0, m1 |
| movu [bufq], m0 |
| |
| ; last 6 pixels |
| movu xm8, [bufyq+32*2] |
| movu xm9, [bufyq+32*2+82] |
| pmaddubsw xm8, xm7, xm8 |
| pmaddubsw xm9, xm7, xm9 |
| paddw xm8, xm9 |
| pmulhrsw xm8, xm6 |
| pmullw xm8, xm4 |
| pmulhrsw xm8, xm3 |
| packsswb xm8, xm8 |
| movq xm0, [bufq+32] |
| punpcklbw xm8, xm0 |
| pmaddubsw xm8, xm7, xm8 |
| packsswb xm8, xm8 |
; keep word 3 (bytes 6-7, beyond the 6 filtered pixels) from the original
| vpblendw xm0, xm8, xm0, 1000b |
| movq [bufq+32], xm0 |
| |
| add bufq, 82 |
| add bufyq, 82*2 |
| dec hd |
| jg .y_loop_ar0 |
| RET |
| |
; ---- lag 1: top taps cf0..cf2, left tap cf3, luma tap cf4 ----
; The "val3" argument name maps onto r4 (= base), so every base-relative
; load must happen before the first write to val3d in .y_loop_ar1.
| .ar1: |
| INIT_XMM avx2 |
| DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift |
| imul uvd, 25 |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] |
| movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] |
; overwrite the cf3 byte with cf4, giving (cf0, cf1, cf2, cf4)
| pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 |
| DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift |
| pmovsxbw xm4, xm4 |
; xm5 = (cf2, cf4) pairs, xm4 = (cf0, cf1) pairs
| pshufd xm5, xm4, q1111 |
| pshufd xm4, xm4, q0000 |
| pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd |
| vpbroadcastd xm7, [pb_1] |
| vpbroadcastw xm6, [hmul_bits+4] |
| vpbroadcastd xm3, xm3 |
; bufq -> row 3, column 41; bufyq -> luma row 3, column 79, so that
; [bufyq+x*2] with x in [-38,-1] covers luma columns 3, 5, ..., 77
| sub bufq, 82*38+44-(82*3+41) |
| add bufyq, 79+82*3 |
| mov hd, 35 |
| mov mind, -128 |
| mov maxd, 127 |
| .y_loop_ar1: |
| mov xq, -38 |
| movsx val3d, byte [bufq+xq-1] |
| .x_loop_ar1: |
| pmovsxbw xm0, [bufq+xq-82-1] ; top/left |
| movq xm8, [bufyq+xq*2] |
| movq xm9, [bufyq+xq*2+82] |
| psrldq xm2, xm0, 2 ; top |
| psrldq xm1, xm0, 4 ; top/right |
; xm8 = rounded 2x2 luma averages for 4 chroma pixels
| pmaddubsw xm8, xm7, xm8 |
| pmaddubsw xm9, xm7, xm9 |
| paddw xm8, xm9 |
| pmulhrsw xm8, xm6 |
| punpcklwd xm0, xm2 |
| punpcklwd xm1, xm8 |
| pmaddwd xm0, xm4 |
| pmaddwd xm1, xm5 |
| paddd xm0, xm1 |
| paddd xm0, xm3 |
| .x_loop_ar1_inner: |
| movd val0d, xm0 |
| psrldq xm0, 4 |
| imul val3d, cf3d |
| add val3d, val0d |
; sarx takes its count from any register, so no cl handling is needed here
| sarx val3d, val3d, shiftd |
| movsx val0d, byte [bufq+xq] |
| add val3d, val0d |
; clamp to [-128, 127]
| cmp val3d, maxd |
| cmovg val3d, maxd |
| cmp val3d, mind |
| cmovl val3d, mind |
| mov byte [bufq+xq], val3b |
| ; keep val3d in-place as left for next x iteration |
| inc xq |
| jz .x_loop_ar1_end |
; x starts at -38, so the first vector batch only covers 2 pixels
| test xq, 3 |
| jnz .x_loop_ar1_inner |
| jmp .x_loop_ar1 |
| |
| .x_loop_ar1_end: |
| add bufq, 82 |
| add bufyq, 82*2 |
| dec hd |
| jg .y_loop_ar1 |
| RET |
| |
; ---- lag 2: top taps cf0..cf9, left taps cf10-11, luma tap cf12 ----
| .ar2: |
| DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| imul uvd, 25 |
; pmulhrsw multiplier for a rounded shift by shift-5 (after psrad by 5)
| movd xm15, [base+hmul_bits-10+shiftq*2] |
| pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 |
| pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 |
| vpbroadcastw xm7, [base+hmul_bits+4] |
| vpbroadcastd xm6, [base+pb_1] |
| DEFINE_ARGS buf, bufy, h, x |
| pshufd xm12, xm9, q0000 |
| pshufd xm13, xm9, q1111 |
| pshufd xm14, xm9, q2222 |
; xm14 = (cf12, 0) pairs: only the luma tap survives in that pair
| pxor xm10, xm10 |
| vpblendw xm14, xm10, 10101010b |
| pshufd xm11, xm8, q3333 |
| pshufd xm10, xm8, q2222 |
| pshufd xm9, xm8, q1111 |
| pshufd xm8, xm8, q0000 |
| sub bufq, 82*38+44-(82*3+41) |
| add bufyq, 79+82*3 |
| mov hd, 35 |
| .y_loop_ar2: |
| mov xq, -38 |
| |
| .x_loop_ar2: |
| pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] |
| pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] |
| psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] |
| psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] |
| psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] |
| punpcklwd xm2, xm0, xm2 |
| punpcklwd xm3, xm4 |
| pmaddwd xm2, xm8 |
| pmaddwd xm3, xm11 |
| paddd xm2, xm3 |
| |
| psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] |
| psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] |
| psrldq xm0, 8 ; y=-2,x=[+2,+5] |
| punpcklwd xm4, xm5 |
| punpcklwd xm0, xm1 |
| psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5] |
| psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] |
| punpcklwd xm3, xm1 |
| pmaddwd xm4, xm9 |
| pmaddwd xm0, xm10 |
| pmaddwd xm3, xm12 |
| paddd xm4, xm0 |
| paddd xm2, xm3 |
| paddd xm2, xm4 |
| |
; luma tap: rounded 2x2 averages, duplicated per pair and scaled by cf12
| movq xm0, [bufyq+xq*2] |
| movq xm3, [bufyq+xq*2+82] |
| pmaddubsw xm0, xm6, xm0 |
| pmaddubsw xm3, xm6, xm3 |
| paddw xm0, xm3 |
| pmulhrsw xm0, xm7 |
| punpcklwd xm0, xm0 |
| pmaddwd xm0, xm14 |
| paddd xm2, xm0 |
| |
| movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] |
; per pixel: xm0 holds the current row as words after pmovsxbw, the new
; value is blended into word 1 and the repack slides the window by one pixel
| .x_loop_ar2_inner: |
| pmovsxbw xm0, xm0 |
| pmaddwd xm3, xm0, xm13 |
| paddd xm3, xm2 |
| psrldq xm2, 4 ; shift top to next pixel |
| psrad xm3, 5 |
| packssdw xm3, xm3 |
| pmulhrsw xm3, xm15 |
| pslldq xm3, 2 |
| psrldq xm0, 2 |
| paddw xm3, xm0 |
| vpblendw xm0, xm3, 00000010b |
| packsswb xm0, xm0 |
| pextrb [bufq+xq], xm0, 1 |
| inc xq |
| jz .x_loop_ar2_end |
| test xq, 3 |
| jnz .x_loop_ar2_inner |
| jmp .x_loop_ar2 |
| |
| .x_loop_ar2_end: |
| add bufq, 82 |
| add bufyq, 82*2 |
| dec hd |
| jg .y_loop_ar2 |
| RET |
| |
; ---- lag 3: top taps cf0..cf20, left taps cf21-23, luma tap cf24 ----
; As in the luma version, 12 coefficient-pair vectors live on the stack; the
; space is taken here with a manual stack_size adjustment.
; NOTE(review): presumably the stack_size bookkeeping is what lets the RET
; macro at the end of this branch release the space - confirm in x86inc.
| .ar3: |
| DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift |
| SUB rsp, 16*12 |
| %assign stack_size_padded (stack_size_padded+16*12) |
| %assign stack_size (stack_size+16*12) |
| mov shiftd, [fg_dataq+FGData.ar_coeff_shift] |
| imul uvd, 25 |
| movd xm14, [base+hmul_bits-10+shiftq*2] |
| pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7 |
| pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15 |
| pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 |
| pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] |
| pshufd xm9, xm0, q1111 |
| pshufd xm10, xm0, q2222 |
| pshufd xm11, xm0, q3333 |
| pshufd xm0, xm0, q0000 |
| pshufd xm6, xm1, q1111 |
| pshufd xm7, xm1, q2222 |
| pshufd xm8, xm1, q3333 |
| pshufd xm1, xm1, q0000 |
| pshufd xm3, xm2, q1111 |
| pshufd xm4, xm2, q2222 |
| vpbroadcastw xm5, xm5 |
| vpblendw xm4, xm5, 10101010b ; interleave luma cf |
; xm5 = (cf21, cf22, cf23, round_vals[shift-5]); the 4th word multiplies the
; current grain value so it is added pre-scaled before the final shift
| psrldq xm5, xm2, 10 |
| pshufd xm2, xm2, q0000 |
| pinsrw xm5, [base+round_vals+shiftq*2-10], 3 |
| mova [rsp+ 0*16], xm0 |
| mova [rsp+ 1*16], xm9 |
| mova [rsp+ 2*16], xm10 |
| mova [rsp+ 3*16], xm11 |
| mova [rsp+ 4*16], xm1 |
| mova [rsp+ 5*16], xm6 |
| mova [rsp+ 6*16], xm7 |
| mova [rsp+ 7*16], xm8 |
| mova [rsp+ 8*16], xm2 |
| mova [rsp+ 9*16], xm3 |
| mova [rsp+10*16], xm4 |
| mova [rsp+11*16], xm5 |
| vpbroadcastd xm13, [base+pb_1] |
| vpbroadcastw xm15, [base+hmul_bits+4] |
| DEFINE_ARGS buf, bufy, h, x |
| sub bufq, 82*38+44-(82*3+41) |
| add bufyq, 79+82*3 |
| mov hd, 35 |
| .y_loop_ar3: |
| mov xq, -38 |
| |
| .x_loop_ar3: |
| movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] |
| movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] |
| movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] |
; sign-extend the three rows to words: low halves stay in xm0-2, high
; halves go to xm3-5
| pxor xm3, xm3 |
| pcmpgtb xm6, xm3, xm2 |
| pcmpgtb xm5, xm3, xm1 |
| pcmpgtb xm4, xm3, xm0 |
| punpckhbw xm3, xm0, xm4 |
| punpcklbw xm0, xm4 |
| punpckhbw xm4, xm1, xm5 |
| punpcklbw xm1, xm5 |
| punpckhbw xm5, xm2, xm6 |
| punpcklbw xm2, xm6 |
| |
; row y-3 (cf0..cf6) plus the first tap of row y-2 (cf7)
| psrldq xm6, xm0, 2 |
| psrldq xm7, xm0, 4 |
| psrldq xm8, xm0, 6 |
| psrldq xm9, xm0, 8 |
| palignr xm10, xm3, xm0, 10 |
| palignr xm11, xm3, xm0, 12 |
| |
| punpcklwd xm0, xm6 |
| punpcklwd xm7, xm8 |
| punpcklwd xm9, xm10 |
| punpcklwd xm11, xm1 |
| pmaddwd xm0, [rsp+ 0*16] |
| pmaddwd xm7, [rsp+ 1*16] |
| pmaddwd xm9, [rsp+ 2*16] |
| pmaddwd xm11, [rsp+ 3*16] |
| paddd xm0, xm7 |
| paddd xm9, xm11 |
| paddd xm0, xm9 |
| |
; rest of row y-2 (cf8..cf13) plus the first two taps of row y-1 (cf14-15)
| psrldq xm6, xm1, 2 |
| psrldq xm7, xm1, 4 |
| psrldq xm8, xm1, 6 |
| psrldq xm9, xm1, 8 |
| palignr xm10, xm4, xm1, 10 |
| palignr xm11, xm4, xm1, 12 |
| psrldq xm12, xm2, 2 |
| |
| punpcklwd xm6, xm7 |
| punpcklwd xm8, xm9 |
| punpcklwd xm10, xm11 |
| punpcklwd xm12, xm2, xm12 |
| pmaddwd xm6, [rsp+ 4*16] |
| pmaddwd xm8, [rsp+ 5*16] |
| pmaddwd xm10, [rsp+ 6*16] |
| pmaddwd xm12, [rsp+ 7*16] |
| paddd xm6, xm8 |
| paddd xm10, xm12 |
| paddd xm6, xm10 |
| paddd xm0, xm6 |
| |
; rest of row y-1 (cf16..cf20)
| psrldq xm6, xm2, 4 |
| psrldq xm7, xm2, 6 |
| psrldq xm8, xm2, 8 |
| palignr xm9, xm5, xm2, 10 |
| palignr xm5, xm5, xm2, 12 |
| |
; xm1 = rounded 2x2 luma averages; paired with the last top tap so that
; slot 10 = (cf20, cf24) applies both at once
| movq xm1, [bufyq+xq*2] |
| movq xm2, [bufyq+xq*2+82] |
| pmaddubsw xm1, xm13, xm1 |
| pmaddubsw xm2, xm13, xm2 |
| paddw xm1, xm2 |
| pmulhrsw xm1, xm15 |
| |
| punpcklwd xm6, xm7 |
| punpcklwd xm8, xm9 |
| punpcklwd xm5, xm1 |
| pmaddwd xm6, [rsp+ 8*16] |
| pmaddwd xm8, [rsp+ 9*16] |
| pmaddwd xm5, [rsp+10*16] |
| paddd xm0, xm6 |
| paddd xm8, xm5 |
| paddd xm0, xm8 |
| |
| movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] |
; per pixel: the new value is blended into word 3 of the current row, which
; is then repacked to bytes and slid by one pixel
| .x_loop_ar3_inner: |
| pmovsxbw xm1, xm1 |
| pmaddwd xm2, xm1, [rsp+16*11] |
| pshufd xm3, xm2, q1111 |
| paddd xm2, xm3 ; left+cur |
| paddd xm2, xm0 ; add top |
| psrldq xm0, 4 |
| psrad xm2, 5 |
| packssdw xm2, xm2 |
| pmulhrsw xm2, xm14 |
| pslldq xm2, 6 |
| vpblendw xm1, xm2, 1000b |
| packsswb xm1, xm1 |
| pextrb [bufq+xq], xm1, 3 |
| psrldq xm1, 1 |
| inc xq |
| jz .x_loop_ar3_end |
| test xq, 3 |
| jnz .x_loop_ar3_inner |
| jmp .x_loop_ar3 |
| |
| .x_loop_ar3_end: |
| add bufq, 82 |
| add bufyq, 82*2 |
| dec hd |
| jg .y_loop_ar3 |
| RET |
| |
| INIT_YMM avx2 |
| cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut |
| pcmpeqw m10, m10 |
| psrld m10, 24 |
| mov r7d, [fg_dataq+FGData.scaling_shift] |
| lea r8, [pb_mask] |
| %define base r8-pb_mask |
| vpbroadcastw m11, [base+mul_bits+r7*2-14] |
| mov r7d, [fg_dataq+FGData.clip_to_restricted_range] |
| vpbroadcastw m12, [base+max+r7*4] |
| vpbroadcastw m13, [base+min+r7*2] |
| |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap |
| |
| mov overlapd, [fg_dataq+FGData.overlap_flag] |
| movifnidn sbyd, sbym |
| test sbyd, sbyd |
| setnz r7b |
| test r7b, overlapb |
| jnz .vertical_overlap |
| |
| imul seed, sbyd, (173 << 24) | 37 |
| add seed, (105 << 24) | 178 |
| rol seed, 8 |
| movzx seed, seew |
| xor seed, [fg_dataq+FGData.seed] |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| unused1, unused2, see, overlap |
| |
| lea src_bakq, [srcq+wq] |
| neg wq |
| sub dstq, srcq |
| |
| .loop_x: |
| mov r6d, seed |
| or seed, 0xEFF4 |
| shr r6d, 1 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| offx, offy, see, overlap |
| |
| mov offxd, seed |
| rorx offyd, seed, 8 |
| shr offxd, 12 |
| and offyd, 0xf |
| imul offyd, 164 |
| lea offyq, [offyq+offxq*2+747] ; offy*stride+offx |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| h, offxy, see, overlap |
| |
| mov hd, hm |
| mov grain_lutq, grain_lutmp |
| .loop_y: |
| ; src |
| mova m0, [srcq] |
| pxor m2, m2 |
| punpckhbw m1, m0, m2 |
| punpcklbw m0, m2 ; m0-1: src as word |
| punpckhwd m5, m0, m2 |
| punpcklwd m4, m0, m2 |
| punpckhwd m7, m1, m2 |
| punpcklwd m6, m1, m2 ; m4-7: src as dword |
| |
| ; scaling[src] |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m8, [scalingq+m4], m3 |
| vpgatherdd m4, [scalingq+m5], m9 |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m5, [scalingq+m6], m3 |
| vpgatherdd m6, [scalingq+m7], m9 |
| pand m8, m10 |
| pand m4, m10 |
| pand m5, m10 |
| pand m6, m10 |
| packusdw m8, m4 |
| packusdw m5, m6 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m3, [grain_lutq+offxyq] |
| pcmpgtb m7, m2, m3 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| |
| ; noise = round2(scaling[src] * grain, scaling_shift) |
| pmullw m2, m8 |
| pmullw m3, m5 |
| pmulhrsw m2, m11 |
| pmulhrsw m3, m11 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| packuswb m0, m1 |
| mova [dstq+srcq], m0 |
| |
| add srcq, strideq |
| add grain_lutq, 82 |
| dec hd |
| jg .loop_y |
| |
| add wq, 32 |
| jge .end |
| lea srcq, [src_bakq+wq] |
| test overlapd, overlapd |
| jz .loop_x |
| |
| ; r8m = sbym |
| movd xm15, [pb_27_17_17_27] |
| cmp dword r8m, 0 |
| jne .loop_x_hv_overlap |
| |
| ; horizontal overlap (without vertical overlap) |
| movd xm14, [pw_1024] |
| .loop_x_h_overlap: |
| mov r6d, seed |
| or seed, 0xEFF4 |
| shr r6d, 1 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| offx, offy, see, left_offxy |
| |
| lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx |
| mov offxd, seed |
| rorx offyd, seed, 8 |
| shr offxd, 12 |
| and offyd, 0xf |
| imul offyd, 164 |
| lea offyq, [offyq+offxq*2+747] ; offy*stride+offx |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| h, offxy, see, left_offxy |
| |
| mov hd, hm |
| mov grain_lutq, grain_lutmp |
| .loop_y_h_overlap: |
| ; src |
| mova m0, [srcq] |
| pxor m2, m2 |
| punpckhbw m1, m0, m2 |
| punpcklbw m0, m2 ; m0-1: src as word |
| punpckhwd m5, m0, m2 |
| punpcklwd m4, m0, m2 |
| punpckhwd m7, m1, m2 |
| punpcklwd m6, m1, m2 ; m4-7: src as dword |
| |
| ; scaling[src] |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m8, [scalingq+m4], m3 |
| vpgatherdd m4, [scalingq+m5], m9 |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m5, [scalingq+m6], m3 |
| vpgatherdd m6, [scalingq+m7], m9 |
| pand m8, m10 |
| pand m4, m10 |
| pand m5, m10 |
| pand m6, m10 |
| packusdw m8, m4 |
| packusdw m5, m6 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m3, [grain_lutq+offxyq] |
| movd xm4, [grain_lutq+left_offxyq] |
| punpcklbw xm4, xm3 |
| pmaddubsw xm4, xm15, xm4 |
| pmulhrsw xm4, xm14 |
| packsswb xm4, xm4 |
| vpblendw xm4, xm3, 11111110b |
| vpblendd m3, m4, 00001111b |
| pcmpgtb m7, m2, m3 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| |
| ; noise = round2(scaling[src] * grain, scaling_shift) |
| pmullw m2, m8 |
| pmullw m3, m5 |
| pmulhrsw m2, m11 |
| pmulhrsw m3, m11 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| packuswb m0, m1 |
| mova [dstq+srcq], m0 |
| |
| add srcq, strideq |
| add grain_lutq, 82 |
| dec hd |
| jg .loop_y_h_overlap |
| |
| add wq, 32 |
| jge .end |
| lea srcq, [src_bakq+wq] |
| |
| ; r8m = sbym |
| cmp dword r8m, 0 |
| jne .loop_x_hv_overlap |
| jmp .loop_x_h_overlap |
| |
| .end: |
| RET |
| |
| .vertical_overlap: |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap |
| |
| movzx sbyd, sbyb |
| imul seed, [fg_dataq+FGData.seed], 0x00010001 |
| imul r7d, sbyd, 173 * 0x00010001 |
| imul sbyd, 37 * 0x01000100 |
| add r7d, (105 << 16) | 188 |
| add sbyd, (178 << 24) | (141 << 8) |
| and r7d, 0x00ff00ff |
| and sbyd, 0xff00ff00 |
| xor seed, r7d |
| xor seed, sbyd ; (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| unused1, unused2, see, overlap |
| |
| lea src_bakq, [srcq+wq] |
| neg wq |
| sub dstq, srcq |
| |
| vpbroadcastd m14, [pw_1024] |
| .loop_x_v_overlap: |
| vpbroadcastw m15, [pb_27_17_17_27] |
| |
| ; we assume from the block above that bits 8-15 of r7d are zero'ed |
| mov r6d, seed |
| or seed, 0xeff4eff4 |
| test seeb, seeh |
| setp r7b ; parity of top_seed |
| shr seed, 16 |
| shl r7d, 16 |
| test seeb, seeh |
| setp r7b ; parity of cur_seed |
| or r6d, 0x00010001 |
| xor r7d, r6d |
| rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| offx, offy, see, overlap, top_offxy |
| |
| rorx offyd, seed, 8 |
| rorx offxd, seed, 12 |
| and offyd, 0xf000f |
| and offxd, 0xf000f |
| imul offyd, 164 |
| ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy |
| lea offyq, [offyq+offxq*2+0x10001*747+32*82] |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| h, offxy, see, overlap, top_offxy |
| |
| movzx top_offxyd, offxyw |
| shr offxyd, 16 |
| |
| mov hd, hm |
| mov grain_lutq, grain_lutmp |
| .loop_y_v_overlap: |
| ; src |
| mova m0, [srcq] |
| pxor m2, m2 |
| punpckhbw m1, m0, m2 |
| punpcklbw m0, m2 ; m0-1: src as word |
| punpckhwd m5, m0, m2 |
| punpcklwd m4, m0, m2 |
| punpckhwd m7, m1, m2 |
| punpcklwd m6, m1, m2 ; m4-7: src as dword |
| |
| ; scaling[src] |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m8, [scalingq+m4], m3 |
| vpgatherdd m4, [scalingq+m5], m9 |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m5, [scalingq+m6], m3 |
| vpgatherdd m6, [scalingq+m7], m9 |
| pand m8, m10 |
| pand m4, m10 |
| pand m5, m10 |
| pand m6, m10 |
| packusdw m8, m4 |
| packusdw m5, m6 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m3, [grain_lutq+offxyq] |
| movu m4, [grain_lutq+top_offxyq] |
| punpckhbw m6, m4, m3 |
| punpcklbw m4, m3 |
| pmaddubsw m6, m15, m6 |
| pmaddubsw m4, m15, m4 |
| pmulhrsw m6, m14 |
| pmulhrsw m4, m14 |
| packsswb m3, m4, m6 |
| pcmpgtb m7, m2, m3 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| |
| ; noise = round2(scaling[src] * grain, scaling_shift) |
| pmullw m2, m8 |
| pmullw m3, m5 |
| pmulhrsw m2, m11 |
| pmulhrsw m3, m11 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| packuswb m0, m1 |
| mova [dstq+srcq], m0 |
| |
| vpbroadcastw m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line |
| add srcq, strideq |
| add grain_lutq, 82 |
| dec hw |
| jz .end_y_v_overlap |
| ; 2 lines get vertical overlap, then fall back to non-overlap code for |
| ; remaining (up to) 30 lines |
| xor hd, 0x10000 |
| test hd, 0x10000 |
| jnz .loop_y_v_overlap |
| jmp .loop_y |
| |
| .end_y_v_overlap: |
| add wq, 32 |
| jge .end_hv |
| lea srcq, [src_bakq+wq] |
| |
| ; since fg_dataq.overlap is guaranteed to be set, we never jump |
| ; back to .loop_x_v_overlap, and instead always fall-through to |
| ; h+v overlap |
| |
| movd xm15, [pb_27_17_17_27] |
| .loop_x_hv_overlap: |
| vpbroadcastw m8, [pb_27_17_17_27] |
| |
| ; we assume from the block above that bits 8-15 of r7d are zero'ed |
| mov r6d, seed |
| or seed, 0xeff4eff4 |
| test seeb, seeh |
| setp r7b ; parity of top_seed |
| shr seed, 16 |
| shl r7d, 16 |
| test seeb, seeh |
| setp r7b ; parity of cur_seed |
| or r6d, 0x00010001 |
| xor r7d, r6d |
| rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| offx, offy, see, left_offxy, top_offxy, topleft_offxy |
| |
| lea topleft_offxyq, [top_offxyq+32] |
| lea left_offxyq, [offyq+32] |
| rorx offyd, seed, 8 |
| rorx offxd, seed, 12 |
| and offyd, 0xf000f |
| and offxd, 0xf000f |
| imul offyd, 164 |
| ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy |
| lea offyq, [offyq+offxq*2+0x10001*747+32*82] |
| |
| DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ |
| h, offxy, see, left_offxy, top_offxy, topleft_offxy |
| |
| movzx top_offxyd, offxyw |
| shr offxyd, 16 |
| |
| mov hd, hm |
| mov grain_lutq, grain_lutmp |
| .loop_y_hv_overlap: |
| ; src |
| mova m0, [srcq] |
| pxor m2, m2 |
| punpckhbw m1, m0, m2 |
| punpcklbw m0, m2 ; m0-1: src as word |
| punpckhwd m5, m0, m2 |
| punpcklwd m4, m0, m2 |
| punpckhwd m7, m1, m2 |
| punpcklwd m6, m1, m2 ; m4-7: src as dword |
| |
| ; scaling[src] |
| pcmpeqw m3, m3 |
| ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel |
| vpgatherdd m9, [scalingq+m4], m3 |
| pcmpeqw m3, m3 |
| vpgatherdd m4, [scalingq+m5], m3 |
| pcmpeqw m3, m3 |
| vpgatherdd m5, [scalingq+m6], m3 |
| pcmpeqw m3, m3 |
| vpgatherdd m6, [scalingq+m7], m3 |
| pand m9, m10 |
| pand m4, m10 |
| pand m5, m10 |
| pand m6, m10 |
| packusdw m9, m4 |
| packusdw m5, m6 |
| |
| ; grain = grain_lut[offy+y][offx+x] |
| movu m3, [grain_lutq+offxyq] |
| movu m6, [grain_lutq+top_offxyq] |
| movd xm4, [grain_lutq+left_offxyq] |
| movd xm7, [grain_lutq+topleft_offxyq] |
| ; do h interpolation first (so top | top/left -> top, left | cur -> cur) |
| punpcklbw xm4, xm3 |
| punpcklbw xm7, xm6 |
| pmaddubsw xm4, xm15, xm4 |
| pmaddubsw xm7, xm15, xm7 |
| pmulhrsw xm4, xm14 |
| pmulhrsw xm7, xm14 |
| packsswb xm4, xm4 |
| packsswb xm7, xm7 |
| vpblendw xm4, xm3, 11111110b |
| vpblendw xm7, xm6, 11111110b |
| vpblendd m3, m4, 00001111b |
| vpblendd m6, m7, 00001111b |
| ; followed by v interpolation (top | cur -> cur) |
| punpckhbw m7, m6, m3 |
| punpcklbw m6, m3 |
| pmaddubsw m7, m8, m7 |
| pmaddubsw m6, m8, m6 |
| pmulhrsw m7, m14 |
| pmulhrsw m6, m14 |
| packsswb m3, m6, m7 |
| pcmpgtb m7, m2, m3 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| |
| ; noise = round2(scaling[src] * grain, scaling_shift) |
| pmullw m2, m9 |
| pmullw m3, m5 |
| pmulhrsw m2, m11 |
| pmulhrsw m3, m11 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| packuswb m0, m1 |
| mova [dstq+srcq], m0 |
| |
| vpbroadcastw m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line |
| add srcq, strideq |
| add grain_lutq, 82 |
| dec hw |
| jz .end_y_hv_overlap |
| ; 2 lines get vertical overlap, then fall back to non-overlap code for |
| ; remaining (up to) 30 lines |
| xor hd, 0x10000 |
| test hd, 0x10000 |
| jnz .loop_y_hv_overlap |
| jmp .loop_y_h_overlap |
| |
| .end_y_hv_overlap: |
| add wq, 32 |
| lea srcq, [src_bakq+wq] |
| jl .loop_x_hv_overlap |
| |
| .end_hv: |
| RET |
| |
; fguv_32x32xn_i420: entry point plus the loop-invariant setup shared by both
; expansions of FGUV_32x32xN_LOOP that follow.
; Registers prepared here and read by the row loops:
;   m10 = 0x000000ff in every dword (mask applied after each scaling gather)
;   m11 = word picked from mul_bits by fg_data.scaling_shift (pmulhrsw factor)
;   m13 = word picked from min by fg_data.clip_to_restricted_range (lower clamp)
;   m12 = word picked from max by (clip_to_restricted_range << is_id) (upper clamp)
; NOTE(review): the mul_bits/min/max tables are defined outside this excerpt;
; confirm their contents and the valid scaling_shift range there.
| cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ |
| grain_lut, h, sby, luma, lstride, uv_pl, is_id |
; all-ones, then logical shift right by 24 => 0xff in each dword
| pcmpeqw m10, m10 |
| psrld m10, 24 |
| mov r7d, [fg_dataq+FGData.scaling_shift] |
; r8 is the anchor for the "base"-relative table loads in this prologue
| lea r8, [pb_mask] |
| %define base r8-pb_mask |
| vpbroadcastw m11, [base+mul_bits+r7*2-14] |
| mov r7d, [fg_dataq+FGData.clip_to_restricted_range] |
| mov r9d, dword is_idm |
| vpbroadcastw m13, [base+min+r7*2] |
; index into max is clip << is_id
| shlx r7d, r7d, r9d |
| vpbroadcastw m12, [base+max+r7*2] |
| |
; chroma_scaling_from_luma != 0 jumps to .csfl (macro expanded with %1 = 0);
; otherwise execution falls through into the expansion with %1 = 1
| cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 |
| jne .csfl |
| |
; FGUV_32x32xN_LOOP %1
;   %1 != 0: the scaling index is built from averaged luma and chroma using
;            fg_data.uv_luma_mult / uv_mult / uv_offset (see the %if %1 blocks
;            inside the row loops).
;   %1 == 0: the averaged luma is used as the scaling index directly.
; This first part loads the per-expansion constants, chooses between the
; vertical-overlap and non-overlap paths, derives the row seed for the
; non-overlap path and sets up the column pointers.
| %macro FGUV_32x32xN_LOOP 1 ; not-csfl |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap |
| |
| %if %1 |
; r11m is the uv_pl argument slot; it indexes the per-plane FGData arrays
| mov r7d, dword r11m |
| vpbroadcastb m0, [fg_dataq+FGData.uv_mult+r7*4] |
| vpbroadcastb m1, [fg_dataq+FGData.uv_luma_mult+r7*4] |
; m14 = byte pairs { uv_luma_mult, uv_mult }, matching the { luma, chroma }
; byte interleave fed to pmaddubsw in the row loops
| punpcklbw m14, m1, m0 |
| vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] |
| %else |
; m14/m15 are free in this variant, so they cache the overlap blend constants
| vpbroadcastd m14, [pw_1024] |
| vpbroadcastd m15, [pb_23_22] |
| %endif |
| |
| mov overlapd, [fg_dataq+FGData.overlap_flag] |
| movifnidn sbyd, sbym |
; r7b = (sby != 0); vertical overlap needs both that and bit 0 of overlap_flag
| test sbyd, sbyd |
| setnz r7b |
| test r7b, overlapb |
| jnz %%vertical_overlap |
| |
; Non-overlap seed: two byte-sized hashes of sby are computed with a single
; multiply (sby*173+105 in the top byte, sby*37+178 in the low bits), rotated
; into the low 16 bits, then XORed with fg_data.seed.
| imul seed, sbyd, (173 << 24) | 37 |
| add seed, (105 << 24) | 178 |
| rol seed, 8 |
| movzx seed, seew |
| xor seed, [fg_dataq+FGData.seed] |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| unused2, unused3, see, overlap, unused4, unused5, lstride |
| |
; r9m is the luma argument slot, r10m the lstride slot. The end-of-row
; pointers src+w and dst+w are parked in the r11m/r12m argument slots
; (uv_pl/is_id, no longer read after this point) and luma+2*w is kept in r14;
; w is then negated so it counts up towards zero.
| mov lumaq, r9mp |
| lea r12, [srcq+wq] |
| lea r13, [dstq+wq] |
| lea r14, [lumaq+wq*2] |
| mov r11mp, r12 |
| mov r12mp, r13 |
| mov lstrideq, r10mp |
| neg wq |
| |
; Non-overlap path: one iteration of %%loop_x handles a 16-pixel-wide chroma
; column, %%loop_y handles two chroma rows per iteration.
| %%loop_x: |
; Advance the 16-bit seed: the OR forces every bit except 0, 1, 3 and 12 to
; one, so the parity of (low byte AND high byte) depends only on those four
; seed bits. Even parity keeps seed >> 1; odd parity also sets bit 15.
| mov r6d, seed |
| or seed, 0xEFF4 |
| shr r6d, 1 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| offx, offy, see, overlap, unused1, unused2, lstride |
| |
; offx = seed >> 12, offy = (seed >> 8) & 0xf; 82 is the grain_lut row pitch
; (the row loop below steps grain_lutq by 82 per row)
| mov offxd, seed |
| rorx offyd, seed, 8 |
| shr offxd, 12 |
| and offyd, 0xf |
| imul offyd, 82 |
| lea offyq, [offyq+offxq+498] ; offy*stride+offx |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| h, offxy, see, overlap, unused1, unused2, lstride |
| |
; h and grain_lut are reloaded from their argument slots for every column
| mov hd, hm |
| mov grain_lutq, grain_lutmp |
| %%loop_y: |
| ; src |
; low lanes: 32 luma bytes and 16 chroma bytes of the first row; high lanes:
; the same for the second chroma row, whose luma sits 2*lstride further on.
; pmaddubsw by pb_1 adds horizontally adjacent luma bytes and pavgw against
; zero halves the sum with rounding.
; NOTE(review): mova implies 16-byte aligned luma/src/dst rows - confirm
; with the caller.
| mova xm4, [lumaq+lstrideq*0+ 0] |
| mova xm6, [lumaq+lstrideq*0+16] |
| mova xm0, [srcq] |
| vpbroadcastd m7, [pb_1] |
| vinserti128 m4, [lumaq+lstrideq*2 +0], 1 |
| vinserti128 m6, [lumaq+lstrideq*2+16], 1 |
| vinserti128 m0, [srcq+strideq], 1 |
| pxor m2, m2 |
| pmaddubsw m4, m7 |
| pmaddubsw m6, m7 |
| pavgw m4, m2 |
| pavgw m6, m2 |
| |
| %if %1 |
; index = clip_u8(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + uv_offset)
| packuswb m4, m6 ; luma |
| punpckhbw m6, m4, m0 |
| punpcklbw m4, m0 ; { luma, chroma } |
| pmaddubsw m6, m14 |
| pmaddubsw m4, m14 |
| psraw m6, 6 |
| psraw m4, 6 |
| paddw m6, m15 |
| paddw m4, m15 |
| packuswb m4, m6 ; pack+unpack = clip |
| punpckhbw m6, m4, m2 |
| punpcklbw m4, m2 |
| %endif |
| |
| punpckhwd m5, m4, m2 |
| punpcklwd m4, m2 |
| punpckhwd m7, m6, m2 |
| punpcklwd m6, m2 ; m4-7: luma_src as dword |
| |
| ; scaling[luma_src] |
; each gather clears its mask register, so the all-ones masks in m3/m9 are
; rebuilt before every pair; the gathers fetch whole dwords and the pand
; with m10 keeps only the addressed byte
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m8, [scalingq+m4], m3 |
| vpgatherdd m4, [scalingq+m5], m9 |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m5, [scalingq+m6], m3 |
| vpgatherdd m6, [scalingq+m7], m9 |
| pand m8, m10 |
| pand m4, m10 |
| pand m5, m10 |
| pand m6, m10 |
| packusdw m8, m4 |
| packusdw m5, m6 |
| |
| ; unpack chroma_source |
| punpckhbw m1, m0, m2 |
| punpcklbw m0, m2 ; m0-1: src as word |
| |
| ; grain = grain_lut[offy+y][offx+x] |
; two consecutive grain rows; m7 = (0 > grain) supplies the sign bytes that
; widen the grain values to signed words
| movu xm3, [grain_lutq+offxyq+ 0] |
| vinserti128 m3, [grain_lutq+offxyq+82], 1 |
| pcmpgtb m7, m2, m3 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| pmullw m2, m8 |
| pmullw m3, m5 |
| pmulhrsw m2, m11 |
| pmulhrsw m3, m11 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| packuswb m0, m1 |
; low lane -> first row, high lane -> second row
| mova [dstq], xm0 |
| vextracti128 [dstq+strideq], m0, 1 |
| |
| lea srcq, [srcq+strideq*2] |
| lea dstq, [dstq+strideq*2] |
| lea lumaq, [lumaq+lstrideq*4] |
| add grain_lutq, 82*2 |
; loop while rows remain; only the low byte of h is used as the counter
| sub hb, 2 |
| jg %%loop_y |
| |
; next 16-pixel column: w is negative and reaches zero at the right edge;
; src/dst/luma are rebuilt from the saved end-of-row pointers plus w
| add wq, 16 |
| jge %%end |
| mov srcq, r11mp |
| mov dstq, r12mp |
| lea lumaq, [r14+wq*2] |
| add srcq, wq |
| add dstq, wq |
| test overlapd, overlapd |
| jz %%loop_x |
| |
; with overlap enabled, columns after the first blend with their left
; neighbour: sby != 0 selects the h+v variant, otherwise execution falls
; through into %%loop_x_h_overlap
| ; r8m = sbym |
| cmp dword r8m, 0 |
| jne %%loop_x_hv_overlap |
| |
| ; horizontal overlap (without vertical overlap) |
; Same processing as the non-overlap column loop, except that the first grain
; value of every row is replaced by a weighted blend with the grain value 16
; columns into the previous column's grain block.
| %%loop_x_h_overlap: |
; seed update: feedback bit is the parity of seed bits 0, 1, 3 and 12
| mov r6d, seed |
| or seed, 0xEFF4 |
| shr r6d, 1 |
| test seeb, seeh |
| lea seed, [r6+0x8000] |
| cmovp seed, r6d ; updated seed |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| offx, offy, see, left_offxy, unused1, unused2, lstride |
| |
; the offy register still holds the previous column's offxy at this point
| lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx |
| mov offxd, seed |
| rorx offyd, seed, 8 |
| shr offxd, 12 |
| and offyd, 0xf |
| imul offyd, 82 |
| lea offyq, [offyq+offxq+498] ; offy*stride+offx |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| h, offxy, see, left_offxy, unused1, unused2, lstride |
| |
| mov hd, hm |
| mov grain_lutq, grain_lutmp |
| %%loop_y_h_overlap: |
| ; src |
; two chroma rows per iteration (low lane = first row, high lane = second);
; luma pairs are summed with pmaddubsw by pb_1 and halved with rounding
| mova xm4, [lumaq+lstrideq*0+ 0] |
| mova xm6, [lumaq+lstrideq*0+16] |
| mova xm0, [srcq] |
| vpbroadcastd m7, [pb_1] |
| vinserti128 m4, [lumaq+lstrideq*2 +0], 1 |
| vinserti128 m6, [lumaq+lstrideq*2+16], 1 |
| vinserti128 m0, [srcq+strideq], 1 |
| pxor m2, m2 |
| pmaddubsw m4, m7 |
| pmaddubsw m6, m7 |
| pavgw m4, m2 |
| pavgw m6, m2 |
| |
| %if %1 |
; index = clip_u8(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + uv_offset)
| packuswb m4, m6 ; luma |
| punpckhbw m6, m4, m0 |
| punpcklbw m4, m0 ; { luma, chroma } |
| pmaddubsw m6, m14 |
| pmaddubsw m4, m14 |
| psraw m6, 6 |
| psraw m4, 6 |
| paddw m6, m15 |
| paddw m4, m15 |
| packuswb m4, m6 ; pack+unpack = clip |
| punpckhbw m6, m4, m2 |
| punpcklbw m4, m2 |
| %endif |
| |
| punpckhwd m5, m4, m2 |
| punpcklwd m4, m2 |
| punpckhwd m7, m6, m2 |
| punpcklwd m6, m2 ; m4-7: luma_src as dword |
| |
| ; scaling[luma_src] |
; gather masks are cleared by each gather and therefore rebuilt; pand with
; m10 keeps only the low byte of every gathered dword
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m8, [scalingq+m4], m3 |
| vpgatherdd m4, [scalingq+m5], m9 |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m5, [scalingq+m6], m3 |
| vpgatherdd m6, [scalingq+m7], m9 |
| pand m8, m10 |
| pand m4, m10 |
| pand m5, m10 |
| pand m6, m10 |
| packusdw m8, m4 |
| packusdw m5, m6 |
| |
| ; unpack chroma_source |
| punpckhbw m1, m0, m2 |
| punpcklbw m0, m2 ; m0-1: src as word |
| |
| ; grain = grain_lut[offy+y][offx+x] |
; In the %1 variant m14/m15 hold the uv multipliers/offset, so the blend
; weights are reloaded into m6 on every iteration (hence the FIXME).
| %if %1 |
| vpbroadcastd m6, [pb_23_22] ; FIXME |
| %endif |
| movu xm3, [grain_lutq+offxyq+ 0] |
| movd xm4, [grain_lutq+left_offxyq+ 0] |
| vinserti128 m3, [grain_lutq+offxyq+82], 1 |
| vinserti128 m4, [grain_lutq+left_offxyq+82], 1 |
; { left, cur } byte pairs, weighted by the pb_23_22 bytes (23, 22) and
; scaled by 1024 through pmulhrsw, i.e. a rounded shift right by 5
| punpcklbw m4, m3 |
| %if %1 |
| pmaddubsw m4, m6, m4 |
| pmulhrsw m4, [pw_1024] |
| %else |
| pmaddubsw m4, m15, m4 |
| pmulhrsw m4, m14 |
| %endif |
| packsswb m4, m4 |
; blend mask = 0xff in byte 0 of each 128-bit lane only, so just the first
; grain value of each of the two rows takes the blended result
| pcmpeqw m6, m6 ; FIXME |
| psrldq m6, 15 ; FIXME |
| vpblendvb m3, m3, m4, m6 |
; sign-extend the grain bytes to words
| pcmpgtb m7, m2, m3 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| pmullw m2, m8 |
| pmullw m3, m5 |
| pmulhrsw m2, m11 |
| pmulhrsw m3, m11 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| packuswb m0, m1 |
| mova [dstq], xm0 |
| vextracti128 [dstq+strideq], m0, 1 |
| |
| lea srcq, [srcq+strideq*2] |
| lea dstq, [dstq+strideq*2] |
| lea lumaq, [lumaq+lstrideq*4] |
| add grain_lutq, 82*2 |
| sub hb, 2 |
| jg %%loop_y_h_overlap |
| |
; next 16-pixel column, or done once w has counted up to zero
| add wq, 16 |
| jge %%end |
| mov srcq, r11mp |
| mov dstq, r12mp |
| lea lumaq, [r14+wq*2] |
| add srcq, wq |
| add dstq, wq |
| |
; this loop is also entered from %%loop_y_hv_overlap for the rows after the
; first pair, so the next column must return to the h+v variant if sby != 0
| ; r8m = sbym |
| cmp dword r8m, 0 |
| jne %%loop_x_hv_overlap |
| jmp %%loop_x_h_overlap |
| |
| %%end: |
| RET |
| |
; Vertical-overlap path, entered when sby != 0 and bit 0 of overlap_flag is
; set. Only the first 16-pixel column is handled here; later columns fall
; through to the h+v overlap code. In each column only the first row of the
; first row pair is blended with the block above; the remaining rows are
; handed to the plain %%loop_y.
| %%vertical_overlap: |
| DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ |
| sby, see, overlap, unused1, unused2, lstride |
| |
; Build both row seeds at once: the low 16 bits hold the seed of the block
; row above (top_seed), the high 16 bits the seed of the current row.
| movzx sbyd, sbyb |
| imul seed, [fg_dataq+FGData.seed], 0x00010001 |
| imul r7d, sbyd, 173 * 0x00010001 |
| imul sbyd, 37 * 0x01000100 |
| add r7d, (105 << 16) | 188 |
| add sbyd, (178 << 24) | (141 << 8) |
| and r7d, 0x00ff00ff |
| and sbyd, 0xff00ff00 |
| xor seed, r7d |
| xor seed, sbyd ; (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| unused1, unused2, see, overlap, unused3, unused4, lstride |
| |
; r9m/r10m are the luma/lstride argument slots; src+w and dst+w are parked
; in the r11m/r12m slots and luma+2*w in r14; w then counts up to zero
| mov lumaq, r9mp |
| lea r12, [srcq+wq] |
| lea r13, [dstq+wq] |
| lea r14, [lumaq+wq*2] |
| mov r11mp, r12 |
| mov r12mp, r13 |
| mov lstrideq, r10mp |
| neg wq |
| |
| %%loop_x_v_overlap: |
; Advance both 16-bit seeds in one go. setp captures the even-parity flag of
; bits 0, 1, 3 and 12 of each seed; XORing with the forced-one bits 0 and 16
; inverts it into the feedback bit, and the rotate by one moves every
; feedback bit into bit 15 of its own seed.
| ; we assume from the block above that bits 8-15 of r7d are zero'ed |
| mov r6d, seed |
| or seed, 0xeff4eff4 |
| test seeb, seeh |
| setp r7b ; parity of top_seed |
| shr seed, 16 |
| shl r7d, 16 |
| test seeb, seeh |
| setp r7b ; parity of cur_seed |
| or r6d, 0x00010001 |
| xor r7d, r6d |
| rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| offx, offy, see, overlap, top_offxy, unused, lstride |
| |
; both offsets are computed in parallel in the two 16-bit halves; the top
; offset (low half) additionally skips 16 grain rows (16*82)
| rorx offyd, seed, 8 |
| rorx offxd, seed, 12 |
| and offyd, 0xf000f |
| and offxd, 0xf000f |
| imul offyd, 82 |
| ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy |
| lea offyq, [offyq+offxq+0x10001*498+16*82] |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| h, offxy, see, overlap, top_offxy, unused, lstride |
| |
; split the packed pair into top_offxy (low half) and offxy (high half)
| movzx top_offxyd, offxyw |
| shr offxyd, 16 |
| |
| mov hd, hm |
| mov grain_lutq, grain_lutmp |
| %%loop_y_v_overlap: |
| ; src |
; two chroma rows (low lane = first row, high lane = second row); luma pairs
; are summed with pmaddubsw by pb_1 and halved with rounding by pavgw
| mova xm4, [lumaq+lstrideq*0+ 0] |
| mova xm6, [lumaq+lstrideq*0+16] |
| mova xm0, [srcq] |
| vpbroadcastd m7, [pb_1] |
| vinserti128 m4, [lumaq+lstrideq*2 +0], 1 |
| vinserti128 m6, [lumaq+lstrideq*2+16], 1 |
| vinserti128 m0, [srcq+strideq], 1 |
| pxor m2, m2 |
| pmaddubsw m4, m7 |
| pmaddubsw m6, m7 |
| pavgw m4, m2 |
| pavgw m6, m2 |
| |
| %if %1 |
; index = clip_u8(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + uv_offset)
| packuswb m4, m6 ; luma |
| punpckhbw m6, m4, m0 |
| punpcklbw m4, m0 ; { luma, chroma } |
| pmaddubsw m6, m14 |
| pmaddubsw m4, m14 |
| psraw m6, 6 |
| psraw m4, 6 |
| paddw m6, m15 |
| paddw m4, m15 |
| packuswb m4, m6 ; pack+unpack = clip |
| punpckhbw m6, m4, m2 |
| punpcklbw m4, m2 |
| %endif |
| |
| punpckhwd m5, m4, m2 |
| punpcklwd m4, m2 |
| punpckhwd m7, m6, m2 |
| punpcklwd m6, m2 ; m4-7: luma_src as dword |
| |
| ; scaling[luma_src] |
; gather masks are cleared by each gather and therefore rebuilt; pand with
; m10 keeps only the low byte of every gathered dword
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m8, [scalingq+m4], m3 |
| vpgatherdd m4, [scalingq+m5], m9 |
| pcmpeqw m3, m3 |
| pcmpeqw m9, m9 |
| vpgatherdd m5, [scalingq+m6], m3 |
| vpgatherdd m6, [scalingq+m7], m9 |
| pand m8, m10 |
| pand m4, m10 |
| pand m5, m10 |
| pand m6, m10 |
| packusdw m8, m4 |
| packusdw m5, m6 |
| |
| ; unpack chroma_source |
| punpckhbw m1, m0, m2 |
| punpcklbw m0, m2 ; m0-1: src as word |
| |
| ; grain = grain_lut[offy+y][offx+x] |
; in the %1 variant m14/m15 are taken by the uv constants, so the blend
; weights are reloaded into m6 here
| %if %1 |
| vpbroadcastd m6, [pb_23_22] |
| %endif |
; columns 0-7 of the first row go to the low lane and columns 8-15 to the
; high lane, for both the current and the top grain block, so that one
; punpcklbw forms all 16 { top, cur } byte pairs
| movq xm3, [grain_lutq+offxyq] |
| movq xm4, [grain_lutq+top_offxyq] |
| vinserti128 m3, [grain_lutq+offxyq+8], 1 |
| vinserti128 m4, [grain_lutq+top_offxyq+8], 1 |
| punpcklbw m4, m3 |
; weights are the pb_23_22 bytes (23, 22); pmulhrsw by 1024 is a rounded
; shift right by 5
| %if %1 |
| pmaddubsw m4, m6, m4 |
| pmulhrsw m4, [pw_1024] |
| %else |
| pmaddubsw m4, m15, m4 |
| pmulhrsw m4, m14 |
| %endif |
| packsswb m4, m4 |
; gather the two 8-byte halves into the low 128 bits
| vpermq m4, m4, q3120 |
| ; only interpolate first line, insert second line unmodified |
| vinserti128 m3, m4, [grain_lutq+offxyq+82], 1 |
; sign-extend the grain bytes to words
| pcmpgtb m7, m2, m3 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| pmullw m2, m8 |
| pmullw m3, m5 |
| pmulhrsw m2, m11 |
| pmulhrsw m3, m11 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| packuswb m0, m1 |
| mova [dstq], xm0 |
| vextracti128 [dstq+strideq], m0, 1 |
| |
; Only this first row pair is blended vertically; any remaining rows go
; through the plain %%loop_y. That loop processes a row pair before it tests
; h, so it must not be entered once h has reached zero: leave on h <= 0
; (jle), not only on h < 0, otherwise a block of exactly two rows would read
; and write two rows past its end.
| sub hb, 2 |
| jle %%end_y_v_overlap |
| lea srcq, [srcq+strideq*2] |
| lea dstq, [dstq+strideq*2] |
| lea lumaq, [lumaq+lstrideq*4] |
| add grain_lutq, 82*2 |
| jmp %%loop_y |
| |
| %%end_y_v_overlap: |
; next 16-pixel column, or done once w has counted up to zero
| add wq, 16 |
| jge %%end_hv |
| mov srcq, r11mp |
| mov dstq, r12mp |
| lea lumaq, [r14+wq*2] |
| add srcq, wq |
| add dstq, wq |
| |
| ; since fg_dataq.overlap is guaranteed to be set, we never jump |
| ; back to .loop_x_v_overlap, and instead always fall-through to |
| ; h+v overlap |
| |
; Horizontal + vertical overlap: used for every column after the first when
; sby != 0 and overlap is enabled. The first row pair of each column is
; processed here; remaining rows continue in %%loop_y_h_overlap.
| %%loop_x_hv_overlap: |
; advance both 16-bit seeds at once: setp captures the even-parity flag of
; bits 0, 1, 3 and 12 of each seed, the XOR with the forced-one bits 0 and
; 16 inverts it, and the rotate moves each result into bit 15 of its seed
| ; we assume from the block above that bits 8-15 of r7d are zero'ed |
| mov r6d, seed |
| or seed, 0xeff4eff4 |
| test seeb, seeh |
| setp r7b ; parity of top_seed |
| shr seed, 16 |
| shl r7d, 16 |
| test seeb, seeh |
| setp r7b ; parity of cur_seed |
| or r6d, 0x00010001 |
| xor r7d, r6d |
| rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride |
| |
; the offy/top_offxy registers still hold the previous column's offsets;
; the left and top-left neighbours start 16 columns into those blocks
| lea topleft_offxyq, [top_offxyq+16] |
| lea left_offxyq, [offyq+16] |
| rorx offyd, seed, 8 |
| rorx offxd, seed, 12 |
| and offyd, 0xf000f |
| and offxd, 0xf000f |
| imul offyd, 82 |
| ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy |
| lea offyq, [offyq+offxq+0x10001*498+16*82] |
| |
| DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ |
| h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride |
| |
; split the packed pair into top_offxy (low half) and offxy (high half)
| movzx top_offxyd, offxyw |
| shr offxyd, 16 |
| |
| mov hd, hm |
| mov grain_lutq, grain_lutmp |
| %%loop_y_hv_overlap: |
| ; src |
; two chroma rows (low lane = first row, high lane = second row); luma pairs
; are summed with pmaddubsw by pb_1 and halved with rounding by pavgw
| mova xm4, [lumaq+lstrideq*0+ 0] |
| mova xm6, [lumaq+lstrideq*0+16] |
| mova xm0, [srcq] |
| vpbroadcastd m7, [pb_1] |
| vinserti128 m4, [lumaq+lstrideq*2 +0], 1 |
| vinserti128 m6, [lumaq+lstrideq*2+16], 1 |
| vinserti128 m0, [srcq+strideq], 1 |
| pxor m2, m2 |
| pmaddubsw m4, m7 |
| pmaddubsw m6, m7 |
| pavgw m4, m2 |
| pavgw m6, m2 |
| |
| %if %1 |
; index = clip_u8(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + uv_offset)
| packuswb m4, m6 ; luma |
| punpckhbw m6, m4, m0 |
| punpcklbw m4, m0 ; { luma, chroma } |
| pmaddubsw m6, m14 |
| pmaddubsw m4, m14 |
| psraw m6, 6 |
| psraw m4, 6 |
| paddw m6, m15 |
| paddw m4, m15 |
| packuswb m4, m6 ; pack+unpack = clip |
| punpckhbw m6, m4, m2 |
| punpcklbw m4, m2 |
| %endif |
| |
| punpckhwd m5, m4, m2 |
| punpcklwd m4, m2 |
| punpckhwd m7, m6, m2 |
| punpcklwd m6, m2 ; m4-7: luma_src as dword |
| |
| ; scaling[luma_src] |
; gather masks are cleared by each gather and therefore rebuilt; pand with
; m10 keeps only the low byte of every gathered dword
| pcmpeqw m9, m9 |
| pcmpeqw m3, m3 |
| vpgatherdd m8, [scalingq+m4], m9 |
| vpgatherdd m4, [scalingq+m5], m3 |
| pcmpeqw m9, m9 |
| pcmpeqw m3, m3 |
| vpgatherdd m5, [scalingq+m6], m9 |
| vpgatherdd m6, [scalingq+m7], m3 |
| pand m8, m10 |
| pand m4, m10 |
| pand m5, m10 |
| pand m6, m10 |
| packusdw m8, m4 |
| packusdw m5, m6 |
| |
| ; unpack chroma source |
| punpckhbw m1, m0, m2 |
| punpcklbw m0, m2 ; m0-1: src as word |
| |
| ; grain = grain_lut[offy+y][offx+x] |
; in the %1 variant m14/m15 are taken by the uv constants, so the blend
; weights live in m9 and are reloaded after m9 has served as a blend mask
| %if %1 |
| vpbroadcastd m9, [pb_23_22] |
| %endif |
; m3 = current grain, both rows; m6 = top grain for the first row, split
; into columns 0-7 (low lane) and 8-15 (high lane); m4 = left neighbour for
; both rows; m7 = top-left neighbour
| movu xm3, [grain_lutq+offxyq] |
| movq xm6, [grain_lutq+top_offxyq] |
| vinserti128 m3, [grain_lutq+offxyq+82], 1 |
| vinserti128 m6, [grain_lutq+top_offxyq+8], 1 |
| movd xm4, [grain_lutq+left_offxyq] |
| movd xm7, [grain_lutq+topleft_offxyq] |
| vinserti128 m4, [grain_lutq+left_offxyq+82], 1 |
| ; do h interpolation first (so top | top/left -> top, left | cur -> cur) |
| punpcklbw m4, m3 |
| punpcklbw xm7, xm6 |
| %if %1 |
| pmaddubsw m4, m9, m4 |
| pmaddubsw xm7, xm9, xm7 |
| pmulhrsw m4, [pw_1024] |
| pmulhrsw xm7, [pw_1024] |
| %else |
| pmaddubsw m4, m15, m4 |
| pmaddubsw xm7, xm15, xm7 |
| pmulhrsw m4, m14 |
| pmulhrsw xm7, xm14 |
| %endif |
| packsswb m4, m4 |
| packsswb xm7, xm7 |
; m9 = 0xff in byte 0 of each lane: the first grain value of both current
; rows takes the left blend. The shufpd then clears the high-lane mask byte
; so that only byte 0 of the top row takes the top-left blend.
| pcmpeqw m9, m9 ; this is kind of ugly |
| psrldq m9, 15 |
| vpblendvb m3, m3, m4, m9 |
| shufpd m9, m9, m9, 1110b |
| vpblendvb m6, m6, m7, m9 |
; reorder the current rows so the first row's columns 0-7 / 8-15 line up
; with the lane split of the top row in m6
| vpermq m9, m3, q3120 |
| ; followed by v interpolation (top | cur -> cur) |
| punpcklbw m6, m9 |
| %if %1 |
| vpbroadcastd m9, [pb_23_22] |
| pmaddubsw m6, m9, m6 |
| pmulhrsw m6, [pw_1024] |
| %else |
| pmaddubsw m6, m15, m6 |
| pmulhrsw m6, m14 |
| %endif |
| packsswb m6, m6 |
| vpermq m6, m6, q3120 |
; low 128 bits (first row) take the vertically blended grain; the second
; row keeps its horizontally blended values
| vpblendd m3, m3, m6, 00001111b |
; sign-extend the grain bytes to words
| pcmpgtb m7, m2, m3 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| |
| ; noise = round2(scaling[luma_src] * grain, scaling_shift) |
| pmullw m2, m8 |
| pmullw m3, m5 |
| pmulhrsw m2, m11 |
| pmulhrsw m3, m11 |
| |
| ; dst = clip_pixel(src, noise) |
| paddw m0, m2 |
| paddw m1, m3 |
| pmaxsw m0, m13 |
| pmaxsw m1, m13 |
| pminsw m0, m12 |
| pminsw m1, m12 |
| packuswb m0, m1 |
| mova [dstq], xm0 |
| vextracti128 [dstq+strideq], m0, 1 |
| |
| lea srcq, [srcq+strideq*2] |
| lea dstq, [dstq+strideq*2] |
| lea lumaq, [lumaq+lstrideq*4] |
| add grain_lutq, 82*2 |
; only the first row pair gets vertical blending; remaining rows continue
; in the horizontal-overlap row loop, whose column tail returns to
; %%loop_x_hv_overlap because sby != 0
| sub hb, 2 |
| jg %%loop_y_h_overlap |
| |
| %%end_y_hv_overlap: |
; next 16-pixel column, or done once w has counted up to zero
| add wq, 16 |
| jge %%end_hv |
| mov srcq, r11mp |
| mov dstq, r12mp |
| lea lumaq, [r14+wq*2] |
| add srcq, wq |
| add dstq, wq |
| jmp %%loop_x_hv_overlap |
| |
| %%end_hv: |
| RET |
| %endmacro |
| |
; chroma_scaling_from_luma == 0: the prologue falls through into this
; expansion, which maps luma and chroma through uv_luma_mult/uv_mult/uv_offset
; before the scaling lookup (%1 = 1)
| FGUV_32x32xN_LOOP 1 |
; chroma_scaling_from_luma != 0: the prologue jumps here; this expansion uses
; the averaged luma as the scaling index directly (%1 = 0)
| .csfl: |
| FGUV_32x32xN_LOOP 0 |
| |
| %endif ; ARCH_X86_64 |