| ; Copyright © 2018, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 32 |
| |
| wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 |
| wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 |
| wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
| wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 |
| sgr_r_ext: times 16 db 1 |
| times 16 db 9 |
| |
| ; dword version of dav1d_sgr_x_by_x[] for use with gathers; wastes a bit of |
| ; cache but eliminates some shifts in the inner sgr loop, which is an overall win |
| const sgr_x_by_x_avx2 |
| dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 |
| dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8 |
| dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5 |
| dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 |
| dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3 |
| dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 |
| dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1 |
| dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 |
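| ; with base = sgr_x_by_x_avx2 + 256*4 and the gather index computed as |
| ; min(z, 255) - 256, each vpgatherdd lane below effectively performs the |
| ; C-style lookup (a sketch of the intent, not a transcription of dav1d's C): |
| ; x = sgr_x_by_x_avx2[min(z, 255)]; // one dword load per lane, no byte |
| ; // extraction or extra shift in the loop |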
| |
| times 4 db -1 ; needed for 16-bit sgr |
| pb_m5: times 4 db -5 |
| pb_3: times 4 db 3 |
| pw_5_6: dw 5, 6 |
| |
| sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 |
| sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 |
| db 9, -1, 10, -1, 11, -1, 12, -1 |
| |
| pw_256: times 2 dw 256 |
| pw_2056: times 2 dw 2056 |
| pw_m16380: times 2 dw -16380 |
| pd_25: dd 25 |
| pd_34816: dd 34816 |
| pd_m4096: dd -4096 |
| pd_0xf00801c7: dd 0xf00801c7 |
| pd_0xf00800a4: dd 0xf00800a4 |
| |
| SECTION .text |
| |
| DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers |
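| ; t0-t6 act as a ring of row pointers into the stack buffer: each .hv call |
| ; writes the newly filtered row through t0 and then rotates the pointers |
| ; (t6 <- t5 <- ... <- t1 <- t0), so the vertical pass always has the last |
| ; 7 (or 5) horizontally filtered rows available without copying any data |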
| |
| INIT_YMM avx2 |
| cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ |
| w, h, edge, flt |
| mov fltq, r6mp |
| movifnidn hd, hm |
| mov edged, r7m |
| mov wd, wm |
| vbroadcasti128 m6, [wiener_shufA] |
| vpbroadcastb m11, [fltq+ 0] ; x0 x0 |
| vbroadcasti128 m7, [wiener_shufB] |
| vpbroadcastd m12, [fltq+ 2] |
| vbroadcasti128 m8, [wiener_shufC] |
| packsswb m12, m12 ; x1 x2 |
| vpbroadcastw m13, [fltq+ 6] ; x3 |
| vbroadcasti128 m9, [sgr_shuf+6] |
| add lpfq, wq |
| vpbroadcastd m10, [pw_m16380] |
| vpbroadcastd m14, [fltq+16] ; y0 y1 |
| add dstq, wq |
| vpbroadcastd m15, [fltq+20] ; y2 y3 |
| lea t1, [rsp+wq*2+16] |
| psllw m14, 5 |
| neg wq |
| psllw m15, 5 |
| test edgeb, 4 ; LR_HAVE_TOP |
| jz .no_top |
| call .h_top |
| add lpfq, strideq |
| mov t6, t1 |
| mov t5, t1 |
| add t1, 384*2 |
| call .h_top |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| mov t4, t1 |
| add t1, 384*2 |
| add r10, strideq |
| mov [rsp], r10 ; below |
| call .h |
| mov t3, t1 |
| mov t2, t1 |
| dec hd |
| jz .v1 |
| add lpfq, strideq |
| add t1, 384*2 |
| call .h |
| mov t2, t1 |
| dec hd |
| jz .v2 |
| add lpfq, strideq |
| add t1, 384*2 |
| call .h |
| dec hd |
| jz .v3 |
| .main: |
| lea t0, [t1+384*2] |
| .main_loop: |
| call .hv |
| dec hd |
| jnz .main_loop |
| test edgeb, 8 ; LR_HAVE_BOTTOM |
| jz .v3 |
| mov lpfq, [rsp] |
| call .hv_bottom |
| add lpfq, strideq |
| call .hv_bottom |
| .v1: |
| call .v |
| RET |
| .no_top: |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| lea r10, [r10+strideq*2] |
| mov [rsp], r10 |
| call .h |
| mov t6, t1 |
| mov t5, t1 |
| mov t4, t1 |
| mov t3, t1 |
| mov t2, t1 |
| dec hd |
| jz .v1 |
| add lpfq, strideq |
| add t1, 384*2 |
| call .h |
| mov t2, t1 |
| dec hd |
| jz .v2 |
| add lpfq, strideq |
| add t1, 384*2 |
| call .h |
| dec hd |
| jz .v3 |
| lea t0, [t1+384*2] |
| call .hv |
| dec hd |
| jz .v3 |
| add t0, 384*8 |
| call .hv |
| dec hd |
| jnz .main |
| .v3: |
| call .v |
| .v2: |
| call .v |
| jmp .v1 |
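| ; .extend_right builds per-byte pshufb indices from the (negative) remaining |
| ; width in r10d and clamps them with pminub, so every lane past the last |
| ; valid pixel re-reads that pixel, i.e. the row is padded by replication |
| ; beyond the right edge |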
| .extend_right: |
| movd xm2, r10d |
| vpbroadcastd m0, [pb_3] |
| vpbroadcastd m1, [pb_m5] |
| vpbroadcastb m2, xm2 |
| movu m3, [pb_0to31] |
| psubb m0, m2 |
| psubb m1, m2 |
| pminub m0, m3 |
| pminub m1, m3 |
| pshufb m4, m0 |
| pshufb m5, m1 |
| ret |
| .h: |
| mov r10, wq |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| movd xm4, [leftq] |
| vpblendd m4, [lpfq+r10-4], 0xfe |
| add leftq, 4 |
| jmp .h_main |
| .h_extend_left: |
| vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located |
| mova m4, [lpfq+r10] ; before the start of the buffer |
| palignr m4, m5, 12 |
| pshufb m4, [wiener_l_shuf] |
| jmp .h_main |
| .h_top: |
| mov r10, wq |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| .h_loop: |
| movu m4, [lpfq+r10-4] |
| .h_main: |
| movu m5, [lpfq+r10+4] |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .h_have_right |
| cmp r10d, -34 |
| jl .h_have_right |
| call .extend_right |
| .h_have_right: |
| pshufb m0, m4, m6 |
| pmaddubsw m0, m11 |
| pshufb m1, m5, m6 |
| pmaddubsw m1, m11 |
| pshufb m2, m4, m7 |
| pmaddubsw m2, m12 |
| pshufb m3, m5, m7 |
| pmaddubsw m3, m12 |
| paddw m0, m2 |
| pshufb m2, m4, m8 |
| pmaddubsw m2, m12 |
| paddw m1, m3 |
| pshufb m3, m5, m8 |
| pmaddubsw m3, m12 |
| pshufb m4, m9 |
| paddw m0, m2 |
| pmullw m2, m4, m13 |
| pshufb m5, m9 |
| paddw m1, m3 |
| pmullw m3, m5, m13 |
| psllw m4, 7 |
| psllw m5, 7 |
| paddw m4, m10 |
| paddw m5, m10 |
| paddw m0, m2 |
| vpbroadcastd m2, [pw_2056] |
| paddw m1, m3 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| psraw m0, 3 |
| psraw m1, 3 |
| paddw m0, m2 |
| paddw m1, m2 |
| mova [t1+r10*2+ 0], m0 |
| mova [t1+r10*2+32], m1 |
| add r10, 32 |
| jl .h_loop |
| ret |
| ALIGN function_align |
| .hv: |
| add lpfq, strideq |
| mov r10, wq |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv_extend_left |
| movd xm4, [leftq] |
| vpblendd m4, [lpfq+r10-4], 0xfe |
| add leftq, 4 |
| jmp .hv_main |
| .hv_extend_left: |
| movu m4, [lpfq+r10-4] |
| pshufb m4, [wiener_l_shuf] |
| jmp .hv_main |
| .hv_bottom: |
| mov r10, wq |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv_extend_left |
| .hv_loop: |
| movu m4, [lpfq+r10-4] |
| .hv_main: |
| movu m5, [lpfq+r10+4] |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .hv_have_right |
| cmp r10d, -34 |
| jl .hv_have_right |
| call .extend_right |
| .hv_have_right: |
| pshufb m0, m4, m6 |
| pmaddubsw m0, m11 |
| pshufb m1, m5, m6 |
| pmaddubsw m1, m11 |
| pshufb m2, m4, m7 |
| pmaddubsw m2, m12 |
| pshufb m3, m5, m7 |
| pmaddubsw m3, m12 |
| paddw m0, m2 |
| pshufb m2, m4, m8 |
| pmaddubsw m2, m12 |
| paddw m1, m3 |
| pshufb m3, m5, m8 |
| pmaddubsw m3, m12 |
| pshufb m4, m9 |
| paddw m0, m2 |
| pmullw m2, m4, m13 |
| pshufb m5, m9 |
| paddw m1, m3 |
| pmullw m3, m5, m13 |
| psllw m4, 7 |
| psllw m5, 7 |
| paddw m4, m10 |
| paddw m5, m10 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova m2, [t4+r10*2] |
| paddw m2, [t2+r10*2] |
| mova m3, [t3+r10*2] |
| paddsw m0, m4 |
| vpbroadcastd m4, [pw_2056] |
| paddsw m1, m5 |
| mova m5, [t5+r10*2] |
| paddw m5, [t1+r10*2] |
| psraw m0, 3 |
| psraw m1, 3 |
| paddw m0, m4 |
| paddw m1, m4 |
| paddw m4, m0, [t6+r10*2] |
| mova [t0+r10*2], m0 |
| punpcklwd m0, m2, m3 |
| pmaddwd m0, m15 |
| punpckhwd m2, m3 |
| pmaddwd m2, m15 |
| punpcklwd m3, m4, m5 |
| pmaddwd m3, m14 |
| punpckhwd m4, m5 |
| pmaddwd m4, m14 |
| paddd m0, m3 |
| paddd m4, m2 |
| mova m2, [t4+r10*2+32] |
| paddw m2, [t2+r10*2+32] |
| mova m3, [t3+r10*2+32] |
| mova m5, [t5+r10*2+32] |
| paddw m5, [t1+r10*2+32] |
| packuswb m0, m4 |
| paddw m4, m1, [t6+r10*2+32] |
| mova [t0+r10*2+32], m1 |
| punpcklwd m1, m2, m3 |
| pmaddwd m1, m15 |
| punpckhwd m2, m3 |
| pmaddwd m2, m15 |
| punpcklwd m3, m4, m5 |
| pmaddwd m3, m14 |
| punpckhwd m4, m5 |
| pmaddwd m4, m14 |
| paddd m1, m3 |
| paddd m2, m4 |
| packuswb m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| mova [dstq+r10], m0 |
| add r10, 32 |
| jl .hv_loop |
| mov t6, t5 |
| mov t5, t4 |
| mov t4, t3 |
| mov t3, t2 |
| mov t2, t1 |
| mov t1, t0 |
| mov t0, t6 |
| add dstq, strideq |
| ret |
| .v: |
| mov r10, wq |
| .v_loop: |
| mova m2, [t4+r10*2+ 0] |
| paddw m2, [t2+r10*2+ 0] |
| mova m4, [t3+r10*2+ 0] |
| mova m6, [t1+r10*2+ 0] |
| paddw m8, m6, [t6+r10*2+ 0] |
| paddw m6, [t5+r10*2+ 0] |
| mova m3, [t4+r10*2+32] |
| paddw m3, [t2+r10*2+32] |
| mova m5, [t3+r10*2+32] |
| mova m7, [t1+r10*2+32] |
| paddw m9, m7, [t6+r10*2+32] |
| paddw m7, [t5+r10*2+32] |
| punpcklwd m0, m2, m4 |
| pmaddwd m0, m15 |
| punpckhwd m2, m4 |
| pmaddwd m2, m15 |
| punpcklwd m4, m8, m6 |
| pmaddwd m4, m14 |
| punpckhwd m6, m8, m6 |
| pmaddwd m6, m14 |
| punpcklwd m1, m3, m5 |
| pmaddwd m1, m15 |
| punpckhwd m3, m5 |
| pmaddwd m3, m15 |
| punpcklwd m5, m9, m7 |
| pmaddwd m5, m14 |
| punpckhwd m7, m9, m7 |
| pmaddwd m7, m14 |
| paddd m0, m4 |
| paddd m2, m6 |
| paddd m1, m5 |
| paddd m3, m7 |
| packuswb m0, m2 |
| packuswb m1, m3 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| mova [dstq+r10], m0 |
| add r10, 32 |
| jl .v_loop |
| mov t6, t5 |
| mov t5, t4 |
| mov t4, t3 |
| mov t3, t2 |
| mov t2, t1 |
| add dstq, strideq |
| ret |
| |
| cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ |
| w, h, edge, flt |
| mov fltq, r6mp |
| movifnidn hd, hm |
| mov edged, r7m |
| mov wd, wm |
| vbroadcasti128 m6, [wiener_shufB] |
| vpbroadcastd m12, [fltq+ 2] |
| vbroadcasti128 m7, [wiener_shufC] |
| packsswb m12, m12 ; x1 x2 |
| vpbroadcastw m13, [fltq+ 6] ; x3 |
| vbroadcasti128 m8, [sgr_shuf+6] |
| add lpfq, wq |
| vpbroadcastd m9, [pw_m16380] |
| vpbroadcastd m10, [pw_2056] |
| mova m11, [wiener_l_shuf] |
| vpbroadcastd m14, [fltq+16] ; __ y1 |
| add dstq, wq |
| vpbroadcastd m15, [fltq+20] ; y2 y3 |
| lea t1, [rsp+wq*2+16] |
| psllw m14, 5 |
| neg wq |
| psllw m15, 5 |
| test edgeb, 4 ; LR_HAVE_TOP |
| jz .no_top |
| call .h_top |
| add lpfq, strideq |
| mov t4, t1 |
| add t1, 384*2 |
| call .h_top |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| mov t3, t1 |
| add t1, 384*2 |
| add r10, strideq |
| mov [rsp], r10 ; below |
| call .h |
| mov t2, t1 |
| dec hd |
| jz .v1 |
| add lpfq, strideq |
| add t1, 384*2 |
| call .h |
| dec hd |
| jz .v2 |
| .main: |
| mov t0, t4 |
| .main_loop: |
| call .hv |
| dec hd |
| jnz .main_loop |
| test edgeb, 8 ; LR_HAVE_BOTTOM |
| jz .v2 |
| mov lpfq, [rsp] |
| call .hv_bottom |
| add lpfq, strideq |
| call .hv_bottom |
| .end: |
| RET |
| .no_top: |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| lea r10, [r10+strideq*2] |
| mov [rsp], r10 |
| call .h |
| mov t4, t1 |
| mov t3, t1 |
| mov t2, t1 |
| dec hd |
| jz .v1 |
| add lpfq, strideq |
| add t1, 384*2 |
| call .h |
| dec hd |
| jz .v2 |
| lea t0, [t1+384*2] |
| call .hv |
| dec hd |
| jz .v2 |
| add t0, 384*6 |
| call .hv |
| dec hd |
| jnz .main |
| .v2: |
| call .v |
| mov t4, t3 |
| mov t3, t2 |
| mov t2, t1 |
| add dstq, strideq |
| .v1: |
| call .v |
| jmp .end |
| .h: |
| mov r10, wq |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| movd xm4, [leftq] |
| vpblendd m4, [lpfq+r10-4], 0xfe |
| add leftq, 4 |
| jmp .h_main |
| .h_extend_left: |
| vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located |
| mova m4, [lpfq+r10] ; before the start of the buffer |
| palignr m4, m5, 12 |
| pshufb m4, m11 |
| jmp .h_main |
| .h_top: |
| mov r10, wq |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| .h_loop: |
| movu m4, [lpfq+r10-4] |
| .h_main: |
| movu m5, [lpfq+r10+4] |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .h_have_right |
| cmp r10d, -33 |
| jl .h_have_right |
| call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right |
| .h_have_right: |
| pshufb m0, m4, m6 |
| pmaddubsw m0, m12 |
| pshufb m1, m5, m6 |
| pmaddubsw m1, m12 |
| pshufb m2, m4, m7 |
| pmaddubsw m2, m12 |
| pshufb m3, m5, m7 |
| pmaddubsw m3, m12 |
| pshufb m4, m8 |
| paddw m0, m2 |
| pmullw m2, m4, m13 |
| pshufb m5, m8 |
| paddw m1, m3 |
| pmullw m3, m5, m13 |
| psllw m4, 7 |
| psllw m5, 7 |
| paddw m4, m9 |
| paddw m5, m9 |
| paddw m0, m2 |
| paddw m1, m3 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| psraw m0, 3 |
| psraw m1, 3 |
| paddw m0, m10 |
| paddw m1, m10 |
| mova [t1+r10*2+ 0], m0 |
| mova [t1+r10*2+32], m1 |
| add r10, 32 |
| jl .h_loop |
| ret |
| ALIGN function_align |
| .hv: |
| add lpfq, strideq |
| mov r10, wq |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv_extend_left |
| movd xm4, [leftq] |
| vpblendd m4, [lpfq+r10-4], 0xfe |
| add leftq, 4 |
| jmp .hv_main |
| .hv_extend_left: |
| movu m4, [lpfq+r10-4] |
| pshufb m4, m11 |
| jmp .hv_main |
| .hv_bottom: |
| mov r10, wq |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv_extend_left |
| .hv_loop: |
| movu m4, [lpfq+r10-4] |
| .hv_main: |
| movu m5, [lpfq+r10+4] |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .hv_have_right |
| cmp r10d, -33 |
| jl .hv_have_right |
| call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right |
| .hv_have_right: |
| pshufb m0, m4, m6 |
| pmaddubsw m0, m12 |
| pshufb m1, m5, m6 |
| pmaddubsw m1, m12 |
| pshufb m2, m4, m7 |
| pmaddubsw m2, m12 |
| pshufb m3, m5, m7 |
| pmaddubsw m3, m12 |
| pshufb m4, m8 |
| paddw m0, m2 |
| pmullw m2, m4, m13 |
| pshufb m5, m8 |
| paddw m1, m3 |
| pmullw m3, m5, m13 |
| psllw m4, 7 |
| psllw m5, 7 |
| paddw m4, m9 |
| paddw m5, m9 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova m2, [t3+r10*2] |
| paddw m2, [t1+r10*2] |
| mova m3, [t2+r10*2] |
| paddsw m0, m4 |
| paddsw m1, m5 |
| psraw m0, 3 |
| psraw m1, 3 |
| paddw m0, m10 |
| paddw m1, m10 |
| paddw m4, m0, [t4+r10*2] |
| mova [t0+r10*2], m0 |
| punpcklwd m0, m2, m3 |
| pmaddwd m0, m15 |
| punpckhwd m2, m3 |
| pmaddwd m2, m15 |
| punpcklwd m3, m4, m4 |
| pmaddwd m3, m14 |
| punpckhwd m4, m4 |
| pmaddwd m4, m14 |
| paddd m0, m3 |
| paddd m4, m2 |
| mova m2, [t3+r10*2+32] |
| paddw m2, [t1+r10*2+32] |
| mova m3, [t2+r10*2+32] |
| packuswb m0, m4 |
| paddw m4, m1, [t4+r10*2+32] |
| mova [t0+r10*2+32], m1 |
| punpcklwd m1, m2, m3 |
| pmaddwd m1, m15 |
| punpckhwd m2, m3 |
| pmaddwd m2, m15 |
| punpcklwd m3, m4, m4 |
| pmaddwd m3, m14 |
| punpckhwd m4, m4 |
| pmaddwd m4, m14 |
| paddd m1, m3 |
| paddd m2, m4 |
| packuswb m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| mova [dstq+r10], m0 |
| add r10, 32 |
| jl .hv_loop |
| mov t4, t3 |
| mov t3, t2 |
| mov t2, t1 |
| mov t1, t0 |
| mov t0, t4 |
| add dstq, strideq |
| ret |
| .v: |
| mov r10, wq |
| psrld m13, m14, 16 ; y1 __ |
| .v_loop: |
| mova m6, [t1+r10*2+ 0] |
| paddw m2, m6, [t3+r10*2+ 0] |
| mova m4, [t2+r10*2+ 0] |
| mova m7, [t1+r10*2+32] |
| paddw m3, m7, [t3+r10*2+32] |
| mova m5, [t2+r10*2+32] |
| paddw m6, [t4+r10*2+ 0] |
| paddw m7, [t4+r10*2+32] |
| punpcklwd m0, m2, m4 |
| pmaddwd m0, m15 |
| punpckhwd m2, m4 |
| pmaddwd m2, m15 |
| punpcklwd m1, m3, m5 |
| pmaddwd m1, m15 |
| punpckhwd m3, m5 |
| pmaddwd m3, m15 |
| punpcklwd m5, m7, m6 |
| pmaddwd m4, m5, m14 |
| punpckhwd m7, m6 |
| pmaddwd m6, m7, m14 |
| pmaddwd m5, m13 |
| pmaddwd m7, m13 |
| paddd m0, m4 |
| paddd m2, m6 |
| paddd m1, m5 |
| paddd m3, m7 |
| packuswb m0, m2 |
| packuswb m1, m3 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| mova [dstq+r10], m0 |
| add r10, 32 |
| jl .v_loop |
| ret |
| |
| cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \ |
| w, h, edge, params |
| %define base r12-sgr_x_by_x_avx2-256*4 |
| lea r12, [sgr_x_by_x_avx2+256*4] |
| mov paramsq, r6mp |
| mov wd, wm |
| movifnidn hd, hm |
| mov edged, r7m |
| vbroadcasti128 m8, [base+sgr_shuf+0] |
| vbroadcasti128 m9, [base+sgr_shuf+8] |
| add lpfq, wq |
| vbroadcasti128 m10, [base+sgr_shuf+2] |
| add dstq, wq |
| vbroadcasti128 m11, [base+sgr_shuf+6] |
| lea t3, [rsp+wq*4+16+400*12] |
| vpbroadcastd m12, [paramsq+0] ; s0 |
| pxor m6, m6 |
| vpbroadcastw m7, [paramsq+8] ; w0 |
| lea t1, [rsp+wq*2+20] |
| vpbroadcastd m13, [base+pd_0xf00800a4] |
| neg wq |
| vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) |
| psllw m7, 4 |
| vpbroadcastd m15, [base+pd_m4096] |
| test edgeb, 4 ; LR_HAVE_TOP |
| jz .no_top |
| call .h_top |
| add lpfq, strideq |
| mov t2, t1 |
| call .top_fixup |
| add t1, 400*6 |
| call .h_top |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| add r10, strideq |
| mov [rsp], r10 ; below |
| mov t0, t2 |
| dec hd |
| jz .height1 |
| or edged, 16 |
| call .h |
| .main: |
| add lpfq, strideq |
| call .hv |
| call .prep_n |
| sub hd, 2 |
| jl .extend_bottom |
| .main_loop: |
| add lpfq, strideq |
| test hd, hd |
| jz .odd_height |
| call .h |
| add lpfq, strideq |
| call .hv |
| call .n0 |
| call .n1 |
| sub hd, 2 |
| jge .main_loop |
| test edgeb, 8 ; LR_HAVE_BOTTOM |
| jz .extend_bottom |
| mov lpfq, [rsp] |
| call .h_top |
| add lpfq, strideq |
| call .hv_bottom |
| .end: |
| call .n0 |
| call .n1 |
| .end2: |
| RET |
| .height1: |
| call .hv |
| call .prep_n |
| jmp .odd_height_end |
| .odd_height: |
| call .hv |
| call .n0 |
| call .n1 |
| .odd_height_end: |
| call .v |
| call .n0 |
| jmp .end2 |
| .extend_bottom: |
| call .v |
| jmp .end |
| .no_top: |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| lea r10, [r10+strideq*2] |
| mov [rsp], r10 |
| call .h |
| lea t2, [t1+400*6] |
| call .top_fixup |
| dec hd |
| jz .no_top_height1 |
| or edged, 16 |
| mov t0, t1 |
| mov t1, t2 |
| jmp .main |
| .no_top_height1: |
| call .v |
| call .prep_n |
| jmp .odd_height_end |
| .extend_right: |
| movd xm2, r10d |
| mova m0, [sgr_r_ext] |
| vpbroadcastb m2, xm2 |
| psubb m0, m2 |
| pminub m0, [pb_0to31] |
| pshufb m5, m0 |
| ret |
| .h: ; horizontal boxsum |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| vpbroadcastd xm0, [leftq] |
| mova xm5, [lpfq+wq] |
| palignr xm5, xm0, 12 |
| add leftq, 4 |
| jmp .h_main |
| .h_extend_left: |
| mova xm5, [lpfq+wq] |
| pshufb xm5, [base+sgr_l_shuf] |
| jmp .h_main |
| .h_top: |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| .h_loop: |
| movu xm5, [lpfq+r10-2] |
| .h_main: |
| vinserti128 m5, [lpfq+r10+6], 1 |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .h_have_right |
| cmp r10d, -18 |
| jl .h_have_right |
| call .extend_right |
| .h_have_right: |
| pshufb m3, m5, m8 |
| pmullw m4, m3, m3 |
| pshufb m2, m5, m9 |
| paddw m0, m3, m2 |
| shufps m3, m2, q2121 |
| paddw m0, m3 |
| punpcklwd m1, m2, m3 |
| pmaddwd m1, m1 |
| punpckhwd m2, m3 |
| pmaddwd m2, m2 |
| punpcklwd m3, m4, m6 |
| paddd m1, m3 |
| punpckhwd m4, m6 |
| paddd m2, m4 |
| pshufb m4, m5, m10 |
| paddw m0, m4 |
| pshufb m5, m11 |
| paddw m0, m5 ; sum |
| punpcklwd m3, m4, m5 |
| pmaddwd m3, m3 |
| punpckhwd m4, m5 |
| pmaddwd m4, m4 |
| test edgeb, 16 ; y > 0 |
| jz .h_loop_end |
| paddw m0, [t1+r10*2+400*0] |
| paddd m1, [t1+r10*2+400*2] |
| paddd m2, [t1+r10*2+400*4] |
| .h_loop_end: |
| paddd m1, m3 ; sumsq |
| paddd m2, m4 |
| mova [t1+r10*2+400*0], m0 |
| mova [t1+r10*2+400*2], m1 |
| mova [t1+r10*2+400*4], m2 |
| add r10, 16 |
| jl .h_loop |
| ret |
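| ; the row above the topmost row that was read is treated as a duplicate of |
| ; it, so instead of loading another line its box sums are simply doubled |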
| .top_fixup: |
| lea r10, [wq-2] |
| .top_fixup_loop: ; the sums of the first row need to be doubled |
| mova m0, [t1+r10*2+400*0] |
| mova m1, [t1+r10*2+400*2] |
| mova m2, [t1+r10*2+400*4] |
| paddw m0, m0 |
| paddd m1, m1 |
| paddd m2, m2 |
| mova [t2+r10*2+400*0], m0 |
| mova [t2+r10*2+400*2], m1 |
| mova [t2+r10*2+400*4], m2 |
| add r10, 16 |
| jl .top_fixup_loop |
| ret |
| ALIGN function_align |
| .hv: ; horizontal boxsum + vertical boxsum + ab |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv_extend_left |
| vpbroadcastd xm0, [leftq] |
| mova xm5, [lpfq+wq] |
| palignr xm5, xm0, 12 |
| add leftq, 4 |
| jmp .hv_main |
| .hv_extend_left: |
| mova xm5, [lpfq+wq] |
| pshufb xm5, [base+sgr_l_shuf] |
| jmp .hv_main |
| .hv_bottom: |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv_extend_left |
| .hv_loop: |
| movu xm5, [lpfq+r10-2] |
| .hv_main: |
| vinserti128 m5, [lpfq+r10+6], 1 |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .hv_have_right |
| cmp r10d, -18 |
| jl .hv_have_right |
| call .extend_right |
| .hv_have_right: |
| pshufb m1, m5, m8 |
| pmullw m4, m1, m1 |
| pshufb m3, m5, m9 |
| paddw m0, m1, m3 |
| shufps m1, m3, q2121 |
| paddw m0, m1 |
| punpcklwd m2, m3, m1 |
| pmaddwd m2, m2 |
| punpckhwd m3, m1 |
| pmaddwd m3, m3 |
| punpcklwd m1, m4, m6 |
| paddd m2, m1 |
| punpckhwd m4, m6 |
| paddd m3, m4 |
| pshufb m1, m5, m10 |
| paddw m0, m1 |
| pshufb m5, m11 |
| paddw m0, m5 ; h sum |
| punpcklwd m4, m5, m1 |
| pmaddwd m4, m4 |
| punpckhwd m5, m1 |
| pmaddwd m5, m5 |
| paddw m1, m0, [t1+r10*2+400*0] |
| paddd m2, m4 ; h sumsq |
| paddd m3, m5 |
| paddd m4, m2, [t1+r10*2+400*2] |
| paddd m5, m3, [t1+r10*2+400*4] |
| test hd, hd |
| jz .hv_last_row |
| .hv_main2: |
| paddw m1, [t2+r10*2+400*0] ; hv sum |
| paddd m4, [t2+r10*2+400*2] ; hv sumsq |
| paddd m5, [t2+r10*2+400*4] |
| mova [t0+r10*2+400*0], m0 |
| mova [t0+r10*2+400*2], m2 |
| mova [t0+r10*2+400*4], m3 |
| vpbroadcastd m2, [pd_25] |
| punpcklwd m0, m1, m6 ; b |
| punpckhwd m1, m6 |
| pmulld m4, m2 ; a * 25 |
| pmulld m5, m2 |
| pmaddwd m2, m0, m0 ; b * b |
| pmaddwd m3, m1, m1 |
| psubd m4, m2 ; p |
| psubd m5, m3 |
| pmulld m4, m12 ; p * s |
| pmulld m5, m12 |
| pmaddwd m0, m13 ; b * 164 |
| pmaddwd m1, m13 |
| paddusw m4, m13 |
| paddusw m5, m13 |
| psrad m3, m4, 20 ; min(z, 255) - 256 |
| vpgatherdd m2, [r12+m3*4], m4 ; x |
| psrad m4, m5, 20 |
| vpgatherdd m3, [r12+m4*4], m5 |
| pmulld m0, m2 |
| pmulld m1, m3 |
| paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) |
| paddd m1, m14 |
| pand m0, m15 |
| pand m1, m15 |
| por m0, m2 ; a | (b << 12) |
| por m1, m3 |
| mova [t3+r10*4+ 8], xm0 ; The neighbor calculations require |
| vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b. |
| mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but |
| vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way. |
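| ; packed dword layout produced by the pand/por above (pd_m4096 == ~0xfff): |
| ; bits 0..11 a (the x value gathered from sgr_x_by_x_avx2) |
| ; bits 12..31 b (top 20 bits of x * b * 164 + (1 << 11) + (1 << 15)) |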
| add r10, 16 |
| jl .hv_loop |
| mov t2, t1 |
| mov t1, t0 |
| mov t0, t2 |
| ret |
| .hv_last_row: ; esoteric edge case for odd heights |
| mova [t1+r10*2+400*0], m1 |
| paddw m1, m0 |
| mova [t1+r10*2+400*2], m4 |
| paddd m4, m2 |
| mova [t1+r10*2+400*4], m5 |
| paddd m5, m3 |
| jmp .hv_main2 |
| .v: ; vertical boxsum + ab |
| lea r10, [wq-2] |
| .v_loop: |
| mova m0, [t1+r10*2+400*0] |
| mova m2, [t1+r10*2+400*2] |
| mova m3, [t1+r10*2+400*4] |
| paddw m1, m0, [t2+r10*2+400*0] |
| paddd m4, m2, [t2+r10*2+400*2] |
| paddd m5, m3, [t2+r10*2+400*4] |
| paddw m0, m0 |
| paddd m2, m2 |
| paddd m3, m3 |
| paddw m1, m0 ; hv sum |
| paddd m4, m2 ; hv sumsq |
| paddd m5, m3 |
| vpbroadcastd m2, [pd_25] |
| punpcklwd m0, m1, m6 ; b |
| punpckhwd m1, m6 |
| pmulld m4, m2 ; a * 25 |
| pmulld m5, m2 |
| pmaddwd m2, m0, m0 ; b * b |
| pmaddwd m3, m1, m1 |
| psubd m4, m2 ; p |
| psubd m5, m3 |
| pmulld m4, m12 ; p * s |
| pmulld m5, m12 |
| pmaddwd m0, m13 ; b * 164 |
| pmaddwd m1, m13 |
| paddusw m4, m13 |
| paddusw m5, m13 |
| psrad m3, m4, 20 ; min(z, 255) - 256 |
| vpgatherdd m2, [r12+m3*4], m4 ; x |
| psrad m4, m5, 20 |
| vpgatherdd m3, [r12+m4*4], m5 |
| pmulld m0, m2 |
| pmulld m1, m3 |
| paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) |
| paddd m1, m14 |
| pand m0, m15 |
| pand m1, m15 |
| por m0, m2 ; a | (b << 12) |
| por m1, m3 |
| mova [t3+r10*4+ 8], xm0 |
| vextracti128 [t3+r10*4+40], m0, 1 |
| mova [t3+r10*4+24], xm1 |
| vextracti128 [t3+r10*4+56], m1, 1 |
| add r10, 16 |
| jl .v_loop |
| ret |
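| ; the 5x5 neighbor weighting per ab row is 5/6/5: the loops below compute |
| ; 4*(l+c+r) + ((l+c+r) + c) = 5*l + 6*c + 5*r per packed ab dword; even |
| ; output rows then sum two such rows (.n0, >> 9) while odd rows reuse a |
| ; single one (.n1, >> 8) |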
| .prep_n: ; initial neighbor setup |
| mov r10, wq |
| .prep_n_loop: |
| movu m0, [t3+r10*4+ 4] |
| movu m1, [t3+r10*4+36] |
| paddd m2, m0, [t3+r10*4+ 0] |
| paddd m3, m1, [t3+r10*4+32] |
| paddd m2, [t3+r10*4+ 8] |
| paddd m3, [t3+r10*4+40] |
| paddd m0, m2 |
| pslld m2, 2 |
| paddd m1, m3 |
| pslld m3, 2 |
| paddd m2, m0 ; ab 565 |
| paddd m3, m1 |
| pandn m0, m15, m2 ; a |
| psrld m2, 12 ; b |
| pandn m1, m15, m3 |
| psrld m3, 12 |
| mova [t3+r10*4+400*4+ 0], m0 |
| mova [t3+r10*4+400*8+ 0], m2 |
| mova [t3+r10*4+400*4+32], m1 |
| mova [t3+r10*4+400*8+32], m3 |
| add r10, 16 |
| jl .prep_n_loop |
| ret |
| ALIGN function_align |
| .n0: ; neighbor + output (even rows) |
| mov r10, wq |
| .n0_loop: |
| movu m0, [t3+r10*4+ 4] |
| movu m1, [t3+r10*4+36] |
| paddd m2, m0, [t3+r10*4+ 0] |
| paddd m3, m1, [t3+r10*4+32] |
| paddd m2, [t3+r10*4+ 8] |
| paddd m3, [t3+r10*4+40] |
| paddd m0, m2 |
| pslld m2, 2 |
| paddd m1, m3 |
| pslld m3, 2 |
| paddd m2, m0 |
| paddd m3, m1 |
| pandn m0, m15, m2 |
| psrld m2, 12 |
| pandn m1, m15, m3 |
| psrld m3, 12 |
| paddd m4, m0, [t3+r10*4+400*4+ 0] ; a |
| paddd m5, m1, [t3+r10*4+400*4+32] |
| mova [t3+r10*4+400*4+ 0], m0 |
| mova [t3+r10*4+400*4+32], m1 |
| paddd m0, m2, [t3+r10*4+400*8+ 0] ; b |
| paddd m1, m3, [t3+r10*4+400*8+32] |
| mova [t3+r10*4+400*8+ 0], m2 |
| mova [t3+r10*4+400*8+32], m3 |
| pmovzxbd m2, [dstq+r10+0] |
| pmovzxbd m3, [dstq+r10+8] |
| pmaddwd m4, m2 ; a * src |
| pmaddwd m5, m3 |
| packssdw m2, m3 |
| psubd m0, m4 ; b - a * src + (1 << 8) |
| psubd m1, m5 |
| psrad m0, 9 |
| psrad m1, 9 |
| packssdw m0, m1 |
| pmulhrsw m0, m7 |
| paddw m0, m2 |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| pshufd xm0, xm0, q3120 |
| mova [dstq+r10], xm0 |
| add r10, 16 |
| jl .n0_loop |
| add dstq, strideq |
| ret |
| ALIGN function_align |
| .n1: ; neighbor + output (odd rows) |
| mov r10, wq |
| .n1_loop: |
| pmovzxbd m2, [dstq+r10+0] |
| pmovzxbd m3, [dstq+r10+8] |
| pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src |
| pmaddwd m5, m3, [t3+r10*4+400*4+32] |
| mova m0, [t3+r10*4+400*8+ 0] ; b |
| mova m1, [t3+r10*4+400*8+32] |
| packssdw m2, m3 |
| psubd m0, m4 ; b - a * src + (1 << 7) |
| psubd m1, m5 |
| psrad m0, 8 |
| psrad m1, 8 |
| packssdw m0, m1 |
| pmulhrsw m0, m7 |
| paddw m0, m2 |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| pshufd xm0, xm0, q3120 |
| mova [dstq+r10], xm0 |
| add r10, 16 |
| jl .n1_loop |
| add dstq, strideq |
| ret |
| |
| cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \ |
| w, h, edge, params |
| %define base r14-sgr_x_by_x_avx2-256*4 |
| mov paramsq, r6mp |
| mov wd, wm |
| movifnidn hd, hm |
| mov edged, r7m |
| lea r14, [sgr_x_by_x_avx2+256*4] |
| vbroadcasti128 m8, [base+sgr_shuf+2] |
| add lpfq, wq |
| vbroadcasti128 m9, [base+sgr_shuf+4] |
| add dstq, wq |
| vbroadcasti128 m10, [base+sgr_shuf+6] |
| lea t3, [rsp+wq*4+16+400*12] |
| vpbroadcastd m11, [paramsq+ 4] ; s1 |
| pxor m6, m6 |
| vpbroadcastw m7, [paramsq+10] ; w1 |
| lea t1, [rsp+wq*2+20] |
| vpbroadcastd m12, [base+pd_0xf00801c7] |
| neg wq |
| vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) |
| psllw m7, 4 |
| vpbroadcastd m14, [base+pd_m4096] |
| test edgeb, 4 ; LR_HAVE_TOP |
| jz .no_top |
| call .h_top |
| add lpfq, strideq |
| mov t2, t1 |
| add t1, 400*6 |
| call .h_top |
| lea t4, [lpfq+strideq*4] |
| mov lpfq, dstq |
| add t4, strideq |
| mov [rsp], t4 ; below |
| mov t0, t2 |
| call .hv |
| .main: |
| mov t5, t3 |
| add t3, 400*4 |
| dec hd |
| jz .height1 |
| add lpfq, strideq |
| call .hv |
| call .prep_n |
| dec hd |
| jz .extend_bottom |
| .main_loop: |
| add lpfq, strideq |
| call .hv |
| call .n |
| dec hd |
| jnz .main_loop |
| test edgeb, 8 ; LR_HAVE_BOTTOM |
| jz .extend_bottom |
| mov lpfq, [rsp] |
| call .hv_bottom |
| call .n |
| add lpfq, strideq |
| call .hv_bottom |
| .end: |
| call .n |
| RET |
| .height1: |
| call .v |
| call .prep_n |
| mov t2, t1 |
| call .v |
| jmp .end |
| .extend_bottom: |
| call .v |
| call .n |
| mov t2, t1 |
| call .v |
| jmp .end |
| .no_top: |
| lea t4, [lpfq+strideq*4] |
| mov lpfq, dstq |
| lea t4, [t4+strideq*2] |
| mov [rsp], t4 |
| call .h |
| lea t0, [t1+400*6] |
| mov t2, t1 |
| call .v |
| jmp .main |
| .h: ; horizontal boxsum |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| vpbroadcastd xm0, [leftq] |
| mova xm5, [lpfq+wq] |
| palignr xm5, xm0, 12 |
| add leftq, 4 |
| jmp .h_main |
| .h_extend_left: |
| mova xm5, [lpfq+wq] |
| pshufb xm5, [base+sgr_l_shuf] |
| jmp .h_main |
| .h_top: |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| .h_loop: |
| movu xm5, [lpfq+r10-2] |
| .h_main: |
| vinserti128 m5, [lpfq+r10+6], 1 |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .h_have_right |
| cmp r10d, -17 |
| jl .h_have_right |
| call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right |
| .h_have_right: |
| pshufb m0, m5, m8 |
| pmullw m2, m0, m0 |
| pshufb m4, m5, m9 |
| paddw m0, m4 |
| pshufb m5, m10 |
| paddw m0, m5 ; sum |
| punpcklwd m3, m4, m5 |
| pmaddwd m3, m3 |
| punpckhwd m4, m5 |
| pmaddwd m4, m4 |
| punpcklwd m1, m2, m6 |
| punpckhwd m2, m6 |
| mova [t1+r10*2+400*0], m0 |
| paddd m1, m3 ; sumsq |
| paddd m2, m4 |
| mova [t1+r10*2+400*2], m1 |
| mova [t1+r10*2+400*4], m2 |
| add r10, 16 |
| jl .h_loop |
| ret |
| ALIGN function_align |
| .hv: ; horizontal boxsum + vertical boxsum + ab |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv_extend_left |
| vpbroadcastd xm0, [leftq] |
| mova xm5, [lpfq+wq] |
| palignr xm5, xm0, 12 |
| add leftq, 4 |
| jmp .hv_main |
| .hv_extend_left: |
| mova xm5, [lpfq+wq] |
| pshufb xm5, [base+sgr_l_shuf] |
| jmp .hv_main |
| .hv_bottom: |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv_extend_left |
| .hv_loop: |
| movu xm5, [lpfq+r10-2] |
| .hv_main: |
| vinserti128 m5, [lpfq+r10+6], 1 |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .hv_have_right |
| cmp r10d, -17 |
| jl .hv_have_right |
| call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right |
| .hv_have_right: |
| pshufb m0, m5, m8 |
| pmullw m3, m0, m0 |
| pshufb m1, m5, m9 |
| paddw m0, m1 |
| pshufb m5, m10 |
| paddw m0, m5 ; h sum |
| punpcklwd m4, m5, m1 |
| pmaddwd m4, m4 |
| punpckhwd m5, m1 |
| pmaddwd m5, m5 |
| paddw m1, m0, [t2+r10*2+400*0] |
| paddw m1, [t1+r10*2+400*0] ; hv sum |
| punpcklwd m2, m3, m6 |
| punpckhwd m3, m6 |
| paddd m4, m2 ; h sumsq |
| paddd m5, m3 |
| paddd m2, m4, [t2+r10*2+400*2] |
| paddd m3, m5, [t2+r10*2+400*4] |
| paddd m2, [t1+r10*2+400*2] ; hv sumsq |
| paddd m3, [t1+r10*2+400*4] |
| mova [t0+r10*2+400*0], m0 |
| punpcklwd m0, m1, m6 ; b |
| punpckhwd m1, m6 |
| mova [t0+r10*2+400*2], m4 |
| pslld m4, m2, 3 |
| mova [t0+r10*2+400*4], m5 |
| pslld m5, m3, 3 |
| paddd m4, m2 ; a * 9 |
| pmaddwd m2, m0, m0 ; b * b |
| paddd m5, m3 |
| pmaddwd m3, m1, m1 |
| psubd m4, m2 ; p |
| psubd m5, m3 |
| pmulld m4, m11 ; p * s |
| pmulld m5, m11 |
| pmaddwd m0, m12 ; b * 455 |
| pmaddwd m1, m12 |
| paddusw m4, m12 |
| paddusw m5, m12 |
| psrad m3, m4, 20 ; min(z, 255) - 256 |
| vpgatherdd m2, [r14+m3*4], m4 |
| psrad m4, m5, 20 |
| vpgatherdd m3, [r14+m4*4], m5 |
| pmulld m0, m2 |
| pmulld m1, m3 |
| paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) |
| paddd m1, m13 |
| pand m0, m14 |
| pand m1, m14 |
| por m0, m2 ; a | (b << 12) |
| por m1, m3 |
| mova [t3+r10*4+ 8], xm0 |
| vextracti128 [t3+r10*4+40], m0, 1 |
| mova [t3+r10*4+24], xm1 |
| vextracti128 [t3+r10*4+56], m1, 1 |
| add r10, 16 |
| jl .hv_loop |
| mov t2, t1 |
| mov t1, t0 |
| mov t0, t2 |
| ret |
| .v: ; vertical boxsum + ab |
| lea r10, [wq-2] |
| .v_loop: |
| mova m1, [t1+r10*2+400*0] |
| paddw m1, m1 |
| paddw m1, [t2+r10*2+400*0] ; hv sum |
| mova m2, [t1+r10*2+400*2] |
| mova m3, [t1+r10*2+400*4] |
| paddd m2, m2 |
| paddd m3, m3 |
| paddd m2, [t2+r10*2+400*2] ; hv sumsq |
| paddd m3, [t2+r10*2+400*4] |
| punpcklwd m0, m1, m6 ; b |
| punpckhwd m1, m6 |
| pslld m4, m2, 3 |
| pslld m5, m3, 3 |
| paddd m4, m2 ; a * 9 |
| pmaddwd m2, m0, m0 ; b * b |
| paddd m5, m3 |
| pmaddwd m3, m1, m1 |
| psubd m4, m2 ; p |
| psubd m5, m3 |
| pmulld m4, m11 ; p * s |
| pmulld m5, m11 |
| pmaddwd m0, m12 ; b * 455 |
| pmaddwd m1, m12 |
| paddusw m4, m12 |
| paddusw m5, m12 |
| psrad m3, m4, 20 ; min(z, 255) - 256 |
| vpgatherdd m2, [r14+m3*4], m4 |
| psrad m4, m5, 20 |
| vpgatherdd m3, [r14+m4*4], m5 |
| pmulld m0, m2 |
| pmulld m1, m3 |
| paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) |
| paddd m1, m13 |
| pand m0, m14 |
| pand m1, m14 |
| por m0, m2 ; a | (b << 12) |
| por m1, m3 |
| mova [t3+r10*4+ 8], xm0 |
| vextracti128 [t3+r10*4+40], m0, 1 |
| mova [t3+r10*4+24], xm1 |
| vextracti128 [t3+r10*4+56], m1, 1 |
| add r10, 16 |
| jl .v_loop |
| ret |
| .prep_n: ; initial neighbor setup |
| mov r10, wq |
| mov t4, t3 |
| add t3, 400*4 |
| .prep_n_loop: |
| mova m2, [t5+r10*4+0] |
| mova m3, [t4+r10*4+0] |
| paddd m2, [t5+r10*4+8] |
| paddd m3, [t4+r10*4+8] |
| paddd m0, m2, [t5+r10*4+4] |
| paddd m1, m3, [t4+r10*4+4] |
| pslld m0, 2 |
| paddd m1, m1 ; ab[ 0] 222 |
| psubd m0, m2 ; ab[-1] 343 |
| mova [t3+r10*4+400*4], m1 |
| paddd m1, m1 |
| mova [t5+r10*4], m0 |
| psubd m1, m3 ; ab[ 0] 343 |
| mova [t4+r10*4], m1 |
| add r10, 8 |
| jl .prep_n_loop |
| ret |
| ; a+b are packed together in a single dword, but we can't do the |
| ; full neighbor calculations before splitting them since we don't |
| ; have sufficient precision. The solution is to do the calculations |
| ; in two equal halves and split a and b before doing the final sum. |
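| ; in 3x3 terms the full neighbor weights are 3 4 3 / 4 4 4 / 3 4 3, split |
| ; here into (343 of the row above + 222 of the current row) and |
| ; (222 of the current row + 343 of the row below); each half sums to 16, |
| ; and a and b are only separated once per half before the final add |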
| ALIGN function_align |
| .n: ; neighbor + output |
| mov r10, wq |
| .n_loop: |
| mova m4, [t3+r10*4+ 0] |
| paddd m4, [t3+r10*4+ 8] |
| paddd m5, m4, [t3+r10*4+ 4] |
| paddd m5, m5 ; ab[+1] 222 |
| mova m2, [t3+r10*4+400*4+ 0] |
| paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 |
| mova m3, [t3+r10*4+400*4+32] |
| paddd m1, m3, [t5+r10*4+32] |
| mova [t3+r10*4+400*4+ 0], m5 |
| paddd m5, m5 |
| psubd m5, m4 ; ab[+1] 343 |
| mova [t5+r10*4+ 0], m5 |
| paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 |
| mova m4, [t3+r10*4+32] |
| paddd m4, [t3+r10*4+40] |
| paddd m5, m4, [t3+r10*4+36] |
| paddd m5, m5 |
| mova [t3+r10*4+400*4+32], m5 |
| paddd m5, m5 |
| psubd m5, m4 |
| mova [t5+r10*4+32], m5 |
| pandn m4, m14, m0 |
| psrld m0, 12 |
| paddd m3, m5 |
| pandn m5, m14, m2 |
| psrld m2, 12 |
| paddd m4, m5 ; a |
| pandn m5, m14, m1 |
| psrld m1, 12 |
| paddd m0, m2 ; b + (1 << 8) |
| pandn m2, m14, m3 |
| psrld m3, 12 |
| paddd m5, m2 |
| pmovzxbd m2, [dstq+r10+0] |
| paddd m1, m3 |
| pmovzxbd m3, [dstq+r10+8] |
| pmaddwd m4, m2 ; a * src |
| pmaddwd m5, m3 |
| packssdw m2, m3 |
| psubd m0, m4 ; b - a * src + (1 << 8) |
| psubd m1, m5 |
| psrad m0, 9 |
| psrad m1, 9 |
| packssdw m0, m1 |
| pmulhrsw m0, m7 |
| paddw m0, m2 |
| vextracti128 xm1, m0, 1 |
| packuswb xm0, xm1 |
| pshufd xm0, xm0, q3120 |
| mova [dstq+r10], xm0 |
| add r10, 16 |
| jl .n_loop |
| mov r10, t5 |
| mov t5, t4 |
| mov t4, r10 |
| add dstq, strideq |
| ret |
| |
| cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \ |
| w, h, edge, params |
| %define base r12-sgr_x_by_x_avx2-256*4 |
| lea r12, [sgr_x_by_x_avx2+256*4] |
| mov paramsq, r6mp |
| mov wd, wm |
| movifnidn hd, hm |
| mov edged, r7m |
| vbroadcasti128 m9, [base+sgr_shuf+0] |
| vbroadcasti128 m10, [base+sgr_shuf+8] |
| add lpfq, wq |
| vbroadcasti128 m11, [base+sgr_shuf+2] |
| vbroadcasti128 m12, [base+sgr_shuf+6] |
| add dstq, wq |
| vpbroadcastd m15, [paramsq+8] ; w0 w1 |
| lea t3, [rsp+wq*4+400*24+8] |
| vpbroadcastd m13, [paramsq+0] ; s0 |
| pxor m7, m7 |
| vpbroadcastd m14, [paramsq+4] ; s1 |
| lea t1, [rsp+wq*2+12] |
| neg wq |
| psllw m15, 2 ; to reuse existing pd_m4096 register for rounding |
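| ; scaling w0/w1 by 4 makes the final combine in .n0/.n1 a psrad by 13, whose |
| ; rounding bias is 1 << 12; psubd with pd_m4096 (== -4096) adds exactly that |
| ; bias, so the a/b unpack mask register doubles as the rounding constant |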
| test edgeb, 4 ; LR_HAVE_TOP |
| jz .no_top |
| call .h_top |
| add lpfq, strideq |
| mov t2, t1 |
| call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup |
| add t1, 400*12 |
| call .h_top |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| add r10, strideq |
| mov [rsp], r10 ; below |
| call .hv0 |
| .main: |
| dec hd |
| jz .height1 |
| add lpfq, strideq |
| call .hv1 |
| call .prep_n |
| sub hd, 2 |
| jl .extend_bottom |
| .main_loop: |
| add lpfq, strideq |
| call .hv0 |
| test hd, hd |
| jz .odd_height |
| add lpfq, strideq |
| call .hv1 |
| call .n0 |
| call .n1 |
| sub hd, 2 |
| jge .main_loop |
| test edgeb, 8 ; LR_HAVE_BOTTOM |
| jz .extend_bottom |
| mov lpfq, [rsp] |
| call .hv0_bottom |
| add lpfq, strideq |
| call .hv1_bottom |
| .end: |
| call .n0 |
| call .n1 |
| .end2: |
| RET |
| .height1: |
| call .v1 |
| call .prep_n |
| jmp .odd_height_end |
| .odd_height: |
| call .v1 |
| call .n0 |
| call .n1 |
| .odd_height_end: |
| call .v0 |
| call .v1 |
| call .n0 |
| jmp .end2 |
| .extend_bottom: |
| call .v0 |
| call .v1 |
| jmp .end |
| .no_top: |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| lea r10, [r10+strideq*2] |
| mov [rsp], r10 |
| call .h |
| lea t2, [t1+400*12] |
| lea r10, [wq-2] |
| .top_fixup_loop: |
| mova m0, [t1+r10*2+400* 0] |
| mova m1, [t1+r10*2+400* 2] |
| mova m2, [t1+r10*2+400* 4] |
| paddw m0, m0 |
| mova m3, [t1+r10*2+400* 6] |
| paddd m1, m1 |
| mova m4, [t1+r10*2+400* 8] |
| paddd m2, m2 |
| mova m5, [t1+r10*2+400*10] |
| mova [t2+r10*2+400* 0], m0 |
| mova [t2+r10*2+400* 2], m1 |
| mova [t2+r10*2+400* 4], m2 |
| mova [t2+r10*2+400* 6], m3 |
| mova [t2+r10*2+400* 8], m4 |
| mova [t2+r10*2+400*10], m5 |
| add r10, 16 |
| jl .top_fixup_loop |
| call .v0 |
| jmp .main |
| .h: ; horizontal boxsums |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| vpbroadcastd xm0, [leftq] |
| mova xm5, [lpfq+wq] |
| palignr xm5, xm0, 12 |
| add leftq, 4 |
| jmp .h_main |
| .h_extend_left: |
| mova xm5, [lpfq+wq] |
| pshufb xm5, [base+sgr_l_shuf] |
| jmp .h_main |
| .h_top: |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| .h_loop: |
| movu xm5, [lpfq+r10-2] |
| .h_main: |
| vinserti128 m5, [lpfq+r10+6], 1 |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .h_have_right |
| cmp r10d, -18 |
| jl .h_have_right |
| call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right |
| .h_have_right: |
| pshufb m6, m5, m9 |
| pshufb m4, m5, m10 |
| paddw m8, m6, m4 |
| shufps m0, m6, m4, q2121 |
| pmullw m3, m0, m0 |
| pshufb m2, m5, m11 |
| paddw m0, m2 |
| pshufb m5, m12 |
| paddw m0, m5 ; sum3 |
| punpcklwd m1, m2, m5 |
| pmaddwd m1, m1 |
| punpckhwd m2, m5 |
| pmaddwd m2, m2 |
| punpcklwd m5, m6, m4 |
| pmaddwd m5, m5 |
| punpckhwd m6, m4 |
| pmaddwd m6, m6 |
| punpcklwd m4, m3, m7 |
| paddd m1, m4 ; sumsq3 |
| punpckhwd m3, m7 |
| paddd m2, m3 |
| mova [t1+r10*2+400* 6], m0 |
| mova [t1+r10*2+400* 8], m1 |
| mova [t1+r10*2+400*10], m2 |
| paddw m8, m0 ; sum5 |
| paddd m5, m1 ; sumsq5 |
| paddd m6, m2 |
| mova [t1+r10*2+400* 0], m8 |
| mova [t1+r10*2+400* 2], m5 |
| mova [t1+r10*2+400* 4], m6 |
| add r10, 16 |
| jl .h_loop |
| ret |
| ALIGN function_align |
| .hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv0_extend_left |
| vpbroadcastd xm0, [leftq] |
| mova xm5, [lpfq+wq] |
| palignr xm5, xm0, 12 |
| add leftq, 4 |
| jmp .hv0_main |
| .hv0_extend_left: |
| mova xm5, [lpfq+wq] |
| pshufb xm5, [base+sgr_l_shuf] |
| jmp .hv0_main |
| .hv0_bottom: |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv0_extend_left |
| .hv0_loop: |
| movu xm5, [lpfq+r10-2] |
| .hv0_main: |
| vinserti128 m5, [lpfq+r10+6], 1 |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .hv0_have_right |
| cmp r10d, -18 |
| jl .hv0_have_right |
| call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right |
| .hv0_have_right: |
| pshufb m6, m5, m9 |
| pshufb m4, m5, m10 |
| paddw m8, m6, m4 |
| shufps m1, m6, m4, q2121 |
| pmullw m0, m1, m1 |
| pshufb m3, m5, m11 |
| paddw m1, m3 |
| pshufb m5, m12 |
| paddw m1, m5 ; sum3 |
| punpcklwd m2, m3, m5 |
| pmaddwd m2, m2 |
| punpckhwd m3, m5 |
| pmaddwd m3, m3 |
| punpcklwd m5, m6, m4 |
| pmaddwd m5, m5 |
| punpckhwd m6, m4 |
| pmaddwd m6, m6 |
| punpcklwd m4, m0, m7 |
| paddd m2, m4 ; sumsq3 |
| punpckhwd m0, m7 |
| paddd m3, m0 |
| paddw m8, m1 ; sum5 |
| paddd m5, m2 ; sumsq5 |
| paddd m6, m3 |
| mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row |
| mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd |
| mova [t3+r10*4+400*0+40], m6 |
| paddw m8, [t1+r10*2+400* 0] |
| paddd m5, [t1+r10*2+400* 2] |
| paddd m6, [t1+r10*2+400* 4] |
| mova [t1+r10*2+400* 0], m8 |
| mova [t1+r10*2+400* 2], m5 |
| mova [t1+r10*2+400* 4], m6 |
| paddw m0, m1, [t1+r10*2+400* 6] |
| paddd m4, m2, [t1+r10*2+400* 8] |
| paddd m5, m3, [t1+r10*2+400*10] |
| mova [t1+r10*2+400* 6], m1 |
| mova [t1+r10*2+400* 8], m2 |
| mova [t1+r10*2+400*10], m3 |
| paddw m1, m0, [t2+r10*2+400* 6] |
| paddd m2, m4, [t2+r10*2+400* 8] |
| paddd m3, m5, [t2+r10*2+400*10] |
| mova [t2+r10*2+400* 6], m0 |
| mova [t2+r10*2+400* 8], m4 |
| mova [t2+r10*2+400*10], m5 |
| punpcklwd m0, m1, m7 ; b3 |
| punpckhwd m1, m7 |
| pslld m4, m2, 3 |
| pslld m5, m3, 3 |
| paddd m4, m2 ; a3 * 9 |
| pmaddwd m2, m0, m0 ; b3 * b3 |
| paddd m5, m3 |
| pmaddwd m3, m1, m1 |
| psubd m4, m2 ; p3 |
| vpbroadcastd m2, [base+pd_0xf00801c7] |
| psubd m5, m3 |
| pmulld m4, m14 ; p3 * s1 |
| pmulld m5, m14 |
| pmaddwd m0, m2 ; b3 * 455 |
| pmaddwd m1, m2 |
| paddusw m4, m2 |
| paddusw m5, m2 |
| psrad m3, m4, 20 ; min(z3, 255) - 256 |
| vpgatherdd m2, [r12+m3*4], m4 |
| psrad m4, m5, 20 |
| vpgatherdd m3, [r12+m4*4], m5 |
| vpbroadcastd m4, [base+pd_34816] |
| pmulld m0, m2 |
| vpbroadcastd m5, [base+pd_m4096] |
| pmulld m1, m3 |
| paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) |
| paddd m1, m4 |
| pand m0, m5 |
| pand m1, m5 |
| por m0, m2 ; a3 | (b3 << 12) |
| por m1, m3 |
| mova [t3+r10*4+400*4+ 8], xm0 |
| vextracti128 [t3+r10*4+400*4+40], m0, 1 |
| mova [t3+r10*4+400*4+24], xm1 |
| vextracti128 [t3+r10*4+400*4+56], m1, 1 |
| add r10, 16 |
| jl .hv0_loop |
| ret |
| ALIGN function_align |
| .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv1_extend_left |
| vpbroadcastd xm0, [leftq] |
| mova xm5, [lpfq+wq] |
| palignr xm5, xm0, 12 |
| add leftq, 4 |
| jmp .hv1_main |
| .hv1_extend_left: |
| mova xm5, [lpfq+wq] |
| pshufb xm5, [base+sgr_l_shuf] |
| jmp .hv1_main |
| .hv1_bottom: |
| lea r10, [wq-2] |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .hv1_extend_left |
| .hv1_loop: |
| movu xm5, [lpfq+r10-2] |
| .hv1_main: |
| vinserti128 m5, [lpfq+r10+6], 1 |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .hv1_have_right |
| cmp r10d, -18 |
| jl .hv1_have_right |
| call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right |
| .hv1_have_right: |
| pshufb m6, m5, m9 |
| pshufb m3, m5, m10 |
| paddw m8, m6, m3 |
| shufps m2, m6, m3, q2121 |
| pmullw m1, m2, m2 |
| pshufb m0, m5, m11 |
| paddw m2, m0 |
| pshufb m5, m12 |
| paddw m2, m5 ; sum3 |
| punpcklwd m4, m5, m0 |
| pmaddwd m4, m4 |
| punpckhwd m5, m0 |
| pmaddwd m5, m5 |
| punpcklwd m0, m6, m3 |
| pmaddwd m0, m0 |
| punpckhwd m6, m3 |
| pmaddwd m6, m6 |
| punpcklwd m3, m1, m7 |
| paddd m4, m3 ; sumsq3 |
| punpckhwd m1, m7 |
| paddd m5, m1 |
| paddw m1, m2, [t2+r10*2+400* 6] |
| mova [t2+r10*2+400* 6], m2 |
| paddw m8, m2 ; sum5 |
| paddd m2, m4, [t2+r10*2+400* 8] |
| paddd m3, m5, [t2+r10*2+400*10] |
| mova [t2+r10*2+400* 8], m4 |
| mova [t2+r10*2+400*10], m5 |
| paddd m4, m0 ; sumsq5 |
| paddd m5, m6 |
| punpcklwd m0, m1, m7 ; b3 |
| punpckhwd m1, m7 |
| pslld m6, m2, 3 |
| pslld m7, m3, 3 |
| paddd m6, m2 ; a3 * 9 |
| pmaddwd m2, m0, m0 ; b3 * b3 |
| paddd m7, m3 |
| pmaddwd m3, m1, m1 |
| psubd m6, m2 ; p3 |
| vpbroadcastd m2, [base+pd_0xf00801c7] |
| psubd m7, m3 |
| pmulld m6, m14 ; p3 * s1 |
| pmulld m7, m14 |
| pmaddwd m0, m2 ; b3 * 455 |
| pmaddwd m1, m2 |
| paddusw m6, m2 |
| paddusw m7, m2 |
| psrad m3, m6, 20 ; min(z3, 255) - 256 |
| vpgatherdd m2, [r12+m3*4], m6 ; x3 |
| psrad m6, m7, 20 |
| vpgatherdd m3, [r12+m6*4], m7 |
| vpbroadcastd m6, [base+pd_34816] |
| pmulld m0, m2 |
| vpbroadcastd m7, [base+pd_m4096] |
| pmulld m1, m3 |
| paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) |
| paddd m1, m6 |
| pand m0, m7 |
| pand m7, m1 |
| por m0, m2 ; a3 | (b3 << 12) |
| por m7, m3 |
| paddw m1, m8, [t2+r10*2+400*0] |
| paddd m2, m4, [t2+r10*2+400*2] |
| paddd m3, m5, [t2+r10*2+400*4] |
| paddw m1, [t1+r10*2+400*0] |
| paddd m2, [t1+r10*2+400*2] |
| paddd m3, [t1+r10*2+400*4] |
| mova [t2+r10*2+400*0], m8 |
| mova [t2+r10*2+400*2], m4 |
| mova [t2+r10*2+400*4], m5 |
| mova [t3+r10*4+400*8+ 8], xm0 |
| vextracti128 [t3+r10*4+400*8+40], m0, 1 |
| mova [t3+r10*4+400*8+24], xm7 |
| vextracti128 [t3+r10*4+400*8+56], m7, 1 |
| vpbroadcastd m4, [base+pd_25] |
| pxor m7, m7 |
| punpcklwd m0, m1, m7 ; b5 |
| punpckhwd m1, m7 |
| pmulld m2, m4 ; a5 * 25 |
| pmulld m3, m4 |
| pmaddwd m4, m0, m0 ; b5 * b5 |
| pmaddwd m5, m1, m1 |
| psubd m2, m4 ; p5 |
| vpbroadcastd m4, [base+pd_0xf00800a4] |
| psubd m3, m5 |
| pmulld m2, m13 ; p5 * s0 |
| pmulld m3, m13 |
| pmaddwd m0, m4 ; b5 * 164 |
| pmaddwd m1, m4 |
| paddusw m2, m4 |
| paddusw m3, m4 |
| psrad m5, m2, 20 ; min(z5, 255) - 256 |
| vpgatherdd m4, [r12+m5*4], m2 ; x5 |
| psrad m2, m3, 20 |
| vpgatherdd m5, [r12+m2*4], m3 |
| pmulld m0, m4 |
| pmulld m1, m5 |
| paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) |
| paddd m1, m6 |
| vpbroadcastd m6, [base+pd_m4096] |
| pand m0, m6 |
| pand m1, m6 |
| por m0, m4 ; a5 | (b5 << 12) |
| por m1, m5 |
| mova [t3+r10*4+400*0+ 8], xm0 |
| vextracti128 [t3+r10*4+400*0+40], m0, 1 |
| mova [t3+r10*4+400*0+24], xm1 |
| vextracti128 [t3+r10*4+400*0+56], m1, 1 |
| add r10, 16 |
| jl .hv1_loop |
| mov r10, t2 |
| mov t2, t1 |
| mov t1, r10 |
| ret |
| .v0: ; vertical boxsums + ab3 (even rows) |
| lea r10, [wq-2] |
| vpbroadcastd m6, [base+pd_34816] |
| vpbroadcastd m8, [base+pd_m4096] |
| .v0_loop: |
| mova m0, [t1+r10*2+400* 6] |
| mova m4, [t1+r10*2+400* 8] |
| mova m5, [t1+r10*2+400*10] |
| paddw m0, m0 |
| paddd m4, m4 |
| paddd m5, m5 |
| paddw m1, m0, [t2+r10*2+400* 6] |
| paddd m2, m4, [t2+r10*2+400* 8] |
| paddd m3, m5, [t2+r10*2+400*10] |
| mova [t2+r10*2+400* 6], m0 |
| mova [t2+r10*2+400* 8], m4 |
| mova [t2+r10*2+400*10], m5 |
| punpcklwd m0, m1, m7 ; b3 |
| punpckhwd m1, m7 |
| pslld m4, m2, 3 |
| pslld m5, m3, 3 |
| paddd m4, m2 ; a3 * 9 |
| pmaddwd m2, m0, m0 ; b3 * b3 |
| paddd m5, m3 |
| pmaddwd m3, m1, m1 |
| psubd m4, m2 ; p3 |
| vpbroadcastd m2, [base+pd_0xf00801c7] |
| psubd m5, m3 |
| pmulld m4, m14 ; p3 * s1 |
| pmulld m5, m14 |
| pmaddwd m0, m2 ; b3 * 455 |
| pmaddwd m1, m2 |
| paddusw m4, m2 |
| paddusw m5, m2 |
| psrad m3, m4, 20 ; min(z3, 255) - 256 |
| vpgatherdd m2, [r12+m3*4], m4 ; x3 |
| psrad m4, m5, 20 |
| vpgatherdd m3, [r12+m4*4], m5 |
| pmulld m0, m2 |
| pmulld m1, m3 |
| paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) |
| paddd m1, m6 |
| pand m0, m8 |
| pand m1, m8 |
| por m0, m2 ; a3 | (b3 << 12) |
| por m1, m3 |
| mova m2, [t1+r10*2+400*0] |
| mova m3, [t1+r10*2+400*2] |
| mova m4, [t1+r10*2+400*4] |
| mova [t3+r10*4+400*8+ 8], m2 |
| mova [t3+r10*4+400*0+ 8], m3 |
| mova [t3+r10*4+400*0+40], m4 |
| paddw m2, m2 ; sum5 * 2 |
| paddd m3, m3 |
| paddd m4, m4 |
| mova [t1+r10*2+400*0], m2 |
| mova [t1+r10*2+400*2], m3 |
| mova [t1+r10*2+400*4], m4 |
| mova [t3+r10*4+400*4+ 8], xm0 |
| vextracti128 [t3+r10*4+400*4+40], m0, 1 |
| mova [t3+r10*4+400*4+24], xm1 |
| vextracti128 [t3+r10*4+400*4+56], m1, 1 |
| add r10, 16 |
| jl .v0_loop |
| ret |
| .v1: ; vertical boxsums + ab (odd rows) |
| lea r10, [wq-2] |
| .v1_loop: |
| mova m4, [t1+r10*2+400* 6] |
| mova m5, [t1+r10*2+400* 8] |
| mova m6, [t1+r10*2+400*10] |
| paddw m1, m4, [t2+r10*2+400* 6] |
| paddd m2, m5, [t2+r10*2+400* 8] |
| paddd m3, m6, [t2+r10*2+400*10] |
| mova [t2+r10*2+400* 6], m4 |
| mova [t2+r10*2+400* 8], m5 |
| mova [t2+r10*2+400*10], m6 |
| punpcklwd m0, m1, m7 ; b3 |
| punpckhwd m1, m7 |
| pslld m4, m2, 3 |
| pslld m5, m3, 3 |
| paddd m4, m2 ; a3 * 9 |
| pmaddwd m2, m0, m0 ; b3 * b3 |
| paddd m5, m3 |
| pmaddwd m3, m1, m1 |
| psubd m4, m2 ; p3 |
| vpbroadcastd m2, [base+pd_0xf00801c7] |
| psubd m5, m3 |
| pmulld m4, m14 ; p3 * s1 |
| pmulld m5, m14 |
| pmaddwd m0, m2 ; b3 * 455 |
| pmaddwd m1, m2 |
| paddusw m4, m2 |
| paddusw m5, m2 |
| psrad m3, m4, 20 ; min(z3, 255) - 256 |
| vpgatherdd m2, [r12+m3*4], m4 ; x3 |
| psrad m4, m5, 20 |
| vpgatherdd m3, [r12+m4*4], m5 |
| vpbroadcastd m4, [base+pd_34816] |
| pmulld m0, m2 |
| vpbroadcastd m8, [base+pd_m4096] |
| pmulld m1, m3 |
| paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) |
| paddd m1, m4 |
| pand m0, m8 |
| pand m8, m1 |
| por m0, m2 ; a3 | (b3 << 12) |
| por m8, m3 |
| mova m4, [t3+r10*4+400*8+ 8] |
| mova m5, [t3+r10*4+400*0+ 8] |
| mova m6, [t3+r10*4+400*0+40] |
| paddw m1, m4, [t2+r10*2+400*0] |
| paddd m2, m5, [t2+r10*2+400*2] |
| paddd m3, m6, [t2+r10*2+400*4] |
| paddw m1, [t1+r10*2+400*0] |
| paddd m2, [t1+r10*2+400*2] |
| paddd m3, [t1+r10*2+400*4] |
| mova [t2+r10*2+400*0], m4 |
| mova [t2+r10*2+400*2], m5 |
| mova [t2+r10*2+400*4], m6 |
| vpbroadcastd m4, [base+pd_25] |
| mova [t3+r10*4+400*8+ 8], xm0 |
| vextracti128 [t3+r10*4+400*8+40], m0, 1 |
| mova [t3+r10*4+400*8+24], xm8 |
| vextracti128 [t3+r10*4+400*8+56], m8, 1 |
| punpcklwd m0, m1, m7 ; b5 |
| punpckhwd m1, m7 |
| pmulld m2, m4 ; a5 * 25 |
| pmulld m3, m4 |
| pmaddwd m4, m0, m0 ; b5 * b5 |
| pmaddwd m5, m1, m1 |
| psubd m2, m4 ; p5 |
| vpbroadcastd m4, [base+pd_0xf00800a4] |
| psubd m3, m5 |
| pmulld m2, m13 ; p5 * s0 |
| pmulld m3, m13 |
| pmaddwd m0, m4 ; b5 * 164 |
| pmaddwd m1, m4 |
| paddusw m2, m4 |
| paddusw m3, m4 |
| psrad m5, m2, 20 ; min(z5, 255) - 256 |
| vpgatherdd m4, [r12+m5*4], m2 ; x5 |
| psrad m2, m3, 20 |
| vpgatherdd m5, [r12+m2*4], m3 |
| pmulld m0, m4 |
| vpbroadcastd m6, [base+pd_34816] |
| pmulld m1, m5 |
| paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) |
| paddd m1, m6 |
| vpbroadcastd m6, [base+pd_m4096] |
| pand m0, m6 |
| pand m1, m6 |
| por m0, m4 ; a5 | (b5 << 12) |
| por m1, m5 |
| mova [t3+r10*4+400*0+ 8], xm0 |
| vextracti128 [t3+r10*4+400*0+40], m0, 1 |
| mova [t3+r10*4+400*0+24], xm1 |
| vextracti128 [t3+r10*4+400*0+56], m1, 1 |
| add r10, 16 |
| jl .v1_loop |
| mov r10, t2 |
| mov t2, t1 |
| mov t1, r10 |
| ret |
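| ; .prep_n/.n0/.n1 below combine both box sizes: the 5x5 ab values use the |
| ; 5/6/5 row weighting from sgr_filter_5x5, the 3x3 ab values use the 343/222 |
| ; split from sgr_filter_3x3, and the two filtered terms are blended with |
| ; w0/w1 via pmaddwd at the end of each row loop |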
| .prep_n: ; initial neighbor setup |
| mov r10, wq |
| .prep_n_loop: |
| movu m0, [t3+r10*4+400*0+4] |
| paddd m1, m0, [t3+r10*4+400*0+0] |
| mova m4, [t3+r10*4+400*4+0] |
| paddd m1, [t3+r10*4+400*0+8] |
| mova m5, [t3+r10*4+400*8+0] |
| paddd m4, [t3+r10*4+400*4+8] |
| paddd m5, [t3+r10*4+400*8+8] |
| paddd m2, m4, [t3+r10*4+400*4+4] |
| paddd m3, m5, [t3+r10*4+400*8+4] |
| paddd m0, m1 |
| pslld m1, 2 |
| pslld m2, 2 |
| paddd m1, m0 ; ab5 565 |
| paddd m3, m3 ; ab3[ 0] 222 |
| psubd m2, m4 ; ab3[-1] 343 |
| mova [t3+r10*4+400*20], m3 |
| pandn m0, m6, m1 ; a5 565 |
| mova [t3+r10*4+400*24], m2 |
| psrld m1, 12 ; b5 565 |
| mova [t3+r10*4+400*12], m0 |
| paddd m3, m3 |
| mova [t3+r10*4+400*16], m1 |
| psubd m3, m5 ; ab3[ 0] 343 |
| mova [t3+r10*4+400*28], m3 |
| add r10, 8 |
| jl .prep_n_loop |
| ret |
| ALIGN function_align |
| .n0: ; neighbor + output (even rows) |
| mov r10, wq |
| .n0_loop: |
| movu m0, [t3+r10*4+4] |
| paddd m4, m0, [t3+r10*4+0] |
| paddd m4, [t3+r10*4+8] |
| paddd m0, m4 |
| pslld m4, 2 |
| paddd m4, m0 |
| pandn m0, m6, m4 |
| psrld m4, 12 |
| paddd m2, m0, [t3+r10*4+400*12] ; a5 |
| mova [t3+r10*4+400*12], m0 |
| paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) |
| mova [t3+r10*4+400*16], m4 |
| mova m3, [t3+r10*4+400*4+0] |
| paddd m3, [t3+r10*4+400*4+8] |
| paddd m5, m3, [t3+r10*4+400*4+4] |
| paddd m5, m5 ; ab3[ 1] 222 |
| mova m4, [t3+r10*4+400*20] |
| paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343 |
| mova [t3+r10*4+400*20], m5 |
| paddd m5, m5 |
| psubd m5, m3 ; ab3[ 1] 343 |
| mova [t3+r10*4+400*24], m5 |
| paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 |
| pandn m3, m6, m1 |
| psrld m1, 12 |
| pandn m5, m6, m4 |
| psrld m4, 12 |
| paddd m3, m5 ; a3 |
| paddd m1, m4 ; b3 + (1 << 8) |
| pmovzxbd m4, [dstq+r10] |
| pmaddwd m2, m4 ; a5 * src |
| pmaddwd m3, m4 ; a3 * src |
| psubd m0, m2 ; b5 - a5 * src + (1 << 8) |
| psubd m1, m3 ; b3 - a3 * src + (1 << 8) |
| psrld m0, 9 |
| pslld m1, 7 |
| pblendw m0, m1, 0xaa |
| pmaddwd m0, m15 |
| psubd m0, m6 |
| psrad m0, 13 |
| paddd m0, m4 |
| vextracti128 xm1, m0, 1 |
| packssdw xm0, xm1 |
| packuswb xm0, xm0 |
| movq [dstq+r10], xm0 |
| add r10, 8 |
| jl .n0_loop |
| add dstq, strideq |
| ret |
| ALIGN function_align |
| .n1: ; neighbor + output (odd rows) |
| mov r10, wq |
| .n1_loop: |
| mova m3, [t3+r10*4+400*8+0] |
| paddd m3, [t3+r10*4+400*8+8] |
| paddd m5, m3, [t3+r10*4+400*8+4] |
| paddd m5, m5 ; ab3[ 1] 222 |
| mova m4, [t3+r10*4+400*20] |
| paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343 |
| mova [t3+r10*4+400*20], m5 |
| paddd m5, m5 |
| psubd m5, m3 ; ab3[ 1] 343 |
| mova [t3+r10*4+400*28], m5 |
| paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 |
| pandn m3, m6, m1 |
| psrld m1, 12 |
| pandn m5, m6, m4 |
| psrld m4, 12 |
| paddd m3, m5 ; a3 |
| paddd m1, m4 ; b3 + (1 << 8) |
| pmovzxbd m4, [dstq+r10] |
| pmaddwd m2, m4, [t3+r10*4+400*12] ; a5 * src |
| mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) |
| pmaddwd m3, m4 ; a3 * src |
| psubd m0, m2 ; b5 - a5 * src + (1 << 7) |
| psubd m1, m3 ; b3 - a3 * src + (1 << 8) |
| psrld m0, 8 |
| pslld m1, 7 |
| pblendw m0, m1, 0xaa |
| pmaddwd m0, m15 |
| psubd m0, m6 |
| psrad m0, 13 |
| paddd m0, m4 |
| vextracti128 xm1, m0, 1 |
| packssdw xm0, xm1 |
| packuswb xm0, xm0 |
| movq [dstq+r10], xm0 |
| add r10, 8 |
| jl .n1_loop |
| add dstq, strideq |
| ret |
| |
| %endif ; ARCH_X86_64 |