; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_right_ext_mask: times 24 db 0xff
times 8 db 0
pb_1: times 16 db 1
pb_3: times 16 db 3
pw_256: times 8 dw 256
pw_2056: times 8 dw 2056
pw_m16380: times 8 dw -16380
pd_4096: times 4 dd 4096
pd_34816: times 4 dd 34816
pd_0xffff: times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7
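; x-by-x lookup table used by the self-guided filter, defined in another file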
cextern sgr_x_by_x
SECTION .text
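; movif64/movif32: emit the mov only when building for the matching architecture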
%macro movif64 2 ; dst, src
%if ARCH_X86_64
mov %1, %2
%endif
%endmacro
%macro movif32 2 ; dst, src
%if ARCH_X86_32
mov %1, %2
%endif
%endmacro
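; on x86-32, RODATA is addressed through a PIC base register (PIC_reg); on
; x86-64, PIC_sym expands to the plain symbol and no setup is needed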
%if ARCH_X86_32
%define PIC_base_offset $$
%macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
%assign pic_reg_stk_off 4
%xdefine PIC_reg %1
%if %2 == 1
mov [esp], %1
%endif
LEA PIC_reg, PIC_base_offset
%if %3 == 1
XCHG_PIC_REG
%endif
%endmacro
%macro XCHG_PIC_REG 0
mov [esp+pic_reg_stk_off], PIC_reg
%assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
mov PIC_reg, [esp+pic_reg_stk_off]
%endmacro
%define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
%else
%macro XCHG_PIC_REG 0
%endmacro
%define PIC_sym(sym) (sym)
%endif
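; 7-tap and 5-tap Wiener filters; the WIENER macro body is instantiated below
; once for SSE2 and once for SSSE3 (INIT_XMM sse2 / INIT_XMM ssse3)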
%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
w, h, edge, flt, x
%define tmpstrideq strideq
%define base 0
mov fltq, r6mp
mov wd, wm
movifnidn hd, hm
mov edged, r7m
movq m14, [fltq]
add lpfq, wq
movq m7, [fltq+16]
add dstq, wq
lea t1, [rsp+wq*2+16]
mova m15, [pw_2056]
neg wq
%if cpuflag(ssse3)
pshufb m14, [wiener_init]
mova m8, [wiener_shufA]
pshufd m12, m14, q2222 ; x0 x0
mova m9, [wiener_shufB]
pshufd m13, m14, q3333 ; x1 x2
mova m10, [wiener_shufC]
punpcklqdq m14, m14 ; x3
mova m11, [wiener_shufD]
%else
mova m10, [pw_m16380]
punpcklwd m14, m14
pshufd m11, m14, q0000 ; x0
pshufd m12, m14, q1111 ; x1
pshufd m13, m14, q2222 ; x2
pshufd m14, m14, q3333 ; x3
%endif
%else
DECLARE_REG_TMP 4, 0, _, 5
%if cpuflag(ssse3)
%define m10 [base+wiener_shufC]
%define m11 [base+wiener_shufD]
%define stk_off 96
%else
%define m10 [base+pw_m16380]
%define m11 [stk+96]
%define stk_off 112
%endif
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
%define base r6-pb_right_ext_mask-21
%define stk esp
%define dstq leftq
%define edgeb byte edged
%define edged [stk+ 8]
%define dstmp [stk+12]
%define hd dword [stk+16]
%define wq [stk+20]
%define strideq [stk+24]
%define leftmp [stk+28]
%define t2 [stk+32]
%define t4 [stk+36]
%define t5 [stk+40]
%define t6 [stk+44]
%define m8 [base+wiener_shufA]
%define m9 [base+wiener_shufB]
%define m12 [stk+48]
%define m13 [stk+64]
%define m14 [stk+80]
%define m15 [base+pw_2056]
mov r1, r6m ; flt
mov r0, r0m ; dst
mov r4, r4m ; w
mov lpfq, lpfm
mov r2, r7m ; edge
mov r5, r5m ; h
movq m3, [r1+ 0]
movq m7, [r1+16]
add r0, r4
mov r1, r1m ; stride
add lpfq, r4
mov edged, r2
mov r2, r2m ; left
mov dstmp, r0
lea t1, [rsp+r4*2+stk_off]
mov hd, r5
neg r4
LEA r6, pb_right_ext_mask+21
mov wq, r4
mov strideq, r1
mov leftmp, r2
mov r4, r1
%if cpuflag(ssse3)
pshufb m3, [base+wiener_init]
pshufd m1, m3, q2222
pshufd m2, m3, q3333
punpcklqdq m3, m3
%else
punpcklwd m3, m3
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
mova m11, m0
%endif
mova m12, m1
mova m13, m2
mova m14, m3
%endif
psllw m7, 5
pshufd m6, m7, q0000 ; y0 y1
pshufd m7, m7, q1111 ; y2 y3
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t6, t1
mov t5, t1
add t1, 384*2
call .h_top
lea t3, [lpfq+tmpstrideq*4]
mov lpfq, dstmp
add t3, tmpstrideq
mov [rsp], t3 ; below
mov t4, t1
add t1, 384*2
call .h
mov t3, t1
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
mov t2, t1
dec hd
jz .v2
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v3
.main:
lea t0, [t1+384*2]
.main_loop:
call .hv
dec hd
jnz .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .v3
mov lpfq, [rsp]
call .hv_bottom
add lpfq, strideq
call .hv_bottom
.v1:
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
RET
.no_top:
lea t3, [lpfq+tmpstrideq*4]
mov lpfq, dstmp
lea t3, [t3+tmpstrideq*2]
mov [rsp], t3
call .h
mov t6, t1
mov t5, t1
mov t4, t1
mov t3, t1
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
mov t2, t1
dec hd
jz .v2
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v3
lea t0, [t1+384*2]
call .hv
dec hd
jz .v3
add t0, 384*8
call .hv
dec hd
jnz .main
.v3:
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
jmp .v1
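; pad the row past the right edge by replicating the last valid pixel into the
; out-of-bounds lanes, using a shifted window of pb_right_ext_mask as the blend mask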
.extend_right:
movd m2, [lpfq-4]
%if ARCH_X86_64
push r0
lea r0, [pb_right_ext_mask+21]
movu m0, [r0+xq+0]
movu m1, [r0+xq+8]
pop r0
%else
movu m0, [r6+xq+0]
movu m1, [r6+xq+8]
%endif
%if cpuflag(ssse3)
pshufb m2, [base+pb_3]
%else
punpcklbw m2, m2
pshuflw m2, m2, q3333
punpcklqdq m2, m2
%endif
pand m4, m0
pand m5, m1
pandn m0, m2
pandn m1, m2
por m4, m0
por m5, m1
ret
.h:
%define stk esp+4 ; offset due to call
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movifnidn leftq, leftmp
mova m4, [lpfq+xq]
movd m5, [leftq]
add leftq, 4
pslldq m4, 4
por m4, m5
movifnidn leftmp, leftq
jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
mova m4, [lpfq+xq]
pshufb m4, [base+wiener_l_shuf]
%else
mova m5, [lpfq+xq]
pshufd m4, m5, q2103
punpcklbw m5, m5
punpcklwd m5, m5
movss m4, m5
%endif
jmp .h_main
.h_top:
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu m4, [lpfq+xq-4]
.h_main:
movu m5, [lpfq+xq+4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp xd, -18
jl .h_have_right
call .extend_right
.h_have_right:
%macro %%h7 0
%if cpuflag(ssse3)
pshufb m0, m4, m8
pmaddubsw m0, m12
pshufb m1, m5, m8
pmaddubsw m1, m12
pshufb m2, m4, m9
pmaddubsw m2, m13
pshufb m3, m5, m9
pmaddubsw m3, m13
paddw m0, m2
pshufb m2, m4, m10
pmaddubsw m2, m13
paddw m1, m3
pshufb m3, m5, m10
pmaddubsw m3, m13
pshufb m4, m11
paddw m0, m2
pmullw m2, m14, m4
pshufb m5, m11
paddw m1, m3
pmullw m3, m14, m5
psllw m4, 7
psllw m5, 7
paddw m0, m2
mova m2, [base+pw_m16380]
paddw m1, m3
paddw m4, m2
paddw m5, m2
paddsw m0, m4
paddsw m1, m5
%else
psrldq m0, m4, 1
pslldq m1, m4, 1
pxor m3, m3
punpcklbw m0, m3
punpckhbw m1, m3
paddw m0, m1
pmullw m0, m11
psrldq m1, m4, 2
pslldq m2, m4, 2
punpcklbw m1, m3
punpckhbw m2, m3
paddw m1, m2
pmullw m1, m12
paddw m0, m1
pshufd m2, m4, q0321
punpcklbw m2, m3
pmullw m1, m14, m2
paddw m0, m1
psrldq m1, m4, 3
pslldq m4, 3
punpcklbw m1, m3
punpckhbw m4, m3
paddw m1, m4
pmullw m1, m13
paddw m0, m1
psllw m2, 7
paddw m2, m10
paddsw m0, m2
psrldq m1, m5, 1
pslldq m2, m5, 1
punpcklbw m1, m3
punpckhbw m2, m3
paddw m1, m2
pmullw m1, m11
psrldq m2, m5, 2
pslldq m4, m5, 2
punpcklbw m2, m3
punpckhbw m4, m3
paddw m2, m4
pmullw m2, m12
paddw m1, m2
pshufd m4, m5, q0321
punpcklbw m4, m3
pmullw m2, m14, m4
paddw m1, m2
psrldq m2, m5, 3
pslldq m5, 3
punpcklbw m2, m3
punpckhbw m5, m3
paddw m2, m5
pmullw m2, m13
paddw m1, m2
psllw m4, 7
paddw m4, m10
paddsw m1, m4
%endif
%endmacro
%%h7
psraw m0, 3
psraw m1, 3
paddw m0, m15
paddw m1, m15
mova [t1+xq*2+ 0], m0
mova [t1+xq*2+16], m1
add xq, 16
jl .h_loop
ret
ALIGN function_align
.hv:
add lpfq, strideq
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movifnidn leftq, leftmp
mova m4, [lpfq+xq]
movd m5, [leftq]
add leftq, 4
pslldq m4, 4
por m4, m5
movifnidn leftmp, leftq
jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
mova m4, [lpfq+xq]
pshufb m4, [base+wiener_l_shuf]
%else
mova m5, [lpfq+xq]
pshufd m4, m5, q2103
punpcklbw m5, m5
punpcklwd m5, m5
movss m4, m5
%endif
jmp .hv_main
.hv_bottom:
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
.hv_loop:
movu m4, [lpfq+xq-4]
.hv_main:
movu m5, [lpfq+xq+4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp xd, -18
jl .hv_have_right
call .extend_right
.hv_have_right:
%%h7
%if ARCH_X86_64
mova m2, [t4+xq*2]
paddw m2, [t2+xq*2]
%else
mov r2, t4
mova m2, [r2+xq*2]
mov r2, t2
paddw m2, [r2+xq*2]
mov r2, t5
%endif
mova m3, [t3+xq*2]
%if ARCH_X86_64
mova m5, [t5+xq*2]
%else
mova m5, [r2+xq*2]
mov r2, t6
%endif
paddw m5, [t1+xq*2]
psraw m0, 3
psraw m1, 3
paddw m0, m15
paddw m1, m15
%if ARCH_X86_64
paddw m4, m0, [t6+xq*2]
%else
paddw m4, m0, [r2+xq*2]
mov r2, t4
%endif
mova [t0+xq*2], m0
punpcklwd m0, m2, m3
pmaddwd m0, m7
punpckhwd m2, m3
pmaddwd m2, m7
punpcklwd m3, m4, m5
pmaddwd m3, m6
punpckhwd m4, m5
pmaddwd m4, m6
paddd m0, m3
mova m3, [t3+xq*2+16]
paddd m4, m2
%if ARCH_X86_64
mova m2, [t4+xq*2+16]
paddw m2, [t2+xq*2+16]
mova m5, [t5+xq*2+16]
%else
mova m2, [r2+xq*2+16]
mov r2, t2
paddw m2, [r2+xq*2+16]
mov r2, t5
mova m5, [r2+xq*2+16]
mov r2, t6
%endif
paddw m5, [t1+xq*2+16]
packuswb m0, m4
%if ARCH_X86_64
paddw m4, m1, [t6+xq*2+16]
%else
paddw m4, m1, [r2+xq*2+16]
mov dstq, dstmp
%endif
mova [t0+xq*2+16], m1
punpcklwd m1, m2, m3
pmaddwd m1, m7
punpckhwd m2, m3
pmaddwd m2, m7
punpcklwd m3, m4, m5
pmaddwd m3, m6
punpckhwd m4, m5
pmaddwd m4, m6
paddd m1, m3
paddd m2, m4
packuswb m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+xq], m0
add xq, 16
jl .hv_loop
add dstq, strideq
%if ARCH_X86_64
mov t6, t5
mov t5, t4
mov t4, t3
mov t3, t2
mov t2, t1
mov t1, t0
mov t0, t6
%else
mov dstmp, dstq
mov r1, t5
mov r2, t4
mov t6, r1
mov t5, r2
mov t4, t3
mov t3, t2
mov t2, t1
mov t1, t0
mov t0, r1
%endif
ret
%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
.v:
mov xq, wq
.v_loop:
%if ARCH_X86_64
mova m1, [t4+xq*2]
paddw m1, [t2+xq*2]
%else
mov r2, t4
mova m1, [r2+xq*2]
mov r2, t2
paddw m1, [r2+xq*2]
mov r2, t6
%endif
mova m2, [t3+xq*2]
mova m4, [t1+xq*2]
%if ARCH_X86_64
paddw m3, m4, [t6+xq*2]
paddw m4, [t5+xq*2]
%else
paddw m3, m4, [r2+xq*2]
mov r2, t5
paddw m4, [r2+xq*2]
mov r2, t4
%endif
punpcklwd m0, m1, m2
pmaddwd m0, m7
punpckhwd m1, m2
pmaddwd m1, m7
punpcklwd m2, m3, m4
pmaddwd m2, m6
punpckhwd m3, m4
pmaddwd m3, m6
paddd m0, m2
paddd m1, m3
%if ARCH_X86_64
mova m2, [t4+xq*2+16]
paddw m2, [t2+xq*2+16]
%else
mova m2, [r2+xq*2+16]
mov r2, t2
paddw m2, [r2+xq*2+16]
mov r2, t6
%endif
mova m3, [t3+xq*2+16]
mova m5, [t1+xq*2+16]
%if ARCH_X86_64
paddw m4, m5, [t6+xq*2+16]
paddw m5, [t5+xq*2+16]
%else
paddw m4, m5, [r2+xq*2+16]
mov r2, t5
paddw m5, [r2+xq*2+16]
movifnidn dstq, dstmp
%endif
packuswb m0, m1
punpcklwd m1, m2, m3
pmaddwd m1, m7
punpckhwd m2, m3
pmaddwd m2, m7
punpcklwd m3, m4, m5
pmaddwd m3, m6
punpckhwd m4, m5
pmaddwd m4, m6
paddd m1, m3
paddd m2, m4
packuswb m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+xq], m0
add xq, 16
jl .v_loop
add dstq, strideq
%if ARCH_X86_64
mov t6, t5
mov t5, t4
%else
mov dstmp, dstq
mov r1, t5
mov r2, t4
mov t6, r1
mov t5, r2
%endif
mov t4, t3
mov t3, t2
mov t2, t1
ret
%endif
%if ARCH_X86_64
cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
w, h, edge, flt, x
mov fltq, r6mp
mov wd, wm
movifnidn hd, hm
mov edged, r7m
movq m14, [fltq]
add lpfq, wq
movq m7, [fltq+16]
add dstq, wq
mova m8, [pw_m16380]
lea t1, [rsp+wq*2+16]
mova m15, [pw_2056]
neg wq
%if cpuflag(ssse3)
pshufb m14, [wiener_init]
mova m9, [wiener_shufB]
pshufd m13, m14, q3333 ; x1 x2
mova m10, [wiener_shufC]
punpcklqdq m14, m14 ; x3
mova m11, [wiener_shufD]
mova m12, [wiener_l_shuf]
%else
punpcklwd m14, m14
pshufd m11, m14, q1111 ; x1
pshufd m13, m14, q2222 ; x2
pshufd m14, m14, q3333 ; x3
%endif
%else
%if cpuflag(ssse3)
%define stk_off 80
%else
%define m11 [stk+80]
%define stk_off 96
%endif
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
%define stk esp
%define leftmp [stk+28]
%define m8 [base+pw_m16380]
%define m12 [base+wiener_l_shuf]
%define m14 [stk+48]
mov r1, r6m ; flt
mov r0, r0m ; dst
mov r4, r4m ; w
mov lpfq, lpfm
mov r2, r7m ; edge
mov r5, r5m ; h
movq m2, [r1+ 0]
movq m7, [r1+16]
add r0, r4
mov r1, r1m ; stride
add lpfq, r4
mov edged, r2
mov r2, r2m ; left
mov dstmp, r0
lea t1, [rsp+r4*2+stk_off]
mov hd, r5
neg r4
LEA r6, pb_right_ext_mask+21
mov wq, r4
mov strideq, r1
mov leftmp, r2
mov r4, r1
%if cpuflag(ssse3)
pshufb m2, [base+wiener_init]
pshufd m1, m2, q3333
punpcklqdq m2, m2
%else
punpcklwd m2, m2
pshufd m0, m2, q1111
pshufd m1, m2, q2222
pshufd m2, m2, q3333
mova m11, m0
%endif
mova m13, m1
mova m14, m2
%endif
psllw m7, 5
pshufd m6, m7, q0000 ; __ y1
pshufd m7, m7, q1111 ; y2 y3
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t4, t1
add t1, 384*2
call .h_top
lea xq, [lpfq+tmpstrideq*4]
mov lpfq, dstmp
mov t3, t1
add t1, 384*2
add xq, tmpstrideq
mov [rsp], xq ; below
call .h
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v2
.main:
mov t0, t4
.main_loop:
call .hv
dec hd
jnz .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .v2
mov lpfq, [rsp]
call .hv_bottom
add lpfq, strideq
call .hv_bottom
.end:
RET
.no_top:
lea t3, [lpfq+tmpstrideq*4]
mov lpfq, dstmp
lea t3, [t3+tmpstrideq*2]
mov [rsp], t3
call .h
mov t4, t1
mov t3, t1
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v2
lea t0, [t1+384*2]
call .hv
dec hd
jz .v2
add t0, 384*6
call .hv
dec hd
jnz .main
.v2:
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
add dstq, strideq
mov t4, t3
mov t3, t2
mov t2, t1
movifnidn dstmp, dstq
.v1:
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
jmp .end
.h:
%define stk esp+4
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movifnidn leftq, leftmp
mova m4, [lpfq+xq]
movd m5, [leftq]
add leftq, 4
pslldq m4, 4
por m4, m5
movifnidn leftmp, leftq
jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
mova m4, [lpfq+xq]
pshufb m4, m12
%else
mova m5, [lpfq+xq]
pshufd m4, m5, q2103
punpcklbw m5, m5
punpcklwd m5, m5
movss m4, m5
%endif
jmp .h_main
.h_top:
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu m4, [lpfq+xq-4]
.h_main:
movu m5, [lpfq+xq+4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp xd, -17
jl .h_have_right
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.h_have_right:
%macro %%h5 0
%if cpuflag(ssse3)
pshufb m0, m4, m9
pmaddubsw m0, m13
pshufb m1, m5, m9
pmaddubsw m1, m13
pshufb m2, m4, m10
pmaddubsw m2, m13
pshufb m3, m5, m10
pmaddubsw m3, m13
pshufb m4, m11
paddw m0, m2
pmullw m2, m14, m4
pshufb m5, m11
paddw m1, m3
pmullw m3, m14, m5
psllw m4, 7
psllw m5, 7
paddw m4, m8
paddw m5, m8
paddw m0, m2
paddw m1, m3
paddsw m0, m4
paddsw m1, m5
%else
psrldq m0, m4, 2
pslldq m1, m4, 2
pxor m3, m3
punpcklbw m0, m3
punpckhbw m1, m3
paddw m0, m1
pmullw m0, m11
pshufd m2, m4, q0321
punpcklbw m2, m3
pmullw m1, m14, m2
paddw m0, m1
psrldq m1, m4, 3
pslldq m4, 3
punpcklbw m1, m3
punpckhbw m4, m3
paddw m1, m4
pmullw m1, m13
paddw m0, m1
psllw m2, 7
paddw m2, m8
paddsw m0, m2
psrldq m1, m5, 2
pslldq m4, m5, 2
punpcklbw m1, m3
punpckhbw m4, m3
paddw m1, m4
pmullw m1, m11
pshufd m4, m5, q0321
punpcklbw m4, m3
pmullw m2, m14, m4
paddw m1, m2
psrldq m2, m5, 3
pslldq m5, 3
punpcklbw m2, m3
punpckhbw m5, m3
paddw m2, m5
pmullw m2, m13
paddw m1, m2
psllw m4, 7
paddw m4, m8
paddsw m1, m4
%endif
%endmacro
%%h5
psraw m0, 3
psraw m1, 3
paddw m0, m15
paddw m1, m15
mova [t1+xq*2+ 0], m0
mova [t1+xq*2+16], m1
add xq, 16
jl .h_loop
ret
ALIGN function_align
.hv:
add lpfq, strideq
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movifnidn leftq, leftmp
mova m4, [lpfq+xq]
movd m5, [leftq]
add leftq, 4
pslldq m4, 4
por m4, m5
movifnidn leftmp, leftq
jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
mova m4, [lpfq+xq]
pshufb m4, m12
%else
mova m5, [lpfq+xq]
pshufd m4, m5, q2103
punpcklbw m5, m5
punpcklwd m5, m5
movss m4, m5
%endif
jmp .hv_main
.hv_bottom:
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
.hv_loop:
movu m4, [lpfq+xq-4]
.hv_main:
movu m5, [lpfq+xq+4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp xd, -17
jl .hv_have_right
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.hv_have_right:
%%h5
mova m2, [t3+xq*2]
paddw m2, [t1+xq*2]
psraw m0, 3
psraw m1, 3
paddw m0, m15
paddw m1, m15
%if ARCH_X86_64
mova m3, [t2+xq*2]
paddw m4, m0, [t4+xq*2]
%else
mov r2, t2
mova m3, [r2+xq*2]
mov r2, t4
paddw m4, m0, [r2+xq*2]
%endif
mova [t0+xq*2], m0
punpcklwd m0, m2, m3
pmaddwd m0, m7
punpckhwd m2, m3
pmaddwd m2, m7
punpcklwd m3, m4, m4
pmaddwd m3, m6
punpckhwd m4, m4
pmaddwd m4, m6
paddd m0, m3
paddd m4, m2
mova m2, [t3+xq*2+16]
paddw m2, [t1+xq*2+16]
packuswb m0, m4
%if ARCH_X86_64
mova m3, [t2+xq*2+16]
paddw m4, m1, [t4+xq*2+16]
%else
paddw m4, m1, [r2+xq*2+16]
mov r2, t2
mova m3, [r2+xq*2+16]
mov dstq, dstmp
%endif
mova [t0+xq*2+16], m1
punpcklwd m1, m2, m3
pmaddwd m1, m7
punpckhwd m2, m3
pmaddwd m2, m7
punpcklwd m3, m4, m4
pmaddwd m3, m6
punpckhwd m4, m4
pmaddwd m4, m6
paddd m1, m3
paddd m2, m4
packuswb m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+xq], m0
add xq, 16
jl .hv_loop
add dstq, strideq
mov t4, t3
mov t3, t2
mov t2, t1
mov t1, t0
mov t0, t4
movifnidn dstmp, dstq
ret
%if cpuflag(ssse3)
.v:
mov xq, wq
.v_loop:
mova m3, [t1+xq*2]
paddw m1, m3, [t3+xq*2]
%if ARCH_X86_64
mova m2, [t2+xq*2]
paddw m3, [t4+xq*2]
%else
mov r2, t2
mova m2, [r2+xq*2]
mov r2, t4
paddw m3, [r2+xq*2]
%endif
punpcklwd m0, m1, m2
pmaddwd m0, m7
punpckhwd m1, m2
pmaddwd m1, m7
punpcklwd m2, m3
pmaddwd m2, m6
punpckhwd m3, m3
pmaddwd m3, m6
paddd m0, m2
paddd m1, m3
mova m4, [t1+xq*2+16]
paddw m2, m4, [t3+xq*2+16]
%if ARCH_X86_64
mova m3, [t2+xq*2+16]
paddw m4, [t4+xq*2+16]
%else
paddw m4, [r2+xq*2+16]
mov r2, t2
mova m3, [r2+xq*2+16]
mov dstq, dstmp
%endif
packuswb m0, m1
punpcklwd m1, m2, m3
pmaddwd m1, m7
punpckhwd m2, m3
pmaddwd m2, m7
punpcklwd m3, m4
pmaddwd m3, m6
punpckhwd m4, m4
pmaddwd m4, m6
paddd m1, m3
paddd m2, m4
packuswb m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+xq], m0
add xq, 16
jl .v_loop
ret
%endif
%endmacro
INIT_XMM sse2
WIENER
INIT_XMM ssse3
WIENER
;;;;;;;;;;;;;;;;;;;;;;;;;;
;; self-guided ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;
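; GATHERDD: emulate a 4x dword gather from the sgr_x_by_x table with scalar
; pextrw/pinsrw loads (no SIMD gather instruction is available before AVX2)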
%macro GATHERDD 3 ; dst, src, tmp
movd %3d, %2
%if ARCH_X86_64
movd %1, [r13+%3]
pextrw %3d, %2, 2
pinsrw %1, [r13+%3+2], 3
pextrw %3d, %2, 4
pinsrw %1, [r13+%3+2], 5
pextrw %3d, %2, 6
pinsrw %1, [r13+%3+2], 7
%else
movd %1, [base+sgr_x_by_x-0xf03+%3]
pextrw %3, %2, 2
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
pextrw %3, %2, 4
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
pextrw %3, %2, 6
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
%endif
%endmacro
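; GATHER_X_BY_X: look up 8 sgr_x_by_x entries (indices in src0/src1), keep one
; byte per lane and pack the results into 8 words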
%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
%if ARCH_X86_64
%define tmp r14
%else
%define tmp %4
%endif
GATHERDD %1, %2, tmp
GATHERDD %2, %3, tmp
movif32 %4, %5
psrld %1, 24
psrld %2, 24
packssdw %1, %2
%endmacro
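; MULLD: dst *= src per 32-bit lane, built from 16-bit multiplies since pmulld
; requires SSE4.1; src is expected to hold a 16-bit factor replicated in each word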
%macro MULLD 3 ; dst, src, tmp
pmulhuw %3, %1, %2
pmullw %1, %2
pslld %3, 16
paddd %1, %3
%endmacro
%if ARCH_X86_32
DECLARE_REG_TMP 0, 1, 2, 3, 5
%if STACK_ALIGNMENT < 16
%assign extra_stack 5*16
%else
%assign extra_stack 3*16
%endif
cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
dst, stride, left, lpf, w
%if STACK_ALIGNMENT < 16
%define dstm dword [esp+calloff+16*0+4*6]
%define stridemp dword [esp+calloff+16*0+4*7]
%define leftm dword [esp+calloff+16*3+4*0]
%define lpfm dword [esp+calloff+16*3+4*1]
%define w0m dword [esp+calloff+16*3+4*2]
%define hd dword [esp+calloff+16*3+4*3]
%define edgeb byte [esp+calloff+16*3+4*4]
%define edged dword [esp+calloff+16*3+4*4]
%define leftmp leftm
%else
%define w0m wm
%define hd dword r5m
%define edgeb byte r7m
%define edged dword r7m
%endif
%define hvsrcm dword [esp+calloff+4*0]
%define w1m dword [esp+calloff+4*1]
%define t0m dword [esp+calloff+4*2]
%define t2m dword [esp+calloff+4*3]
%define t3m dword [esp+calloff+4*4]
%define t4m dword [esp+calloff+4*5]
%define m8 [base+pb_1]
%define m9 [esp+calloff+16*2]
%define m10 [base+pd_0xf00800a4]
%define m11 [base+sgr_lshuf5]
%define m12 [base+pd_34816]
%define m13 [base+pb_0to15]
%define r10 r4
%define base r6-$$
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
mov leftq, [rstk+stack_offset+12]
mov lpfq, [rstk+stack_offset+16]
mov wd, [rstk+stack_offset+20]
mov dstm, dstq
mov stridemp, strideq
mov leftm, leftq
mov r1, [rstk+stack_offset+24]
mov r2, [rstk+stack_offset+32]
mov lpfm, lpfq
mov hd, r1
mov edged, r2
%endif
%else
DECLARE_REG_TMP 8, 7, 9, 11, 12
cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
mov wd, wm
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
movifnidn hd, hm
mov edged, r7m
movu m9, [paramsq]
add lpfq, wq
mova m8, [pb_1]
lea t1, [rsp+wq*2+20]
mova m10, [pd_0xf00800a4]
add dstq, wq
lea t3, [rsp+wq*4+400*12+16]
mova m12, [pd_34816] ; (1 << 11) + (1 << 15)
lea t4, [rsp+wq*2+400*20+16]
pshufhw m7, m9, q0000
pshufb m9, [pw_256] ; s0
punpckhqdq m7, m7 ; w0
neg wq
mova m13, [pb_0to15]
pxor m6, m6
mova m11, [sgr_lshuf5]
psllw m7, 4
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
movu m1, [r1]
add lpfm, wq
lea t1, [rsp+extra_stack+wq*2+20]
add dstq, wq
lea t3, [rsp+extra_stack+wq*4+400*12+16]
mov dstm, dstq
lea t4, [rsp+extra_stack+wq*2+400*20+16]
mov t3m, t3
pshufhw m7, m1, q0000
mov t4m, t4
pshufb m1, [base+pw_256] ; s0
punpckhqdq m7, m7 ; w0
psllw m7, 4
neg wq
mova m9, m1
pxor m6, m6
mov w1m, wd
sub wd, 2
mov lpfq, lpfm
mov w0m, wd
%define strideq r5
%endif
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, stridemp
movif32 t2m, t1
mov t2, t1
call .top_fixup
add t1, 400*6
call .h_top
movif32 strideq, stridemp
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
add r10, strideq
mov lpfm, r10 ; below
movif32 t0m, t2
mov t0, t2
dec hd
jz .height1
or edged, 16
call .h
.main:
add lpfq, stridemp
movif32 t4, t4m
call .hv
call .prep_n
sub hd, 2
jl .extend_bottom
.main_loop:
movif32 lpfq, hvsrcm
add lpfq, stridemp
%if ARCH_X86_64
test hb, hb
%else
mov r4, hd
test r4, r4
%endif
jz .odd_height
call .h
add lpfq, stridemp
call .hv
movif32 dstq, dstm
call .n0
call .n1
sub hd, 2
movif32 t0, t0m
jge .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, lpfm
call .h_top
add lpfq, stridemp
call .hv_bottom
.end:
movif32 dstq, dstm
call .n0
call .n1
.end2:
RET
.height1:
movif32 t4, t4m
call .hv
call .prep_n
jmp .odd_height_end
.odd_height:
call .hv
movif32 dstq, dstm
call .n0
call .n1
.odd_height_end:
call .v
movif32 dstq, dstm
call .n0
jmp .end2
.extend_bottom:
call .v
jmp .end
.no_top:
movif32 strideq, stridemp
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov lpfm, r10
call .h
lea t2, [t1+400*6]
movif32 t2m, t2
call .top_fixup
dec hd
jz .no_top_height1
or edged, 16
mov t0, t1
mov t1, t2
movif32 t0m, t0
jmp .main
.no_top_height1:
movif32 t3, t3m
movif32 t4, t4m
call .v
call .prep_n
jmp .odd_height_end
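; pad past the right edge of the row by blending in the last valid pixel,
; using a lane mask built with pcmpgtb against pb_0to15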
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
movd m1, wd
movd m3, [lpfq-1]
pshufb m1, m6
pshufb m3, m6
psubb m2, m8, m1
pcmpgtb m2, m13
pand m5, m2
pandn m2, m3
por m5, m2
ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
lea wq, [r4-2]
%else
%define leftq r4
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 13
jmp .h_main
.h_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, m11
jmp .h_main
.h_top:
%if ARCH_X86_64
lea wq, [r4-2]
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movif32 wq, w0m
.h_loop:
movu m5, [lpfq+wq-1]
.h_main:
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp wd, -10
jl .h_have_right
call .extend_right
.h_have_right:
punpcklbw m4, m5, m6
punpckhbw m5, m6
palignr m2, m5, m4, 2
paddw m0, m4, m2
palignr m3, m5, m4, 6
paddw m0, m3
punpcklwd m1, m2, m3
pmaddwd m1, m1
punpckhwd m2, m3
pmaddwd m2, m2
palignr m5, m4, 8
paddw m0, m5
punpcklwd m3, m4, m5
pmaddwd m3, m3
paddd m1, m3
punpckhwd m3, m4, m5
pmaddwd m3, m3
shufps m4, m5, q2121
paddw m0, m4 ; sum
punpcklwd m5, m4, m6
pmaddwd m5, m5
punpckhwd m4, m6
pmaddwd m4, m4
paddd m2, m3
test edgeb, 16 ; y > 0
jz .h_loop_end
paddw m0, [t1+wq*2+400*0]
paddd m1, [t1+wq*2+400*2]
paddd m2, [t1+wq*2+400*4]
.h_loop_end:
paddd m1, m5 ; sumsq
paddd m2, m4
mova [t1+wq*2+400*0], m0
mova [t1+wq*2+400*2], m1
mova [t1+wq*2+400*4], m2
add wq, 8
jl .h_loop
ret
.top_fixup:
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wd, w0m
%endif
.top_fixup_loop: ; the sums of the first row need to be doubled
mova m0, [t1+wq*2+400*0]
mova m1, [t1+wq*2+400*2]
mova m2, [t1+wq*2+400*4]
paddw m0, m0
paddd m1, m1
paddd m2, m2
mova [t2+wq*2+400*0], m0
mova [t2+wq*2+400*2], m1
mova [t2+wq*2+400*4], m2
add wq, 8
jl .top_fixup_loop
ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 13
jmp .hv_main
.hv_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, m11
jmp .hv_main
.hv_bottom:
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movif32 wq, w0m
%if ARCH_X86_32
jmp .hv_loop_start
%endif
.hv_loop:
movif32 lpfq, hvsrcm
.hv_loop_start:
movu m5, [lpfq+wq-1]
.hv_main:
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp wd, -10
jl .hv_have_right
call .extend_right
.hv_have_right:
movif32 t3, hd
punpcklbw m4, m5, m6
punpckhbw m5, m6
palignr m3, m5, m4, 2
paddw m0, m4, m3
palignr m1, m5, m4, 6
paddw m0, m1
punpcklwd m2, m3, m1
pmaddwd m2, m2
punpckhwd m3, m1
pmaddwd m3, m3
palignr m5, m4, 8
paddw m0, m5
punpcklwd m1, m4, m5
pmaddwd m1, m1
paddd m2, m1
punpckhwd m1, m4, m5
pmaddwd m1, m1
shufps m4, m5, q2121
paddw m0, m4 ; h sum
punpcklwd m5, m4, m6
pmaddwd m5, m5
punpckhwd m4, m6
pmaddwd m4, m4
paddd m3, m1
paddd m2, m5 ; h sumsq
paddd m3, m4
paddw m1, m0, [t1+wq*2+400*0]
paddd m4, m2, [t1+wq*2+400*2]
paddd m5, m3, [t1+wq*2+400*4]
%if ARCH_X86_64
test hd, hd
%else
test t3, t3
%endif
jz .hv_last_row
.hv_main2:
paddw m1, [t2+wq*2+400*0] ; hv sum
paddd m4, [t2+wq*2+400*2] ; hv sumsq
paddd m5, [t2+wq*2+400*4]
mova [t0+wq*2+400*0], m0
pslld m0, m4, 4
mova [t0+wq*2+400*2], m2
mova [t0+wq*2+400*4], m3
pslld m2, m4, 3
paddd m4, m0
pslld m0, m5, 4
paddd m4, m2 ; a * 25
pslld m2, m5, 3
paddd m5, m0
paddd m5, m2
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
pmaddwd m2, m0, m0 ; b * b
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m2 ; p * s
MULLD m5, m9, m2
pmaddwd m0, m10 ; b * 164
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, t2, t2m
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m2
MULLD m1, m5, m2
paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m12
mova [t4+wq*2+4], m3
psrld m0, 12 ; b
psrld m1, 12
mova [t3+wq*4+ 8], m0
mova [t3+wq*4+24], m1
add wq, 8
jl .hv_loop
mov t2, t1
mov t1, t0
mov t0, t2
movif32 t2m, t2
movif32 t0m, t0
ret
.hv_last_row: ; esoteric edge case for odd heights
mova [t1+wq*2+400*0], m1
paddw m1, m0
mova [t1+wq*2+400*2], m4
paddd m4, m2
mova [t1+wq*2+400*4], m5
paddd m5, m3
jmp .hv_main2
.v: ; vertical boxsum + ab
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wd, w0m
%endif
.v_loop:
mova m0, [t1+wq*2+400*0]
mova m2, [t1+wq*2+400*2]
mova m3, [t1+wq*2+400*4]
paddw m1, m0, [t2+wq*2+400*0]
paddd m4, m2, [t2+wq*2+400*2]
paddd m5, m3, [t2+wq*2+400*4]
paddw m0, m0
paddd m2, m2
paddd m3, m3
paddw m1, m0 ; hv sum
paddd m4, m2 ; hv sumsq
pslld m0, m4, 4
paddd m5, m3
pslld m2, m4, 3
paddd m4, m0
pslld m0, m5, 4
paddd m4, m2 ; a * 25
pslld m2, m5, 3
paddd m5, m0
paddd m5, m2
punpcklwd m0, m1, m6
punpckhwd m1, m6
pmaddwd m2, m0, m0 ; b * b
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m2 ; p * s
MULLD m5, m9, m2
pmaddwd m0, m10 ; b * 164
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, t2, t2m
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m2
MULLD m1, m5, m2
paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m12
mova [t4+wq*2+4], m3
psrld m0, 12 ; b
psrld m1, 12
mova [t3+wq*4+ 8], m0
mova [t3+wq*4+24], m1
add wq, 8
jl .v_loop
ret
.prep_n: ; initial neighbor setup
movif64 wq, r4
movif32 wd, w1m
.prep_n_loop:
movu m0, [t4+wq*2+ 2]
movu m3, [t4+wq*2+ 4]
movu m1, [t3+wq*4+ 4]
movu m4, [t3+wq*4+ 8]
movu m2, [t3+wq*4+20]
movu m5, [t3+wq*4+24]
paddw m3, m0
paddd m4, m1
paddd m5, m2
paddw m3, [t4+wq*2+ 0]
paddd m4, [t3+wq*4+ 0]
paddd m5, [t3+wq*4+16]
paddw m0, m3
psllw m3, 2
paddd m1, m4
pslld m4, 2
paddd m2, m5
pslld m5, 2
paddw m0, m3 ; a 565
paddd m1, m4 ; b 565
paddd m2, m5
mova [t4+wq*2+400*2+ 0], m0
mova [t3+wq*4+400*4+ 0], m1
mova [t3+wq*4+400*4+16], m2
add wq, 8
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
movif64 wq, r4
movif32 wd, w1m
.n0_loop:
movu m0, [t4+wq*2+ 2]
movu m3, [t4+wq*2+ 4]
movu m1, [t3+wq*4+ 4]
movu m4, [t3+wq*4+ 8]
movu m2, [t3+wq*4+20]
movu m5, [t3+wq*4+24]
paddw m3, m0
paddd m4, m1
paddd m5, m2
paddw m3, [t4+wq*2+ 0]
paddd m4, [t3+wq*4+ 0]
paddd m5, [t3+wq*4+16]
paddw m0, m3
psllw m3, 2
paddd m1, m4
pslld m4, 2
paddd m2, m5
pslld m5, 2
paddw m0, m3 ; a 565
paddd m1, m4 ; b 565
paddd m2, m5
paddw m3, m0, [t4+wq*2+400*2+ 0]
paddd m4, m1, [t3+wq*4+400*4+ 0]
paddd m5, m2, [t3+wq*4+400*4+16]
mova [t4+wq*2+400*2+ 0], m0
mova [t3+wq*4+400*4+ 0], m1
mova [t3+wq*4+400*4+16], m2
movq m0, [dstq+wq]
punpcklbw m0, m6
punpcklwd m1, m0, m6 ; src
punpcklwd m2, m3, m6 ; a
pmaddwd m2, m1 ; a * src
punpckhwd m1, m0, m6
punpckhwd m3, m6
pmaddwd m3, m1
psubd m4, m2 ; b - a * src + (1 << 8)
psubd m5, m3
psrad m4, 9
psrad m5, 9
packssdw m4, m5
pmulhrsw m4, m7
paddw m0, m4
packuswb m0, m0
movq [dstq+wq], m0
add wq, 8
jl .n0_loop
add dstq, stridemp
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
movif64 wq, r4
movif32 wd, w1m
.n1_loop:
movq m0, [dstq+wq]
mova m3, [t4+wq*2+400*2+ 0]
mova m4, [t3+wq*4+400*4+ 0]
mova m5, [t3+wq*4+400*4+16]
punpcklbw m0, m6
punpcklwd m1, m0, m6 ; src
punpcklwd m2, m3, m6 ; a
pmaddwd m2, m1 ; a * src
punpckhwd m1, m0, m6
punpckhwd m3, m6
pmaddwd m3, m1
psubd m4, m2 ; b - a * src + (1 << 7)
psubd m5, m3
psrad m4, 8
psrad m5, 8
packssdw m4, m5
pmulhrsw m4, m7
paddw m0, m4
packuswb m0, m0
movq [dstq+wq], m0
add wq, 8
jl .n1_loop
add dstq, stridemp
movif32 dstm, dstq
ret
%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%assign extra_stack 4*16
%else
%assign extra_stack 2*16
%endif
cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
dst, stride, left, lpf, w
%if STACK_ALIGNMENT < 16
%define dstm dword [esp+calloff+16*2+4*0]
%define stridemp dword [esp+calloff+16*2+4*1]
%define leftm dword [esp+calloff+16*2+4*2]
%define lpfm dword [esp+calloff+16*2+4*3]
%define w0m dword [esp+calloff+16*2+4*4]
%define hd dword [esp+calloff+16*2+4*5]
%define edgeb byte [esp+calloff+16*2+4*6]
%define edged dword [esp+calloff+16*2+4*6]
%define leftmp leftm
%else
%define w0m wm
%define hd dword r5m
%define edgeb byte r7m
%define edged dword r7m
%endif
%define hvsrcm dword [esp+calloff+4*0]
%define w1m dword [esp+calloff+4*1]
%define t3m dword [esp+calloff+4*2]
%define t4m dword [esp+calloff+4*3]
%define m8 [base+pb_0to15]
%define m9 [esp+calloff+16*1]
%define m10 [base+pd_0xf00801c7]
%define m11 [base+pd_34816]
%define m12 m6
%define m13 [base+sgr_lshuf3]
%define base r6-$$
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
mov leftq, [rstk+stack_offset+12]
mov lpfq, [rstk+stack_offset+16]
mov wd, [rstk+stack_offset+20]
mov dstm, dstq
mov stridemp, strideq
mov leftm, leftq
mov r1, [rstk+stack_offset+24]
mov r2, [rstk+stack_offset+32]
mov lpfm, lpfq
mov hd, r1
mov edged, r2
%endif
%else
cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
mov wd, wm
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
mov hd, hm
mov edged, r7m
movq m9, [paramsq+4]
add lpfq, wq
lea t1, [rsp+wq*2+12]
mova m8, [pb_0to15]
add dstq, wq
lea t3, [rsp+wq*4+400*12+8]
mova m10, [pd_0xf00801c7]
lea t4, [rsp+wq*2+400*32+8]
mova m11, [pd_34816]
pshuflw m7, m9, q3333
pshufb m9, [pw_256] ; s1
punpcklqdq m7, m7 ; w1
neg wq
pxor m6, m6
mova m13, [sgr_lshuf3]
psllw m7, 4
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
movq m1, [r1+4]
add lpfm, wq
lea t1, [rsp+extra_stack+wq*2+20]
add dstq, wq
lea t3, [rsp+extra_stack+wq*4+400*12+16]
mov dstm, dstq
lea t4, [rsp+extra_stack+wq*2+400*32+16]
mov t3m, t3
pshuflw m7, m1, q3333
mov t4m, t4
pshufb m1, [base+pw_256] ; s1
punpcklqdq m7, m7 ; w1
psllw m7, 4
neg wq
mova m9, m1
pxor m6, m6
mov w1m, wd
sub wd, 2
mov lpfq, lpfm
mov w0m, wd
%define strideq r5
%endif
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, stridemp
mov t2, t1
add t1, 400*6
call .h_top
movif32 strideq, stridemp
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
add r10, strideq
mov lpfm, r10 ; below
movif32 t4, t4m
call .hv0
.main:
dec hd
jz .height1
movif32 lpfq, hvsrcm
add lpfq, stridemp
call .hv1
call .prep_n
sub hd, 2
jl .extend_bottom
.main_loop:
movif32 lpfq, hvsrcm
add lpfq, stridemp
call .hv0
%if ARCH_X86_64
test hb, hb
%else
mov r4, hd
test r4, r4
%endif
jz .odd_height
movif32 lpfq, hvsrcm
add lpfq, stridemp
call .hv1
call .n0
call .n1
sub hd, 2
jge .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, lpfm
call .hv0_bottom
movif32 lpfq, hvsrcm
add lpfq, stridemp
call .hv1_bottom
.end:
call .n0
call .n1
.end2:
RET
.height1:
call .v1
call .prep_n
jmp .odd_height_end
.odd_height:
call .v1
call .n0
call .n1
.odd_height_end:
call .v0
call .v1
call .n0
jmp .end2
.extend_bottom:
call .v0
call .v1
jmp .end
.no_top:
movif32 strideq, stridemp
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov lpfm, r10
call .h
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wq, w0m
mov hvsrcm, lpfq
%endif
lea t2, [t1+400*6]
.top_fixup_loop:
mova m0, [t1+wq*2+400*0]
mova m1, [t1+wq*2+400*2]
mova m2, [t1+wq*2+400*4]
mova [t2+wq*2+400*0], m0
mova [t2+wq*2+400*2], m1
mova [t2+wq*2+400*4], m2
add wq, 8
jl .top_fixup_loop
movif32 t3, t3m
movif32 t4, t4m
call .v0
jmp .main
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
movd m0, [lpfq-1]
movd m1, wd
mova m3, m8
pshufb m0, m6
pshufb m1, m6
mova m2, m6
psubb m2, m1
pcmpgtb m2, m3
pand m5, m2
pandn m2, m0
por m5, m2
ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
lea wq, [r4-2]
%else
%define leftq r4
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 14
jmp .h_main
.h_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, m13
jmp .h_main
.h_top:
%if ARCH_X86_64
lea wq, [r4-2]
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movif32 wq, w0m
.h_loop:
movu m5, [lpfq+wq]
.h_main:
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp wd, -9
jl .h_have_right
call .extend_right
.h_have_right:
punpcklbw m4, m5, m6
punpckhbw m5, m6
palignr m0, m5, m4, 2
paddw m1, m4, m0
punpcklwd m2, m4, m0
pmaddwd m2, m2
punpckhwd m3, m4, m0
pmaddwd m3, m3
palignr m5, m4, 4
paddw m1, m5 ; sum
punpcklwd m4, m5, m6
pmaddwd m4, m4
punpckhwd m5, m6
pmaddwd m5, m5
paddd m2, m4 ; sumsq
paddd m3, m5
mova [t1+wq*2+400*0], m1
mova [t1+wq*2+400*2], m2
mova [t1+wq*2+400*4], m3
add wq, 8
jl .h_loop
ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 14
jmp .hv0_main
.hv0_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, m13
jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
movif32 wq, w0m
%if ARCH_X86_32
jmp .hv0_loop_start
%endif
.hv0_loop:
movif32 lpfq, hvsrcm
.hv0_loop_start:
movu m5, [lpfq+wq]
.hv0_main:
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv0_have_right
cmp wd, -9
jl .hv0_have_right
call .extend_right
.hv0_have_right:
punpcklbw m4, m5, m6
punpckhbw m5, m6
palignr m0, m5, m4, 2
paddw m1, m4, m0
punpcklwd m2, m4, m0
pmaddwd m2, m2
punpckhwd m3, m4, m0
pmaddwd m3, m3
palignr m5, m4, 4
paddw m1, m5 ; sum
punpcklwd m4, m5, m6
pmaddwd m4, m4
punpckhwd m5, m6
pmaddwd m5, m5
paddd m2, m4 ; sumsq
paddd m3, m5
paddw m0, m1, [t1+wq*2+400*0]
paddd m4, m2, [t1+wq*2+400*2]
paddd m5, m3, [t1+wq*2+400*4]
mova [t1+wq*2+400*0], m1
mova [t1+wq*2+400*2], m2
mova [t1+wq*2+400*4], m3
paddw m1, m0, [t2+wq*2+400*0]
paddd m2, m4, [t2+wq*2+400*2]
paddd m3, m5, [t2+wq*2+400*4]
mova [t2+wq*2+400*0], m0
mova [t2+wq*2+400*2], m4
mova [t2+wq*2+400*4], m5
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a * 9
paddd m5, m3
punpcklwd m0, m1, m6 ; b
pmaddwd m2, m0, m0 ; b * b
punpckhwd m1, m6
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m12 ; p * s
MULLD m5, m9, m12
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m12
MULLD m1, m5, m12
%if ARCH_X86_32
pxor m6, m6
%endif
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*2+4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+ 8], m0
mova [t3+wq*4+24], m1
add wq, 8
jl .hv0_loop
ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 14
jmp .hv1_main
.hv1_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, m13
jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
movif32 wq, w0m
%if ARCH_X86_32
jmp .hv1_loop_start
%endif
.hv1_loop:
movif32 lpfq, hvsrcm
.hv1_loop_start:
movu m5, [lpfq+wq]
.hv1_main:
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv1_have_right
cmp wd, -9
jl .hv1_have_right
call .extend_right
.hv1_have_right:
punpcklbw m4, m5, m6
punpckhbw m5, m6
palignr m1, m5, m4, 2
paddw m0, m4, m1
punpcklwd m2, m4, m1
pmaddwd m2, m2
punpckhwd m3, m4, m1
pmaddwd m3, m3
palignr m5, m4, 4
paddw m0, m5 ; h sum
punpcklwd m1, m5, m6
pmaddwd m1, m1
punpckhwd m5, m6
pmaddwd m5, m5
paddd m2, m1 ; h sumsq
paddd m3, m5
paddw m1, m0, [t2+wq*2+400*0]
paddd m4, m2, [t2+wq*2+400*2]
paddd m5, m3, [t2+wq*2+400*4]
mova [t2+wq*2+400*0], m0
mova [t2+wq*2+400*2], m2
mova [t2+wq*2+400*4], m3
pslld m2, m4, 3
pslld m3, m5, 3
paddd m4, m2 ; a * 9
paddd m5, m3
punpcklwd m0, m1, m6 ; b
pmaddwd m2, m0, m0 ; b * b
punpckhwd m1, m6
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m12 ; p * s
MULLD m5, m9, m12
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m12
MULLD m1, m5, m12
%if ARCH_X86_32
pxor m6, m6
%endif
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*2+400*2 +4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+400*4+ 8], m0
mova [t3+wq*4+400*4+24], m1
add wq, 8
jl .hv1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.v0: ; vertical boxsums + ab (even rows)
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wd, w0m
%endif
.v0_loop:
mova m0, [t1+wq*2+400*0]
mova m4, [t1+wq*2+400*2]
mova m5, [t1+wq*2+400*4]
paddw m0, m0
paddd m4, m4
paddd m5, m5
paddw m1, m0, [t2+wq*2+400*0]
paddd m2, m4, [t2+wq*2+400*2]
paddd m3, m5, [t2+wq*2+400*4]
mova [t2+wq*2+400*0], m0
mova [t2+wq*2+400*2], m4
mova [t2+wq*2+400*4], m5
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a * 9
paddd m5, m3
punpcklwd m0, m1, m6 ; b
pmaddwd m2, m0, m0 ; b * b
punpckhwd m1, m6
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m12 ; p * s
MULLD m5, m9, m12
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m12
MULLD m1, m5, m12
%if ARCH_X86_32
pxor m6, m6
%endif
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*2+4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+ 8], m0
mova [t3+wq*4+24], m1
add wq, 8
jl .v0_loop
ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wd, w0m
%endif
.v1_loop:
mova m0, [t1+wq*2+400*0]
mova m4, [t1+wq*2+400*2]
mova m5, [t1+wq*2+400*4]
paddw m1, m0, [t2+wq*2+400*0]
paddd m2, m4, [t2+wq*2+400*2]
paddd m3, m5, [t2+wq*2+400*4]
mova [t2+wq*2+400*0], m0
mova [t2+wq*2+400*2], m4
mova [t2+wq*2+400*4], m5
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a * 9
paddd m5, m3
punpcklwd m0, m1, m6 ; b
pmaddwd m2, m0, m0 ; b * b
punpckhwd m1, m6
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m12 ; p * s
MULLD m5, m9, m12
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m12
MULLD m1, m5, m12
%if ARCH_X86_32
pxor m6, m6
%endif
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*2+400*2+ 4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+400*4+ 8], m0
mova [t3+wq*4+400*4+24], m1
add wq, 8
jl .v1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.prep_n: ; initial neighbor setup
movif64 wq, r4
movif32 wd, w1m
.prep_n_loop:
movu m0, [t4+wq*2+400*0+ 4]
movu m1, [t3+wq*4+400*0+ 8]
movu m2, [t3+wq*4+400*0+24]
movu m3, [t4+wq*2+400*0+ 2]
movu m4, [t3+wq*4+400*0+ 4]
movu m5, [t3+wq*4+400*0+20]
paddw m0, [t4+wq*2+400*0+ 0]
paddd m1, [t3+wq*4+400*0+ 0]
paddd m2, [t3+wq*4+400*0+16]
paddw m3, m0
paddd m4, m1
paddd m5, m2
psllw m3, 2 ; a[-1] 444
pslld m4, 2 ; b[-1] 444
pslld m5, 2
psubw m3, m0 ; a[-1] 343
psubd m4, m1 ; b[-1] 343
psubd m5, m2
mova [t4+wq*2+400*4], m3
mova [t3+wq*4+400*8+ 0], m4
mova [t3+wq*4+400*8+16], m5
movu m0, [t4+wq*2+400*2+ 4]
movu m1, [t3+wq*4+400*4+ 8]
movu m2, [t3+wq*4+400*4+24]
movu m3, [t4+wq*2+400*2+ 2]
movu m4, [t3+wq*4+400*4+ 4]
movu m5, [t3+wq*4+400*4+20]
paddw m0, [t4+wq*2+400*2+ 0]
paddd m1, [t3+wq*4+400*4+ 0]
paddd m2, [t3+wq*4+400*4+16]
paddw m3, m0
paddd m4, m1
paddd m5, m2
psllw m3, 2 ; a[ 0] 444
pslld m4, 2 ; b[ 0] 444
pslld m5, 2
mova [t4+wq*2+400* 6], m3
mova [t3+wq*4+400*12+ 0], m4
mova [t3+wq*4+400*12+16], m5
psubw m3, m0 ; a[ 0] 343
psubd m4, m1 ; b[ 0] 343
psubd m5, m2
mova [t4+wq*2+400* 8], m3
mova [t3+wq*4+400*16+ 0], m4
mova [t3+wq*4+400*16+16], m5
add wq, 8
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
movif64 wq, r4
movif32 wd, w1m
.n0_loop:
movu m3, [t4+wq*2+400*0+4]
movu m1, [t4+wq*2+400*0+2]
paddw m3, [t4+wq*2+400*0+0]
paddw m1, m3
psllw m1, 2 ; a[ 1] 444
psubw m2, m1, m3 ; a[ 1] 343
paddw m3, m2, [t4+wq*2+400*4]
paddw m3, [t4+wq*2+400*6]
mova [t4+wq*2+400*4], m2
mova [t4+wq*2+400*6], m1
movu m4, [t3+wq*4+400*0+8]
movu m1, [t3+wq*4+400*0+4]
paddd m4, [t3+wq*4+400*0+0]
paddd m1, m4
pslld m1, 2 ; b[ 1] 444
psubd m2, m1, m4 ; b[ 1] 343
paddd m4, m2, [t3+wq*4+400* 8+ 0]
paddd m4, [t3+wq*4+400*12+ 0]
mova [t3+wq*4+400* 8+ 0], m2
mova [t3+wq*4+400*12+ 0], m1
movu m5, [t3+wq*4+400*0+24]
movu m1, [t3+wq*4+400*0+20]
paddd m5, [t3+wq*4+400*0+16]
paddd m1, m5
pslld m1, 2
psubd m2, m1, m5
paddd m5, m2, [t3+wq*4+400* 8+16]
paddd m5, [t3+wq*4+400*12+16]
mova [t3+wq*4+400* 8+16], m2
mova [t3+wq*4+400*12+16], m1
movq m0, [dstq+wq]
punpcklbw m0, m6
punpcklwd m1, m0, m6
punpcklwd m2, m3, m6
pmaddwd m2, m1 ; a * src
punpckhwd m1, m0, m6
punpckhwd m3, m6
pmaddwd m3, m1
psubd m4, m2 ; b - a * src + (1 << 8)
psubd m5, m3
psrad m4, 9
psrad m5, 9
packssdw m4, m5
pmulhrsw m4, m7
paddw m0, m4
packuswb m0, m0
movq [dstq+wq], m0
add wq, 8
jl .n0_loop
add dstq, stridemp
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
movif64 wq, r4
movif32 wd, w1m
.n1_loop:
movu m3, [t4+wq*2+400*2+4]
movu m1, [t4+wq*2+400*2+2]
paddw m3, [t4+wq*2+400*2+0]
paddw m1, m3
psllw m1, 2 ; a[ 1] 444
psubw m2, m1, m3 ; a[ 1] 343
paddw m3, m2, [t4+wq*2+400*6]
paddw m3, [t4+wq*2+400*8]
mova [t4+wq*2+400*6], m1
mova [t4+wq*2+400*8], m2
movu m4, [t3+wq*4+400*4+8]
movu m1, [t3+wq*4+400*4+4]
paddd m4, [t3+wq*4+400*4+0]
paddd m1, m4
pslld m1, 2 ; b[ 1] 444
psubd m2, m1, m4 ; b[ 1] 343
paddd m4, m2, [t3+wq*4+400*12+ 0]
paddd m4, [t3+wq*4+400*16+ 0]
mova [t3+wq*4+400*12+ 0], m1
mova [t3+wq*4+400*16+ 0], m2
movu m5, [t3+wq*4+400*4+24]
movu m1, [t3+wq*4+400*4+20]
paddd m5, [t3+wq*4+400*4+16]
paddd m1, m5
pslld m1, 2
psubd m2, m1, m5
paddd m5, m2, [t3+wq*4+400*12+16]
paddd m5, [t3+wq*4+400*16+16]
mova [t3+wq*4+400*12+16], m1
mova [t3+wq*4+400*16+16], m2
movq m0, [dstq+wq]
punpcklbw m0, m6
punpcklwd m1, m0, m6
punpcklwd m2, m3, m6
pmaddwd m2, m1 ; a * src
punpckhwd m1, m0, m6
punpckhwd m3, m6
pmaddwd m3, m1
psubd m4, m2 ; b - a * src + (1 << 8)
psubd m5, m3
psrad m4, 9
psrad m5, 9
packssdw m4, m5
pmulhrsw m4, m7
paddw m0, m4
packuswb m0, m0
movq [dstq+wq], m0
add wq, 8
jl .n1_loop
add dstq, stridemp
movif32 dstm, dstq
ret
%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%assign extra_stack 10*16
%else
%assign extra_stack 8*16
%endif
cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
dst, stride, left, lpf, w
%if STACK_ALIGNMENT < 16
%define dstm dword [esp+calloff+16*8+4*0]
%define stridemp dword [esp+calloff+16*8+4*1]
%define leftm dword [esp+calloff+16*8+4*2]
%define lpfm dword [esp+calloff+16*8+4*3]
%define w0m dword [esp+calloff+16*8+4*4]
%define hd dword [esp+calloff+16*8+4*5]
%define edgeb byte [esp+calloff+16*8+4*6]
%define edged dword [esp+calloff+16*8+4*6]
%define leftmp leftm
%else
%define w0m wm
%define hd dword r5m
%define edgeb byte r7m
%define edged dword r7m
%endif
%define hvsrcm dword [esp+calloff+4*0]
%define w1m dword [esp+calloff+4*1]
%define t3m dword [esp+calloff+4*2]
%define t4m dword [esp+calloff+4*3]
%xdefine m8 m6
%define m9 [base+pd_0xffff]
%define m10 [base+pd_34816]
%define m11 [base+pd_0xf00801c7]
%define m12 [base+pd_0xf00800a4]
%define m13 [esp+calloff+16*4]
%define m14 [esp+calloff+16*5]
%define m15 [esp+calloff+16*6]
%define m6 [esp+calloff+16*7]
%define base r6-$$
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
mov leftq, [rstk+stack_offset+12]
mov lpfq, [rstk+stack_offset+16]
mov wd, [rstk+stack_offset+20]
mov dstm, dstq
mov stridemp, strideq
mov leftm, leftq
mov r1, [rstk+stack_offset+24]
mov r2, [rstk+stack_offset+32]
mov lpfm, lpfq
mov hd, r1
mov edged, r2
%endif
%else
cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
mov wd, wm
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
movifnidn hd, hm
mov edged, r7m
mova m15, [paramsq]
add lpfq, wq
mova m9, [pd_0xffff]
lea t1, [rsp+wq*2+44]
mova m10, [pd_34816]
add dstq, wq
lea t3, [rsp+wq*4+400*24+40]
mova m11, [pd_0xf00801c7]
lea t4, [rsp+wq*2+400*52+40]
mova m12, [base+pd_0xf00800a4]
neg wq
pshuflw m13, m15, q0000
pshuflw m14, m15, q2222
pshufhw m15, m15, q1010
punpcklqdq m13, m13 ; s0
punpcklqdq m14, m14 ; s1
punpckhqdq m15, m15 ; w0 w1
pxor m6, m6
psllw m15, 2
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
mova m2, [r1]
add lpfm, wq
lea t1, [rsp+extra_stack+wq*2+52]
add dstq, wq
lea t3, [rsp+extra_stack+wq*4+400*24+48]
mov dstm, dstq
lea t4, [rsp+extra_stack+wq*2+400*52+48]
mov t3m, t3
mov t4m, t4
neg wq
pshuflw m0, m2, q0000
pshuflw m1, m2, q2222
pshufhw m2, m2, q1010
punpcklqdq m0, m0 ; s0
punpcklqdq m1, m1 ; s1
punpckhqdq m2, m2 ; w0 w1
mov w1m, wd
pxor m3, m3
psllw m2, 2
mova m13, m0
mova m14, m1
sub wd, 2
mova m15, m2
mova m6, m3
mov lpfq, lpfm
mov w0m, wd
%define strideq r5
%endif
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, stridemp
mov t2, t1
%if ARCH_X86_64
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
%else
mov wq, w0m
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
%endif
add t1, 400*12
call .h_top
movif32 strideq, stridemp
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
add r10, strideq
mov lpfm, r10 ; below
movif32 t4, t4m
call .hv0
.main:
dec hd
jz .height1
movif32 lpfq, hvsrcm
add lpfq, stridemp
call .hv1
call .prep_n
sub hd, 2
jl .extend_bottom
.main_loop:
movif32 lpfq, hvsrcm
add lpfq, stridemp
call .hv0
%if ARCH_X86_64
test hd, hd
%else
mov r4, hd
test r4, r4
%endif
jz .odd_height
movif32 lpfq, hvsrcm
add lpfq, stridemp
call .hv1
call .n0
call .n1
sub hd, 2
jge .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, lpfm
call .hv0_bottom
movif32 lpfq, hvsrcm
add lpfq, stridemp
call .hv1_bottom
.end:
call .n0
call .n1
.end2:
RET
.height1:
call .v1
call .prep_n
jmp .odd_height_end
.odd_height:
call .v1
call .n0
call .n1
.odd_height_end:
call .v0
call .v1
call .n0
jmp .end2
.extend_bottom:
call .v0
call .v1
jmp .end
.no_top:
movif32 strideq, stridemp
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov lpfm, r10
call .h
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wq, w0m
mov hvsrcm, lpfq
%endif
lea t2, [t1+400*12]
.top_fixup_loop:
mova m0, [t1+wq*2+400* 0]
mova m1, [t1+wq*2+400* 2]
mova m2, [t1+wq*2+400* 4]
paddw m0, m0
mova m3, [t1+wq*2+400* 6]
paddd m1, m1
mova m4, [t1+wq*2+400* 8]
paddd m2, m2
mova m5, [t1+wq*2+400*10]
mova [t2+wq*2+400* 0], m0
mova [t2+wq*2+400* 2], m1
mova [t2+wq*2+400* 4], m2
mova [t2+wq*2+400* 6], m3
mova [t2+wq*2+400* 8], m4
mova [t2+wq*2+400*10], m5
add wq, 8
jl .top_fixup_loop
movif32 t3, t3m
movif32 t4, t4m
call .v0
jmp .main
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
%if ARCH_X86_64
SWAP m8, m6
%endif
movd m1, wd
movd m3, [lpfq-1]
pshufb m1, m8
pshufb m3, m8
psubb m2, [base+pb_1], m1
pcmpgtb m2, [base+pb_0to15]
pand m5, m2
pandn m2, m3
por m5, m2
%if ARCH_X86_64
SWAP m6, m8
%endif
ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
lea wq, [r4-2]
%else
%define leftq r4
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 13
jmp .h_main
.h_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, [base+sgr_lshuf5]
jmp .h_main
.h_top:
%if ARCH_X86_64
lea wq, [r4-2]
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movif32 wq, w0m
.h_loop:
movu m5, [lpfq+wq-1]
.h_main:
test edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
pxor m8, m8
%else
SWAP m8, m6
%endif
jnz .h_have_right
cmp wd, -10
jl .h_have_right
call .extend_right
.h_have_right:
punpcklbw m4, m5, m8
punpckhbw m5, m8
palignr m3, m5, m4, 2
palignr m0, m5, m4, 4
paddw m1, m3, m0
punpcklwd m2, m3, m0
pmaddwd m2, m2
punpckhwd m3, m0
pmaddwd m3, m3
palignr m0, m5, m4, 6
paddw m1, m0 ; sum3
punpcklwd m7, m0, m8
pmaddwd m7, m7
punpckhwd m0, m8
pmaddwd m0, m0
%if ARCH_X86_64
SWAP m6, m8
%endif
paddd m2, m7 ; sumsq3
palignr m5, m4, 8
punpcklwd m7, m5, m4
paddw m8, m4, m5
pmaddwd m7, m7
punpckhwd m5, m4
pmaddwd m5, m5
paddd m3, m0
mova [t1+wq*2+400* 6], m1
mova [t1+wq*2+400* 8], m2
mova [t1+wq*2+400*10], m3
paddw m8, m1 ; sum5
paddd m7, m2 ; sumsq5
paddd m5, m3
mova [t1+wq*2+400* 0], m8
mova [t1+wq*2+400* 2], m7
mova [t1+wq*2+400* 4], m5
add wq, 8
jl .h_loop
ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 13
jmp .hv0_main
.hv0_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, [base+sgr_lshuf5]
jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
movif32 wq, w0m
%if ARCH_X86_32
jmp .hv0_loop_start
%endif
.hv0_loop:
movif32 lpfq, hvsrcm
.hv0_loop_start:
movu m5, [lpfq+wq-1]
.hv0_main:
test edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
pxor m8, m8
%else
SWAP m8, m6
%endif
jnz .hv0_have_right
cmp wd, -10
jl .hv0_have_right
call .extend_right
.hv0_have_right:
punpcklbw m4, m5, m8
punpckhbw m5, m8
palignr m3, m5, m4, 2
palignr m0, m5, m4, 4
movif32 t3, t3m
paddw m1, m3, m0
punpcklwd m2, m3, m0
pmaddwd m2, m2
punpckhwd m3, m0
pmaddwd m3, m3
palignr m0, m5, m4, 6
paddw m1, m0 ; h sum3
punpcklwd m7, m0, m8
pmaddwd m7, m7
punpckhwd m0, m8
%if ARCH_X86_64
SWAP m6, m8
%endif
pmaddwd m0, m0
paddd m2, m7 ; h sumsq3
palignr m5, m4, 8
punpcklwd m7, m5, m4
paddw m8, m4, m5
pmaddwd m7, m7
punpckhwd m5, m4
pmaddwd m5, m5
paddd m3, m0
paddw m8, m1 ; h sum5
paddd m7, m2 ; h sumsq5
paddd m5, m3
mova [t3+wq*4+400*8+ 8], m8
mova [t3+wq*4+400*0+ 8], m7
mova [t3+wq*4+400*0+24], m5
paddw m8, [t1+wq*2+400* 0]
paddd m7, [t1+wq*2+400* 2]
paddd m5, [t1+wq*2+400* 4]
mova [t1+wq*2+400* 0], m8
mova [t1+wq*2+400* 2], m7
mova [t1+wq*2+400* 4], m5
paddw m0, m1, [t1+wq*2+400* 6]
paddd m4, m2, [t1+wq*2+400* 8]
paddd m5, m3, [t1+wq*2+400*10]
mova [t1+wq*2+400* 6], m1
mova [t1+wq*2+400* 8], m2
mova [t1+wq*2+400*10], m3
paddw m1, m0, [t2+wq*2+400* 6]
paddd m2, m4, [t2+wq*2+400* 8]
paddd m3, m5, [t2+wq*2+400*10]
mova [t2+wq*2+400* 6], m0
mova [t2+wq*2+400* 8], m4
mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a3 * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0
punpckhwd m1, m7
pmaddwd m3, m1, m1
%if ARCH_X86_64
SWAP m7, m6
%endif
psubd m4, m2 ; p3
psubd m5, m3
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+400*2+ 4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+400*4+ 8], m0
mova [t3+wq*4+400*4+24], m1
add wq, 8
jl .hv0_loop
ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 13
jmp .hv1_main
.hv1_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, [base+sgr_lshuf5]
jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
movif32 wq, w0m
%if ARCH_X86_32
jmp .hv1_loop_start
%endif
.hv1_loop:
movif32 lpfq, hvsrcm
.hv1_loop_start:
movu m5, [lpfq+wq-1]
.hv1_main:
test edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
pxor m8, m8
%else
SWAP m8, m6
%endif
jnz .hv1_have_right
cmp wd, -10
jl .hv1_have_right
call .extend_right
.hv1_have_right:
punpcklbw m4, m5, m8
punpckhbw m5, m8
palignr m7, m5, m4, 2
palignr m3, m5, m4, 4
paddw m2, m7, m3
punpcklwd m0, m7, m3
pmaddwd m0, m0
punpckhwd m7, m3
pmaddwd m7, m7
palignr m3, m5, m4, 6
paddw m2, m3 ; h sum3
punpcklwd m1, m3, m8
pmaddwd m1, m1
punpckhwd m3, m8
%if ARCH_X86_64
SWAP m6, m8
%endif
pmaddwd m3, m3
paddd m0, m1 ; h sumsq3
palignr m5, m4, 8
punpckhwd m1, m4, m5
paddw m8, m4, m5
pmaddwd m1, m1
punpcklwd m4, m5
pmaddwd m4, m4
paddd m7, m3
paddw m5, m2, [t2+wq*2+400* 6]
mova [t2+wq*2+400* 6], m2
paddw m8, m2 ; h sum5
paddd m2, m0, [t2+wq*2+400* 8]
paddd m3, m7, [t2+wq*2+400*10]
mova [t2+wq*2+400* 8], m0
mova [t2+wq*2+400*10], m7
paddd m4, m0 ; h sumsq5
paddd m1, m7
pslld m0, m2, 3
pslld m7, m3, 3
paddd m2, m0 ; a3 * 9
paddd m3, m7
%if ARCH_X86_32
mova [esp+20], m8
pxor m8, m8
%else
SWAP m8, m6
%endif
punpcklwd m0, m5, m8 ; b3
pmaddwd m7, m0, m0
punpckhwd m5, m8
pmaddwd m8, m5, m5
psubd m2, m7 ; p3
psubd m3, m8
MULLD m2, m14, m8 ; p3 * s1
MULLD m3, m14, m8
pmaddwd m0, m11 ; b3 * 455
pmaddwd m5, m11
paddusw m2, m11
paddusw m3, m11
psrld m2, 20 ; min(z3, 255)
movif32 t3, t3m
psrld m3, 20
GATHER_X_BY_X m8, m2, m3, r0, dstm
punpcklwd m2, m8, m8
punpckhwd m3, m8, m8
MULLD m0, m2, m7
MULLD m5, m3, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m5, m10
psrld m0, 12
psrld m5, 12
mova [t4+wq*2+400*4+ 4], m8
mova [t3+wq*4+400*8+ 8], m0
mova [t3+wq*4+400*8+24], m5
%if ARCH_X86_32
mova m8, [esp+20]
%else
SWAP m6, m8
pxor m6, m6
%endif
paddw m5, m8, [t2+wq*2+400*0]
paddd m2, m4, [t2+wq*2+400*2]
paddd m3, m1, [t2+wq*2+400*4]
paddw m5, [t1+wq*2+400*0]
paddd m2, [t1+wq*2+400*2]
paddd m3, [t1+wq*2+400*4]
mova [t2+wq*2+400*0], m8
pslld m0, m2, 4
mova [t2+wq*2+400*2], m4
pslld m8, m3, 4
mova [t2+wq*2+400*4], m1
pslld m4, m2, 3
paddd m2, m0
pslld m7, m3, 3
paddd m3, m8
paddd m2, m4 ; a5 * 25
paddd m3, m7
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
punpcklwd m0, m5, m7 ; b5
pmaddwd m4, m0, m0
punpckhwd m5, m7
pmaddwd m1, m5, m5
%if ARCH_X86_64
SWAP m7, m6
%endif
psubd m2, m4 ; p5
psubd m3, m1
MULLD m2, m13, m7 ; p5 * s0
MULLD m3, m13, m7
pmaddwd m0, m12 ; b5 * 164
pmaddwd m5, m12
paddusw m2, m12
paddusw m3, m12
psrld m2, 20 ; min(z5, 255)
psrld m3, 20
GATHER_X_BY_X m1, m2, m3, r0, dstm
punpcklwd m2, m1, m1
punpckhwd m3, m1, m1
MULLD m0, m2, m7
MULLD m5, m3, m7
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m5, m10
mova [t4+wq*2+4], m1
psrld m0, 12
psrld m5, 12
mova [t3+wq*4+ 8], m0
mova [t3+wq*4+24], m5
add wq, 8
jl .hv1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.v0: ; vertical boxsums + ab3 (even rows)
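; .v0/.v1 are the vertical-only variants: no new source row is read. The
; horizontal sums already stored in t1 are reused, and the paddw/paddd
; doubling counts that row twice, which stands in for a repeated row (as at
; the top/bottom picture edges). Everything else matches the ab3 derivation
; in .hv0.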
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wd, w0m
%endif
.v0_loop:
mova m0, [t1+wq*2+400* 6]
mova m4, [t1+wq*2+400* 8]
mova m5, [t1+wq*2+400*10]
paddw m0, m0
paddd m4, m4
paddd m5, m5
paddw m1, m0, [t2+wq*2+400* 6]
paddd m2, m4, [t2+wq*2+400* 8]
paddd m3, m5, [t2+wq*2+400*10]
mova [t2+wq*2+400* 6], m0
mova [t2+wq*2+400* 8], m4
mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a3 * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0
punpckhwd m1, m7
pmaddwd m3, m1, m1
psubd m4, m2 ; p3
psubd m5, m3
%if ARCH_X86_64
SWAP m7, m6
%endif
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+400*2+4], m3
psrld m0, 12
psrld m1, 12
mova m3, [t1+wq*2+400*0]
mova m4, [t1+wq*2+400*2]
mova m5, [t1+wq*2+400*4]
mova [t3+wq*4+400*8+ 8], m3
mova [t3+wq*4+400*0+ 8], m4
mova [t3+wq*4+400*0+24], m5
paddw m3, m3 ; cc5
paddd m4, m4
paddd m5, m5
mova [t1+wq*2+400*0], m3
mova [t1+wq*2+400*2], m4
mova [t1+wq*2+400*4], m5
mova [t3+wq*4+400*4+ 8], m0
mova [t3+wq*4+400*4+24], m1
add wq, 8
jl .v0_loop
ret
.v1: ; vertical boxsums + ab3 + ab5 (odd rows)
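; Vertical-only variant for odd rows: produces both ab3 and ab5 from the
; per-row sums kept in t1/t2/t3 and ends with the same t1/t2 swap as .hv1.
; Note the shift tricks used for the constant multiplies:
;   a * 9  = a + (a << 3)
;   a * 25 = a + (a << 3) + (a << 4)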
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wd, w0m
%endif
.v1_loop:
mova m4, [t1+wq*2+400* 6]
mova m5, [t1+wq*2+400* 8]
mova m7, [t1+wq*2+400*10]
paddw m1, m4, [t2+wq*2+400* 6]
paddd m2, m5, [t2+wq*2+400* 8]
paddd m3, m7, [t2+wq*2+400*10]
mova [t2+wq*2+400* 6], m4
mova [t2+wq*2+400* 8], m5
mova [t2+wq*2+400*10], m7
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a3 * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0
punpckhwd m1, m7
pmaddwd m3, m1, m1
psubd m4, m2 ; p3
psubd m5, m3
%if ARCH_X86_64
SWAP m7, m6
%endif
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+400*4+4], m3
psrld m0, 12
psrld m8, m1, 12
mova m4, [t3+wq*4+400*8+ 8]
mova m5, [t3+wq*4+400*0+ 8]
mova m7, [t3+wq*4+400*0+24]
paddw m1, m4, [t2+wq*2+400*0]
paddd m2, m5, [t2+wq*2+400*2]
paddd m3, m7, [t2+wq*2+400*4]
paddw m1, [t1+wq*2+400*0]
paddd m2, [t1+wq*2+400*2]
paddd m3, [t1+wq*2+400*4]
mova [t2+wq*2+400*0], m4
mova [t2+wq*2+400*2], m5
mova [t2+wq*2+400*4], m7
pslld m4, m2, 4
mova [t3+wq*4+400*8+ 8], m0
pslld m5, m3, 4
mova [t3+wq*4+400*8+24], m8
pslld m7, m2, 3
paddd m2, m4
pslld m8, m3, 3
paddd m3, m5
paddd m2, m7 ; a5 * 25
paddd m3, m8
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
punpcklwd m0, m1, m7 ; b5
pmaddwd m4, m0, m0
punpckhwd m1, m7
pmaddwd m5, m1, m1
psubd m2, m4 ; p5
psubd m3, m5
%if ARCH_X86_64
SWAP m7, m6
%endif
MULLD m2, m13, m7 ; p5 * s0
MULLD m3, m13, m7
pmaddwd m0, m12 ; b5 * 164
pmaddwd m1, m12
paddusw m2, m12
paddusw m3, m12
psrld m2, 20 ; min(z5, 255)
psrld m3, 20
GATHER_X_BY_X m4, m2, m3, r0, dstm
punpcklwd m2, m4, m4
punpckhwd m3, m4, m4
MULLD m0, m2, m7
MULLD m1, m3, m7
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+4], m4
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+ 8], m0
mova [t3+wq*4+24], m1
add wq, 8
jl .v1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.prep_n: ; initial neighbor setup
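; Neighbor setup. Each ab row is pre-weighted horizontally over its three
; neighboring taps (the staggered movu loads) before the output passes combine
; the rows vertically; with v[] the ab value being smoothed, the sketch is:
;   565[x] = 5*v[x-1] + 6*v[x] + 5*v[x+1]       ; 5-tap (a5/b5) rows
;   444[x] = 4*(v[x-1] + v[x] + v[x+1])         ; 3-tap (a3/b3) rows
;   343[x] = 444[x] - (v[x-1] + v[x] + v[x+1])  ; i.e. weights 3,4,3
; .n0/.n1 then add 343 + 444 + 343 across three ab3 rows, and two 565 rows
; (one on odd output rows) for ab5.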
movif64 wq, r4
movif32 wd, w1m
.prep_n_loop:
movu m0, [t4+wq*2+400*0+ 2]
movu m1, [t3+wq*4+400*0+ 4]
movu m2, [t3+wq*4+400*0+20]
movu m7, [t4+wq*2+400*0+ 4]
movu m8, [t3+wq*4+400*0+ 8]
paddw m3, m0, [t4+wq*2+400*0+ 0]
paddd m4, m1, [t3+wq*4+400*0+ 0]
paddd m5, m2, [t3+wq*4+400*0+16]
paddw m3, m7
paddd m4, m8
movu m7, [t3+wq*4+400*0+24]
paddw m0, m3
paddd m1, m4
psllw m3, 2
pslld m4, 2
paddd m5, m7
paddd m2, m5
pslld m5, 2
paddw m0, m3 ; a5 565
paddd m1, m4 ; b5 565
paddd m2, m5
mova [t4+wq*2+400* 6+ 0], m0
mova [t3+wq*4+400*12+ 0], m1
mova [t3+wq*4+400*12+16], m2
movu m0, [t4+wq*2+400*2+ 4]
movu m1, [t3+wq*4+400*4+ 8]
movu m2, [t3+wq*4+400*4+24]
movu m3, [t4+wq*2+400*2+ 2]
movu m4, [t3+wq*4+400*4+ 4]
movu m5, [t3+wq*4+400*4+20]
paddw m0, [t4+wq*2+400*2+ 0]
paddd m1, [t3+wq*4+400*4+ 0]
paddd m2, [t3+wq*4+400*4+16]
paddw m3, m0
paddd m4, m1
paddd m5, m2
psllw m3, 2 ; a3[-1] 444
pslld m4, 2 ; b3[-1] 444
pslld m5, 2
psubw m3, m0 ; a3[-1] 343
psubd m4, m1 ; b3[-1] 343
psubd m5, m2
mova [t4+wq*2+400* 8+ 0], m3
mova [t3+wq*4+400*16+ 0], m4
mova [t3+wq*4+400*16+16], m5
movu m0, [t4+wq*2+400*4+ 4]
movu m1, [t3+wq*4+400*8+ 8]
movu m2, [t3+wq*4+400*8+24]
movu m3, [t4+wq*2+400*4+ 2]
movu m4, [t3+wq*4+400*8+ 4]
movu m5, [t3+wq*4+400*8+20]
paddw m0, [t4+wq*2+400*4+ 0]
paddd m1, [t3+wq*4+400*8+ 0]
paddd m2, [t3+wq*4+400*8+16]
paddw m3, m0
paddd m4, m1
paddd m5, m2
psllw m3, 2 ; a3[ 0] 444
pslld m4, 2 ; b3[ 0] 444
pslld m5, 2
mova [t4+wq*2+400*10+ 0], m3
mova [t3+wq*4+400*20+ 0], m4
mova [t3+wq*4+400*20+16], m5
psubw m3, m0 ; a3[ 0] 343
psubd m4, m1 ; b3[ 0] 343
psubd m5, m2
mova [t4+wq*2+400*12+ 0], m3
mova [t3+wq*4+400*24+ 0], m4
mova [t3+wq*4+400*24+16], m5
add wq, 8
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
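; Even-row output. The 565-weighted ab5 rows above and below, and the
; 343 + 444 + 343 combination of ab3 rows, are applied to the source pixels.
; The 5-tap term is shifted right by 9 and the 3-tap term left by 7 so that,
; after masking with the 0xffff dword mask in m9, they land in the low and
; high halfwords of one dword; a single pmaddwd against m15 (holding the two
; SGR output weights) then blends the two filters, pd_4096 + psrad 13 rounds,
; and the result is added back to the source pixels and packed to 8-bit.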
movif64 wq, r4
movif32 wd, w1m
.n0_loop:
movu m0, [t4+wq*2+ 4]
movu m2, [t4+wq*2+ 2]
paddw m0, [t4+wq*2+ 0]
paddw m0, m2
paddw m2, m0
psllw m0, 2
paddw m0, m2 ; a5
movu m4, [t3+wq*4+ 8]
movu m5, [t3+wq*4+24]
movu m1, [t3+wq*4+ 4]
movu m3, [t3+wq*4+20]
paddd m4, [t3+wq*4+ 0]
paddd m5, [t3+wq*4+16]
paddd m4, m1
paddd m5, m3
paddd m1, m4
paddd m3, m5
pslld m4, 2
pslld m5, 2
paddd m4, m1 ; b5
paddd m5, m3
movu m2, [t4+wq*2+400* 6]
paddw m2, m0
mova [t4+wq*2+400* 6], m0
paddd m0, m4, [t3+wq*4+400*12+ 0]
paddd m1, m5, [t3+wq*4+400*12+16]
mova [t3+wq*4+400*12+ 0], m4
mova [t3+wq*4+400*12+16], m5
mova [rsp+16+ARCH_X86_32*4], m1
movu m3, [t4+wq*2+400*2+4]
movu m5, [t4+wq*2+400*2+2]
paddw m3, [t4+wq*2+400*2+0]
paddw m5, m3
psllw m5, 2 ; a3[ 1] 444
psubw m4, m5, m3 ; a3[ 1] 343
movu m3, [t4+wq*2+400* 8]
paddw m3, [t4+wq*2+400*10]
paddw m3, m4
mova [t4+wq*2+400* 8], m4
mova [t4+wq*2+400*10], m5
movu m1, [t3+wq*4+400*4+ 8]
movu m5, [t3+wq*4+400*4+ 4]
movu m7, [t3+wq*4+400*4+24]
movu m8, [t3+wq*4+400*4+20]
paddd m1, [t3+wq*4+400*4+ 0]
paddd m7, [t3+wq*4+400*4+16]
paddd m5, m1
paddd m8, m7
pslld m5, 2 ; b3[ 1] 444
pslld m8, 2
psubd m4, m5, m1 ; b3[ 1] 343
%if ARCH_X86_32
mova [esp+52], m8
psubd m8, m7
%else
psubd m6, m8, m7
SWAP m8, m6
%endif
paddd m1, m4, [t3+wq*4+400*16+ 0]
paddd m7, m8, [t3+wq*4+400*16+16]
paddd m1, [t3+wq*4+400*20+ 0]
paddd m7, [t3+wq*4+400*20+16]
mova [t3+wq*4+400*16+ 0], m4
mova [t3+wq*4+400*16+16], m8
mova [t3+wq*4+400*20+ 0], m5
%if ARCH_X86_32
mova m8, [esp+52]
%else
SWAP m8, m6
pxor m6, m6
%endif
mova [t3+wq*4+400*20+16], m8
mova [rsp+32+ARCH_X86_32*4], m7
movq m4, [dstq+wq]
punpcklbw m4, m6
punpcklwd m5, m4, m6
punpcklwd m7, m2, m6
pmaddwd m7, m5 ; a5 * src
punpcklwd m8, m3, m6
pmaddwd m8, m5 ; a3 * src
punpckhwd m5, m4, m6
punpckhwd m2, m6
pmaddwd m2, m5
punpckhwd m3, m6
pmaddwd m3, m5
psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13)
psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13)
psrld m0, 9
pslld m1, 7
pand m0, m9
pandn m8, m9, m1
por m0, m8
mova m1, [rsp+16+ARCH_X86_32*4]
psubd m1, m2
mova m2, [rsp+32+ARCH_X86_32*4]
psubd m2, m3
mova m3, [base+pd_4096]
psrld m1, 9
pslld m2, 7
pand m1, m9
pandn m5, m9, m2
por m1, m5
pmaddwd m0, m15
pmaddwd m1, m15
paddd m0, m3
paddd m1, m3
psrad m0, 13
psrad m1, 13
packssdw m0, m1
paddw m0, m4
packuswb m0, m0
movq [dstq+wq], m0
add wq, 8
jl .n0_loop
add dstq, stridemp
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
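; Odd-row output: same blend as .n0, except the 5-tap surface is the single
; stored 565 row (t4+400*6 / t3+400*12), shifted right by 8 instead of 9,
; while the 3-tap surface is rebuilt from fresh 343/444 rows as above.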
movif64 wq, r4
movif32 wd, w1m
.n1_loop:
movu m3, [t4+wq*2+400*4+4]
movu m5, [t4+wq*2+400*4+2]
paddw m3, [t4+wq*2+400*4+0]
paddw m5, m3
psllw m5, 2 ; a3[ 1] 444
psubw m4, m5, m3 ; a3[ 1] 343
paddw m3, m4, [t4+wq*2+400*12]
paddw m3, [t4+wq*2+400*10]
mova [t4+wq*2+400*10], m5
mova [t4+wq*2+400*12], m4
movu m1, [t3+wq*4+400*8+ 8]
movu m5, [t3+wq*4+400*8+ 4]
movu m7, [t3+wq*4+400*8+24]
movu m8, [t3+wq*4+400*8+20]
paddd m1, [t3+wq*4+400*8+ 0]
paddd m7, [t3+wq*4+400*8+16]
paddd m5, m1
paddd m8, m7
pslld m5, 2 ; b3[ 1] 444
pslld m8, 2
psubd m4, m5, m1 ; b3[ 1] 343
psubd m0, m8, m7
paddd m1, m4, [t3+wq*4+400*24+ 0]
paddd m7, m0, [t3+wq*4+400*24+16]
paddd m1, [t3+wq*4+400*20+ 0]
paddd m7, [t3+wq*4+400*20+16]
mova [t3+wq*4+400*20+ 0], m5
mova [t3+wq*4+400*20+16], m8
mova [t3+wq*4+400*24+ 0], m4
mova [t3+wq*4+400*24+16], m0
movq m5, [dstq+wq]
mova m2, [t4+wq*2+400* 6]
punpcklbw m5, m6
punpcklwd m4, m5, m6
punpcklwd m8, m2, m6
pmaddwd m8, m4 ; a5 * src
punpcklwd m0, m3, m6
pmaddwd m0, m4 ; a3 * src
punpckhwd m4, m5, m6
punpckhwd m2, m6
pmaddwd m2, m4
punpckhwd m3, m6
pmaddwd m3, m4
psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13)
mova m0, [t3+wq*4+400*12+ 0]
psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13)
mova m4, [t3+wq*4+400*12+16]
psubd m4, m2
psubd m7, m3
pslld m1, 7
psrld m0, 8
psrld m4, 8
pslld m7, 7
pandn m3, m9, m1
pand m0, m9
por m0, m3
pand m4, m9
pandn m2, m9, m7
por m2, m4
mova m1, [base+pd_4096]
pmaddwd m0, m15
pmaddwd m2, m15
paddd m0, m1
paddd m2, m1
psrad m0, 13
psrad m2, 13
packssdw m0, m2
paddw m0, m5
packuswb m0, m0
movq [dstq+wq], m0
add wq, 8
jl .n1_loop
add dstq, stridemp
movif32 dstm, dstq
ret