; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12
wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13
wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47
db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8
db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128
sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
r_ext_mask: times 68 db -1
times 4 db 0
wiener_x_shuf: db 0, 2, -1, 0
wiener_x_add: db 0, 1,127, 0
pw_61448: times 2 dw 61448
pw_164_455: dw 164, 455
pd_m16380: dd -16380
pd_m4096: dd -4096
pd_m25: dd -25
pd_m9: dd -9
pd_34816: dd 34816
pd_8421376: dd 8421376
cextern sgr_x_by_x
SECTION .text
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
INIT_ZMM avx512icl
cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
w, h, edge, flt
mov fltq, r6mp
mov wd, wm
movifnidn hd, hm
mov edged, r7m
vbroadcasti32x4 m6, [wiener_shufA]
vbroadcasti32x4 m7, [wiener_shufB]
mov r10d, 0xfffe
vbroadcasti32x4 m8, [wiener_shufC]
vbroadcasti32x4 m9, [wiener_shufD]
kmovw k1, r10d
vpbroadcastd m0, [wiener_x_shuf]
vpbroadcastd m1, [wiener_x_add]
mov r10, 0xaaaaaaaaaaaaaaaa
vpbroadcastd m11, [fltq+ 0]
vpbroadcastd m12, [fltq+ 4]
kmovq k2, r10
vpbroadcastd m10, [pd_m16380]
packsswb m11, m11 ; x0 x1 x0 x1
vpbroadcastd m14, [fltq+16]
pshufb m12, m0
vpbroadcastd m15, [fltq+20]
paddb m12, m1 ; x2 x3+1 127 x2
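; m12 now holds (x2, x3+1, 127, x2) in every dword; wiener_shufC/D duplicate
; the center pixel, so vpdpbusd effectively computes x2*(left+right) +
; (x3+128)*center while keeping each coefficient byte within signed range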
vpbroadcastd m13, [pd_8421376]
psllw m14, 5 ; y0 y1
psllw m15, 5 ; y2 y3
cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32
jle .w32 ; pixels, so we need a special case for small widths
lea t1, [rsp+wq*2+16]
add lpfq, wq
add dstq, wq
neg wq
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t6, t1
mov t5, t1
add t1, 384*2
call .h_top
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
mov t4, t1
add t1, 384*2
add r10, strideq
mov [rsp], r10 ; below
call .h
mov t3, t1
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
mov t2, t1
dec hd
jz .v2
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v3
.main:
lea t0, [t1+384*2]
.main_loop:
call .hv
dec hd
jnz .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .v3
mov lpfq, [rsp]
call .hv_bottom
add lpfq, strideq
call .hv_bottom
.v1:
call .v
RET
.no_top:
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov [rsp], r10
call .h
mov t6, t1
mov t5, t1
mov t4, t1
mov t3, t1
mov t2, t1
dec hd
jz .v1
add lpfq, strideq
add t1, 384*2
call .h
mov t2, t1
dec hd
jz .v2
add lpfq, strideq
add t1, 384*2
call .h
dec hd
jz .v3
lea t0, [t1+384*2]
call .hv
dec hd
jz .v3
add t0, 384*8
call .hv
dec hd
jnz .main
.v3:
call .v
.v2:
call .v
jmp .v1
.h:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movd xm16, [leftq]
vmovdqu32 m16{k1}, [lpfq+r10-4]
add leftq, 4
jmp .h_main
.h_extend_left:
vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception
vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory
jmp .h_main
.h_top:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu m16, [lpfq+r10-4]
.h_main:
movu m17, [lpfq+r10+4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -66
jl .h_have_right
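; right-edge extension: borrow r0 (dstq) as a pointer into r_ext_mask and
; blend the loaded pixels with a broadcast of the last valid pixel, using
; the mask (-1 inside the row, 0 past the end) as the vpternlogd selector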
push r0
lea r0, [r_ext_mask+65]
vpbroadcastb m0, [lpfq-1]
vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
vpternlogd m17, m0, [r0+r10+8], 0xe4
pop r0
.h_have_right:
pshufb m4, m16, m6
mova m0, m10
vpdpbusd m0, m4, m11
pshufb m4, m16, m7
mova m2, m10
vpdpbusd m2, m4, m11
pshufb m4, m17, m6
mova m1, m10
vpdpbusd m1, m4, m11
pshufb m4, m17, m7
mova m3, m10
vpdpbusd m3, m4, m11
pshufb m4, m16, m8
vpdpbusd m0, m4, m12
pshufb m16, m9
vpdpbusd m2, m16, m12
pshufb m4, m17, m8
vpdpbusd m1, m4, m12
pshufb m17, m9
vpdpbusd m3, m17, m12
packssdw m0, m2
packssdw m1, m3
psraw m0, 3
psraw m1, 3
mova [t1+r10*2+ 0], m0
mova [t1+r10*2+64], m1
add r10, 64
jl .h_loop
ret
ALIGN function_align
.hv:
add lpfq, strideq
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movd xm16, [leftq]
vmovdqu32 m16{k1}, [lpfq+r10-4]
add leftq, 4
jmp .hv_main
.hv_extend_left:
vpbroadcastb xm16, [lpfq+r10]
vmovdqu32 m16{k1}, [lpfq+r10-4]
jmp .hv_main
.hv_bottom:
mov r10, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
.hv_loop:
movu m16, [lpfq+r10-4]
.hv_main:
movu m17, [lpfq+r10+4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp r10d, -66
jl .hv_have_right
push r0
lea r0, [r_ext_mask+65]
vpbroadcastb m0, [lpfq-1]
vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
vpternlogd m17, m0, [r0+r10+8], 0xe4
pop r0
.hv_have_right:
pshufb m4, m16, m6
mova m0, m10
vpdpbusd m0, m4, m11
pshufb m4, m16, m7
mova m2, m10
vpdpbusd m2, m4, m11
pshufb m4, m17, m6
mova m1, m10
vpdpbusd m1, m4, m11
pshufb m4, m17, m7
mova m3, m10
vpdpbusd m3, m4, m11
pshufb m4, m16, m8
vpdpbusd m0, m4, m12
pshufb m16, m9
vpdpbusd m2, m16, m12
pshufb m4, m17, m8
vpdpbusd m1, m4, m12
pshufb m17, m9
vpdpbusd m3, m17, m12
packssdw m0, m2
packssdw m1, m3
psraw m0, 3
psraw m1, 3
mova m16, [t4+r10*2]
paddw m16, [t2+r10*2]
mova m3, [t3+r10*2]
mova m17, [t4+r10*2+64]
paddw m17, [t2+r10*2+64]
mova m5, [t3+r10*2+64]
punpcklwd m4, m16, m3
mova m2, m13
vpdpwssd m2, m4, m15
punpcklwd m18, m17, m5
mova m4, m13
vpdpwssd m4, m18, m15
punpckhwd m16, m3
mova m3, m13
vpdpwssd m3, m16, m15
punpckhwd m17, m5
mova m5, m13
vpdpwssd m5, m17, m15
mova m17, [t5+r10*2]
paddw m17, [t1+r10*2]
paddw m16, m0, [t6+r10*2]
mova m19, [t5+r10*2+64]
paddw m19, [t1+r10*2+64]
paddw m18, m1, [t6+r10*2+64]
mova [t0+r10*2+ 0], m0
mova [t0+r10*2+64], m1
punpcklwd m0, m16, m17
vpdpwssd m2, m0, m14
punpcklwd m1, m18, m19
vpdpwssd m4, m1, m14
punpckhwd m16, m17
vpdpwssd m3, m16, m14
punpckhwd m18, m19
vpdpwssd m5, m18, m14
packuswb m2, m4
psrlw m2, 8
vpackuswb m2{k2}, m3, m5
movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
add r10, 64 ; function is used for chroma as well, and in some
jl .hv_loop ; esoteric edge cases chroma dst pointers may only
mov t6, t5 ; have a 32-byte alignment despite having a width
mov t5, t4 ; larger than 32, so use an unaligned store here.
mov t4, t3
mov t3, t2
mov t2, t1
mov t1, t0
mov t0, t6
add dstq, strideq
ret
.v:
mov r10, wq
.v_loop:
mova m4, [t4+r10*2+ 0]
paddw m4, [t2+r10*2+ 0]
mova m1, [t3+r10*2+ 0]
mova m5, [t4+r10*2+64]
paddw m5, [t2+r10*2+64]
mova m3, [t3+r10*2+64]
punpcklwd m6, m4, m1
mova m0, m13
vpdpwssd m0, m6, m15
punpcklwd m6, m5, m3
mova m2, m13
vpdpwssd m2, m6, m15
punpckhwd m4, m1
mova m1, m13
vpdpwssd m1, m4, m15
punpckhwd m5, m3
mova m3, m13
vpdpwssd m3, m5, m15
mova m5, [t1+r10*2+ 0]
paddw m4, m5, [t6+r10*2+ 0]
paddw m5, [t5+r10*2+ 0]
mova m7, [t1+r10*2+64]
paddw m6, m7, [t6+r10*2+64]
paddw m7, [t5+r10*2+64]
punpcklwd m8, m4, m5
vpdpwssd m0, m8, m14
punpcklwd m8, m6, m7
vpdpwssd m2, m8, m14
punpckhwd m4, m5
vpdpwssd m1, m4, m14
punpckhwd m6, m7
vpdpwssd m3, m6, m14
packuswb m0, m2
psrlw m0, 8
vpackuswb m0{k2}, m1, m3
movu [dstq+r10], m0
add r10, 64
jl .v_loop
mov t6, t5
mov t5, t4
mov t4, t3
mov t3, t2
mov t2, t1
add dstq, strideq
ret
.w32:
lea r10, [r_ext_mask+73]
mova ym18, [wiener_perm32]
lea t1, [rsp+16]
sub r10, wq
test edgeb, 4 ; LR_HAVE_TOP
jz .w32_no_top
call .w32_h_top
add lpfq, strideq
mov t6, t1
mov t5, t1
add t1, 32*2
call .w32_h_top
lea r9, [lpfq+strideq*4]
mov lpfq, dstq
mov t4, t1
add t1, 32*2
add r9, strideq
mov [rsp], r9 ; below
call .w32_h
mov t3, t1
mov t2, t1
dec hd
jz .w32_v1
add lpfq, strideq
add t1, 32*2
call .w32_h
mov t2, t1
dec hd
jz .w32_v2
add lpfq, strideq
add t1, 32*2
call .w32_h
dec hd
jz .w32_v3
.w32_main:
lea t0, [t1+32*2]
.w32_main_loop:
call .w32_hv
dec hd
jnz .w32_main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .w32_v3
mov lpfq, [rsp]
call .w32_hv_bottom
add lpfq, strideq
call .w32_hv_bottom
.w32_v1:
call .w32_v
RET
.w32_no_top:
lea r9, [lpfq+strideq*4]
mov lpfq, dstq
lea r9, [r9+strideq*2]
mov [rsp], r9
call .w32_h
mov t6, t1
mov t5, t1
mov t4, t1
mov t3, t1
mov t2, t1
dec hd
jz .w32_v1
add lpfq, strideq
add t1, 32*2
call .w32_h
mov t2, t1
dec hd
jz .w32_v2
add lpfq, strideq
add t1, 32*2
call .w32_h
dec hd
jz .w32_v3
lea t0, [t1+32*2]
call .w32_hv
dec hd
jz .w32_v3
add t0, 32*8
call .w32_hv
dec hd
jnz .w32_main
.w32_v3:
call .w32_v
.w32_v2:
call .w32_v
jmp .w32_v1
.w32_h:
test edgeb, 1 ; LR_HAVE_LEFT
jz .w32_h_extend_left
movd xm16, [leftq]
vmovdqu32 ym16{k1}, [lpfq-4]
add leftq, 4
jmp .w32_h_main
.w32_h_extend_left:
vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception
vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory
jmp .w32_h_main
.w32_h_top:
test edgeb, 1 ; LR_HAVE_LEFT
jz .w32_h_extend_left
movu ym16, [lpfq-4]
.w32_h_main:
vinserti32x8 m16, [lpfq+4], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .w32_h_have_right
vpbroadcastb m0, [lpfq+wq-1]
movu ym17, [r10-8]
vinserti32x8 m17, [r10+0], 1
vpternlogd m16, m0, m17, 0xe4 ; c ? a : b
.w32_h_have_right:
pshufb m2, m16, m6
mova m0, m10
vpdpbusd m0, m2, m11
pshufb m2, m16, m7
mova m1, m10
vpdpbusd m1, m2, m11
pshufb m2, m16, m8
vpdpbusd m0, m2, m12
pshufb m16, m9
vpdpbusd m1, m16, m12
packssdw m0, m1
psraw m0, 3
mova [t1], m0
ret
.w32_hv:
add lpfq, strideq
test edgeb, 1 ; LR_HAVE_LEFT
jz .w32_hv_extend_left
movd xm16, [leftq]
vmovdqu32 ym16{k1}, [lpfq-4]
add leftq, 4
jmp .w32_hv_main
.w32_hv_extend_left:
vpbroadcastb xm16, [lpfq]
vmovdqu32 ym16{k1}, [lpfq-4]
jmp .w32_hv_main
.w32_hv_bottom:
test edgeb, 1 ; LR_HAVE_LEFT
jz .w32_hv_extend_left
movu ym16, [lpfq-4]
.w32_hv_main:
vinserti32x8 m16, [lpfq+4], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .w32_hv_have_right
vpbroadcastb m0, [lpfq+wq-1]
movu ym17, [r10-8]
vinserti32x8 m17, [r10+0], 1
vpternlogd m16, m0, m17, 0xe4
.w32_hv_have_right:
mova m3, [t4]
paddw m3, [t2]
mova m2, [t3]
pshufb m4, m16, m6
mova m0, m10
vpdpbusd m0, m4, m11
pshufb m4, m16, m7
mova m5, m10
vpdpbusd m5, m4, m11
punpcklwd m4, m3, m2
mova m1, m13
vpdpwssd m1, m4, m15
punpckhwd m3, m2
mova m2, m13
vpdpwssd m2, m3, m15
pshufb m4, m16, m8
vpdpbusd m0, m4, m12
pshufb m16, m9
vpdpbusd m5, m16, m12
packssdw m0, m5
psraw m0, 3
mova m4, [t5]
paddw m4, [t1]
paddw m3, m0, [t6]
mova [t0], m0
punpcklwd m0, m3, m4
vpdpwssd m1, m0, m14
punpckhwd m3, m4
vpdpwssd m2, m3, m14
packuswb m1, m2
vpermb m16, m18, m1
mova [dstq], ym16
mov t6, t5
mov t5, t4
mov t4, t3
mov t3, t2
mov t2, t1
mov t1, t0
mov t0, t6
add dstq, strideq
ret
.w32_v:
mova m2, [t4]
paddw m2, [t2]
mova m1, [t3]
mova m4, [t1]
paddw m3, m4, [t6]
paddw m4, [t5]
punpcklwd m5, m2, m1
mova m0, m13
vpdpwssd m0, m5, m15
punpckhwd m2, m1
mova m1, m13
vpdpwssd m1, m2, m15
punpcklwd m2, m3, m4
vpdpwssd m0, m2, m14
punpckhwd m3, m4
vpdpwssd m1, m3, m14
packuswb m0, m1
vpermb m16, m18, m0
mova [dstq], ym16
mov t6, t5
mov t5, t4
mov t4, t3
mov t3, t2
mov t2, t1
add dstq, strideq
ret
cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
w, h, edge, params
mov paramsq, r6mp
mov wd, wm
mov hd, hm
mov edged, r7m
vbroadcasti32x4 m5, [sgr_shuf+1]
add lpfq, wq
vbroadcasti32x4 m6, [sgr_shuf+9]
add dstq, wq
vbroadcasti32x4 m7, [sgr_shuf+3]
lea t3, [rsp+wq*4+16+416*12]
vbroadcasti32x4 m8, [sgr_shuf+7]
pxor m4, m4
vpbroadcastd m9, [pd_m25]
vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
vpbroadcastw m15, [paramsq+8] ; w0
lea t1, [rsp+wq*2+20]
vpbroadcastd m10, [pw_164_455]
neg wq
vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
mov r10d, 0xfe
vpbroadcastd m13, [pd_m4096]
kmovb k1, r10d
vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
mov r10, 0x3333333333333333
mova m18, [sgr_x_by_x+64*0]
kmovq k2, r10
mova m19, [sgr_x_by_x+64*1]
lea r12, [r_ext_mask+75]
mova m20, [sgr_x_by_x+64*2]
psllw m15, 4
mova m21, [sgr_x_by_x+64*3]
lea r10, [lpfq+strideq*4]
mova ym22, [sgr_shuf]
add r10, strideq
mov [rsp], r10 ; below
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t2, t1
call .top_fixup
add t1, 416*6
call .h_top
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
add r10, strideq
mov [rsp], r10 ; below
mov t0, t2
dec hd
jz .height1
or edged, 16
call .h
.main:
add lpfq, strideq
call .hv
call .prep_n
sub hd, 2
jl .extend_bottom
.main_loop:
add lpfq, strideq
test hd, hd
jz .odd_height
call .h
add lpfq, strideq
call .hv
call .n0
call .n1
sub hd, 2
jge .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, [rsp]
call .h_top
add lpfq, strideq
call .hv_bottom
.end:
call .n0
call .n1
.end2:
RET
.height1:
call .hv
call .prep_n
jmp .odd_height_end
.odd_height:
call .hv
call .n0
call .n1
.odd_height_end:
call .v
call .n0
jmp .end2
.extend_bottom:
call .v
jmp .end
.no_top:
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov [rsp], r10
call .h
lea t2, [t1+416*6]
call .top_fixup
dec hd
jz .no_top_height1
or edged, 16
mov t0, t1
mov t1, t2
jmp .main
.no_top_height1:
call .v
call .prep_n
jmp .odd_height_end
.h: ; horizontal boxsum
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movd xm17, [leftq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
add leftq, 4
jmp .h_main
.h_extend_left:
vpbroadcastb xm17, [lpfq+wq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
jmp .h_main
.h_top:
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu ym17, [lpfq+r10-2]
.h_main:
vinserti32x8 m17, [lpfq+r10+6], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -34
jl .h_have_right
vpbroadcastb m0, [lpfq-1]
movu ym16, [r12+r10-8]
vinserti32x8 m16, [r12+r10+0], 1
vpternlogd m17, m0, m16, 0xe4
.h_have_right:
pshufb m3, m17, m5
pmullw m2, m3, m3
pshufb m1, m17, m6
paddw m0, m3, m1
shufps m3, m1, q2121
paddw m0, m3
punpcklwd m16, m3, m1
punpckhwd m3, m1
punpcklwd m1, m2, m4
vpdpwssd m1, m16, m16
punpckhwd m2, m4
vpdpwssd m2, m3, m3
pshufb m16, m17, m7
paddw m0, m16
pshufb m17, m8
paddw m0, m17 ; sum
punpcklwd m3, m16, m17
vpdpwssd m1, m3, m3 ; sumsq
punpckhwd m16, m17
vpdpwssd m2, m16, m16
test edgeb, 16 ; y > 0
jz .h_loop_end
paddw m0, [t1+r10*2+416*0]
paddd m1, [t1+r10*2+416*2]
paddd m2, [t1+r10*2+416*4]
.h_loop_end:
mova [t1+r10*2+416*0], m0
mova [t1+r10*2+416*2], m1
mova [t1+r10*2+416*4], m2
add r10, 32
jl .h_loop
ret
.top_fixup:
lea r10, [wq-2]
.top_fixup_loop: ; the sums of the first row need to be doubled
mova m0, [t1+r10*2+416*0]
mova m1, [t1+r10*2+416*2]
mova m2, [t1+r10*2+416*4]
paddw m0, m0
paddd m1, m1
paddd m2, m2
mova [t2+r10*2+416*0], m0
mova [t2+r10*2+416*2], m1
mova [t2+r10*2+416*4], m2
add r10, 32
jl .top_fixup_loop
ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movd xm17, [leftq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
add leftq, 4
jmp .hv_main
.hv_extend_left:
vpbroadcastb xm17, [lpfq+wq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
jmp .hv_main
.hv_bottom:
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
.hv_loop:
movu ym17, [lpfq+r10-2]
.hv_main:
vinserti32x8 m17, [lpfq+r10+6], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp r10d, -34
jl .hv_have_right
vpbroadcastb m0, [lpfq-1]
movu ym16, [r12+r10-8]
vinserti32x8 m16, [r12+r10+0], 1
vpternlogd m17, m0, m16, 0xe4
.hv_have_right:
pshufb m1, m17, m5
pmullw m3, m1, m1
pshufb m2, m17, m6
paddw m0, m1, m2
shufps m1, m2, q2121
paddw m0, m1
punpcklwd m16, m1, m2
punpckhwd m1, m2
punpcklwd m2, m3, m4
vpdpwssd m2, m16, m16
punpckhwd m3, m4
vpdpwssd m3, m1, m1
pshufb m16, m17, m7
paddw m0, m16
pshufb m17, m8
paddw m0, m17 ; h sum
punpcklwd m1, m16, m17
vpdpwssd m2, m1, m1 ; h sumsq
punpckhwd m16, m17
vpdpwssd m3, m16, m16
paddw m1, m0, [t1+r10*2+416*0]
paddd m16, m2, [t1+r10*2+416*2]
paddd m17, m3, [t1+r10*2+416*4]
test hd, hd
jz .hv_last_row
.hv_main2:
paddd m16, [t2+r10*2+416*2] ; hv sumsq
paddd m17, [t2+r10*2+416*4]
paddw m1, [t2+r10*2+416*0] ; hv sum
mova [t0+r10*2+416*2], m2
mova [t0+r10*2+416*4], m3
mova [t0+r10*2+416*0], m0
pmulld m16, m9 ; -a * 25
pmulld m17, m9
punpcklwd m0, m1, m4 ; b
vpdpwssd m16, m0, m0 ; -p
punpckhwd m1, m4
vpdpwssd m17, m1, m1
pmaddwd m0, m10 ; b * 164
pmaddwd m1, m10
pmulld m16, m11 ; p * s
pmulld m17, m11
vpalignr m17{k2}, m16, m16, 2
mova m16, m20
paddusw m17, m12
psraw m17, 4 ; min(z, 255) - 256
vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
vpmovb2m k3, m17
vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
vmovdqu8 m17{k3}, m16 ; x
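; the 256-entry sgr_x_by_x table is split across m18-m21: vpermi2b looks up
; entries 0-127 and vpermt2b entries 128-255 (both use the low 7 index bits),
; and vpmovb2m/k3 selects between the two results based on the index MSB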
pandn m16, m13, m17
psrld m17, 16
pmulld m0, m16
pmulld m1, m17
paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m14
vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
vpternlogd m17, m1, m13, 0xd8
mova [t3+r10*4+ 8], m16 ; The neighbor calculations require
mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b.
vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but
mova [t3+r10*4+ 72], m17 ; that gets us most of the way.
vextracti128 [t3+r10*4+ 72], ym16, 1
vextracti32x4 [t3+r10*4+104], m16, 3
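; the stores above scatter the 128-bit lanes of m16/m17 so that the packed
; a|b dwords land in pixel order in the t3 buffer, which the neighbor
; passes below rely on when reading adjacent dwords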
add r10, 32
jl .hv_loop
mov t2, t1
mov t1, t0
mov t0, t2
ret
.hv_last_row: ; esoteric edge case for odd heights
mova [t1+r10*2+416*0], m1
paddw m1, m0
mova [t1+r10*2+416*2], m16
paddd m16, m2
mova [t1+r10*2+416*4], m17
paddd m17, m3
jmp .hv_main2
.v: ; vertical boxsum + ab
lea r10, [wq-2]
.v_loop:
mova m2, [t1+r10*2+416*2]
paddd m16, m2, [t2+r10*2+416*2]
mova m3, [t1+r10*2+416*4]
paddd m17, m3, [t2+r10*2+416*4]
paddd m2, m2
paddd m3, m3
paddd m16, m2 ; hv sumsq
paddd m17, m3
pmulld m16, m9 ; -a * 25
pmulld m17, m9
mova m0, [t1+r10*2+416*0]
paddw m1, m0, [t2+r10*2+416*0]
paddw m0, m0
paddw m1, m0 ; hv sum
punpcklwd m0, m1, m4 ; b
vpdpwssd m16, m0, m0 ; -p
punpckhwd m1, m4
vpdpwssd m17, m1, m1
pmaddwd m0, m10 ; b * 164
pmaddwd m1, m10
pmulld m16, m11 ; p * s
pmulld m17, m11
vpalignr m17{k2}, m16, m16, 2
mova m16, m20
paddusw m17, m12
psraw m17, 4 ; min(z, 255) - 256
vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
vpmovb2m k3, m17
vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
vmovdqu8 m17{k3}, m16 ; x
pandn m16, m13, m17
psrld m17, 16
pmulld m0, m16
pmulld m1, m17
paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m14
vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
vpternlogd m17, m1, m13, 0xd8
mova [t3+r10*4+ 8], m16
mova [t3+r10*4+ 24], xm17
vextracti32x4 [t3+r10*4+ 56], m17, 2
mova [t3+r10*4+ 72], m17
vextracti128 [t3+r10*4+ 72], ym16, 1
vextracti32x4 [t3+r10*4+104], m16, 3
add r10, 32
jl .v_loop
ret
.prep_n: ; initial neighbor setup
mov r10, wq
.prep_n_loop:
movu m0, [t3+r10*4+ 4]
movu m1, [t3+r10*4+68]
paddd m2, m0, [t3+r10*4+ 0]
paddd m3, m1, [t3+r10*4+64]
paddd m2, [t3+r10*4+ 8]
paddd m3, [t3+r10*4+72]
paddd m0, m2
pslld m2, 2
paddd m1, m3
pslld m3, 2
paddd m2, m0 ; ab 565
paddd m3, m1
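; "565" refers to the column weights: m2/m3 = 4*(l+c+r) + (c + (l+c+r))
; = 5*l + 6*c + 5*r, computed directly on the packed a|b dwords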
pandn m0, m13, m2 ; a
psrld m2, 12 ; b
pandn m1, m13, m3
psrld m3, 12
mova [t3+r10*4+416*4+ 0], m0
mova [t3+r10*4+416*8+ 0], m2
mova [t3+r10*4+416*4+64], m1
mova [t3+r10*4+416*8+64], m3
add r10, 32
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
mov r10, wq
.n0_loop:
movu m16, [t3+r10*4+ 4]
movu m17, [t3+r10*4+68]
paddd m0, m16, [t3+r10*4+ 0]
paddd m1, m17, [t3+r10*4+64]
paddd m0, [t3+r10*4+ 8]
paddd m1, [t3+r10*4+72]
paddd m16, m0
pslld m0, 2
paddd m17, m1
pslld m1, 2
paddd m0, m16
paddd m1, m17
pandn m16, m13, m0
psrld m0, 12
pandn m17, m13, m1
psrld m1, 12
paddd m2, m16, [t3+r10*4+416*4+ 0] ; a
paddd m3, m17, [t3+r10*4+416*4+64]
mova [t3+r10*4+416*4+ 0], m16
mova [t3+r10*4+416*4+64], m17
paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
paddd m17, m1, [t3+r10*4+416*8+64]
mova [t3+r10*4+416*8+ 0], m0
mova [t3+r10*4+416*8+64], m1
pmovzxbd m0, [dstq+r10+ 0]
pmovzxbd m1, [dstq+r10+16]
pmaddwd m2, m0 ; a * src
pmaddwd m3, m1
packssdw m0, m1
psubd m16, m2 ; b - a * src + (1 << 8)
psubd m17, m3
psrad m16, 9
psrad m17, 9
packssdw m16, m17
pmulhrsw m16, m15
paddw m16, m0
packuswb m16, m16
vpermd m16, m22, m16
mova [dstq+r10], ym16
add r10, 32
jl .n0_loop
add dstq, strideq
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
mov r10, wq
.n1_loop:
pmovzxbd m0, [dstq+r10+ 0]
pmovzxbd m1, [dstq+r10+16]
pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src
pmaddwd m3, m1, [t3+r10*4+416*4+64]
mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7)
mova m17, [t3+r10*4+416*8+64]
packssdw m0, m1
psubd m16, m2 ; b - a * src + (1 << 7)
psubd m17, m3
psrad m16, 8
psrad m17, 8
packssdw m16, m17
pmulhrsw m16, m15
paddw m16, m0
packuswb m16, m16
vpermd m16, m22, m16
mova [dstq+r10], ym16
add r10, 32
jl .n1_loop
add dstq, strideq
ret
cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
w, h, edge, params
mov paramsq, r6mp
mov wd, wm
movifnidn hd, hm
mov edged, r7m
vbroadcasti32x4 m5, [sgr_shuf+3]
add lpfq, wq
vbroadcasti32x4 m6, [sgr_shuf+5]
add dstq, wq
vbroadcasti32x4 m7, [sgr_shuf+7]
pxor m4, m4
vpbroadcastd m8, [pd_m9]
vpsubd m11, m4, [paramsq+4] {1to16} ; -s1
vpbroadcastw m15, [paramsq+10] ; w1
lea t1, [rsp+wq*2+20]
vpbroadcastd m10, [pw_164_455]
lea t3, [rsp+wq*4+16+416*12]
vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
neg wq
vpbroadcastd m13, [pd_m4096]
mov r10d, 0xfe
vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
kmovb k1, r10d
mova m18, [sgr_x_by_x+64*0]
mov r10, 0x3333333333333333
mova m19, [sgr_x_by_x+64*1]
kmovq k2, r10
mova m20, [sgr_x_by_x+64*2]
psllw m15, 4
mova m21, [sgr_x_by_x+64*3]
lea r14, [r_ext_mask+75]
mova ym9, [sgr_shuf]
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t2, t1
add t1, 416*6
call .h_top
lea t4, [lpfq+strideq*4]
mov lpfq, dstq
add t4, strideq
mov [rsp], t4 ; below
mov t0, t2
call .hv
.main:
mov t5, t3
add t3, 416*4
dec hd
jz .height1
add lpfq, strideq
call .hv
call .prep_n
dec hd
jz .extend_bottom
.main_loop:
add lpfq, strideq
call .hv
call .n
dec hd
jnz .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, [rsp]
call .hv_bottom
call .n
add lpfq, strideq
call .hv_bottom
.end:
call .n
RET
.height1:
call .v
call .prep_n
mov t2, t1
call .v
jmp .end
.extend_bottom:
call .v
call .n
mov t2, t1
call .v
jmp .end
.no_top:
lea t4, [lpfq+strideq*4]
mov lpfq, dstq
lea t4, [t4+strideq*2]
mov [rsp], t4
call .h
lea t0, [t1+416*6]
mov t2, t1
call .v
jmp .main
.h: ; horizontal boxsum
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movd xm17, [leftq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
add leftq, 4
jmp .h_main
.h_extend_left:
vpbroadcastb xm17, [lpfq+wq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
jmp .h_main
.h_top:
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu ym17, [lpfq+r10-2]
.h_main:
vinserti32x8 m17, [lpfq+r10+6], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -33
jl .h_have_right
vpbroadcastb m0, [lpfq-1]
movu ym16, [r14+r10-8]
vinserti32x8 m16, [r14+r10+0], 1
vpternlogd m17, m0, m16, 0xe4
.h_have_right:
pshufb m0, m17, m5
pmullw m2, m0, m0
pshufb m16, m17, m6
paddw m0, m16
pshufb m17, m7
paddw m0, m17 ; sum
punpcklwd m3, m16, m17
punpcklwd m1, m2, m4
vpdpwssd m1, m3, m3 ; sumsq
punpckhwd m16, m17
punpckhwd m2, m4
vpdpwssd m2, m16, m16
mova [t1+r10*2+416*0], m0
mova [t1+r10*2+416*2], m1
mova [t1+r10*2+416*4], m2
add r10, 32
jl .h_loop
ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
movd xm17, [leftq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
add leftq, 4
jmp .hv_main
.hv_extend_left:
vpbroadcastb xm17, [lpfq+wq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
jmp .hv_main
.hv_bottom:
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv_extend_left
.hv_loop:
movu ym17, [lpfq+r10-2]
.hv_main:
vinserti32x8 m17, [lpfq+r10+6], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv_have_right
cmp r10d, -33
jl .hv_have_right
vpbroadcastb m0, [lpfq-1]
movu ym16, [r14+r10-8]
vinserti32x8 m16, [r14+r10+0], 1
vpternlogd m17, m0, m16, 0xe4
.hv_have_right:
pshufb m0, m17, m5
pmullw m3, m0, m0
pshufb m1, m17, m6
paddw m0, m1
pshufb m17, m7
paddw m0, m17 ; h sum
punpcklwd m16, m17, m1
punpcklwd m2, m3, m4
vpdpwssd m2, m16, m16 ; h sumsq
punpckhwd m17, m1
punpckhwd m3, m4
vpdpwssd m3, m17, m17
paddw m1, m0, [t2+r10*2+416*0]
paddw m1, [t1+r10*2+416*0] ; hv sum
paddd m16, m2, [t2+r10*2+416*2]
paddd m17, m3, [t2+r10*2+416*4]
paddd m16, [t1+r10*2+416*2] ; hv sumsq
paddd m17, [t1+r10*2+416*4]
mova [t0+r10*2+416*0], m0
mova [t0+r10*2+416*2], m2
mova [t0+r10*2+416*4], m3
pmulld m16, m8 ; -a * 9
pmulld m17, m8
punpcklwd m0, m4, m1 ; b
vpdpwssd m16, m0, m0 ; -p
punpckhwd m1, m4, m1
vpdpwssd m17, m1, m1
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
pmulld m16, m11 ; p * s
pmulld m17, m11
vpalignr m17{k2}, m16, m16, 2
mova m16, m20
paddusw m17, m12
psraw m17, 4 ; min(z, 255) - 256
vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
vpmovb2m k3, m17
vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
vmovdqu8 m17{k3}, m16 ; x
pandn m16, m13, m17
psrld m17, 16
pmulld m0, m16
pmulld m1, m17
paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m14
vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
vpternlogd m17, m1, m13, 0xd8
mova [t3+r10*4+ 8], m16
mova [t3+r10*4+ 24], xm17
vextracti32x4 [t3+r10*4+ 56], m17, 2
mova [t3+r10*4+ 72], m17
vextracti128 [t3+r10*4+ 72], ym16, 1
vextracti32x4 [t3+r10*4+104], m16, 3
add r10, 32
jl .hv_loop
mov t2, t1
mov t1, t0
mov t0, t2
ret
.v: ; vertical boxsum + ab
lea r10, [wq-2]
.v_loop:
mova m16, [t1+r10*2+416*2]
mova m17, [t1+r10*2+416*4]
paddd m16, m16
paddd m17, m17
paddd m16, [t2+r10*2+416*2] ; hv sumsq
paddd m17, [t2+r10*2+416*4]
pmulld m16, m8 ; -a * 9
pmulld m17, m8
mova m1, [t1+r10*2+416*0]
paddw m1, m1
paddw m1, [t2+r10*2+416*0] ; hv sum
punpcklwd m0, m4, m1 ; b
vpdpwssd m16, m0, m0 ; -p
punpckhwd m1, m4, m1
vpdpwssd m17, m1, m1
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
pmulld m16, m11 ; p * s
pmulld m17, m11
vpalignr m17{k2}, m16, m16, 2
mova m16, m20
paddusw m17, m12
psraw m17, 4 ; min(z, 255) - 256
vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
vpmovb2m k3, m17
vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
vmovdqu8 m17{k3}, m16 ; x
pandn m16, m13, m17
psrld m17, 16
pmulld m0, m16
pmulld m1, m17
paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m14
vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
vpternlogd m17, m1, m13, 0xd8
mova [t3+r10*4+ 8], m16
mova [t3+r10*4+ 24], xm17
vextracti32x4 [t3+r10*4+ 56], m17, 2
mova [t3+r10*4+ 72], m17
vextracti128 [t3+r10*4+ 72], ym16, 1
vextracti32x4 [t3+r10*4+104], m16, 3
add r10, 32
jl .v_loop
ret
.prep_n: ; initial neighbor setup
mov r10, wq
mov t4, t3
add t3, 416*4
.prep_n_loop:
mova m2, [t5+r10*4+0]
mova m3, [t4+r10*4+0]
paddd m2, [t5+r10*4+8]
paddd m3, [t4+r10*4+8]
paddd m0, m2, [t5+r10*4+4]
paddd m1, m3, [t4+r10*4+4]
pslld m0, 2
paddd m1, m1 ; ab[ 0] 222
psubd m0, m2 ; ab[-1] 343
mova [t3+r10*4+416*4], m1
paddd m1, m1
mova [t5+r10*4], m0
psubd m1, m3 ; ab[ 0] 343
mova [t4+r10*4], m1
add r10, 16
jl .prep_n_loop
ret
; a+b are packed together in a single dword, but we can't do the
; full neighbor calculations before splitting them since we don't
; have sufficient precision. The solution is to do the calculations
; in two equal halves and split a and b before doing the final sum.
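; Per output row the two halves add up to ab[-1]*(3,4,3) + ab[0]*(4,4,4) +
; ab[+1]*(3,4,3), i.e. the 222 row is counted once together with each of
; the two adjacent 343 rows.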
ALIGN function_align
.n: ; neighbor + output
mov r10, wq
.n_loop:
mova m16, [t3+r10*4+ 0]
paddd m16, [t3+r10*4+ 8]
paddd m17, m16, [t3+r10*4+ 4]
paddd m17, m17 ; ab[+1] 222
mova m2, [t3+r10*4+416*4+ 0]
paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
mova m3, [t3+r10*4+416*4+64]
paddd m1, m3, [t5+r10*4+64]
mova [t3+r10*4+416*4+ 0], m17
paddd m17, m17
psubd m17, m16 ; ab[+1] 343
mova [t5+r10*4+ 0], m17
paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343
mova m16, [t3+r10*4+64]
paddd m16, [t3+r10*4+72]
paddd m17, m16, [t3+r10*4+68]
paddd m17, m17
mova [t3+r10*4+416*4+64], m17
paddd m17, m17
psubd m17, m16
mova [t5+r10*4+64], m17
pandn m16, m13, m0
psrld m0, 12
paddd m3, m17
pandn m17, m13, m2
psrld m2, 12
paddd m16, m17 ; a
pandn m17, m13, m1
psrld m1, 12
paddd m0, m2 ; b + (1 << 8)
pandn m2, m13, m3
psrld m3, 12
paddd m17, m2
pmovzxbd m2, [dstq+r10+ 0]
paddd m1, m3
pmovzxbd m3, [dstq+r10+16]
pmaddwd m16, m2 ; a * src
pmaddwd m17, m3
packssdw m2, m3
psubd m0, m16 ; b - a * src + (1 << 8)
psubd m1, m17
psrad m0, 9
psrad m1, 9
packssdw m0, m1
pmulhrsw m0, m15
paddw m0, m2
packuswb m0, m0
vpermd m16, m9, m0
mova [dstq+r10], ym16
add r10, 32
jl .n_loop
mov r10, t5
mov t5, t4
mov t4, r10
add dstq, strideq
ret
cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
w, h, edge, params
mov paramsq, r6mp
mov wd, wm
movifnidn hd, hm
mov edged, r7m
vbroadcasti32x4 m5, [sgr_shuf+1]
add lpfq, wq
vbroadcasti32x4 m6, [sgr_shuf+9]
add dstq, wq
vbroadcasti32x4 m7, [sgr_shuf+3]
lea t3, [rsp+wq*4+416*24+8]
vbroadcasti32x4 m8, [sgr_shuf+7]
pxor m4, m4
vpbroadcastd m9, [pd_m9]
vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
vpbroadcastd m14, [pw_61448]
vpsubd m12, m4, [paramsq+4] {1to16} ; -s1
vpbroadcastd m26, [paramsq+8] ; w0 w1
lea t1, [rsp+wq*2+12]
vpbroadcastd m10, [pd_m25]
neg wq
vpbroadcastd m13, [pw_164_455]
mov r10d, 0xfe
vpbroadcastd m15, [pd_34816]
kmovb k1, r10d
mova m20, [sgr_x_by_x+64*0]
mov r10, 0x3333333333333333
mova m21, [sgr_x_by_x+64*1]
kmovq k2, r10
mova m22, [sgr_x_by_x+64*2]
lea r12, [r_ext_mask+75]
mova m23, [sgr_x_by_x+64*3]
vpbroadcastd m24, [pd_m4096]
vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____
psllw m26, 5
mova xm27, [sgr_mix_perm]
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
add lpfq, strideq
mov t2, t1
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
add t1, 416*12
call .h_top
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
add r10, strideq
mov [rsp], r10 ; below
call .hv0
.main:
dec hd
jz .height1
add lpfq, strideq
call .hv1
call .prep_n
sub hd, 2
jl .extend_bottom
.main_loop:
add lpfq, strideq
call .hv0
test hd, hd
jz .odd_height
add lpfq, strideq
call .hv1
call .n0
call .n1
sub hd, 2
jge .main_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .extend_bottom
mov lpfq, [rsp]
call .hv0_bottom
add lpfq, strideq
call .hv1_bottom
.end:
call .n0
call .n1
.end2:
RET
.height1:
call .v1
call .prep_n
jmp .odd_height_end
.odd_height:
call .v1
call .n0
call .n1
.odd_height_end:
call .v0
call .v1
call .n0
jmp .end2
.extend_bottom:
call .v0
call .v1
jmp .end
.no_top:
lea r10, [lpfq+strideq*4]
mov lpfq, dstq
lea r10, [r10+strideq*2]
mov [rsp], r10
call .h
lea t2, [t1+416*12]
lea r10, [wq-2]
.top_fixup_loop:
mova m0, [t1+r10*2+416* 0]
mova m1, [t1+r10*2+416* 2]
mova m2, [t1+r10*2+416* 4]
paddw m0, m0
mova m3, [t1+r10*2+416* 6]
paddd m1, m1
mova m16, [t1+r10*2+416* 8]
paddd m2, m2
mova m17, [t1+r10*2+416*10]
mova [t2+r10*2+416* 0], m0
mova [t2+r10*2+416* 2], m1
mova [t2+r10*2+416* 4], m2
mova [t2+r10*2+416* 6], m3
mova [t2+r10*2+416* 8], m16
mova [t2+r10*2+416*10], m17
add r10, 32
jl .top_fixup_loop
call .v0
jmp .main
.h: ; horizontal boxsums
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
movd xm17, [leftq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
add leftq, 4
jmp .h_main
.h_extend_left:
vpbroadcastb xm17, [lpfq+wq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
jmp .h_main
.h_top:
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
.h_loop:
movu ym17, [lpfq+r10-2]
.h_main:
vinserti32x8 m17, [lpfq+r10+6], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp r10d, -34
jl .h_have_right
vpbroadcastb m0, [lpfq-1]
movu ym16, [r12+r10-8]
vinserti32x8 m16, [r12+r10+0], 1
vpternlogd m17, m0, m16, 0xe4
.h_have_right:
pshufb m3, m17, m5
pshufb m18, m17, m6
shufps m0, m3, m18, q2121
pmullw m2, m0, m0
pshufb m19, m17, m7
paddw m0, m19
pshufb m17, m8
paddw m0, m17 ; sum3
punpcklwd m16, m19, m17
punpcklwd m1, m2, m4
vpdpwssd m1, m16, m16 ; sumsq3
punpckhwd m19, m17
punpckhwd m2, m4
vpdpwssd m2, m19, m19
mova [t1+r10*2+416* 6], m0
mova [t1+r10*2+416* 8], m1
mova [t1+r10*2+416*10], m2
punpcklwd m19, m3, m18
paddw m0, m3
vpdpwssd m1, m19, m19 ; sumsq5
punpckhwd m3, m18
paddw m0, m18 ; sum5
vpdpwssd m2, m3, m3
mova [t1+r10*2+416* 0], m0
mova [t1+r10*2+416* 2], m1
mova [t1+r10*2+416* 4], m2
add r10, 32
jl .h_loop
ret
ALIGN function_align
.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
movd xm17, [leftq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
add leftq, 4
jmp .hv0_main
.hv0_extend_left:
vpbroadcastb xm17, [lpfq+wq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
jmp .hv0_main
.hv0_bottom:
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
.hv0_loop:
movu ym17, [lpfq+r10-2]
.hv0_main:
vinserti32x8 m17, [lpfq+r10+6], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv0_have_right
cmp r10d, -34
jl .hv0_have_right
vpbroadcastb m0, [lpfq-1]
movu ym16, [r12+r10-8]
vinserti32x8 m16, [r12+r10+0], 1
vpternlogd m17, m0, m16, 0xe4
.hv0_have_right:
pshufb m18, m17, m5
pshufb m19, m17, m6
shufps m1, m18, m19, q2121
pmullw m3, m1, m1
pshufb m0, m17, m7
paddw m1, m0
pshufb m17, m8
paddw m1, m17 ; sum3
punpcklwd m16, m0, m17
punpcklwd m2, m3, m4
vpdpwssd m2, m16, m16 ; sumsq3
punpckhwd m0, m17
punpckhwd m3, m4
vpdpwssd m3, m0, m0
paddw m0, m1, [t1+r10*2+416* 6]
paddd m16, m2, [t1+r10*2+416* 8]
paddd m17, m3, [t1+r10*2+416*10]
mova [t1+r10*2+416* 6], m1
mova [t1+r10*2+416* 8], m2
mova [t1+r10*2+416*10], m3
paddw m1, m18
paddw m1, m19 ; sum5
mova [t3+r10*4+416*8+ 8], m1
paddw m1, [t1+r10*2+416* 0]
mova [t1+r10*2+416* 0], m1
punpcklwd m1, m18, m19
vpdpwssd m2, m1, m1 ; sumsq5
punpckhwd m18, m19
vpdpwssd m3, m18, m18
mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row
mova [t3+r10*4+416*0+72], m3 ; in case height is odd
paddd m2, [t1+r10*2+416* 2]
paddd m3, [t1+r10*2+416* 4]
mova [t1+r10*2+416* 2], m2
mova [t1+r10*2+416* 4], m3
paddw m1, m0, [t2+r10*2+416* 6]
paddd m2, m16, [t2+r10*2+416* 8]
paddd m3, m17, [t2+r10*2+416*10]
mova [t2+r10*2+416* 6], m0
mova [t2+r10*2+416* 8], m16
mova [t2+r10*2+416*10], m17
pmulld m16, m2, m9 ; -a3 * 9
pmulld m17, m3, m9
punpcklwd m0, m4, m1 ; b3
vpdpwssd m16, m0, m0 ; -p3
punpckhwd m1, m4, m1
vpdpwssd m17, m1, m1
pmulld m16, m12 ; p3 * s1
pmulld m17, m12
pmaddwd m0, m13 ; b3 * 455
pmaddwd m1, m13
vpalignr m17{k2}, m16, m16, 2
mova m16, m22
paddusw m17, m14
psraw m17, 4 ; min(z3, 255) - 256
vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
vpmovb2m k3, m17
vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
vmovdqu8 m17{k3}, m16 ; x3
pandn m16, m24, m17
psrld m17, 16
pmulld m0, m16
pmulld m1, m17
paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m15
vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
vpternlogd m17, m1, m24, 0xd8
mova [t3+r10*4+416*4+ 8], m16
mova [t3+r10*4+416*4+ 24], xm17
vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
mova [t3+r10*4+416*4+ 72], m17
vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
vextracti32x4 [t3+r10*4+416*4+104], m16, 3
add r10, 32
jl .hv0_loop
ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
movd xm17, [leftq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
add leftq, 4
jmp .hv1_main
.hv1_extend_left:
vpbroadcastb xm17, [lpfq+wq]
vmovdqu32 ym17{k1}, [lpfq+wq-4]
jmp .hv1_main
.hv1_bottom:
lea r10, [wq-2]
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
.hv1_loop:
movu ym17, [lpfq+r10-2]
.hv1_main:
vinserti32x8 m17, [lpfq+r10+6], 1
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .hv1_have_right
cmp r10d, -34
jl .hv1_have_right
vpbroadcastb m0, [lpfq-1]
movu ym16, [r12+r10-8]
vinserti32x8 m16, [r12+r10+0], 1
vpternlogd m17, m0, m16, 0xe4
.hv1_have_right:
pshufb m3, m17, m5
pshufb m19, m17, m6
shufps m2, m3, m19, q2121
pmullw m1, m2, m2
pshufb m18, m17, m7
paddw m2, m18
pshufb m17, m8
paddw m2, m17 ; sum3
punpcklwd m16, m17, m18
punpcklwd m0, m1, m4
vpdpwssd m0, m16, m16 ; sumsq3
punpckhwd m17, m18
punpckhwd m1, m4
vpdpwssd m1, m17, m17
paddd m16, m0, [t2+r10*2+416* 8]
paddd m17, m1, [t2+r10*2+416*10]
mova [t2+r10*2+416* 8], m0
mova [t2+r10*2+416*10], m1
punpcklwd m18, m3, m19
vpdpwssd m0, m18, m18 ; sumsq5
punpckhwd m18, m3, m19
vpdpwssd m1, m18, m18
paddw m3, m19
pmulld m16, m9 ; -a3 * 9
pmulld m17, m9
paddd m18, m0, [t2+r10*2+416*2]
paddd m19, m1, [t2+r10*2+416*4]
paddd m18, [t1+r10*2+416*2]
paddd m19, [t1+r10*2+416*4]
mova [t2+r10*2+416*2], m0
mova [t2+r10*2+416*4], m1
pmulld m18, m10 ; -a5 * 25
pmulld m19, m10
paddw m1, m2, [t2+r10*2+416* 6]
mova [t2+r10*2+416* 6], m2
paddw m2, m3 ; sum5
paddw m3, m2, [t2+r10*2+416*0]
paddw m3, [t1+r10*2+416*0]
mova [t2+r10*2+416*0], m2
punpcklwd m0, m4, m1 ; b3
vpdpwssd m16, m0, m0 ; -p3
punpckhwd m1, m4, m1
vpdpwssd m17, m1, m1
punpcklwd m2, m3, m4 ; b5
vpdpwssd m18, m2, m2 ; -p5
punpckhwd m3, m4
vpdpwssd m19, m3, m3
pmulld m16, m12 ; p3 * s1
pmulld m17, m12
pmulld m18, m11 ; p5 * s0
pmulld m19, m11
pmaddwd m0, m13 ; b3 * 455
pmaddwd m1, m13
pmaddwd m2, m13 ; b5 * 164
pmaddwd m3, m13
vpalignr m17{k2}, m16, m16, 2
vpalignr m19{k2}, m18, m18, 2
paddusw m17, m14
mova m16, m22
psraw m17, 4 ; min(z3, 255) - 256
vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
vpmovb2m k3, m17
vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
paddusw m19, m14
mova m18, m22
psraw m19, 4 ; min(z5, 255) - 256
vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
vpmovb2m k4, m19
vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
vmovdqu8 m17{k3}, m16 ; x3
vmovdqu8 m19{k4}, m18 ; x5
pandn m16, m24, m17
psrld m17, 16
pmulld m0, m16
pmulld m1, m17
pandn m18, m24, m19
psrld m19, 16
pmulld m2, m18
pmulld m3, m19
paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m15
vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
vpternlogd m17, m1, m24, 0xd8
mova [t3+r10*4+416*8+ 8], m16
mova [t3+r10*4+416*8+ 24], xm17
vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m3, m15
mova [t3+r10*4+416*8+ 72], m17
vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
vextracti32x4 [t3+r10*4+416*8+104], m16, 3
vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
vpternlogd m19, m3, m24, 0xd8
mova [t3+r10*4+416*0+ 8], m18
mova [t3+r10*4+416*0+ 24], xm19
vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
mova [t3+r10*4+416*0+ 72], m19
vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
vextracti32x4 [t3+r10*4+416*0+104], m18, 3
add r10, 32
jl .hv1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.v0: ; vertical boxsums + ab3 (even rows)
lea r10, [wq-2]
.v0_loop:
mova m2, [t1+r10*2+416* 8]
mova m3, [t1+r10*2+416*10]
paddd m2, m2
paddd m3, m3
paddd m16, m2, [t2+r10*2+416* 8]
paddd m17, m3, [t2+r10*2+416*10]
mova m0, [t1+r10*2+416* 6]
paddw m0, m0
paddw m1, m0, [t2+r10*2+416* 6]
pmulld m16, m9 ; -a3 * 9
pmulld m17, m9
mova [t2+r10*2+416* 6], m0
mova [t2+r10*2+416* 8], m2
mova [t2+r10*2+416*10], m3
mova m2, [t1+r10*2+416*0]
mova m3, [t1+r10*2+416*2]
mova m18, [t1+r10*2+416*4]
punpcklwd m0, m4, m1 ; b3
vpdpwssd m16, m0, m0 ; -p3
punpckhwd m1, m4, m1
vpdpwssd m17, m1, m1
pmulld m16, m12 ; p3 * s1
pmulld m17, m12
pmaddwd m0, m13 ; b3 * 455
pmaddwd m1, m13
mova [t3+r10*4+416*8+ 8], m2
mova [t3+r10*4+416*0+ 8], m3
mova [t3+r10*4+416*0+72], m18
vpalignr m17{k2}, m16, m16, 2
mova m16, m22
paddusw m17, m14
psraw m17, 4 ; min(z3, 255) - 256
vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
vpmovb2m k3, m17
vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
vmovdqu8 m17{k3}, m16 ; x3
pandn m16, m24, m17
psrld m17, 16
pmulld m0, m16
pmulld m1, m17
paddw m2, m2 ; cc5
paddd m3, m3
paddd m18, m18
mova [t1+r10*2+416*0], m2
mova [t1+r10*2+416*2], m3
mova [t1+r10*2+416*4], m18
paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m15
vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
vpternlogd m17, m1, m24, 0xd8
mova [t3+r10*4+416*4+ 8], m16
mova [t3+r10*4+416*4+ 24], xm17
vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
mova [t3+r10*4+416*4+ 72], m17
vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
vextracti32x4 [t3+r10*4+416*4+104], m16, 3
add r10, 32
jl .v0_loop
ret
.v1: ; vertical boxsums + ab (odd rows)
lea r10, [wq-2]
.v1_loop:
mova m0, [t1+r10*2+416* 8]
paddd m16, m0, [t2+r10*2+416* 8]
mova m1, [t1+r10*2+416*10]
paddd m17, m1, [t2+r10*2+416*10]
mova m2, [t3+r10*4+416*0+ 8]
paddd m18, m2, [t2+r10*2+416* 2]
mova m3, [t3+r10*4+416*0+72]
paddd m19, m3, [t2+r10*2+416* 4]
paddd m18, [t1+r10*2+416* 2]
paddd m19, [t1+r10*2+416* 4]
mova [t2+r10*2+416* 8], m0
mova [t2+r10*2+416*10], m1
mova [t2+r10*2+416* 2], m2
mova [t2+r10*2+416* 4], m3
pmulld m16, m9 ; -a3 * 9
pmulld m17, m9
pmulld m18, m10 ; -a5 * 25
pmulld m19, m10
mova m0, [t1+r10*2+416* 6]
paddw m1, m0, [t2+r10*2+416* 6]
mova m2, [t3+r10*4+416*8+ 8]
paddw m3, m2, [t2+r10*2+416*0]
paddw m3, [t1+r10*2+416*0]
mova [t2+r10*2+416* 6], m0
mova [t2+r10*2+416*0], m2
punpcklwd m0, m4, m1 ; b3
vpdpwssd m16, m0, m0 ; -p3
punpckhwd m1, m4, m1
vpdpwssd m17, m1, m1
punpcklwd m2, m3, m4 ; b5
vpdpwssd m18, m2, m2 ; -p5
punpckhwd m3, m4
vpdpwssd m19, m3, m3
pmulld m16, m12 ; p3 * s1
pmulld m17, m12
pmulld m18, m11 ; p5 * s0
pmulld m19, m11
pmaddwd m0, m13 ; b3 * 455
pmaddwd m1, m13
pmaddwd m2, m13 ; b5 * 164
pmaddwd m3, m13
vpalignr m17{k2}, m16, m16, 2
vpalignr m19{k2}, m18, m18, 2
paddusw m17, m14
mova m16, m22
psraw m17, 4 ; min(z3, 255) - 256
vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
vpmovb2m k3, m17
vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
paddusw m19, m14
mova m18, m22
psraw m19, 4 ; min(z5, 255) - 256
vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
vpmovb2m k4, m19
vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
vmovdqu8 m17{k3}, m16 ; x3
vmovdqu8 m19{k4}, m18 ; x5
pandn m16, m24, m17
psrld m17, 16
pmulld m0, m16
pmulld m1, m17
pandn m18, m24, m19
psrld m19, 16
pmulld m2, m18
pmulld m3, m19
paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m15
vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
vpternlogd m17, m1, m24, 0xd8
mova [t3+r10*4+416*8+ 8], m16
mova [t3+r10*4+416*8+ 24], xm17
vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m3, m15
mova [t3+r10*4+416*8+ 72], m17
vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
vextracti32x4 [t3+r10*4+416*8+104], m16, 3
vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
vpternlogd m19, m3, m24, 0xd8
mova [t3+r10*4+416*0+ 8], m18
mova [t3+r10*4+416*0+ 24], xm19
vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
mova [t3+r10*4+416*0+ 72], m19
vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
vextracti32x4 [t3+r10*4+416*0+104], m18, 3
add r10, 32
jl .v1_loop
mov r10, t2
mov t2, t1
mov t1, r10
ret
.prep_n: ; initial neighbor setup
mov r10, wq
.prep_n_loop:
movu m0, [t3+r10*4+416*0+4]
paddd m1, m0, [t3+r10*4+416*0+0]
mova m16, [t3+r10*4+416*4+0]
paddd m1, [t3+r10*4+416*0+8]
mova m17, [t3+r10*4+416*8+0]
paddd m16, [t3+r10*4+416*4+8]
paddd m17, [t3+r10*4+416*8+8]
paddd m2, m16, [t3+r10*4+416*4+4]
paddd m3, m17, [t3+r10*4+416*8+4]
paddd m0, m1
pslld m1, 2
pslld m2, 2
paddd m1, m0 ; ab5 565
paddd m3, m3 ; ab3[ 0] 222
psubd m2, m16 ; ab3[-1] 343
mova [t3+r10*4+416*20], m3
pandn m0, m24, m1 ; a5 565
mova [t3+r10*4+416*24], m2
psrld m1, 12 ; b5 565
mova [t3+r10*4+416*12], m0
paddd m3, m3
mova [t3+r10*4+416*16], m1
psubd m3, m17 ; ab3[ 0] 343
mova [t3+r10*4+416*28], m3
add r10, 16
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
mov r10, wq
.n0_loop:
movu m2, [t3+r10*4+4]
paddd m3, m2, [t3+r10*4+0]
paddd m3, [t3+r10*4+8]
mova m1, [t3+r10*4+416*4+0]
paddd m2, m3
pslld m3, 2
paddd m1, [t3+r10*4+416*4+8]
paddd m3, m2
pandn m2, m24, m3
psrld m3, 12
paddd m0, m2, [t3+r10*4+416*12] ; a5
paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
mova [t3+r10*4+416*12], m2
mova [t3+r10*4+416*16], m3
paddd m2, m1, [t3+r10*4+416*4+4]
paddd m2, m2 ; ab3[ 1] 222
mova m3, [t3+r10*4+416*20]
paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
mova [t3+r10*4+416*20], m2
paddd m2, m2
psubd m2, m1 ; ab3[ 1] 343
mova [t3+r10*4+416*24], m2
paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343
pandn m1, m24, m17
psrld m17, 12
pandn m3, m24, m2
psrld m2, 12
paddd m1, m3 ; a3
pmovzxbd m3, [dstq+r10]
paddd m17, m2 ; b3 + (1 << 8)
pmaddwd m0, m3 ; a5 * src
pmaddwd m1, m3 ; a3 * src
vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
psubd m16, m0 ; b5 - a5 * src + (1 << 8)
psubd m17, m1 ; b3 - a3 * src + (1 << 8)
psrld m16, 9
pslld m17, 7
vmovdqu8 m17{k2}, m16
vpdpwssd m3, m17, m26
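; the masked byte-move above merges the 5x5 term into the low word and the
; 3x3 term into the high word of each dword, so a single vpdpwssd applies
; both w0 and w1 (packed in m26) at once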
packuswb m3, m2
vpermb m16, m27, m3
mova [dstq+r10], xm16
add r10, 16
jl .n0_loop
add dstq, strideq
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
mov r10, wq
.n1_loop:
mova m1, [t3+r10*4+416*8+0]
paddd m1, [t3+r10*4+416*8+8]
paddd m2, m1, [t3+r10*4+416*8+4]
paddd m2, m2 ; ab3[ 1] 222
mova m0, [t3+r10*4+416*20]
paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
pmovzxbd m3, [dstq+r10]
mova [t3+r10*4+416*20], m2
paddd m2, m2
psubd m2, m1 ; ab3[ 1] 343
mova [t3+r10*4+416*28], m2
paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343
pandn m1, m24, m17
psrld m17, 12
pandn m2, m24, m0
psrld m0, 12
paddd m1, m2 ; a3
paddd m17, m0 ; b3 + (1 << 8)
mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7)
pmaddwd m1, m3 ; a3 * src
pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src
vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
psubd m17, m1 ; b3 - a3 * src + (1 << 8)
psubd m16, m0 ; b5 - a5 * src + (1 << 7)
pslld m17, 7
vpalignr m17{k2}, m16, m16, 1
vpdpwssd m3, m17, m26
packuswb m3, m3
vpermb m16, m27, m3
mova [dstq+r10], xm16
add r10, 16
jl .n1_loop
add dstq, strideq
ret
%endif ; ARCH_X86_64