; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA
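; Per-direction tap offsets into the padded px buffer, two bytes per entry
; {off_k0, off_k1}, in bytes (2 bytes per pixel, stride = px row pitch).
; Row i corresponds to direction (i-2)&7; rows 8-11 repeat rows 0-3 so the
; entries for dir-2, dir and dir+2 can be read at dirq+0/+4/+8 without masking.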
%macro DIR_TABLE 1 ; stride
db 1 * %1 + 0, 2 * %1 + 0
db 1 * %1 + 0, 2 * %1 - 2
db -1 * %1 + 2, -2 * %1 + 4
db 0 * %1 + 2, -1 * %1 + 4
db 0 * %1 + 2, 0 * %1 + 4
db 0 * %1 + 2, 1 * %1 + 4
db 1 * %1 + 2, 2 * %1 + 4
db 1 * %1 + 0, 2 * %1 + 2
db 1 * %1 + 0, 2 * %1 + 0
db 1 * %1 + 0, 2 * %1 - 2
db -1 * %1 + 2, -2 * %1 + 4
db 0 * %1 + 2, -1 * %1 + 4
%endmacro
dir_table4: DIR_TABLE 16
dir_table8: DIR_TABLE 32
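; primary tap weight pairs, {4, 2} and {3, 3}, selected by bit 0 of the
; unscaled primary strength (bit 2 once scaled; see "and prid, 4" below)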
pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3
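; pmulhuw multipliers for cdef_dir: x*0x4000 >> 16 == x >> 2 (10 bpc),
; x*0x1000 >> 16 == x >> 4 (12 bpc), reducing samples to 8-bit precision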
dir_shift: times 2 dw 0x4000
times 2 dw 0x1000
pw_2048: times 2 dw 2048
pw_m16384: times 2 dw -16384
cextern cdef_dir_8bpc_avx2.main
SECTION .text
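; CDEF filter, 16 bpc: for each pixel, sum the tap-weighted constrained
; differences against up to 4 primary and 8 secondary neighbors, then
; compute dst = px + ((sum + 8 - (sum < 0)) >> 4), where
; constrain(diff, s, shift) = apply_sign(min(|diff|, max(0, s - (|diff| >> shift))), diff).
; Three code paths exist depending on which strengths are nonzero:
; .pri (primary only), .sec (secondary only) and .pri_sec (both).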
%macro CDEF_FILTER 2 ; w, h
DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
movifnidn prid, r5m
movifnidn secd, r6m
mov dird, r7m
vpbroadcastd m8, [base+pw_2048]
lea dirq, [base+dir_table%1+dirq*2]
test prid, prid
jz .sec_only
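; WIN64: xmm9/xmm10 are callee-saved, so spill them to locals here;
; they are restored at .pri_end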
%if WIN64
vpbroadcastw m6, prim
movaps [rsp+16*0], xmm9
movaps [rsp+16*1], xmm10
%else
movd xm6, prid
vpbroadcastw m6, xm6
%endif
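; pri_shift = max(0, damping - log2(pri)), with log2(pri) == 31 - lzcnt(pri),
; hence the "sub tmpd, 31"; negative results are clamped to 0 via cmovs below.
; The rorx/cmove pair descales 12 bpc strengths for tap selection only, as
; lzcnt has already been taken on the unscaled value.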
lzcnt pridmpd, prid
rorx tmpd, prid, 2
cmp dword r10m, 0xfff ; if (bpc == 12)
cmove prid, tmpd ; pri >>= 2
mov tmpd, r8m ; damping
and prid, 4
sub tmpd, 31
vpbroadcastd m9, [base+pri_taps+priq+8*0]
vpbroadcastd m10, [base+pri_taps+priq+8*1]
test secd, secd
jz .pri_only
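; WIN64: xmm11-xmm13 are spilled into the stack-argument slots of
; parameters that have already been consumed (bot, sec, damping)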
%if WIN64
movaps r8m, xmm13
vpbroadcastw m13, secm
movaps r4m, xmm11
movaps r6m, xmm12
%else
movd xm0, secd
vpbroadcastw m13, xm0
%endif
lzcnt secd, secd
xor prid, prid
add pridmpd, tmpd
cmovs pridmpd, prid
add secd, tmpd
lea tmpq, [px]
mov [pri_shift], pridmpq
mov [sec_shift], secq
%rep %1*%2/16
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
movaps xmm11, r4m
movaps xmm12, r6m
movaps xmm13, r8m
%endif
jmp .pri_end
.pri_only:
add pridmpd, tmpd
cmovs pridmpd, secd
lea tmpq, [px]
mov [pri_shift], pridmpq
%rep %1*%2/16
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.pri_end:
%if WIN64
movaps xmm9, [rsp+16*0]
movaps xmm10, [rsp+16*1]
%endif
.end:
RET
.sec_only:
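; the secondary strength is a power of two, so tzcnt gives log2 directly:
; sec_shift = damping - log2(sec)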
mov tmpd, r8m ; damping
%if WIN64
vpbroadcastw m6, secm
%else
movd xm6, secd
vpbroadcastw m6, xm6
%endif
tzcnt secd, secd
sub tmpd, secd
mov [sec_shift], tmpq
lea tmpq, [px]
%rep %1*%2/16
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
jmp .end
%if %1 == %2
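; the filter kernels below are emitted only for the square sizes; 4x8
; reuses the 4x4 kernels, calling each one twice per pass. Each call
; filters 64 bytes of px (four 4-pixel rows or two 8-pixel rows), with
; tmpq advancing through the padded buffer.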
ALIGN function_align
.pri:
movsx offq, byte [dirq+4] ; off_k0
%if %1 == 4
mova m1, [tmpq+32*0]
punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3
movu m2, [tmpq+offq+32*0]
punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
neg offq
movu m3, [tmpq+offq+32*0]
punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
%else
mova xm1, [tmpq+32*0]
vinserti128 m1, [tmpq+32*1], 1
movu xm2, [tmpq+offq+32*0]
vinserti128 m2, [tmpq+offq+32*1], 1
neg offq
movu xm3, [tmpq+offq+32*0]
vinserti128 m3, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+5] ; off_k1
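; branchless constrain(): psubusw computes max(0, strength - (|diff| >> shift))
; via unsigned saturation, pminsw caps it at |diff|, psignw reapplies the sign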
psubw m2, m1 ; diff_k0p0
psubw m3, m1 ; diff_k0p1
pabsw m4, m2 ; adiff_k0p0
psrlw m5, m4, [pri_shift+gprsize]
psubusw m0, m6, m5
pabsw m5, m3 ; adiff_k0p1
pminsw m0, m4
psrlw m4, m5, [pri_shift+gprsize]
psignw m0, m2 ; constrain(diff_k0p0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movu m4, [tmpq+offq+32*0]
punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
neg offq
movu m5, [tmpq+offq+32*0]
punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
%else
movu xm4, [tmpq+offq+32*0]
vinserti128 m4, [tmpq+offq+32*1], 1
neg offq
movu xm5, [tmpq+offq+32*0]
vinserti128 m5, [tmpq+offq+32*1], 1
%endif
psubw m4, m1 ; diff_k1p0
psubw m5, m1 ; diff_k1p1
psignw m2, m3 ; constrain(diff_k0p1)
pabsw m3, m4 ; adiff_k1p0
paddw m0, m2 ; constrain(diff_k0)
psrlw m2, m3, [pri_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k1p1
pminsw m7, m3
psrlw m3, m2, [pri_shift+gprsize]
psignw m7, m4 ; constrain(diff_k1p0)
psubusw m4, m6, m3
pminsw m4, m2
psignw m4, m5 ; constrain(diff_k1p1)
paddw m7, m4 ; constrain(diff_k1)
pmullw m0, m9 ; pri_tap_k0
pmullw m7, m10 ; pri_tap_k1
paddw m0, m7 ; sum
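; (sum + 8 - (sum < 0)) >> 4: psraw/paddw bias negative sums by -1,
; pmulhrsw with 2048 then yields (sum + 8) >> 4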
psraw m2, m0, 15
paddw m0, m2
pmulhrsw m0, m8
add tmpq, 32*2
paddw m0, m1
%if %1 == 4
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r9 ], xm1
lea dstq, [dstq+strideq*4]
%else
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
%endif
ret
ALIGN function_align
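; secondary-only path: 8 taps along the two cross directions dir+2 and
; dir-2, tap weights {2, 1} per distance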
.sec:
movsx offq, byte [dirq+8] ; off1_k0
%if %1 == 4
mova m1, [tmpq+32*0]
punpcklqdq m1, [tmpq+32*1]
movu m2, [tmpq+offq+32*0]
punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
neg offq
movu m3, [tmpq+offq+32*0]
punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
%else
mova xm1, [tmpq+32*0]
vinserti128 m1, [tmpq+32*1], 1
movu xm2, [tmpq+offq+32*0]
vinserti128 m2, [tmpq+offq+32*1], 1
neg offq
movu xm3, [tmpq+offq+32*0]
vinserti128 m3, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+0] ; off2_k0
psubw m2, m1 ; diff_k0s0
psubw m3, m1 ; diff_k0s1
pabsw m4, m2 ; adiff_k0s0
psrlw m5, m4, [sec_shift+gprsize]
psubusw m0, m6, m5
pabsw m5, m3 ; adiff_k0s1
pminsw m0, m4
psrlw m4, m5, [sec_shift+gprsize]
psignw m0, m2 ; constrain(diff_k0s0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movu m4, [tmpq+offq+32*0]
punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
neg offq
movu m5, [tmpq+offq+32*0]
punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
%else
movu xm4, [tmpq+offq+32*0]
vinserti128 m4, [tmpq+offq+32*1], 1
neg offq
movu xm5, [tmpq+offq+32*0]
vinserti128 m5, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+9] ; off1_k1
psubw m4, m1 ; diff_k0s2
psubw m5, m1 ; diff_k0s3
psignw m2, m3 ; constrain(diff_k0s1)
pabsw m3, m4 ; adiff_k0s2
paddw m0, m2
psrlw m2, m3, [sec_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k0s3
pminsw m7, m3
psrlw m3, m2, [sec_shift+gprsize]
psignw m7, m4 ; constrain(diff_k0s2)
psubusw m4, m6, m3
pminsw m4, m2
%if %1 == 4
movu m2, [tmpq+offq+32*0]
punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
neg offq
movu m3, [tmpq+offq+32*0]
punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
%else
movu xm2, [tmpq+offq+32*0]
vinserti128 m2, [tmpq+offq+32*1], 1
neg offq
movu xm3, [tmpq+offq+32*0]
vinserti128 m3, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+1] ; off2_k1
paddw m0, m7
psignw m4, m5 ; constrain(diff_k0s3)
paddw m0, m4 ; constrain(diff_k0)
psubw m2, m1 ; diff_k1s0
psubw m3, m1 ; diff_k1s1
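; apply secondary tap weight 2 to the k0 sum (k1 keeps weight 1)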
paddw m0, m0 ; sec_tap_k0
pabsw m4, m2 ; adiff_k1s0
psrlw m5, m4, [sec_shift+gprsize]
psubusw m7, m6, m5
pabsw m5, m3 ; adiff_k1s1
pminsw m7, m4
psrlw m4, m5, [sec_shift+gprsize]
psignw m7, m2 ; constrain(diff_k1s0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movu m4, [tmpq+offq+32*0]
punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
neg offq
movu m5, [tmpq+offq+32*0]
punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
%else
movu xm4, [tmpq+offq+32*0]
vinserti128 m4, [tmpq+offq+32*1], 1
neg offq
movu xm5, [tmpq+offq+32*0]
vinserti128 m5, [tmpq+offq+32*1], 1
%endif
paddw m0, m7
psubw m4, m1 ; diff_k1s2
psubw m5, m1 ; diff_k1s3
psignw m2, m3 ; constrain(diff_k1s1)
pabsw m3, m4 ; adiff_k1s2
paddw m0, m2
psrlw m2, m3, [sec_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k1s3
pminsw m7, m3
psrlw m3, m2, [sec_shift+gprsize]
psignw m7, m4 ; constrain(diff_k1s2)
psubusw m4, m6, m3
pminsw m4, m2
paddw m0, m7
psignw m4, m5 ; constrain(diff_k1s3)
paddw m0, m4 ; sum
psraw m2, m0, 15
paddw m0, m2
pmulhrsw m0, m8
add tmpq, 32*2
paddw m0, m1
%if %1 == 4
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r9 ], xm1
lea dstq, [dstq+strideq*4]
%else
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
%endif
ret
ALIGN function_align
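; combined primary+secondary path; additionally tracks the signed max (m11)
; and unsigned min (m12) of the center pixel and every sampled tap, and
; clamps the filtered result to that range. The -16384 (0xc000) edge padding
; is huge for pminuw and very negative for pmaxsw, so padded pixels never
; tighten the clamp.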
.pri_sec:
movsx offq, byte [dirq+8] ; off2_k0
%if %1 == 4
mova m1, [tmpq+32*0]
punpcklqdq m1, [tmpq+32*1]
movu m2, [tmpq+offq+32*0]
punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
neg offq
movu m3, [tmpq+offq+32*0]
punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
%else
mova xm1, [tmpq+32*0]
vinserti128 m1, [tmpq+32*1], 1
movu xm2, [tmpq+offq+32*0]
vinserti128 m2, [tmpq+offq+32*1], 1
neg offq
movu xm3, [tmpq+offq+32*0]
vinserti128 m3, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+0] ; off3_k0
pmaxsw m11, m2, m3
pminuw m12, m2, m3
psubw m2, m1 ; diff_k0s0
psubw m3, m1 ; diff_k0s1
pabsw m4, m2 ; adiff_k0s0
psrlw m5, m4, [sec_shift+gprsize]
psubusw m0, m13, m5
pabsw m5, m3 ; adiff_k0s1
pminsw m0, m4
psrlw m4, m5, [sec_shift+gprsize]
psignw m0, m2 ; constrain(diff_k0s0)
psubusw m2, m13, m4
pminsw m2, m5
%if %1 == 4
movu m4, [tmpq+offq+32*0]
punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
neg offq
movu m5, [tmpq+offq+32*0]
punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
%else
movu xm4, [tmpq+offq+32*0]
vinserti128 m4, [tmpq+offq+32*1], 1
neg offq
movu xm5, [tmpq+offq+32*0]
vinserti128 m5, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+9] ; off2_k1
psignw m2, m3 ; constrain(diff_k0s1)
pmaxsw m11, m4
pminuw m12, m4
pmaxsw m11, m5
pminuw m12, m5
psubw m4, m1 ; diff_k0s2
psubw m5, m1 ; diff_k0s3
paddw m0, m2
pabsw m3, m4 ; adiff_k0s2
psrlw m2, m3, [sec_shift+gprsize]
psubusw m7, m13, m2
pabsw m2, m5 ; adiff_k0s3
pminsw m7, m3
psrlw m3, m2, [sec_shift+gprsize]
psignw m7, m4 ; constrain(diff_k0s2)
psubusw m4, m13, m3
pminsw m4, m2
%if %1 == 4
movu m2, [tmpq+offq+32*0]
punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
neg offq
movu m3, [tmpq+offq+32*0]
punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
%else
movu xm2, [tmpq+offq+32*0]
vinserti128 m2, [tmpq+offq+32*1], 1
neg offq
movu xm3, [tmpq+offq+32*0]
vinserti128 m3, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+1] ; off3_k1
paddw m0, m7
psignw m4, m5 ; constrain(diff_k0s3)
pmaxsw m11, m2
pminuw m12, m2
pmaxsw m11, m3
pminuw m12, m3
paddw m0, m4 ; constrain(diff_k0)
psubw m2, m1 ; diff_k1s0
psubw m3, m1 ; diff_k1s1
paddw m0, m0 ; sec_tap_k0
pabsw m4, m2 ; adiff_k1s0
psrlw m5, m4, [sec_shift+gprsize]
psubusw m7, m13, m5
pabsw m5, m3 ; adiff_k1s1
pminsw m7, m4
psrlw m4, m5, [sec_shift+gprsize]
psignw m7, m2 ; constrain(diff_k1s0)
psubusw m2, m13, m4
pminsw m2, m5
%if %1 == 4
movu m4, [tmpq+offq+32*0]
punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
neg offq
movu m5, [tmpq+offq+32*0]
punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
%else
movu xm4, [tmpq+offq+32*0]
vinserti128 m4, [tmpq+offq+32*1], 1
neg offq
movu xm5, [tmpq+offq+32*0]
vinserti128 m5, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+4] ; off1_k0
paddw m0, m7
psignw m2, m3 ; constrain(diff_k1s1)
pmaxsw m11, m4
pminuw m12, m4
pmaxsw m11, m5
pminuw m12, m5
psubw m4, m1 ; diff_k1s2
psubw m5, m1 ; diff_k1s3
pabsw m3, m4 ; adiff_k1s2
paddw m0, m2
psrlw m2, m3, [sec_shift+gprsize]
psubusw m7, m13, m2
pabsw m2, m5 ; adiff_k1s3
pminsw m7, m3
psrlw m3, m2, [sec_shift+gprsize]
psignw m7, m4 ; constrain(diff_k1s2)
psubusw m4, m13, m3
pminsw m4, m2
paddw m0, m7
%if %1 == 4
movu m2, [tmpq+offq+32*0]
punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
neg offq
movu m3, [tmpq+offq+32*0]
punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
%else
movu xm2, [tmpq+offq+32*0]
vinserti128 m2, [tmpq+offq+32*1], 1
neg offq
movu xm3, [tmpq+offq+32*0]
vinserti128 m3, [tmpq+offq+32*1], 1
%endif
movsx offq, byte [dirq+5] ; off1_k1
psignw m4, m5 ; constrain(diff_k1s3)
pmaxsw m11, m2
pminuw m12, m2
pmaxsw m11, m3
pminuw m12, m3
psubw m2, m1 ; diff_k0p0
psubw m3, m1 ; diff_k0p1
paddw m0, m4
pabsw m4, m2 ; adiff_k0p0
psrlw m5, m4, [pri_shift+gprsize]
psubusw m7, m6, m5
pabsw m5, m3 ; adiff_k0p1
pminsw m7, m4
psrlw m4, m5, [pri_shift+gprsize]
psignw m7, m2 ; constrain(diff_k0p0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movu m4, [tmpq+offq+32*0]
punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
neg offq
movu m5, [tmpq+offq+32*0]
punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
%else
movu xm4, [tmpq+offq+32*0]
vinserti128 m4, [tmpq+offq+32*1], 1
neg offq
movu xm5, [tmpq+offq+32*0]
vinserti128 m5, [tmpq+offq+32*1], 1
%endif
psignw m2, m3 ; constrain(diff_k0p1)
paddw m7, m2 ; constrain(diff_k0)
pmaxsw m11, m4
pminuw m12, m4
pmaxsw m11, m5
pminuw m12, m5
psubw m4, m1 ; diff_k1p0
psubw m5, m1 ; diff_k1p1
pabsw m3, m4 ; adiff_k1p0
pmullw m7, m9 ; pri_tap_k0
paddw m0, m7
psrlw m2, m3, [pri_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k1p1
pminsw m7, m3
psrlw m3, m2, [pri_shift+gprsize]
psignw m7, m4 ; constrain(diff_k1p0)
psubusw m4, m6, m3
pminsw m4, m2
psignw m4, m5 ; constrain(diff_k1p1)
paddw m7, m4 ; constrain(diff_k1)
pmullw m7, m10 ; pri_tap_k1
paddw m0, m7 ; sum
psraw m2, m0, 15
paddw m0, m2
pmulhrsw m0, m8
add tmpq, 32*2
pmaxsw m11, m1
pminuw m12, m1
paddw m0, m1
pminsw m0, m11
pmaxsw m0, m12
%if %1 == 4
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r9 ], xm1
lea dstq, [dstq+strideq*4]
%else
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
%endif
ret
%endif
%endmacro
INIT_YMM avx2
cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
pri, sec, edge
%if WIN64
%define px rsp+16*6
%define offq r8
%define pri_shift rsp+16*2
%define sec_shift rsp+16*3
%else
%define px rsp+16*4
%define offq r4
%define pri_shift rsp+16*0
%define sec_shift rsp+16*1
%endif
%define base r8-dir_table4
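; px points into a 16-byte-pitch scratch block: rows -2/-1 top padding,
; rows 0-3 the source pixels, rows 4-5 bottom padding, plus 2 pixels of
; left (-4) and right (+8) padding per row; unavailable neighbors are
; filled with -16384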
mov edged, r9m
lea r8, [dir_table4]
movu xm0, [dstq+strideq*0]
movu xm1, [dstq+strideq*1]
lea r9, [strideq*3]
movu xm2, [dstq+strideq*2]
movu xm3, [dstq+r9 ]
vpbroadcastd m7, [base+pw_m16384]
mova [px+16*0+0], xm0
mova [px+16*1+0], xm1
mova [px+16*2+0], xm2
mova [px+16*3+0], xm3
test edgeb, 4 ; HAVE_TOP
jz .no_top
movu xm0, [topq+strideq*0]
movu xm1, [topq+strideq*1]
mova [px-16*2+0], xm0
mova [px-16*1+0], xm1
test edgeb, 1 ; HAVE_LEFT
jz .top_no_left
movd xm0, [topq+strideq*0-4]
movd xm1, [topq+strideq*1-4]
movd [px-16*2-4], xm0
movd [px-16*1-4], xm1
jmp .top_done
.no_top:
mova [px-16*2+0], m7
.top_no_left:
movd [px-16*2-4], xm7
movd [px-16*1-4], xm7
.top_done:
test edgeb, 8 ; HAVE_BOTTOM
jz .no_bottom
movu xm0, [botq+strideq*0]
movu xm1, [botq+strideq*1]
mova [px+16*4+0], xm0
mova [px+16*5+0], xm1
test edgeb, 1 ; HAVE_LEFT
jz .bottom_no_left
movd xm0, [botq+strideq*0-4]
movd xm1, [botq+strideq*1-4]
movd [px+16*4-4], xm0
movd [px+16*5-4], xm1
jmp .bottom_done
.no_bottom:
mova [px+16*4+0], m7
.bottom_no_left:
movd [px+16*4-4], xm7
movd [px+16*5-4], xm7
.bottom_done:
test edgeb, 1 ; HAVE_LEFT
jz .no_left
movd xm0, [leftq+4*0]
movd xm1, [leftq+4*1]
movd xm2, [leftq+4*2]
movd xm3, [leftq+4*3]
movd [px+16*0-4], xm0
movd [px+16*1-4], xm1
movd [px+16*2-4], xm2
movd [px+16*3-4], xm3
jmp .left_done
.no_left:
REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
.left_done:
test edgeb, 2 ; HAVE_RIGHT
jnz .padding_done
REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
.padding_done:
CDEF_FILTER 4, 4
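; 4x8 shares the px/base/shift defines and the filter kernels of the
; 4x4 function above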
cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
pri, sec, edge
mov edged, r9m
movu xm0, [dstq+strideq*0]
movu xm1, [dstq+strideq*1]
lea r9, [strideq*3]
movu xm2, [dstq+strideq*2]
movu xm3, [dstq+r9 ]
lea r6, [dstq+strideq*4]
movu xm4, [r6 +strideq*0]
movu xm5, [r6 +strideq*1]
movu xm6, [r6 +strideq*2]
movu xm7, [r6 +r9 ]
lea r8, [dir_table4]
mova [px+16*0+0], xm0
mova [px+16*1+0], xm1
mova [px+16*2+0], xm2
mova [px+16*3+0], xm3
mova [px+16*4+0], xm4
mova [px+16*5+0], xm5
mova [px+16*6+0], xm6
mova [px+16*7+0], xm7
vpbroadcastd m7, [base+pw_m16384]
test edgeb, 4 ; HAVE_TOP
jz .no_top
movu xm0, [topq+strideq*0]
movu xm1, [topq+strideq*1]
mova [px-16*2+0], xm0
mova [px-16*1+0], xm1
test edgeb, 1 ; HAVE_LEFT
jz .top_no_left
movd xm0, [topq+strideq*0-4]
movd xm1, [topq+strideq*1-4]
movd [px-16*2-4], xm0
movd [px-16*1-4], xm1
jmp .top_done
.no_top:
mova [px-16*2+0], m7
.top_no_left:
movd [px-16*2-4], xm7
movd [px-16*1-4], xm7
.top_done:
test edgeb, 8 ; HAVE_BOTTOM
jz .no_bottom
movu xm0, [botq+strideq*0]
movu xm1, [botq+strideq*1]
mova [px+16*8+0], xm0
mova [px+16*9+0], xm1
test edgeb, 1 ; HAVE_LEFT
jz .bottom_no_left
movd xm0, [botq+strideq*0-4]
movd xm1, [botq+strideq*1-4]
movd [px+16*8-4], xm0
movd [px+16*9-4], xm1
jmp .bottom_done
.no_bottom:
mova [px+16*8+0], m7
.bottom_no_left:
movd [px+16*8-4], xm7
movd [px+16*9-4], xm7
.bottom_done:
test edgeb, 1 ; HAVE_LEFT
jz .no_left
movd xm0, [leftq+4*0]
movd xm1, [leftq+4*1]
movd xm2, [leftq+4*2]
movd xm3, [leftq+4*3]
movd [px+16*0-4], xm0
movd [px+16*1-4], xm1
movd [px+16*2-4], xm2
movd [px+16*3-4], xm3
movd xm0, [leftq+4*4]
movd xm1, [leftq+4*5]
movd xm2, [leftq+4*6]
movd xm3, [leftq+4*7]
movd [px+16*4-4], xm0
movd [px+16*5-4], xm1
movd [px+16*6-4], xm2
movd [px+16*7-4], xm3
jmp .left_done
.no_left:
REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
test edgeb, 2 ; HAVE_RIGHT
jnz .padding_done
REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
CDEF_FILTER 4, 8
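; 8x8 uses 32-byte px rows (8 pixels plus padding) and its own direction
; table scaled accordingly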
cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
pri, sec, edge
%if WIN64
%define px rsp+32*4
%else
%define px rsp+32*3
%endif
%define base r8-dir_table8
mov edged, r9m
movu m0, [dstq+strideq*0]
movu m1, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movu m2, [r6 +strideq*0]
movu m3, [r6 +strideq*1]
lea r6, [r6 +strideq*2]
movu m4, [r6 +strideq*0]
movu m5, [r6 +strideq*1]
lea r6, [r6 +strideq*2]
movu m6, [r6 +strideq*0]
movu m7, [r6 +strideq*1]
lea r8, [dir_table8]
mova [px+32*0+0], m0
mova [px+32*1+0], m1
mova [px+32*2+0], m2
mova [px+32*3+0], m3
mova [px+32*4+0], m4
mova [px+32*5+0], m5
mova [px+32*6+0], m6
mova [px+32*7+0], m7
vpbroadcastd m7, [base+pw_m16384]
test edgeb, 4 ; HAVE_TOP
jz .no_top
movu m0, [topq+strideq*0]
movu m1, [topq+strideq*1]
mova [px-32*2+0], m0
mova [px-32*1+0], m1
test edgeb, 1 ; HAVE_LEFT
jz .top_no_left
movd xm0, [topq+strideq*0-4]
movd xm1, [topq+strideq*1-4]
movd [px-32*2-4], xm0
movd [px-32*1-4], xm1
jmp .top_done
.no_top:
mova [px-32*2+0], m7
mova [px-32*1+0], m7
.top_no_left:
movd [px-32*2-4], xm7
movd [px-32*1-4], xm7
.top_done:
test edgeb, 8 ; HAVE_BOTTOM
jz .no_bottom
movu m0, [botq+strideq*0]
movu m1, [botq+strideq*1]
mova [px+32*8+0], m0
mova [px+32*9+0], m1
test edgeb, 1 ; HAVE_LEFT
jz .bottom_no_left
movd xm0, [botq+strideq*0-4]
movd xm1, [botq+strideq*1-4]
movd [px+32*8-4], xm0
movd [px+32*9-4], xm1
jmp .bottom_done
.no_bottom:
mova [px+32*8+0], m7
mova [px+32*9+0], m7
.bottom_no_left:
movd [px+32*8-4], xm7
movd [px+32*9-4], xm7
.bottom_done:
test edgeb, 1 ; HAVE_LEFT
jz .no_left
movd xm0, [leftq+4*0]
movd xm1, [leftq+4*1]
movd xm2, [leftq+4*2]
movd xm3, [leftq+4*3]
movd [px+32*0-4], xm0
movd [px+32*1-4], xm1
movd [px+32*2-4], xm2
movd [px+32*3-4], xm3
movd xm0, [leftq+4*4]
movd xm1, [leftq+4*5]
movd xm2, [leftq+4*6]
movd xm3, [leftq+4*7]
movd [px+32*4-4], xm0
movd [px+32*5-4], xm1
movd [px+32*6-4], xm2
movd [px+32*7-4], xm3
jmp .left_done
.no_left:
REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
test edgeb, 2 ; HAVE_RIGHT
jnz .padding_done
REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
CDEF_FILTER 8, 8
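; cdef_dir rescales the samples to 8-bit precision (pmulhuw with 0x4000
; or 0x1000, i.e. >> 2 or >> 4) and tail-calls the 8 bpc direction search;
; rows 4-7 go into the upper lanes in reverse order (pairs 0/7, 1/6, 2/5,
; 3/4), matching the register layout the 8 bpc .main entry expects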
cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
lea r6, [dir_shift]
shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
vpbroadcastd m4, [r6+bdmaxq*4]
lea r6, [strideq*3]
mova xm0, [srcq+strideq*0]
mova xm1, [srcq+strideq*1]
mova xm2, [srcq+strideq*2]
mova xm3, [srcq+r6 ]
lea srcq, [srcq+strideq*4]
vinserti128 m0, [srcq+r6 ], 1
vinserti128 m1, [srcq+strideq*2], 1
vinserti128 m2, [srcq+strideq*1], 1
vinserti128 m3, [srcq+strideq*0], 1
REPX {pmulhuw x, m4}, m0, m1, m2, m3
jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%endif ; ARCH_X86_64