; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA
filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
pb_0_1: times 4 db 0, 1
pb_2_3: times 4 db 2, 3
pw_1: times 4 dw 1
pw_2: times 4 dw 2
pw_4: times 4 dw 4
pw_512: times 4 dw 512
pw_2048: times 4 dw 2048
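; JMP_TABLE builds a table of 32-bit offsets relative to (table - 2*4);
; the -2*4 bias accounts for tzcnt(w) starting at 2 for w=4, so entries
; can be fetched directly with [table+wq*4] after a tzcnt.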
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
%rotate 1
%endrep
%endmacro
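; The ipred_dc table doubles as the table for the splat and 128 entry
; points: the s* write loops start 10 entries in and the second set of
; s* entries 15 entries in, hence the aliases below.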
%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4)
%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)
JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64
cextern smooth_weights_1d_16bpc
cextern smooth_weights_2d_16bpc
cextern filter_intra_taps
SECTION .text
INIT_XMM ssse3
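; dc_top: dst = (sum(top[0..w-1]) + w/2) >> log2(w), splatted across the
; block. pavgw against zero computes the (w+1)>>1 rounding bias in one op.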
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
LEA r5, ipred_dc_left_16bpc_ssse3_table
movd m4, wm
tzcnt wd, wm
add tlq, 2
movifnidn hd, hm
pxor m3, m3
pavgw m4, m3
movd m5, wd
movu m0, [tlq]
movsxd r6, [r5+wq*4]
add r6, r5
add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_left_16bpc_ssse3_table
mov hd, hm
movd m4, hm
tzcnt r6d, hd
sub tlq, hq
tzcnt wd, wm
pxor m3, m3
sub tlq, hq ; tlq -= h*2 in total: start of the left edge
pavgw m4, m3
movd m5, r6d
movu m0, [tlq]
movsxd r6, [r5+r6*4]
add r6, r5
add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
.h64:
movu m2, [tlq+112]
movu m1, [tlq+ 96]
paddw m0, m2
movu m2, [tlq+ 80]
paddw m1, m2
movu m2, [tlq+ 64]
paddw m0, m2
paddw m0, m1
.h32:
movu m1, [tlq+ 48]
movu m2, [tlq+ 32]
paddw m1, m2
paddw m0, m1
.h16:
movu m1, [tlq+ 16]
paddw m0, m1
.h8:
movhlps m1, m0
paddw m0, m1
.h4:
punpcklwd m0, m3
paddd m4, m0
punpckhqdq m0, m0
paddd m0, m4
pshuflw m4, m0, q1032
paddd m0, m4
psrld m0, m5
lea stride3q, [strideq*3]
pshuflw m0, m0, q0000
punpcklqdq m0, m0
jmp wq
cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
tzcnt r6d, hd
lea r5d, [wq+hq]
movd m4, r5d
tzcnt r5d, r5d
movd m5, r5d
LEA r5, ipred_dc_16bpc_ssse3_table
tzcnt wd, wd
movsxd r6, [r5+r6*4]
movsxd wq, [r5+wq*4+5*4]
pxor m3, m3
psrlw m4, 1 ; (w+h) >> 1: rounding bias for the sum
add r6, r5
add wq, r5
lea stride3q, [strideq*3]
jmp r6
.h4:
movq m0, [tlq-8]
jmp wq
.w4:
movq m1, [tlq+2]
paddw m1, m0
punpckhwd m0, m3
punpcklwd m1, m3
paddd m0, m1
paddd m4, m0
punpckhqdq m0, m0
paddd m0, m4
pshuflw m1, m0, q1032
paddd m0, m1
cmp hd, 4
jg .w4_mul
psrlw m0, 3
jmp .w4_end
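; w+h is not a power of two here: divide by the power-of-two factor with
; shifts, then by 3 or 5 using fixed-point reciprocals.
; 0xAAAB ~ 2^17/3 and 0x6667 ~ 2^17/5, applied as (x*m) >> 17 via
; pmulhuw followed by psrlw 1.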
.w4_mul:
mov r2d, 0xAAAB
mov r3d, 0x6667
cmp hd, 16
cmove r2d, r3d
psrld m0, 2
movd m1, r2d
pmulhuw m0, m1
psrlw m0, 1
.w4_end:
pshuflw m0, m0, q0000
.s4:
movq [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
movq [dstq+strideq*2], m0
movq [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s4
RET
.h8:
mova m0, [tlq-16]
jmp wq
.w8:
movu m1, [tlq+2]
paddw m0, m1
punpcklwd m1, m0, m3
punpckhwd m0, m3
paddd m0, m1
paddd m4, m0
punpckhqdq m0, m0
paddd m0, m4
pshuflw m1, m0, q1032
paddd m0, m1
psrld m0, m5
cmp hd, 8
je .w8_end
mov r2d, 0xAAAB
mov r3d, 0x6667
cmp hd, 32
cmove r2d, r3d
movd m1, r2d
pmulhuw m0, m1
psrlw m0, 1
.w8_end:
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s8:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s8
RET
.h16:
mova m0, [tlq-32]
paddw m0, [tlq-16]
jmp wq
.w16:
movu m1, [tlq+ 2]
movu m2, [tlq+18]
paddw m1, m2
paddw m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
paddd m0, m1
paddd m4, m0
punpckhqdq m0, m0
paddd m0, m4
pshuflw m1, m0, q1032
paddd m0, m1
psrld m0, m5
cmp hd, 16
je .w16_end
mov r2d, 0xAAAB
mov r3d, 0x6667
test hd, 8|32
cmovz r2d, r3d
movd m1, r2d
pmulhuw m0, m1
psrlw m0, 1
.w16_end:
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s16c:
mova m1, m0
.s16:
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m1
mova [dstq+strideq*1+16*0], m0
mova [dstq+strideq*1+16*1], m1
mova [dstq+strideq*2+16*0], m0
mova [dstq+strideq*2+16*1], m1
mova [dstq+stride3q +16*0], m0
mova [dstq+stride3q +16*1], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s16
RET
.h32:
mova m0, [tlq-64]
paddw m0, [tlq-48]
paddw m0, [tlq-32]
paddw m0, [tlq-16]
jmp wq
.w32:
movu m1, [tlq+ 2]
movu m2, [tlq+18]
paddw m1, m2
movu m2, [tlq+34]
paddw m0, m2
movu m2, [tlq+50]
paddw m1, m2
paddw m0, m1
punpcklwd m1, m0, m3
punpckhwd m0, m3
paddd m0, m1
paddd m4, m0
punpckhqdq m0, m0
paddd m0, m4
pshuflw m1, m0, q1032
paddd m0, m1
psrld m0, m5
cmp hd, 32
je .w32_end
mov r2d, 0xAAAB
mov r3d, 0x6667
cmp hd, 8
cmove r2d, r3d
movd m1, r2d
pmulhuw m0, m1
psrlw m0, 1
.w32_end:
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s32c:
mova m1, m0
mova m2, m0
mova m3, m0
.s32:
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m1
mova [dstq+strideq*0+16*2], m2
mova [dstq+strideq*0+16*3], m3
mova [dstq+strideq*1+16*0], m0
mova [dstq+strideq*1+16*1], m1
mova [dstq+strideq*1+16*2], m2
mova [dstq+strideq*1+16*3], m3
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .s32
RET
.h64:
mova m0, [tlq-128]
mova m1, [tlq-112]
paddw m0, [tlq- 96]
paddw m1, [tlq- 80]
paddw m0, [tlq- 64]
paddw m1, [tlq- 48]
paddw m0, [tlq- 32]
paddw m1, [tlq- 16]
paddw m0, m1
jmp wq
.w64:
movu m1, [tlq+ 2]
movu m2, [tlq+ 18]
paddw m1, m2
movu m2, [tlq+ 34]
paddw m0, m2
movu m2, [tlq+ 50]
paddw m1, m2
movu m2, [tlq+ 66]
paddw m0, m2
movu m2, [tlq+ 82]
paddw m1, m2
movu m2, [tlq+ 98]
paddw m0, m2
movu m2, [tlq+114]
paddw m1, m2
paddw m0, m1
punpcklwd m1, m0, m3
punpckhwd m0, m3
paddd m0, m1
paddd m4, m0
punpckhqdq m0, m0
paddd m0, m4
pshuflw m1, m0, q1032
paddd m0, m1
psrld m0, m5
cmp hd, 64
je .w64_end
mov r2d, 0xAAAB
mov r3d, 0x6667
cmp hd, 16
cmove r2d, r3d
movd m1, r2d
pmulhuw m0, m1
psrlw m0, 1
.w64_end:
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s64:
mova [dstq+16*0], m0
mova [dstq+16*1], m0
mova [dstq+16*2], m0
mova [dstq+16*3], m0
mova [dstq+16*4], m0
mova [dstq+16*5], m0
mova [dstq+16*6], m0
mova [dstq+16*7], m0
add dstq, strideq
dec hd
jg .s64
RET
cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
mov r6d, r8m
LEA r5, ipred_dc_128_16bpc_ssse3_table
tzcnt wd, wm
shr r6d, 11
movifnidn hd, hm
movsxd wq, [r5+wq*4]
movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_16bpc_ssse3_table
movifnidn hd, hm
movu m0, [tlq+ 2]
movu m1, [tlq+ 18]
movu m2, [tlq+ 34]
movu m3, [tlq+ 50]
cmp wd, 64
je .w64
tzcnt wd, wd
movsxd wq, [r5+wq*4]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
.w64:
WIN64_SPILL_XMM 8
movu m4, [tlq+ 66]
movu m5, [tlq+ 82]
movu m6, [tlq+ 98]
movu m7, [tlq+114]
.w64_loop:
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
mova [dstq+16*4], m4
mova [dstq+16*5], m5
mova [dstq+16*6], m6
mova [dstq+16*7], m7
add dstq, strideq
dec hd
jg .w64_loop
RET
cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
%define base r5-ipred_h_16bpc_ssse3_table
tzcnt wd, wm
LEA r5, ipred_h_16bpc_ssse3_table
movifnidn hd, hm
movsxd wq, [r5+wq*4]
movddup m2, [base+pb_0_1]
movddup m3, [base+pb_2_3]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
.w4:
sub tlq, 8
movq m3, [tlq]
pshuflw m0, m3, q3333
pshuflw m1, m3, q2222
pshuflw m2, m3, q1111
pshuflw m3, m3, q0000
movq [dstq+strideq*0], m0
movq [dstq+strideq*1], m1
movq [dstq+strideq*2], m2
movq [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4
RET
.w8:
sub tlq, 8
movq m3, [tlq]
punpcklwd m3, m3
pshufd m0, m3, q3333
pshufd m1, m3, q2222
pshufd m2, m3, q1111
pshufd m3, m3, q0000
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8
RET
.w16:
sub tlq, 4
movd m1, [tlq]
pshufb m0, m1, m3
pshufb m1, m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m0
mova [dstq+strideq*1+16*0], m1
mova [dstq+strideq*1+16*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16
RET
.w32:
sub tlq, 4
movd m1, [tlq]
pshufb m0, m1, m3
pshufb m1, m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m0
mova [dstq+strideq*0+16*2], m0
mova [dstq+strideq*0+16*3], m0
mova [dstq+strideq*1+16*0], m1
mova [dstq+strideq*1+16*1], m1
mova [dstq+strideq*1+16*2], m1
mova [dstq+strideq*1+16*3], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32
RET
.w64:
sub tlq, 2
movd m0, [tlq]
pshufb m0, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m0
mova [dstq+16*2], m0
mova [dstq+16*3], m0
mova [dstq+16*4], m0
mova [dstq+16*5], m0
mova [dstq+16*6], m0
mova [dstq+16*7], m0
add dstq, strideq
dec hd
jg .w64
RET
cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
%define base r5-ipred_paeth_16bpc_ssse3_table
movifnidn hd, hm
pshuflw m4, [tlq], q0000
mov leftq, tlq
add hd, hd
punpcklqdq m4, m4 ; topleft
sub leftq, hq
and wd, ~7 ; w is 4 or a multiple of 8,
jnz .w8 ; so only w=4 clears wd
movddup m5, [tlq+2] ; top
psubw m6, m5, m4
pabsw m7, m6
.w4_loop:
movd m1, [leftq+hq-4]
punpcklwd m1, m1
punpckldq m1, m1 ; left
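; Paeth: base = left + top - topleft; predict with whichever of left, top
; and topleft is nearest to base. m7 = |top-topleft| = |base-left| is
; independent of the left pixel, so it is computed once per top vector.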
%macro PAETH 0
paddw m0, m6, m1
psubw m2, m4, m0 ; tldiff
psubw m0, m5 ; tdiff
pabsw m2, m2
pabsw m0, m0
pminsw m2, m0
pcmpeqw m0, m2
pand m3, m5, m0
pandn m0, m4
por m0, m3
pcmpgtw m3, m7, m2
pand m0, m3
pandn m3, m1
por m0, m3
%endmacro
PAETH
movhps [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2*2
jg .w4_loop
RET
.w8:
%if ARCH_X86_32
PUSH r6
%define r7d hm
%assign regs_used 7
%elif WIN64
movaps r4m, m8
PUSH r7
%assign regs_used 8
%endif
%if ARCH_X86_64
movddup m8, [pb_0_1]
%endif
lea tlq, [tlq+wq*2+2]
neg wq
mov r7d, hd
.w8_loop0:
movu m5, [tlq+wq*2]
mov r6, dstq
add dstq, 16
psubw m6, m5, m4
pabsw m7, m6
.w8_loop:
movd m1, [leftq+hq-2]
%if ARCH_X86_64
pshufb m1, m8
%else
pshuflw m1, m1, q0000
punpcklqdq m1, m1
%endif
PAETH
mova [r6], m0
add r6, strideq
sub hd, 1*2
jg .w8_loop
mov hd, r7d
add wq, 8
jl .w8_loop0
%if WIN64
movaps m8, r4m
%endif
RET
%if ARCH_X86_64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 4
%endif
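; smooth_v: pred = bottom + w[y] * (top - bottom), computed with pmulhrsw;
; the smooth_weights_1d_16bpc table is presumably prescaled so this yields
; the AV1 rounding (w*(top-bottom) + 128) >> 8. smooth_h mirrors the same
; scheme horizontally with left/right and w[x].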
cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
LEA weightsq, smooth_weights_1d_16bpc
mov hd, hm
lea weightsq, [weightsq+hq*4]
neg hq
movd m5, [tlq+hq*2] ; bottom
pshuflw m5, m5, q0000
punpcklqdq m5, m5
cmp wd, 4
jne .w8
movddup m4, [tlq+2] ; top
lea r3, [strideq*3]
psubw m4, m5 ; top - bottom
.w4_loop:
movq m1, [weightsq+hq*2]
punpcklwd m1, m1
pshufd m0, m1, q1100
punpckhdq m1, m1
pmulhrsw m0, m4
pmulhrsw m1, m4
paddw m0, m5
paddw m1, m5
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [dstq+strideq*2], m1
movhps [dstq+r3 ], m1
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w4_loop
RET
.w8:
%if ARCH_X86_32
PUSH r6
%assign regs_used 7
mov hm, hq
%define hq hm
%elif WIN64
PUSH r7
%assign regs_used 8
%endif
.w8_loop0:
mov t0, hq
movu m4, [tlq+2]
add tlq, 16
mov r6, dstq
add dstq, 16
psubw m4, m5
.w8_loop:
movq m3, [weightsq+t0*2]
punpcklwd m3, m3
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
REPX {paddw x, m5}, m0, m1, m2, m3
mova [r6+strideq*0], m0
mova [r6+strideq*1], m1
lea r6, [r6+strideq*2]
mova [r6+strideq*0], m2
mova [r6+strideq*1], m3
lea r6, [r6+strideq*2]
add t0, 4
jl .w8_loop
sub wd, 8
jg .w8_loop0
RET
cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
LEA weightsq, smooth_weights_1d_16bpc
mov wd, wm
movifnidn hd, hm
movd m5, [tlq+wq*2] ; right
sub tlq, 8
add hd, hd
pshuflw m5, m5, q0000
sub tlq, hq
punpcklqdq m5, m5
cmp wd, 4
jne .w8
movddup m4, [weightsq+4*2]
lea r3, [strideq*3]
.w4_loop:
movq m1, [tlq+hq] ; left
punpcklwd m1, m1
psubw m1, m5 ; left - right
pshufd m0, m1, q3322
punpckldq m1, m1
pmulhrsw m0, m4
pmulhrsw m1, m4
paddw m0, m5
paddw m1, m5
movhps [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
movhps [dstq+strideq*2], m1
movq [dstq+r3 ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4*2
jg .w4_loop
RET
.w8:
lea weightsq, [weightsq+wq*4]
neg wq
%if ARCH_X86_32
PUSH r6
%assign regs_used 7
%define hd hm
%elif WIN64
PUSH r7
%assign regs_used 8
%endif
.w8_loop0:
mov t0d, hd
mova m4, [weightsq+wq*2]
mov r6, dstq
add dstq, 16
.w8_loop:
movq m3, [tlq+t0*(1+ARCH_X86_32)]
punpcklwd m3, m3
psubw m3, m5
pshufd m0, m3, q3333
pshufd m1, m3, q2222
pshufd m2, m3, q1111
pshufd m3, m3, q0000
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
REPX {paddw x, m5}, m0, m1, m2, m3
mova [r6+strideq*0], m0
mova [r6+strideq*1], m1
lea r6, [r6+strideq*2]
mova [r6+strideq*0], m2
mova [r6+strideq*1], m3
lea r6, [r6+strideq*2]
sub t0d, 4*(1+ARCH_X86_64)
jg .w8_loop
add wq, 8
jl .w8_loop0
RET
%if ARCH_X86_64
DECLARE_REG_TMP 10
%else
DECLARE_REG_TMP 3
%endif
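; 2-D smooth, assuming smooth_weights_2d_16bpc stores (w, 256-w) pairs:
; pmaddwd blends interleaved (top, bottom) pairs with the vertical weights
; and (left, right) pairs with the horizontal ones; the two dot products
; sum to a Q9 value, so psrld 8 plus pavgw with zero produces the final
; rounded >> 9.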
cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
h_weights, v_weights, top
LEA h_weightsq, smooth_weights_2d_16bpc
mov wd, wm
mov hd, hm
movd m7, [tlq+wq*2] ; right
lea v_weightsq, [h_weightsq+hq*8]
neg hq
movd m6, [tlq+hq*2] ; bottom
pshuflw m7, m7, q0000
pshuflw m6, m6, q0000
cmp wd, 4
jne .w8
movq m4, [tlq+2] ; top
mova m5, [h_weightsq+4*4]
punpcklwd m4, m6 ; top, bottom
pxor m6, m6
.w4_loop:
movq m1, [v_weightsq+hq*4]
sub tlq, 4
movd m3, [tlq] ; left
pshufd m0, m1, q0000
pshufd m1, m1, q1111
pmaddwd m0, m4
punpcklwd m3, m7 ; left, right
pmaddwd m1, m4
pshufd m2, m3, q1111
pshufd m3, m3, q0000
pmaddwd m2, m5
pmaddwd m3, m5
paddd m0, m2
paddd m1, m3
psrld m0, 8
psrld m1, 8
packssdw m0, m1
pavgw m0, m6
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
add hq, 2
jl .w4_loop
RET
.w8:
%if ARCH_X86_32
lea h_weightsq, [h_weightsq+wq*4]
mov t0, tlq
mov r1m, tlq
mov r2m, hq
%define m8 [h_weightsq+16*0]
%define m9 [h_weightsq+16*1]
%else
%if WIN64
movaps r4m, m8
movaps r6m, m9
PUSH r7
PUSH r8
%endif
PUSH r9
PUSH r10
%assign regs_used 11
lea h_weightsq, [h_weightsq+wq*8]
lea topq, [tlq+wq*2]
neg wq
mov r8, tlq
mov r9, hq
%endif
punpcklqdq m6, m6
.w8_loop0:
%if ARCH_X86_32
movu m5, [t0+2]
add t0, 16
mov r0m, t0
%else
movu m5, [topq+wq*2+2]
mova m8, [h_weightsq+wq*4+16*0]
mova m9, [h_weightsq+wq*4+16*1]
%endif
mov t0, dstq
add dstq, 16
punpcklwd m4, m5, m6
punpckhwd m5, m6
.w8_loop:
movd m1, [v_weightsq+hq*4]
sub tlq, 2
movd m3, [tlq] ; left
pshufd m1, m1, q0000
pmaddwd m0, m4, m1
pshuflw m3, m3, q0000
pmaddwd m1, m5
punpcklwd m3, m7 ; left, right
pmaddwd m2, m8, m3
pmaddwd m3, m9
paddd m0, m2
paddd m1, m3
psrld m0, 8
psrld m1, 8
packssdw m0, m1
pxor m1, m1
pavgw m0, m1
mova [t0], m0
add t0, strideq
inc hq
jl .w8_loop
%if ARCH_X86_32
mov t0, r0m
mov tlq, r1m
add h_weightsq, 16*2
mov hq, r2m
sub dword wm, 8
jg .w8_loop0
%else
mov tlq, r8
mov hq, r9
add wq, 8
jl .w8_loop0
%endif
%if WIN64
movaps m8, r4m
movaps m9, r6m
%endif
RET
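; filter intra: every 4x2 output block is computed by eight 7-tap filters
; over (tl, t0..t3, l0, l1). The taps are 8-bit and are kept in the high
; byte of each word (see below), which scales products by 2^8; psrad 11
; plus pavgw then implement the spec's (x + 8) >> 4.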
%if ARCH_X86_64
cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
%else
cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
%define m8 [esp+16*0]
%define m9 [esp+16*1]
%define m10 [esp+16*2]
%define m11 [esp+16*3]
%define m12 [esp+16*4]
%define m13 [esp+16*5]
%define m14 [esp+16*6]
%define m15 [esp+16*7]
%endif
%define base r6-$$
movifnidn hd, hm
movd m6, r8m ; bitdepth_max
%ifidn filterd, filterm
movzx filterd, filterb
%else
movzx filterd, byte filterm
%endif
LEA r6, $$
shl filterd, 6
movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
mova m1, [base+filter_intra_taps+filterq+16*0]
mova m2, [base+filter_intra_taps+filterq+16*1]
mova m3, [base+filter_intra_taps+filterq+16*2]
mova m4, [base+filter_intra_taps+filterq+16*3]
pxor m5, m5
%if ARCH_X86_64
punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper
punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid
punpcklbw m10, m5, m2 ; having to perform sign-extension.
punpckhbw m11, m5, m2
punpcklbw m12, m5, m3
punpckhbw m13, m5, m3
punpcklbw m14, m5, m4
punpckhbw m15, m5, m4
%else
punpcklbw m7, m5, m1
mova m8, m7
punpckhbw m7, m5, m1
mova m9, m7
punpcklbw m7, m5, m2
mova m10, m7
punpckhbw m7, m5, m2
mova m11, m7
punpcklbw m7, m5, m3
mova m12, m7
punpckhbw m7, m5, m3
mova m13, m7
punpcklbw m7, m5, m4
mova m14, m7
punpckhbw m7, m5, m4
mova m15, m7
%endif
mova m7, [base+filter_shuf]
add hd, hd
mov r5, dstq
pshuflw m6, m6, q0000
mov r6, tlq
punpcklqdq m6, m6
sub tlq, hq
.left_loop:
pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __
pshufd m1, m0, q0000
pmaddwd m2, m8, m1
pmaddwd m1, m9
pshufd m4, m0, q1111
pmaddwd m3, m10, m4
pmaddwd m4, m11
paddd m2, m3
paddd m1, m4
pshufd m4, m0, q2222
pmaddwd m3, m12, m4
pmaddwd m4, m13
paddd m2, m3
paddd m1, m4
pshufd m3, m0, q3333
pmaddwd m0, m14, m3
pmaddwd m3, m15
paddd m0, m2
paddd m1, m3
psrad m0, 11 ; x >> 3
psrad m1, 11
packssdw m0, m1
pmaxsw m0, m5
pavgw m0, m5 ; (x + 8) >> 4
pminsw m0, m6
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movlps m0, [tlq+hq-10]
lea dstq, [dstq+strideq*2]
sub hd, 2*2
jg .left_loop
sub wd, 4
jz .end
sub tld, r6d ; -h*2
sub r6, r5 ; tl-dst
.right_loop0:
add r5, 8
mov hd, tld
movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
mov dstq, r5
.right_loop:
pshufd m2, m0, q0000
pmaddwd m1, m8, m2
pmaddwd m2, m9
pshufd m4, m0, q1111
pmaddwd m3, m10, m4
pmaddwd m4, m11
pinsrw m0, [dstq+strideq*0-2], 5
paddd m1, m3
paddd m2, m4
pshufd m0, m0, q2222
movddup m4, [dstq+strideq*1-8]
pmaddwd m3, m12, m0
pmaddwd m0, m13
paddd m1, m3
paddd m0, m2
pshuflw m2, m4, q3333
punpcklwd m2, m5
pmaddwd m3, m14, m2
pmaddwd m2, m15
paddd m1, m3
paddd m0, m2
psrad m1, 11
psrad m0, 11
packssdw m0, m1
pmaxsw m0, m5
pavgw m0, m5
pminsw m0, m6
movhps [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
palignr m0, m4, 14
lea dstq, [dstq+strideq*2]
add hd, 2*2
jl .right_loop
sub wd, 4
jg .right_loop0
.end:
RET
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
LEA t0, ipred_cfl_left_16bpc_ssse3_table
movd m4, wd
tzcnt wd, wd
movifnidn hd, hm
add tlq, 2
movsxd r6, [t0+wq*4]
movd m5, wd
jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
movifnidn hd, hm
LEA t0, ipred_cfl_left_16bpc_ssse3_table
tzcnt wd, wm
lea r6d, [hq*2]
movd m4, hd
sub tlq, r6
tzcnt r6d, hd
movd m5, r6d
movsxd r6, [t0+r6*4]
.start:
movd m7, r7m
movu m0, [tlq]
add r6, t0
add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
movsxd wq, [t0+wq*4]
pxor m6, m6
pshuflw m7, m7, q0000
pcmpeqw m3, m3
add wq, t0
movifnidn acq, acmp
pavgw m4, m6
punpcklqdq m7, m7
jmp r6
.h32:
movu m1, [tlq+48]
movu m2, [tlq+32]
paddw m0, m1
paddw m0, m2
.h16:
movu m1, [tlq+16]
paddw m0, m1
.h8:
pshufd m1, m0, q1032
paddw m0, m1
.h4:
pmaddwd m0, m3
psubd m4, m0
pshuflw m0, m4, q1032
paddd m0, m4
psrld m0, m5
pshuflw m0, m0, q0000
punpcklqdq m0, m0
jmp wq
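; IPRED_CFL: %1 = dst, %2 = ac (clobbered).
; dst = clip(dc + ((ac * alpha + 32) >> 6), 0, bitdepth_max).
; m2 holds |alpha| << 9 so that pmulhrsw(|ac|, m2) = (|ac|*|alpha| + 32) >> 6;
; the sign of ac*alpha is restored afterwards with psignw.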
%macro IPRED_CFL 2 ; dst, src
pabsw m%1, m%2
pmulhrsw m%1, m2
psignw m%2, m1
psignw m%1, m%2
paddw m%1, m0
pmaxsw m%1, m6
pminsw m%1, m7
%endmacro
cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
movifnidn hd, hm
tzcnt r6d, hd
lea t0d, [wq+hq]
movd m4, t0d
tzcnt t0d, t0d
movd m5, t0d
LEA t0, ipred_cfl_16bpc_ssse3_table
tzcnt wd, wd
movd m7, r7m
movsxd r6, [t0+r6*4]
movsxd wq, [t0+wq*4+4*4]
psrlw m4, 1 ; (w+h) >> 1: rounding bias for the sum
pxor m6, m6
pshuflw m7, m7, q0000
add r6, t0
add wq, t0
movifnidn acq, acmp
pcmpeqw m3, m3
punpcklqdq m7, m7
jmp r6
.h4:
movq m0, [tlq-8]
jmp wq
.w4:
movq m1, [tlq+2]
paddw m0, m1
pmaddwd m0, m3
psubd m4, m0
pshufd m0, m4, q1032
paddd m0, m4
pshuflw m4, m0, q1032
paddd m0, m4
cmp hd, 4
jg .w4_mul
psrld m0, 3
jmp .w4_end
.w4_mul:
mov r6d, 0xAAAB
mov r2d, 0x6667
cmp hd, 16
cmove r6d, r2d
movd m1, r6d
psrld m0, 2
pmulhuw m0, m1
psrlw m0, 1
.w4_end:
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s4:
movd m1, alpham
lea r6, [strideq*3]
pshuflw m1, m1, q0000
punpcklqdq m1, m1
pabsw m2, m1
psllw m2, 9
.s4_loop:
mova m4, [acq+16*0]
mova m5, [acq+16*1]
add acq, 16*2
IPRED_CFL 3, 4
IPRED_CFL 4, 5
movq [dstq+strideq*0], m3
movhps [dstq+strideq*1], m3
movq [dstq+strideq*2], m4
movhps [dstq+r6 ], m4
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s4_loop
RET
.h8:
mova m0, [tlq-16]
jmp wq
.w8:
movu m1, [tlq+2]
paddw m0, m1
pmaddwd m0, m3
psubd m4, m0
pshufd m0, m4, q1032
paddd m0, m4
pshuflw m4, m0, q1032
paddd m0, m4
psrld m0, m5
cmp hd, 8
je .w8_end
mov r6d, 0xAAAB
mov r2d, 0x6667
cmp hd, 32
cmove r6d, r2d
movd m1, r6d
pmulhuw m0, m1
psrlw m0, 1
.w8_end:
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s8:
movd m1, alpham
pshuflw m1, m1, q0000
punpcklqdq m1, m1
pabsw m2, m1
psllw m2, 9
.s8_loop:
mova m4, [acq+16*0]
mova m5, [acq+16*1]
add acq, 16*2
IPRED_CFL 3, 4
IPRED_CFL 4, 5
mova [dstq+strideq*0], m3
mova [dstq+strideq*1], m4
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .s8_loop
RET
.h16:
mova m0, [tlq-32]
paddw m0, [tlq-16]
jmp wq
.w16:
movu m1, [tlq+ 2]
movu m2, [tlq+18]
paddw m1, m2
paddw m0, m1
pmaddwd m0, m3
psubd m4, m0
pshufd m0, m4, q1032
paddd m0, m4
pshuflw m4, m0, q1032
paddd m0, m4
psrld m0, m5
cmp hd, 16
je .w16_end
mov r6d, 0xAAAB
mov r2d, 0x6667
test hd, 8|32
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
psrlw m0, 1
.w16_end:
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s16:
movd m1, alpham
pshuflw m1, m1, q0000
punpcklqdq m1, m1
pabsw m2, m1
psllw m2, 9
.s16_loop:
mova m4, [acq+16*0]
mova m5, [acq+16*1]
add acq, 16*2
IPRED_CFL 3, 4
IPRED_CFL 4, 5
mova [dstq+16*0], m3
mova [dstq+16*1], m4
add dstq, strideq
dec hd
jg .s16_loop
RET
.h32:
mova m0, [tlq-64]
paddw m0, [tlq-48]
paddw m0, [tlq-32]
paddw m0, [tlq-16]
jmp wq
.w32:
movu m1, [tlq+ 2]
movu m2, [tlq+18]
paddw m1, m2
movu m2, [tlq+34]
paddw m1, m2
movu m2, [tlq+50]
paddw m1, m2
paddw m0, m1
pmaddwd m0, m3
psubd m4, m0
pshufd m0, m4, q1032
paddd m0, m4
pshuflw m4, m0, q1032
paddd m0, m4
psrld m0, m5
cmp hd, 32
je .w32_end
mov r6d, 0xAAAB
mov r2d, 0x6667
cmp hd, 8
cmove r6d, r2d
movd m1, r6d
pmulhuw m0, m1
psrlw m0, 1
.w32_end:
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s32:
movd m1, alpham
pshuflw m1, m1, q0000
punpcklqdq m1, m1
pabsw m2, m1
psllw m2, 9
.s32_loop:
mova m4, [acq+16*0]
mova m5, [acq+16*1]
IPRED_CFL 3, 4
IPRED_CFL 4, 5
mova [dstq+16*0], m3
mova [dstq+16*1], m4
mova m4, [acq+16*2]
mova m5, [acq+16*3]
add acq, 16*4
IPRED_CFL 3, 4
IPRED_CFL 4, 5
mova [dstq+16*2], m3
mova [dstq+16*3], m4
add dstq, strideq
dec hd
jg .s32_loop
RET
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
tzcnt wd, wm
LEA t0, ipred_cfl_splat_16bpc_ssse3_table
mov r6d, r7m
movifnidn hd, hm
shr r6d, 11
movd m7, r7m
movsxd wq, [t0+wq*4]
movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
pshuflw m7, m7, q0000
pxor m6, m6
add wq, t0
movifnidn acq, acmp
punpcklqdq m7, m7
jmp wq
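; cfl_ac_420: each 2x2 luma block is summed scaled by 2 (pmaddwd against
; pw_2 handles the horizontal pair), i.e. 4*avg << 1 = avg << 3, giving
; Q3 AC values. wpad/hpad arrive in units of 4 chroma pixels/rows.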
cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
movifnidn hpadd, hpadm
%if ARCH_X86_32 && PIC
pcmpeqw m5, m5
pabsw m5, m5
paddw m5, m5
%else
movddup m5, [pw_2]
%endif
mov hd, hm
shl hpadd, 2
pxor m4, m4
sub hd, hpadd
cmp dword wm, 8
mov r5, acq
jg .w16
je .w8
lea r3, [strideq*3]
.w4_loop:
pmaddwd m0, m5, [ypxq+strideq*0]
pmaddwd m1, m5, [ypxq+strideq*1]
pmaddwd m2, m5, [ypxq+strideq*2]
pmaddwd m3, m5, [ypxq+r3 ]
lea ypxq, [ypxq+strideq*4]
paddd m0, m1
paddd m2, m3
paddd m4, m0
packssdw m0, m2
paddd m4, m2
mova [acq], m0
add acq, 16
sub hd, 2
jg .w4_loop
test hpadd, hpadd
jz .dc
punpckhqdq m0, m0
pslld m2, 2
.w4_hpad:
mova [acq+16*0], m0
paddd m4, m2
mova [acq+16*1], m0
add acq, 16*2
sub hpadd, 4
jg .w4_hpad
jmp .dc
.w8:
%if ARCH_X86_32
cmp dword wpadm, 0
%else
test wpadd, wpadd
%endif
jnz .w8_wpad1
.w8_loop:
pmaddwd m0, m5, [ypxq+strideq*0+16*0]
pmaddwd m2, m5, [ypxq+strideq*1+16*0]
pmaddwd m1, m5, [ypxq+strideq*0+16*1]
pmaddwd m3, m5, [ypxq+strideq*1+16*1]
lea ypxq, [ypxq+strideq*2]
paddd m0, m2
paddd m1, m3
paddd m2, m0, m1
packssdw m0, m1
paddd m4, m2
mova [acq], m0
add acq, 16
dec hd
jg .w8_loop
.w8_hpad:
test hpadd, hpadd
jz .dc
pslld m2, 2
mova m1, m0
jmp .hpad
.w8_wpad1:
pmaddwd m0, m5, [ypxq+strideq*0]
pmaddwd m1, m5, [ypxq+strideq*1]
lea ypxq, [ypxq+strideq*2]
paddd m0, m1
pshufd m1, m0, q3333
paddd m2, m0, m1
packssdw m0, m1
paddd m4, m2
mova [acq], m0
add acq, 16
dec hd
jg .w8_wpad1
jmp .w8_hpad
.w16_wpad3:
pshufd m3, m0, q3333
mova m1, m3
mova m2, m3
jmp .w16_wpad_end
.w16_wpad2:
pshufd m1, m3, q3333
mova m2, m1
jmp .w16_wpad_end
.w16_wpad1:
pshufd m2, m1, q3333
jmp .w16_wpad_end
.w16:
movifnidn wpadd, wpadm
WIN64_SPILL_XMM 7
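; wpad is 0-3 here and the flags set by "cmp wpadd, 2" survive the SSE
; ops: jg/je catch wpad==3/2, and jp catches wpad==1 since 1-2 = 0xff
; has even parity while 0-2 = 0xfe does not.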
.w16_loop:
pmaddwd m0, m5, [ypxq+strideq*0+16*0]
pmaddwd m6, m5, [ypxq+strideq*1+16*0]
paddd m0, m6
cmp wpadd, 2
jg .w16_wpad3
pmaddwd m3, m5, [ypxq+strideq*0+16*1]
pmaddwd m6, m5, [ypxq+strideq*1+16*1]
paddd m3, m6
je .w16_wpad2
pmaddwd m1, m5, [ypxq+strideq*0+16*2]
pmaddwd m6, m5, [ypxq+strideq*1+16*2]
paddd m1, m6
jp .w16_wpad1
pmaddwd m2, m5, [ypxq+strideq*0+16*3]
pmaddwd m6, m5, [ypxq+strideq*1+16*3]
paddd m2, m6
.w16_wpad_end:
lea ypxq, [ypxq+strideq*2]
paddd m6, m0, m3
packssdw m0, m3
paddd m6, m1
mova [acq+16*0], m0
packssdw m1, m2
paddd m2, m6
mova [acq+16*1], m1
add acq, 16*2
paddd m4, m2
dec hd
jg .w16_loop
WIN64_RESTORE_XMM
add hpadd, hpadd
jz .dc
paddd m2, m2
.hpad:
mova [acq+16*0], m0
mova [acq+16*1], m1
paddd m4, m2
mova [acq+16*2], m0
mova [acq+16*3], m1
add acq, 16*4
sub hpadd, 4
jg .hpad
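; Subtract the rounded average so the stored AC block is zero-mean:
; r5 = -w*h*2 bytes, hence tzcnt(r5d)-2 = log2(w*h)-1; psrld by that
; followed by pavgw with zero is a rounded sum >> log2(w*h).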
.dc:
sub r5, acq ; -w*h*2
pshufd m2, m4, q1032
tzcnt r1d, r5d
paddd m2, m4
sub r1d, 2
pshufd m4, m2, q2301
movd m0, r1d
paddd m2, m4
psrld m2, m0
pxor m0, m0
pavgw m2, m0
packssdw m2, m2
.dc_loop:
mova m0, [acq+r5+16*0]
mova m1, [acq+r5+16*1]
psubw m0, m2
psubw m1, m2
mova [acq+r5+16*0], m0
mova [acq+r5+16*1], m1
add r5, 16*2
jl .dc_loop
RET
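; cfl_ac_422: only horizontally subsampled; pmaddwd against pw_4 sums
; each horizontal pair scaled by 4, again producing Q3 AC values.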
cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
movifnidn hpadd, hpadm
%if ARCH_X86_32 && PIC
pcmpeqw m5, m5
pabsw m5, m5
psllw m5, 2
%else
movddup m5, [pw_4]
%endif
mov hd, hm
shl hpadd, 2
pxor m4, m4
sub hd, hpadd
cmp dword wm, 8
mov r5, acq
jg .w16
je .w8
lea r3, [strideq*3]
.w4_loop:
pmaddwd m0, m5, [ypxq+strideq*0]
pmaddwd m3, m5, [ypxq+strideq*1]
pmaddwd m1, m5, [ypxq+strideq*2]
pmaddwd m2, m5, [ypxq+r3 ]
lea ypxq, [ypxq+strideq*4]
paddd m4, m0
packssdw m0, m3
paddd m3, m1
packssdw m1, m2
paddd m4, m2
paddd m4, m3
mova [acq+16*0], m0
mova [acq+16*1], m1
add acq, 16*2
sub hd, 4
jg .w4_loop
test hpadd, hpadd
jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
punpckhqdq m1, m1
pslld m2, 3
mova [acq+16*0], m1
mova [acq+16*1], m1
paddd m4, m2
mova [acq+16*2], m1
mova [acq+16*3], m1
add acq, 16*4
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
%if ARCH_X86_32
cmp dword wpadm, 0
%else
test wpadd, wpadd
%endif
jnz .w8_wpad1
.w8_loop:
pmaddwd m0, m5, [ypxq+strideq*0+16*0]
pmaddwd m2, m5, [ypxq+strideq*0+16*1]
pmaddwd m1, m5, [ypxq+strideq*1+16*0]
pmaddwd m3, m5, [ypxq+strideq*1+16*1]
lea ypxq, [ypxq+strideq*2]
paddd m4, m0
packssdw m0, m2
paddd m4, m2
mova [acq+16*0], m0
paddd m2, m1, m3
packssdw m1, m3
paddd m4, m2
mova [acq+16*1], m1
add acq, 16*2
sub hd, 2
jg .w8_loop
.w8_hpad:
test hpadd, hpadd
jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
pslld m2, 2
mova m0, m1
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w8_wpad1:
pmaddwd m0, m5, [ypxq+strideq*0]
pmaddwd m1, m5, [ypxq+strideq*1]
lea ypxq, [ypxq+strideq*2]
pshufd m2, m0, q3333
pshufd m3, m1, q3333
paddd m4, m0
packssdw m0, m2
paddd m4, m2
paddd m2, m1, m3
packssdw m1, m3
paddd m4, m2
mova [acq+16*0], m0
mova [acq+16*1], m1
add acq, 16*2
sub hd, 2
jg .w8_wpad1
jmp .w8_hpad
.w16_wpad3:
pshufd m3, m0, q3333
mova m1, m3
mova m2, m3
jmp .w16_wpad_end
.w16_wpad2:
pshufd m1, m3, q3333
mova m2, m1
jmp .w16_wpad_end
.w16_wpad1:
pshufd m2, m1, q3333
jmp .w16_wpad_end
.w16:
movifnidn wpadd, wpadm
WIN64_SPILL_XMM 7
.w16_loop:
pmaddwd m0, m5, [ypxq+16*0]
cmp wpadd, 2
jg .w16_wpad3
pmaddwd m3, m5, [ypxq+16*1]
je .w16_wpad2
pmaddwd m1, m5, [ypxq+16*2]
jp .w16_wpad1
pmaddwd m2, m5, [ypxq+16*3]
.w16_wpad_end:
add ypxq, strideq
paddd m6, m0, m3
packssdw m0, m3
mova [acq+16*0], m0
paddd m6, m1
packssdw m1, m2
paddd m2, m6
mova [acq+16*1], m1
add acq, 16*2
paddd m4, m2
dec hd
jg .w16_loop
WIN64_RESTORE_XMM
add hpadd, hpadd
jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
paddd m2, m2
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
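; cfl_ac_444: no subsampling; pixels are simply shifted to Q3 with
; psllw 3, while pmaddwd against pw_1 accumulates the sum for the
; average-subtraction pass.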
cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table
tzcnt wd, wm
movifnidn hpadd, hpadm
pxor m4, m4
movsxd wq, [r6+wq*4]
movddup m5, [base+pw_1]
add wq, r6
mov hd, hm
shl hpadd, 2
sub hd, hpadd
jmp wq
.w4:
lea r3, [strideq*3]
mov r5, acq
.w4_loop:
movq m0, [ypxq+strideq*0]
movhps m0, [ypxq+strideq*1]
movq m1, [ypxq+strideq*2]
movhps m1, [ypxq+r3 ]
lea ypxq, [ypxq+strideq*4]
psllw m0, 3
psllw m1, 3
mova [acq+16*0], m0
pmaddwd m0, m5
mova [acq+16*1], m1
pmaddwd m2, m5, m1
add acq, 16*2
paddd m4, m0
paddd m4, m2
sub hd, 4
jg .w4_loop
test hpadd, hpadd
jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
punpckhqdq m1, m1
mova [acq+16*0], m1
pslld m2, 2
mova [acq+16*1], m1
punpckhqdq m2, m2
mova [acq+16*2], m1
paddd m4, m2
mova [acq+16*3], m1
add acq, 16*4
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
mov r5, acq
.w8_loop:
mova m0, [ypxq+strideq*0]
mova m1, [ypxq+strideq*1]
lea ypxq, [ypxq+strideq*2]
psllw m0, 3
psllw m1, 3
mova [acq+16*0], m0
pmaddwd m0, m5
mova [acq+16*1], m1
pmaddwd m2, m5, m1
add acq, 16*2
paddd m4, m0
paddd m4, m2
sub hd, 2
jg .w8_loop
.w8_hpad:
test hpadd, hpadd
jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
pslld m2, 2
mova m0, m1
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w16_wpad2:
pshufhw m3, m2, q3333
pshufhw m1, m0, q3333
punpckhqdq m3, m3
punpckhqdq m1, m1
jmp .w16_wpad_end
.w16:
movifnidn wpadd, wpadm
mov r5, acq
.w16_loop:
mova m2, [ypxq+strideq*0+16*0]
mova m0, [ypxq+strideq*1+16*0]
psllw m2, 3
psllw m0, 3
test wpadd, wpadd
jnz .w16_wpad2
mova m3, [ypxq+strideq*0+16*1]
mova m1, [ypxq+strideq*1+16*1]
psllw m3, 3
psllw m1, 3
.w16_wpad_end:
lea ypxq, [ypxq+strideq*2]
mova [acq+16*0], m2
pmaddwd m2, m5
mova [acq+16*1], m3
pmaddwd m3, m5
paddd m4, m2
pmaddwd m2, m5, m0
mova [acq+16*2], m0
paddd m4, m3
pmaddwd m3, m5, m1
mova [acq+16*3], m1
add acq, 16*4
paddd m2, m3
paddd m4, m2
sub hd, 2
jg .w16_loop
add hpadd, hpadd
jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
paddd m2, m2
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w32_wpad6:
pshufhw m1, m0, q3333
punpckhqdq m1, m1
mova m2, m1
mova m3, m1
jmp .w32_wpad_end
.w32_wpad4:
pshufhw m2, m1, q3333
punpckhqdq m2, m2
mova m3, m2
jmp .w32_wpad_end
.w32_wpad2:
pshufhw m3, m2, q3333
punpckhqdq m3, m3
jmp .w32_wpad_end
.w32:
movifnidn wpadd, wpadm
mov r5, acq
WIN64_SPILL_XMM 8
.w32_loop:
mova m0, [ypxq+16*0]
psllw m0, 3
cmp wpadd, 4
jg .w32_wpad6
mova m1, [ypxq+16*1]
psllw m1, 3
je .w32_wpad4
mova m2, [ypxq+16*2]
psllw m2, 3
jnp .w32_wpad2
mova m3, [ypxq+16*3]
psllw m3, 3
.w32_wpad_end:
add ypxq, strideq
pmaddwd m6, m5, m0
mova [acq+16*0], m0
pmaddwd m7, m5, m1
mova [acq+16*1], m1
paddd m6, m7
pmaddwd m7, m5, m2
mova [acq+16*2], m2
paddd m6, m7
pmaddwd m7, m5, m3
mova [acq+16*3], m3
add acq, 16*4
paddd m6, m7
paddd m4, m6
dec hd
jg .w32_loop
%if WIN64
mova m5, m6
WIN64_RESTORE_XMM
SWAP 5, 6
%endif
test hpadd, hpadd
jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w32_hpad_loop:
mova [acq+16*0], m0
mova [acq+16*1], m1
paddd m4, m6
mova [acq+16*2], m2
mova [acq+16*3], m3
add acq, 16*4
dec hpadd
jg .w32_hpad_loop
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
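; pal_pred: pal_pred_shuf splits the 8 palette entries into low bytes
; (low half of m3) and high bytes (low half of m4 after the pshufd);
; each 8-bit index then selects both halves via pshufb, and
; punpcklbw/punpckhbw re-interleave them into 16-bit pixels.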
cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
%define hd r2d
%endif
mova m3, [palq]
LEA r2, pal_pred_16bpc_ssse3_table
tzcnt wd, wm
pshufb m3, [base+pal_pred_shuf]
movsxd wq, [r2+wq*4]
pshufd m4, m3, q1032
add wq, r2
movifnidn hd, hm
jmp wq
.w4:
mova m0, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
movq [dstq+strideq*0], m1
movhps [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 4
jg .w4
RET
.w8:
mova m0, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8
RET
.w16:
mova m0, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
jg .w16
RET
.w32:
mova m0, [idxq+16*0]
pshufb m1, m3, m0
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova m2, [idxq+16*1]
add idxq, 16*2
mova [dstq+16*0], m0
pshufb m0, m3, m2
mova [dstq+16*1], m1
pshufb m1, m4, m2
punpcklbw m2, m0, m1
punpckhbw m0, m1
mova [dstq+16*2], m2
mova [dstq+16*3], m0
add dstq, strideq
dec hd
jg .w32
RET
.w64:
mova m0, [idxq+16*0]
pshufb m1, m3, m0
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova m2, [idxq+16*1]
mova [dstq+16*0], m0
pshufb m0, m3, m2
mova [dstq+16*1], m1
pshufb m1, m4, m2
punpcklbw m2, m0, m1
punpckhbw m0, m1
mova m1, [idxq+16*2]
mova [dstq+16*2], m2
pshufb m2, m3, m1
mova [dstq+16*3], m0
pshufb m0, m4, m1
punpcklbw m1, m2, m0
punpckhbw m2, m0
mova m0, [idxq+16*3]
add idxq, 16*4
mova [dstq+16*4], m1
pshufb m1, m3, m0
mova [dstq+16*5], m2
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*6], m0
mova [dstq+16*7], m1
add dstq, strideq
dec hd
jg .w64
RET