; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64
%macro SMOOTH_WEIGHT_TABLE 1-*
%rep %0
db %1-128, 127-%1
%rotate 1
%endrep
%endmacro
; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
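; Each weight w is stored as the signed byte pair (w-128, 127-w), so a
; single pmaddubsw yields (w-128)*a + (127-w)*b; adding the separately
; computed 128*a + 129*b recovers w*a + (256-w)*b. E.g. w=146 is stored
; as (18, -19): 18+128 = 146 and -19+129 = 110 = 256-146.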
smooth_weights: SMOOTH_WEIGHT_TABLE \
0, 0, 255, 128, 255, 149, 85, 64, \
255, 197, 146, 105, 73, 50, 37, 32, \
255, 225, 196, 170, 145, 123, 102, 84, \
68, 54, 43, 33, 26, 20, 17, 16, \
255, 240, 225, 210, 196, 182, 169, 157, \
145, 133, 122, 111, 101, 92, 83, 74, \
66, 59, 52, 45, 39, 34, 29, 25, \
21, 17, 14, 12, 10, 9, 8, 8, \
255, 248, 240, 233, 225, 218, 210, 203, \
196, 189, 182, 176, 169, 163, 156, 150, \
144, 138, 133, 127, 121, 116, 111, 106, \
101, 96, 91, 86, 82, 77, 73, 69, \
65, 61, 57, 54, 50, 47, 44, 41, \
38, 35, 32, 29, 27, 25, 22, 20, \
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0
z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
pb_128: times 4 db 128 ; these are just placed here for alignment.
pb_36_m4: times 2 db 36, -4
z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0
z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13
z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64
dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64
z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5
; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1
db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1
filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1
pb_127_m127: times 2 db 127, -127
ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15
ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0
pw_64: times 2 dw 64
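; filter_shuf3 and ipred_h_shuf borrow their last four bytes from the data
; that follows them: pshufb only looks at bit 7 and the low nibble of each
; index, so 127/-127 act as 15/-1 and the bytes of pw_64 act as 0.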
cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
times 9 db 7, -1
cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; w=8, w_pad=1 as well as second half of previous one
cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
times 5 db 6, 7
; w=16,w_pad=2
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
times 8 db 14, 15
; w=16,w_pad=3
db 0, 1, 2, 3, 4, 5
times 13 db 6, 7
pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
%define pb_0to15 cfl_ac_w16_pad_shuffle
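; The constants below alias individual bytes/words within the shuffle
; tables above, allowing vpbroadcastb/w to load them without spending
; separate rodata storage on each value.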
%define pb_1 (ipred_h_shuf+12)
%define pb_2 (ipred_h_shuf+20)
%define pb_3 (ipred_h_shuf+ 4)
%define pb_4 (ipred_h_shuf+24)
%define pb_5 (ipred_h_shuf+ 8)
%define pb_7 (ipred_h_shuf+ 0)
%define pb_8 (z_upsample2 +12)
%define pb_12 (z2_y_shuf_h4+20)
%define pb_14 (z2_y_shuf_h4+ 4)
%define pb_15 (z_filter_s +32)
%define pb_27 (z2_y_shuf_h4+ 8)
%define pb_31 (z2_y_shuf_h4+12)
%define pb_32 (z2_y_shuf_h4+16)
%define pb_90 (z2_y_shuf_h4+ 0)
%define pw_1 (z2_y_shuf_h4+24)
%define pw_8 (z_filter_k +32)
pw_62: times 2 dw 62
pw_128: times 2 dw 128
pw_255: times 2 dw 255
pw_512: times 2 dw 512
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
%rotate 1
%endrep
%endmacro
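; Entries are indexed by tzcnt of the block dimension, which is always
; >= 2 since the minimum dimension is 4, hence the -2*4 bias on the table
; base. Each entry is the 32-bit offset of its branch label relative to
; the biased base, which is added back at runtime before jumping.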
%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32
JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64
cextern dr_intra_derivative
cextern filter_intra_taps
SECTION .text
INIT_YMM avx2
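; dc_top/dc_left: pcmpeqd sets all bits, so pmaddubsw by -1 produces
; negated pairwise sums of the edge pixels, and the pmaddwd by -1 in the
; horizontal reduction flips them back to positive. pmulhrsw by
; 0x8000>>log2(n), i.e. (x + (n>>1)) >> log2(n), then gives the rounded
; average over the power-of-two edge length n.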
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
lea r5, [ipred_dc_left_avx2_table]
tzcnt wd, wm
inc tlq
movu m0, [tlq]
movifnidn hd, hm
mov r6d, 0x8000
shrx r6d, r6d, wd
movd xm3, r6d
movsxd r6, [r5+wq*4]
pcmpeqd m2, m2
pmaddubsw m0, m2
add r6, r5
add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
mov hd, hm ; zero upper half
tzcnt r6d, hd
sub tlq, hq
tzcnt wd, wm
movu m0, [tlq]
mov r5d, 0x8000
shrx r5d, r5d, r6d
movd xm3, r5d
lea r5, [ipred_dc_left_avx2_table]
movsxd r6, [r5+r6*4]
pcmpeqd m2, m2
pmaddubsw m0, m2
add r6, r5
add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
.h64:
movu m1, [tlq+32] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
.h32:
vextracti128 xm1, m0, 1
paddw xm0, xm1
.h16:
punpckhqdq xm1, xm0, xm0
paddw xm0, xm1
.h8:
psrlq xm1, xm0, 32
paddw xm0, xm1
.h4:
pmaddwd xm0, xm2
pmulhrsw xm0, xm3
lea stride3q, [strideq*3]
vpbroadcastb m0, xm0
mova m1, m0
jmp wq
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
lea r5d, [wq+hq]
movd xm4, r5d
tzcnt r5d, r5d
movd xm5, r5d
lea r5, [ipred_dc_avx2_table]
tzcnt wd, wd
movsxd r6, [r5+r6*4]
movsxd wq, [r5+wq*4+5*4]
pcmpeqd m3, m3
psrlw xm4, 1
add r6, r5
add wq, r5
lea stride3q, [strideq*3]
jmp r6
.h4:
movd xm0, [tlq-4]
pmaddubsw xm0, xm3
jmp wq
.w4:
movd xm1, [tlq+1]
pmaddubsw xm1, xm3
psubw xm0, xm4
paddw xm0, xm1
pmaddwd xm0, xm3
cmp hd, 4
jg .w4_mul
psrlw xm0, 3
jmp .w4_end
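; Rectangular blocks divide by w+h = 4*3 or 4*5; after shifting out the
; power-of-two factor, pmulhuw by 0x5556 (~(1<<16)/3) or 0x3334
; (~(1<<16)/5) handles the remaining division by 3 or 5. Here both
; multipliers are packed into one dword and selected by shifting by h*2.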
.w4_mul:
punpckhqdq xm1, xm0, xm0
lea r2d, [hq*2]
mov r6d, 0x55563334
paddw xm0, xm1
shrx r6d, r6d, r2d
psrlq xm1, xm0, 32
paddw xm0, xm1
movd xm1, r6d
psrlw xm0, 2
pmulhuw xm0, xm1
.w4_end:
vpbroadcastb xm0, xm0
.s4:
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm0
movd [dstq+strideq*2], xm0
movd [dstq+stride3q ], xm0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s4
RET
ALIGN function_align
.h8:
movq xm0, [tlq-8]
pmaddubsw xm0, xm3
jmp wq
.w8:
movq xm1, [tlq+1]
vextracti128 xm2, m0, 1
pmaddubsw xm1, xm3
psubw xm0, xm4
paddw xm0, xm2
punpckhqdq xm2, xm0, xm0
paddw xm0, xm2
paddw xm0, xm1
psrlq xm1, xm0, 32
paddw xm0, xm1
pmaddwd xm0, xm3
psrlw xm0, xm5
cmp hd, 8
je .w8_end
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:
vpbroadcastb xm0, xm0
.s8:
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm0
movq [dstq+stride3q ], xm0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s8
RET
ALIGN function_align
.h16:
mova xm0, [tlq-16]
pmaddubsw xm0, xm3
jmp wq
.w16:
movu xm1, [tlq+1]
vextracti128 xm2, m0, 1
pmaddubsw xm1, xm3
psubw xm0, xm4
paddw xm0, xm2
paddw xm0, xm1
punpckhqdq xm1, xm0, xm0
paddw xm0, xm1
psrlq xm1, xm0, 32
paddw xm0, xm1
pmaddwd xm0, xm3
psrlw xm0, xm5
cmp hd, 16
je .w16_end
mov r6d, 0x5556
mov r2d, 0x3334
test hb, 8|32
cmovz r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w16_end:
vpbroadcastb xm0, xm0
.s16:
mova [dstq+strideq*0], xm0
mova [dstq+strideq*1], xm0
mova [dstq+strideq*2], xm0
mova [dstq+stride3q ], xm0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s16
RET
ALIGN function_align
.h32:
mova m0, [tlq-32]
pmaddubsw m0, m3
jmp wq
.w32:
movu m1, [tlq+1]
pmaddubsw m1, m3
paddw m0, m1
vextracti128 xm1, m0, 1
psubw xm0, xm4
paddw xm0, xm1
punpckhqdq xm1, xm0, xm0
paddw xm0, xm1
psrlq xm1, xm0, 32
paddw xm0, xm1
pmaddwd xm0, xm3
psrlw xm0, xm5
cmp hd, 32
je .w32_end
lea r2d, [hq*2]
mov r6d, 0x33345556
shrx r6d, r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w32_end:
vpbroadcastb m0, xm0
.s32:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s32
RET
ALIGN function_align
.h64:
mova m0, [tlq-64]
mova m1, [tlq-32]
pmaddubsw m0, m3
pmaddubsw m1, m3
paddw m0, m1
jmp wq
.w64:
movu m1, [tlq+ 1]
movu m2, [tlq+33]
pmaddubsw m1, m3
pmaddubsw m2, m3
paddw m0, m1
paddw m0, m2
vextracti128 xm1, m0, 1
psubw xm0, xm4
paddw xm0, xm1
punpckhqdq xm1, xm0, xm0
paddw xm0, xm1
psrlq xm1, xm0, 32
paddw xm0, xm1
pmaddwd xm0, xm3
psrlw xm0, xm5
cmp hd, 64
je .w64_end
mov r6d, 0x33345556
shrx r6d, r6d, hd
movd xm1, r6d
pmulhuw xm0, xm1
.w64_end:
vpbroadcastb m0, xm0
mova m1, m0
.s64:
mova [dstq+strideq*0+32*0], m0
mova [dstq+strideq*0+32*1], m1
mova [dstq+strideq*1+32*0], m0
mova [dstq+strideq*1+32*1], m1
mova [dstq+strideq*2+32*0], m0
mova [dstq+strideq*2+32*1], m1
mova [dstq+stride3q +32*0], m0
mova [dstq+stride3q +32*1], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s64
RET
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_splat_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r5+wq*4]
vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128]
mova m1, m0
add wq, r5
lea stride3q, [strideq*3]
jmp wq
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_splat_avx2_table]
tzcnt wd, wm
movu m0, [tlq+ 1]
movu m1, [tlq+33]
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
%macro IPRED_H 2 ; w, store_type
vpbroadcastb m0, [tlq-1]
vpbroadcastb m1, [tlq-2]
vpbroadcastb m2, [tlq-3]
sub tlq, 4
vpbroadcastb m3, [tlq+0]
mov%2 [dstq+strideq*0], m0
mov%2 [dstq+strideq*1], m1
mov%2 [dstq+strideq*2], m2
mov%2 [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w%1
RET
ALIGN function_align
%endmacro
INIT_XMM avx2
cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
lea r5, [ipred_h_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
.w4:
IPRED_H 4, d
.w8:
IPRED_H 8, q
.w16:
IPRED_H 16, a
INIT_YMM avx2
.w32:
IPRED_H 32, a
.w64:
vpbroadcastb m0, [tlq-1]
vpbroadcastb m1, [tlq-2]
vpbroadcastb m2, [tlq-3]
sub tlq, 4
vpbroadcastb m3, [tlq+0]
mova [dstq+strideq*0+32*0], m0
mova [dstq+strideq*0+32*1], m0
mova [dstq+strideq*1+32*0], m1
mova [dstq+strideq*1+32*1], m1
mova [dstq+strideq*2+32*0], m2
mova [dstq+strideq*2+32*1], m2
mova [dstq+stride3q +32*0], m3
mova [dstq+stride3q +32*1], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w64
RET
%macro PAETH 2 ; top, ldiff
pavgb m1, m%1, m3 ; Calculating tldiff normally requires
pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it
pand m0, m4 ; in 8-bit with some tricks which avoid
psubusb m2, m5, m1 ; having to unpack everything to 16-bit.
psubb m1, m0
psubusb m1, m5
por m1, m2
paddusb m1, m1
por m1, m0 ; min(tldiff, 255)
psubusb m2, m5, m3
psubusb m0, m3, m5
por m2, m0 ; tdiff
pminub m2, m%2
pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
vpblendvb m0, m%1, m3, m0
pminub m1, m2
pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff
vpblendvb m0, m5, m0, m1
%endmacro
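; The Paeth predictor picks whichever of top, left and topleft is closest
; to left + top - topleft. Since |left+top-topleft - left| = |top-topleft|
; and |left+top-topleft - top| = |left-topleft|, only tldiff =
; |left+top-2*topleft| needs the wider intermediate handled above.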
cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
%define base r5-ipred_paeth_avx2_table
lea r5, [ipred_paeth_avx2_table]
tzcnt wd, wm
vpbroadcastb m5, [tlq] ; topleft
movifnidn hd, hm
movsxd wq, [r5+wq*4]
vpbroadcastd m4, [base+pb_1]
add wq, r5
jmp wq
.w4:
vpbroadcastd m6, [tlq+1] ; top
mova m8, [base+ipred_h_shuf]
lea r3, [strideq*3]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0 ; ldiff
.w4_loop:
sub tlq, 8
vpbroadcastq m3, [tlq]
pshufb m3, m8 ; left
PAETH 6, 7
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+r3 ], xm1, 2
cmp hd, 4
je .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+r3 ], xm1, 3
lea dstq, [dstq+strideq*4]
sub hd, 8
jg .w4_loop
.ret:
RET
ALIGN function_align
.w8:
vpbroadcastq m6, [tlq+1]
mova m8, [base+ipred_h_shuf]
lea r3, [strideq*3]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w8_loop:
sub tlq, 4
vpbroadcastd m3, [tlq]
pshufb m3, m8
PAETH 6, 7
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r3 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
RET
ALIGN function_align
.w16:
vbroadcasti128 m6, [tlq+1]
mova xm8, xm4 ; lower half = 1, upper half = 0
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w16_loop:
sub tlq, 2
vpbroadcastd m3, [tlq]
pshufb m3, m8
PAETH 6, 7
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16_loop
RET
ALIGN function_align
.w32:
movu m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w32_loop:
dec tlq
vpbroadcastb m3, [tlq]
PAETH 6, 7
mova [dstq], m0
add dstq, strideq
dec hd
jg .w32_loop
RET
ALIGN function_align
.w64:
movu m6, [tlq+ 1]
movu m7, [tlq+33]
%if WIN64
movaps r4m, xmm9
%endif
psubusb m8, m5, m6
psubusb m0, m6, m5
psubusb m9, m5, m7
psubusb m1, m7, m5
por m8, m0
por m9, m1
.w64_loop:
dec tlq
vpbroadcastb m3, [tlq]
PAETH 6, 8
mova [dstq+32*0], m0
PAETH 7, 9
mova [dstq+32*1], m0
add dstq, strideq
dec hd
jg .w64_loop
%if WIN64
movaps xmm9, r4m
%endif
RET
%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
; w * a = (w - 128) * a + 128 * a
; (256 - w) * b = (127 - w) * b + 129 * b
pmaddubsw m0, m%3, m%1
pmaddubsw m1, m%4, m%2
paddw m0, m%5
paddw m1, m%6
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
%endmacro
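; smooth_v: pred[x,y] = (w[y]*top[x] + (256-w[y])*bottom + 128) >> 8. The
; multiplies are split as above so the weights fit in signed bytes for
; pmaddubsw.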
cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_avx2_table
lea r6, [ipred_smooth_v_avx2_table]
tzcnt wd, wm
mov hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastd m0, [base+pb_127_m127]
vpbroadcastd m1, [base+pw_128]
lea weightsq, [base+smooth_weights+hq*4]
neg hq
vpbroadcastb m5, [tlq+hq] ; bottom
add wq, r6
jmp wq
.w4:
vpbroadcastd m2, [tlq+1]
punpcklbw m2, m5 ; top, bottom
mova m5, [base+ipred_v_shuf]
lea r3, [strideq*3]
punpckldq m4, m5, m5
punpckhdq m5, m5
pmaddubsw m3, m2, m0
paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
paddw m3, m1 ; 128 * top + 129 * bottom + 128
.w4_loop:
vbroadcasti128 m1, [weightsq+hq*2]
pshufb m0, m1, m4
pshufb m1, m5
SMOOTH 0, 1, 2, 2, 3, 3
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 1
pextrd [dstq+r3 ], xm1, 1
cmp hd, -4
je .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm1, 2
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+r3 ], xm1, 3
lea dstq, [dstq+strideq*4]
add hq, 8
jl .w4_loop
.ret:
RET
ALIGN function_align
.w8:
vpbroadcastq m2, [tlq+1]
punpcklbw m2, m5
mova m5, [base+ipred_v_shuf]
lea r3, [strideq*3]
pshufd m4, m5, q0000
pshufd m5, m5, q1111
pmaddubsw m3, m2, m0
paddw m1, m2
paddw m3, m1
.w8_loop:
vpbroadcastq m1, [weightsq+hq*2]
pshufb m0, m1, m4
pshufb m1, m5
SMOOTH 0, 1, 2, 2, 3, 3
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r3 ], xm1
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w8_loop
RET
ALIGN function_align
.w16:
WIN64_SPILL_XMM 7
vbroadcasti128 m3, [tlq+1]
mova m6, [base+ipred_v_shuf]
punpcklbw m2, m3, m5
punpckhbw m3, m5
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
.w16_loop:
vpbroadcastd m1, [weightsq+hq*2]
pshufb m1, m6
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
add hq, 2
jl .w16_loop
RET
ALIGN function_align
.w32:
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 6
movu m3, [tlq+1]
punpcklbw m2, m3, m5
punpckhbw m3, m5
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
.w32_loop:
vpbroadcastw m1, [weightsq+hq*2]
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq], m0
add dstq, strideq
inc hq
jl .w32_loop
RET
ALIGN function_align
.w64:
WIN64_SPILL_XMM 11
movu m4, [tlq+ 1]
movu m8, [tlq+33]
punpcklbw m3, m4, m5
punpckhbw m4, m5
punpcklbw m7, m8, m5
punpckhbw m8, m5
pmaddubsw m5, m3, m0
pmaddubsw m6, m4, m0
pmaddubsw m9, m7, m0
pmaddubsw m10, m8, m0
paddw m2, m1, m3
paddw m5, m2
paddw m2, m1, m4
paddw m6, m2
paddw m0, m1, m7
paddw m9, m0
paddw m1, m8
paddw m10, m1
.w64_loop:
vpbroadcastw m2, [weightsq+hq*2]
SMOOTH 2, 2, 3, 4, 5, 6
mova [dstq+32*0], m0
SMOOTH 2, 2, 7, 8, 9, 10
mova [dstq+32*1], m0
add dstq, strideq
inc hq
jl .w64_loop
RET
%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
%assign stack_offset 0
%assign stack_size_padded 0
%assign regs_used %2
%xdefine rstk rsp
SETUP_STACK_POINTER %1
%if regs_used != %2 && WIN64
PUSH r%2
%endif
ALLOC_STACK %1, %3
%endmacro
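; Resets the x86inc stack bookkeeping so each block-size branch of a
; function declared with 0 XMM registers can allocate its own frame.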
cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_avx2_table
lea r6, [ipred_smooth_h_avx2_table]
mov wd, wm
vpbroadcastb m3, [tlq+wq] ; right
tzcnt wd, wd
mov hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastd m4, [base+pb_127_m127]
vpbroadcastd m5, [base+pw_128]
add wq, r6
jmp wq
.w4:
WIN64_SPILL_XMM 8
vpbroadcastq m6, [base+smooth_weights+4*2]
mova m7, [base+ipred_h_shuf]
sub tlq, 8
sub tlq, hq
lea r3, [strideq*3]
.w4_loop:
vpbroadcastq m2, [tlq+hq]
pshufb m2, m7
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m6
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+r3 ], xm1, 2
cmp hd, 4
je .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+r3 ], xm1, 3
lea dstq, [dstq+strideq*4]
sub hd, 8
jg .w4_loop
.ret:
RET
ALIGN function_align
.w8:
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
vbroadcasti128 m6, [base+smooth_weights+8*2]
mova m7, [base+ipred_h_shuf]
sub tlq, 4
lea r3, [strideq*3]
sub tlq, hq
.w8_loop:
vpbroadcastd m2, [tlq+hq]
pshufb m2, m7
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m0, m1, m4
paddw m0, m1
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m6
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r3 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
RET
ALIGN function_align
.w16:
SETUP_STACK_FRAME 32*4, 7, 8
lea r3, [rsp+64*2-4]
call .prep ; only worthwhile for w16 and above
sub tlq, 2
vpbroadcastd xm6, [base+pb_1]
mova xm7, [base+ipred_v_shuf+16]
vinserti128 m7, [base+ipred_v_shuf+ 0], 1
vbroadcasti128 m4, [base+smooth_weights+16*2]
vbroadcasti128 m5, [base+smooth_weights+16*3]
.w16_loop:
vpbroadcastd m1, [tlq+hq]
vpbroadcastd m2, [r3+hq*2]
pshufb m1, m6
punpcklbw m1, m3
pshufb m2, m7
SMOOTH 4, 5, 1, 1, 2, 2
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16_loop
RET
ALIGN function_align
.w32:
SETUP_STACK_FRAME 32*4, 7, 6
lea r3, [rsp+64*2-2]
call .prep
dec tlq
mova xm4, [base+smooth_weights+16*4]
vinserti128 m4, [base+smooth_weights+16*6], 1
mova xm5, [base+smooth_weights+16*5]
vinserti128 m5, [base+smooth_weights+16*7], 1
.w32_loop:
vpbroadcastb m1, [tlq+hq]
punpcklbw m1, m3
vpbroadcastw m2, [r3+hq*2]
SMOOTH 4, 5, 1, 1, 2, 2
mova [dstq], m0
add dstq, strideq
dec hd
jg .w32_loop
RET
ALIGN function_align
.w64:
SETUP_STACK_FRAME 32*4, 7, 9
lea r3, [rsp+64*2-2]
call .prep
add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
dec tlq
mova xm5, [r6-16*7]
vinserti128 m5, [r6-16*5], 1
mova xm6, [r6-16*6]
vinserti128 m6, [r6-16*4], 1
mova xm7, [r6-16*3]
vinserti128 m7, [r6-16*1], 1
mova xm8, [r6-16*2]
vinserti128 m8, [r6-16*0], 1
.w64_loop:
vpbroadcastb m2, [tlq+hq]
punpcklbw m2, m3
vpbroadcastw m4, [r3+hq*2]
SMOOTH 5, 6, 2, 2, 4, 4
mova [dstq+32*0], m0
SMOOTH 7, 8, 2, 2, 4, 4
mova [dstq+32*1], m0
add dstq, strideq
dec hd
jg .w64_loop
RET
ALIGN function_align
.prep:
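; Precompute 128 * left + 129 * right + 128 for the entire left edge into
; a stack buffer so that the per-row loops only need one broadcast load.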
vpermq m2, [tlq-32*1], q3120
punpckhbw m1, m2, m3
punpcklbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m1, m5 ; 1 * left + 256 * right + 128
paddw m0, m1 ; 128 * left + 129 * right + 128
pmaddubsw m1, m2, m4
paddw m2, m5
paddw m1, m2
vpermq m2, [tlq-32*2], q3120
mova [rsp+gprsize+32*3], m0
mova [rsp+gprsize+32*2], m1
punpckhbw m1, m2, m3
punpcklbw m2, m3
pmaddubsw m0, m1, m4
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m2, m5
paddw m1, m2
mova [rsp+gprsize+32*1], m0
mova [rsp+gprsize+32*0], m1
sub r3, hq
sub tlq, hq
sub r3, hq
ret
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
pmaddubsw m0, m%3, m%1
pmaddubsw m1, m%4, m%2
%ifnum %5
paddw m0, m%5
%else
paddw m0, %5
%endif
%ifnum %6
paddw m1, m%6
%else
paddw m1, %6
%endif
pavgw m0, m2
pavgw m1, m3
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
%endmacro
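; 2D smooth blends the two passes as (sum_v + sum_h + 256) >> 9: the +255
; baked into the vertical sum plus the +1 from pavgw form the rounding
; constant ahead of the final shift.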
cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_avx2_table
lea r6, [ipred_smooth_avx2_table]
mov wd, wm
vpbroadcastb m4, [tlq+wq] ; right
tzcnt wd, wd
mov hd, hm
mov r5, tlq
sub r5, hq
movsxd wq, [r6+wq*4]
vpbroadcastd m5, [base+pb_127_m127]
vpbroadcastb m0, [r5] ; bottom
vpbroadcastd m3, [base+pw_255]
add wq, r6
lea v_weightsq, [base+smooth_weights+hq*2]
jmp wq
.w4:
WIN64_SPILL_XMM 12
mova m10, [base+ipred_h_shuf]
vpbroadcastq m11, [base+smooth_weights+4*2]
mova m7, [base+ipred_v_shuf]
vpbroadcastd m8, [tlq+1]
sub tlq, 8
lea r3, [strideq*3]
sub tlq, hq
punpcklbw m8, m0 ; top, bottom
pshufd m6, m7, q2200
pshufd m7, m7, q3311
pmaddubsw m9, m8, m5
paddw m3, m8 ; 1 * top + 256 * bottom + 255
paddw m9, m3 ; 128 * top + 129 * bottom + 255
.w4_loop:
vpbroadcastq m1, [tlq+hq]
pshufb m1, m10
punpcklbw m0, m1, m4 ; left, right
punpckhbw m1, m4
pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
pmaddubsw m3, m1, m5
paddw m2, m0 ; 128 * left + 129 * right
paddw m3, m1
pmaddubsw m0, m11
pmaddubsw m1, m11
paddw m2, m0
paddw m3, m1
vbroadcasti128 m1, [v_weightsq]
add v_weightsq, 16
pshufb m0, m1, m6
pshufb m1, m7
SMOOTH_2D_END 0, 1, 8, 8, 9, 9
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+r3 ], xm1, 2
cmp hd, 4
je .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+r3 ], xm1, 3
lea dstq, [dstq+strideq*4]
sub hd, 8
jg .w4_loop
.ret:
RET
ALIGN function_align
.w8:
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
mova m10, [base+ipred_h_shuf]
vbroadcasti128 m11, [base+smooth_weights+8*2]
mova m7, [base+ipred_v_shuf]
vpbroadcastq m8, [tlq+1]
sub tlq, 4
lea r3, [strideq*3]
sub tlq, hq
punpcklbw m8, m0
pshufd m6, m7, q0000
pshufd m7, m7, q1111
pmaddubsw m9, m8, m5
paddw m3, m8
paddw m9, m3
.w8_loop:
vpbroadcastd m1, [tlq+hq]
pshufb m1, m10
punpcklbw m0, m1, m4
punpckhbw m1, m4
pmaddubsw m2, m0, m5
pmaddubsw m3, m1, m5
paddw m2, m0
paddw m3, m1
pmaddubsw m0, m11
pmaddubsw m1, m11
paddw m2, m0
paddw m3, m1
vpbroadcastq m1, [v_weightsq]
add v_weightsq, 8
pshufb m0, m1, m6
pshufb m1, m7
SMOOTH_2D_END 0, 1, 8, 8, 9, 9
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r3 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
RET
ALIGN function_align
.w16:
SETUP_STACK_FRAME 32*4, 7, 14
vbroadcasti128 m11, [tlq+1]
lea r3, [rsp+64*2-4]
punpcklbw m10, m11, m0 ; top, bottom
punpckhbw m11, m0
call .prep_v
sub tlq, 2
pmaddubsw m12, m10, m5
pmaddubsw m13, m11, m5
vpbroadcastd xm5, [base+pb_1]
mova m9, [base+ipred_v_shuf]
vbroadcasti128 m6, [base+smooth_weights+16*2]
vbroadcasti128 m7, [base+smooth_weights+16*3]
vperm2i128 m8, m9, m9, 0x01
paddw m0, m10, m3
paddw m3, m11
paddw m12, m0
paddw m13, m3
.w16_loop:
vpbroadcastd m3, [tlq+hq]
vpbroadcastd m0, [r3+hq*2]
vpbroadcastd m1, [v_weightsq]
add v_weightsq, 4
pshufb m3, m5
punpcklbw m3, m4 ; left, right
pmaddubsw m2, m3, m6
pmaddubsw m3, m7
pshufb m0, m8
pshufb m1, m9
paddw m2, m0
paddw m3, m0
SMOOTH_2D_END 1, 1, 10, 11, 12, 13
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16_loop
RET
ALIGN function_align
.w32:
SETUP_STACK_FRAME 32*4, 7, 11
movu m8, [tlq+1]
lea r3, [rsp+64*2-2]
punpcklbw m7, m8, m0
punpckhbw m8, m0
call .prep_v
dec tlq
pmaddubsw m9, m7, m5
pmaddubsw m10, m8, m5
mova xm5, [base+smooth_weights+16*4]
vinserti128 m5, [base+smooth_weights+16*6], 1
mova xm6, [base+smooth_weights+16*5]
vinserti128 m6, [base+smooth_weights+16*7], 1
paddw m0, m7, m3
paddw m3, m8
paddw m9, m0
paddw m10, m3
.w32_loop:
vpbroadcastb m3, [tlq+hq]
punpcklbw m3, m4
vpbroadcastw m0, [r3+hq*2]
vpbroadcastw m1, [v_weightsq]
add v_weightsq, 2
pmaddubsw m2, m3, m5
pmaddubsw m3, m6
paddw m2, m0
paddw m3, m0
SMOOTH_2D_END 1, 1, 7, 8, 9, 10
mova [dstq], m0
add dstq, strideq
dec hd
jg .w32_loop
RET
ALIGN function_align
.w64:
SETUP_STACK_FRAME 32*8, 7, 16
movu m13, [tlq+1 ]
movu m15, [tlq+33]
add r6, smooth_weights+16*15-ipred_smooth_avx2_table
lea r3, [rsp+64*2-2]
punpcklbw m12, m13, m0
punpckhbw m13, m0
punpcklbw m14, m15, m0
punpckhbw m15, m0
call .prep_v
dec tlq
pmaddubsw m0, m12, m5
pmaddubsw m1, m13, m5
pmaddubsw m2, m14, m5
pmaddubsw m5, m15, m5
mova xm8, [r6-16*7]
vinserti128 m8, [r6-16*5], 1
mova xm9, [r6-16*6]
vinserti128 m9, [r6-16*4], 1
mova xm10, [r6-16*3]
vinserti128 m10, [r6-16*1], 1
mova xm11, [r6-16*2]
vinserti128 m11, [r6-16*0], 1
lea r6, [rsp+32*4]
paddw m0, m3
paddw m1, m3
paddw m2, m3
paddw m3, m5
paddw m0, m12
paddw m1, m13
paddw m2, m14
paddw m3, m15
mova [r6+32*0], m0
mova [r6+32*1], m1
mova [r6+32*2], m2
mova [r6+32*3], m3
.w64_loop:
vpbroadcastb m5, [tlq+hq]
punpcklbw m5, m4
vpbroadcastw m6, [r3+hq*2]
vpbroadcastw m7, [v_weightsq]
add v_weightsq, 2
pmaddubsw m2, m5, m8
pmaddubsw m3, m5, m9
paddw m2, m6
paddw m3, m6
SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1]
mova [dstq+32*0], m0
pmaddubsw m2, m5, m10
pmaddubsw m3, m5, m11
paddw m2, m6
paddw m3, m6
SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3]
mova [dstq+32*1], m0
add dstq, strideq
dec hd
jg .w64_loop
RET
ALIGN function_align
.prep_v:
vpermq m2, [tlq-32*1], q3120
punpckhbw m1, m2, m4
punpcklbw m2, m4
pmaddubsw m0, m1, m5 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m2, m5
paddw m1, m2
vpermq m2, [tlq-32*2], q3120
mova [rsp+gprsize+32*3], m0
mova [rsp+gprsize+32*2], m1
punpckhbw m1, m2, m4
punpcklbw m2, m4
pmaddubsw m0, m1, m5
paddw m0, m1
pmaddubsw m1, m2, m5
paddw m1, m2
mova [rsp+gprsize+32*1], m0
mova [rsp+gprsize+32*0], m1
sub r3, hq
sub tlq, hq
sub r3, hq
ret
cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
%assign org_stack_offset stack_offset
lea r6, [ipred_z1_avx2_table]
tzcnt wd, wm
movifnidn angled, anglem
movifnidn hd, hm
lea r7, [dr_intra_derivative]
inc tlq
movsxd wq, [r6+wq*4]
add wq, r6
mov dxd, angled
and dxd, 0x7e
add angled, 165 ; ~90
movzx dxd, word [r7+dxq]
xor angled, 0x4ff ; d = 90 - angle
vpbroadcastd m3, [pw_512]
vpbroadcastd m4, [pw_62]
vpbroadcastd m5, [pw_64]
jmp wq
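; z1 walks the top edge in fixed point with a 6-bit fraction: each output
; pixel is (top[base]*(64-frac) + top[base+1]*frac + 32) >> 6, with
; base = xpos>>6 and frac = xpos&0x3e. pmaddubsw does both multiplies at
; once and pmulhrsw with 512 is equivalent to the rounded shift (x+32)>>6.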
.w4:
cmp angleb, 40
jae .w4_no_upsample
lea r3d, [angleq-1024]
sar r3d, 7
add r3d, hd
jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
ALLOC_STACK -32, 8
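; Edge upsampling applies the 4-tap filter
; (-src[-1] + 9*src[0] + 9*src[1] - src[2] + 8) >> 4. pb_36_m4 holds the
; taps scaled by 4 (36 = 9*4), so pmulhrsw with pw_512, i.e. (x+32)>>6,
; produces the same rounding as (x/4 + 8) >> 4.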
mova xm1, [tlq-1]
pshufb xm0, xm1, [z_upsample1]
pshufb xm1, [z_upsample2]
vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
add dxd, dxd ; pw_512 (which is already in m3)
pmaddubsw xm0, xm2 ; for rounding instead of pw_2048
pextrd [rsp+16], xm1, 3 ; top[max_base_x]
pmaddubsw xm1, xm2
movd xm7, dxd
mov r3d, dxd ; xpos
vpbroadcastw m7, xm7
paddw xm1, xm0
movq xm0, [tlq]
pmulhrsw xm1, xm3
pslldq m6, m7, 8
paddw xm2, xm7, xm7
lea r2, [strideq*3]
paddw m6, m7
packuswb xm1, xm1
paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1
punpcklbw xm0, xm1
psllw m7, 2
mova [rsp], xm0
.w4_upsample_loop:
lea r5d, [r3+dxq]
shr r3d, 6 ; base0
vpbroadcastq m1, [rsp+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vpbroadcastq m2, [rsp+r5]
lea r5d, [r3+dxq]
shr r3d, 6 ; base2
movq xm0, [rsp+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base3
movhps xm0, [rsp+r5]
vpblendd m1, m2, 0xc0
pand m2, m4, m6 ; frac
vpblendd m0, m1, 0xf0
psubw m1, m5, m2 ; 64-frac
psllw m2, 8
por m1, m2 ; 64-frac, frac
pmaddubsw m0, m1
paddw m6, m7 ; xpos += dx
pmulhrsw m0, m3
packuswb m0, m0
vextracti128 xm1, m0, 1
movd [dstq+strideq*2], xm0
pextrd [dstq+r2 ], xm0, 1
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_upsample_loop
RET
ALIGN function_align
.filter_strength: ; w4/w8/w16
; The C version uses a lot of branches, but we can do all the comparisons
; in parallel and use popcnt to get the final filter strength value.
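; z_filter_wh repeats each block-size entry once per strength threshold,
; so every set bit in the resulting mask marks one threshold the angle
; exceeds, and popcnt then yields the filter strength directly.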
%define base r3-z_filter_t0
lea r3, [z_filter_t0]
movd xm0, maxbased
movd xm2, angled
shr angled, 8 ; is_sm << 1
vpbroadcastb m0, xm0
vpbroadcastb m2, xm2
pcmpeqb m1, m0, [base+z_filter_wh]
pand m1, m2
mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
pcmpgtb m1, m2
pmovmskb r5d, m1
ret
.w4_no_upsample:
%assign stack_offset org_stack_offset
ALLOC_STACK -16, 11
mov maxbased, 7
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w4_main
lea maxbased, [hq+3]
call .filter_strength
mov maxbased, 7
test r5d, r5d
jz .w4_main ; filter_strength == 0
popcnt r5d, r5d
vpbroadcastd m7, [base+pb_8]
vbroadcasti128 m2, [tlq-1]
pminub m1, m7, [base+z_filter_s]
vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
pminub m7, [base+z_filter_s+8]
vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
pshufb m0, m2, m1
shufps m1, m7, q2121
pmaddubsw m0, m8
pshufb m1, m2, m1
pmaddubsw m1, m9
pshufb m2, m7
pmaddubsw m2, m10
paddw m0, m1
paddw m0, m2
pmulhrsw m0, m3
mov r3d, 9
mov tlq, rsp
cmp hd, 4
cmovne maxbased, r3d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [tlq], xm0
.w4_main:
movd xm6, dxd
vpbroadcastq m0, [z_base_inc] ; base_inc << 6
vpbroadcastb m7, [tlq+maxbaseq]
shl maxbased, 6
vpbroadcastw m6, xm6
mov r3d, dxd ; xpos
movd xm9, maxbased
vpbroadcastw m9, xm9
vbroadcasti128 m8, [z1_shuf_w4]
psrlw m7, 8 ; top[max_base_x]
paddw m10, m6, m6
psubw m9, m0 ; max_base_x
vpblendd m6, m10, 0xcc
mova xm0, xm10
paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1
paddw m10, m10
.w4_loop:
lea r5d, [r3+dxq]
shr r3d, 6 ; base0
vpbroadcastq m1, [tlq+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vpbroadcastq m2, [tlq+r5]
lea r5d, [r3+dxq]
shr r3d, 6 ; base2
movq xm0, [tlq+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base3
movhps xm0, [tlq+r5]
vpblendd m1, m2, 0xc0
pand m2, m4, m6 ; frac
vpblendd m0, m1, 0xf0
psubw m1, m5, m2 ; 64-frac
psllw m2, 8
pshufb m0, m8
por m1, m2 ; 64-frac, frac
pmaddubsw m0, m1
pcmpgtw m1, m9, m6 ; base < max_base_x
pmulhrsw m0, m3
paddw m6, m10 ; xpos += dx
lea r5, [dstq+strideq*2]
vpblendvb m0, m7, m0, m1
packuswb m0, m0
vextracti128 xm1, m0, 1
movd [r5 +strideq*0], xm0
pextrd [r5 +strideq*1], xm0, 1
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
sub hd, 4
jz .w4_end
lea dstq, [dstq+strideq*4]
cmp r3d, maxbased
jb .w4_loop
packuswb xm7, xm7
lea r6, [strideq*3]
.w4_end_loop:
movd [dstq+strideq*0], xm7
movd [dstq+strideq*1], xm7
movd [dstq+strideq*2], xm7
movd [dstq+r6 ], xm7
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_end_loop
.w4_end:
RET
ALIGN function_align
.w8:
lea r3d, [angleq+216]
mov r3b, hb
cmp r3d, 8
ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
%assign stack_offset org_stack_offset
ALLOC_STACK -32, 8
movu xm2, [z_filter_s+6]
mova xm0, [tlq-1]
movd xm6, hd
vinserti128 m0, [tlq+7], 1
vpbroadcastb xm6, xm6
vbroadcasti128 m1, [z_upsample1]
pminub xm6, xm2
vpbroadcastd m7, [pb_36_m4]
vinserti128 m2, xm6, 1
add dxd, dxd
pshufb m1, m0, m1
pshufb m2, m0, m2
movd xm6, dxd
pmaddubsw m1, m7
pmaddubsw m2, m7
vpbroadcastw m6, xm6
mov r3d, dxd
psrldq m0, 1
lea r2, [strideq*3]
paddw m7, m6, m6
paddw m1, m2
vpblendd m6, m7, 0xf0
pmulhrsw m1, m3
pslldq m2, m7, 8
paddw m7, m7
paddw m6, m2
packuswb m1, m1
punpcklbw m0, m1
mova [rsp], m0
.w8_upsample_loop:
lea r5d, [r3+dxq]
shr r3d, 6 ; base0
movu xm0, [rsp+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vinserti128 m0, [rsp+r5], 1
lea r5d, [r3+dxq]
shr r3d, 6 ; base2
pand m1, m4, m6
psubw m2, m5, m1
psllw m1, 8
por m2, m1
punpcklqdq m1, m2, m2 ; frac0 frac1
pmaddubsw m0, m1
movu xm1, [rsp+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base3
vinserti128 m1, [rsp+r5], 1
punpckhqdq m2, m2 ; frac2 frac3
pmaddubsw m1, m2
pmulhrsw m0, m3
paddw m6, m7
pmulhrsw m1, m3
packuswb m0, m1
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*2], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+r2 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_upsample_loop
RET
.w8_no_intra_edge_filter:
and maxbased, 7
or maxbased, 8 ; imin(h+7, 15)
jmp .w8_main
.w8_no_upsample:
%assign stack_offset org_stack_offset
ALLOC_STACK -32, 10
lea maxbased, [hq+7]
test angled, 0x400
jnz .w8_no_intra_edge_filter
call .filter_strength
test r5d, r5d
jz .w8_main ; filter_strength == 0
popcnt r5d, r5d
movu xm2, [tlq]
pminub xm1, xm0, [base+z_filter_s+14]
vinserti128 m2, [tlq-1], 1
vinserti128 m1, [base+z_filter_s+ 0], 1
vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
pminub xm0, [base+z_filter_s+22]
vinserti128 m0, [base+z_filter_s+ 8], 1
pshufb m6, m2, m1
pmaddubsw m6, m7
vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
movzx r3d, byte [tlq+15]
shufps m1, m0, q2121
pshufb m1, m2, m1
pmaddubsw m1, m7
paddw m1, m6
sub r5d, 3
jnz .w8_3tap
; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
; which also results in an awkward edge case where out[w*2] is
; slightly different from out[max_base_x] when h > w.
vpbroadcastd m7, [z_filter_k+4*8]
movzx r2d, byte [tlq+14]
pshufb m2, m0
pmaddubsw m2, m7
sub r2d, r3d
lea r2d, [r2+r3*8+4]
shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
mov [rsp+16], r2b
paddw m1, m2
.w8_3tap:
pmulhrsw m1, m3
sar r5d, 1
mov tlq, rsp
add r5d, 17 ; w*2 + (filter_strength == 3)
cmp hd, 16
cmovns maxbased, r5d
mov [tlq+r5], r3b
vextracti128 xm0, m1, 1
packuswb xm0, xm1
mova [tlq], xm0
.w8_main:
movd xm2, dxd
vbroadcasti128 m0, [z_base_inc]
vpbroadcastw m2, xm2
vpbroadcastb m7, [tlq+maxbaseq]
shl maxbased, 6
movd xm9, maxbased
vbroadcasti128 m8, [z_filter_s+2]
vpbroadcastw m9, xm9
psrlw m7, 8
psubw m9, m0
mov r3d, dxd
paddw m6, m2, m2
vpblendd m2, m6, 0xf0
.w8_loop:
lea r5d, [r3+dxq]
shr r3d, 6
pand m0, m4, m2
psubw m1, m5, m0
psllw m0, 8
por m1, m0
movu xm0, [tlq+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vinserti128 m0, [tlq+r5], 1
pshufb m0, m8
pmaddubsw m0, m1
pcmpgtw m1, m9, m2
paddw m2, m6
pmulhrsw m0, m3
vpblendvb m0, m7, m0, m1
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
sub hd, 2
jz .w8_end
lea dstq, [dstq+strideq*2]
cmp r3d, maxbased
jb .w8_loop
packuswb xm7, xm7
.w8_end_loop:
movq [dstq+strideq*0], xm7
movq [dstq+strideq*1], xm7
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_end_loop
.w8_end:
RET
.w16_no_intra_edge_filter:
and maxbased, 15
or maxbased, 16 ; imin(h+15, 31)
jmp .w16_main
ALIGN function_align
.w16:
%assign stack_offset org_stack_offset
ALLOC_STACK -64, 12
lea maxbased, [hq+15]
test angled, 0x400
jnz .w16_no_intra_edge_filter
call .filter_strength
test r5d, r5d
jz .w16_main ; filter_strength == 0
popcnt r5d, r5d
vpbroadcastd m1, [base+pb_12]
vbroadcasti128 m6, [base+z_filter_s+8]
vinserti128 m2, m6, [base+z_filter_s], 0
vinserti128 m6, [base+z_filter_s+16], 1
mova xm10, [tlq-1]
vinserti128 m10, [tlq+3], 1
vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
vbroadcasti128 m7, [base+z_filter_s+14]
vinserti128 m8, m7, [base+z_filter_s+6], 0
vinserti128 m7, [base+z_filter_s+22], 1
psubw m0, m1
movu xm11, [tlq+12]
vinserti128 m11, [tlq+16], 1
pminub m8, m0
pminub m7, m0
pshufb m0, m10, m2
shufps m2, m6, q2121
pmaddubsw m0, m9
pshufb m1, m11, m8
shufps m8, m7, q2121
pmaddubsw m1, m9
vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
movzx r3d, byte [tlq+31]
pshufb m2, m10, m2
pmaddubsw m2, m9
pshufb m8, m11, m8
pmaddubsw m8, m9
paddw m0, m2
paddw m1, m8
sub r5d, 3
jnz .w16_3tap
vpbroadcastd m9, [z_filter_k+4*8]
movzx r2d, byte [tlq+30]
pshufb m10, m6
pmaddubsw m10, m9
pshufb m11, m7
pmaddubsw m11, m9
sub r2d, r3d
lea r2d, [r2+r3*8+4]
shr r2d, 3
mov [rsp+32], r2b
paddw m0, m10
paddw m1, m11
.w16_3tap:
pmulhrsw m0, m3
pmulhrsw m1, m3
sar r5d, 1
mov tlq, rsp
add r5d, 33
cmp hd, 32
cmovns maxbased, r5d
mov [tlq+r5], r3b
packuswb m0, m1
vpermq m0, m0, q3120
mova [tlq], m0
.w16_main:
movd xm6, dxd
vbroadcasti128 m0, [z_base_inc]
vpbroadcastb m7, [tlq+maxbaseq]
shl maxbased, 6
vpbroadcastw m6, xm6
movd xm9, maxbased
vbroadcasti128 m8, [z_filter_s+2]
vpbroadcastw m9, xm9
mov r3d, dxd
psubw m9, m0
paddw m11, m6, m6
psubw m10, m9, m3 ; 64*8
vpblendd m6, m11, 0xf0
.w16_loop:
lea r5d, [r3+dxq]
shr r3d, 6 ; base0
pand m1, m4, m6
psubw m2, m5, m1
psllw m1, 8
por m2, m1
movu xm0, [tlq+r3+0]
movu xm1, [tlq+r3+8]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vinserti128 m0, [tlq+r5+0], 1
vinserti128 m1, [tlq+r5+8], 1
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
pcmpgtw m1, m9, m6
pcmpgtw m2, m10, m6
packsswb m1, m2
paddw m6, m11
vpblendvb m0, m7, m0, m1
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
sub hd, 2
jz .w16_end
lea dstq, [dstq+strideq*2]
cmp r3d, maxbased
jb .w16_loop
.w16_end_loop:
mova [dstq+strideq*0], xm7
mova [dstq+strideq*1], xm7
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16_end_loop
.w16_end:
RET
ALIGN function_align
.w32:
%assign stack_offset org_stack_offset
ALLOC_STACK -96, 15
lea r3d, [hq+31]
mov maxbased, 63
cmp hd, 32
cmovs maxbased, r3d
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
vbroadcasti128 m0, [pb_0to15]
sub r3d, 29 ; h+2
movu xm13, [tlq+29] ; 32-39
movd xm1, r3d
movu xm14, [tlq+37] ; 40-47
sub r3d, 8 ; h-6
vinserti128 m14, [tlq+51], 1 ; 56-63
vpbroadcastb xm1, xm1
mova xm11, [tlq- 1] ; 0- 7
vinserti128 m11, [tlq+13], 1 ; 16-23
movd xm2, r3d
movu xm12, [tlq+ 5] ; 8-15
vinserti128 m12, [tlq+19], 1 ; 24-31
pminub xm1, xm0 ; clip 32x8
mova m7, [z_filter_s+0]
pshufb xm13, xm1
vpbroadcastd m1, [pb_12]
vpbroadcastb xm2, xm2
vinserti128 m13, [tlq+43], 1 ; 48-55
vinserti128 m8, m7, [z_filter_s+4], 1
vpblendd m2, m1, 0xf0
vinserti128 m7, [z_filter_s+12], 0
pminub m2, m0 ; clip 32x16 and 32x(32|64)
vpbroadcastd m9, [z_filter_k+4*2+12*0]
pshufb m14, m2
pshufb m0, m11, m8
shufps m8, m7, q1021
pmaddubsw m0, m9
pshufb m2, m12, m8
pmaddubsw m2, m9
pshufb m1, m13, m8
pmaddubsw m1, m9
pshufb m6, m14, m8
pmaddubsw m6, m9
vpbroadcastd m9, [z_filter_k+4*2+12*1]
pshufb m10, m11, m8
shufps m8, m7, q2121
pmaddubsw m10, m9
paddw m0, m10
pshufb m10, m12, m8
pmaddubsw m10, m9
paddw m2, m10
pshufb m10, m13, m8
pmaddubsw m10, m9
paddw m1, m10
pshufb m10, m14, m8
pmaddubsw m10, m9
paddw m6, m10
vpbroadcastd m9, [z_filter_k+4*2+12*2]
pshufb m11, m8
pmaddubsw m11, m9
pshufb m12, m7
pmaddubsw m12, m9
movzx r3d, byte [tlq+63]
movzx r2d, byte [tlq+62]
paddw m0, m11
paddw m2, m12
pshufb m13, m7
pmaddubsw m13, m9
pshufb m14, m7
pmaddubsw m14, m9
paddw m1, m13
paddw m6, m14
sub r2d, r3d
lea r2d, [r2+r3*8+4] ; edge case for 32x64
pmulhrsw m0, m3
pmulhrsw m2, m3
pmulhrsw m1, m3
pmulhrsw m6, m3
shr r2d, 3
mov [rsp+64], r2b
mov tlq, rsp
mov [tlq+65], r3b
mov r3d, 65
cmp hd, 64
cmove maxbased, r3d
packuswb m0, m2
packuswb m1, m6
mova [tlq+ 0], m0
mova [tlq+32], m1
.w32_main:
movd xm6, dxd
vpbroadcastb m7, [tlq+maxbaseq]
shl maxbased, 6
vpbroadcastw m6, xm6
movd xm9, maxbased
vbroadcasti128 m8, [z_filter_s+2]
vpbroadcastw m9, xm9
mov r5d, dxd
psubw m9, [z_base_inc]
mova m11, m6
psubw m10, m9, m3 ; 64*8
.w32_loop:
mov r3d, r5d
shr r3d, 6
pand m1, m4, m6
psubw m2, m5, m1
psllw m1, 8
por m2, m1
movu m0, [tlq+r3+0]
movu m1, [tlq+r3+8]
add r5d, dxd
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
pcmpgtw m1, m9, m6
pcmpgtw m2, m10, m6
packsswb m1, m2
paddw m6, m11
vpblendvb m0, m7, m0, m1
mova [dstq], m0
dec hd
jz .w32_end
add dstq, strideq
cmp r5d, maxbased
jb .w32_loop
test hb, 1
jz .w32_end_loop
mova [dstq], m7
add dstq, strideq
dec hd
jz .w32_end
.w32_end_loop:
mova [dstq+strideq*0], m7
mova [dstq+strideq*1], m7
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_end_loop
.w32_end:
RET
ALIGN function_align
.w64:
%assign stack_offset org_stack_offset
ALLOC_STACK -128, 16
lea maxbased, [hq+63]
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w64_main
mova xm11, [tlq- 1] ; 0- 7
vinserti128 m11, [tlq+13], 1 ; 16-23
movu xm12, [tlq+ 5] ; 8-15
vinserti128 m12, [tlq+19], 1 ; 24-31
mova m7, [z_filter_s+0]
vinserti128 m8, m7, [z_filter_s+4], 1
vinserti128 m7, [z_filter_s+12], 0
vpbroadcastd m9, [z_filter_k+4*2+12*0]
movu xm13, [tlq+29] ; 32-39
vinserti128 m13, [tlq+43], 1 ; 48-55
movu xm14, [tlq+37] ; 40-47
vinserti128 m14, [tlq+51], 1 ; 56-63
pshufb m0, m11, m8
shufps m8, m7, q1021
pmaddubsw m0, m9
pshufb m2, m12, m8
pmaddubsw m2, m9
pshufb m1, m13, m8
pmaddubsw m1, m9
pshufb m6, m14, m8
pmaddubsw m6, m9
vpbroadcastd m9, [z_filter_k+4*2+12*1]
pshufb m10, m11, m8
shufps m15, m8, m7, q2121
pmaddubsw m10, m9
paddw m0, m10
pshufb m10, m12, m15
pmaddubsw m10, m9
paddw m2, m10
pshufb m10, m13, m15
pmaddubsw m10, m9
paddw m1, m10
pshufb m10, m14, m15
pmaddubsw m10, m9
paddw m6, m10
vpbroadcastd m10, [z_filter_k+4*2+12*2]
pshufb m11, m15
pmaddubsw m11, m10
pshufb m12, m7
pmaddubsw m12, m10
pshufb m13, m7
pmaddubsw m13, m10
pshufb m14, m7
pmaddubsw m14, m10
paddw m0, m11
paddw m2, m12
paddw m1, m13
paddw m6, m14
movu xm11, [tlq+ 61] ; 64- 71
vinserti128 m11, [tlq+ 75], 1 ; 80- 87
movu xm12, [tlq+ 69] ; 72- 79
vinserti128 m12, [tlq+ 83], 1 ; 88- 95
movu xm13, [tlq+ 93] ; 96-103
vinserti128 m13, [tlq+107], 1 ; 112-119
movu xm14, [tlq+101] ; 104-111
vinserti128 m14, [tlq+115], 1 ; 120-127
pmulhrsw m0, m3
pmulhrsw m2, m3
pmulhrsw m1, m3
pmulhrsw m6, m3
lea r3d, [hq-20]
mov tlq, rsp
packuswb m0, m2
packuswb m1, m6
vpbroadcastd xm2, [pb_14]
vbroadcasti128 m6, [pb_0to15]
mova [tlq+32*0], m0
mova [tlq+32*1], m1
movd xm0, r3d
vpbroadcastd m1, [pb_12]
vpbroadcastb m0, xm0
paddb m0, m2
pminub m0, m6 ; clip 64x16 and 64x32
pshufb m12, m0
pminub m1, m6 ; clip 64x64
pshufb m14, m1
pshufb m0, m11, m7
pmaddubsw m0, m10
pshufb m2, m12, m7
pmaddubsw m2, m10
pshufb m1, m13, m7
pmaddubsw m1, m10
pshufb m6, m14, m7
pmaddubsw m6, m10
pshufb m7, m11, m15
pmaddubsw m7, m9
pshufb m10, m12, m15
pmaddubsw m10, m9
paddw m0, m7
pshufb m7, m13, m15
pmaddubsw m7, m9
paddw m2, m10
pshufb m10, m14, m15
pmaddubsw m10, m9
paddw m1, m7
paddw m6, m10
vpbroadcastd m9, [z_filter_k+4*2+12*0]
pshufb m11, m8
pmaddubsw m11, m9
pshufb m12, m8
pmaddubsw m12, m9
pshufb m13, m8
pmaddubsw m13, m9
pshufb m14, m8
pmaddubsw m14, m9
paddw m0, m11
paddw m2, m12
paddw m1, m13
paddw m6, m14
pmulhrsw m0, m3
pmulhrsw m2, m3
pmulhrsw m1, m3
pmulhrsw m6, m3
packuswb m0, m2
packuswb m1, m6
mova [tlq+32*2], m0
mova [tlq+32*3], m1
.w64_main:
movd xm12, dxd
vpbroadcastb m7, [tlq+maxbaseq]
lea r3d, [dxq-64]
shl maxbased, 6
vpbroadcastw m12, xm12
sub r3d, maxbased
vbroadcasti128 m8, [z_filter_s+2]
movd xm6, r3d
mov r5d, dxd
mova m10, [pb_1to32]
vpbroadcastd m11, [pb_32]
vpbroadcastw m6, xm6
.w64_loop:
mov r3d, r5d
shr r3d, 6
movu m0, [tlq+r3+ 0]
movu m1, [tlq+r3+ 8]
pand m2, m4, m6
psubw m9, m5, m2
psllw m2, 8
por m9, m2
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m9
pmaddubsw m1, m9
psraw m2, m6, 6
pmulhrsw m0, m3
pmulhrsw m1, m3
packsswb m2, m2
paddb m2, m10
packuswb m0, m1
vpblendvb m0, m7, m0, m2
mova [dstq+ 0], m0
movu m0, [tlq+r3+32]
movu m1, [tlq+r3+40]
add r5d, dxd
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m9
pmaddubsw m1, m9
paddb m2, m11
pmulhrsw m0, m3
pmulhrsw m1, m3
paddw m6, m12
packuswb m0, m1
vpblendvb m0, m7, m0, m2
mova [dstq+32], m0
dec hd
jz .w64_end
add dstq, strideq
cmp r5d, maxbased
jb .w64_loop
.w64_end_loop:
mova [dstq+ 0], m7
mova [dstq+32], m7
add dstq, strideq
dec hd
jg .w64_end_loop
.w64_end:
RET
cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
%define base r9-z_filter_t0
lea r9, [ipred_z2_avx2_table]
tzcnt wd, wm
movifnidn angled, anglem
movifnidn hd, hm
lea dxq, [dr_intra_derivative-90]
movsxd wq, [r9+wq*4]
movzx dyd, angleb
xor angled, 0x400
mov r8, dxq
sub dxq, dyq
add wq, r9
add r9, z_filter_t0-ipred_z2_avx2_table
mova m2, [tlq-64]
mova m0, [tlq-32]
mova m1, [tlq]
and dyd, ~1
and dxq, ~1
movzx dyd, word [r8+dyq] ; angle - 90
movzx dxd, word [dxq+270] ; 180 - angle
vpbroadcastd m13, [base+pw_512]
vpbroadcastd m14, [base+pw_62]
vpbroadcastd m15, [base+pw_64]
mova [rsp+ 0], m2
mova [rsp+32], m0
mova [rsp+64], m1
neg dxd
neg dyd
jmp wq
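; z2 interpolates along both edges: xpos steps by dx along the top edge
; and ypos by dy along the left edge, and per pixel the left-edge result
; is blended in wherever the x position has crossed left of the top-left
; corner.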
.w4:
vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6
vbroadcasti128 m10, [base+z1_shuf_w4]
vbroadcasti128 m11, [base+z2_shuf_h4]
lea r2d, [dxq+(65<<6)] ; xpos
movd xm5, dyd
mov r8d, (63-4)<<6
mov dyq, -4
pshuflw xm5, xm5, q0000
pmullw xm5, [base+z2_ymul]
test angled, 0x400
jnz .w4_main ; !enable_intra_edge_filter
lea r3d, [hq+2]
add angled, 1022
shl r3d, 6
test r3d, angled
jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
vpbroadcastd xm3, [base+pb_4]
call .upsample_above
sub angled, 1075 ; angle - 53
lea r3d, [hq+3]
xor angled, 0x7f ; 180 - angle
call .filter_strength
jmp .w4_filter_left
ALIGN function_align
.filter_strength:
movd xm8, r3d
mov r3d, angled
movd xm7, angled
vpbroadcastb m8, xm8
shr r3d, 8 ; is_sm << 1
vpbroadcastb m7, xm7
pcmpeqb m8, [base+z_filter_wh]
mova xm9, [r9+r3*8]
pand m0, m8, m7
pcmpgtb m0, m9
pmovmskb r3d, m0
ret
ALIGN function_align
.upsample_above: ; w4/w8
pshufb xm2, xm1, [base+z_upsample1-2]
pminub xm3, [base+z_filter_s+4]
vpbroadcastd xm4, [base+pb_36_m4]
vbroadcasti128 m10, [base+pb_0to15]
pshufb xm3, xm1, xm3
pmaddubsw xm2, xm4
pmaddubsw xm3, xm4
lea r2d, [r2+dxq+(1<<6)]
add dxd, dxd
paddw xm2, xm3
pmulhrsw xm2, xm13
sub r8d, 3<<6
paddw m6, m6
packuswb xm2, xm2
punpcklbw xm1, xm2
mova [rsp+gprsize+64], xm1
ret
ALIGN function_align
.upsample_left: ; h4/h8
mov r3d, hd
and r3d, 4
movd xm2, [rsp+gprsize+64]
movddup xm0, [rsp+gprsize+56]
movd xm1, r3d
palignr xm2, xm0, 1
vpbroadcastb xm1, xm1
pshufb xm2, [base+z_filter_s+18]
vpbroadcastd xm3, [base+pb_36_m4]
pmaxub xm1, [base+z_upsample1-2]
pshufb xm1, xm0, xm1
pmaddubsw xm2, xm3
pmaddubsw xm1, xm3
paddw xm5, xm5
add dyq, dyq
paddw xm1, xm2
pmulhrsw xm1, xm13
vbroadcasti128 m11, [base+z2_upsample]
paddw xm5, xm15
packuswb xm1, xm1
punpcklbw xm0, xm1
mova [rsp+gprsize+48], xm0
ret
.w4_no_upsample_above:
lea r3d, [hq+3]
sub angled, 1112 ; angle - 90
call .filter_strength
test r3d, r3d
jz .w4_no_filter_above
popcnt r3d, r3d
vpbroadcastd xm2, [base+pb_4]
pminub xm2, [base+z_filter_s]
vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
pshufb xm3, xm1, xm2 ; 00 01 12 23
pshufd xm2, xm2, q0321
pmaddubsw xm0, xm3, xm0
pshufb xm2, xm1, xm2 ; 12 23 34 44
pmaddubsw xm2, xm4
vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
punpckhqdq xm3, xm3 ; 34 44 44 44
pmaddubsw xm3, xm4
movd xm4, r6m ; max_width
pminsw xm4, xm15
vpbroadcastb xm4, xm4
paddw xm0, xm2
paddw xm0, xm3
pmulhrsw xm0, xm13
psubb xm4, [base+pb_1to32]
psrlq xm1, 8
packuswb xm0, xm0
vpblendvb xm0, xm1, xm4
movd [rsp+65], xm0
.w4_no_filter_above:
lea r3d, [hq+2]
add angled, 973 ; angle + 883
shl r3d, 6
test r3d, angled
jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
vpbroadcastd xm0, [base+pb_90]
psubb xm0, xm7 ; 180 - angle
pand xm0, xm8 ; reuse from previous filter_strength call
pcmpgtb xm0, xm9
pmovmskb r3d, xm0
.w4_filter_left:
test r3d, r3d
jz .w4_main
popcnt r3d, r3d
mov r5d, 10
cmp hd, 16
movu xm2, [rsp+49]
vinserti128 m2, [rsp+43], 1
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vbroadcasti128 m1, [base+z_filter_s+12]
vbroadcasti128 m4, [base+z_filter_s+16]
vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab
vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
vpbroadcastb m0, xm0
pmaxub m0, m3
vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
pshufb m0, m2, m0
pmaddubsw m0, m3
vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1]
pshufb m1, m2, m1
pmaddubsw m1, m3
vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2]
pshufb m2, m4
pmaddubsw m2, m3
movd xm4, r7m ; max_height
pminsw xm4, xm15
vpbroadcastb xm4, xm4
psubb xm4, [base+pb_16to1]
paddw m1, m0
paddw m1, m2
pmulhrsw m1, m13
vextracti128 xm0, m1, 1
packuswb xm0, xm1
vpblendvb xm0, [rsp+48], xm4
mova [rsp+48], xm0
jmp .w4_main
.w4_upsample_left:
call .upsample_left
.w4_main:
movd xm0, dxd
mova m12, [base+z2_y_shuf_h4]
lea r5, [rsp+56] ; left-7
vpbroadcastw m0, xm0
lea r9, [strideq*3]
psraw xm1, xm5, 6
pand xm5, xm14 ; frac_y
pxor xm2, xm2
paddw m7, m0, m0
psubw xm4, xm2, xm1 ; base_y
vpblendd m0, m7, 0xcc
mova xm1, xm7
punpcklwd xm4, xm2
paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1
psubw xm1, xm15, xm5 ; 64-frac_y
psllw xm5, 8
paddw m7, m7
paddw m6, m0
por xm5, xm1 ; 64-frac_y, frac_y
vpbroadcastq m5, xm5
.w4_loop:
lea r3d, [r2+dxq]
shr r2d, 6 ; base_x0
vpbroadcastq m1, [rsp+r2]
lea r2d, [r3+dxq]
shr r3d, 6 ; base_x1
vpbroadcastq m2, [rsp+r3]
lea r3d, [r2+dxq]
shr r2d, 6 ; base_x2
movq xm0, [rsp+r2]
lea r2d, [r3+dxq]
shr r3d, 6 ; base_x3
movhps xm0, [rsp+r3]
vpblendd m1, m2, 0xc0
pand m2, m14, m6 ; frac_x
vpblendd m0, m1, 0xf0
psubw m1, m15, m2 ; 64-frac_x
psllw m2, 8
pshufb m0, m10
por m1, m2 ; 64-frac_x, frac_x
pmaddubsw m0, m1
cmp r3d, 64
jge .w4_toponly
mova m1, m7 ; arbitrary negative value
vpgatherdq m3, [r5+xm4], m1
pshufb m1, m3, m11
vpermd m1, m12, m1
pmaddubsw m1, m5
psraw m2, m6, 15 ; base_x < topleft
vpblendvb m0, m1, m2
.w4_toponly:
pmulhrsw m0, m13
paddw m6, m7 ; xpos += dx
add r5, dyq
packuswb m0, m0
vextracti128 xm1, m0, 1
movd [dstq+strideq*2], xm0
pextrd [dstq+r9 ], xm0, 1
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
sub hd, 4
jz .w4_end
lea dstq, [dstq+strideq*4]
cmp r2d, r8d
jge .w4_loop
.w4_leftonly_loop:
mova m1, m7
vpgatherdq m2, [r5+xm4], m1
add r5, dyq
pshufb m0, m2, m11
vpermd m0, m12, m0
pmaddubsw m0, m5
pmulhrsw m0, m13
packuswb m0, m0
vextracti128 xm1, m0, 1
movd [dstq+strideq*2], xm0
pextrd [dstq+r9 ], xm0, 1
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_leftonly_loop
.w4_end:
RET
.w8:
vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6
movd xm5, dyd
vbroadcasti128 m10, [base+z_filter_s+2]
vbroadcasti128 m11, [base+z2_shuf_h4]
lea r2d, [dxq+(65<<6)] ; xpos
vpbroadcastw xm5, xm5
mov r8d, (63-8)<<6
mov dyq, -4
pmullw xm5, [base+z2_ymul]
test angled, 0x400
jnz .w8_main
lea r3d, [angleq+126]
mov r3b, hb
cmp r3d, 8
ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
vpbroadcastd xm3, [base+pb_8]
movhps [rsp+80], xm1
call .upsample_above
sub angled, 53 ; angle - 53
lea r3d, [hq+7]
xor angled, 0x7f ; 180 - angle
call .filter_strength
jmp .w8_filter_left
.w8_no_upsample_above:
lea r3d, [hq+7]
sub angled, 90 ; angle - 90
call .filter_strength
test r3d, r3d
jz .w8_no_filter_above
popcnt r3d, r3d
vpbroadcastd xm3, [base+pb_8]
pminub xm3, [base+z_filter_s+8]
vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67
pmaddubsw xm0, xm2, xm0
pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88
shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88
pmaddubsw xm2, xm4
vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
pmaddubsw xm3, xm4
movd xm4, r6m ; max_width
pminuw xm4, xm15
vpbroadcastb xm4, xm4
paddw xm0, xm2
paddw xm0, xm3
pmulhrsw xm0, xm13
psubb xm4, [base+pb_1to32]
psrldq xm1, 1
packuswb xm0, xm0
vpblendvb xm0, xm1, xm4
movq [rsp+65], xm0
.w8_no_filter_above:
lea r3d, [angleq-51]
mov r3b, hb
cmp r3d, 8
jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
vpbroadcastd m0, [base+pb_90]
psubb m0, m7
pand m0, m8
pcmpgtb m0, m9
pmovmskb r3d, m0
.w8_filter_left:
test r3d, r3d
jz .w8_main
popcnt r3d, r3d
vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
cmp hd, 32
jne .w8_filter_left_h16
movu xm2, [rsp+27]
vinserti128 m2, [rsp+35], 1
vpbroadcastd xm0, [base+pb_5]
vbroadcasti128 m3, [base+z_filter_s+ 8]
vbroadcasti128 m1, [base+z_filter_s+12]
vbroadcasti128 m4, [base+z_filter_s+16]
pmaxub m3, m0
pshufb m3, m2, m3
pmaddubsw m3, m7
pshufb m1, m2, m1
pmaddubsw m1, m8
pshufb m2, m4
pmaddubsw m2, m9
paddw m3, m1
paddw m3, m2
pmulhrsw m3, m13
jmp .w8_filter_left_top16
.w8_filter_left_h16:
mov r5d, 10
cmp hd, 16
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0
.w8_filter_left_top16:
vbroadcasti128 m1, [base+z_filter_s+12]
vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab
vbroadcasti128 m4, [base+z_filter_s+16]
vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
pmaxub m0, m2
movu xm2, [rsp+49]
vinserti128 m2, [rsp+43], 1
pshufb m0, m2, m0
pmaddubsw m0, m7
movd xm7, r7m ; max_height
pshufb m1, m2, m1
pmaddubsw m1, m8
pshufb m2, m4
pmaddubsw m2, m9
pminsw xm7, xm15
paddw m1, m0
vpbroadcastb m7, xm7
paddw m1, m2
pmulhrsw m1, m13
psubb m7, [base+pb_32to1]
packuswb m3, m1
vpermq m3, m3, q1320
vpblendvb m3, [rsp+32], m7
mova [rsp+32], m3
jmp .w8_main
.w8_upsample_left:
call .upsample_left
.w8_main:
movd xm3, dxd
lea r5, [rsp+56] ; left-7
pshufd xm1, xm5, q3120
pand xm5, xm14
vpbroadcastw m3, xm3
pxor xm0, xm0
psubw xm2, xm15, xm5
psraw xm1, 6
lea r9, [strideq*3]
paddw m7, m3, m3
psubw xm9, xm0, xm1 ; base_y
psllw xm5, 8
punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5
vpblendd m3, m7, 0xf0 ; xpos0 xpos1
por xm5, xm2 ; 64-frac_y, frac_y
punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7
paddw m6, m3
vinserti128 m12, m5, xm5, 1
.w8_loop:
lea r3d, [r2+dxq]
shr r2d, 6 ; base_x0
movu xm0, [rsp+r2]
lea r2d, [r3+dxq]
shr r3d, 6 ; base_x1
vinserti128 m0, [rsp+r3], 1
lea r3d, [r2+dxq]
shr r2d, 6 ; base_x2
movu xm1, [rsp+r2]
lea r2d, [r3+dxq]
shr r3d, 6 ; base_x3
vinserti128 m1, [rsp+r3], 1
pand m2, m14, m6
paddsw m4, m6, m7
psubw m5, m15, m2
psllw m2, 8
pshufb m0, m10
por m2, m5
pmaddubsw m0, m2
pand m2, m14, m4
psubw m5, m15, m2
psllw m2, 8
pshufb m1, m10
por m2, m5
pmaddubsw m1, m2
cmp r3d, 64
jge .w8_toponly
mova m5, m7
vpgatherdq m3, [r5+xm9], m7
mova m7, m5
vpgatherdq m2, [r5+xm8], m5
pshufb m3, m11
pshufb m2, m11
punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1
punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3
vpermq m5, m5, q3120 ; y0 y1
vpermq m2, m2, q3120 ; y2 y3
pmaddubsw m5, m12
pmaddubsw m2, m12
psraw m6, 15 ; base_x < topleft
vpblendvb m0, m5, m6
psraw m3, m4, 15
vpblendvb m1, m2, m3
.w8_toponly:
pmulhrsw m0, m13
pmulhrsw m1, m13
paddw m6, m4, m7 ; xpos += dx
add r5, dyq
packuswb m0, m1
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*2], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+r9 ], xm1
sub hd, 4
jz .w8_end
lea dstq, [dstq+strideq*4]
cmp r2d, r8d
jge .w8_loop
.w8_leftonly_loop:
mova m0, m7
vpgatherdq m5, [r5+xm9], m7
mova m7, m0
vpgatherdq m3, [r5+xm8], m0
add r5, dyq
pshufb m2, m5, m11
pshufb m1, m3, m11
punpckldq m0, m1, m2
punpckhdq m1, m2
vpermq m0, m0, q3120
vpermq m1, m1, q3120
pmaddubsw m0, m12
pmaddubsw m1, m12
pmulhrsw m0, m13
pmulhrsw m1, m13
packuswb m0, m1
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*2], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+r9 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_leftonly_loop
.w8_end:
RET
.w16:
mov r8d, hd
test angled, 0x400
jnz .w16_main
lea r3d, [hq+15]
sub angled, 90
call .filter_strength
test r3d, r3d