; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright © 2017-2021, The rav1e contributors
; Copyright © 2020, Nathan Egge
; Copyright © 2021, Matthias Dressel
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA
%macro COEF 1-2
pd_%1: times 4 dd %1
%if %0 == 2
pd_m%1: times 4 dd -%1
%endif
%endmacro
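; Each COEF invocation emits a 128-bit constant pd_N (the value N in all four
; dwords); a second argument additionally emits the negated constant pd_mN.
; The values are the usual AV1 transform cos/sin coefficients scaled by 2^12.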
COEF 201
COEF 401
COEF 601, 1
COEF 799
COEF 995
COEF 1189, 1
COEF 1380, 1
COEF 1567
COEF 1751
COEF 1931
COEF 2106, 1
COEF 2276, 1
COEF 2440
COEF 2598, 1
COEF 2751, 1
COEF 2896
COEF 3035
COEF 3166
COEF 3290
COEF 3406
COEF 3513
COEF 3612
COEF 3703
COEF 3784
COEF 3857
COEF 3920
COEF 3973
COEF 4017
COEF 4052
COEF 4076
COEF 4091
deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
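; (pshufb mask that deinterleaves words: even-indexed words are gathered into
; the low half, odd-indexed words into the high half)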
%if ARCH_X86_32
pd_1: times 4 dd 1
%endif
pd_2: times 4 dd 2
pw_5: times 8 dw 5
pd_1321: times 4 dd 1321
pd_2482: times 4 dd 2482
pd_m3344: times 4 dd -3344
pd_2048: times 4 dd 2048
pw_4x2048_4xm2048: times 4 dw 2048
times 4 dw -2048
pw_4xm2048_4x2048: times 4 dw -2048
times 4 dw 2048
pw_2048: times 8 dw 2048
pw_m2048: times 8 dw -2048
pd_3803: times 4 dd 3803
pw_4096: times 8 dw 4096
pd_5793: times 4 dd 5793
pd_6144: times 4 dd 6144
pw_8192: times 8 dw 8192
pd_10240: times 4 dd 10240
pd_11586: times 4 dd 11586
pw_1697x8: times 8 dw 1697*8
pw_2896x8: times 8 dw 2896*8
pw_1697x16: times 8 dw 1697*16
pw_16384: times 8 dw 16384
pixel_10bpc_max: times 8 dw 0x03ff
pw_1567_3784: times 4 dw 1567, 3784
pw_m3784_1567: times 4 dw -3784, 1567
pw_2896_2896: times 4 dw 2896, 2896
pw_m2896_2896: times 4 dw -2896, 2896
clip_18b_min: times 4 dd -0x20000
clip_18b_max: times 4 dd 0x1ffff
idct64_mul_16bpc:
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
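; interleaved coefficient pairs, presumably consumed by the 64-point idct
; sections later in this file (beyond this excerpt)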
cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
cextern iadst_4x4_internal_8bpc_ssse3.main
cextern idct_4x8_internal_8bpc_ssse3.main
cextern iadst_4x8_internal_8bpc_ssse3.main
cextern idct_16x4_internal_8bpc_ssse3.main
cextern iadst_16x4_internal_8bpc_ssse3.main
cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
cextern idct_8x4_internal_8bpc_ssse3.main
cextern iadst_8x4_internal_8bpc_ssse3.main
cextern idct_8x8_internal_8bpc_ssse3.main
cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
cextern iadst_8x8_internal_8bpc_ssse3.main
cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
cextern idct_16x8_internal_8bpc_ssse3.main
cextern iadst_16x8_internal_8bpc_ssse3.main
cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
cextern idct_8x32_internal_8bpc_ssse3.main
cextern idct_8x32_internal_8bpc_ssse3.main_fast
cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
cextern idct_16x64_internal_8bpc_ssse3.main
cextern idct_16x64_internal_8bpc_ssse3.main_fast
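; The 8bpc ssse3 routines above are reused for the second pass, once the
; intermediate coefficients fit in 16 bits.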
tbl_4x16_2d: db 0, 13, 29, 45
tbl_4x16_h: db 0, 16, 32, 48
tbl_4x16_v: db 0, 4, 8, 12
tbl_8x16_2d: db 0, 14, 30, 46
tbl_8x16_v: db 0, 4, 8, 12
tbl_8x16_h: db 0, 32, 64, 96
tbl_16x16_2d: db 0, 10, 36, 78
tbl_16x16_v: db 0, 4, 8, 12
tbl_16x16_h: db 0, 64, 128, 192
tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
tbl_Nx32_odd_offset: db 2*16, 2*23
db 2*20, 2*19
db 2*18, 2*21
db 2*22, 2*17
db 2*30, 2*25
db 2*26, 2*29
db 2*28, 2*27
db 2*24, 2*31
tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
db 2* 8, 2*40, 2*23, 2*38
db 2* 1, 2*36, 2*20, 2*42
db 2* 9, 2*44, 2*19, 2*34
db 2* 2, 2*60, 2*18, 2*50
db 2*10, 2*52, 2*21, 2*58
db 2* 3, 2*56, 2*22, 2*54
db 2*11, 2*48, 2*17, 2*62
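; tbl_*x*_2d/_v/_h hold eob thresholds (loaded into r5 by INV_TXFM_FN); the
; .zero_loop scans below use them to find how many 16-sample coefficient
; strips are nonzero. tbl_Nx32_odd_offset/tbl_Nx64_offset hold byte offsets
; (hence the 2* scaling), presumably for output reordering in the larger
; transforms.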
SECTION .text
%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
%define m(x) m_suffix(x, SUFFIX)
; This refers to the first function in itx_sse, i.e. the start of the text section,
; which is needed as a base pointer for constants.
%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)
%if ARCH_X86_64
%define o(x) x
%else
%define o(x) r6-$$+x ; PIC
%endif
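; On x86-32 every constant load goes through o(), with r6 holding the base of
; the current object ($$, set up via "LEA r6, $$" in each entry point).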
%macro IWHT4_1D 0
; m0 = in0, m1 = in1, m2 = in2, m3 = in3
paddd m0, m1 ; in0 += in1
psubd m4, m2, m3 ; tmp0 = in2 - in3
psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1
psrad m5, 1
psubd m2, m5, m1 ; in2 = tmp1 - in1
psubd m5, m3 ; in1 = tmp1 - in3
psubd m0, m5 ; in0 -= in1
paddd m4, m2 ; in3 = tmp0 + in2
; m0 = out0, m1 = in1, m2 = out2, m3 = in3
; m4 = out3, m5 = out1
%endmacro
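; The 4-point inverse Walsh-Hadamard transform above is only used by the
; wht_wht 4x4 case (AV1 lossless coding).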
INIT_XMM sse2
cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
mova m0, [cq+16*0]
mova m1, [cq+16*1]
mova m2, [cq+16*2]
mova m3, [cq+16*3]
REPX {psrad x, 2}, m0, m1, m2, m3
IWHT4_1D
punpckldq m1, m0, m5
punpckhdq m3, m0, m5
punpckldq m5, m2, m4
punpckhdq m2, m4
punpcklqdq m0, m1, m5
punpckhqdq m1, m5
punpcklqdq m4, m3, m2
punpckhqdq m3, m2
mova m2, m4
IWHT4_1D
packssdw m0, m4 ; low: out3, high: out0
packssdw m2, m5 ; low: out2, high: out1
pxor m4, m4
mova [cq+16*0], m4
mova [cq+16*1], m4
mova [cq+16*2], m4
mova [cq+16*3], m4
lea r2, [dstq+strideq*2]
movq m1, [dstq+strideq*0]
movhps m1, [r2 +strideq*1]
movq m3, [r2 +strideq*0]
movhps m3, [dstq+strideq*1]
movd m5, bdmaxm
pshuflw m5, m5, q0000 ; broadcast
punpcklqdq m5, m5 ; broadcast
paddsw m0, m1
paddsw m2, m3
pmaxsw m0, m4
pmaxsw m2, m4
pminsw m0, m5
pminsw m2, m5
movhps [r2 +strideq*1], m0 ; write out0
movhps [dstq+strideq*1], m2 ; write out1
movq [r2 +strideq*0], m2 ; write out2
movq [dstq+strideq*0], m0 ; write out3
RET
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; flags: 2 = inv_dst1, 4 = inv_dst2
; skip round/shift if rnd is not a number
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
; %1 dst/src[1]
; %2 dst/src[2]
; %3 tmp[1]
; %4 tmp[2]
; %5 tmp[3]
; %6 rnd
; %7 coef[1]
; %8 coef[2]
; %9 flags
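; a coef argument smaller than 32 is treated as a register number (the
; constant is expected to already be in m<coef>); larger values name a
; pd_<coef> constant loaded from memory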
%ifnidn %7,%8 ; skip the extra multiplies when coef1 == coef2 (optimization)
%if %8 < 32
pmulld m%4, m%1, m%8
pmulld m%3, m%2, m%8
%else
mova m%3, [o(pd_%8)]
pmulld m%4, m%1, m%3
pmulld m%3, m%2
%endif
%endif
%if %7 < 32
pmulld m%1, m%7
pmulld m%2, m%7
%else
mova m%5, [o(pd_%7)]
pmulld m%1, m%5
pmulld m%2, m%5
%endif
%if %9 & 4 ; invert dst2
paddd m%4, m%2
psubd m%2, m%6, m%4
%else
%ifnum %6
%ifnidn %7,%8
paddd m%4, m%6
%else
paddd m%1, m%6
%endif
%endif
%ifnidn %7,%8
paddd m%2, m%4
%else
mova m%3, m%2
paddd m%2, m%1
%endif
%endif
%if %9 & 2 ; invert dst1
psubd m%3, m%1
paddd m%1, m%3, m%6
%else
%ifnum %6
%ifnidn %7,%8
paddd m%1, m%6
%endif
%endif
psubd m%1, m%3
%endif
%ifnum %6
psrad m%2, 12
psrad m%1, 12
%endif
%endmacro
%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
%define %%p1 m(i%1_%4_internal_16bpc)
%if ARCH_X86_32
LEA r6, $$
%endif
%if has_epilogue
%ifidn %1_%2, dct_dct
test eobd, eobd
jz %%end
%endif
lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
%ifnum %3
%if %3
add eobd, %3
%endif
%else
lea r5, [o(%3)]
%endif
call %%p1
RET
%%end:
%else
; Jump to the 1st txfm function if we're not taking the fast path, which
; in turn performs an indirect jump to the 2nd txfm function.
lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
%ifnum %3
%if %3
add eobd, %3
%endif
%else
lea r5, [o(%3)]
%endif
%ifidn %1_%2, dct_dct
test eobd, eobd
jnz %%p1
%else
; jump to the 1st txfm function unless it's located directly after this
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endif
%endmacro
%macro INV_TXFM_4X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
.dconly:
add r5d, 128
sar r5d, 8
.dconly2:
imul r5d, 2896
mova m2, [o(pixel_10bpc_max)]
add r5d, 34816
movd m0, r5d
pshuflw m0, m0, q1111
pxor m3, m3
punpcklqdq m0, m0
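; m0 = (dc*2896 + 34816) >> 16 broadcast to all words: pshuflw q1111 selects
; the high word of the dword product, and 34816 = 2048 + 8*4096 effectively
; merges the rounding biases of the *2896 >> 12 and rounded >> 4 steps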
.dconly_loop:
movq m1, [dstq+strideq*0]
movhps m1, [dstq+strideq*1]
paddw m1, m0
pminsw m1, m2
pmaxsw m1, m3
movq [dstq+strideq*0], m1
movhps [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
%endif
%endmacro
%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
; butterfly rotation
ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0
ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3
; Hadamard rotation
psubd m%5, m%1, m%2
paddd m%2, m%1
paddd m%1, m%3, m%4
psubd m%3, m%4
; %1 (src1) = out0
; %2 (src2) = out1
; %3 (src3) = out3
; %5 (tmp1) = out2
%endmacro
INIT_XMM sse4
INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, identity
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
mova m2, [cq+16*2]
mova m3, [cq+16*3]
mova m5, [o(pd_2048)]
call .pass1_main
packssdw m0, m1 ; out0 out1
packssdw m4, m2 ; out2 out3
; transpose
punpckhwd m2, m0, m4
punpcklwd m0, m4
punpckhwd m1, m0, m2
punpcklwd m0, m2
; m0 = out0 out1
; m1 = out2 out3
; m5 = pd_2048
jmp tx2q
.pass1_main:
IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
ret
.pass2:
; m0 = in0 in1
; m1 = in2 in3
; m5 = pd_2048
punpckhwd m2, m1, m0
punpcklwd m1, m0
pmaddwd m4, m2, [o(pw_m3784_1567)]
pmaddwd m2, [o(pw_1567_3784)]
pmaddwd m0, m1, [o(pw_m2896_2896)]
pmaddwd m1, [o(pw_2896_2896)]
REPX {paddd x, m5}, m4, m2, m0, m1
packssdw m5, m5 ; pw_2048
REPX {psrad x, 12}, m4, m2, m0, m1
packssdw m2, m4 ; t3 t2
packssdw m1, m0 ; t0 t1
paddsw m0, m1, m2 ; out0 out1
psubsw m1, m2 ; out3 out2
pmulhrsw m0, m5
pmulhrsw m1, m5
movq m2, [dstq+strideq*0]
movhps m2, [dstq+strideq*1]
lea r5, [dstq+strideq*2]
movq m3, [r5 +strideq*1]
movhps m3, [r5 +strideq*0]
mova m5, [o(pixel_10bpc_max)]
pxor m4, m4
mova [cq+16*0], m4
mova [cq+16*1], m4
mova [cq+16*2], m4
mova [cq+16*3], m4
paddw m0, m2
paddw m1, m3
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m5
pminsw m1, m5
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movhps [r5 +strideq*0], m1
movq [r5 +strideq*1], m1
RET
INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity
cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
call .main
packssdw m0, m2 ; out0 out1
packssdw m1, m4 ; out2 out3
; transpose
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2
punpcklwd m0, m2
; m0 = out0 out1
; m1 = out2 out3
; m5 = pd_2048
jmp tx2q
.pass2:
; m0 = in0 in1
; m1 = in2 in3
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
.end:
mova m4, [o(pw_2048)]
movq m2, [dstq+strideq*0]
movhps m2, [dstq+strideq*1]
lea r5, [dstq+strideq*2]
movq m3, [r5 +strideq*0]
movhps m3, [r5 +strideq*1]
mova m5, [o(pixel_10bpc_max)]
pmulhrsw m0, m4
pmulhrsw m1, m4
pxor m4, m4
mova [cq+16*0], m4
mova [cq+16*1], m4
mova [cq+16*2], m4
mova [cq+16*3], m4
paddw m0, m2
paddw m1, m3
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m5
pminsw m1, m5
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [r5 +strideq*0], m1
movhps [r5 +strideq*1], m1
RET
ALIGN function_align
.main:
mova m1, [cq+16*2]
mova m3, [cq+16*3]
mova m5, [cq+16*0]
lea r3, [cq+16*1]
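; T[1] is read through r3 (see .main2) so that the 4x16 callers can point r3
; at the matching column strip before calling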
.main2:
mova m0, [o(pd_1321)] ; SINPI_1_9
mova m2, [o(pd_2482)] ; SINPI_2_9
mova m6, [o(pd_3803)] ; SINPI_4_9
pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2]
pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3]
pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2]
pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0]
psubd m1, m3 ; T[2] - T[3]
pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3]
pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0]
paddd m0, m6 ; s[0] += s[3]
paddd m0, m3 ; s[0] += s[5]
mova m3, [o(pd_m3344)] ; -SINPI_3_9
psubd m2, m4 ; s[1] -= s[4]
psubd m2, m7 ; s[1] -= s[6]
psubd m1, m5 ; -b7 = (T[2] - T[3]) - T[0]
pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7
pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1]
mova m5, [o(pd_2048)]
REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048
paddd m4, m0, m2 ; x[3] = s[0] + s[1]
psubd m2, m3 ; x[1] = s[1] + s[3]
psubd m0, m3 ; x[0] = s[0] + s[3]
paddd m4, m3 ; x[3] -= s[3]
paddd m2, m5 ; x[1] + 2048
REPX {psrad x, 12}, m0, m2, m1, m4
ret
INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity
cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_16bpc).main
packssdw m0, m2 ; out0 out1
packssdw m1, m4 ; out2 out3
; transpose
punpcklwd m2, m1, m0
punpckhwd m1, m0
punpcklwd m0, m1, m2
punpckhwd m1, m2
; m0 = out0 out1
; m1 = out2 out3
; m5 = pd_2048
jmp tx2q
.pass2:
; m0 = in0 in1
; m1 = in2 in3
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
mova m4, [o(pw_2048)]
movq m3, [dstq+strideq*1]
movhps m3, [dstq+strideq*0]
lea r5, [dstq+strideq*2]
movq m2, [r5 +strideq*1]
movhps m2, [r5 +strideq*0]
mova m5, [o(pixel_10bpc_max)]
pmulhrsw m0, m4
pmulhrsw m1, m4
pxor m4, m4
mova [cq+16*0], m4
mova [cq+16*1], m4
mova [cq+16*2], m4
mova [cq+16*3], m4
paddw m0, m2
paddw m1, m3
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m5
pminsw m1, m5
movhps [dstq+strideq*0], m1
movq [dstq+strideq*1], m1
movhps [r5 +strideq*0], m0
movq [r5 +strideq*1], m0
RET
INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity
cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
mova m3, [o(pd_5793)]
pmulld m0, m3, [cq+16*0]
pmulld m1, m3, [cq+16*1]
pmulld m2, m3, [cq+16*2]
pmulld m3, [cq+16*3]
mova m5, [o(pd_2048)]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
packssdw m0, m1
packssdw m2, m3
; transpose
punpckhwd m3, m0, m2
punpcklwd m0, m2
punpckhwd m1, m0, m3
punpcklwd m0, m3
; m0 = out0 out1
; m1 = out2 out3
; m5 = pd_2048
jmp tx2q
.pass2:
; m0 = in0 in1
; m1 = in2 in3
; m5 = pd_2048
mova m4, [o(pw_1697x8)]
movq m2, [dstq+strideq*0]
movhps m2, [dstq+strideq*1]
lea r5, [dstq+strideq*2]
pmulhrsw m3, m4, m0
pmulhrsw m4, m1
paddsw m0, m3
paddsw m1, m4
movq m3, [r5 +strideq*0]
movhps m3, [r5 +strideq*1]
mova m4, [o(pixel_10bpc_max)]
packssdw m5, m5 ; pw_2048
pmulhrsw m0, m5
pmulhrsw m1, m5
pxor m5, m5
mova [cq+16*0], m5
mova [cq+16*1], m5
mova [cq+16*2], m5
mova [cq+16*3], m5
paddw m0, m2
paddw m1, m3
pmaxsw m0, m5
pmaxsw m1, m5
pminsw m0, m4
pminsw m1, m4
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [r5 +strideq*0], m1
movhps [r5 +strideq*1], m1
RET
%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 4x8
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
add r5d, 128
sar r5d, 8
imul r5d, 181
jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
%endif
%endmacro
INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity, 9
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
mova m5, [o(pd_2048)]
%if ARCH_X86_64
xor r5d, r5d
cmp eobd, 13
setge r5b
%else
mov r5d, 1
cmp eobd, 13
sbb r5d, 0
%endif
shl r5d, 4
.loop_pass1:
mova m3, [o(pd_2896)]
pmulld m0, m3, [cq+32*0+r5]
pmulld m1, m3, [cq+32*1+r5]
pmulld m2, m3, [cq+32*2+r5]
pmulld m3, [cq+32*3+r5]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
call m(idct_4x4_internal_16bpc).pass1_main
packssdw m0, m1 ; out0 out1
packssdw m4, m2 ; out2 out3
test r5d, r5d
jz .end_pass1
mova [cq+32*0+16], m0
mova [cq+32*1+16], m4
xor r5d, r5d
jmp .loop_pass1
.end_pass1:
punpckhwd m2, m0, m4
punpcklwd m0, m4
punpckhwd m1, m0, m2
punpcklwd m0, m2
mova m2, [cq+32*0+16]
mova m6, [cq+32*1+16]
punpckhwd m4, m2, m6
punpcklwd m2, m6
punpckhwd m3, m2, m4
punpcklwd m2, m4
; m0-3 = packed & transposed output
jmp tx2q
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
; m0-3 is now out0/1,3/2,4/5,7/6
mova m4, [o(pw_2048)]
shufps m1, m1, q1032
shufps m3, m3, q1032
.end:
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
pxor m4, m4
REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
mova m7, [o(pixel_10bpc_max)]
lea r2, [strideq*3]
movq m5, [dstq+strideq*0]
movq m6, [dstq+strideq*2]
movhps m5, [dstq+strideq*1]
movhps m6, [dstq+r2]
lea r4, [dstq+strideq*4]
paddw m0, m5
paddw m1, m6
movq m5, [r4+strideq*0]
movq m6, [r4+strideq*2]
movhps m5, [r4+strideq*1]
movhps m6, [r4+r2]
paddw m2, m5
paddw m3, m6
REPX {pminsw x, m7}, m0, m1, m2, m3
REPX {pmaxsw x, m4}, m0, m1, m2, m3
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [dstq+strideq*2], m1
movhps [dstq+r2 ], m1
movq [r4 +strideq*0], m2
movhps [r4 +strideq*1], m2
movq [r4 +strideq*2], m3
movhps [r4 +r2 ], m3
RET
INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity, 9
cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
call .pass1_main
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2
punpcklwd m0, m2
mova m2, [cq+32*2+16]
mova m6, [cq+32*3+16]
punpckhwd m4, m2, m6
punpcklwd m2, m6
punpckhwd m3, m2, m4
punpcklwd m2, m4
; m0-3 = packed & transposed output
jmp tx2q
.pass1_main:
%undef cmp
%if ARCH_X86_64
xor r5d, r5d
cmp eobd, 13
setge r5b
%else
mov r5d, 1
cmp eobd, 13
sbb r5d, 0
%endif
shl r5d, 4
lea r3, [cq+32*1+16]
.loop_pass1:
mova m0, [o(pd_2048)]
mova m3, [o(pd_2896)]
pmulld m5, m3, [cq+32*0+r5]
pmulld m2, m3, [cq+32*1+r5]
pmulld m1, m3, [cq+32*2+r5]
pmulld m3, [cq+32*3+r5]
REPX {paddd x, m0}, m5, m2, m1, m3
REPX {psrad x, 12}, m5, m2, m1, m3
mova [r3], m2
call m(iadst_4x4_internal_16bpc).main2
packssdw m0, m2 ; out0 out1
packssdw m1, m4 ; out2 out3
test r5d, r5d
jz .end_pass1
mova [cq+32*2+16], m0
mova [cq+32*3+16], m1
xor r5d, r5d
jmp .loop_pass1
.end_pass1:
ret
.pass2:
shufps m0, m0, q1032
shufps m1, m1, q1032
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
mova m4, [o(pw_4x2048_4xm2048)]
jmp m(idct_4x8_internal_16bpc).end
INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity, 9
cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
call m(iadst_4x8_internal_16bpc).pass1_main
punpcklwd m2, m1, m0
punpckhwd m1, m0
punpcklwd m0, m1, m2
punpckhwd m1, m2
mova m6, [cq+32*2+16]
mova m2, [cq+32*3+16]
punpcklwd m4, m2, m6
punpckhwd m2, m6
punpckhwd m3, m2, m4
punpcklwd m2, m4
; m0-3 = packed & transposed output
jmp tx2q
.pass2:
shufps m0, m0, q1032
shufps m1, m1, q1032
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
mova m4, m0
mova m5, m1
pshufd m0, m3, q1032
pshufd m1, m2, q1032
pshufd m2, m5, q1032
pshufd m3, m4, q1032
mova m4, [o(pw_4xm2048_4x2048)]
jmp m(idct_4x8_internal_16bpc).end
INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity, 3
cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
mova m5, [o(pd_2048)]
mova m4, [o(pd_2896)]
mova m6, [o(pd_5793)]
; clear m7 in case we skip the bottom square
pxor m7, m7
%if ARCH_X86_64
xor r5d, r5d
cmp eobd, 16
setge r5b
%else
mov r5d, 1
cmp eobd, 16
sbb r5d, 0
%endif
shl r5d, 4
.loop_pass1:
pmulld m0, m4, [cq+32*0+r5]
pmulld m1, m4, [cq+32*1+r5]
pmulld m2, m4, [cq+32*2+r5]
pmulld m3, m4, [cq+32*3+r5]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
REPX {pmulld x, m6}, m0, m1, m2, m3
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
packssdw m0, m1
packssdw m2, m3
test r5d, r5d
jz .end_pass1
mova [cq+32*0+16], m0
mova m7, m2
xor r5d, r5d
jmp .loop_pass1
.end_pass1:
punpckhwd m4, m0, m2
punpcklwd m0, m2
punpckhwd m1, m0, m4
punpcklwd m0, m4
mova m2, [cq+32*0+16]
punpckhwd m4, m2, m7
punpcklwd m2, m7
punpckhwd m3, m2, m4
punpcklwd m2, m4
; m0-3 = packed & transposed output
jmp tx2q
.pass2:
mova m4, [o(pw_4096)]
jmp m(idct_4x8_internal_16bpc).end
%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
add r5d, 384
sar r5d, 9
jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
%endif
%endmacro
INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity, v
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
%if ARCH_X86_32
mov r5m, r6d
%endif
mov r6d, 4
.zero_loop:
dec r6d
cmp eobb, byte [r5+r6]
jl .zero_loop
mov r5d, r6d
shl r5d, 4
%if ARCH_X86_32
; restore pic-ptr
mov r6, r5m
%endif
mova m5, [o(pd_2048)]
.loop_pass1:
mova m0, [cq+64*0+r5]
mova m1, [cq+64*1+r5]
mova m2, [cq+64*2+r5]
mova m3, [cq+64*3+r5]
call m(idct_4x4_internal_16bpc).pass1_main
pcmpeqd m3, m3
REPX {psubd x, m3}, m0, m1, m4, m2
REPX {psrad x, 1}, m0, m1, m4, m2
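; (pcmpeqd sets m3 to -1 in every lane; subtracting it adds the +1 rounding
; bias before the >> 1)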
packssdw m0, m1 ; out0 out1
packssdw m4, m2 ; out2 out3
punpckhwd m2, m0, m4
punpcklwd m0, m4
punpckhwd m1, m0, m2
punpcklwd m0, m2
test r5d, r5d
jz .end_pass1
mova [cq+64*0+r5], m0
mova [cq+64*1+r5], m1
sub r5d, 16
jmp .loop_pass1
.end_pass1:
mova m2, [cq+64*0+16]
mova m3, [cq+64*1+16]
mova m4, [cq+64*0+32]
mova m5, [cq+64*1+32]
mova m6, [cq+64*0+48]
mova m7, [cq+64*1+48]
; m0-7 = packed & transposed output
jmp tx2q
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
; m0-6 contain out0-13 [odd registers hold their two output rows swapped]
; [coeffq+16*7] has out15/14
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [cq+16*7]
REPX {shufps x, x, q1032}, m1, m3, m5, m7
mova [cq+16*0], m4
mova [cq+16*1], m5
mova [cq+16*2], m6
mova [cq+16*3], m7
.end:
pxor m4, m4
REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
mova m7, [o(pixel_10bpc_max)]
mov r5d, 2
lea r3, [strideq*3]
.loop:
movq m5, [dstq+strideq*0]
movq m6, [dstq+strideq*2]
movhps m5, [dstq+strideq*1]
movhps m6, [dstq+r3]
lea r4, [dstq+strideq*4]
paddw m0, m5
paddw m1, m6
movq m5, [r4+strideq*0]
movq m6, [r4+strideq*2]
movhps m5, [r4+strideq*1]
movhps m6, [r4+r3]
paddw m2, m5
paddw m3, m6
REPX {pminsw x, m7}, m0, m1, m2, m3
REPX {pmaxsw x, m4}, m0, m1, m2, m3
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [dstq+strideq*2], m1
movhps [dstq+r3 ], m1
movq [r4 +strideq*0], m2
movhps [r4 +strideq*1], m2
movq [r4 +strideq*2], m3
movhps [r4 +r3 ], m3
dec r5d
jz .end2
lea dstq, [dstq+strideq*8]
mova m0, [cq+0*16]
mova m1, [cq+1*16]
mova m2, [cq+2*16]
mova m3, [cq+3*16]
REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
jmp .loop
.end2:
RET
INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity, v
cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
%if ARCH_X86_32
mov r5m, r6d
%endif
mov r6d, 4
.zero_loop:
dec r6d
cmp eobb, byte [r5+r6]
jl .zero_loop
mov r5d, r6d
shl r5d, 4
%if ARCH_X86_32
; restore pic-ptr
mov r6, r5m
%endif
.loop_pass1:
mova m5, [cq+64*0+r5]
lea r3, [cq+64*1+r5]
mova m1, [cq+64*2+r5]
mova m3, [cq+64*3+r5]
call m(iadst_4x4_internal_16bpc).main2
pcmpeqd m3, m3
REPX {psubd x, m3}, m0, m2, m1, m4
REPX {psrad x, 1}, m0, m2, m1, m4
packssdw m0, m2 ; out0 out1
packssdw m1, m4 ; out2 out3
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2
punpcklwd m0, m2
test r5d, r5d
jz m(idct_4x16_internal_16bpc).end_pass1
mova [cq+64*0+r5], m0
mova [cq+64*1+r5], m1
sub r5d, 16
jmp .loop_pass1
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
mova m1, [o(pw_4x2048_4xm2048)]
REPX {pmulhrsw x, m1}, m7, m2, m0
pshufd m6, m1, q1032 ; 4x-2048,4x2048
pmulhrsw m1, [cq+16*7]
REPX {pmulhrsw x, m6}, m5, m4, m3
pmulhrsw m6, [cq+16*6]
; m7/5/2/4 = out4/11,5/10,6/9,7/8
; m0/3/6/1 = out0/15,3/12,1/14,2/13
; output order: m0-3 hold out0-7, and [cq+16*0..3] hold out8-15
movhps [cq+0*8], m4
movhps [cq+1*8], m2
movhps [cq+2*8], m5
movhps [cq+3*8], m7
movhps [cq+4*8], m3
movhps [cq+5*8], m1
movhps [cq+6*8], m6
movhps [cq+7*8], m0
punpcklqdq m0, m6
punpcklqdq m1, m3
punpcklqdq m3, m2, m4
punpcklqdq m2, m7, m5
jmp m(idct_4x16_internal_16bpc).end
INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity, v
cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
%if ARCH_X86_32
mov r5m, r6d
%endif
mov r6d, 4
.zero_loop:
dec r6d
cmp eobb, byte [r5+r6]
jl .zero_loop
mov r5d, r6d
shl r5d, 4
%if ARCH_X86_32
; restore pic-ptr
mov r6, r5m
%endif
.loop_pass1:
mova m5, [cq+64*0+r5]
lea r3, [cq+64*1+r5]
mova m1, [cq+64*2+r5]
mova m3, [cq+64*3+r5]
call m(iadst_4x4_internal_16bpc).main2
pcmpeqd m3, m3
REPX {psubd x, m3}, m0, m2, m1, m4
REPX {psrad x, 1}, m0, m2, m1, m4
packssdw m0, m2 ; out3 out2
packssdw m1, m4 ; out1 out0
punpcklwd m2, m1, m0
punpckhwd m1, m0
punpcklwd m0, m1, m2
punpckhwd m1, m2
test r5d, r5d
jz m(idct_4x16_internal_16bpc).end_pass1
mova [cq+64*0+r5], m0
mova [cq+64*1+r5], m1
sub r5d, 16
jmp .loop_pass1
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
mova m1, [o(pw_4x2048_4xm2048)]
REPX {pmulhrsw x, m1}, m7, m2, m0
pshufd m6, m1, q1032 ; 4x-2048,4x2048
pmulhrsw m1, [cq+16*7]
REPX {pmulhrsw x, m6}, m5, m4, m3
pmulhrsw m6, [cq+16*6]
; m7/5/2/4 = out11/4,10/5,9/6,8/7
; m0/3/6/1 = out15/0,12/3,14/1,13/2
; output order: m0-3 hold out0-7, and [cq+16*0..3] hold out8-15
movq [cq+0*8], m4
movq [cq+1*8], m2
movq [cq+2*8], m5
movq [cq+3*8], m7
movq [cq+4*8], m3
movq [cq+5*8], m1
movq [cq+6*8], m6
movq [cq+7*8], m0
punpckhqdq m0, m6
punpckhqdq m1, m3
punpckhqdq m3, m2, m4
punpckhqdq m2, m7, m5
jmp m(idct_4x16_internal_16bpc).end
INV_TXFM_4X16_FN identity, dct, h
INV_TXFM_4X16_FN identity, adst, h
INV_TXFM_4X16_FN identity, flipadst, h
INV_TXFM_4X16_FN identity, identity
cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
%if ARCH_X86_32
mov r5m, r6d
%endif
mov r6d, 4
.zero_loop:
dec r6d
cmp eobb, byte [r5+r6]
jl .zero_loop
mov r5d, r6d
shl r5d, 4
%if ARCH_X86_32
; restore pic-ptr
mov r6, r5m
%endif
mova m5, [o(pd_6144)]
mova m4, [o(pd_5793)]
.loop_pass1:
pmulld m0, m4, [cq+64*0+r5]
pmulld m1, m4, [cq+64*1+r5]
pmulld m2, m4, [cq+64*2+r5]
pmulld m3, m4, [cq+64*3+r5]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 13}, m0, m1, m2, m3
packssdw m0, m1
packssdw m2, m3
punpckhwd m3, m0, m2
punpcklwd m0, m2
punpckhwd m1, m0, m3
punpcklwd m0, m3
test r5d, r5d
jz m(idct_4x16_internal_16bpc).end_pass1
mova [cq+64*0+r5], m0
mova [cq+64*1+r5], m1
sub r5d, 16
jmp .loop_pass1
.pass2:
mova [cq+16*4], m0
mova [cq+16*5], m1
mova [cq+16*6], m2
mova [cq+16*7], m7
mova m0, [o(pw_1697x16)]
mova m7, [o(pw_2048)]
pmulhrsw m1, m0, m4
pmulhrsw m2, m0, m5
REPX {paddsw x, x}, m4, m5
paddsw m4, m1
paddsw m5, m2
REPX {pmulhrsw x, m7}, m4, m5
mova [cq+16*0], m4
mova [cq+16*1], m5
mova m4, [cq+16*7]
pmulhrsw m1, m0, m6
pmulhrsw m2, m0, m4
REPX {paddsw x, x}, m6, m4
paddsw m6, m1
paddsw m4, m2
REPX {pmulhrsw x, m7}, m6, m4
mova [cq+16*2], m6
mova [cq+16*3], m4
mova m4, [cq+16*4]
mova m1, [cq+16*5]
mova m2, [cq+16*6]
pmulhrsw m5, m0, m2
pmulhrsw m6, m0, m3
REPX {paddsw x, x}, m2, m3
paddsw m2, m5
paddsw m3, m6
pmulhrsw m6, m0, m1
pmulhrsw m0, m4
REPX {paddsw x, x}, m1, m4
paddsw m1, m6
paddsw m0, m4
REPX {pmulhrsw x, m7}, m2, m3, m1, m0
jmp m(idct_4x16_internal_16bpc).end
%macro INV_TXFM_8X4_FN 2 ; type1, type2
%if ARCH_X86_64
INV_TXFM_FN %1, %2, 0, 8x4, 15
%else
INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 128
sar r5d, 8
imul r5d, 2896
add r5d, 34816
movd m0, r5d
pshuflw m0, m0, q1111
punpcklqdq m0, m0
mova m6, [o(pixel_10bpc_max)]
pxor m5, m5
lea r2, [strideq*3]
mova m1, [dstq+strideq*0]
mova m2, [dstq+strideq*1]
mova m3, [dstq+strideq*2]
mova m4, [dstq+r2]
REPX {paddw x, m0}, m1, m2, m3, m4
REPX {pmaxsw x, m5}, m1, m2, m3, m4
REPX {pminsw x, m6}, m1, m2, m3, m4
mova [dstq+strideq*0], m1
mova [dstq+strideq*1], m2
mova [dstq+strideq*2], m3
mova [dstq+r2 ], m4
RET
%endif
%endmacro
INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, identity
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst
cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
lea r5, [o(.main)]
.pass1_entry:
%if ARCH_X86_32
lea r3, [rsp+gprsize]
%else
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+0*16]
mova m1, [cq+1*16]
mova m2, [cq+2*16]
mova m3, [cq+3*16]
mova m4, [cq+4*16]
mova m5, [cq+5*16]
mova m6, [cq+6*16]
mova m7, [cq+7*16]
call .rect2_mul
call r5
call .transpose4x8packed
; m0-3 = packed & transposed output
jmp tx2q
.transpose4x8packed:
; transpose
punpcklwd m1, m2, m6
punpckhwd m2, m6
punpckhwd m6, m0, m4
punpcklwd m0, m4
punpckhwd m3, m0, m1
punpcklwd m0, m1
punpckhwd m4, m6, m2
punpcklwd m6, m2
punpcklwd m2, m3, m4
punpckhwd m3, m4
punpckhwd m1, m0, m6
punpcklwd m0, m6
ret
.main:
call .main_pass1
call .round
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
ret
.rect2_mul:
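; rectangular (2:1) transforms scale their coefficients by 1/sqrt(2),
; implemented here as x = (x * 2896 + 2048) >> 12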
%if ARCH_X86_64
REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
%else
mova [r3], m7
mova m7, [o(pd_2896)]
REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulld m7, [r3]
mova [r3], m7
mova m7, [o(pd_2048)]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
paddd m7, [r3]
%endif
REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
ret
%if ARCH_X86_64
.main_pass1_fast:
pmulld m5, m3, [o(pd_m2276)]
pmulld m3, [o(pd_3406)]
pmulld m7, m1, [o(pd_4017)]
pmulld m1, [o(pd_799)]
pmulld m6, m2, [o(pd_3784)]
pmulld m2, [o(pd_1567)]
pmulld m0, m14
pxor m4, m4
jmp .main_pass1_fast2
.main_pass1:
ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
REPX {pmulld x, m14}, m0, m4
.main_pass1_fast2:
REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7
REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7
paddd m8, m1, m5 ; t4
psubd m1, m5 ; t5a
paddd m9, m7, m3 ; t7
psubd m7, m3 ; t6a
REPX {pmaxsd x, m12}, m1, m8, m7, m9
REPX {pminsd x, m13}, m1, m8, m7, m9
REPX {pmulld x, m14}, m7, m1
paddd m0, m11
paddd m7, m11
psubd m5, m0, m4
paddd m0, m4
psubd m4, m7, m1
paddd m7, m1
REPX {psrad x, 12 }, m5, m0, m4, m7
psubd m3, m0, m6 ; dct4 out3
paddd m0, m6 ; dct4 out0
paddd m6, m5, m2 ; dct4 out1
psubd m5, m2 ; dct4 out2
REPX {pmaxsd x, m12}, m0, m6, m5, m3
REPX {pminsd x, m13}, m0, m6, m5, m3
ret
.round:
paddd m1, m6, m7 ; out1
psubd m6, m7 ; out6
psubd m7, m0, m9 ; out7
paddd m0, m9 ; out0
paddd m2, m5, m4 ; out2
psubd m5, m4 ; out5
psubd m4, m3, m8 ; out4
paddd m3, m8 ; out3
%else
.main_pass1_fast:
pmulld m5, m3, [o(pd_m2276)]
pmulld m3, [o(pd_3406)]
pmulld m7, m1, [o(pd_4017)]
pmulld m1, [o(pd_799)]
pmulld m6, m2, [o(pd_3784)]
pmulld m2, [o(pd_1567)]
mova m4, [o(pd_2048)]
mova [r3+0*16], m2
REPX {paddd x, m4}, m5, m3, m7, m1
REPX {psrad x, 12}, m5, m3, m7, m1
paddd m2, m1, m5 ; t4
psubd m1, m5 ; t5a
pmulld m5, m0, [o(pd_2896)]
mova m0, m4
paddd m4, m7, m3 ; t7
psubd m7, m3 ; t6a
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3 }, m1, m2, m7, m4
mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3 }, m1, m2, m7, m4
mova [r3+3*16], m2
mova [r3+1*16], m4
pxor m4, m4
mova m2, [r3+0*16]
mova m3, [o(pd_2896)]
jmp .main_pass1_fast2
.main_pass1:
mova [r3+0*16], m0
mova [r3+1*16], m2
mova [r3+2*16], m4
mova [r3+3*16], m6
mova m0, [o(pd_2048)]
ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a
paddd m2, m1, m5 ; t4
psubd m1, m5 ; t5a
paddd m4, m7, m3 ; t7
psubd m7, m3 ; t6a
mova m6, [o(clip_18b_min)]
REPX {pmaxsd x, m6 }, m1, m2, m7, m4
mova m6, [o(clip_18b_max)]
REPX {pminsd x, m6 }, m1, m2, m7, m4
mova m6, [r3+3*16]
mova [r3+3*16], m2
mova m2, [r3+1*16]
mova [r3+1*16], m4
ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3
mova m3, [o(pd_2896)]
mova m5, [r3+0*16]
mova m4, [r3+2*16]
REPX {pmulld x, m3 }, m5, m4
.main_pass1_fast2:
REPX {paddd x, m0 }, m2, m6
REPX {psrad x, 12 }, m2, m6
REPX {pmulld x, m3 }, m7, m1
paddd m7, m0
paddd m0, m5
psubd m5, m0, m4
paddd m0, m4
psubd m4, m7, m1
paddd m7, m1
REPX {psrad x, 12 }, m5, m0, m4, m7
psubd m3, m0, m6 ; dct4 out3
paddd m0, m6 ; dct4 out0
paddd m6, m5, m2 ; dct4 out1
psubd m5, m2 ; dct4 out2
mova m1, [o(clip_18b_min)]
REPX {pmaxsd x, m1 }, m0, m6, m5, m3
mova m1, [o(clip_18b_max)]
REPX {pminsd x, m1 }, m0, m6, m5, m3
ret
.round:
paddd m1, m6, m7 ; out1
psubd m6, m7 ; out6
mova [r3+0*16], m6
mova m6, [r3+1*16]
psubd m7, m0, m6 ; out7
paddd m0, m6 ; out0
paddd m2, m5, m4 ; out2
psubd m5, m4 ; out5
mova m6, [r3+3*16]
psubd m4, m3, m6 ; out4
paddd m3, m6 ; out3
mova m6, [r3+0*16]
%endif
ret
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
.end:
lea r3, [strideq*3]
call .round2_and_write_8x4
REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
RET
.round2_and_write_8x4:
pxor m6, m6
mova m5, [o(pixel_10bpc_max)]
mova m4, [o(pw_2048)]
.round1_and_write_8x4:
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
.write_8x4:
paddw m0, [dstq+strideq*0]
paddw m1, [dstq+strideq*1]
paddw m2, [dstq+strideq*2]
paddw m3, [dstq+r3]
REPX {pminsw x, m5}, m0, m1, m2, m3
REPX {pmaxsw x, m6}, m0, m1, m2, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r3 ], m3
ret
INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity
cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
lea r5, [o(.main)]
jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
call .main_pass1
call .round
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
ret
.main_pass1:
%if ARCH_X86_64
ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a
ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
psubd m8, m2, m6 ; t6
paddd m2, m6 ; t2
psubd m6, m0, m4 ; t4
paddd m0, m4 ; t0
psubd m4, m5, m1 ; t7
paddd m5, m1 ; t3
psubd m1, m7, m3 ; t5
paddd m7, m3 ; t1
REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a
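; the "10" above is register m10, which still holds pd_1567 from the previous
; ITX_MULSUB_2D (the %8 multiplies happen before %7 reloads m10 with pd_3784)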
psubd m9, m6, m8 ; t7
paddd m6, m8 ; out6
mova m8, [o(pd_2896)]
psubd m3, m7, m5 ; t3
paddd m7, m5 ; -out7
psubd m5, m0, m2 ; t2
paddd m0, m2 ; out0
psubd m2, m1, m4 ; t6
paddd m1, m4 ; -out1
REPX {pmaxsd x, m12}, m5, m3, m2, m9
REPX {pminsd x, m13}, m5, m3, m2, m9
REPX {pmulld x, m14}, m5, m3, m2, m9
psubd m4, m5, m3 ; (t2 - t3) * 2896
paddd m3, m5 ; (t2 + t3) * 2896
psubd m5, m2, m9 ; (t6 - t7) * 2896
paddd m2, m9 ; (t6 + t7) * 2896
ret
.round:
; m0=out0,m1=-out1,m6=out6,m7=-out7
pcmpeqd m8, m8
REPX {pxor x, m8 }, m1, m7, m3, m5
REPX {psubd x, m8 }, m1, m7
REPX {paddd x, m11}, m2, m3, m4, m5
REPX {psrad x, 12 }, m2, m3, m4, m5
%else
mova [r3+0*16], m2
mova [r3+1*16], m3
mova [r3+2*16], m4
mova [r3+3*16], m5
mova m5, [o(pd_2048)]
ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a
ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
mova m2, [r3+0*16]
mova m3, [r3+1*16]
mova m4, [r3+2*16]
mova [r3+0*16], m0
mova [r3+1*16], m1
mova [r3+2*16], m6
mova m1, [r3+3*16]
mova [r3+3*16], m7
ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
mova m0, [r3+0*16]
mova m6, [r3+2*16]
psubd m7, m2, m6 ; t6
paddd m2, m6 ; t2
psubd m6, m0, m4 ; t4
paddd m0, m4 ; t0
mova [r3+0*16], m7
mova m5, [r3+1*16]
mova m7, [r3+3*16]
psubd m4, m1, m5 ; t7
paddd m5, m1 ; t3
psubd m1, m7, m3 ; t5
paddd m7, m3 ; t1
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
mova [r3+1*16], m7
mova m7, [o(clip_18b_max)]
pmaxsd m3, [r3+0*16]
REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
pminsd m7, [r3+1*16]
mova [r3+0*16], m0
mova [r3+1*16], m2
mova [r3+2*16], m5
mova [r3+3*16], m7
mova m0, [o(pd_2048)]
ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a
mova m5, [r3+2*16]
mova m7, [r3+3*16]
psubd m2, m6, m3 ; t7
paddd m6, m3 ; out6
mova [r3+3*16], m6
mova m0, [r3+0*16]
mova m6, [r3+1*16]
psubd m3, m7, m5 ; t3
paddd m7, m5 ; -out7
psubd m5, m0, m6 ; t2
paddd m0, m6 ; out0
psubd m6, m1, m4 ; t6
paddd m1, m4 ; -out1
mova m4, [o(clip_18b_min)]
REPX {pmaxsd x, m4 }, m5, m3, m6, m2
mova m4, [o(clip_18b_max)]
REPX {pminsd x, m4 }, m5, m3, m6, m2
mova m4, [o(pd_2896)]
REPX {pmulld x, m4 }, m5, m3, m6, m2
psubd m4, m5, m3 ; (t2 - t3) * 2896
paddd m3, m5 ; (t2 + t3) * 2896
psubd m5, m6, m2 ; (t6 - t7) * 2896
paddd m2, m6 ; (t6 + t7) * 2896
ret
.round:
mova [r3+2*16], m0
pcmpeqd m0, m0
mova m6, [o(pd_2048)]
REPX {pxor x, m0 }, m1, m7, m3, m5
REPX {psubd x, m0 }, m1, m7
REPX {paddd x, m6 }, m2, m3, m4, m5
REPX {psrad x, 12 }, m2, m3, m4, m5
mova m6, [r3+3*16]
mova m0, [r3+2*16]
%endif
ret
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
jmp m(idct_8x4_internal_16bpc).end
INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity
cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
lea r5, [o(.main)]
jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
call m(iadst_8x4_internal_16bpc).main_pass1
call m(iadst_8x4_internal_16bpc).round
packssdw m7, m6
packssdw m5, m4
packssdw m3, m2
packssdw m1, m0
mova m0, m7
mova m2, m5
mova m4, m3
mova m6, m1
ret
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
lea r3, [strideq*3]
add dstq, r3
neg strideq
jmp m(idct_8x4_internal_16bpc).end
INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity
cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
lea r5, [o(.main)]
jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
ret
.pass2:
mova m7, [o(pw_1697x8)]
pmulhrsw m4, m7, m0
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
jmp m(idct_8x4_internal_16bpc).end
%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
%if ARCH_X86_64
INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16
%else
INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 2
.end:
add r5d, 384
sar r5d, 9
.end2:
imul r5d, 2896
add r5d, 34816
movd m0, r5d
pshuflw m0, m0, q1111
punpcklqdq m0, m0
mova m6, [o(pixel_10bpc_max)]
pxor m5, m5
lea r2, [strideq*3]
.loop:
mova m1, [dstq+strideq*0]
mova m2, [dstq+strideq*1]
mova m3, [dstq+strideq*2]
mova m4, [dstq+r2]
REPX {paddw x, m0}, m1, m2, m3, m4
REPX {pmaxsw x, m5}, m1, m2, m3, m4
REPX {pminsw x, m6}, m1, m2, m3, m4
mova [dstq+strideq*0], m1
mova [dstq+strideq*1], m2
mova [dstq+strideq*2], m3
mova [dstq+r2 ], m4
lea dstq, [dstq+strideq*4]
dec r3d
jg .loop
RET
%endif
%endmacro
INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity, 6
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
DECLARE_REG_TMP 1
mov [rsp+4*16+1*gprsize], r1
%else
DECLARE_REG_TMP 6
%endif
lea t0, [o(.pass1_main)]
.pass1_full:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
%undef cmp
%if ARCH_X86_64
xor r5d, r5d
cmp eobd, 10
setge r5b
%else
mov r5d, 1
cmp eobd, 10
sbb r5d, 0
%endif
shl r5d, 4
%if ARCH_X86_32
lea r3, [rsp+gprsize]
%endif
.loop_pass1:
mova m0, [cq+0*32+r5]
mova m1, [cq+1*32+r5]
mova m2, [cq+2*32+r5]
mova m3, [cq+3*32+r5]
mova m4, [cq+4*32+r5]
mova m5, [cq+5*32+r5]
mova m6, [cq+6*32+r5]
mova m7, [cq+7*32+r5]
call t0
test r5d, r5d
jz .end_pass1
mova [cq+0*32+16], m0
mova [cq+1*32+16], m1
mova [cq+2*32+16], m2
mova [cq+3*32+16], m3
sub r5d, 16
jmp .loop_pass1
.end_pass1:
mova m4, [cq+0*32+16]
mova m5, [cq+1*32+16]
mova m6, [cq+2*32+16]
mova m7, [cq+3*32+16]
%if ARCH_X86_32
mov r1, [rsp+4*16+1*gprsize]
%endif
jmp tx2q
.pass1_main:
call m(idct_8x4_internal_16bpc).main_pass1
pcmpeqd m1, m1
REPX {psubd x, m1}, m0, m6, m5, m3
call m(idct_8x4_internal_16bpc).round
REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
.pack_and_transpose:
packssdw m2, m3
packssdw m6, m7
packssdw m0, m1
packssdw m4, m5
jmp m(idct_8x4_internal_16bpc).transpose4x8packed
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
lea r3, [strideq*3]
%if ARCH_X86_64
mova m10, [o(pixel_10bpc_max)]
pxor m9, m9
%endif
call .round3_and_write_8x8
.zero:
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
pxor m7, m7
%endif
REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%undef mzero
RET
; round (pmulhrsw with pw_2048, i.e. a rounded right-shift by 4) before writing
; data in m0-7
; on x86-64, pw_2048 is in m8
; .round1 is for m0-7
; .round2 is for m0-6 & [rsp+gprsize*2]
; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
; .round4 is x86-32-only; it is similar to .round2 but with the constant already in m7
%if ARCH_X86_32
.round1_and_write_8x8:
mova [rsp+gprsize*2], m7
.round2_and_write_8x8:
%endif
.round3_and_write_8x8:
mova m7, [o(pw_2048)]
%if ARCH_X86_32
.round4_and_write_8x8:
%endif
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [rsp+gprsize*2]
%if ARCH_X86_64
jmp .write_8x8
.round2_and_write_8x8:
mova m7, [rsp+gprsize*2]
.round1_and_write_8x8:
REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
%endif
; m0-7 have to-be-written data [pre-rounded]
; on x86-64, m9/m10 contain zero and pixel_10bpc_max
; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
; r0,1,3 contain dstq/strideq/stride3q
; r5 is a scratch register
.write_8x8:
lea r5, [dstq+strideq*4]
paddw m0, [dstq+strideq*0]
paddw m1, [dstq+strideq*1]
paddw m2, [dstq+strideq*2]
paddw m3, [dstq+r3]
paddw m4, [r5 +strideq*0]
paddw m5, [r5 +strideq*1]
paddw m6, [r5 +strideq*2]
paddw m7, [r5 +r3]
%if ARCH_X86_64
REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
%else
mova [rsp+gprsize*2], m7
pxor m7, m7
REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmaxsw m7, [rsp+gprsize*2]
mova [rsp+gprsize*2], m7
mova m7, [o(pixel_10bpc_max)]
REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pminsw m7, [rsp+gprsize*2]
%endif
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r3 ], m3
mova [r5 +strideq*0], m4
mova [r5 +strideq*1], m5
mova [r5 +strideq*2], m6
mova [r5 +r3 ], m7
ret
INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity, 6
cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
mov [rsp+4*16+1*gprsize], r1
%endif
lea t0, [o(.pass1_main)]
jmp m(idct_8x8_internal_16bpc).pass1_full
.pass1_main:
call m(iadst_8x4_internal_16bpc).main_pass1
call .round
jmp m(idct_8x8_internal_16bpc).pack_and_transpose
.round:
%if ARCH_X86_64
pcmpeqd m8, m8 ; -1
REPX {psubd x, m8 }, m0, m6
REPX {pxor x, m8 }, m1, m7, m3, m5
REPX {psrad x, 1 }, m0, m1, m6, m7
REPX {psubd x, m8 }, m1, m7
mova m8, [o(pd_6144)]
REPX {paddd x, m8 }, m2, m3, m4, m5
REPX {psrad x, 13 }, m2, m3, m4, m5
%else
mova [r3+2*16], m0
pcmpeqd m0, m0 ; -1
mova m6, [o(pd_6144)]
REPX {pxor x, m0 }, m1, m7, m3, m5
REPX {psrad x, 1 }, m1, m7
REPX {psubd x, m0 }, m1, m7
REPX {paddd x, m6 }, m2, m3, m4, m5
REPX {psrad x, 13 }, m2, m3, m4, m5
mova m0, [r3+2*16]
psrld m6, 12 ; +1
paddd m0, m6
paddd m6, [r3+3*16]
REPX {psrad x, 1 }, m0, m6
%endif
ret
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
lea r3, [strideq*3]
%if ARCH_X86_64
mova m10, [o(pixel_10bpc_max)]
pxor m9, m9
%endif
call .round3_and_write_8x8
jmp m(idct_8x8_internal_16bpc).zero
; round (pmulhrsw with pw_2048, i.e. a rounded right-shift by 4) before writing; odd registers are negated
; data in m0-7
; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
; .round1 is for m0-7
; .round2 is for m0-6 & [rsp+gprsize*2]
; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
%if ARCH_X86_64
.round2_and_write_8x8:
mova m7, [rsp+gprsize*2]
.round1_and_write_8x8:
REPX {pmulhrsw x, m8 }, m0, m2, m4, m6
REPX {pmulhrsw x, m11}, m1, m3, m5, m7
jmp m(idct_8x8_internal_16bpc).write_8x8
%else
.round1_and_write_8x8:
mova [rsp+gprsize*2], m7
.round2_and_write_8x8:
%endif
.round3_and_write_8x8:
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova m7, [o(pw_m2048)]
REPX {pmulhrsw x, m7}, m1, m3, m5
pmulhrsw m7, [rsp+gprsize*2]
jmp m(idct_8x8_internal_16bpc).write_8x8
INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity, 6
cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
mov [rsp+4*16+1*gprsize], r1
%endif
lea t0, [o(.pass1_main)]
jmp m(idct_8x8_internal_16bpc).pass1_full
.pass1_main:
call m(iadst_8x4_internal_16bpc).main_pass1
call m(iadst_8x8_internal_16bpc).round
; invert registers
packssdw m7, m6
packssdw m5, m4
packssdw m3, m2
packssdw m1, m0
mova m0, m7
mova m2, m5
mova m4, m3
mova m6, m1
jmp m(idct_8x4_internal_16bpc).transpose4x8packed
.pass2:
lea dstq, [dstq+strideq*8]
sub dstq, strideq
neg strideq
jmp m(iadst_8x8_internal_16bpc).pass2
INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity
cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
mova m0, [cq+0*32]
mova m1, [cq+1*32]
mova m2, [cq+2*32]
mova m3, [cq+3*32]
mova m4, [cq+4*32]
mova m5, [cq+5*32]
mova m6, [cq+6*32]
mova m7, [cq+7*32]
packssdw m0, [cq+0*32+16]
packssdw m1, [cq+1*32+16]
packssdw m2, [cq+2*32+16]
packssdw m3, [cq+3*32+16]
packssdw m4, [cq+4*32+16]
packssdw m5, [cq+5*32+16]
packssdw m6, [cq+6*32+16]
packssdw m7, [cq+7*32+16]
mova [rsp+gprsize+16*1], m6
jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
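; 8x8 identity needs a *2 scale in each pass: in pass 1 it cancels against the
; >> 1 intermediate shift (so pass 1 reduces to the pure repack above), and in
; pass 2 it is folded into the pw_4096 rounding below (*2 combined with the
; rounded >> 4)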
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
lea r3, [strideq*3]
%if ARCH_X86_64
mova m10, [o(pixel_10bpc_max)]
pxor m9, m9
mova m8, [o(pw_4096)]
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
%else
mova [rsp+gprsize], m7
mova m7, [o(pw_4096)]
call m(idct_8x8_internal_16bpc).round4_and_write_8x8
%endif
jmp m(idct_8x8_internal_16bpc).zero
%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
%if ARCH_X86_64
INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
%else
INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
add r5d, 128
sar r5d, 8
imul r5d, 181
mov r3d, 4
%if stack_size_padded > 0
; adjust to caller's stack allocation
add rsp, (12+ARCH_X86_64)*16
%endif
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
%endif
%endmacro
INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity, v
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst
%if ARCH_X86_64
DECLARE_REG_TMP 7
%endif
cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
PUSH r7
%elif ARCH_X86_32
mov [rsp+16*16+gprsize*1], r1
mov [rsp+16*16+gprsize*2], r6
%endif
lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
.pass1_full:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
%undef cmp
mov r6d, 4
.zero_loop:
dec r6d
cmp eobb, byte [r5+r6]
jl .zero_loop
mov r5d, r6d
shl r5d, 4
%if ARCH_X86_32
; restore pic-ptr
mov r6, [rsp+16*16+2*gprsize]
; set up stack pointer
lea r3, [rsp+gprsize]
%endif
.loop_pass1:
mova m0, [cq+0*64+r5]
mova m1, [cq+1*64+r5]
mova m2, [cq+2*64+r5]
mova m3, [cq+3*64+r5]
mova m4, [cq+4*64+r5]
mova m5, [cq+5*64+r5]
mova m6, [cq+6*64+r5]
mova m7, [cq+7*64+r5]
call m(idct_8x4_internal_16bpc).rect2_mul
call t0
mova [cq+0*64+r5], m0
mova [cq+1*64+r5], m1
mova [cq+2*64+r5], m2
mova [cq+3*64+r5], m3
sub r5d, 16
jge .loop_pass1
%if WIN64
POP r7
%elif ARCH_X86_32
mov r1, [rsp+16*16+1*gprsize]
%endif
jmp tx2q
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
; input is in [cq+N*16], where N = 0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
; some values are still pre-loaded from the final loop iteration of pass 1
mova m1, m2
mova m2, [cq+ 1*16]
mova m3, [cq+ 9*16]
mova m4, [cq+ 2*16]
mova m5, [cq+10*16]
mova m6, [cq+ 3*16]
mova m7, [cq+11*16]
call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
mova [rsp+gprsize+3*16], m0
mova [rsp+gprsize+4*16], m1
mova [rsp+gprsize+5*16], m2
mova [rsp+gprsize+6*16], m3
mova [rsp+gprsize+7*16], m4
mova [rsp+gprsize+8*16], m5
mova [rsp+gprsize+9*16], m6
; m7 is already stored in [rsp+gprsize+0*16]
mova m0, [cq+ 4*16]
mova m1, [cq+12*16]
mova m2, [cq+ 5*16]
mova m3, [cq+13*16]
mova m4, [cq+ 6*16]
mova m5, [cq+14*16]
mova m6, [cq+ 7*16]
mova m7, [cq+15*16]
call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
; out0-7 is in rsp+gprsize+3-10*mmsize
; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
%if ARCH_X86_64
mova m8, [o(pw_2048)]
mova m10, [o(pixel_10bpc_max)]
pxor m9, m9
mov r6, dstq
%else
mov [rsp+16*16+gprsize*1], dstq
%endif
lea r3, [strideq*3]
lea dstq, [dstq+strideq*8]
call m(idct_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
pxor m7, m7
%endif
REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
%undef mzero
mova m0, [rsp+gprsize+ 3*16]
mova m1, [rsp+gprsize+ 4*16]
mova m2, [rsp+gprsize+ 5*16]
mova m3, [rsp+gprsize+ 6*16]
mova m4, [rsp+gprsize+ 7*16]
mova m5, [rsp+gprsize+ 8*16]
mova m6, [rsp+gprsize+ 9*16]
mova m7, [rsp+gprsize+10*16]
%if ARCH_X86_64
mov dstq, r6
%else
mov dstq, [rsp+16*16+gprsize*1]
%endif
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
RET
INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity, v
cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
PUSH r7
%elif ARCH_X86_32
mov [rsp+16*16+gprsize*1], r1
mov [rsp+16*16+gprsize*2], r6
%endif
lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
mova m4, [cq+ 9*16]
mova m5, [cq+13*16]
mova [rsp+gprsize+7*16], m0
mova [rsp+gprsize+8*16], m1
mova [rsp+gprsize+5*16], m4
mova [rsp+gprsize+6*16], m5
mova m0, m2
mova m1, m3
mova m2, [cq+ 1*16]
mova m3, [cq+ 5*16]
mova m4, [cq+ 2*16]
mova m5, [cq+ 6*16]
mova m6, [cq+11*16]
mova m7, [cq+15*16]
mova [rsp+gprsize+ 3*16], m4
mova [rsp+gprsize+ 4*16], m5
mova [rsp+gprsize+ 9*16], m6
mova [rsp+gprsize+10*16], m7
mova m4, [cq+10*16]
mova m5, [cq+14*16]
mova m6, [cq+ 3*16]
mova m7, [cq+ 7*16]
call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
%if ARCH_X86_64
mova m11, [o(pw_m2048)]
mova m8, [o(pw_2048)]
mova m10, [o(pixel_10bpc_max)]
pxor m9, m9
mov r6, dstq
%else
mov [rsp+16*16+gprsize*1], dstq
%endif
lea r3, [strideq*3]
lea dstq, [dstq+strideq*8]
call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
pxor m7, m7
%endif
REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
%undef mzero
mova m0, [rsp+gprsize+ 3*16]
mova m1, [rsp+gprsize+ 4*16]
mova m2, [rsp+gprsize+ 5*16]
mova m3, [rsp+gprsize+ 6*16]
mova m4, [rsp+gprsize+ 7*16]
mova m5, [rsp+gprsize+ 8*16]
mova m6, [rsp+gprsize+ 9*16]
mova m7, [rsp+gprsize+10*16]
%if ARCH_X86_64
mov dstq, r6
%else
mov dstq, [rsp+16*16+gprsize*1]
%endif
call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
RET
INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity, v
cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
PUSH r7
%elif ARCH_X86_32
mov [rsp+16*16+gprsize*1], r1
mov [rsp+16*16+gprsize*2], r6
%endif
lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
lea r3, [strideq*3]
lea r3, [r3*5]
add dstq, r3
neg strideq
jmp m(iadst_8x16_internal_16bpc).pass2
INV_TXFM_8X16_FN identity, dct, h
INV_TXFM_8X16_FN identity, adst, h
INV_TXFM_8X16_FN identity, flipadst, h
INV_TXFM_8X16_FN identity, identity
cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
PUSH r7
%elif ARCH_X86_32
mov [rsp+16*16+gprsize*1], r1
mov [rsp+16*16+gprsize*2], r6
%endif
lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
%if ARCH_X86_64
mova m4, [o(pw_2048)]
mova m5, [o(pixel_10bpc_max)]
pxor m6, m6
mova m7, [o(pw_1697x16)]
%endif
mov r5d, 4
lea r3, [strideq*3]
.pass2_loop:
call .main
%if ARCH_X86_64
call m(idct_8x4_internal_16bpc).round1_and_write_8x4
%else
call m(idct_8x4_internal_16bpc).round2_and_write_8x4
%endif
REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
dec r5d
jle .end
add cq, 16
lea dstq, [dstq+strideq*4]
mova m0, [cq+ 0*16]
mova m1, [cq+ 4*16]
mova m2, [cq+ 8*16]
mova m3, [cq+12*16]
jmp .pass2_loop
.end:
RET
.main:
; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
%if ARCH_X86_32
mova m7, [o(pw_1697x16)]
pmulhrsw m4, m7, m0
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
%else
pmulhrsw m8, m7, m0
pmulhrsw m9, m7, m1
pmulhrsw m10, m7, m2
pmulhrsw m11, m7, m3
%endif
REPX {paddsw x, x}, m0, m1, m2, m3
%if ARCH_X86_64
paddsw m0, m8
paddsw m1, m9
paddsw m2, m10
paddsw m3, m11
%else
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
%endif
ret
%macro INV_TXFM_16X4_FN 2 ; type1, type2
%if ARCH_X86_64
INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16
%else
INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
.dconly:
add r5d, 384
sar r5d, 9
.dconly2:
imul r5d, 2896
add r5d, 34816
movd m0, r5d
pshuflw m0, m0, q1111
punpcklqdq m0, m0
mova m3, [o(pixel_10bpc_max)]
pxor m4, m4
.loop:
mova m1, [dstq+ 0]
mova m2, [dstq+16]
REPX {paddw x, m0}, m1, m2
REPX {pminsw x, m3}, m1, m2
REPX {pmaxsw x, m4}, m1, m2
mova [dstq+ 0], m1
mova [dstq+16], m2
add dstq, strideq
dec r3d
jg .loop
RET
%endif
%endmacro
INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, identity
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
; set up stack pointer
lea r3, [rsp+gprsize]
mova m0, [cq+ 1*16]
mova m1, [cq+ 3*16]
mova m2, [cq+ 5*16]
mova m3, [cq+ 7*16]
mova m4, [cq+ 9*16]
mova m5, [cq+11*16]
mova m6, [cq+13*16]
mova m7, [cq+15*16]
call .main_oddhalf
mova m0, [cq+ 0*16]
mova m1, [cq+ 2*16]
mova m2, [cq+ 4*16]
mova m3, [cq+ 6*16]
mova m4, [cq+ 8*16]
mova m5, [cq+10*16]
mova m6, [cq+12*16]
mova m7, [cq+14*16]
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
; t0-7 is in m0-7
call .round
%if ARCH_X86_64
.pack_transpose:
; transpose in two parts
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
.transpose:
call m(idct_8x4_internal_16bpc).transpose4x8packed
call .transpose4x8packed_hi
%else
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [r3+0*16], m0
mova [r3+1*16], m1
mova [r3+2*16], m2
mova [r3+3*16], m3
mova m0, [r3+ 8*16]
mova m2, [r3+ 9*16]
mova m4, [r3+10*16]
mova m6, [r3+11*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
%endif
jmp tx2q
%if ARCH_X86_64
.transpose4x8packed_hi:
punpcklwd m9, m10, m14
punpckhwd m10, m14
punpckhwd m14, m8, m12
punpcklwd m8, m12
punpckhwd m11, m8, m9
punpcklwd m8, m9
punpckhwd m12, m14, m10
punpcklwd m14, m10
punpcklwd m10, m11, m12
punpckhwd m11, m12
punpckhwd m9, m8, m14
punpcklwd m8, m14
ret
%endif
.main_oddhalf_fast: ; lower half zero
pmulld m7, m0, [o(pd_4076)]
pmulld m0, [o(pd_401)]
pmulld m6, m1, [o(pd_m1189)]
pmulld m1, [o(pd_3920)]
%if ARCH_X86_32
mova m4, [o(pd_2048)]
REPX {paddd x, m4}, m1, m6
REPX {psrad x, 12}, m1, m6
mova [r3+1*16], m1
%endif
pmulld m5, m2, [o(pd_3612)]
pmulld m2, [o(pd_1931)]
%if ARCH_X86_32
pmulld m1, m3, [o(pd_m2598)]
%else
pmulld m4, m3, [o(pd_m2598)]
%endif
pmulld m3, [o(pd_3166)]
jmp .main_oddhalf_fast2
.main_oddhalf:
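; idct16 odd half: computes t8a-t15a from coefficients 1,3,...,15 in
; m0-7; results are spilled to r3+[0-7]*16 (x86-64) or r3+[4-11]*16
; (x86-32)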
%if ARCH_X86_64
ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
.main_oddhalf_fast2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
psubd m8, m0, m4 ; t9
paddd m0, m4 ; t8
psubd m4, m6, m2 ; t10
paddd m2, m6 ; t11
psubd m6, m1, m5 ; t13
paddd m5, m1 ; t12
psubd m1, m7, m3 ; t14
paddd m7, m3 ; t15
REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
mova m15, [o(pd_3784)]
mova m10, [o(pd_1567)]
ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
psubd m3, m1, m4 ; t10
paddd m1, m4 ; t9
psubd m4, m0, m2 ; t11a
paddd m0, m2 ; t8a
psubd m2, m8, m6 ; t13
paddd m6, m8 ; t14
psubd m8, m7, m5 ; t12a
paddd m7, m5 ; t15a
REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
REPX {pmulld x, m14}, m2, m8, m3, m4
paddd m2, m11
paddd m8, m11
paddd m5, m2, m3 ; t13a
psubd m2, m3 ; t10a
psubd m3, m8, m4 ; t11
paddd m4, m8 ; t12
REPX {psrad x, 12}, m5, m2, m3, m4
mova [r3+0*16], m0
mova [r3+1*16], m1
mova [r3+2*16], m2
mova [r3+3*16], m3
mova [r3+4*16], m4
mova [r3+5*16], m5
mova [r3+6*16], m6
mova [r3+7*16], m7
%else
mova [r3+0*16], m2
mova [r3+1*16], m3
mova [r3+2*16], m4
mova [r3+3*16], m5
mova m4, [o(pd_2048)]
ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a
ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
mova m2, [r3+0*16]
mova m3, [r3+1*16]
mova [r3+0*16], m0
mova [r3+1*16], m1
mova m1, [r3+2*16]
mova m5, [r3+3*16]
mova [r3+2*16], m6
mova [r3+3*16], m7
ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a
mova m0, [r3+0*16]
mova m6, [r3+2*16]
mova m7, [r3+3*16]
.main_oddhalf_fast2:
REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3
REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3
psubd m4, m0, m1 ; t9
paddd m0, m1 ; t8
mova m1, [r3+1*16]
mova [r3+0*16], m4
psubd m4, m6, m2 ; t10
paddd m2, m6 ; t11
psubd m6, m1, m5 ; t13
paddd m5, m1 ; t12
psubd m1, m7, m3 ; t14
paddd m7, m3 ; t15
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
pmaxsd m3, [r3+0*16]
mova [r3+0*16], m3
mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
pminsd m3, [r3+0*16]
mova [r3+0*16], m0
mova [r3+1*16], m2
mova [r3+2*16], m5
mova [r3+3*16], m7
mova m7, [o(pd_2048)]
ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784
ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4
mova m0, [r3+0*16]
mova m2, [r3+1*16]
psubd m5, m1, m4 ; t10
mova [r3+1*16], m5
paddd m1, m4 ; t9
psubd m4, m0, m2 ; t11a
paddd m0, m2 ; t8a
mova m5, [r3+2*16]
mova m7, [r3+3*16]
psubd m2, m3, m6 ; t13
paddd m6, m3 ; t14
paddd m3, m7, m5 ; t15a
psubd m7, m5 ; t12a
mova [r3+0*16], m3
mova m3, [r3+1*16]
mova m5, [o(clip_18b_min)]
REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
pmaxsd m5, [r3+0*16]
mova [r3+0*16], m5
mova m5, [o(clip_18b_max)]
REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
pminsd m5, [r3+0*16]
mova [r3+0*16], m5
mova m5, [o(pd_2896)]
REPX {pmulld x, m5}, m2, m7, m3, m4
mova m5, [o(pd_2048)]
REPX {paddd x, m5}, m2, m7
paddd m5, m2, m3 ; t13a
psubd m2, m3 ; t10a
psubd m3, m7, m4 ; t11
paddd m4, m7 ; t12
REPX {psrad x, 12}, m5, m2, m3, m4
mova m7, [r3+0*16]
mova [r3+11*16], m0
mova [r3+10*16], m1
mova [r3+9*16], m2
mova [r3+8*16], m3
mova [r3+7*16], m4
mova [r3+6*16], m5
mova [r3+5*16], m6
mova [r3+4*16], m7
%endif
ret
.round:
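; final idct16 butterflies: round the even half in m0-7 (+1 via
; subtracting -1), combine with the spilled odd half, then >>1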
%if ARCH_X86_64
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
pcmpeqd m8, m8
REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
mova m9, [r3+2*16]
mova m10, [r3+3*16]
mova m11, [r3+4*16]
mova m12, [r3+5*16]
mova m13, [r3+6*16]
mova m14, [r3+7*16]
psubd m15, m0, m14 ; out15
paddd m0, m14 ; out0
psubd m14, m1, m13 ; out14
paddd m1, m13 ; out1
psubd m13, m2, m12 ; out13
paddd m2, m12 ; out2
psubd m12, m3, m11 ; out12
paddd m3, m11 ; out3
psubd m11, m4, m10 ; out11
paddd m4, m10 ; out4
psubd m10, m5, m9 ; out10
paddd m5, m9 ; out5
psubd m9, m6, m8 ; out9
paddd m6, m8 ; out6
psubd m8, m7, [r3+0*16] ; out8
paddd m7, [r3+0*16] ; out7
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
; out0-15 are now in m0-15
%else
mova [r3+ 0*16], m0
mova m0, [o(clip_18b_min)]
REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
pmaxsd m0, [r3+ 0*16]
mova [r3+ 0*16], m7
mova m7, [o(clip_18b_max)]
REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
pminsd m7, [r3+ 0*16]
mova [r3+ 0*16], m0
pcmpeqd m0, m0
REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
mova [r3+ 1*16], m1
mova [r3+ 2*16], m2
mova m1, [r3+ 0*16]
psubd m1, m0
mova [r3+ 0*16], m1
mova m1, [r3+11*16]
mova m2, [r3+10*16]
psubd m0, m7, m1
paddd m7, m1
psubd m1, m6, m2
paddd m6, m2
REPX {psrad x, 1}, m0, m1, m6, m7
packssdw m0, m1 ; out8-9
packssdw m6, m7 ; out6-7
mova [r3+11*16], m6
mova m1, [r3+9*16]
mova m7, [r3+8*16]
psubd m2, m5, m1
paddd m5, m1
psubd m1, m4, m7
paddd m4, m7
REPX {psrad x, 1}, m2, m1, m4, m5
packssdw m2, m1 ; out10-11
packssdw m4, m5 ; out4-5
mova m1, [r3+2*16]
mova [r3+10*16], m4
mova m6, [r3+7*16]
mova m7, [r3+6*16]
psubd m4, m3, m6
paddd m3, m6
psubd m6, m1, m7
paddd m1, m7
REPX {psrad x, 1}, m4, m6, m1, m3
packssdw m4, m6 ; out12-13
packssdw m1, m3 ; out2-3
mova m3, [r3+1*16]
mova [r3+9*16], m1
mova m1, [r3+0*16]
mova m5, [r3+5*16]
mova m7, [r3+4*16]
psubd m6, m3, m5
paddd m3, m5
psubd m5, m1, m7
paddd m1, m7
REPX {psrad x, 1}, m6, m5, m1, m3
packssdw m6, m5 ; out14-15
packssdw m1, m3 ; out0-1
mova [r3+8*16], m1
%endif
ret
.pass2:
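; pass 2: 4-point idct on rows via the 8bpc ssse3 idct_8x4 kernel in
; r4, run twice for the two 8-column halves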
lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
.pass2_loop:
lea r3, [strideq*3]
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call r4
call m(idct_8x4_internal_16bpc).round2_and_write_8x4
REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%if ARCH_X86_64
mova m0, m8
mova m1, m9
mova m2, m10
mova m3, m11
%else
mova m0, [rsp+gprsize+0*16]
mova m1, [rsp+gprsize+1*16]
mova m2, [rsp+gprsize+2*16]
mova m3, [rsp+gprsize+3*16]
%endif
add dstq, 16
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call r4
call m(idct_8x4_internal_16bpc).round2_and_write_8x4
RET
INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity
cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; setup stack pointer
lea r3, [rsp+gprsize]
call .main
%if ARCH_X86_64
jmp m(idct_16x4_internal_16bpc).pack_transpose
%else
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+gprsize+0*16], m0
mova [rsp+gprsize+1*16], m1
mova [rsp+gprsize+2*16], m2
mova [rsp+gprsize+3*16], m3
mova m0, [rsp+gprsize+ 8*16]
mova m2, [rsp+gprsize+ 9*16]
mova m4, [rsp+gprsize+10*16]
mova m6, [rsp+gprsize+11*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
jmp tx2q
%endif
.main:
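; adst16 on columns: main_part1 processes coefficients
; 2,13,6,9,10,5,14,1 and main_part2 the remaining 0,15,4,11,8,7,12,3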
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+ 2*16]
mova m1, [cq+13*16]
mova m2, [cq+ 6*16]
mova m3, [cq+ 9*16]
mova m4, [cq+10*16]
mova m5, [cq+ 5*16]
mova m6, [cq+14*16]
mova m7, [cq+ 1*16]
call .main_part1
mova m0, [cq+ 0*16]
mova m1, [cq+15*16]
mova m2, [cq+ 4*16]
mova m3, [cq+11*16]
mova m4, [cq+ 8*16]
mova m5, [cq+ 7*16]
mova m6, [cq+12*16]
mova m7, [cq+ 3*16]
call .main_part2
.round:
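; out0-3 and out12-15 only need +1 and >>1; out4-11 still carry a
; pd_2896 factor from main_part2, hence +6144 and >>13; the odd
; outputs come out of main_part2 negated and are flipped back here
; via pxor(-1)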
%if ARCH_X86_64
mova m15, [o(pd_6144)]
psrld m14, 11 ; pd_1
pcmpeqd m8, m8 ; -1
psubd m13, m15, m14 ; pd_6143
REPX {paddd x, m14}, m0, m2
REPX {paddd x, m15}, m4, m6
REPX {pxor x, m8 }, m1, m3, m5, m7
REPX {psrad x, 1 }, m1, m3
REPX {paddd x, m15}, m5, m7
REPX {psubd x, m8 }, m1, m3
paddd m8, m15, m9
psubd m9, m13, m10
paddd m10, m15, m11
psubd m11, m13, m12
paddd m12, m14, [r3+3*16]
psubd m13, m14, [r3+2*16]
psubd m15, m14, [r3+0*16]
paddd m14, [r3+1*16]
REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15
REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
%else
mova [r3+8*16], m1
mova [r3+9*16], m3
mova m3, [o(pd_6144)]
pcmpeqd m1, m1
REPX {pxor x, m1}, m5, m7
REPX {paddd x, m3}, m4, m5, m6, m7
REPX {psrad x, 13}, m4, m5, m6, m7
packssdw m4, m5
packssdw m6, m7
mova [r3+10*16], m4
mova [r3+11*16], m6
mova m4, [r3+4*16]
mova m5, [r3+5*16]
mova m6, [r3+6*16]
mova m7, [r3+7*16]
REPX {pxor x, m1}, m5, m7
REPX {psubd x, m1}, m4, m6
REPX {psrad x, 1 }, m4, m5, m6, m7
REPX {psubd x, m1}, m5, m7
packssdw m4, m5
packssdw m6, m7
mova m5, [r3+8*16]
mova m7, [r3+9*16]
mova [r3+8*16], m4
mova [r3+9*16], m6
REPX {pxor x, m1}, m5, m7
REPX {paddd x, m3}, m0, m5, m2, m7
REPX {psrad x, 13}, m0, m5, m2, m7
packssdw m0, m5
packssdw m2, m7
mova m4, [r3+0*16]
mova m5, [r3+1*16]
mova m6, [r3+2*16]
mova m7, [r3+3*16]
REPX {psubd x, m1}, m4, m6
REPX {pxor x, m1}, m5, m7
REPX {psrad x, 1 }, m4, m5, m6, m7
REPX {psubd x, m1}, m5, m7
packssdw m4, m5
packssdw m6, m7
%endif
ret
.main_part2:
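; adst16 second half: rotations by 201/4091, 1751/3703, 3035/2751 and
; 3857/1380, then 799/4017 and 1567/3784; out4-11 are returned
; unshifted, still scaled by pd_2896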
%if ARCH_X86_64
ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091
ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703
ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751
ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380
psubd m8, m0, m4 ; t8a
paddd m0, m4 ; t0a
psubd m4, m1, m5 ; t9a
paddd m1, m5 ; t1a
psubd m5, m2, m6 ; t12a
paddd m2, m6 ; t4a
psubd m6, m3, m7 ; t13a
paddd m7, m3 ; t5a
REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
mova m15, [o(pd_4017)]
mova m10, [o(pd_799)]
ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
psubd m3, m0, m2 ; t4
paddd m0, m2 ; t0
psubd m2, m1, m7 ; t5
paddd m1, m7 ; t1
psubd m7, m4, m6 ; t12a
paddd m4, m6 ; t8a
psubd m6, m8, m5 ; t13a
paddd m5, m8 ; t9a
REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
mova m15, [o(pd_3784)]
mova m10, [o(pd_1567)]
ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15
ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15
mova m10, [r3+0*16] ; t2
mova m8, [r3+1*16] ; t3
psubd m9, m0, m10 ; t2a
paddd m0, m10 ; out0
psubd m10, m1, m8 ; t3a
paddd m1, m8 ; -out15
mova [r3+0*16], m1
mova m15, [r3+3*16] ; t7a
mova m1, [r3+2*16] ; t6a
psubd m8, m3, m15 ; t7
paddd m15, m3 ; out12
paddd m3, m2, m1 ; -out3
psubd m2, m1 ; t6
mova [r3+3*16], m15
mova [r3+1*16], m2
mova m1, [r3+7*16] ; t15
mova m2, [r3+6*16] ; t14
paddd m15, m7, m1 ; -out13
psubd m7, m1 ; t15a
psubd m11, m6, m2 ; t14a
paddd m2, m6 ; out2
mova [r3+2*16], m15
mova m1, [r3+4*16] ; t10a
mova m15, [r3+5*16] ; t11a
psubd m6, m4, m1 ; t10
paddd m1, m4 ; -out1
psubd m4, m5, m15 ; t11
paddd m5, m15 ; out14
REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
pmaxsd m12, [r3+1*16] ; t6
mova [r3+1*16], m5
REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
paddd m5, m11, m7 ; -out5 (unshifted)
psubd m11, m7 ; out10 (unshifted)
paddd m7, m9, m10 ; -out7 (unshifted)
psubd m9, m10 ; out8 (unshifted)
psubd m10, m6, m4 ; -out9 (unshifted)
paddd m6, m4 ; out6 (unshifted)
paddd m4, m12, m8 ; out4 (unshifted)
psubd m12, m8 ; -out11 (unshifted)
%else
mova [r3+8*16], m0
mova [r3+9*16], m1
mova [r3+10*16], m2
mova [r3+11*16], m3
mova m3, [o(pd_2048)]
ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751
ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380
mova m0, [r3+8*16]
mova m1, [r3+9*16]
mova [r3+8*16], m4
mova m4, [r3+10*16]
mova [r3+9*16], m5
mova [r3+10*16], m6
mova m5, [r3+11*16]
mova [r3+11*16], m7
ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091
ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703
mova m2, [r3+8*16]
mova m6, [r3+9*16]
psubd m3, m0, m2 ; t8a
paddd m0, m2 ; t0a
mova [r3+8*16], m3
psubd m2, m1, m6 ; t9a
paddd m1, m6 ; t1a
mova m3, [r3+10*16]
psubd m6, m4, m3 ; t12a
paddd m4, m3 ; t4a
mova m3, [r3+11*16]
psubd m7, m5, m3 ; t13a
paddd m5, m3 ; t5a
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
pmaxsd m3, [r3+8*16]
mova [r3+8*16], m3
mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
pminsd m3, [r3+8*16]
mova [r3+8*16], m3
psubd m3, m0, m4 ; t4
paddd m0, m4 ; t0
psubd m4, m1, m5 ; t5
paddd m1, m5 ; t1
mova m5, [o(pd_2048)]
mova [r3+9*16], m1
mova [r3+10*16], m4
mova [r3+11*16], m3
mova m3, [r3+8*16]
mova [r3+8*16], m0
ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017
ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4
psubd m5, m2, m7 ; t12a
paddd m2, m7 ; t8a
psubd m7, m3, m6 ; t13a
paddd m6, m3 ; t9a
mova m0, [r3+8*16]
mova m1, [r3+9*16]
mova m4, [r3+10*16]
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
pmaxsd m3, [r3+11*16]
mova [r3+8*16], m3
mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
pminsd m3, [r3+8*16]
mova [r3+8*16], m0
mova [r3+9*16], m1
mova [r3+10*16], m2
mova [r3+11*16], m6
mova m0, [o(pd_2048)]
ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784
ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784
mova m0, [r3+7*16] ; t7a
mova m2, [r3+6*16] ; t6a
psubd m1, m3, m0 ; t7
paddd m0, m3 ; out12
paddd m3, m4, m2 ; -out3
psubd m4, m2 ; t6
mova [r3+7*16], m3
mova m3, [r3+3*16] ; t15
mova m2, [r3+2*16] ; t14
paddd m6, m5, m3 ; -out13
psubd m5, m3 ; t15a
psubd m3, m7, m2 ; t14a
paddd m2, m7 ; out2
mova [r3+6*16], m2
mova m7, [r3+0*16] ; t10a
mova m2, [r3+1*16] ; t11a
mova [r3+0*16], m0
mova [r3+1*16], m6
mova m6, [r3+11*16]
psubd m0, m6, m2 ; t11
paddd m6, m2 ; out14
mova [r3+2*16], m6
mova m2, [r3+10*16]
psubd m6, m2, m7 ; t10
paddd m2, m7 ; -out1
mova m7, [r3+5*16] ; t3
mova [r3+5*16], m2
mova [r3+10*16], m1
mova m1, [r3+9*16]
psubd m2, m1, m7 ; t3a
paddd m1, m7 ; -out15
mova [r3+3*16], m1
mova m1, [r3+4*16] ; t2
mova m7, [r3+8*16]
psubd m7, m1 ; t2a
paddd m1, [r3+8*16] ; out0
mova [r3+4*16], m1
mova m1, [o(clip_18b_min)]
REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
pmaxsd m1, [r3+10*16]
mova [r3+10*16], m1
mova m1, [o(clip_18b_max)]
REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
pminsd m1, [r3+10*16]
mova [r3+10*16], m1
mova m1, [o(pd_2896)]
REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
pmulld m1, [r3+10*16]
mova [r3+11*16], m3
psubd m3, m4, m1 ; -out11 (unshifted)
paddd m4, m1 ; out4 (unshifted)
psubd m1, m6, m0 ; -out9 (unshifted)
paddd m6, m0 ; out6 (unshifted)
psubd m0, m7, m2 ; out8 (unshifted)
paddd m7, m2 ; -out7 (unshifted)
mova m2, [r3+11*16]
mova [r3+11*16], m5
paddd m5, m2 ; -out5 (unshifted)
psubd m2, [r3+11*16] ; out10 (unshifted)
; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
; r3+[4-7]*16 contain out0-3, r3+[0-3]*16 contain out12-15
%endif
ret
.main_part1:
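; adst16 first half: rotations by 995/3973, 2440/3290, 3513/2106 and
; 4052/601, then 3406/2276 and 1567/3784; intermediate t values are
; spilled to the stack for main_part2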
%if ARCH_X86_64
ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973
ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290
ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106
ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601
psubd m8, m0, m4 ; t10a
paddd m0, m4 ; t2a
psubd m4, m1, m5 ; t11a
paddd m1, m5 ; t3a
psubd m5, m2, m6 ; t14a
paddd m2, m6 ; t6a
psubd m6, m3, m7 ; t15a
paddd m7, m3 ; t7a
REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
mova m15, [o(pd_2276)]
mova m10, [o(pd_3406)]
ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
psubd m3, m0, m2 ; t6
paddd m0, m2 ; t2
psubd m2, m1, m7 ; t7
paddd m1, m7 ; t3
psubd m7, m4, m6 ; t14a
paddd m4, m6 ; t10a
psubd m6, m8, m5 ; t15a
paddd m5, m8 ; t11a
REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
mova m15, [o(pd_1567)]
mova m10, [o(pd_3784)]
ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15
ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15
mova [r3+0*16], m0
mova [r3+1*16], m1
mova [r3+4*16], m4
mova [r3+5*16], m5
mova [r3+2*16], m2
mova [r3+3*16], m3
mova [r3+6*16], m6
mova [r3+7*16], m7
%else
mova [r3+4*16], m0
mova [r3+5*16], m1
mova [r3+6*16], m2
mova [r3+7*16], m3
mova m3, [o(pd_2048)]
ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106
ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601
mova [r3+0*16], m4
mova [r3+1*16], m5
mova [r3+2*16], m6
mova [r3+3*16], m7
mova m0, [r3+4*16]
mova m1, [r3+5*16]
mova m2, [r3+6*16]
mova m7, [r3+7*16]
ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973
ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290
mova m4, [r3+0*16]
mova m5, [r3+1*16]
psubd m6, m0, m4 ; t10a
paddd m0, m4 ; t2a
mova [r3+4*16], m6
mova m6, [r3+2*16]
mova m3, [r3+3*16]
psubd m4, m1, m5 ; t11a
paddd m1, m5 ; t3a
psubd m5, m2, m6 ; t14a
paddd m2, m6 ; t6a
psubd m6, m7, m3 ; t15a
paddd m7, m3 ; t7a
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
pmaxsd m3, [r3+4*16]
mova [r3+4*16], m3
mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
pminsd m3, [r3+4*16]
mova [r3+4*16], m3
psubd m3, m0, m2 ; t6
paddd m0, m2 ; t2
psubd m2, m1, m7 ; t7
paddd m1, m7 ; t3
mova [r3+5*16], m1
mova [r3+6*16], m3
mova [r3+7*16], m2
mova m1, [r3+4*16]
mova [r3+4*16], m0
mova m3, [o(pd_2048)]
ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276
ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2
psubd m7, m4, m6 ; t14a
paddd m4, m6 ; t10a
psubd m6, m1, m5 ; t15a
paddd m5, m1 ; t11a
mova m1, [r3+5*16]
mova m3, [r3+6*16]
mova m2, [r3+7*16]
mova m0, [o(clip_18b_min)]
REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
pmaxsd m0, [r3+4*16]
mova [r3+4*16], m0
mova m0, [o(clip_18b_max)]
REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
pminsd m0, [r3+4*16]
mova [r3+4*16], m0
mova [r3+5*16], m1
mova [r3+0*16], m4
mova [r3+1*16], m5
mova m0, [o(pd_2048)]
ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567
ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567
mova [r3+6*16], m2
mova [r3+7*16], m3
mova [r3+2*16], m6
mova [r3+3*16], m7
%endif
ret
.pass2:
lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
jmp m(idct_16x4_internal_16bpc).pass2_loop
INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity
cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
lea r3, [rsp+gprsize]
call m(iadst_16x4_internal_16bpc).main
%if ARCH_X86_64
packssdw m1, m0
packssdw m3, m2
packssdw m5, m4
packssdw m7, m6
packssdw m9, m8
packssdw m11, m10
packssdw m13, m12
packssdw m15, m14
mova m0, m15
mova m2, m13
mova m4, m11
mova m6, m9
mova m8, m7
mova m10, m5
mova m12, m3
mova m14, m1
jmp m(idct_16x4_internal_16bpc).transpose
%else
mova [rsp+gprsize+4*16], m0
mova [rsp+gprsize+5*16], m2
mova [rsp+gprsize+6*16], m4
mova [rsp+gprsize+7*16], m6
pshufd m6, [rsp+gprsize+ 8*16], q1032
pshufd m4, [rsp+gprsize+ 9*16], q1032
pshufd m2, [rsp+gprsize+10*16], q1032
pshufd m0, [rsp+gprsize+11*16], q1032
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+gprsize+0*16], m0
mova [rsp+gprsize+1*16], m1
mova [rsp+gprsize+2*16], m2
mova [rsp+gprsize+3*16], m3
pshufd m6, [rsp+gprsize+ 4*16], q1032
pshufd m4, [rsp+gprsize+ 5*16], q1032
pshufd m2, [rsp+gprsize+ 6*16], q1032
pshufd m0, [rsp+gprsize+ 7*16], q1032
call m(idct_8x4_internal_16bpc).transpose4x8packed
jmp tx2q
%endif
.pass2:
lea r3, [strideq*3]
lea dstq, [dstq+r3]
neg strideq
lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
jmp m(idct_16x4_internal_16bpc).pass2_loop
INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_64
mova m15, [o(pd_11586)]
pmulld m0, m15, [cq+ 0*16]
pmulld m1, m15, [cq+ 1*16]
pmulld m2, m15, [cq+ 2*16]
pmulld m3, m15, [cq+ 3*16]
pmulld m4, m15, [cq+ 4*16]
pmulld m5, m15, [cq+ 5*16]
pmulld m6, m15, [cq+ 6*16]
pmulld m7, m15, [cq+ 7*16]
pmulld m8, m15, [cq+ 8*16]
pmulld m9, m15, [cq+ 9*16]
pmulld m10, m15, [cq+10*16]
pmulld m11, m15, [cq+11*16]
pmulld m12, m15, [cq+12*16]
pmulld m13, m15, [cq+13*16]
pmulld m14, m15, [cq+14*16]
pmulld m15, [cq+15*16]
mova [cq+ 0*16], m15
mova m15, [o(pd_6144)]
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
paddd m15, [cq+ 0*16]
REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp m(idct_16x4_internal_16bpc).pack_transpose
%else
add cq, 8*16
mov r5d, 2
.loop_pass1:
mova m7, [o(pd_11586)]
pmulld m0, m7, [cq+0*16]
pmulld m1, m7, [cq+1*16]
pmulld m2, m7, [cq+2*16]
pmulld m3, m7, [cq+3*16]
pmulld m4, m7, [cq+4*16]
pmulld m5, m7, [cq+5*16]
pmulld m6, m7, [cq+6*16]
pmulld m7, [cq+7*16]
mova [cq+7*16], m7
mova m7, [o(pd_6144)]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
paddd m7, [cq+7*16]
REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
call m(idct_8x4_internal_16bpc).transpose4x8packed
dec r5d
jz .end_pass1
mova [rsp+gprsize+0*16], m0
mova [rsp+gprsize+1*16], m1
mova [rsp+gprsize+2*16], m2
mova [rsp+gprsize+3*16], m3
sub cq, 8*16
jmp .loop_pass1
.end_pass1:
jmp tx2q
%endif
.pass2:
%if ARCH_X86_64
mova m12, [o(pw_1697x8)]
%endif
lea r4, [o(.main)]
jmp m(idct_16x4_internal_16bpc).pass2_loop
.main:
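; identity4: x += pmulhrsw(x, 1697*8) = x*(1 + 1697/4096) ~= x*sqrt(2)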
%if ARCH_X86_64
pmulhrsw m4, m0, m12
pmulhrsw m5, m1, m12
pmulhrsw m6, m2, m12
pmulhrsw m7, m3, m12
%else
mova m7, [o(pw_1697x8)]
pmulhrsw m4, m0, m7
pmulhrsw m5, m1, m7
pmulhrsw m6, m2, m7
pmulhrsw m7, m3
%endif
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
ret
%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
%if ARCH_X86_64
INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16
%else
INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
add r5d, 128
sar r5d, 8
imul r5d, 181
%if ARCH_X86_32
add rsp, 1*16
%endif
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
%endif
%endmacro
INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity, 6
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_64
DECLARE_REG_TMP 6, 4, 6
%else
mov [rsp+gprsize+12*16], r1
DECLARE_REG_TMP 1, 4, 3
%endif
lea t0, [o(.main)]
.loop_main:
%undef cmp
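; r5d = byte offset of the last 4-column slice: 16 if eob >= 10, else 0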
%if ARCH_X86_64
xor r5d, r5d
cmp eobd, 10
setge r5b
%else
mov r5d, 1
cmp eobd, 10
sbb r5d, 0
%endif
shl r5d, 4
lea r3, [rsp+gprsize]
.loop_pass1:
call t0
%if ARCH_X86_64
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [cq+4*32+r5], m8
mova [cq+5*32+r5], m9
mova [cq+6*32+r5], m10
mova [cq+7*32+r5], m11
%else
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+4*32+r5], m0
mova [cq+5*32+r5], m1
mova [cq+6*32+r5], m2
mova [cq+7*32+r5], m3
mova m0, [rsp+gprsize+ 8*16]
mova m2, [rsp+gprsize+ 9*16]
mova m4, [rsp+gprsize+10*16]
mova m6, [rsp+gprsize+11*16]
%endif
call m(idct_8x4_internal_16bpc).transpose4x8packed
pxor m7, m7
REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
test r5d, r5d
jz .end
mova [cq+0*32+r5], m0
mova [cq+1*32+r5], m1
mova [cq+2*32+r5], m2
mova [cq+3*32+r5], m3
xor r5d, r5d
jmp .loop_pass1
.end:
jmp tx2q
.main:
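; one pass-1 slice of 4 columns: rect2 downscale (16x8 is non-square)
; followed by a full idct16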
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+ 1*32+r5]
mova m1, [cq+ 3*32+r5]
mova m2, [cq+ 5*32+r5]
mova m3, [cq+ 7*32+r5]
mova m4, [cq+ 9*32+r5]
mova m5, [cq+11*32+r5]
mova m6, [cq+13*32+r5]
mova m7, [cq+15*32+r5]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(idct_16x4_internal_16bpc).main_oddhalf
mova m0, [cq+ 0*32+r5]
mova m1, [cq+ 2*32+r5]
mova m2, [cq+ 4*32+r5]
mova m3, [cq+ 6*32+r5]
mova m4, [cq+ 8*32+r5]
mova m5, [cq+10*32+r5]
mova m6, [cq+12*32+r5]
mova m7, [cq+14*32+r5]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
call m(idct_16x4_internal_16bpc).round
%if ARCH_X86_64
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
%endif
ret
.pass2:
%if ARCH_X86_32
mov strideq, [rsp+gprsize+12*16]
%endif
mov r4d, 2
.pass2_main:
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
%endif
lea r3, [strideq*3]
jmp .loop_pass2_entry
.loop_pass2:
mova m0, [cq+0*32+ 0]
mova m1, [cq+1*32+ 0]
mova m2, [cq+2*32+ 0]
mova m3, [cq+3*32+ 0]
.loop_pass2_entry:
mova m4, [cq+0*32+16]
mova m5, [cq+1*32+16]
mova m6, [cq+2*32+16]
mova m7, [cq+3*32+16]
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
call m(idct_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
pxor m7, m7
%endif
REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
add dstq, 16
add cq, 4*32
dec r4d
jg .loop_pass2
RET
INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity, 6
cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
mov [rsp+gprsize+12*16], r1
%endif
lea t0, [o(.main)]
jmp m(idct_16x8_internal_16bpc).loop_main
.main:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+ 2*32+r5]
mova m1, [cq+13*32+r5]
mova m2, [cq+ 6*32+r5]
mova m3, [cq+ 9*32+r5]
mova m4, [cq+10*32+r5]
mova m5, [cq+ 5*32+r5]
mova m6, [cq+14*32+r5]
mova m7, [cq+ 1*32+r5]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(iadst_16x4_internal_16bpc).main_part1
mova m0, [cq+ 0*32+r5]
mova m1, [cq+15*32+r5]
mova m2, [cq+ 4*32+r5]
mova m3, [cq+11*32+r5]
mova m4, [cq+ 8*32+r5]
mova m5, [cq+ 7*32+r5]
mova m6, [cq+12*32+r5]
mova m7, [cq+ 3*32+r5]
%if ARCH_X86_32
add r3, 8*16
%endif
call m(idct_8x4_internal_16bpc).rect2_mul
%if ARCH_X86_32
sub r3, 8*16
%endif
call m(iadst_16x4_internal_16bpc).main_part2
call m(iadst_16x4_internal_16bpc).round
%if ARCH_X86_64
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
%endif
ret
.pass2:
%if ARCH_X86_32
mov strideq, [rsp+gprsize+12*16]
%endif
mov r4d, 2
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
mova m11, [o(pw_m2048)]
%endif
lea r3, [strideq*3]
jmp .loop_pass2_entry
.loop_pass2:
mova m0, [cq+0*32+ 0]
mova m1, [cq+1*32+ 0]
mova m2, [cq+2*32+ 0]
mova m3, [cq+3*32+ 0]
.loop_pass2_entry:
mova m4, [cq+0*32+16]
mova m5, [cq+1*32+16]
mova m6, [cq+2*32+16]
mova m7, [cq+3*32+16]
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
pxor m7, m7
%endif
REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
add dstq, 16
add cq, 4*32
dec r4d
jg .loop_pass2
RET
INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity, 6
cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
mov [rsp+gprsize+12*16], r1
%endif
lea t0, [o(.main)]
jmp m(idct_16x8_internal_16bpc).loop_main
.main:
call m(iadst_16x8_internal_16bpc).main
%if ARCH_X86_64
pshufd m1, m0, q1032
pshufd m3, m2, q1032
pshufd m5, m4, q1032
pshufd m7, m6, q1032
pshufd m0, m14, q1032
pshufd m2, m12, q1032
pshufd m4, m10, q1032
pshufd m6, m8, q1032
mova m14, m1
mova m12, m3
mova m10, m5
mova m8, m7
%else
pshufd m1, m0, q1032
pshufd m3, m2, q1032
pshufd m5, m4, q1032
pshufd m7, m6, q1032
pshufd m0, [r3+11*16], q1032
pshufd m2, [r3+10*16], q1032
pshufd m4, [r3+9*16], q1032
pshufd m6, [r3+8*16], q1032
mova [r3+8*16], m7
mova [r3+9*16], m5
mova [r3+10*16], m3
mova [r3+11*16], m1
%endif
ret
.pass2:
%if ARCH_X86_32
mov strideq, [rsp+gprsize+12*16]
%endif
lea dstq, [dstq+strideq*8]
neg strideq
add dstq, strideq
%if ARCH_X86_32
mov [rsp+gprsize+12*16], strideq
%endif
jmp m(iadst_16x8_internal_16bpc).pass2
INV_TXFM_16X8_FN identity, dct, -54
INV_TXFM_16X8_FN identity, adst, -54
INV_TXFM_16X8_FN identity, flipadst, -54
INV_TXFM_16X8_FN identity, identity
cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
mov [rsp+gprsize+12*16], r1
%endif
lea t0, [o(.main)]
jmp m(idct_16x8_internal_16bpc).loop_main
.main:
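; rect2 downscale, then identity16: x = (x*11586+6144)>>13, where
; 11586 = 2*sqrt(2) in .12 fixed point and the extra >>1 is the
; pass-1 shift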
%if ARCH_X86_64
mova m15, [o(pd_2896)]
pmulld m0, m15, [cq+ 0*32+r5]
pmulld m1, m15, [cq+ 1*32+r5]
pmulld m2, m15, [cq+ 2*32+r5]
pmulld m3, m15, [cq+ 3*32+r5]
pmulld m4, m15, [cq+ 4*32+r5]
pmulld m5, m15, [cq+ 5*32+r5]
pmulld m6, m15, [cq+ 6*32+r5]
pmulld m7, m15, [cq+ 7*32+r5]
pmulld m8, m15, [cq+ 8*32+r5]
pmulld m9, m15, [cq+ 9*32+r5]
pmulld m10, m15, [cq+10*32+r5]
pmulld m11, m15, [cq+11*32+r5]
pmulld m12, m15, [cq+12*32+r5]
pmulld m13, m15, [cq+13*32+r5]
pmulld m14, m15, [cq+14*32+r5]
pmulld m15, [cq+15*32+r5]
mova [r3], m15
mova m15, [o(pd_2048)]
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
paddd m15, [r3]
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
mova [r3], m15
mova m15, [o(pd_11586)]
REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
pmulld m15, [r3]
mova [r3], m15
mova m15, [o(pd_6144)]
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
paddd m15, [r3]
REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
%else
mova m0, [cq+ 0*32+r5]
mova m1, [cq+ 1*32+r5]
mova m2, [cq+ 2*32+r5]
mova m3, [cq+ 3*32+r5]
mova m4, [cq+ 4*32+r5]
mova m5, [cq+ 5*32+r5]
mova m6, [cq+ 6*32+r5]
mova m7, [cq+ 7*32+r5]
call m(idct_8x4_internal_16bpc).rect2_mul
mova [r3], m7
mova m7, [o(pd_11586)]
REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulld m7, [r3]
mova [r3], m7
mova m7, [o(pd_6144)]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
paddd m7, [r3]
REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
mova [r3+ 8*16], m0
mova [r3+ 9*16], m2
mova [r3+10*16], m4
mova [r3+11*16], m6
mova m0, [cq+ 8*32+r5]
mova m1, [cq+ 9*32+r5]
mova m2, [cq+10*32+r5]
mova m3, [cq+11*32+r5]
mova m4, [cq+12*32+r5]
mova m5, [cq+13*32+r5]
mova m6, [cq+14*32+r5]
mova m7, [cq+15*32+r5]
call m(idct_8x4_internal_16bpc).rect2_mul
mova [r3], m7
mova m7, [o(pd_11586)]
REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulld m7, [r3]
mova [r3], m7
mova m7, [o(pd_6144)]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
paddd m7, [r3]
REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
%endif
ret
.pass2:
%if ARCH_X86_32
mov strideq, [rsp+gprsize+12*16]
%endif
mov r4d, 2
%if ARCH_X86_64
mova m8, [o(pw_4096)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
%endif
lea r3, [strideq*3]
jmp .loop_pass2_entry
.loop_pass2:
mova m0, [cq+0*32+ 0]
mova m1, [cq+1*32+ 0]
mova m2, [cq+2*32+ 0]
mova m3, [cq+3*32+ 0]
.loop_pass2_entry:
mova m4, [cq+0*32+16]
mova m5, [cq+1*32+16]
mova m6, [cq+2*32+16]
mova m7, [cq+3*32+16]
%if ARCH_X86_64
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
%else
mova [rsp+gprsize], m7
mova m7, [o(pw_4096)]
call m(idct_8x8_internal_16bpc).round4_and_write_8x8
%endif
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
pxor m7, m7
%endif
REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
add dstq, 16
add cq, 4*32
dec r4d
jg .loop_pass2
RET
%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
%if ARCH_X86_64
INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
%else
INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
add r5d, 640
sar r5d, 10
add rsp, (5+ARCH_X86_64*3+WIN64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
%endif
%endmacro
INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity, v
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_64
DECLARE_REG_TMP 6, 7
%if WIN64
mov [rsp+16*16+gprsize], r7
%endif
%elif ARCH_X86_32
DECLARE_REG_TMP 1, 6
mov [rsp+16*16+gprsize*1], r1
mov [rsp+16*16+gprsize*2], r6
%endif
lea t0, [o(.main)]
.pass1_full:
%undef cmp
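; scan the eob table in r5 for the last 4-column slice that still
; contains coefficients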
mov t1d, 4
.zero_loop:
dec t1d
cmp eobb, byte [r5+t1]
jb .zero_loop
mov r5d, t1d
shl r5d, 4
%if ARCH_X86_32
; restore pic-ptr
mov r6, [rsp+16*16+2*gprsize]
%endif
; setup stack pointer
lea r3, [rsp+gprsize]
.loop_pass1:
call t0
%if ARCH_X86_64
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [cq+4*64+r5], m8
mova [cq+5*64+r5], m9
mova [cq+6*64+r5], m10
mova [cq+7*64+r5], m11
%else
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+4*64+r5], m0
mova [cq+5*64+r5], m1
mova [cq+6*64+r5], m2
mova [cq+7*64+r5], m3
mova m0, [rsp+gprsize+ 8*16]
mova m2, [rsp+gprsize+ 9*16]
mova m4, [rsp+gprsize+10*16]
mova m6, [rsp+gprsize+11*16]
%endif
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+0*64+r5], m0
mova [cq+1*64+r5], m1
mova [cq+2*64+r5], m2
mova [cq+3*64+r5], m3
pxor m0, m0
REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
sub r5d, 16
jge .loop_pass1
%if ARCH_X86_32
; restore pic-ptr
mov r1, [rsp+16*16+1*gprsize]
%endif
jmp tx2q
.main:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+ 1*64+r5]
mova m1, [cq+ 3*64+r5]
mova m2, [cq+ 5*64+r5]
mova m3, [cq+ 7*64+r5]
mova m4, [cq+ 9*64+r5]
mova m5, [cq+11*64+r5]
mova m6, [cq+13*64+r5]
mova m7, [cq+15*64+r5]
call m(idct_16x4_internal_16bpc).main_oddhalf
mova m0, [cq+ 0*64+r5]
mova m1, [cq+ 2*64+r5]
mova m2, [cq+ 4*64+r5]
mova m3, [cq+ 6*64+r5]
mova m4, [cq+ 8*64+r5]
mova m5, [cq+10*64+r5]
mova m6, [cq+12*64+r5]
mova m7, [cq+14*64+r5]
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
call .round
%if ARCH_X86_64
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
%endif
ret
.round:
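; like m(idct_16x4_internal_16bpc).round, but with a +2 bias and >>2
; for the 16x16 pass-1 shift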
%if ARCH_X86_64
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
psrld m8, m11, 10 ; 2
REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
mova m9, [r3+2*16]
mova m10, [r3+3*16]
mova m11, [r3+4*16]
mova m12, [r3+5*16]
mova m13, [r3+6*16]
mova m14, [r3+7*16]
psubd m15, m0, m14 ; out15
paddd m0, m14 ; out0
psubd m14, m1, m13 ; out14
paddd m1, m13 ; out1
psubd m13, m2, m12 ; out13
paddd m2, m12 ; out2
psubd m12, m3, m11 ; out12
paddd m3, m11 ; out3
psubd m11, m4, m10 ; out11
paddd m4, m10 ; out4
psubd m10, m5, m9 ; out10
paddd m5, m9 ; out5
psubd m9, m6, m8 ; out9
paddd m6, m8 ; out6
psubd m8, m7, [r3+0*16] ; out8
paddd m7, [r3+0*16] ; out7
REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
; out0-15 are now in m0-15
%else
mova [r3+ 0*16], m0
mova m0, [o(clip_18b_min)]
REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
pmaxsd m0, [r3+ 0*16]
mova [r3+ 0*16], m7
mova m7, [o(clip_18b_max)]
REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
pminsd m7, [r3+ 0*16]
mova [r3+ 0*16], m0
mova m0, [o(pd_2)]
REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
paddd m0, [r3+ 0*16]
mova [r3+ 0*16], m0
mova [r3+ 1*16], m1
mova [r3+ 2*16], m2
mova m1, [r3+11*16]
mova m2, [r3+10*16]
psubd m0, m7, m1
paddd m7, m1
psubd m1, m6, m2
paddd m6, m2
REPX {psrad x, 2}, m0, m1, m6, m7
packssdw m0, m1 ; out8-9
packssdw m6, m7 ; out6-7
mova [r3+11*16], m6
mova m1, [r3+9*16]
mova m7, [r3+8*16]
psubd m2, m5, m1
paddd m5, m1
psubd m1, m4, m7
paddd m4, m7
REPX {psrad x, 2}, m2, m1, m4, m5
packssdw m2, m1 ; out10-11
packssdw m4, m5 ; out4-5
mova m1, [r3+2*16]
mova [r3+10*16], m4
mova m6, [r3+7*16]
mova m7, [r3+6*16]
psubd m4, m3, m6
paddd m3, m6
psubd m6, m1, m7
paddd m1, m7
REPX {psrad x, 2}, m4, m6, m1, m3
packssdw m4, m6 ; out12-13
packssdw m1, m3 ; out2-3
mova m3, [r3+1*16]
mova [r3+9*16], m1
mova m1, [r3+0*16]
mova m5, [r3+5*16]
mova m7, [r3+4*16]
psubd m6, m3, m5
paddd m3, m5
psubd m5, m1, m7
paddd m1, m7
REPX {psrad x, 2}, m6, m5, m1, m3
packssdw m6, m5 ; out14-15
packssdw m1, m3 ; out0-1
mova [r3+8*16], m1
%endif
ret
.pass2:
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
mov r7, dstq
%else
mov [rsp+2*gprsize+16*16], dstq
%endif
lea r3, [strideq*3]
mov r4d, 2
.loop_pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
mova m0, [cq+0*64+ 0]
mova m1, [cq+2*64+ 0]
mova m2, [cq+0*64+16]
mova m3, [cq+2*64+16]
mova m4, [cq+0*64+32]
mova m5, [cq+2*64+32]
mova m6, [cq+0*64+48]
mova m7, [cq+2*64+48]
call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
mova [rsp+gprsize+3*16], m0
mova [rsp+gprsize+4*16], m1
mova [rsp+gprsize+5*16], m2
mova [rsp+gprsize+6*16], m3
mova [rsp+gprsize+7*16], m4
mova [rsp+gprsize+8*16], m5
mova [rsp+gprsize+9*16], m6
; m7 is already stored in [rsp+gprsize+0*16]
mova m0, [cq+1*64+ 0]
mova m1, [cq+3*64+ 0]
mova m2, [cq+1*64+16]
mova m3, [cq+3*64+16]
mova m4, [cq+1*64+32]
mova m5, [cq+3*64+32]
mova m6, [cq+1*64+48]
mova m7, [cq+3*64+48]
call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
; out0-7 are in rsp+gprsize+[3-10]*mmsize
; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
%if ARCH_X86_64
lea dstq, [r7+strideq*8]
%else
mov dstq, [rsp+2*gprsize+16*16]
lea dstq, [dstq+strideq*8]
%endif
call m(idct_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
mov dstq, r7
%else
mov dstq, [rsp+2*gprsize+16*16]
%endif
mova m0, [rsp+gprsize+ 3*16]
mova m1, [rsp+gprsize+ 4*16]
mova m2, [rsp+gprsize+ 5*16]
mova m3, [rsp+gprsize+ 6*16]
mova m4, [rsp+gprsize+ 7*16]
mova m5, [rsp+gprsize+ 8*16]
mova m6, [rsp+gprsize+ 9*16]
mova m7, [rsp+gprsize+10*16]
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
%if ARCH_X86_64
add r7, 16
%define mzero m9
%else
add dword [rsp+2*gprsize+16*16], 16
%define mzero m7
pxor m7, m7
%endif
REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
add cq, 64*4
REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
%undef mzero
dec r4d
jg .loop_pass2
%if WIN64
mov r7, [rsp+16*16+gprsize]
%endif
RET
INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst
cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
mov [rsp+16*16+gprsize], r7
%elif ARCH_X86_32
mov [rsp+16*16+gprsize*1], r1
mov [rsp+16*16+gprsize*2], r6
%endif
lea t0, [o(.main)]
jmp m(idct_16x16_internal_16bpc).pass1_full
.main:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+ 2*64+r5]
mova m1, [cq+13*64+r5]
mova m2, [cq+ 6*64+r5]
mova m3, [cq+ 9*64+r5]
mova m4, [cq+10*64+r5]
mova m5, [cq+ 5*64+r5]
mova m6, [cq+14*64+r5]
mova m7, [cq+ 1*64+r5]
call m(iadst_16x4_internal_16bpc).main_part1
mova m0, [cq+ 0*64+r5]
mova m1, [cq+15*64+r5]
mova m2, [cq+ 4*64+r5]
mova m3, [cq+11*64+r5]
mova m4, [cq+ 8*64+r5]
mova m5, [cq+ 7*64+r5]
mova m6, [cq+12*64+r5]
mova m7, [cq+ 3*64+r5]
call m(iadst_16x4_internal_16bpc).main_part2
call .round
%if ARCH_X86_64
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
%endif
ret
.round:
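; like m(iadst_16x4_internal_16bpc).round, but with +2/>>2 and
; +10240/>>14 for the 16x16 pass-1 shift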
%if ARCH_X86_64
pcmpeqd m8, m8 ; -1
mova m15, [o(pd_10240)]
psrld m14, 10 ; +2
psubd m13, m14, m8 ; +3
REPX {pxor x, m8 }, m1, m3, m5, m7
REPX {paddd x, m14}, m0, m2
REPX {paddd x, m13}, m1, m3
REPX {paddd x, m15}, m4, m5, m6, m7
paddd m13, m15, m8 ; +10239
paddd m8, m15, m9
psubd m9, m13, m10
paddd m10, m15, m11
psubd m11, m13, m12
paddd m12, m14, [r3+3*16]
psubd m13, m14, [r3+2*16]
psubd m15, m14, [r3+0*16]
paddd m14, [r3+1*16]
REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
%else
mova [r3+8*16], m1
mova [r3+9*16], m3
mova m3, [o(pd_10240)]
pcmpeqd m1, m1
REPX {pxor x, m1}, m5, m7
REPX {paddd x, m3}, m4, m5, m6, m7
REPX {psrad x, 14}, m4, m5, m6, m7
packssdw m4, m5
packssdw m6, m7
mova [r3+10*16], m4
mova [r3+11*16], m6
mova m4, [r3+4*16]
mova m5, [r3+5*16]
mova m6, [r3+6*16]
mova m7, [r3+7*16]
mova m3, [o(pd_2)]
REPX {pxor x, m1}, m5, m7
REPX {paddd x, m3}, m4, m6
psubd m3, m1
REPX {paddd x, m3}, m5, m7
REPX {psrad x, 2 }, m4, m5, m6, m7
packssdw m4, m5
packssdw m6, m7
mova m5, [r3+8*16]
mova m7, [r3+9*16]
mova [r3+8*16], m4
mova [r3+9*16], m6
mova m3, [o(pd_10240)]
REPX {pxor x, m1}, m5, m7
REPX {paddd x, m3}, m0, m5, m2, m7
REPX {psrad x, 14}, m0, m5, m2, m7
packssdw m0, m5
packssdw m2, m7
mova m4, [r3+0*16]
mova m5, [r3+1*16]
mova m6, [r3+2*16]
mova m7, [r3+3*16]
mova m3, [o(pd_2)]
REPX {pxor x, m1}, m5, m7
REPX {paddd x, m3}, m4, m6
psubd m3, m1
REPX {paddd x, m3}, m5, m7
REPX {psrad x, 2 }, m4, m5, m6, m7
packssdw m4, m5
packssdw m6, m7
%endif
ret
.pass2:
%if ARCH_X86_64
mova m8, [o(pw_2048)]
mova m11, [o(pw_m2048)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
mov r7, dstq
%else
mov [rsp+2*gprsize+16*16], dstq
%endif
lea r3, [strideq*3]
mov r4d, 2
.loop_pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
mova m0, [cq+0*64+32]
mova m1, [cq+1*64+32]
mova m2, [cq+2*64+16]
mova m3, [cq+3*64+16]
mova m4, [cq+0*64+ 0]
mova m5, [cq+1*64+ 0]
mova m6, [cq+2*64+48]
mova m7, [cq+3*64+48]
mova [rsp+gprsize+3*16], m0
mova [rsp+gprsize+4*16], m1
mova [rsp+gprsize+5*16], m2
mova [rsp+gprsize+6*16], m3
mova [rsp+gprsize+7*16], m4
mova [rsp+gprsize+8*16], m5
mova [rsp+gprsize+9*16], m6
mova [rsp+gprsize+10*16], m7
mova m0, [cq+2*64+ 0]
mova m1, [cq+3*64+ 0]
mova m2, [cq+0*64+16]
mova m3, [cq+1*64+16]
mova m4, [cq+2*64+32]
mova m5, [cq+3*64+32]
mova m6, [cq+0*64+48]
mova m7, [cq+1*64+48]
call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
; out0-7 are in rsp+gprsize+[3-10]*mmsize
; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
%if ARCH_X86_64
lea dstq, [r7+strideq*8]
%else
mov dstq, [rsp+2*gprsize+16*16]
lea dstq, [dstq+strideq*8]
%endif
call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
mov dstq, r7
%else
mov dstq, [rsp+2*gprsize+16*16]
%endif
mova m0, [rsp+gprsize+ 3*16]
mova m1, [rsp+gprsize+ 4*16]
mova m2, [rsp+gprsize+ 5*16]
mova m3, [rsp+gprsize+ 6*16]
mova m4, [rsp+gprsize+ 7*16]
mova m5, [rsp+gprsize+ 8*16]
mova m6, [rsp+gprsize+ 9*16]
mova m7, [rsp+gprsize+10*16]
call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
%if ARCH_X86_64
add r7, 16
%define mzero m9
%else
add dword [rsp+2*gprsize+16*16], 16
%define mzero m7
pxor m7, m7
%endif
REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
add cq, 64*4
REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
%undef mzero
dec r4d
jg .loop_pass2
%if WIN64
mov r7, [rsp+16*16+gprsize]
%endif
RET
INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst
cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
mov [rsp+16*16+gprsize], r7
%elif ARCH_X86_32
mov [rsp+16*16+gprsize*1], r1
mov [rsp+16*16+gprsize*2], r6
%endif
lea t0, [o(.main)]
jmp m(idct_16x16_internal_16bpc).pass1_full
.main:
call m(iadst_16x16_internal_16bpc).main
%if ARCH_X86_64
mova m1, m0
mova m3, m2
mova m5, m4
mova m7, m6
pshufd m0, m14, q1032
pshufd m2, m12, q1032
pshufd m4, m10, q1032
pshufd m6, m8, q1032
pshufd m8, m7, q1032
pshufd m10, m5, q1032
pshufd m12, m3, q1032
pshufd m14, m1, q1032
%else
pshufd m1, m0, q1032
pshufd m3, m2, q1032
pshufd m5, m4, q1032
pshufd m7, m6, q1032
pshufd m0, [r3+11*16], q1032
pshufd m2, [r3+10*16], q1032
pshufd m4, [r3+9*16], q1032
pshufd m6, [r3+8*16], q1032
mova [r3+11*16], m1
mova [r3+10*16], m3
mova [r3+ 9*16], m5
mova [r3+ 8*16], m7
%endif
ret
.pass2:
lea r3, [strideq*3]
lea r3, [r3*5]
add dstq, r3
neg strideq
jmp m(iadst_16x16_internal_16bpc).pass2
INV_TXFM_16X16_FN identity, dct, h
INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
mov [rsp+16*16+gprsize], r7
%elif ARCH_X86_32
mov [rsp+16*16+gprsize*1], r1
mov [rsp+16*16+gprsize*2], r6
%endif
lea t0, [o(.main)]
jmp m(idct_16x16_internal_16bpc).pass1_full
.main:
%if ARCH_X86_64
mova m15, [o(pd_11586)]
pmulld m0, m15, [cq+ 0*64+r5]
pmulld m1, m15, [cq+ 1*64+r5]
pmulld m2, m15, [cq+ 2*64+r5]
pmulld m3, m15, [cq+ 3*64+r5]
pmulld m4, m15, [cq+ 4*64+r5]
pmulld m5, m15, [cq+ 5*64+r5]
pmulld m6, m15, [cq+ 6*64+r5]
pmulld m7, m15, [cq+ 7*64+r5]
pmulld m8, m15, [cq+ 8*64+r5]
pmulld m9, m15, [cq+ 9*64+r5]
pmulld m10, m15, [cq+10*64+r5]
pmulld m11, m15, [cq+11*64+r5]
pmulld m12, m15, [cq+12*64+r5]
pmulld m13, m15, [cq+13*64+r5]
pmulld m14, m15, [cq+14*64+r5]
pmulld m15, [cq+15*64+r5]
mova [r3], m15
mova m15, [o(pd_10240)]
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
paddd m15, [r3]
REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
%else
mova m7, [o(pd_11586)]
pmulld m0, m7, [cq+ 0*64+r5]
pmulld m1, m7, [cq+ 1*64+r5]
pmulld m2, m7, [cq+ 2*64+r5]
pmulld m3, m7, [cq+ 3*64+r5]
pmulld m4, m7, [cq+ 4*64+r5]
pmulld m5, m7, [cq+ 5*64+r5]
pmulld m6, m7, [cq+ 6*64+r5]
pmulld m7, [cq+ 7*64+r5]
mova [r3], m7
mova m7, [o(pd_10240)]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
paddd m7, [r3]
REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
mova [r3+8*16], m0
mova [r3+9*16], m2
mova [r3+10*16], m4
mova [r3+11*16], m6
mova m7, [o(pd_11586)]
pmulld m0, m7, [cq+ 8*64+r5]
pmulld m1, m7, [cq+ 9*64+r5]
pmulld m2, m7, [cq+10*64+r5]
pmulld m3, m7, [cq+11*64+r5]
pmulld m4, m7, [cq+12*64+r5]
pmulld m5, m7, [cq+13*64+r5]
pmulld m6, m7, [cq+14*64+r5]
pmulld m7, [cq+15*64+r5]
mova [r3], m7
mova m7, [o(pd_10240)]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
paddd m7, [r3]
REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
%endif
ret
.pass2:
%if ARCH_X86_64
mova m4, [o(pw_2048)]
mova m5, [o(pixel_10bpc_max)]
pxor m6, m6
mova m7, [o(pw_1697x16)]
mov r7, dstq
%else
mov [rsp+2*gprsize+16*16], dstq
%endif
mov r5d, 4
lea r3, [strideq*3]
.pass2_loop:
mova m0, [cq+0*64+0]
mova m1, [cq+1*64+0]
mova m2, [cq+2*64+0]
mova m3, [cq+3*64+0]
call m(iidentity_8x16_internal_16bpc).main
%if ARCH_X86_64
call m(idct_8x4_internal_16bpc).round1_and_write_8x4
%else
call m(idct_8x4_internal_16bpc).round2_and_write_8x4
%endif
REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
add cq, 16
lea dstq, [dstq+strideq*4]
dec r5w
jg .pass2_loop
add cq, 64*3
btc r5d, 16
jc .end
%if ARCH_X86_64
lea dstq, [r7+16]
%else
mov dstq, [rsp+2*gprsize+16*16]
add dstq, 16
%endif
add r5d, 4
jmp .pass2_loop
.end:
%if WIN64
mov r7, [rsp+16*16+gprsize]
%endif
RET
cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
%if ARCH_X86_32
LEA r6, $$
%endif
mova m5, [o(pw_5)]
mova m7, [o(pixel_10bpc_max)]
pxor m6, m6
mov r5d, eobd
add eobb, 21
cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192
lea r4, [strideq*3]
.loop:
mova m0, [cq+128*0]
packssdw m0, [cq+128*1]
mova m1, [cq+128*2]
packssdw m1, [cq+128*3]
mova m2, [cq+128*4]
packssdw m2, [cq+128*5]
mova m3, [cq+128*6]
packssdw m3, [cq+128*7]
REPX {paddsw x, m5}, m0, m1, m2, m3
REPX {psraw x, 3 }, m0, m1, m2, m3
call .main_zero
add cq, 16
lea dstq, [dstq+strideq*4]
btc eobd, 16
jnc .loop
sub eobd, 64
jge .loop
RET
ALIGN function_align
.main_zero:
REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
.main:
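; transpose the 8x4 word block to 4x8, add into dst and clamp to
; [0, pixel_10bpc_max]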
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpckhwd m3, m0, m4
punpcklwd m0, m4
punpckhwd m4, m2, m1
punpcklwd m2, m1
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
paddw m0, [dstq+strideq*0]
paddw m1, [dstq+strideq*1]
paddw m2, [dstq+strideq*2]
paddw m3, [dstq+r4 ]
REPX {pmaxsw x, m6}, m0, m1, m2, m3
REPX {pminsw x, m7}, m0, m1, m2, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r4 ], m3
ret
cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
%if ARCH_X86_32
LEA r6, $$
%endif
mova m5, [o(pw_4096)]
mova m7, [o(pixel_10bpc_max)]
pxor m6, m6
mov r4d, eobd
add eobb, 21
cmovc eobd, r4d
lea r4, [strideq*3]
mov r5, dstq
.loop:
mova m0, [cq+32*0]
packssdw m0, [cq+32*1]
mova m1, [cq+32*2]
packssdw m1, [cq+32*3]
mova m2, [cq+32*4]
packssdw m2, [cq+32*5]
mova m3, [cq+32*6]
packssdw m3, [cq+32*7]
REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
call m(inv_txfm_add_identity_identity_8x32_16bpc).main
lea dstq, [dstq+strideq*4]
add cq, 16
btc eobd, 16
jnc .loop
add cq, 32*8-32
add r5, 16
mov dstq, r5
sub eobd, 64
jge .loop
RET
cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
%if ARCH_X86_32
LEA r6, $$
%else
mova m8, [o(pw_2896x8)]
mova m9, [o(pw_1697x16)]
mova m11, [o(pw_8192)]
%endif
mova m7, [o(pixel_10bpc_max)]
lea r4, [strideq*3]
pxor m6, m6
%if ARCH_X86_64
paddw m10, m11, m11 ; pw_16384
%endif
mov r5, dstq
call .main
sub eobd, 36
jl .ret
add cq, 128*8-32
lea dstq, [r5+16]
call .main
sub cq, 128*8
lea dstq, [r5+strideq*8]
mov r5, dstq
call .main
sub eobd, 107 ; eob < 143
jl .ret
add cq, 128*8-32
lea dstq, [r5+16]
call .main
sub cq, 128*8
lea dstq, [r5+strideq*8]
mov r5, dstq
call .main
sub eobd, 128 ; eob < 271
jl .ret
add cq, 128*8-32
lea dstq, [r5+16]
call .main
sub cq, 128*8
lea dstq, [r5+strideq*8]
mov r5, dstq
call .main
sub eobd, 128 ; eob < 399
jl .ret
add cq, 128*8-32
lea dstq, [r5+16]
call .main
.ret:
RET
ALIGN function_align
.main:
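; rect2 via pw_2896x8, then identity16: x += x*1697/4096 (the
; pw_16384 multiply halves the 1697/2048 product), with a final
; pw_8192 rounding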
mova m0, [cq+128*0]
packssdw m0, [cq+128*1]
mova m1, [cq+128*2]
packssdw m1, [cq+128*3]
mova m2, [cq+128*4]
packssdw m2, [cq+128*5]
mova m3, [cq+128*6]
packssdw m3, [cq+128*7]
%if ARCH_X86_64
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
pmulhrsw m4, m9, m0
pmulhrsw m5, m9, m1
REPX {pmulhrsw x, m10}, m4, m5
%else
mova m6, [o(pw_2896x8)]
REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
mova m5, [o(pw_1697x16)]
pmulhrsw m4, m5, m0
pmulhrsw m5, m1
mova m6, [o(pw_16384)]
REPX {pmulhrsw x, m6 }, m4, m5
%endif
paddsw m0, m4
paddsw m1, m5
%if ARCH_X86_64
pmulhrsw m4, m9, m2
pmulhrsw m5, m9, m3
REPX {pmulhrsw x, m10}, m4, m5
%else
mova m5, [o(pw_1697x16)]
pmulhrsw m4, m5, m2
pmulhrsw m5, m3
REPX {pmulhrsw x, m6 }, m4, m5
%endif
paddsw m2, m4
paddsw m3, m5
%if ARCH_X86_64
REPX {pmulhrsw x, m11}, m0, m1, m2, m3
%else
psrlw m6, 1 ; pw_8192
REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
pxor m6, m6
%endif
call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
lea dstq, [dstq+strideq*4]
add cq, 16
btc eobd, 16
jnc .main
ret
cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
%if ARCH_X86_32
LEA r6, $$
%else
mova m8, [o(pw_2896x8)]
mova m9, [o(pw_1697x16)]
mova m10, [o(pw_2048)]
%endif
mova m7, [o(pixel_10bpc_max)]
lea r4, [strideq*3]
pxor m6, m6
mov r5, dstq
call .main
sub eobd, 36
jl .ret
call .main
add cq, 64*8-64
lea dstq, [r5+16*1]
call .main
sub eobd, 107 ; eob < 143
jl .ret
call .main
add cq, 64*8-64
lea dstq, [r5+16*2]
call .main
sub eobd, 128 ; eob < 271
jl .ret
call .main
add cq, 64*8-64
lea dstq, [r5+16*3]
call .main
sub eobd, 128 ; eob < 399
jl .ret
call .main
.ret:
RET
ALIGN function_align
.main:
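; rect2 via pw_2896x8, then identity16 on doubled values:
; 4*x + (2*x)*1697/2048 = 2*sqrt(2) * 2x, with a final pw_2048
; rounding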
mova m0, [cq+64*0]
packssdw m0, [cq+64*1]
mova m1, [cq+64*2]
packssdw m1, [cq+64*3]
mova m2, [cq+64*4]
packssdw m2, [cq+64*5]
mova m3, [cq+64*6]
packssdw m3, [cq+64*7]
%if ARCH_X86_64
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
%else
mova m6, [o(pw_2896x8)]
REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
%endif
REPX {paddsw x, x }, m0, m1, m2, m3
%if ARCH_X86_64
pmulhrsw m4, m9, m0
pmulhrsw m5, m9, m1
%else
mova m6, [o(pw_1697x16)]
pmulhrsw m4, m6, m0
pmulhrsw m5, m6, m1
%endif
REPX {paddsw x, x }, m0, m1
paddsw m0, m4
paddsw m1, m5
%if ARCH_X86_64
pmulhrsw m4, m9, m2
pmulhrsw m5, m9, m3
%else
pmulhrsw m4, m6, m2
pmulhrsw m6, m3
%endif
REPX {paddsw x, x }, m2, m3
paddsw m2, m4
%if ARCH_X86_64
paddsw m3, m5
REPX {pmulhrsw x, m10}, m0, m1, m2, m3
%else
paddsw m3, m6
mova m6, [o(pw_2048)]
REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
pxor m6, m6
%endif
REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
call m(inv_txfm_add_identity_identity_8x32_16bpc).main
lea dstq, [dstq+strideq*4]
add cq, 16
btc eobd, 16
jnc .main
ret
cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
%undef cmp
%if ARCH_X86_32
LEA r6, $$
%endif
mova m5, [o(pw_8192)]
mova m7, [o(pixel_10bpc_max)]
pxor m6, m6
lea r4, [strideq*3]
mov r5, dstq
call .main ; 0
cmp eobd, 36
jl .ret
add cq, 128*8-32 ; 0 1
lea dstq, [r5+16] ; 1
call .main
call .main2
cmp eobd, 136
jl .ret
add cq, 128*16-64 ; 0 1 2
lea dstq, [r5+16*2] ; 1 2
call .main ; 2
call .main2
call .main2
cmp eobd, 300
jl .ret
add cq, 128*24-96 ; 0 1 2 3
add r5, 16*3 ; 1 2 3
mov dstq, r5 ; 2 3
call .main ; 3
call .main2
call .main2
call .main2
cmp eobd, 535
jl .ret
add cq, 128*24-96 ; 0 1 2 3
lea dstq, [r5+strideq*8] ; 1 2 3 4
mov r5, dstq ; 2 3 4
call .main ; 3 4
call .main2
call .main2
cmp eobd, 755
jl .ret
add cq, 128*16-64 ; 0 1 2 3
lea dstq, [r5+strideq*8] ; 1 2 3 4
mov r5, dstq ; 2 3 4 5
call .main ; 3 4 5
call .main2
cmp eobd, 911
jl .ret
add cq, 128*8-32 ; 0 1 2 3
lea dstq, [r5+strideq*8] ; 1 2 3 4
call .main ; 2 3 4 5
.ret: ; 3 4 5 6
RET
ALIGN function_align
.main2:
sub cq, 128*8
sub dstq, 16
.main:
mova m0, [cq+128*0]
packssdw m0, [cq+128*1]
mova m1, [cq+128*2]
packssdw m1, [cq+128*3]
mova m2, [cq+128*4]
packssdw m2, [cq+128*5]
mova m3, [cq+128*6]
packssdw m3, [cq+128*7]
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
lea dstq, [dstq+strideq*4]
add cq, 16
btc eobd, 16
jnc .main
ret
cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
dst, stride, c, eob
%if ARCH_X86_32
LEA r6, $$
%define base $$
DECLARE_REG_TMP 0, 4
%else
lea r6, [tbl_Nx32_odd_offset]
%define base tbl_Nx32_odd_offset
DECLARE_REG_TMP 4, 7
%if WIN64
mov [rsp+gprsize*1+35*16], r7
%endif
%endif
%define o2(x) r6-base+x
test eobd, eobd
jz .dconly
%if ARCH_X86_32
mov [rsp+gprsize*1+35*16], r0
%endif
%undef cmp
; remove entirely-zero iterations
mov r5d, 7*2
cmp eobw, word [o2(tbl_8x32_2d)+r5]
jge .end_zero_loop
pxor m0, m0
.zero_loop:
movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
movzx t1d, t0b
shr t0d, 8
mova [rsp+ 3*16+r5*8], m0
mova [rsp+11*16+r5*8], m0
mova [rsp+ 3*16+t0*8], m0
mova [rsp+ 3*16+t1*8], m0
sub r5d, 2
cmp eobw, word [o2(tbl_8x32_2d)+r5]
jl .zero_loop
.end_zero_loop:
; actual first pass after skipping all-zero data
mov [rsp+gprsize*0+35*16], eobd
mov r3, rsp
.loop_pass1:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+0*128+r5*8]
mova m1, [cq+1*128+r5*8]
mova m2, [cq+2*128+r5*8]
mova m3, [cq+3*128+r5*8]
mova m4, [cq+4*128+r5*8]
mova m5, [cq+5*128+r5*8]
mova m6, [cq+6*128+r5*8]
mova m7, [cq+7*128+r5*8]
call m(idct_8x4_internal_16bpc).main_pass1
mova m1, [o(pd_2)]
REPX {paddd x, m1}, m0, m6, m5, m3
call m(idct_8x4_internal_16bpc).round
REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
call m(idct_8x4_internal_16bpc).transpose4x8packed
movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
movzx t1d, t0b
shr t0d, 8
mova [r3+ 3*16+r5*8], m0
mova [r3+11*16+r5*8], m2
mova [r3+ 3*16+t1*8], m1
mova [r3+ 3*16+t0*8], m3
pxor m7, m7
REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
sub r5d, 2
jge .loop_pass1
; pass 2 code starts here
; m0 is already loaded from last iteration of first pass
%if ARCH_X86_32
mov r0, [rsp+gprsize*1+35*16]
%endif
mov eobd, [rsp+gprsize*0+35*16]
cmp eobd, 43
jl .load_veryfast
cmp eobd, 107
jl .load_fast
; load normal
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
jmp .run
.load_fast:
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
jmp .run
.load_veryfast:
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
; fall-through
.run:
call .pass2
%if WIN64
mov r7, [rsp+gprsize*1+35*16]
%endif
RET
.pass2:
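; pass 2 over rows: 8bpc idct8 and idct16 kernels for the
; lower-frequency rows, then the eob-selected idct32 tail in r4;
; output is written as four 8x8 blocks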
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
mova m1, [rsp+gprsize+16* 4]
mova m2, [rsp+gprsize+16* 5]
mova m3, [rsp+gprsize+16* 6]
mova m4, [rsp+gprsize+16* 7]
mova m5, [rsp+gprsize+16* 8]
mova m6, [rsp+gprsize+16* 9]
mova m7, [rsp+gprsize+16*10]
call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
mova [rsp+gprsize+ 3*16], m0
mova [rsp+gprsize+ 4*16], m1
mova [rsp+gprsize+ 5*16], m2
mova [rsp+gprsize+ 6*16], m3
mova [rsp+gprsize+ 7*16], m4
mova [rsp+gprsize+ 8*16], m5
mova [rsp+gprsize+ 9*16], m6
mova m0, [rsp+gprsize+11*16]
mova m1, [rsp+gprsize+12*16]
mova m2, [rsp+gprsize+13*16]
mova m3, [rsp+gprsize+14*16]
mova m4, [rsp+gprsize+15*16]
mova m5, [rsp+gprsize+16*16]
mova m6, [rsp+gprsize+17*16]
mova m7, [rsp+gprsize+18*16]
call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
mova m7, [rsp+gprsize+ 0*16]
mova [rsp+gprsize+11*16], m0
mova [rsp+gprsize+12*16], m1
mova [rsp+gprsize+13*16], m2
mova [rsp+gprsize+14*16], m3
mova [rsp+gprsize+15*16], m4
mova [rsp+gprsize+16*16], m5
mova [rsp+gprsize+17*16], m6
mova [rsp+gprsize+18*16], m7
call r4
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
%endif
lea r3, [strideq*3]
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
lea dstq, [dstq+strideq*8]
mova m0, [rsp+gprsize+11*16]
mova m1, [rsp+gprsize+12*16]
mova m2, [rsp+gprsize+13*16]
mova m3, [rsp+gprsize+14*16]
mova m4, [rsp+gprsize+15*16]
mova m5, [rsp+gprsize+16*16]
mova m6, [rsp+gprsize+17*16]
mova m7, [rsp+gprsize+18*16]
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
lea dstq, [dstq+strideq*8]
mova m0, [rsp+gprsize+19*16]
mova m1, [rsp+gprsize+20*16]
mova m2, [rsp+gprsize+21*16]
mova m3, [rsp+gprsize+22*16]
mova m4, [rsp+gprsize+23*16]
mova m5, [rsp+gprsize+24*16]
mova m6, [rsp+gprsize+25*16]
mova m7, [rsp+gprsize+26*16]
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
lea dstq, [dstq+strideq*8]
mova m0, [rsp+gprsize+27*16]
mova m1, [rsp+gprsize+28*16]
mova m2, [rsp+gprsize+29*16]
mova m3, [rsp+gprsize+30*16]
mova m4, [rsp+gprsize+31*16]
mova m5, [rsp+gprsize+32*16]
mova m6, [rsp+gprsize+33*16]
mova m7, [rsp+gprsize+34*16]
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
ret
.dconly:
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
add r5d, 640
sar r5d, 10
add rsp, (31+2*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
dst, stride, c, eob
LEA r6, base
test eobd, eobd
jz .dconly
%if ARCH_X86_32
mov [rsp+gprsize*1+76*16], r0
%elif WIN64
mov [rsp+gprsize*1+76*16], r7
%endif
%undef cmp
; remove entirely-zero iterations
mov r5d, 7*2
cmp eobw, word [o2(tbl_16x32_2d)+r5]
jge .end_zero_loop
pxor m0, m0
.zero_loop:
movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
movzx t1d, t0b
shr t0d, 8
mova [rsp+12*16+r5*8], m0
mova [rsp+20*16+r5*8], m0
mova [rsp+12*16+t0*8], m0
mova [rsp+12*16+t1*8], m0
mova [rsp+44*16+r5*8], m0
mova [rsp+52*16+r5*8], m0
mova [rsp+44*16+t0*8], m0
mova [rsp+44*16+t1*8], m0
sub r5d, 2
cmp eobw, word [o2(tbl_16x32_2d)+r5]
jl .zero_loop
.end_zero_loop:
; actual first pass after skipping all-zero data
mov [rsp+gprsize*0+76*16], eobd
mov r3, rsp
.loop_pass1:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+ 1*128+r5*8]
mova m1, [cq+ 3*128+r5*8]
mova m2, [cq+ 5*128+r5*8]
mova m3, [cq+ 7*128+r5*8]
mova m4, [cq+ 9*128+r5*8]
mova m5, [cq+11*128+r5*8]
mova m6, [cq+13*128+r5*8]
mova m7, [cq+15*128+r5*8]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(idct_16x4_internal_16bpc).main_oddhalf
mova m0, [cq+ 0*128+r5*8]
mova m1, [cq+ 2*128+r5*8]
mova m2, [cq+ 4*128+r5*8]
mova m3, [cq+ 6*128+r5*8]
mova m4, [cq+ 8*128+r5*8]
mova m5, [cq+10*128+r5*8]
mova m6, [cq+12*128+r5*8]
mova m7, [cq+14*128+r5*8]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
call m(idct_16x4_internal_16bpc).round
%if ARCH_X86_64
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
%endif
call m(idct_8x4_internal_16bpc).transpose4x8packed
movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
movzx t1d, t0b
shr t0d, 8
%if ARCH_X86_64
mova [rsp+12*16+r5*8], m0
mova [rsp+20*16+r5*8], m2
mova [rsp+12*16+t1*8], m1
mova [rsp+12*16+t0*8], m3
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [rsp+44*16+r5*8], m8
mova [rsp+52*16+r5*8], m10
mova [rsp+44*16+t1*8], m9
mova [rsp+44*16+t0*8], m11
%else
mova [rsp+44*16+r5*8], m0
mova [rsp+52*16+r5*8], m2
mova [rsp+44*16+t1*8], m1
mova [rsp+44*16+t0*8], m3
mova m0, [r3+ 8*16]
mova m2, [r3+ 9*16]
mova m4, [r3+10*16]
mova m6, [r3+11*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+12*16+r5*8], m0
mova [rsp+20*16+r5*8], m2
mova [rsp+12*16+t1*8], m1
mova [rsp+12*16+t0*8], m3
%endif
pxor m7, m7
REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
sub r5d, 2
jge .loop_pass1
; pass=2
add rsp, 9*16
%if ARCH_X86_64
mov r6, dstq
%else
mov dstq, [rsp+gprsize*1+67*16]
%endif
mov eobd, [rsp+gprsize*0+67*16]
cmp eobd, 44
jl .load_veryfast
cmp eobd, 151
jl .load_fast
; load normal
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
jmp .run
.load_fast:
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
jmp .run
.load_veryfast:
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
; fall-through
.run:
%if ARCH_X86_64
lea r2, [dstq+32]
mov r7, -4
%else
lea r2, [rsp+67*16]
mov dword [r2+0*gprsize], 2
%endif
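; pass 2 covers the 16-pixel width in two 8-pixel (16-byte) column halves:
; on x64, r7 steps -4, -2 and dstq is rebuilt as r2+r7*8 after each half;
; on x86-32 the same state lives in the counter and dst slot at [r2]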
jmp .loop_pass2_entry
.loop_pass2:
mova m0, [rsp+16* 3]
.loop_pass2_entry:
%if ARCH_X86_32
mov dstq, [r2+1*gprsize]
%endif
call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
add rsp, 32*16
%if ARCH_X86_64
add r7, 2
lea dstq, [r2+r7*8]
jl .loop_pass2
%if WIN64
mov r7, [rsp+gprsize*1+3*16]
%endif
%else
add dword [r2+1*gprsize], 16
dec dword [r2+0*gprsize]
jg .loop_pass2
%endif
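; the add rsp adjustments above (9*16 plus 32*16 per pass-2 iteration) total
; 73*16 bytes; patch x86inc's stack bookkeeping so RET unwinds correctly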
%assign stack_size (stack_size-73*16)
%if STACK_ALIGNMENT >= 16
%assign stack_size_padded (stack_size_padded-73*16)
%assign stack_offset (stack_offset-73*16)
%else
%xdefine rstkm [rsp + stack_size]
%endif
RET
.dconly:
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add r5d, 128
sar r5d, 8
imul r5d, 181
add rsp, (65+4*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
dst, stride, c, eob
%if ARCH_X86_32
LEA r6, $$
%endif
test eobd, eobd
jz .dconly
; remove entirely-zero iterations
%undef cmp
%if ARCH_X86_64
xor r5d, r5d
cmp eobd, 10
setge r5b
%else
mov r5d, 1
cmp eobd, 10
sbb r5d, 0
%endif
add r5d, r5d
; actual first pass after skipping all-zero data
.loop_pass1:
mova m0, [cq+32* 1+r5*8]
mova m1, [cq+32* 7+r5*8]
mova m2, [cq+32* 9+r5*8]
mova m3, [cq+32*15+r5*8]
mova m4, [cq+32*17+r5*8]
mova m5, [cq+32*23+r5*8]
mova m6, [cq+32*25+r5*8]
mova m7, [cq+32*31+r5*8]
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mov r3, rsp
call .main_oddhalf_part1
mova m0, [cq+32* 3+r5*8]
mova m1, [cq+32* 5+r5*8]
mova m2, [cq+32*11+r5*8]
mova m3, [cq+32*13+r5*8]
mova m4, [cq+32*19+r5*8]
mova m5, [cq+32*21+r5*8]
mova m6, [cq+32*27+r5*8]
mova m7, [cq+32*29+r5*8]
call .main_oddhalf_part2
mova m0, [cq+32* 2+r5*8]
mova m1, [cq+32* 6+r5*8]
mova m2, [cq+32*10+r5*8]
mova m3, [cq+32*14+r5*8]
mova m4, [cq+32*18+r5*8]
mova m5, [cq+32*22+r5*8]
mova m6, [cq+32*26+r5*8]
mova m7, [cq+32*30+r5*8]
add r3, 16*(16+4*ARCH_X86_32)
call m(idct_16x4_internal_16bpc).main_oddhalf
mova m0, [cq+32* 0+r5*8]
mova m1, [cq+32* 4+r5*8]
mova m2, [cq+32* 8+r5*8]
mova m3, [cq+32*12+r5*8]
mova m4, [cq+32*16+r5*8]
mova m5, [cq+32*20+r5*8]
mova m6, [cq+32*24+r5*8]
mova m7, [cq+32*28+r5*8]
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
sub r3, 16*(16+4*ARCH_X86_32)
call .round_dct32
%if ARCH_X86_64
call m(idct_8x4_internal_16bpc).transpose4x8packed
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [cq+32* 8+r5*8], m8
mova [cq+32* 9+r5*8], m9
mova [cq+32*10+r5*8], m10
mova [cq+32*11+r5*8], m11
mova m8, [r3+16* 9] ; 8 9
mova m10, [r3+16*11] ; 10 11
mova m12, [r3+16*13] ; 12 13
mova m14, [r3+16*15] ; 14 15
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [cq+32* 4+r5*8], m8
mova [cq+32* 5+r5*8], m9
mova [cq+32* 6+r5*8], m10
mova [cq+32* 7+r5*8], m11
mova m8, [r3+16* 8] ; 24 25
mova m10, [r3+16*10] ; 26 27
mova m12, [r3+16*12] ; 28 29
mova m14, [r3+16*14] ; 30 31
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [cq+32*12+r5*8], m8
mova [cq+32*13+r5*8], m9
mova [cq+32*14+r5*8], m10
mova [cq+32*15+r5*8], m11
%else
sub r3, 8*16
mova m0, [r3+ 8*16]
mova m2, [r3+10*16]
mova m4, [r3+12*16]
mova m6, [r3+14*16]
packssdw m0, [r3+ 9*16]
packssdw m2, [r3+11*16]
packssdw m4, [r3+13*16]
packssdw m6, [r3+15*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+32* 4+r5*8], m0
mova [cq+32* 5+r5*8], m1
mova [cq+32* 6+r5*8], m2
mova [cq+32* 7+r5*8], m3
mova m0, [r3+16*16]
mova m2, [r3+18*16]
mova m4, [r3+20*16]
mova m6, [r3+22*16]
packssdw m0, [r3+17*16]
packssdw m2, [r3+19*16]
packssdw m4, [r3+21*16]
packssdw m6, [r3+23*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+32* 8+r5*8], m0
mova [cq+32* 9+r5*8], m1
mova [cq+32*10+r5*8], m2
mova [cq+32*11+r5*8], m3
mova m0, [r3+31*16]
mova m2, [r3+29*16]
mova m4, [r3+27*16]
mova m6, [r3+25*16]
packssdw m0, [r3+30*16]
packssdw m2, [r3+28*16]
packssdw m4, [r3+26*16]
packssdw m6, [r3+24*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+32*12+r5*8], m0
mova [cq+32*13+r5*8], m1
mova [cq+32*14+r5*8], m2
mova [cq+32*15+r5*8], m3
mova m0, [r3+ 0*16]
mova m2, [r3+ 2*16]
mova m4, [r3+ 4*16]
mova m6, [r3+ 6*16]
packssdw m0, [r3+ 1*16]
packssdw m2, [r3+ 3*16]
packssdw m4, [r3+ 5*16]
packssdw m6, [r3+ 7*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
%endif
pxor m7, m7
; clear lower half of [cq]
REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
24, 25, 26, 27, 28, 29, 30, 31
test r5d, r5d
jz .end_pass1
mova [cq+32* 0+r5*8], m0
mova [cq+32* 1+r5*8], m1
mova [cq+32* 2+r5*8], m2
mova [cq+32* 3+r5*8], m3
sub r5d, 2
jmp .loop_pass1
.end_pass1:
; pass=2; this must be a call rather than a jump, otherwise the stack
; pointer has the wrong offset in the 8-bit code
mov r4d, 4
call m(idct_16x8_internal_16bpc).pass2_main
RET
.main_oddhalf_part1_fast: ; lower half zero
pmulld m7, m0, [o(pd_4091)]
pmulld m0, [o(pd_201)]
pmulld m4, m3, [o(pd_m2751)]
%if ARCH_X86_32
pmulld m3, [o(pd_3035)]
mova m5, [o(pd_2048)]
REPX {paddd x, m5}, m0, m7
REPX {psrad x, 12}, m0, m7
mova [r3+3*16], m7
mova m7, m3
mova m3, m5
%else
pmulld m3, [o(pd_3035)]
%endif
pmulld m6, m1, [o(pd_m1380)]
pmulld m1, [o(pd_3857)]
pmulld m5, m2, [o(pd_3703)]
pmulld m2, [o(pd_1751)]
jmp .main_oddhalf_part1_fast2
.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
%if ARCH_X86_64
ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
.main_oddhalf_part1_fast2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
psubd m8, m0, m4 ; t17
paddd m0, m4 ; t16
psubd m4, m6, m2 ; t18
paddd m6, m2 ; t19
psubd m2, m1, m5 ; t29
paddd m1, m5 ; t28
psubd m5, m7, m3 ; t30
paddd m7, m3 ; t31
REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
mova m15, [o(pd_4017)]
mova m10, [o(pd_799)]
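; 799/4096 ~= sin(pi/16), 4017/4096 ~= cos(pi/16)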
ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
psubd m3, m0, m6 ; t19a
paddd m0, m6 ; t16a
psubd m6, m7, m1 ; t28a
paddd m7, m1 ; t31a
psubd m1, m5, m4 ; t18
paddd m5, m4 ; t17
psubd m4, m8, m2 ; t29
paddd m8, m2 ; t30
REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
mova m15, [o(pd_3784)]
mova m10, [o(pd_1567)]
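; 1567/4096 ~= sin(pi/8), 3784/4096 ~= cos(pi/8)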
ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
mova [r3+16*0], m0
mova [r3+16*1], m5
mova [r3+16*2], m4
mova [r3+16*3], m6
mova [r3+16*4], m3
mova [r3+16*5], m1
mova [r3+16*6], m8
mova [r3+16*7], m7
%else
mova [r3+0*16], m2
mova [r3+1*16], m3
mova [r3+2*16], m4
mova [r3+3*16], m5
mova m3, [o(pd_2048)]
ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a
ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
mova m4, [r3+2*16]
mova m5, [r3+3*16]
mova [r3+2*16], m6
mova [r3+3*16], m7
mova m2, [r3+0*16]
mova m7, [r3+1*16]
mova [r3+0*16], m0
mova [r3+1*16], m1
ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
mova m0, [r3+0*16]
mova m1, [r3+1*16]
mova m6, [r3+2*16]
.main_oddhalf_part1_fast2:
REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7
REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7
psubd m3, m0, m4 ; t17
mova [r3+0*16], m3
mova m3, [r3+3*16]
paddd m0, m4 ; t16
psubd m4, m6, m2 ; t18
paddd m6, m2 ; t19
psubd m2, m1, m5 ; t29
paddd m1, m5 ; t28
psubd m5, m3, m7 ; t30
paddd m7, m3 ; t31
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
pmaxsd m3, [r3+0*16]
mova [r3+0*16], m3
mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
pminsd m3, [r3+0*16]
mova [r3+0*16], m0
mova [r3+1*16], m1
mova [r3+2*16], m6
mova [r3+3*16], m7
mova m0, [o(pd_2048)]
ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a
ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a
psubd m1, m5, m4 ; t18
paddd m5, m4 ; t17
psubd m4, m3, m2 ; t29
paddd m3, m2 ; t30
mova m0, [r3+0*16]
mova m2, [r3+1*16]
mova m6, [r3+2*16]
mova m7, [r3+3*16]
mova [r3+0*16], m3
psubd m3, m0, m6 ; t19a
paddd m0, m6 ; t16a
psubd m6, m7, m2 ; t28a
paddd m7, m2 ; t31a
mova m2, [o(clip_18b_min)]
REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
pmaxsd m2, [r3+0*16]
mova [r3+0*16], m2
mova m2, [o(clip_18b_max)]
REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
pminsd m2, [r3+0*16]
mova [r3+16*0], m0
mova [r3+16*1], m5
mova [r3+16*6], m2
mova [r3+16*7], m7
mova m7, [o(pd_2048)]
ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28
mova [r3+16*2], m4
mova [r3+16*3], m6
mova [r3+16*4], m3
mova [r3+16*5], m1
%endif
ret
.main_oddhalf_part2_fast: ; lower half zero
pmulld m7, m0, [o(pd_m601)]
pmulld m0, [o(pd_4052)]
pmulld m4, m3, [o(pd_3290)]
%if ARCH_X86_32
pmulld m3, [o(pd_2440)]
mova m5, [o(pd_2048)]
REPX {paddd x, m5}, m0, m7
REPX {psrad x, 12}, m0, m7
mova [r3+11*16], m7
mova m7, m3
mova m3, m5
%else
pmulld m3, [o(pd_2440)]
%endif
pmulld m6, m1, [o(pd_3973)]
pmulld m1, [o(pd_995)]
pmulld m5, m2, [o(pd_m2106)]
pmulld m2, [o(pd_3513)]
jmp .main_oddhalf_part2_fast2
.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
%if ARCH_X86_64
ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
.main_oddhalf_part2_fast2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
psubd m8, m0, m4 ; t25
paddd m0, m4 ; t24
psubd m4, m6, m2 ; t26
paddd m6, m2 ; t27
psubd m2, m1, m5 ; t21
paddd m1, m5 ; t20
psubd m5, m7, m3 ; t22
paddd m7, m3 ; t23
REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
mova m15, [o(pd_2276)]
mova m10, [o(pd_3406)]
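; 2276/4096 ~= sin(3*pi/16), 3406/4096 ~= cos(3*pi/16)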
ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
psubd m3, m0, m6 ; t27a
paddd m0, m6 ; t24a
psubd m6, m7, m1 ; t20a
paddd m7, m1 ; t23a
psubd m1, m5, m4 ; t21
paddd m5, m4 ; t22
psubd m4, m8, m2 ; t26
paddd m8, m2 ; t25
REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
mova m15, [o(pd_3784)]
mova m10, [o(pd_1567)]
ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
mova m9, [r3+16*0] ; t16a
mova m10, [r3+16*1] ; t17
psubd m2, m9, m7 ; t23
paddd m9, m7 ; t16
psubd m7, m10, m5 ; t22a
paddd m10, m5 ; t17a
REPX {pmaxsd x, m12}, m9, m10, m2, m7
REPX {pminsd x, m13}, m9, m10, m2, m7
mova [r3+16*0], m9
mova [r3+16*1], m10
mova m9, [r3+16*2] ; t18a
mova m10, [r3+16*3] ; t19
psubd m5, m9, m1 ; t21
paddd m9, m1 ; t18
psubd m1, m10, m6 ; t20a
paddd m10, m6 ; t19a
REPX {pmaxsd x, m12}, m9, m10, m5, m1
REPX {pminsd x, m13}, m9, m10, m5, m1
mova [r3+16*2], m9
mova [r3+16*3], m10
mova m9, [r3+16*4] ; t28
mova m10, [r3+16*5] ; t29a
psubd m6, m9, m3 ; t27a
paddd m9, m3 ; t28a
psubd m3, m10, m4 ; t26
paddd m10, m4 ; t29
REPX {pmaxsd x, m12}, m9, m10, m6, m3
REPX {pminsd x, m13}, m9, m10, m6, m3
REPX {pmulld x, m14}, m6, m3, m1, m5
paddd m6, m11
paddd m3, m11
psubd m4, m6, m1 ; t20
paddd m6, m1 ; t27
psubd m1, m3, m5 ; t21a
paddd m3, m5 ; t26a
REPX {psrad x, 12 }, m4, m1, m3, m6
mova [r3+16*4], m4
mova [r3+16*5], m1
mova m4, [r3+16*6] ; t30
mova m1, [r3+16*7] ; t31a
psubd m5, m4, m8 ; t25a
paddd m4, m8 ; t30a
psubd m8, m1, m0 ; t24
paddd m1, m0 ; t31
REPX {pmaxsd x, m12}, m8, m5, m4, m1
REPX {pminsd x, m13}, m8, m5, m4, m1
REPX {pmulld x, m14}, m5, m8, m7, m2
paddd m5, m11
paddd m8, m11
psubd m0, m5, m7 ; t22
paddd m5, m7 ; t25
psubd m7, m8, m2 ; t23a
paddd m2, m8 ; t24a
REPX {psrad x, 12 }, m0, m7, m2, m5
mova [r3+16*6], m0
mova [r3+16*7], m7
mova [r3+16*8], m2
mova [r3+16*9], m5
mova [r3+16*10], m3
mova [r3+16*11], m6
mova [r3+16*12], m9
mova [r3+16*13], m10
mova [r3+16*14], m4
mova [r3+16*15], m1
%else
mova [r3+ 8*16], m2
mova [r3+ 9*16], m3
mova [r3+10*16], m4
mova [r3+11*16], m5
mova m3, [o(pd_2048)]
ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a
ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a
mova m2, [r3+ 8*16]
mova m4, [r3+10*16]
mova m5, [r3+11*16]
mova [r3+ 8*16], m0
mova [r3+10*16], m6
mova [r3+11*16], m7
mova m7, [r3+ 9*16]
mova [r3+ 9*16], m1
ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
mova m0, [r3+ 8*16]
mova m1, [r3+ 9*16]
mova m6, [r3+10*16]
.main_oddhalf_part2_fast2:
REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6
REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6
psubd m3, m0, m4 ; t25
mova [r3+ 8*16], m3
mova m3, [r3+11*16]
paddd m0, m4 ; t24
psubd m4, m6, m2 ; t26
paddd m6, m2 ; t27
psubd m2, m1, m5 ; t21
paddd m1, m5 ; t20
psubd m5, m3, m7 ; t22
paddd m7, m3 ; t23
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
pmaxsd m3, [r3+ 8*16]
mova [r3+ 8*16], m3
mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
pminsd m3, [r3+ 8*16]
mova [r3+ 8*16], m0
mova [r3+ 9*16], m1
mova [r3+10*16], m6
mova [r3+11*16], m7
mova m7, [o(pd_2048)]
ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a
ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a
psubd m1, m5, m4 ; t21
paddd m5, m4 ; t22
psubd m4, m3, m2 ; t26
paddd m3, m2 ; t25
mova m0, [r3+ 8*16]
mova m2, [r3+ 9*16]
mova m6, [r3+10*16]
mova m7, [r3+11*16]
mova [r3+ 8*16], m3
psubd m3, m0, m6 ; t27a
paddd m0, m6 ; t24a
psubd m6, m7, m2 ; t20a
paddd m7, m2 ; t23a
mova m2, [o(clip_18b_min)]
REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
pmaxsd m2, [r3+ 8*16]
mova [r3+ 8*16], m2
mova m2, [o(clip_18b_max)]
REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
pminsd m2, [r3+ 8*16]
mova [r3+ 8*16], m0
mova [r3+ 9*16], m2
mova [r3+14*16], m5
mova [r3+15*16], m7
mova m0, [o(pd_2048)]
ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20
mova [r3+10*16], m3
mova m0, [o(clip_18b_min)]
mova m2, [o(clip_18b_max)]
mova m5, [r3+16*2] ; t18a
mova m7, [r3+16*3] ; t19
psubd m3, m5, m1 ; t21
paddd m5, m1 ; t18
psubd m1, m7, m6 ; t20a
paddd m7, m6 ; t19a
REPX {pmaxsd x, m0}, m5, m7, m3, m1
REPX {pminsd x, m2}, m5, m7, m3, m1
mova [r3+16*2], m5
mova [r3+16*3], m7
mova [r3+11*16], m3
mova m3, [r3+10*16]
mova m5, [r3+16*4] ; t28
mova m7, [r3+16*5] ; t29a
psubd m6, m5, m3 ; t27a
paddd m5, m3 ; t28a
psubd m3, m7, m4 ; t26
paddd m7, m4 ; t29
REPX {pmaxsd x, m0}, m5, m7, m6, m3
REPX {pminsd x, m2}, m5, m7, m6, m3
mova [r3+16*12], m5
mova [r3+16*13], m7
mova m5, [o(pd_2048)]
mova m7, [o(pd_2896)]
mova m4, [r3+11*16]
REPX {pmulld x, m7}, m6, m3, m1, m4
paddd m6, m5
paddd m3, m5
psubd m5, m6, m1 ; t20
paddd m6, m1 ; t27
psubd m1, m3, m4 ; t21a
paddd m3, m4 ; t26a
REPX {psrad x, 12}, m5, m1, m3, m6
mova [r3+16*4], m5
mova [r3+16*5], m1
mova [r3+16*10], m3
mova [r3+16*11], m6
mova m5, [r3+14*16]
mova m6, [r3+15*16]
mova m3, [r3+16*0] ; t16a
mova m4, [r3+16*1] ; t17
psubd m1, m3, m6 ; t23
paddd m3, m6 ; t16
psubd m6, m4, m5 ; t22a
paddd m4, m5 ; t17a
REPX {pmaxsd x, m0}, m3, m4, m1, m6
REPX {pminsd x, m2}, m3, m4, m1, m6
mova [r3+16*0], m3
mova [r3+16*1], m4
mova m5, [r3+ 8*16]
mova m3, [r3+ 9*16]
mova [r3+ 8*16], m1
mova [r3+ 9*16], m6
mova m4, [r3+16*6] ; t30
mova m1, [r3+16*7] ; t31a
psubd m6, m1, m5 ; t24
paddd m1, m5 ; t31
psubd m5, m4, m3 ; t25a
paddd m4, m3 ; t30a
REPX {pmaxsd x, m0}, m6, m5, m4, m1
REPX {pminsd x, m2}, m6, m5, m4, m1
mova [r3+16*14], m4
mova [r3+16*15], m1
mova m4, [o(pd_2048)]
mova m1, [r3+ 9*16]
mova m2, [r3+ 8*16]
REPX {pmulld x, m7}, m5, m6, m1, m2
paddd m5, m4
paddd m6, m4
psubd m0, m5, m1 ; t22
paddd m5, m1 ; t25
psubd m1, m6, m2 ; t23a
paddd m2, m6 ; t24a
REPX {psrad x, 12}, m0, m1, m2, m5
mova [r3+16*6], m0
mova [r3+16*7], m1
mova [r3+16*8], m2
mova [r3+16*9], m5
%endif
ret
; final sum/sub butterflies for the idct16 and idct32 stages, plus the final downshift
%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift
mova m%4, [r3+16*(23-%1)]
pmaxsd m%1, m12
pminsd m%1, m13
psubd m%3, m%1, m%4 ; idct16 out15 - n
paddd m%1, m%4 ; idct16 out0 + n
pmaxsd m%1, m12
pmaxsd m%3, m12
pminsd m%1, m13
pminsd m%3, m13
paddd m%1, m11
paddd m%3, m11
mova m%5, [r3+16*( 0+%1)]
mova m%2, [r3+16*(15-%1)]
psubd m%4, m%1, m%2 ; out31 - n
paddd m%1, m%2 ; out0 + n
paddd m%2, m%3, m%5 ; out15 - n
psubd m%3, m%5 ; out16 + n
REPX {psrad x, %6}, m%1, m%3, m%2, m%4
%endmacro
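; each invocation folds one idct16 lane with the stored odd-half values to
; produce rows n, 15-n, 16+n and 31-n of the idct32; m11 is expected to hold
; the rounding bias (pd_2 or pd_1) on entry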
.round_dct32:
%if ARCH_X86_64
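; m11 still holds pd_2048; shifting right by 10 recreates pd_2 in place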
psrld m11, 10 ; pd_2
IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31
mova [r3+ 0*16], m6
mova [r3+23*16], m7
IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30
packssdw m0, m1 ; 0 1
packssdw m14, m15 ; 14 15
packssdw m8, m6 ; 16 17
packssdw m7, m9 ; 30 31
mova [r3+16*15], m14
mova [r3+16*14], m7
IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29
IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28
packssdw m2, m3 ; 2 3
packssdw m14, m15 ; 12 13
packssdw m10, m1 ; 18 19
packssdw m9, m7 ; 28 29
mova [r3+16*13], m14
mova [r3+16*12], m9
IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27
IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26
packssdw m4, m5 ; 4 5
packssdw m14, m15 ; 10 11
packssdw m1, m3 ; 20 21
packssdw m9, m7 ; 26 27
mova [r3+16*11], m14
mova [r3+16*10], m9
mova m6, [r3+ 0*16]
mova m7, [r3+23*16]
IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25
IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24
packssdw m6, m7 ; 6 7
packssdw m11, m15 ; 8 9
packssdw m14, m3 ; 22 23
packssdw m9, m5 ; 24 25
mova [r3+16*9], m11
mova [r3+16*8], m9
mova m12, m1
ret
%else
mova [r3+16*16], m0
mova [r3+17*16], m1
mova [r3+18*16], m2
mova [r3+19*16], m3
mova [r3+20*16], m4
mova [r3+21*16], m5
mova [r3+22*16], m6
mova [r3+23*16], m7
mova m1, [o(pd_2)]
mova m2, [o(clip_18b_min)]
mova m3, [o(clip_18b_max)]
mov r4, 15*16
.loop_dct32_end:
mova m0, [r3+16*16]
mova m6, [r3+16*24]
pmaxsd m0, m2
pminsd m0, m3
psubd m5, m0, m6 ; idct16 out15 - n
paddd m0, m6 ; idct16 out0 + n
pmaxsd m0, m2
pmaxsd m5, m2
pminsd m0, m3
pminsd m5, m3
paddd m0, m1
paddd m5, m1
mova m7, [r3]
mova m4, [r3+r4]
psubd m6, m0, m4 ; out31 - n
paddd m0, m4 ; out0 + n
paddd m4, m5, m7 ; out15 - n
psubd m5, m7 ; out16 + n
REPX {psrad x, 2}, m0, m5, m4, m6
mova [r3], m0
mova [r3+r4], m4
mova [r3+16*16], m5
mova [r3+24*16], m6
add r3, 16
sub r4, 32
jg .loop_dct32_end
ret
%endif
.dconly:
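; dc-only fast path: 181/256 ~= 1/sqrt(2), so each imul/round pair below
; applies one pass worth of dc scaling before .dconly2 turns the result into
; a broadcast pixel offset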
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
.dconly1:
add r5d, 640
sar r5d, 10
.dconly2:
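; the offset becomes (r5d*2896 + 0x8800) >> 16: a 1/sqrt(2) scale (2896/4096)
; with the final downshift and +0.5 rounding folded into the high word, which
; pshuflw q1111 then broadcasts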
imul r5d, 2896
add r5d, 34816
movd m0, r5d
pshuflw m0, m0, q1111
punpcklqdq m0, m0
mova m6, [o(pixel_10bpc_max)]
pxor m5, m5
.dconly_loop:
mova m1, [dstq+16*0]
mova m2, [dstq+16*1]
mova m3, [dstq+16*2]
mova m4, [dstq+16*3]
REPX {paddw x, m0}, m1, m2, m3, m4
REPX {pminsw x, m6}, m1, m2, m3, m4
REPX {pmaxsw x, m5}, m1, m2, m3, m4
mova [dstq+16*0], m1
mova [dstq+16*1], m2
mova [dstq+16*2], m3
mova [dstq+16*3], m4
add dstq, strideq
dec r3d
jg .dconly_loop
RET
cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
dst, stride, c, eob
LEA r6, base
test eobd, eobd
jz .dconly
; remove entirely-zero iterations
%undef cmp
mov r5d, 8
.zero_loop:
sub r5d, 2
cmp eobw, word [o2(tbl_32x16_2d)+r5]
jl .zero_loop
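; nothing to clear here: pass-1 results are stored back into cq, and the
; skipped columns still hold their original all-zero coefficients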
; actual first pass after skipping all-zero data
.loop_pass1:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+64* 1+r5*8]
mova m1, [cq+64* 7+r5*8]
mova m2, [cq+64* 9+r5*8]
mova m3, [cq+64*15+r5*8]
mova m4, [cq+64*17+r5*8]
mova m5, [cq+64*23+r5*8]
mova m6, [cq+64*25+r5*8]
mova m7, [cq+64*31+r5*8]
mov r3, rsp
call m(idct_8x4_internal_16bpc).rect2_mul
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
mova m0, [cq+64* 3+r5*8]
mova m1, [cq+64* 5+r5*8]
mova m2, [cq+64*11+r5*8]
mova m3, [cq+64*13+r5*8]
mova m4, [cq+64*19+r5*8]
mova m5, [cq+64*21+r5*8]
mova m6, [cq+64*27+r5*8]
mova m7, [cq+64*29+r5*8]
%if ARCH_X86_32
add r3, 16*8
%endif
call m(idct_8x4_internal_16bpc).rect2_mul
%if ARCH_X86_32
sub r3, 16*8
%endif
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
add r3, 16*(16+4*ARCH_X86_32)
mova m0, [cq+64* 2+r5*8]
mova m1, [cq+64* 6+r5*8]
mova m2, [cq+64*10+r5*8]
mova m3, [cq+64*14+r5*8]
mova m4, [cq+64*18+r5*8]
mova m5, [cq+64*22+r5*8]
mova m6, [cq+64*26+r5*8]
mova m7, [cq+64*30+r5*8]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(idct_16x4_internal_16bpc).main_oddhalf
mova m0, [cq+64* 0+r5*8]
mova m1, [cq+64* 4+r5*8]
mova m2, [cq+64* 8+r5*8]
mova m3, [cq+64*12+r5*8]
mova m4, [cq+64*16+r5*8]
mova m5, [cq+64*20+r5*8]
mova m6, [cq+64*24+r5*8]
mova m7, [cq+64*28+r5*8]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
sub r3, 16*(16+4*ARCH_X86_32)
call .round_dct32
%if ARCH_X86_64
call m(idct_8x4_internal_16bpc).transpose4x8packed
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [cq+64* 8+r5*8], m8
mova [cq+64* 9+r5*8], m9
mova [cq+64*10+r5*8], m10
mova [cq+64*11+r5*8], m11
mova m8, [r3+16* 9] ; 8 9
mova m10, [r3+16*11] ; 10 11
mova m12, [r3+16*13] ; 12 13
mova m14, [r3+16*15] ; 14 15
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [cq+64* 4+r5*8], m8
mova [cq+64* 5+r5*8], m9
mova [cq+64* 6+r5*8], m10
mova [cq+64* 7+r5*8], m11
mova m8, [r3+16* 8] ; 24 25
mova m10, [r3+16*10] ; 26 27
mova m12, [r3+16*12] ; 28 29
mova m14, [r3+16*14] ; 30 31
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [cq+64*12+r5*8], m8
mova [cq+64*13+r5*8], m9
mova [cq+64*14+r5*8], m10
mova [cq+64*15+r5*8], m11
%else
sub r3, 8*16
mova m0, [r3+ 8*16]
mova m2, [r3+10*16]
mova m4, [r3+12*16]
mova m6, [r3+14*16]
packssdw m0, [r3+ 9*16]
packssdw m2, [r3+11*16]
packssdw m4, [r3+13*16]
packssdw m6, [r3+15*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+64* 4+r5*8], m0
mova [cq+64* 5+r5*8], m1
mova [cq+64* 6+r5*8], m2
mova [cq+64* 7+r5*8], m3
mova m0, [r3+16*16]
mova m2, [r3+18*16]
mova m4, [r3+20*16]
mova m6, [r3+22*16]
packssdw m0, [r3+17*16]
packssdw m2, [r3+19*16]
packssdw m4, [r3+21*16]
packssdw m6, [r3+23*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+64* 8+r5*8], m0
mova [cq+64* 9+r5*8], m1
mova [cq+64*10+r5*8], m2
mova [cq+64*11+r5*8], m3
mova m0, [r3+31*16]
mova m2, [r3+29*16]
mova m4, [r3+27*16]
mova m6, [r3+25*16]
packssdw m0, [r3+30*16]
packssdw m2, [r3+28*16]
packssdw m4, [r3+26*16]
packssdw m6, [r3+24*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [cq+64*12+r5*8], m0
mova [cq+64*13+r5*8], m1
mova [cq+64*14+r5*8], m2
mova [cq+64*15+r5*8], m3
mova m0, [r3+ 0*16]
mova m2, [r3+ 2*16]
mova m4, [r3+ 4*16]
mova m6, [r3+ 6*16]
packssdw m0, [r3+ 1*16]
packssdw m2, [r3+ 3*16]
packssdw m4, [r3+ 5*16]
packssdw m6, [r3+ 7*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
%endif
mova [cq+64* 0+r5*8], m0
mova [cq+64* 1+r5*8], m1
mova [cq+64* 2+r5*8], m2
mova [cq+64* 3+r5*8], m3
pxor m0, m0
REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
24, 25, 26, 27, 28, 29, 30, 31
sub r5d, 2
jge .loop_pass1
; pass=2; this must be a call rather than a jump, otherwise the stack
; pointer has the wrong offset in the 8-bit code
call .pass2
RET
.pass2:
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
%if WIN64
mov [rsp+16*16+gprsize], r7
%endif
mov r7, dstq
%else
mov [rsp+2*gprsize+16*16], dstq
%endif
lea r3, [strideq*3]
mov r4d, 4
jmp m(idct_16x16_internal_16bpc).loop_pass2
.round_dct32:
%if ARCH_X86_64
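; m11 still holds pd_2048; shifting right by 11 recreates pd_1 in place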
psrld m11, 11 ; pd_1
IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31
mova [r3+ 0*16], m6
mova [r3+23*16], m7
IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30
packssdw m0, m1 ; 0 1
packssdw m14, m15 ; 14 15
packssdw m8, m6 ; 16 17
packssdw m7, m9 ; 30 31
mova [r3+16*15], m14
mova [r3+16*14], m7
IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29
IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28
packssdw m2, m3 ; 2 3
packssdw m14, m15 ; 12 13
packssdw m10, m1 ; 18 19
packssdw m9, m7 ; 28 29
mova [r3+16*13], m14
mova [r3+16*12], m9
IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27
IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26
packssdw m4, m5 ; 4 5
packssdw m14, m15 ; 10 11
packssdw m1, m3 ; 20 21
packssdw m9, m7 ; 26 27
mova [r3+16*11], m14
mova [r3+16*10], m9
mova m6, [r3+ 0*16]
mova m7, [r3+23*16]
IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25
IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24
packssdw m6, m7 ; 6 7
packssdw m11, m15 ; 8 9
packssdw m14, m3 ; 22 23
packssdw m9, m5 ; 24 25
mova [r3+16*9], m11
mova [r3+16*8], m9
mova m12, m1
ret
%else
mova [r3+16*16], m0
mova [r3+17*16], m1
mova [r3+18*16], m2
mova [r3+19*16], m3
mova [r3+20*16], m4
mova [r3+21*16], m5
mova [r3+22*16], m6
mova [r3+23*16], m7
pcmpeqd m1, m1 ; -1
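; the psubd by m1 (-1) below adds the +1 rounding bias ahead of the >>1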
mova m2, [o(clip_18b_min)]
mova m3, [o(clip_18b_max)]
mov r4, 15*16
.loop_dct32_end:
mova m0, [r3+16*16]
mova m6, [r3+16*24]
psubd m5, m0, m6 ; idct16 out15 - n
paddd m0, m6 ; idct16 out0 + n
pmaxsd m0, m2
pmaxsd m5, m2
pminsd m0, m3
pminsd m5, m3
psubd m0, m1
psubd m5, m1
mova m7, [r3]
mova m4, [r3+r4]
psubd m6, m0, m4 ; out31 - n
paddd m0, m4 ; out0 + n
paddd m4, m5, m7 ; out15 - n
psubd m5, m7 ; out16 + n
REPX {psrad x, 1}, m0, m5, m4, m6
mova [r3], m0
mova [r3+r4], m4
mova [r3+16*16], m5
mova [r3+24*16], m6
add r3, 16
sub r4, 32
jg .loop_dct32_end
ret
%endif
.dconly:
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 384
sar r5d, 9
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
dst, stride, c, eob
LEA r6, base
test eobd, eobd
jz .dconly
; remove entirely-zero iterations
%if ARCH_X86_32
mov [rsp+5*32*16+1*gprsize], dstq
%elif WIN64
mov [rsp+5*32*16+1*gprsize], r7
%endif
%undef cmp
mov r5d, 14
cmp eobw, word [o2(tbl_32x32_2d)+r5]
jge .end_zero_loop
pxor m0, m0
.zero_loop:
movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
movzx t1d, t0b
shr t0d, 8
mova [rsp+32*16+r5*8+0*32*16], m0
mova [rsp+40*16+r5*8+0*32*16], m0
mova [rsp+32*16+t0*8+0*32*16], m0
mova [rsp+32*16+t1*8+0*32*16], m0
mova [rsp+32*16+r5*8+1*32*16], m0
mova [rsp+40*16+r5*8+1*32*16], m0
mova [rsp+32*16+t0*8+1*32*16], m0
mova [rsp+32*16+t1*8+1*32*16], m0
mova [rsp+32*16+r5*8+2*32*16], m0
mova [rsp+40*16+r5*8+2*32*16], m0
mova [rsp+32*16+t0*8+2*32*16], m0
mova [rsp+32*16+t1*8+2*32*16], m0
mova [rsp+32*16+r5*8+3*32*16], m0
mova [rsp+40*16+r5*8+3*32*16], m0
mova [rsp+32*16+t0*8+3*32*16], m0
mova [rsp+32*16+t1*8+3*32*16], m0
sub r5d, 2
cmp eobw, word [o2(tbl_32x32_2d)+r5]
jl .zero_loop
.end_zero_loop:
; actual first pass after skipping all-zero data
mov [rsp+gprsize*0+5*32*16], eobd
.loop_pass1:
mova m0, [cq+128* 1+r5*8]
mova m1, [cq+128* 7+r5*8]
mova m2, [cq+128* 9+r5*8]
mova m3, [cq+128*15+r5*8]
mova m4, [cq+128*17+r5*8]
mova m5, [cq+128*23+r5*8]
mova m6, [cq+128*25+r5*8]
mova m7, [cq+128*31+r5*8]
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mov r3, rsp
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
mova m0, [cq+128* 3+r5*8]
mova m1, [cq+128* 5+r5*8]
mova m2, [cq+128*11+r5*8]
mova m3, [cq+128*13+r5*8]
mova m4, [cq+128*19+r5*8]
mova m5, [cq+128*21+r5*8]
mova m6, [cq+128*27+r5*8]
mova m7, [cq+128*29+r5*8]
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
mova m0, [cq+128* 2+r5*8]
mova m1, [cq+128* 6+r5*8]
mova m2, [cq+128*10+r5*8]
mova m3, [cq+128*14+r5*8]
mova m4, [cq+128*18+r5*8]
mova m5, [cq+128*22+r5*8]
mova m6, [cq+128*26+r5*8]
mova m7, [cq+128*30+r5*8]
add r3, 16*(16+4*ARCH_X86_32)
call m(idct_16x4_internal_16bpc).main_oddhalf
mova m0, [cq+128* 0+r5*8]
mova m1, [cq+128* 4+r5*8]
mova m2, [cq+128* 8+r5*8]
mova m3, [cq+128*12+r5*8]
mova m4, [cq+128*16+r5*8]
mova m5, [cq+128*20+r5*8]
mova m6, [cq+128*24+r5*8]
mova m7, [cq+128*28+r5*8]
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
sub r3, 16*(16+4*ARCH_X86_32)
call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
movzx t1d, t0b
shr t0d, 8
%if ARCH_X86_64
call m(idct_8x4_internal_16bpc).transpose4x8packed
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [rsp+32*16+r5*8+2*32*16], m8
mova [rsp+40*16+r5*8+2*32*16], m10
mova [rsp+32*16+t1*8+2*32*16], m9
mova [rsp+32*16+t0*8+2*32*16], m11
mova m8, [r3+16* 9] ; 8 9
mova m10, [r3+16*11] ; 10 11
mova m12, [r3+16*13] ; 12 13
mova m14, [r3+16*15] ; 14 15
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [rsp+32*16+r5*8+1*32*16], m8
mova [rsp+40*16+r5*8+1*32*16], m10
mova [rsp+32*16+t1*8+1*32*16], m9
mova [rsp+32*16+t0*8+1*32*16], m11
mova m8, [r3+16* 8] ; 24 25
mova m10, [r3+16*10] ; 26 27
mova m12, [r3+16*12] ; 28 29
mova m14, [r3+16*14] ; 30 31
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [rsp+32*16+r5*8+3*32*16], m8
mova [rsp+40*16+r5*8+3*32*16], m10
mova [rsp+32*16+t1*8+3*32*16], m9
mova [rsp+32*16+t0*8+3*32*16], m11
%else
sub r3, 8*16
mova m0, [r3+ 8*16]
mova m2, [r3+10*16]
mova m4, [r3+12*16]
mova m6, [r3+14*16]
packssdw m0, [r3+ 9*16]
packssdw m2, [r3+11*16]
packssdw m4, [r3+13*16]
packssdw m6, [r3+15*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+32*16+r5*8+1*32*16], m0
mova [rsp+40*16+r5*8+1*32*16], m2
mova [rsp+32*16+t1*8+1*32*16], m1
mova [rsp+32*16+t0*8+1*32*16], m3
mova m0, [r3+16*16]
mova m2, [r3+18*16]
mova m4, [r3+20*16]
mova m6, [r3+22*16]
packssdw m0, [r3+17*16]
packssdw m2, [r3+19*16]
packssdw m4, [r3+21*16]
packssdw m6, [r3+23*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+32*16+r5*8+2*32*16], m0
mova [rsp+40*16+r5*8+2*32*16], m2
mova [rsp+32*16+t1*8+2*32*16], m1
mova [rsp+32*16+t0*8+2*32*16], m3
mova m0, [r3+31*16]
mova m2, [r3+29*16]
mova m4, [r3+27*16]
mova m6, [r3+25*16]
packssdw m0, [r3+30*16]
packssdw m2, [r3+28*16]
packssdw m4, [r3+26*16]
packssdw m6, [r3+24*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+32*16+r5*8+3*32*16], m0
mova [rsp+40*16+r5*8+3*32*16], m2
mova [rsp+32*16+t1*8+3*32*16], m1
mova [rsp+32*16+t0*8+3*32*16], m3
mova m0, [r3+ 0*16]
mova m2, [r3+ 2*16]
mova m4, [r3+ 4*16]
mova m6, [r3+ 6*16]
packssdw m0, [r3+ 1*16]
packssdw m2, [r3+ 3*16]
packssdw m4, [r3+ 5*16]
packssdw m6, [r3+ 7*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
%endif
pxor m7, m7
; clear lower half of [cq]
REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
8, 9, 10, 11, 12, 13, 14, 15, \
16, 17, 18, 19, 20, 21, 22, 23, \
24, 25, 26, 27, 28, 29, 30, 31
mova [rsp+32*16+r5*8+0*32*16], m0
mova [rsp+40*16+r5*8+0*32*16], m2
mova [rsp+32*16+t1*8+0*32*16], m1
mova [rsp+32*16+t0*8+0*32*16], m3
sub r5d, 2
jge .loop_pass1
; pass=2 code starts here
mov eobd, [rsp+gprsize*0+5*32*16]
add rsp, 29*16
cmp eobd, 36
jl .load_veryfast
cmp eobd, 136
jl .load_fast
; load normal
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
jmp .run
.load_fast:
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
jmp .run
.load_veryfast:
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
; fall-through
.run:
%if ARCH_X86_64
lea r2, [dstq+64]
mov r7, -8
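; four 8-pixel (16-byte) column strips: r7 steps -8, -6, -4, -2 and dstq is
; rebuilt as r2+r7*8 after each strip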
%else
lea r2, [rsp+(4*32+3)*16]
mov dword [r2+0*gprsize], 4
%endif
jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
.dconly:
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
dst, stride, c, eob
LEA r6, base
test eobd, eobd
jz .dconly
%if ARCH_X86_32
DECLARE_REG_TMP 4, 1, 2, 0
mov [rsp+gprsize*1+(64*2+12)*16], r0
mov [rsp+gprsize*2+(64*2+12)*16], r1
mov [rsp+gprsize*3+(64*2+12)*16], r2
%else
DECLARE_REG_TMP 8, 9, 4, 7
mov [rsp+gprsize*1+(64*2+12)*16], r9
%if WIN64
mov [rsp+gprsize*2+(64*2+12)*16], r7
mov [rsp+gprsize*3+(64*2+12)*16], r8
%endif
%endif
%undef cmp
; remove entirely-zero iterations
mov r5d, 7*2
cmp eobw, word [o2(tbl_16x32_2d)+r5]
jge .end_zero_loop
pxor m0, m0
.zero_loop:
movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
movzx t0d, t1b
movzx t2d, t3b
shr t1d, 8
shr t3d, 8
mova [rsp+12*16+t0*8], m0
mova [rsp+12*16+t1*8], m0
mova [rsp+12*16+t2*8], m0
mova [rsp+12*16+t3*8], m0
mova [rsp+76*16+t0*8], m0
mova [rsp+76*16+t1*8], m0
mova [rsp+76*16+t2*8], m0
mova [rsp+76*16+t3*8], m0
sub r5d, 2
cmp eobw, word [o2(tbl_16x32_2d)+r5]
jl .zero_loop
.end_zero_loop:
; actual first pass after skipping all-zero data
mov [rsp+gprsize*0+(64*2+12)*16], eobd
mov r3, rsp
%if ARCH_X86_32
DECLARE_REG_TMP 4, 1, 6, 0
mov r2, [rsp+gprsize*3+(64*2+12)*16]
mov [rsp+gprsize*3+(64*2+12)*16], r6
%endif
.loop_pass1:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+ 1*128+r5*8]
mova m1, [cq+ 3*128+r5*8]
mova m2, [cq+ 5*128+r5*8]
mova m3, [cq+ 7*128+r5*8]
mova m4, [cq+ 9*128+r5*8]
mova m5, [cq+11*128+r5*8]
mova m6, [cq+13*128+r5*8]
mova m7, [cq+15*128+r5*8]
call m(idct_16x4_internal_16bpc).main_oddhalf
mova m0, [cq+ 0*128+r5*8]
mova m1, [cq+ 2*128+r5*8]
mova m2, [cq+ 4*128+r5*8]
mova m3, [cq+ 6*128+r5*8]
mova m4, [cq+ 8*128+r5*8]
mova m5, [cq+10*128+r5*8]
mova m6, [cq+12*128+r5*8]
mova m7, [cq+14*128+r5*8]
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
call m(idct_16x16_internal_16bpc).round
%if ARCH_X86_64
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
packssdw m8, m9
packssdw m10, m11
packssdw m12, m13
packssdw m14, m15
%endif
call m(idct_8x4_internal_16bpc).transpose4x8packed
movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
movzx t0d, t1b
movzx t2d, t3b
shr t1d, 8
shr t3d, 8
%if ARCH_X86_64
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [rsp+76*16+t0*8], m8
mova [rsp+76*16+t1*8], m9
mova [rsp+76*16+t2*8], m10
mova [rsp+76*16+t3*8], m11
%else
mova [rsp+76*16+t0*8], m0
mova [rsp+76*16+t1*8], m1
mova [rsp+76*16+t2*8], m2
mova [rsp+76*16+t3*8], m3
mova m0, [rsp+ 8*16]
mova m2, [rsp+ 9*16]
mova m4, [rsp+10*16]
mova m6, [rsp+11*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
%endif
mova [rsp+12*16+t0*8], m0
mova [rsp+12*16+t1*8], m1
mova [rsp+12*16+t2*8], m2
mova [rsp+12*16+t3*8], m3
%if ARCH_X86_32
mov r6, [rsp+gprsize*3+(64*2+12)*16]
%endif
pxor m7, m7
REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
sub r5d, 2
jge .loop_pass1
; pass=2
mov eobd, [rsp+gprsize*0+(64*2+12)*16]
cmp eobd, 151
jl .fast
; fall-through
%if ARCH_X86_64
DECLARE_REG_TMP 8, 9
%else
DECLARE_REG_TMP 1, 5
%endif
lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
jmp .run
.fast:
lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
.run:
add rsp, 9*16
%if ARCH_X86_64
lea r2, [dstq+32]
mov r7, -4
%else
lea r2, [rsp+(64*2+3)*16]
mov [r2+4*gprsize], t0
mov [r2+5*gprsize], t1
mov r1, [r2+2*gprsize]
mov dword [r2+0*gprsize], 2
%endif
.loop_pass2:
%if ARCH_X86_32
mov dstq, [r2+1*gprsize]
%endif
call .pass2
add rsp, 64*16
%if ARCH_X86_64
add r7, 2
lea dstq, [r2+r7*8]
jl .loop_pass2
%else
add dword [r2+1*gprsize], 16
dec dword [r2+0*gprsize]
jg .loop_pass2
%endif
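; undo x86inc's stack bookkeeping for the add rsp adjustments made above so
; that RET unwinds correctly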
%assign stack_size (stack_size-(64*2+9)*16)
%if STACK_ALIGNMENT >= 16
%assign stack_size_padded (stack_size_padded-(64*2+9)*16)
%assign stack_offset (stack_offset-(64*2+9)*16)
%else
%xdefine rstkm [rsp + stack_size]
%endif
%if ARCH_X86_64
mov r9, [rsp+gprsize*1+3*16]
%if WIN64
mov r7, [rsp+gprsize*2+3*16]
mov r8, [rsp+gprsize*3+3*16]
%endif
%endif
RET
.pass2:
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
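; only the first 32 of the 64 pass-2 input rows can be nonzero (AV1 discards
; coefficients beyond row 32 for 64-point transforms), so the upper half of
; each sub-transform's input is zeroed instead of loaded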
mova m0, [rsp+gprsize+16* 3]
mova m1, [rsp+gprsize+16* 4]
mova m2, [rsp+gprsize+16* 5]
mova m3, [rsp+gprsize+16* 6]
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
mova [rsp+gprsize+ 3*16], m0
mova [rsp+gprsize+ 4*16], m1
mova [rsp+gprsize+ 5*16], m2
mova [rsp+gprsize+ 6*16], m3
mova [rsp+gprsize+ 7*16], m4
mova [rsp+gprsize+ 8*16], m5
mova [rsp+gprsize+ 9*16], m6
mova [rsp+gprsize+10*16], m7
mova m0, [rsp+gprsize+16*11]
mova m1, [rsp+gprsize+16*12]
mova m2, [rsp+gprsize+16*13]
mova m3, [rsp+gprsize+16*14]
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
mova m7, [rsp+gprsize+ 0*16]
mova [rsp+gprsize+11*16], m0
mova [rsp+gprsize+12*16], m1
mova [rsp+gprsize+13*16], m2
mova [rsp+gprsize+14*16], m3
mova [rsp+gprsize+15*16], m4
mova [rsp+gprsize+16*16], m5
mova [rsp+gprsize+17*16], m6
mova [rsp+gprsize+18*16], m7
%if ARCH_X86_64
call r8
%else
call [r2+4*gprsize]
%endif
mova [rsp+gprsize+ 3*16], m0
mova [rsp+gprsize+ 5*16], m2
mova [rsp+gprsize+ 8*16], m5
mova [rsp+gprsize+10*16], m7
%if ARCH_X86_64
call r9
mova m8, [o(pw_2048)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
%else
call [r2+5*gprsize]
%endif
lea r3, [strideq*3]
lea r4, [rsp+gprsize+ 3*16]
%if ARCH_X86_64
mov r6d, 8
%else
mov dword [r2+2*gprsize], 8
%endif
.loop_write:
mova m0, [r4+0*16]
mova m1, [r4+1*16]
mova m2, [r4+2*16]
mova m3, [r4+3*16]
mova m4, [r4+4*16]
mova m5, [r4+5*16]
mova m6, [r4+6*16]
mova m7, [r4+7*16]
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
lea dstq, [dstq+strideq*8]
add r4, 8*16
%if ARCH_X86_64
dec r6d
%else
dec dword [r2+2*gprsize]
%endif
jg .loop_write
ret
.dconly:
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add r5d, 640
sar r5d, 10
add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
dst, stride, c, eob
LEA r6, base
test eobd, eobd
jz .dconly
%if ARCH_X86_32
DECLARE_REG_TMP 4, 1, 2, 0
mov [rsp+gprsize*1+(64*4+32)*16], r0
mov [rsp+gprsize*2+(64*4+32)*16], r1
mov [rsp+gprsize*3+(64*4+32)*16], r2
%else
DECLARE_REG_TMP 8, 9, 4, 7
mov [rsp+gprsize*1+(64*4+32)*16], r9
%if WIN64
mov [rsp+gprsize*2+(64*4+32)*16], r7
mov [rsp+gprsize*3+(64*4+32)*16], r8
%endif
%endif
%undef cmp
; remove entirely-zero iterations
mov r5d, 7*2
cmp eobw, word [o2(tbl_32x32_2d)+r5]
jge .end_zero_loop
pxor m0, m0
.zero_loop:
movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
movzx t0d, t1b
movzx t2d, t3b
shr t1d, 8
shr t3d, 8
mova [rsp+ 32*16+t0*8], m0
mova [rsp+ 32*16+t1*8], m0
mova [rsp+ 32*16+t2*8], m0
mova [rsp+ 32*16+t3*8], m0
mova [rsp+ 96*16+t0*8], m0
mova [rsp+ 96*16+t1*8], m0
mova [rsp+ 96*16+t2*8], m0
mova [rsp+ 96*16+t3*8], m0
mova [rsp+160*16+t0*8], m0
mova [rsp+160*16+t1*8], m0
mova [rsp+160*16+t2*8], m0
mova [rsp+160*16+t3*8], m0
mova [rsp+224*16+t0*8], m0
mova [rsp+224*16+t1*8], m0
mova [rsp+224*16+t2*8], m0
mova [rsp+224*16+t3*8], m0
sub r5d, 2
cmp eobw, word [o2(tbl_32x32_2d)+r5]
jl .zero_loop
.end_zero_loop:
; actual first pass after skipping all-zero data
mov [rsp+gprsize*0+(64*4+32)*16], eobd
mov r3, rsp
%if ARCH_X86_32
DECLARE_REG_TMP 4, 1, 6, 0
mov r2, [rsp+gprsize*3+(64*4+32)*16]
mov [rsp+gprsize*3+(64*4+32)*16], r6
%endif
.loop_pass1:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mova m0, [cq+128* 1+r5*8]
mova m1, [cq+128* 7+r5*8]
mova m2, [cq+128* 9+r5*8]
mova m3, [cq+128*15+r5*8]
mova m4, [cq+128*17+r5*8]
mova m5, [cq+128*23+r5*8]
mova m6, [cq+128*25+r5*8]
mova m7, [cq+128*31+r5*8]
mov r3, rsp
call m(idct_8x4_internal_16bpc).rect2_mul
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
mova m0, [cq+128* 3+r5*8]
mova m1, [cq+128* 5+r5*8]
mova m2, [cq+128*11+r5*8]
mova m3, [cq+128*13+r5*8]
mova m4, [cq+128*19+r5*8]
mova m5, [cq+128*21+r5*8]
mova m6, [cq+128*27+r5*8]
mova m7, [cq+128*29+r5*8]
%if ARCH_X86_32
add r3, 16*8
%endif
call m(idct_8x4_internal_16bpc).rect2_mul
%if ARCH_X86_32
sub r3, 16*8
%endif
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
add r3, 16*(16+4*ARCH_X86_32)
mova m0, [cq+128* 2+r5*8]
mova m1, [cq+128* 6+r5*8]
mova m2, [cq+128*10+r5*8]
mova m3, [cq+128*14+r5*8]
mova m4, [cq+128*18+r5*8]
mova m5, [cq+128*22+r5*8]
mova m6, [cq+128*26+r5*8]
mova m7, [cq+128*30+r5*8]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(idct_16x4_internal_16bpc).main_oddhalf
mova m0, [cq+128* 0+r5*8]
mova m1, [cq+128* 4+r5*8]
mova m2, [cq+128* 8+r5*8]
mova m3, [cq+128*12+r5*8]
mova m4, [cq+128*16+r5*8]
mova m5, [cq+128*20+r5*8]
mova m6, [cq+128*24+r5*8]
mova m7, [cq+128*28+r5*8]
call m(idct_8x4_internal_16bpc).rect2_mul
call m(idct_8x4_internal_16bpc).main_pass1
call m(idct_8x4_internal_16bpc).round
sub r3, 16*(16+4*ARCH_X86_32)
call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
movzx t0d, t1b
movzx t2d, t3b
shr t1d, 8
shr t3d, 8
%if ARCH_X86_64
call m(idct_8x4_internal_16bpc).transpose4x8packed
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [rsp+160*16+t0*8], m8
mova [rsp+160*16+t1*8], m9
mova [rsp+160*16+t2*8], m10
mova [rsp+160*16+t3*8], m11
mova m8, [r3+16* 9] ; 8 9
mova m10, [r3+16*11] ; 10 11
mova m12, [r3+16*13] ; 12 13
mova m14, [r3+16*15] ; 14 15
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [rsp+ 96*16+t0*8], m8
mova [rsp+ 96*16+t1*8], m9
mova [rsp+ 96*16+t2*8], m10
mova [rsp+ 96*16+t3*8], m11
mova m8, [r3+16* 8] ; 24 25
mova m10, [r3+16*10] ; 26 27
mova m12, [r3+16*12] ; 28 29
mova m14, [r3+16*14] ; 30 31
call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
mova [rsp+224*16+t0*8], m8
mova [rsp+224*16+t1*8], m9
mova [rsp+224*16+t2*8], m10
mova [rsp+224*16+t3*8], m11
%else
sub r3, 8*16
mova m0, [r3+ 8*16]
mova m2, [r3+10*16]
mova m4, [r3+12*16]
mova m6, [r3+14*16]
packssdw m0, [r3+ 9*16]
packssdw m2, [r3+11*16]
packssdw m4, [r3+13*16]
packssdw m6, [r3+15*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+ 96*16+t0*8], m0
mova [rsp+ 96*16+t1*8], m1
mova [rsp+ 96*16+t2*8], m2
mova [rsp+ 96*16+t3*8], m3
mova m0, [r3+16*16]
mova m2, [r3+18*16]
mova m4, [r3+20*16]
mova m6, [r3+22*16]
packssdw m0, [r3+17*16]
packssdw m2, [r3+19*16]
packssdw m4, [r3+21*16]
packssdw m6, [r3+23*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+160*16+t0*8], m0
mova [rsp+160*16+t1*8], m1
mova [rsp+160*16+t2*8], m2
mova [rsp+160*16+t3*8], m3
mova m0, [r3+31*16]
mova m2, [r3+29*16]
mova m4, [r3+27*16]
mova m6, [r3+25*16]
packssdw m0, [r3+30*16]
packssdw m2, [r3+28*16]
packssdw m4, [r3+26*16]
packssdw m6, [r3+24*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [rsp+224*16+t0*8], m0
mova [rsp+224*16+t1*8], m1
mova [rsp+224*16+t2*8], m2
mova [rsp+224*16+t3*8], m3
mova m0, [r3+ 0*16]
mova m2, [r3+ 2*16]
mova m4, [r3+ 4*16]
mova m6, [r3+ 6*16]
packssdw m0, [r3+ 1*16]
packssdw m2, [r3+ 3*16]
packssdw m4, [r3+ 5*16]
packssdw m6, [r3+ 7*16]
call m(idct_8x4_internal_16bpc).transpose4x8packed
%endif
mova [rsp+ 32*16+t0*8], m0
mova [rsp+ 32*16+t1*8], m1
mova [rsp+ 32*16+t2*8], m2
mova [rsp+ 32*16+t3*8], m3
pxor m0, m0
REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \
8, 9, 10, 11, 12, 13, 14, 15, \
16, 17, 18, 19, 20, 21, 22, 23, \
24, 25, 26, 27, 28, 29, 30, 31
%if ARCH_X86_32
mov r6, [rsp+gprsize*3+(64*4+32)*16]
%endif
sub r5d, 2
jge .loop_pass1
; pass=2
mov eobd, [rsp+gprsize*0+(64*4+32)*16]
cmp eobd, 136
jl .fast
; fall-through
%if ARCH_X86_64
DECLARE_REG_TMP 8, 9
%else
DECLARE_REG_TMP 1, 5
%endif
lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
jmp .run
.fast:
lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
.run:
add rsp, 29*16
%if ARCH_X86_64
lea r2, [dstq+64]
mov r7, -8
%else
lea r2, [rsp+(64*4+3)*16]
mov [r2+4*gprsize], t0
mov [r2+5*gprsize], t1
mov r1, [r2+2*gprsize]
mov dword [r2+0*gprsize], 4
%endif
jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
.dconly:
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 384
sar r5d, 9
add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
dst, stride, c, eob
LEA r6, base
test eobd, eobd
jz .dconly
; remove entirely-zero iterations
%undef cmp
mov r5d, 8
.zero_loop:
sub r5d, 2
cmp eobw, word [o2(tbl_32x16_2d)+r5]
jl .zero_loop
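; nothing needs clearing here: pass-1 results land in cq, and the skipped
; region of cq still holds all-zero coefficients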
; actual first pass after skipping all-zero data
.loop_pass1:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mov r3, rsp
lea r4, [o(idct64_mul_16bpc)]
mova m0, [cq+64* 1+r5*8]
mova m1, [cq+64*31+r5*8]
mova m2, [cq+64*17+r5*8]
mova m3, [cq+64*15+r5*8]
call .main_part1
mova m0, [cq+64* 7+r5*8]
mova m1, [cq+64*25+r5*8]
mova m2, [cq+64*23+r5*8]
mova m3, [cq+64* 9+r5*8]
call .main_part1
mova m0, [cq+64* 5+r5*8]
mova m1, [cq+64*27+r5*8]
mova m2, [cq+64*21+r5*8]
mova m3, [cq+64*11+r5*8]
call .main_part1
mova m0, [cq+64* 3+r5*8]
mova m1, [cq+64*29+r5*8]
mova m2, [cq+64*19+r5*8]
mova m3, [cq+64*13+r5*8]
call .main_part1
call .main_part2
mova m0, [cq+64* 2+r5*8]
mova m1, [cq+64*14+r5*8]
mova m2, [cq+64*18+r5*8]
mova m3, [cq+64*30+r5*8]
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
mova m0, [cq+64* 6+r5*8]
mova m1, [cq+64*10+r5*8]
mova m2, [cq+64*22+r5*8]
mova m3, [cq+64*26+r5*8]
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
add r3, 16*(24+4*ARCH_X86_32)
mova m0, [cq+64* 4+r5*8]
mova m1, [cq+64*12+r5*8]
mova m2, [cq+64*20+r5*8]
mova m3, [cq+64*28+r5*8]
call m(idct_16x4_internal_16bpc).main_oddhalf_fast
mova m0, [cq+64* 0+r5*8]
mova m1, [cq+64* 8+r5*8]
mova m2, [cq+64*16+r5*8]
mova m3, [cq+64*24+r5*8]
call m(idct_8x4_internal_16bpc).main_pass1_fast
call m(idct_8x4_internal_16bpc).round
mova [r3-(7+4*ARCH_X86_32)*16], m1
mova [r3-(6+4*ARCH_X86_32)*16], m2
mova [r3-(5+4*ARCH_X86_32)*16], m3
mova [r3-(4+4*ARCH_X86_32)*16], m4
mova [r3-(3+4*ARCH_X86_32)*16], m5
mova [r3-(2+4*ARCH_X86_32)*16], m6
mova [r3-(1+4*ARCH_X86_32)*16], m7
sub r3, 16*(40+4*ARCH_X86_32-4)
%if ARCH_X86_64
psrld m15, m11, 10 ; pd_2
%else
mova m7, [o(pd_2)]
%endif
call .main_end_loop_start
lea r3, [rsp+56*16]
lea r4, [cq+r5*8+64*28]
call .shift_transpose
sub r5d, 2
jge .loop_pass1
; pass=2; this must be a call rather than a jump, otherwise the stack
; pointer has the wrong offset in the 8-bit code
call .pass2
RET
.pass2:
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9
mova m10, [o(pixel_10bpc_max)]
%if WIN64
mov [rsp+16*16+gprsize], r7
%endif
mov r7, dstq
%else
mov [rsp+2*gprsize+16*16], dstq
%endif
lea r3, [strideq*3]
mov r4d, 8
jmp m(idct_16x16_internal_16bpc).loop_pass2
.main_part1: ; idct64 steps 1-5
; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
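; each call consumes 12 dwords of the idct64_mul_16bpc table via r4 and runs
; the first rotation and butterfly stages for one group of four inputs,
; leaving eight t-values at [r3] before advancing r3 by 16*8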
%if ARCH_X86_64
movd m7, [r4+4*0]
movd m8, [r4+4*1]
movd m6, [r4+4*2]
movd m9, [r4+4*3]
movd m5, [r4+4*4]
movd m10, [r4+4*5]
movd m4, [r4+4*6]
movd m15, [r4+4*7]
REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15
pmulld m7, m0 ; t63a
pmulld m0, m8 ; t32a
pmulld m6, m1 ; t62a
pmulld m1, m9 ; t33a
pmulld m5, m2 ; t61a
pmulld m2, m10 ; t34a
pmulld m4, m3 ; t60a
pmulld m3, m15 ; t35a
movd m10, [r4+4*8]
movd m15, [r4+4*9]
REPX {pshufd x, x, q0000}, m10, m15
REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
psubd m8, m0, m1 ; t33
paddd m0, m1 ; t32
psubd m1, m7, m6 ; t62
paddd m7, m6 ; t63
psubd m6, m3, m2 ; t34
paddd m3, m2 ; t35
psubd m2, m4, m5 ; t61
paddd m4, m5 ; t60
REPX {pmaxsd x, m12}, m8, m1, m6, m2
REPX {pminsd x, m13}, m8, m1, m6, m2
ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
REPX {pmaxsd x, m12}, m0, m3, m7, m4
REPX {pminsd x, m13}, m0, m3, m7, m4
movd m10, [r4+4*10]
movd m15, [r4+4*11]
REPX {pshufd x, x, q0000}, m10, m15
psubd m5, m0, m3 ; t35a
paddd m0, m3 ; t32a
psubd m3, m7, m4 ; t60a
paddd m7, m4 ; t63a
psubd m4, m1, m6 ; t34
paddd m1, m6 ; t33
psubd m6, m8, m2 ; t61
paddd m8, m2 ; t62
REPX {pmaxsd x, m12}, m5, m3, m4, m6
REPX {pminsd x, m13}, m5, m3, m4, m6
ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
REPX {pmaxsd x, m12}, m0, m7, m1, m8
REPX {pminsd x, m13}, m0, m7, m1, m8
add r4, 4*12
mova [r3+16*0], m0
mova [r3+16*7], m7
mova [r3+16*1], m1
mova [r3+16*6], m8
mova [r3+16*2], m6
mova [r3+16*5], m4
mova [r3+16*3], m3
mova [r3+16*4], m5
%else
movd m7, [r4+4*0]
movd m6, [r4+4*2]
movd m5, [r4+4*4]
movd m4, [r4+4*6]
REPX {pshufd x, x, q0000}, m7, m6, m5, m4
pmulld m7, m0 ; t63a
pmulld m6, m1 ; t62a
pmulld m5, m2 ; t61a
pmulld m4, m3 ; t60a
mova [r3+0*16], m6
mova [r3+1*16], m7
movd m6, [r4+4*1]
movd m7, [r4+4*3]
REPX {pshufd x, x, q0000}, m7, m6
pmulld m0, m6 ; t32a
pmulld m1, m7 ; t33a
movd m6, [r4+4*5]
movd m7, [r4+4*7]
REPX {pshufd x, x, q0000}, m7, m6
pmulld m2, m6 ; t34a
pmulld m3, m7 ; t35a
mova m6, [r3+0*16]
mova m7, [o(pd_2048)]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
paddd m7, [r3+1*16]
REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
mova [r3+0*16], m5
psubd m5, m0, m1 ; t33
paddd m0, m1 ; t32
mova [r3+1*16], m0
mova m0, [r3+0*16]
psubd m1, m7, m6 ; t62
paddd m7, m6 ; t63
psubd m6, m3, m2 ; t34
paddd m3, m2 ; t35
psubd m2, m4, m0 ; t61
paddd m4, m0 ; t60
mova m0, [o(clip_18b_min)]
REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
pmaxsd m0, [r3+1*16]
mova [r3+0*16], m0
mova m0, [o(clip_18b_max)]
REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
pminsd m0, [r3+0*16]
mova [r3+0*16], m0
mova [r3+1*16], m3
mova [r3+2*16], m4
mova [r3+3*16], m7
mova m0, [o(pd_2048)]
movd m3, [r4+4*8]
movd m4, [r4+4*9]
REPX {pshufd x, x, q0000}, m3, m4
mova [r3+4*16], m2
ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a
mova m2, [r3+4*16]
mova [r3+4*16], m5
ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
mova m0, [r3+0*16]
mova m3, [r3+1*16]
mova m4, [r3+2*16]
mova m7, [r3+3*16]
psubd m5, m0, m3 ; t35a
paddd m0, m3 ; t32a
mova [r3+0*16], m5
mova m5, [r3+4*16]
psubd m3, m7, m4 ; t60a
paddd m7, m4 ; t63a
psubd m4, m1, m6 ; t34
paddd m1, m6 ; t33
psubd m6, m5, m2 ; t61
paddd m2, m5 ; t62
mova m5, [o(clip_18b_min)]
REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
pmaxsd m5, [r3+0*16]
mova [r3+0*16], m5
mova m5, [o(clip_18b_max)]
REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
pminsd m5, [r3+0*16]
mova [r3+16*0], m0
mova [r3+16*7], m7
mova [r3+16*1], m1
mova [r3+16*6], m2
mova [r3+16*2], m4
mova m7, [o(pd_2048)]
movd m0, [r4+4*10]
movd m1, [r4+4*11]
REPX {pshufd x, x, q0000}, m0, m1
ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60
mova [r3+16*3], m3
mova [r3+16*4], m5
mova m4, [r3+2*16]
ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
add r4, 4*12
mova [r3+16*2], m6
mova [r3+16*5], m4
%endif
add r3, 16*8
ret
.main_part2: ; idct64 steps 6-9
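; r3 walks up and r4 walks down over the stored t-values: each iteration
; butterflies a mirrored pair of quads, applies the 1567/3784 rotation and
; the 2896/4096 (~1/sqrt(2)) scaling, and stops when the pointers meet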
lea r4, [r3+16*7]
%if ARCH_X86_64
mova m10, [o(pd_1567)]
mova m15, [o(pd_3784)]
.main_part2_loop:
mova m0, [r3-16*32] ; t32a
mova m1, [r4-16*24] ; t39a
mova m2, [r4-16*32] ; t63a
mova m3, [r3-16*24] ; t56a
mova m4, [r3-16*16] ; t40a
mova m5, [r4-16* 8] ; t47a
mova m6, [r4-16*16] ; t55a
mova m7, [r3-16* 8] ; t48a
psubd m8, m0, m1 ; t39
paddd m0, m1 ; t32
psubd m1, m2, m3 ; t56
paddd m2, m3 ; t63
psubd m3, m5, m4 ; t40
paddd m5, m4 ; t47
psubd m4, m7, m6 ; t55
paddd m7, m6 ; t48
REPX {pmaxsd x, m12}, m8, m1, m3, m4
REPX {pminsd x, m13}, m8, m1, m3, m4
ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
REPX {pmaxsd x, m12}, m0, m2, m5, m7
REPX {pminsd x, m13}, m0, m5, m2, m7
psubd m6, m2, m7 ; t48a
paddd m2, m7 ; t63a
psubd m7, m0, m5 ; t47a
paddd m0, m5 ; t32a
psubd m5, m8, m4 ; t55
paddd m8, m4 ; t56
psubd m4, m1, m3 ; t40
paddd m1, m3 ; t39
REPX {pmaxsd x, m12}, m6, m7, m5, m4
REPX {pminsd x, m13}, m6, m7, m5, m4
REPX {pmulld x, m14}, m6, m7, m5, m4
REPX {pmaxsd x, m12}, m2, m0, m8, m1
REPX {pminsd x, m13}, m2, m0, m8, m1
paddd m6, m11 ; add pd_2048 rounding; it carries into both the
paddd m5, m11 ; sum and the difference computed below
psubd m3, m6, m7 ; t47
paddd m6, m7 ; t48
psubd m7, m5, m4 ; t40a
paddd m5, m4 ; t55a
REPX {psrad x, 12}, m3, m6, m7, m5
mova [r4-16* 8], m2
mova [r3-16*32], m0
mova [r3-16* 8], m8
mova [r4-16*32], m1
mova [r4-16*24], m3
mova [r3-16*16], m6
mova [r3-16*24], m7
mova [r4-16*16], m5
%else
.main_part2_loop:
mova m0, [r3-16*32] ; t32a
mova m1, [r4-16*24] ; t39a
mova m2, [r4-16*32] ; t63a
mova m3, [r3-16*24] ; t56a
mova m4, [r3-16*16] ; t40a
mova m5, [r4-16* 8] ; t47a
mova m6, [r4-16*16] ; t55a
psubd m7, m0, m1 ; t39
paddd m0, m1 ; t32
mova [r3+0*16], m7
mova m7, [r3-16* 8] ; t48a
psubd m1, m2, m3 ; t56
paddd m2, m3 ; t63
psubd m3, m5, m4 ; t40
paddd m5, m4 ; t47
psubd m4, m7, m6 ; t55
paddd m7, m6 ; t48
mova m6, [o(clip_18b_min)]
REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
pmaxsd m6, [r3+0*16]
mova [r3+0*16], m6
mova m6, [o(clip_18b_max)]
REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
pminsd m6, [r3+0*16]
mova [r3+0*16], m0
mova [r3+1*16], m2
mova [r3+2*16], m5
mova [r3+3*16], m7
mova m0, [o(pd_2048)]
ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a
ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a
mova m2, [r3+1*16]
mova m7, [r3+3*16]
psubd m5, m2, m7 ; t48a
paddd m2, m7 ; t63a
mova [r3+1*16], m5
mova m0, [r3+0*16]
mova m5, [r3+2*16]
psubd m7, m0, m5 ; t47a
paddd m0, m5 ; t32a
psubd m5, m6, m4 ; t55
paddd m6, m4 ; t56
psubd m4, m1, m3 ; t40
paddd m1, m3 ; t39
mova m3, [o(clip_18b_min)]
REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
pmaxsd m3, [r3+1*16]
mova [r3+0*16], m3
mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
pminsd m3, [r3+0*16]
mova [r4-16* 8], m2
mova [r3-16*32], m0
mova [r3-16* 8], m6
mova [r4-16*32], m1
mova m0, [o(pd_2896)]
mova m1, [o(pd_2048)]
REPX {pmulld x, m0}, m3, m7, m5, m4
REPX {paddd x, m1}, m3, m5
psubd m6, m3, m7 ; t47
paddd m3, m7 ; t48
psubd m7, m5, m4 ; t40a
paddd m5, m4 ; t55a
REPX {psrad x, 12}, m6, m3, m7, m5
mova [r4-16*24], m6
mova [r3-16*16], m3
mova [r3-16*24], m7
mova [r4-16*16], m5
%endif
add r3, 16
sub r4, 16
cmp r3, r4 ; r3 walks up, r4 walks down; stop when they meet
jl .main_part2_loop
sub r3, 4*16
ret
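; Final butterflies: fold the idct8, idct16 and idct32 partial results
; stored earlier in the frame into the 64 idct64 outputs, one
; sum/difference pair per stage with 18-bit clipping in between. The
; caller preloads the pass-1 rounding constant (pd_1 or pd_2, matching
; its later down-shift) into m15 on x86-64 / m7 on x86-32; outputs stay
; unshifted here and are shifted in .shift_transpose.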
.main_end_loop:
mova m0, [r3+16*28] ; idct8 0 + n
.main_end_loop_start:
mova m2, [r3+16*12] ; idct32 16 + n
mova m3, [r4+16*12] ; idct32 31 - n
%if ARCH_X86_64
mova m1, [r4+16*28] ; idct16 15 - n
mova m4, [r4-16* 4] ; idct64 63 - n
mova m5, [r3-16* 4] ; idct64 48 + n
mova m6, [r4-16*20] ; idct64 47 - n
mova m7, [r3-16*20] ; idct64 32 + n
pmaxsd m0, m12
pminsd m0, m13
paddd m8, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m12}, m8, m0
REPX {pminsd x, m13}, m8, m0
paddd m1, m8, m3 ; idct32 out0 + n
psubd m8, m3 ; idct32 out31 - n
paddd m3, m0, m2 ; idct32 out15 - n
psubd m0, m2 ; idct32 out16 + n
REPX {pmaxsd x, m12}, m1, m8, m3, m0
REPX {pminsd x, m13}, m1, m3, m8, m0
REPX {paddd x, m15}, m1, m3, m0, m8
paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
psubd m1, m4 ; idct64 out63 - n (unshifted)
paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
psubd m3, m5 ; idct64 out48 + n (unshifted)
paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
psubd m0, m6 ; idct64 out47 - n (unshifted)
paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
psubd m8, m7 ; idct64 out32 + n (unshifted)
mova [r3-16*20], m2
mova [r4+16*28], m1
mova [r4-16*20], m4
mova [r3+16*28], m3
mova [r3-16* 4], m5
mova [r4+16*12], m0
mova [r4-16* 4], m6
mova [r3+16*12], m8
%else
mova m5, [o(clip_18b_min)]
mova m6, [o(clip_18b_max)]
mova m1, [r3+16*44] ; idct16 15 - n
pmaxsd m0, m5
pminsd m0, m6
paddd m4, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m5}, m4, m0
REPX {pminsd x, m6}, m4, m0
paddd m1, m4, m3 ; idct32 out0 + n
psubd m4, m3 ; idct32 out31 - n
paddd m3, m0, m2 ; idct32 out15 - n
psubd m0, m2 ; idct32 out16 + n
REPX {pmaxsd x, m5}, m1, m4, m3, m0
REPX {pminsd x, m6}, m1, m3, m4, m0
REPX {paddd x, m7}, m1, m3, m0, m4
mova m5, [r4-16* 4] ; idct64 63 - n
mova m6, [r3-16* 4] ; idct64 48 + n
paddd m2, m1, m5 ; idct64 out0 + n (unshifted)
psubd m1, m5 ; idct64 out63 - n (unshifted)
paddd m5, m3, m6 ; idct64 out15 - n (unshifted)
psubd m3, m6 ; idct64 out48 + n (unshifted)
mova [r4+16*28], m1
mova [r3+16*28], m3
mova m6, [r4-16*20] ; idct64 47 - n
mova m1, [r3-16*20] ; idct64 32 + n
mova [r3-16*20], m2
mova [r4-16*20], m5
paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
psubd m0, m6 ; idct64 out47 - n (unshifted)
paddd m6, m4, m1 ; idct64 out31 - n (unshifted)
psubd m4, m1 ; idct64 out32 + n (unshifted)
mova [r3-16* 4], m5
mova [r4+16*12], m0
mova [r4-16* 4], m6
mova [r3+16*12], m4
%endif
sub r4, 16
add r3, 16
cmp r3, r4
jl .main_end_loop
ret
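; Shift the pass-1 output right by 2, saturate to 16 bit, transpose in
; 8x4 blocks and store to the pass-2 buffer at r4, walking r3 back down
; to the bottom of the stack frame.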
.shift_transpose:
mova m0, [r3+0*16]
mova m1, [r3+1*16]
mova m2, [r3+2*16]
mova m3, [r3+3*16]
mova m4, [r3+4*16]
mova m5, [r3+5*16]
mova m6, [r3+6*16]
mova m7, [r3+7*16]
REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [r4+0*64], m0
mova [r4+1*64], m1
mova [r4+2*64], m2
mova [r4+3*64], m3
sub r4, 4*64
sub r3, 8*16
cmp r3, rsp
jg .shift_transpose
ret
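; DC-only path: a 2-D DCT of a lone DC coefficient yields a constant
; block, so the final pixel offset is computed scalar-side (the *181
; and *2896 multiplies approximate the sqrt(2) normalizations, with the
; intermediate rounding offsets folded into the add constants) and then
; added to dst with clamping to [0, pixel_10bpc_max].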
.dconly:
imul r5d, [cq], 181 ; dc * 181; 181/256 ~= 1/sqrt(2) per-pass DC scale
mov [cq], eobd ; 0
mov r3d, 16
.dconly1:
add r5d, 640 ; 640 folds the *181 >>8 rounding (128) with the pass-1 >>2 (2<<8)
sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816 ; 34816 = 2048 (>>12 rounding) + 8<<12 (final >>4 rounding)
movd m0, r5d
pshuflw m0, m0, q1111 ; broadcast word 1, i.e. (dc*2896+34816)>>16
punpcklqdq m0, m0
mova m6, [o(pixel_10bpc_max)]
pxor m5, m5
.dconly_loop:
paddw m1, m0, [dstq+16*0]
paddw m2, m0, [dstq+16*1]
paddw m3, m0, [dstq+16*2]
paddw m4, m0, [dstq+16*3]
REPX {pmaxsw x, m5}, m1, m2, m3, m4
REPX {pminsw x, m6}, m1, m2, m3, m4
mova [dstq+16*0], m1
mova [dstq+16*1], m2
mova [dstq+16*2], m3
mova [dstq+16*3], m4
add dstq, 64
btc r3d, 16 ; bit 16 toggles between the two 64-byte halves of a row
jnc .dconly_loop
lea dstq, [dstq+strideq-128] ; advance to the next row
dec r3d
jg .dconly_loop
RET
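; 64x32 inverse DCT-DCT. Pass 1 reuses the 64x16 idct64 helpers on
; rect2-prescaled coefficients (.rect2_mul_fast below); pass 2 jumps
; into the 16x32 kernel, running the 8bpc ssse3 8x32 column idct
; variant selected in r4.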
cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
dst, stride, c, eob
LEA r6, base
test eobd, eobd
jz .dconly
%if ARCH_X86_32
DECLARE_REG_TMP 0, 4, 1
mov [rsp+(8*32+64+8)*16+1*gprsize], dstq
mov [rsp+(8*32+64+8)*16+2*gprsize], strideq
%else
DECLARE_REG_TMP 4, 7, 8
%if WIN64
mov [rsp+(8*32+64+1)*16+1*gprsize], r7
mov [rsp+64*16+0*gprsize], r8
%endif
%endif
%undef cmp
; "remove" entirely-zero iterations: rather than run pass 1 on all-zero
; input, zero-fill the corresponding output slots; tbl_32x32_2d holds
; the lowest eob at which each iteration pair carries nonzero coefs
mov r5d, 14
cmp eobw, word [o2(tbl_32x32_2d)+r5]
jge .end_zero_loop
pxor m0, m0
.zero_loop:
movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] ; two packed byte offsets
movzx t1d, t0b ; low byte
shr t0d, 8 ; high byte
lea t2, [rsp+7*32*16]
.zero_loop_inner:
mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
sub t2, 32*16
cmp t2, rsp
jge .zero_loop_inner
sub r5d, 2
cmp eobw, word [o2(tbl_32x32_2d)+r5]
jl .zero_loop
.end_zero_loop:
mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd
; actual first pass after skipping all-zero data
.loop_pass1:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mov r3, rsp
lea r4, [o(idct64_mul_16bpc)]
mova m0, [cq+128* 1+r5*8]
mova m1, [cq+128*31+r5*8]
mova m2, [cq+128*17+r5*8]
mova m3, [cq+128*15+r5*8]
call .rect2_mul_fast
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
mova m0, [cq+128* 7+r5*8]
mova m1, [cq+128*25+r5*8]
mova m2, [cq+128*23+r5*8]
mova m3, [cq+128* 9+r5*8]
call .rect2_mul_fast
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
mova m0, [cq+128* 5+r5*8]
mova m1, [cq+128*27+r5*8]
mova m2, [cq+128*21+r5*8]
mova m3, [cq+128*11+r5*8]
call .rect2_mul_fast
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
mova m0, [cq+128* 3+r5*8]
mova m1, [cq+128*29+r5*8]
mova m2, [cq+128*19+r5*8]
mova m3, [cq+128*13+r5*8]
call .rect2_mul_fast
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
mova m0, [cq+128* 2+r5*8]
mova m1, [cq+128*14+r5*8]
mova m2, [cq+128*18+r5*8]
mova m3, [cq+128*30+r5*8]
call .rect2_mul_fast
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
mova m0, [cq+128* 6+r5*8]
mova m1, [cq+128*10+r5*8]
mova m2, [cq+128*22+r5*8]
mova m3, [cq+128*26+r5*8]
call .rect2_mul_fast
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
add r3, 16*(24+4*ARCH_X86_32)
mova m0, [cq+128* 4+r5*8]
mova m1, [cq+128*12+r5*8]
mova m2, [cq+128*20+r5*8]
mova m3, [cq+128*28+r5*8]
call .rect2_mul_fast
call m(idct_16x4_internal_16bpc).main_oddhalf_fast
mova m0, [cq+128* 0+r5*8]
mova m1, [cq+128* 8+r5*8]
mova m2, [cq+128*16+r5*8]
mova m3, [cq+128*24+r5*8]
call .rect2_mul_fast
call m(idct_8x4_internal_16bpc).main_pass1_fast
call m(idct_8x4_internal_16bpc).round
mova [r3-(7+4*ARCH_X86_32)*16], m1
mova [r3-(6+4*ARCH_X86_32)*16], m2
mova [r3-(5+4*ARCH_X86_32)*16], m3
mova [r3-(4+4*ARCH_X86_32)*16], m4
mova [r3-(3+4*ARCH_X86_32)*16], m5
mova [r3-(2+4*ARCH_X86_32)*16], m6
mova [r3-(1+4*ARCH_X86_32)*16], m7
sub r3, 16*(40+4*ARCH_X86_32-4)
%if ARCH_X86_64
psrld m15, m11, 11 ; pd_1
%else
mova m7, [o(pd_1)]
%endif
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
lea r3, [rsp+56*16]
lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
movzx t1d, t0b
shr t0d, 8
call .shift_transpose
; zero cq
pxor m7, m7
lea r4, [cq+30*128+r5*8]
.zero_cq_loop:
REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
sub r4, 4*128
cmp r4, cq
jg .zero_cq_loop
sub r5d, 2
jge .loop_pass1
; pass=2 code starts here
mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
%if ARCH_X86_32
mov strideq, [rsp+gprsize*2+(8*32+64+8)*16]
%elif WIN64
mov r8, [rsp+gprsize*0+64*16]
%endif
add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
; eob thresholds (36, 136) appear to bound how much of the coefficient
; area can be nonzero; pick the cheapest 8bpc 8x32 column variant
cmp eobd, 36
jl .load_veryfast
cmp eobd, 136
jl .load_fast
; load normal
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
jmp .run
.load_fast:
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
jmp .run
.load_veryfast:
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
; fall-through
.run:
%if ARCH_X86_64
lea r2, [dstq+128]
mov r7, -16
%else
lea r2, [rsp+(8*32+3)*16]
mov dword [r2+0*gprsize], 8
%endif
jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
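; 64x32 is a 2:1 rectangular transform, so every input coefficient is
; prescaled by 2896/4096 = 1/sqrt(2), rounded with pd_2048 before >>12.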
.rect2_mul_fast:
%if ARCH_X86_64
REPX {pmulld x, m14}, m0, m1, m2, m3
REPX {paddd x, m11}, m0, m1, m2, m3
%else
mova m4, [o(pd_2896)]
mova m5, [o(pd_2048)]
REPX {pmulld x, m4 }, m0, m1, m2, m3
REPX {paddd x, m5 }, m0, m1, m2, m3
%endif
REPX {psrad x, 12 }, m0, m1, m2, m3
ret
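; Like the 64x16 .shift_transpose, but with a >>1 pass-1 shift and a
; scattered store: tbl_Nx32_odd_offset supplies the interleaved row
; offsets for the 32-row pass-2 layout.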
.shift_transpose:
mova m0, [r3+0*16]
mova m1, [r3+1*16]
mova m2, [r3+2*16]
mova m3, [r3+3*16]
mova m4, [r3+4*16]
mova m5, [r3+5*16]
mova m6, [r3+6*16]
mova m7, [r3+7*16]
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 ; 64x32 pass 1 shifts by 1
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [t2+0*16+r5*8], m0
mova [t2+8*16+r5*8], m2
mova [t2+0*16+t0*8], m3
mova [t2+0*16+t1*8], m1
sub t2, 16*32
sub r3, 8*16
cmp r3, rsp
jg .shift_transpose
ret
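; DC-only path: fold the rect2 1/sqrt(2) prescale and the pass-1
; normalization into the DC value, shrink the stack frame, then reuse
; the shared 64-wide add loop via the 64x16 .dconly2 entry.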
.dconly:
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add r5d, 128
sar r5d, 8 ; dc = (dc*181+128)>>8: rect2 1/sqrt(2) prescale
imul r5d, 181
add r5d, 384 ; 384 folds the *181 >>8 rounding with the pass-1 >>1
sar r5d, 9
add rsp, (1+8*32+1*WIN64)*16
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
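; 64x64 inverse DCT-DCT. Pass 1 is the same idct64 as in 64x16 (no
; rect2 prescale); pass 2 tail-calls the 16x64 kernel with the 8bpc
; ssse3 32-point and 64-point column routines picked by eob.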
cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
dst, stride, c, eob
LEA r6, base
test eobd, eobd
jz .dconly
%if ARCH_X86_32
DECLARE_REG_TMP 4, 1, 2, 0, 6
mov [rsp+gprsize*1+(64*9+8)*16], r0
mov [rsp+gprsize*2+(64*9+8)*16], r1
mov [rsp+gprsize*3+(64*9+8)*16], r2
mov [rsp+gprsize*4+(64*9+8)*16], r6
%else
DECLARE_REG_TMP 8, 9, 4, 7, 0
mov [rsp+gprsize*1+(64*9+1)*16], r9
mov [rsp+gprsize*0+64*16], r0
%if WIN64
mov [rsp+gprsize*2+(64*9+1)*16], r7
mov [rsp+gprsize*3+(64*9+1)*16], r8
%endif
%endif
%undef cmp
; remove entirely-zero iterations
mov r5d, 14
cmp eobw, word [o2(tbl_32x32_2d)+r5]
jge .end_zero_loop
pxor m0, m0
.zero_loop:
movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] ; each word packs two
movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] ; byte-sized row offsets
movzx t0d, t1b
movzx t2d, t3b
shr t1d, 8
shr t3d, 8
lea t4, [rsp+7*64*16]
.zero_loop_inner:
mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
sub t4, 64*16
cmp t4, rsp
jge .zero_loop_inner
%if ARCH_X86_32
mov r6, [rsp+gprsize*4+(64*9+8)*16]
%endif
sub r5d, 2
cmp eobw, word [o2(tbl_32x32_2d)+r5]
jl .zero_loop
.end_zero_loop:
mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd
%if ARCH_X86_32
mov cq, [rsp+gprsize*3+(64*9+8)*16]
%endif
; actual first pass after skipping all-zero data
.loop_pass1:
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
mova m13, [o(clip_18b_max)]
mova m14, [o(pd_2896)]
%endif
mov r3, rsp
lea r4, [o(idct64_mul_16bpc)]
mova m0, [cq+128* 1+r5*8]
mova m1, [cq+128*31+r5*8]
mova m2, [cq+128*17+r5*8]
mova m3, [cq+128*15+r5*8]
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
mova m0, [cq+128* 7+r5*8]
mova m1, [cq+128*25+r5*8]
mova m2, [cq+128*23+r5*8]
mova m3, [cq+128* 9+r5*8]
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
mova m0, [cq+128* 5+r5*8]
mova m1, [cq+128*27+r5*8]
mova m2, [cq+128*21+r5*8]
mova m3, [cq+128*11+r5*8]
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
mova m0, [cq+128* 3+r5*8]
mova m1, [cq+128*29+r5*8]
mova m2, [cq+128*19+r5*8]
mova m3, [cq+128*13+r5*8]
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
mova m0, [cq+128* 2+r5*8]
mova m1, [cq+128*14+r5*8]
mova m2, [cq+128*18+r5*8]
mova m3, [cq+128*30+r5*8]
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
mova m0, [cq+128* 6+r5*8]
mova m1, [cq+128*10+r5*8]
mova m2, [cq+128*22+r5*8]
mova m3, [cq+128*26+r5*8]
call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
add r3, 16*(24+4*ARCH_X86_32)
mova m0, [cq+128* 4+r5*8]
mova m1, [cq+128*12+r5*8]
mova m2, [cq+128*20+r5*8]
mova m3, [cq+128*28+r5*8]
call m(idct_16x4_internal_16bpc).main_oddhalf_fast
mova m0, [cq+128* 0+r5*8]
mova m1, [cq+128* 8+r5*8]
mova m2, [cq+128*16+r5*8]
mova m3, [cq+128*24+r5*8]
call m(idct_8x4_internal_16bpc).main_pass1_fast
call m(idct_8x4_internal_16bpc).round
mova [r3-(7+4*ARCH_X86_32)*16], m1
mova [r3-(6+4*ARCH_X86_32)*16], m2
mova [r3-(5+4*ARCH_X86_32)*16], m3
mova [r3-(4+4*ARCH_X86_32)*16], m4
mova [r3-(3+4*ARCH_X86_32)*16], m5
mova [r3-(2+4*ARCH_X86_32)*16], m6
mova [r3-(1+4*ARCH_X86_32)*16], m7
sub r3, 16*(40+4*ARCH_X86_32-4)
%if ARCH_X86_64
psrld m15, m11, 10 ; pd_2
%else
mova m7, [o(pd_2)]
%endif
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
lea r3, [rsp+56*16]
movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
movzx t0d, t1b
movzx t2d, t3b
shr t1d, 8
shr t3d, 8
lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
call .shift_transpose
; zero cq
pxor m7, m7
%if ARCH_X86_32
mov cq, [rsp+gprsize*3+(64*9+8)*16]
%endif
lea r4, [cq+30*128+r5*8]
.zero_cq_loop:
REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
sub r4, 4*128
cmp r4, cq
jg .zero_cq_loop
%if ARCH_X86_32
mov r6, [rsp+gprsize*4+(64*9+8)*16]
%endif
sub r5d, 2
jge .loop_pass1
; pass=2 code starts here
mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
%if ARCH_X86_32
mov strideq, [rsp+gprsize*2+(9*64+8)*16]
%else
mov r0, [rsp+gprsize*0+64*16]
%endif
add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
cmp eobd, 151
jl .fast
; fall-through
%if ARCH_X86_64
DECLARE_REG_TMP 8, 9
%else
DECLARE_REG_TMP 1, 5
%endif
lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
jmp .run
.fast:
lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
.run:
%if ARCH_X86_64
lea r2, [dstq+128]
mov r7, -16
%else
lea r2, [rsp+(64*8+3)*16]
mov [r2+4*gprsize], t0
mov [r2+5*gprsize], t1
mov r1, [r2+2*gprsize]
mov dword [r2+0*gprsize], 8
%endif
jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
; copy of pass=1 tmp-regs
%if ARCH_X86_32
DECLARE_REG_TMP 4, 1, 2, 0, 6
%else
DECLARE_REG_TMP 8, 9, 4, 7, 0
%endif
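; As in 64x16, but the four transposed 8x4 blocks are scattered through
; the tbl_Nx64_offset byte offsets unpacked into t0..t3.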
.shift_transpose:
mova m0, [r3+0*16]
mova m1, [r3+1*16]
mova m2, [r3+2*16]
mova m3, [r3+3*16]
mova m4, [r3+4*16]
mova m5, [r3+5*16]
mova m6, [r3+6*16]
mova m7, [r3+7*16]
REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
call m(idct_8x4_internal_16bpc).transpose4x8packed
mova [t4+t0*8], m0
mova [t4+t1*8], m1
mova [t4+t2*8], m2
mova [t4+t3*8], m3
sub t4, 16*64
sub r3, 8*16
cmp r3, rsp
jg .shift_transpose
ret
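; DC-only path: r3d = 64 rows; the rsp adjustment below presumably
; leaves exactly the frame the shared 64x16 .dconly1 code tears down
; on RET.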
.dconly:
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
(4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1