| ; Copyright © 2020-2023, VideoLAN and dav1d authors |
| ; Copyright © 2020-2023, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| %if ARCH_X86_64 |
| |
| SECTION_RODATA 64 |
| const \ |
| dup16_perm, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 |
| db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 |
| db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 |
| db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 |
| const \ |
| int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 |
| db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 |
| db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 |
| db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 |
| int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 |
| db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 |
| db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 |
| db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 |
| int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 |
| db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 |
| db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 |
| db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 |
| idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23 |
| db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55 |
| db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31 |
| db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63 |
| idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9 |
| db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17 |
| db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51 |
| db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35 |
| idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35 |
| db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21 |
| db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51 |
| db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37 |
| end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60 |
| db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61 |
| db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63 |
| db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62 |
| |
| ; packed 4-bit qword shuffle indices |
| permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262 |
| dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373 |
| dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb |
| dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea |
| permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604 |
| dq 0xc824352d56128751, 0xd906171e74301e15 |
| dq 0x6271604b03472d62, 0x735342782165b426 |
| dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37 |
| permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486 |
| dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597 |
| dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e |
| dq 0x5115049dd9045b79, 0x733726bffb263d1f |
| permD: dq 0x0cda098800041504, 0x0edb09b2028c3726 |
| dq 0x0f11fa9c01150415, 0x0988f326039d2637 |
| dq 0x05640f1108269d8c, 0x05290edb0aaebfae |
| dq 0x0005000509378c9d, 0xffffffff0bbfaebf |
| |
| pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 |
| gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 |
| gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 |
| gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16 |
| |
| int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 |
| int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 |
| int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 |
| int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7 |
| deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 |
| int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0 |
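; int_mshift does double duty: its nonzero bytes (12, 20, 44, 52) are the
; bit offsets fed to vpmultishiftqb in order to extract the two >>12-shifted
; result words of each qword, and comparing it bytewise (unsigned >) against
; pd_2048 produces the repeating 0x33 opmask (k7) used to merge them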
| |
| pb_32: times 4 db 32 |
| pw_2048: times 2 dw 2048 |
| pw_4096: times 2 dw 4096 |
| pw_8192: times 2 dw 8192 |
| pw_16384: times 2 dw 16384 |
| pw_1697x16: times 2 dw 1697*16 |
| pw_1697x8: times 2 dw 1697*8 |
| pw_2896x8: times 2 dw 2896*8 |
| pd_2048: dd 2048 |
| |
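; the constants below alias into the permD table above; the selected dwords
; of those shuffle indices double as coefficient pairs, saving rodata space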
| %define pw_5 (permD+52) |
| %define pd_m1 (permD+60) |
| %define pw_3803_1321 (permD+44) |
| %define pw_2482_3803 (permD+12) |
| %define pw_2440_3290 (permD+ 4) |
| %define pw_m3290_2440 (permD+28) |
| %define pw_3857_1380 (permD+36) |
| %define pw_m1380_3857 (permD+20) |
| |
| pw_8192_m8192: dw 8192, -8192 |
| pw_m8192_8192: dw -8192, 8192 |
| pw_16384_m16384: dw 16384, -16384 |
| pw_m16384_16384: dw -16384, 16384 |
| |
| pw_m1321_2482: dw -1321, 2482 |
| pw_m3344_3344: dw -3344, 3344 |
| pw_2482_3344: dw 2482, 3344 |
| pw_m3803_3344: dw -3803, 3344 |
| pd_3344: dd 3344 |
| pw_m1321_m3344: dw -1321, -3344 |
| pw_2896_m2896: dw 2896, -2896 |
| |
| pw_1567_m3784: dw 1567, -3784 |
| pw_3784_m1567: dw 3784, -1567 |
| pw_4017_m799: dw 4017, -799 |
| pw_2276_m3406: dw 2276, -3406 |
| pw_m799_m4017: dw -799, -4017 |
| pw_m3406_m2276: dw -3406, -2276 |
| |
| %macro COEF_PAIR 2-3 0 |
| pw_%1_%2: dw %1, %2 |
| pw_m%2_%1: dw -%2, %1 |
| %if %3 |
| pw_m%1_m%2: dw -%1, -%2 |
| %endif |
| %endmacro |
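; e.g. COEF_PAIR 1567, 3784, 1 expands to:
;   pw_1567_3784:   dw  1567,  3784
;   pw_m3784_1567:  dw -3784,  1567
;   pw_m1567_m3784: dw -1567, -3784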
| |
| COEF_PAIR 2896, 2896 |
| COEF_PAIR 1567, 3784, 1 |
| COEF_PAIR 3784, 1567 |
| COEF_PAIR 201, 4091 |
| COEF_PAIR 995, 3973 |
| COEF_PAIR 1751, 3703 |
| COEF_PAIR 3035, 2751 |
| COEF_PAIR 3513, 2106 |
| COEF_PAIR 4052, 601 |
| COEF_PAIR 3166, 2598, 1 |
| COEF_PAIR 3920, 1189, 1 |
| COEF_PAIR 2276, 3406 |
| COEF_PAIR 4017, 799 |
| |
| %macro COEF_X8 1-* |
| %rep %0 |
| dw %1*8, %1*8 |
| %rotate 1 |
| %endrep |
| %endmacro |
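; coefficients are pre-multiplied by 8 so that pmulhrsw, which computes
; (x*y + 0x4000) >> 15, directly yields the desired (x*coef + 2048) >> 12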
| |
| pw_m2276x8: COEF_X8 -2276 |
| pw_3406x8: COEF_X8 3406 |
| pw_4017x8: COEF_X8 4017 |
| pw_799x8: COEF_X8 799 |
| pw_3784x8: COEF_X8 3784 |
| pw_1567x8: COEF_X8 1567 |
| |
| pw_4076x8: COEF_X8 4076 |
| pw_401x8: COEF_X8 401 |
| pw_m2598x8: COEF_X8 -2598 |
| pw_3166x8: COEF_X8 3166 |
| pw_3612x8: COEF_X8 3612 |
| pw_1931x8: COEF_X8 1931 |
| pw_m1189x8: COEF_X8 -1189 |
| pw_3920x8: COEF_X8 3920 |
| |
| pw_4091x8: COEF_X8 4091 |
| pw_201x8: COEF_X8 201 |
| pw_m2751x8: COEF_X8 -2751 |
| pw_3035x8: COEF_X8 3035 |
| pw_3703x8: COEF_X8 3703 |
| pw_1751x8: COEF_X8 1751 |
| pw_m1380x8: COEF_X8 -1380 |
| pw_3857x8: COEF_X8 3857 |
| pw_3973x8: COEF_X8 3973 |
| pw_995x8: COEF_X8 995 |
| pw_m2106x8: COEF_X8 -2106 |
| pw_3513x8: COEF_X8 3513 |
| pw_3290x8: COEF_X8 3290 |
| pw_2440x8: COEF_X8 2440 |
| pw_m601x8: COEF_X8 -601 |
| pw_4052x8: COEF_X8 4052 |
| |
| pw_401_4076x8: dw 401*8, 4076*8 |
| pw_m2598_3166x8: dw -2598*8, 3166*8 |
| pw_1931_3612x8: dw 1931*8, 3612*8 |
| pw_m1189_3920x8: dw -1189*8, 3920*8 |
| pw_799_4017x8: dw 799*8, 4017*8 |
| pw_m2276_3406x8: dw -2276*8, 3406*8 |
| |
| pw_201_4091x8: dw 201*8, 4091*8 |
| pw_m601_4052x8: dw -601*8, 4052*8 |
| pw_995_3973x8: dw 995*8, 3973*8 |
| pw_m1380_3857x8: dw -1380*8, 3857*8 |
| pw_1751_3703x8: dw 1751*8, 3703*8 |
| pw_m2106_3513x8: dw -2106*8, 3513*8 |
| pw_2440_3290x8: dw 2440*8, 3290*8 |
| pw_m2751_3035x8: dw -2751*8, 3035*8 |
| |
| pw_101_4095x8: dw 101*8, 4095*8 |
| pw_m2824_2967x8: dw -2824*8, 2967*8 |
| pw_1660_3745x8: dw 1660*8, 3745*8 |
| pw_m1474_3822x8: dw -1474*8, 3822*8 |
| pw_897_3996x8: dw 897*8, 3996*8 |
| pw_m2191_3461x8: dw -2191*8, 3461*8 |
| pw_2359_3349x8: dw 2359*8, 3349*8 |
| pw_m700_4036x8: dw -700*8, 4036*8 |
| pw_501_4065x8: dw 501*8, 4065*8 |
| pw_m2520_3229x8: dw -2520*8, 3229*8 |
| pw_2019_3564x8: dw 2019*8, 3564*8 |
| pw_m1092_3948x8: dw -1092*8, 3948*8 |
| pw_1285_3889x8: dw 1285*8, 3889*8 |
| pw_m1842_3659x8: dw -1842*8, 3659*8 |
| pw_2675_3102x8: dw 2675*8, 3102*8 |
| pw_m301_4085x8: dw -301*8, 4085*8 |
| |
| idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474 |
| COEF_PAIR 401, 4076, 1 |
| COEF_PAIR 799, 4017 |
| COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996 |
| dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017 |
| COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092 |
| COEF_PAIR 1931, 3612, 1 |
| COEF_PAIR 3406, 2276 |
| COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889 |
| dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276 |
| |
| SECTION .text |
| |
| %define o_base int8_permA+64*18 |
| %define o(x) (r5 - (o_base) + (x)) |
| %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) |
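; constants are addressed relative to r5, which INV_TXFM_FN points at o_base;
; keeping the base near the middle of the data block presumably lets most
; loads use compact disp8*N EVEX displacements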
| |
| ; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack, |
| ; 16 = special_mul1, 32 = special_mul2 |
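; base operation, with m%1 containing packed words [x, y] in each dword:
;   low half of each lane  = (x*coef1 + y*coef2 + rnd) >> 12
;   high half of each lane = (y*coef1 - x*coef2 + rnd) >> 12
; flag 1 swaps the two halves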
| %macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags |
| mova m%2, m%4 |
| %if %7 & 16 |
| vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd} |
| mova m%3, m%4 |
| %if %7 & 32 |
| vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} |
| %else |
| vpdpwssd m%3, m%1, m%6 |
| %endif |
| %elif %7 & 32 |
| vpdpwssd m%2, m%1, m%5 |
| mova m%3, m%4 |
| vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} |
| %elif %6 < 32 |
| vpdpwssd m%2, m%1, m%5 |
| mova m%3, m%4 |
| vpdpwssd m%3, m%1, m%6 |
| %elif %7 & 1 |
| vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd} |
| mova m%3, m%4 |
| vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd} |
| %else |
| vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd} |
| mova m%3, m%4 |
| vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd} |
| %endif |
| %if %7 & 2 |
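    ; shift both dot products right by 12 and interleave the results
    ; wordwise using a dword funnel shift instead of pack+shuffle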
| psrld m%2, 12 |
| pslld m%3, 4 |
| vpshrdd m%1, m%3, m%2, 16 |
| %elif %7 & 4 |
| ; compared to using shifts (as above) this has better throughput, |
| ; but worse latency and requires setting up the opmask/index |
| ; registers, so only use this method for the larger transforms |
| pslld m%1, m%2, 4 |
| vpmultishiftqb m%1{k7}, m13, m%3 |
| %else |
| psrad m%2, 12 |
| psrad m%3, 12 |
| %if %7 & 8 == 0 |
| packssdw m%1, m%3, m%2 |
| %endif |
| %endif |
| %endmacro |
| |
| ; flags: same as ITX_MUL2X_PACK |
| %macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags |
| %if %11 & 1 |
| vpbroadcastd m%4, [o(pw_%9_%10)] |
| vpbroadcastd m%4{k1}, [o(pw_%7_%8)] |
| vpbroadcastd m%5, [o(pw_m%10_%9)] |
| vpbroadcastd m%5{k1}, [o(pw_m%8_%7)] |
| %else |
| vpbroadcastd m%4, [o(pw_m%10_%9)] |
| vpbroadcastd m%4{k1}, [o(pw_m%8_%7)] |
| vpbroadcastd m%5, [o(pw_%9_%10)] |
| vpbroadcastd m%5{k1}, [o(pw_%7_%8)] |
| %endif |
| ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11 |
| %endmacro |
| |
| ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 |
| ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 |
| %macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 |
| punpcklwd m%3, m%2, m%1 |
| punpckhwd m%2, m%1 |
| %if %7 < 32 |
| mova m%1, m%5 |
| vpdpwssd m%1, m%3, m%7 |
| mova m%4, m%5 |
| vpdpwssd m%4, m%2, m%7 |
| %else |
| mova m%1, m%5 |
| vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd} |
| mova m%4, m%5 |
| vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd} |
| %endif |
| psrad m%1, 12 |
| psrad m%4, 12 |
| packssdw m%1, m%4 |
| mova m%4, m%5 |
| %if %7 < 32 |
| vpdpwssd m%4, m%2, m%6 |
| mova m%2, m%5 |
| vpdpwssd m%2, m%3, m%6 |
| %else |
| vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd} |
| mova m%2, m%5 |
| vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd} |
| %endif |
| psrad m%4, 12 |
| psrad m%2, 12 |
| %if %0 == 8 |
| packssdw m%8, m%2, m%4 |
| %else |
| packssdw m%2, m%4 |
| %endif |
| %endmacro |
| |
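; temporarily evaluate the supplied code with the m# registers redefined
; at xmm/ymm width, then restore the zmm register definitions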
| %macro WRAP_XMM 1+ |
| %xdefine %%reset RESET_MM_PERMUTATION |
| INIT_XMM cpuname |
| DEFINE_MMREGS xmm |
| AVX512_MM_PERMUTATION |
| %1 |
| %%reset |
| %endmacro |
| |
| %macro WRAP_YMM 1+ |
| INIT_YMM cpuname |
| %1 |
| INIT_ZMM cpuname |
| %endmacro |
| |
| %macro ITX4_END 4-5 2048 ; row[1-4], rnd |
| %if %5 |
| vpbroadcastd m2, [o(pw_%5)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| %endif |
| lea r2, [dstq+strideq*2] |
| %assign %%i 1 |
| %rep 4 |
| %if %1 & 2 |
| CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) |
| %else |
| CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) |
| %endif |
| %assign %%i %%i + 1 |
| %rotate 1 |
| %endrep |
| movd m2, [%%row_adr1] |
| pinsrd m2, [%%row_adr2], 1 |
| movd m3, [%%row_adr3] |
| pinsrd m3, [%%row_adr4], 1 |
| pmovzxbw m2, m2 |
| pmovzxbw m3, m3 |
| paddw m0, m2 |
| paddw m1, m3 |
| packuswb m0, m1 |
| movd [%%row_adr1], m0 |
| pextrd [%%row_adr2], m0, 1 |
| pextrd [%%row_adr3], m0, 2 |
| pextrd [%%row_adr4], m0, 3 |
| ret |
| %endmacro |
| |
| %macro INV_TXFM_FN 3 ; type1, type2, size |
| cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base |
| %define %%p1 m(i%1_%3_internal_8bpc) |
| lea baseq, [o_base] |
| ; Jump to the 1st txfm function if we're not taking the fast path, which |
| ; in turn performs an indirect jump to the 2nd txfm function. |
| lea tx2q, [m(i%2_%3_internal_8bpc).pass2] |
| %ifidn %1_%2, dct_dct |
| test eobd, eobd |
| jnz %%p1 |
| %else |
| ; jump to the 1st txfm function unless it's located directly after this |
| times ((%%end - %%p1) >> 31) & 1 jmp %%p1 |
| ALIGN function_align |
| %%end: |
| %endif |
| %endmacro |
| |
| %macro INV_TXFM_4X4_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 4x4 |
| %ifidn %1_%2, dct_dct |
| vpbroadcastw m0, [cq] |
| vpbroadcastd m1, [o(pw_2896x8)] |
| pmulhrsw m0, m1 |
| mov [cq], eobd |
| pmulhrsw m0, m1 |
| mova m1, m0 |
| jmp m(iadst_4x4_internal_8bpc).end2 |
| %endif |
| %endmacro |
| |
| %macro IDCT4_1D_PACKED 0 |
| vpbroadcastd m4, [o(pd_2048)] |
| punpckhwd m2, m1, m0 |
| punpcklwd m1, m0 |
| ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 |
| ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 |
| paddsw m0, m1, m2 ; out0 out1 |
| psubsw m1, m2 ; out3 out2 |
| %endmacro |
| |
| %macro IADST4_1D_PACKED 0 |
| punpcklwd m4, m1, m0 ; in2 in0 |
| punpckhwd m5, m1, m0 ; in3 in1 |
| .main2: |
| vpbroadcastd m3, [o(pd_2048)] |
| mova m0, m3 |
| vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd} |
| mova m2, m3 |
| vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd} |
| mova m1, m3 |
| vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd} |
| vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd} |
| vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd} |
| vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd} |
| vpdpwssd m1, m5, [o(pd_3344)] {bcstd} |
| vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd} |
| REPX {psrad x, 12}, m0, m2, m1, m3 |
| packssdw m0, m2 ; out0 out1 |
| packssdw m1, m3 ; out2 out3 |
| %endmacro |
| |
| INIT_XMM avx512icl |
| INV_TXFM_4X4_FN dct, dct |
| INV_TXFM_4X4_FN dct, adst |
| INV_TXFM_4X4_FN dct, flipadst |
| INV_TXFM_4X4_FN dct, identity |
| |
| cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| IDCT4_1D_PACKED |
| mova m2, [o(deint_shuf)] |
| shufps m3, m0, m1, q1331 |
| shufps m0, m0, m1, q0220 |
| pshufb m0, m2 |
| pshufb m1, m3, m2 |
| jmp tx2q |
| .pass2: |
| IDCT4_1D_PACKED |
pxor ymm16, ymm16 ; clear the 32-byte coefficient buffer; a raw register >= 16
mova [cq], ymm16 ; is used since EVEX-only regs don't require vzeroupper
| ITX4_END 0, 1, 3, 2 |
| |
| INV_TXFM_4X4_FN adst, dct |
| INV_TXFM_4X4_FN adst, adst |
| INV_TXFM_4X4_FN adst, flipadst |
| INV_TXFM_4X4_FN adst, identity |
| |
| cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| call .main |
| punpckhwd m3, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| jmp tx2q |
| .pass2: |
| call .main |
| .end: |
| pxor ymm16, ymm16 |
| mova [cq], ymm16 |
| .end2: |
| ITX4_END 0, 1, 2, 3 |
| ALIGN function_align |
| .main: |
| IADST4_1D_PACKED |
| ret |
| |
| INV_TXFM_4X4_FN flipadst, dct |
| INV_TXFM_4X4_FN flipadst, adst |
| INV_TXFM_4X4_FN flipadst, flipadst |
| INV_TXFM_4X4_FN flipadst, identity |
| |
| cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| call m(iadst_4x4_internal_8bpc).main |
| punpcklwd m2, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| jmp tx2q |
| .pass2: |
| call m(iadst_4x4_internal_8bpc).main |
| .end: |
| pxor ymm16, ymm16 |
| mova [cq], ymm16 |
| .end2: |
| ITX4_END 3, 2, 1, 0 |
| |
| INV_TXFM_4X4_FN identity, dct |
| INV_TXFM_4X4_FN identity, adst |
| INV_TXFM_4X4_FN identity, flipadst |
| INV_TXFM_4X4_FN identity, identity |
| |
| cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m0, [cq+16*0] |
| mova m1, [cq+16*1] |
| vpbroadcastd m3, [o(pw_1697x8)] |
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| paddsw m0, m2 |
| paddsw m1, m3 |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m3, [o(pw_1697x8)] |
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| paddsw m0, m2 |
| paddsw m1, m3 |
| jmp m(iadst_4x4_internal_8bpc).end |
| |
| %macro INV_TXFM_4X8_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 4x8 |
| %ifidn %1_%2, dct_dct |
| movd xmm1, [o(pw_2896x8)] |
| pmulhrsw xmm0, xmm1, [cq] |
| movd xmm2, [o(pw_2048)] |
| pmulhrsw xmm0, xmm1 |
| pmulhrsw xmm0, xmm1 |
| pmulhrsw xmm0, xmm2 |
| vpbroadcastw ym0, xmm0 |
| mova ym1, ym0 |
| jmp m(iadst_4x8_internal_8bpc).end3 |
| %endif |
| %endmacro |
| |
| %macro IDCT8_1D_PACKED 0 |
| punpckhwd m5, m3, m0 ; in7 in1 |
| punpckhwd m4, m1, m2 ; in3 in5 |
| punpcklwd m3, m1 ; in6 in2 |
| punpcklwd m2, m0 ; in4 in0 |
| .main2: |
| vpbroadcastd m6, [o(pd_2048)] |
| ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a |
| ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a |
| ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 |
| psubsw m0, m5, m4 ; t5a t6a (interleaved) |
| paddsw m4, m5 ; t4 t7 (interleaved) |
| ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 |
| ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5 |
| %if mmsize > 16 |
| vbroadcasti32x4 m1, [o(deint_shuf)] |
| pshufb m4, m1 |
| %else |
| pshufb m4, [o(deint_shuf)] |
| %endif |
| psubsw m1, m2, m3 ; tmp3 tmp2 |
| paddsw m3, m2 ; tmp0 tmp1 |
| punpckhqdq m2, m4, m0 ; t7 t6 |
| punpcklqdq m4, m0 ; t4 t5 |
| paddsw m0, m3, m2 ; out0 out1 |
| psubsw m3, m2 ; out7 out6 |
| psubsw m2, m1, m4 ; out4 out5 |
| paddsw m1, m4 ; out3 out2 |
| %endmacro |
| |
| %macro IADST8_1D_PACKED 1 ; pass |
| vpbroadcastd m6, [o(pd_2048)] |
| %if %1 == 1 |
| ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a |
| ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a |
| ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a |
| ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a |
| psubsw m4, m0, m2 ; t5 t4 |
| paddsw m0, m2 ; t1 t0 |
| psubsw m5, m1, m3 ; t6 t7 |
| paddsw m1, m3 ; t2 t3 |
| ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a |
| ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a |
| %if mmsize > 16 |
| vbroadcasti32x4 m2, [o(deint_shuf)] |
| %else |
| mova m2, [o(deint_shuf)] |
| %endif |
| vprord m1, 16 |
| psubsw m3, m0, m1 ; t3 t2 |
| paddsw m0, m1 ; -out7 out0 |
| psubsw m1, m4, m5 ; t7 t6 |
| paddsw m4, m5 ; out6 -out1 |
| pshufb m0, m2 |
| pshufb m4, m2 |
| mova m2, m6 |
| vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd} |
| mova m5, m6 |
| vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd} |
| psrad m2, 12 |
| psrad m5, 12 |
| packssdw m2, m5 ; out4 -out5 |
| mova m5, m6 |
| vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd} |
| mova m3, m6 |
| vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd} |
| psrad m5, 12 |
| psrad m3, 12 |
| packssdw m1, m3, m5 ; out2 -out3 |
| %else |
| punpckhwd m0, m4, m3 ; 0 7 |
| punpckhwd m1, m5, m2 ; 2 5 |
| punpcklwd m2, m5 ; 4 3 |
| punpcklwd m3, m4 ; 6 1 |
| ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a |
| ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a |
| ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a |
| ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a |
| psubsw m4, m0, m2 ; t4 t5 |
| paddsw m0, m2 ; t0 t1 |
| psubsw m5, m1, m3 ; t6 t7 |
| paddsw m1, m3 ; t2 t3 |
| shufps m2, m5, m4, q1032 |
| punpckhwd m4, m2 |
| punpcklwd m5, m2 |
| ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a |
| ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a |
| psubsw m2, m0, m1 ; t2 t3 |
| paddsw m0, m1 ; out0 -out7 |
| psubsw m1, m4, m5 ; t6 t7 |
| paddsw m4, m5 ; -out1 out6 |
| vpbroadcastd m5, [o(pw_2896x8)] |
| punpckhqdq m3, m2, m1 ; t3 t7 |
| punpcklqdq m2, m1 ; t2 t6 |
| paddsw m1, m2, m3 ; t2+t3 t6+t7 |
| psubsw m2, m3 ; t2-t3 t6-t7 |
| punpckhqdq m3, m4, m0 ; out6 -out7 |
| punpcklqdq m0, m4 ; out0 -out1 |
| pmulhrsw m2, m5 ; out4 -out5 |
| pshufd m1, m1, q1032 |
| pmulhrsw m1, m5 ; out2 -out3 |
| %endif |
| %endmacro |
| |
| INIT_YMM avx512icl |
| INV_TXFM_4X8_FN dct, dct |
| INV_TXFM_4X8_FN dct, identity |
| INV_TXFM_4X8_FN dct, adst |
| INV_TXFM_4X8_FN dct, flipadst |
| |
| cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q3120 |
| vpermq m1, [cq+32*1], q3120 |
| vpbroadcastd m2, [o(pw_2896x8)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| IDCT4_1D_PACKED |
| vbroadcasti32x4 m2, [o(deint_shuf)] |
| shufps m3, m0, m1, q1331 |
| shufps m0, m0, m1, q0220 |
| pshufb m0, m2 |
| pshufb m1, m3, m2 |
| jmp tx2q |
| .pass2: |
| vextracti32x4 xm2, m0, 1 |
| vextracti32x4 xm3, m1, 1 |
| call .main |
| vpbroadcastd m4, [o(pw_2048)] |
| vinserti32x4 m0, m0, xm2, 1 |
| vinserti32x4 m1, m1, xm3, 1 |
| pshufd m1, m1, q1032 |
| jmp m(iadst_4x8_internal_8bpc).end2 |
| ALIGN function_align |
| .main: |
| WRAP_XMM IDCT8_1D_PACKED |
| ret |
| |
| INV_TXFM_4X8_FN adst, dct |
| INV_TXFM_4X8_FN adst, adst |
| INV_TXFM_4X8_FN adst, flipadst |
| INV_TXFM_4X8_FN adst, identity |
| |
| cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q3120 |
| vpermq m1, [cq+32*1], q3120 |
| vpbroadcastd m2, [o(pw_2896x8)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| call m(iadst_8x4_internal_8bpc).main |
| punpckhwd m3, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| jmp tx2q |
| .pass2: |
| vextracti32x4 xm2, m0, 1 |
| vextracti32x4 xm3, m1, 1 |
| pshufd xm4, xm0, q1032 |
| pshufd xm5, xm1, q1032 |
| call .main_pass2 |
| vpbroadcastd m4, [o(pw_2048)] |
| vinserti32x4 m0, xm2, 1 |
| vinserti32x4 m1, xm3, 1 |
| pxor m5, m5 |
| psubw m5, m4 |
| .end: |
| punpcklqdq m4, m5 |
| .end2: |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| .end3: |
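; add the residual to all eight 4-pixel rows of dst at once: m5 holds the
; byte offset of each row (stride*[0..7]) for a dword gather/scatter pair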
| vpbroadcastd m3, strided |
| pmulld m5, m3, [o(pd_0to15)] |
| kxnorb k1, k1, k1 |
| kmovb k2, k1 |
| vpgatherdd m3{k1}, [dstq+m5] |
pxor m4, m4
mova [cq], zmm20 ; m4 == ymm20 here due to x86inc's avx512 register
                 ; permutation, so the pxor above zeroed all of zmm20
| punpcklbw m2, m3, m4 |
| punpckhbw m3, m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| packuswb m0, m1 |
| vpscatterdd [dstq+m5]{k2}, m0 |
| RET |
| ALIGN function_align |
| .main_pass1: |
| punpckhwd xm0, xm4, xm3 ; 0 7 |
| punpckhwd xm1, xm5, xm2 ; 2 5 |
| punpcklwd xm2, xm5 ; 4 3 |
| punpcklwd xm3, xm4 ; 6 1 |
| WRAP_XMM IADST8_1D_PACKED 1 |
| punpcklqdq xm3, xm4, xm0 ; out6 -out7 |
| punpckhqdq xm0, xm4 ; out0 -out1 |
| ret |
| ALIGN function_align |
| .main_pass2: |
| WRAP_XMM IADST8_1D_PACKED 2 |
| ret |
| |
| INV_TXFM_4X8_FN flipadst, dct |
| INV_TXFM_4X8_FN flipadst, adst |
| INV_TXFM_4X8_FN flipadst, flipadst |
| INV_TXFM_4X8_FN flipadst, identity |
| |
| cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q3120 |
| vpermq m1, [cq+32*1], q3120 |
| vpbroadcastd m2, [o(pw_2896x8)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| call m(iadst_8x4_internal_8bpc).main |
| punpcklwd m3, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m1, m3 |
| punpckhwd m1, m3 |
| jmp tx2q |
| .pass2: |
| vextracti32x4 xm2, m0, 1 |
| vextracti32x4 xm3, m1, 1 |
| pshufd xm4, xm0, q1032 |
| pshufd xm5, xm1, q1032 |
| call m(iadst_4x8_internal_8bpc).main_pass2 |
| vpbroadcastd m5, [o(pw_2048)] |
| vinserti32x4 m3, xm1, 1 |
| vinserti32x4 m2, xm0, 1 |
| pxor m4, m4 |
| psubw m4, m5 |
| pshufd m0, m3, q1032 |
| pshufd m1, m2, q1032 |
| jmp m(iadst_4x8_internal_8bpc).end |
| |
| INIT_ZMM avx512icl |
| INV_TXFM_4X8_FN identity, dct |
| INV_TXFM_4X8_FN identity, adst |
| INV_TXFM_4X8_FN identity, flipadst |
| INV_TXFM_4X8_FN identity, identity |
| |
| cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpbroadcastd m0, [o(pw_2896x8)] |
| pmulhrsw m0, [cq] |
| mova m1, [o(int8_permB)] |
| vpbroadcastd m2, [o(pw_1697x8)] |
| vpermb m0, m1, m0 |
| pmulhrsw m2, m0 |
| paddsw m0, m2 |
| vextracti32x8 ym1, m0, 1 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd ym4, [o(pw_4096)] |
| jmp m(iadst_4x8_internal_8bpc).end2 |
| |
| %macro INV_TXFM_4X16_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 4x16 |
| %ifidn %1_%2, dct_dct |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| imul r6d, 181 |
| add r6d, 128+256 |
| sar r6d, 8+1 |
| imul r6d, 181 |
| add r6d, 128+2048 |
| sar r6d, 8+4 |
| vpbroadcastw m0, r6d |
| mova m1, m0 |
| jmp m(iadst_4x16_internal_8bpc).end3 |
| %endif |
| %endmacro |
| |
| %macro IDCT16_1D_PACKED 0 |
| punpckhwd m8, m7, m0 ; dct16 in15 in1 |
| punpcklwd m9, m4, m0 ; dct4 in2 in0 |
| punpckhwd m0, m3, m4 ; dct16 in7 in9 |
| punpcklwd m7, m1 ; dct8 in7 in1 |
| punpckhwd m1, m6 ; dct16 in3 in13 |
| punpcklwd m3, m5 ; dct8 in3 in5 |
| punpckhwd m5, m2 ; dct16 in11 in5 |
| punpcklwd m6, m2 ; dct4 in3 in1 |
| cglobal_label .main2 |
| vpbroadcastd m10, [o(pd_2048)] |
| .main3: |
| vpbroadcastq m13, [o(int_mshift)] |
| vpcmpub k7, m13, m10, 6 ; 0x33... |
| ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a |
| ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a |
| ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a |
| ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a |
| ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a |
| ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a |
| .main4: |
| psubsw m2, m8, m0 ; t9 t14 |
| paddsw m8, m0 ; t8 t15 |
| psubsw m4, m1, m5 ; t10 t13 |
| paddsw m1, m5 ; t11 t12 |
| ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2 |
| psubsw m0, m8, m1 ; t11a t12a |
| paddsw m8, m1 ; t8a t15a |
| psubsw m1, m7, m3 ; t5a t6a |
| paddsw m7, m3 ; t4 t7 |
| .main5: |
| ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a |
| ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a |
| %if mmsize > 16 |
| vbroadcasti32x4 m5, [o(deint_shuf)] |
| %else |
| mova m5, [o(deint_shuf)] |
| %endif |
| vpbroadcastd m11, [o(pw_m2896_2896)] |
| vpbroadcastd m12, [o(pw_2896_2896)] |
| paddsw m3, m2, m4 ; t9 t14 |
| psubsw m2, m4 ; t10 t13 |
| pshufb m8, m5 |
| pshufb m7, m5 |
| pshufb m3, m5 |
| ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1 |
| ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6 |
| ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 |
| ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a |
| punpckhqdq m2, m7, m1 ; t7 t6 |
| punpcklqdq m7, m1 ; t4 t5 |
| psubsw m1, m9, m6 ; dct4 out3 out2 |
| paddsw m9, m6 ; dct4 out0 out1 |
| packssdw m5, m11 ; t12 t13a |
| packssdw m4, m0 ; t11 t10a |
| punpckhqdq m0, m8, m3 ; t15a t14 |
| punpcklqdq m8, m3 ; t8a t9 |
| psubsw m3, m9, m2 ; dct8 out7 out6 |
| paddsw m9, m2 ; dct8 out0 out1 |
| psubsw m2, m1, m7 ; dct8 out4 out5 |
| paddsw m1, m7 ; dct8 out3 out2 |
| psubsw m7, m9, m0 ; out15 out14 |
| paddsw m0, m9 ; out0 out1 |
| psubsw m6, m1, m5 ; out12 out13 |
| paddsw m1, m5 ; out3 out2 |
| psubsw m5, m2, m4 ; out11 out10 |
| paddsw m2, m4 ; out4 out5 |
| psubsw m4, m3, m8 ; out8 out9 |
| paddsw m3, m8 ; out7 out6 |
| %endmacro |
| |
| INV_TXFM_4X16_FN dct, dct |
| INV_TXFM_4X16_FN dct, identity |
| INV_TXFM_4X16_FN dct, adst |
| INV_TXFM_4X16_FN dct, flipadst |
| |
| cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova ym1, [cq+32*2] |
| vinserti32x8 m1, [cq+32*0], 1 |
| mova m0, [o(int16_perm)] |
| mova ym2, [cq+32*3] |
| vinserti32x8 m2, [cq+32*1], 1 |
| vpbroadcastd m4, [o(pd_2048)] |
| vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3 |
| vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3 |
| ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2 |
| ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2 |
| vpbroadcastd m4, [o(pw_16384)] |
| psubsw m3, m1, m2 |
| paddsw m1, m2 ; out0 out1 |
| vprord m3, 16 ; out2 out3 |
| punpckldq m0, m1, m3 |
| punpckhdq m1, m3 |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| jmp tx2q |
| .pass2: |
| vextracti32x4 xm2, ym0, 1 |
| vextracti32x4 xm3, ym1, 1 |
| vextracti32x4 xm4, m0, 2 |
| vextracti32x4 xm5, m1, 2 |
| vextracti32x4 xm6, m0, 3 |
| vextracti32x4 xm7, m1, 3 |
| call .main |
| vinserti32x4 ym0, xm2, 1 |
| vinserti32x4 ym1, xm3, 1 |
| vinserti32x4 ym4, xm6, 1 |
| vinserti32x4 ym5, xm7, 1 |
| vinserti32x8 m0, ym4, 1 |
| vinserti32x8 m1, ym5, 1 |
| vpbroadcastd m5, [o(pw_2048)] |
| pshufd m1, m1, q1032 |
| jmp m(iadst_4x16_internal_8bpc).end2 |
| ALIGN function_align |
| .main: |
| WRAP_XMM IDCT16_1D_PACKED |
| ret |
| |
| INV_TXFM_4X16_FN adst, dct |
| INV_TXFM_4X16_FN adst, adst |
| INV_TXFM_4X16_FN adst, flipadst |
| INV_TXFM_4X16_FN adst, identity |
| |
| cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m1, [o(permB)] |
| vpermq m0, m1, [cq+64*0] |
| vpermq m1, m1, [cq+64*1] |
| call m(iadst_16x4_internal_8bpc).main |
| vpbroadcastd m3, [o(pw_16384)] |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| pmulhrsw m2, m3 |
| pmulhrsw m0, m3 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| jmp tx2q |
| .pass2: |
| call .main |
| vpbroadcastd m5, [o(pw_2048)] |
| psrlq m10, 4 |
| psubw m6, m8, m5 |
| .end: |
| vpbroadcastd m7, [o(pw_2896x8)] |
| paddsw ym1, ym2, ym4 |
| psubsw ym2, ym4 |
| vinserti32x8 m1, ym2, 1 |
| pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10 |
| psrlq m0, m10, 4 |
| vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d |
| vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f |
| punpcklqdq m5, m6 |
| .end2: |
| pmulhrsw m0, m5 |
| pmulhrsw m1, m5 |
| .end3: |
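; same gather/scatter scheme as the 4x8 case, but covering all sixteen
; 4-pixel rows with a single zmm register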
| vpbroadcastd m3, strided |
| pmulld m5, m3, [o(pd_0to15)] |
| kxnorw k1, k1, k1 |
| kmovw k2, k1 |
| vpgatherdd m3{k1}, [dstq+m5] |
| pxor m4, m4 |
| mova [cq+64*0], m4 |
| mova [cq+64*1], m4 |
| punpcklbw m2, m3, m4 |
| punpckhbw m3, m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| packuswb m0, m1 |
| vpscatterdd [dstq+m5]{k2}, m0 |
| RET |
| ALIGN function_align |
| .main: |
| movu m3, [o(permB+1)] |
| psrlq m10, m3, 4 |
| .main2: |
| vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10 |
| vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5 |
| vpbroadcastd m9, [o(pd_2048)] |
| vpbroadcastq ym13, [o(int_mshift)] |
| kxnorb k1, k1, k1 |
| punpckhwd m4, m3, m0 ; in12 in3 in14 in1 |
| punpcklwd m0, m3 ; in0 in15 in2 in13 |
| kshiftrb k1, k1, 4 |
| vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5 |
| vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9 |
| INIT_YMM avx512icl |
| vpcmpub k7, m13, m9, 6 ; 0x33... |
| pxor m8, m8 |
| ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5 |
| ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5 |
| ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5 |
| ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5 |
| psubsw m2, m0, m3 ; t9a t8a t11a t10a |
| paddsw m0, m3 ; t1a t0a t3a t2a |
| psubsw m3, m1, m4 ; t13a t12a t15a t14a |
| paddsw m4, m1 ; t5a t4a t7a t6a |
| ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5 |
| psubw m7, m8, m7 |
| ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4 |
| vpbroadcastd m6, [o(pw_3784_m1567)] |
| vpbroadcastd m6{k1}, [o(pw_m3784_1567)] |
| psubsw m1, m0, m4 ; t5 t4 t7 t6 |
| paddsw m0, m4 ; t1 t0 t3 t2 |
| psubsw m4, m2, m3 ; t13a t12a t15a t14a |
| paddsw m2, m3 ; t9a t8a t11a t10a |
| ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a |
| ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14 |
| vbroadcasti32x4 m5, [o(deint_shuf)] |
| pshufb m0, m5 |
| pshufb m2, m5 |
| vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a |
| vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a |
| vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14 |
| vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13 |
| pshufd m2, m2, q1032 ; t6a t7a t14 t15 |
| psubsw m4, m0, m3 ; t3a t2a t11 t10 |
| paddsw m0, m3 ; -out15 out0 out14 -out1 |
| paddsw m3, m1, m2 ; out12 -out3 -out13 out2 |
| psubsw m1, m2 ; t7 t6 t15a t14a |
| punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a |
| punpcklqdq m4, m1 ; t3a t7 t11 t15a |
| INIT_ZMM avx512icl |
| vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1 |
| ret |
| |
| INV_TXFM_4X16_FN flipadst, dct |
| INV_TXFM_4X16_FN flipadst, adst |
| INV_TXFM_4X16_FN flipadst, flipadst |
| INV_TXFM_4X16_FN flipadst, identity |
| |
| cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m1, [o(permB)] |
| vpermq m0, m1, [cq+64*0] |
| vpermq m1, m1, [cq+64*1] |
| call m(iadst_16x4_internal_8bpc).main |
| vpbroadcastd m3, [o(pw_16384)] |
| punpcklwd m2, m1, m0 |
| punpckhwd m1, m0 |
| pmulhrsw m2, m3 |
| pmulhrsw m1, m3 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| jmp tx2q |
| .pass2: |
| call m(iadst_4x16_internal_8bpc).main |
| vpbroadcastd m6, [o(pw_2048)] |
| psrlq m10, 12 |
| psubw m5, m8, m6 |
| jmp m(iadst_4x16_internal_8bpc).end |
| |
| INV_TXFM_4X16_FN identity, dct |
| INV_TXFM_4X16_FN identity, adst |
| INV_TXFM_4X16_FN identity, flipadst |
| INV_TXFM_4X16_FN identity, identity |
| |
| cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m2, [o(int16_perm)] |
| vpermb m1, m2, [cq+64*0] |
| vpermb m2, m2, [cq+64*1] |
| vpbroadcastd m4, [o(pw_1697x8)] |
| vpbroadcastd m0, [o(pd_m1)] |
| pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is |
| vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal |
| pmulhrsw m4, m2 ; it still works, but if the input is -1 the |
| vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes |
| vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless |
| vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here. |
| punpckldq m0, m1, m2 |
| punpckhdq m1, m2 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m3, [o(pw_1697x16)] |
| vpbroadcastd m5, [o(pw_2048)] |
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| paddsw m0, m0 |
| paddsw m1, m1 |
| paddsw m0, m2 |
| paddsw m1, m3 |
| jmp m(iadst_4x16_internal_8bpc).end2 |
| |
| %macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3] |
| movq xm%3, [dstq ] |
| movhps xm%3, [dstq+%5] |
| movq xm%4, [dstq+%6] |
| movhps xm%4, [dstq+%7] |
| pmovzxbw m%3, xm%3 |
| pmovzxbw m%4, xm%4 |
| %ifnum %1 |
| paddw m%3, m%1 |
| %else |
| paddw m%3, %1 |
| %endif |
| %ifnum %2 |
| paddw m%4, m%2 |
| %else |
| paddw m%4, %2 |
| %endif |
| packuswb m%3, m%4 |
| vextracti32x4 xm%4, m%3, 1 |
| movq [dstq ], xm%3 |
| movhps [dstq+%6], xm%3 |
| movq [dstq+%5], xm%4 |
| movhps [dstq+%7], xm%4 |
| %endmacro |
| |
| %macro INV_TXFM_8X4_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 8x4 |
| %ifidn %1_%2, dct_dct |
| movd xm1, [o(pw_2896x8)] |
| pmulhrsw xm0, xm1, [cq] |
| movd xm2, [o(pw_2048)] |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm1 |
| pmulhrsw xm0, xm2 |
| vpbroadcastw m0, xm0 |
| mova m1, m0 |
| jmp m(iadst_8x4_internal_8bpc).end3 |
| %endif |
| %endmacro |
| |
| INIT_YMM avx512icl |
| INV_TXFM_8X4_FN dct, dct |
| INV_TXFM_8X4_FN dct, adst |
| INV_TXFM_8X4_FN dct, flipadst |
| INV_TXFM_8X4_FN dct, identity |
| |
| cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpbroadcastd xm3, [o(pw_2896x8)] |
| pmulhrsw xm0, xm3, [cq+16*0] |
| pmulhrsw xm1, xm3, [cq+16*1] |
| pmulhrsw xm2, xm3, [cq+16*2] |
| pmulhrsw xm3, [cq+16*3] |
| call m(idct_4x8_internal_8bpc).main |
| vbroadcasti32x4 m4, [o(deint_shuf)] |
| vinserti32x4 m3, m1, xm3, 1 |
| vinserti32x4 m1, m0, xm2, 1 |
| shufps m0, m1, m3, q0220 |
| shufps m1, m3, q1331 |
| pshufb m0, m4 |
| pshufb m1, m4 |
| jmp tx2q |
| .pass2: |
| IDCT4_1D_PACKED |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| jmp m(iadst_8x4_internal_8bpc).end2 |
| |
| INV_TXFM_8X4_FN adst, dct |
| INV_TXFM_8X4_FN adst, adst |
| INV_TXFM_8X4_FN adst, flipadst |
| INV_TXFM_8X4_FN adst, identity |
| |
| cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpbroadcastd xm0, [o(pw_2896x8)] |
| pshufd xm4, [cq+16*0], q1032 |
| pmulhrsw xm3, xm0, [cq+16*3] |
| pshufd xm5, [cq+16*1], q1032 |
| pmulhrsw xm2, xm0, [cq+16*2] |
| pmulhrsw xm4, xm0 |
| pmulhrsw xm5, xm0 |
| call m(iadst_4x8_internal_8bpc).main_pass1 |
| vinserti32x4 m0, xm2, 1 |
| vinserti32x4 m1, xm3, 1 |
| pxor m3, m3 |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| psubsw m3, m2 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| jmp tx2q |
| .pass2: |
| call .main |
| .end: |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q3120 |
| .end2: |
| vpbroadcastd m2, [o(pw_2048)] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| .end3: |
pxor m2, m2
mova [cq], zmm18 ; m2 == ymm18; the pxor zeroed the full zmm register
| lea r6, [strideq*3] |
| WRITE_8X4 0, 1, 4, 5 |
| RET |
| ALIGN function_align |
| .main: |
| IADST4_1D_PACKED |
| ret |
| |
| INV_TXFM_8X4_FN flipadst, dct |
| INV_TXFM_8X4_FN flipadst, adst |
| INV_TXFM_8X4_FN flipadst, flipadst |
| INV_TXFM_8X4_FN flipadst, identity |
| |
| cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpbroadcastd xm0, [o(pw_2896x8)] |
| pshufd xm4, [cq+16*0], q1032 |
| pmulhrsw xm3, xm0, [cq+16*3] |
| pshufd xm5, [cq+16*1], q1032 |
| pmulhrsw xm2, xm0, [cq+16*2] |
| pmulhrsw xm4, xm0 |
| pmulhrsw xm5, xm0 |
| call m(iadst_4x8_internal_8bpc).main_pass1 |
| vinserti32x4 m3, m3, xm1, 1 |
| vinserti32x4 m2, m2, xm0, 1 |
| punpckhwd m1, m3, m2 |
| punpcklwd m3, m2 |
| pxor m0, m0 |
| psubsw m0, m1 |
| punpckhwd m1, m0, m3 |
| punpcklwd m0, m3 |
| jmp tx2q |
| .pass2: |
| call m(iadst_8x4_internal_8bpc).main |
| mova m2, m1 |
| vpermq m1, m0, q2031 |
| vpermq m0, m2, q2031 |
| jmp m(iadst_8x4_internal_8bpc).end2 |
| |
| INV_TXFM_8X4_FN identity, dct |
| INV_TXFM_8X4_FN identity, adst |
| INV_TXFM_8X4_FN identity, flipadst |
| INV_TXFM_8X4_FN identity, identity |
| |
| cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova xm2, [cq+16*0] |
| mova xm0, [cq+16*1] |
| vinserti32x4 m2, [cq+16*2], 1 |
| vinserti32x4 m0, [cq+16*3], 1 |
| vpbroadcastd m3, [o(pw_2896x8)] |
| punpcklwd m1, m2, m0 |
| punpckhwd m2, m0 |
| pmulhrsw m1, m3 |
| pmulhrsw m2, m3 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| paddsw m0, m0 |
| paddsw m1, m1 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m3, [o(pw_1697x8)] |
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| paddsw m0, m2 |
| paddsw m1, m3 |
| jmp m(iadst_8x4_internal_8bpc).end |
| |
| %macro INV_TXFM_8X8_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 8x8 |
| %ifidn %1_%2, dct_dct |
| INIT_ZMM avx512icl |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| .dconly: |
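; dc-only shortcut: 181 == 2896/16, so each imul+sar pair appears to apply
; one (x*2896 + 2048) >> 12 (~1/sqrt(2)) scaling step, with the extra shift
; amounts folding in the intermediate and final rounding shifts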
| imul r6d, 181 |
| add r6d, 128+256 |
| sar r6d, 8+1 |
| .dconly2: |
| vpbroadcastd ym2, strided |
| imul r6d, 181 |
| pmulld ym5, ym2, [o(pd_0to15)] |
| kxnorb k1, k1, k1 |
| add r6d, 128+2048 |
| sar r6d, 8+4 |
| pxor m3, m3 |
| vpbroadcastw m4, r6d |
| .dconly_loop: |
| kmovb k2, k1 |
| vpgatherdq m2{k1}, [dstq+ym5] |
| punpcklbw m0, m2, m3 |
| punpckhbw m1, m2, m3 |
| paddw m0, m4 |
| paddw m1, m4 |
| packuswb m0, m1 |
| kmovb k1, k2 |
| vpscatterdq [dstq+ym5]{k2}, m0 |
| lea dstq, [dstq+strideq*8] |
| sub r3d, 8 |
| jg .dconly_loop |
| RET |
| INIT_YMM avx512icl |
| %endif |
| %endmacro |
| |
| INV_TXFM_8X8_FN dct, dct |
| INV_TXFM_8X8_FN dct, identity |
| INV_TXFM_8X8_FN dct, adst |
| INV_TXFM_8X8_FN dct, flipadst |
| |
| cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpermq m0, [cq+32*0], q3120 ; 0 1 |
| vpermq m3, [cq+32*3], q3120 ; 6 7 |
| vpermq m2, [cq+32*2], q3120 ; 4 5 |
| vpermq m1, [cq+32*1], q3120 ; 2 3 |
| call .main |
| shufps m4, m0, m1, q0220 |
| shufps m5, m0, m1, q1331 |
| shufps m1, m2, m3, q0220 |
| shufps m3, m2, m3, q1331 |
| vbroadcasti32x4 m0, [o(deint_shuf)] |
| vpbroadcastd m2, [o(pw_16384)] |
| REPX {pshufb x, m0}, m4, m5, m1, m3 |
| REPX {pmulhrsw x, m2}, m4, m5, m1, m3 |
| vinserti32x4 m0, m4, xm1, 1 |
| vshufi32x4 m2, m4, m1, 0x03 |
| vinserti32x4 m1, m5, xm3, 1 |
| vshufi32x4 m3, m5, m3, 0x03 |
| jmp tx2q |
| .pass2: |
| call .main |
| vpbroadcastd m4, [o(pw_2048)] |
| vpermq m0, m0, q3120 |
| vpermq m1, m1, q2031 |
| vpermq m2, m2, q3120 |
| vpermq m3, m3, q2031 |
| jmp m(iadst_8x8_internal_8bpc).end2 |
| ALIGN function_align |
| cglobal_label .main |
| IDCT8_1D_PACKED |
| ret |
| |
| INV_TXFM_8X8_FN adst, dct |
| INV_TXFM_8X8_FN adst, adst |
| INV_TXFM_8X8_FN adst, flipadst |
| INV_TXFM_8X8_FN adst, identity |
| |
| cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpermq m4, [cq+32*0], q1302 ; 1 0 |
| vpermq m3, [cq+32*3], q3120 ; 6 7 |
| vpermq m5, [cq+32*1], q1302 ; 3 2 |
| vpermq m2, [cq+32*2], q3120 ; 4 5 |
| call .main_pass1 |
| vpbroadcastd m5, [o(pw_16384_m16384)] |
| punpcklwd m4, m0, m1 |
| punpckhwd m0, m1 |
| punpcklwd m1, m2, m3 |
| punpckhwd m2, m3 |
| punpcklwd m3, m4, m0 |
| punpckhwd m4, m0 |
| punpcklwd m0, m1, m2 |
| punpckhwd m1, m2 |
| REPX {pmulhrsw x, m5}, m3, m4, m0, m1 |
| vshufi32x4 m2, m3, m0, 0x03 |
| vinserti32x4 m0, m3, xm0, 1 |
| vshufi32x4 m3, m4, m1, 0x03 |
| vinserti32x4 m1, m4, xm1, 1 |
| jmp tx2q |
| .pass2: |
| pshufd m4, m0, q1032 |
| pshufd m5, m1, q1032 |
| call .main_pass2 |
| vpbroadcastd m5, [o(pw_2048)] |
| vpbroadcastd xm4, [o(pw_4096)] |
| psubw m4, m5 ; lower half = 2048, upper half = -2048 |
| .end: |
| REPX {vpermq x, x, q3120}, m0, m1, m2, m3 |
| .end2: |
| pmulhrsw m0, m4 |
| pmulhrsw m1, m4 |
| .end3: |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m4 |
| .end4: |
| pxor m4, m4 |
| mova [cq+32*0], m4 |
| mova [cq+32*1], m4 |
| mova [cq+32*2], m4 |
| mova [cq+32*3], m4 |
| lea r6, [strideq*3] |
| WRITE_8X4 0, 1, 4, 5 |
| lea dstq, [dstq+strideq*4] |
| WRITE_8X4 2, 3, 4, 5 |
| RET |
| ALIGN function_align |
| .main_pass1: |
| punpckhwd m0, m4, m3 ; 0 7 |
| punpckhwd m1, m5, m2 ; 2 5 |
| punpcklwd m2, m5 ; 4 3 |
| punpcklwd m3, m4 ; 6 1 |
| IADST8_1D_PACKED 1 |
| punpcklqdq m3, m4, m0 ; out6 -out7 |
| punpckhqdq m0, m4 ; out0 -out1 |
| ret |
| ALIGN function_align |
| cglobal_label .main_pass2 |
| IADST8_1D_PACKED 2 |
| ret |
| |
| INV_TXFM_8X8_FN flipadst, dct |
| INV_TXFM_8X8_FN flipadst, adst |
| INV_TXFM_8X8_FN flipadst, flipadst |
| INV_TXFM_8X8_FN flipadst, identity |
| |
| cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpermq m4, [cq+32*0], q1302 ; 1 0 |
| vpermq m3, [cq+32*3], q3120 ; 6 7 |
| vpermq m5, [cq+32*1], q1302 ; 3 2 |
| vpermq m2, [cq+32*2], q3120 ; 4 5 |
| call m(iadst_8x8_internal_8bpc).main_pass1 |
| vpbroadcastd m5, [o(pw_m16384_16384)] |
| punpckhwd m4, m3, m2 |
| punpcklwd m3, m2 |
| punpckhwd m2, m1, m0 |
| punpcklwd m1, m0 |
| punpckhwd m0, m4, m3 |
| punpcklwd m4, m3 |
| punpckhwd m3, m2, m1 |
| punpcklwd m2, m1 |
| REPX {pmulhrsw x, m5}, m0, m4, m3, m2 |
| vinserti32x4 m1, m0, xm3, 1 |
| vshufi32x4 m3, m0, m3, 0x03 |
| vinserti32x4 m0, m4, xm2, 1 |
| vshufi32x4 m2, m4, m2, 0x03 |
| jmp tx2q |
| .pass2: |
| pshufd m4, m0, q1032 |
| pshufd m5, m1, q1032 |
| call m(iadst_8x8_internal_8bpc).main_pass2 |
| vpbroadcastd m4, [o(pw_2048)] |
| vpbroadcastd xm5, [o(pw_4096)] |
| psubw m4, m5 ; lower half = -2048, upper half = 2048 |
| vpermq m5, m3, q2031 |
| vpermq m3, m0, q2031 |
| vpermq m0, m2, q2031 |
| vpermq m2, m1, q2031 |
| pmulhrsw m1, m0, m4 |
| pmulhrsw m0, m5, m4 |
| jmp m(iadst_8x8_internal_8bpc).end3 |
| |
| INV_TXFM_8X8_FN identity, dct |
| INV_TXFM_8X8_FN identity, adst |
| INV_TXFM_8X8_FN identity, flipadst |
| INV_TXFM_8X8_FN identity, identity |
| |
| cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova xm3, [cq+16*0] |
| mova xm2, [cq+16*1] |
| vinserti32x4 m3, [cq+16*4], 1 |
| vinserti32x4 m2, [cq+16*5], 1 |
| mova xm4, [cq+16*2] |
| mova xm0, [cq+16*3] |
| vinserti32x4 m4, [cq+16*6], 1 |
| vinserti32x4 m0, [cq+16*7], 1 |
| punpcklwd m1, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m2, m4, m0 |
| punpckhwd m4, m0 |
| punpckldq m0, m1, m2 |
| punpckhdq m1, m2 |
| punpckldq m2, m3, m4 |
| punpckhdq m3, m4 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m4, [o(pw_4096)] |
| jmp m(iadst_8x8_internal_8bpc).end |
| |
| %macro INV_TXFM_8X16_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 8x16 |
| %ifidn %1_%2, dct_dct |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 16 |
| imul r6d, 181 |
| add r6d, 128 |
| sar r6d, 8 |
| jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly |
| %endif |
| %endmacro |
| |
| %macro ITX_8X16_LOAD_COEFS 0 |
| vpbroadcastd m4, [o(pw_2896x8)] |
| pmulhrsw m0, m4, [cq+32*0] |
| add cq, 32*4 |
| pmulhrsw m7, m4, [cq+32*3] |
| pmulhrsw m1, m4, [cq-32*3] |
| pmulhrsw m6, m4, [cq+32*2] |
| pmulhrsw m2, m4, [cq-32*2] |
| pmulhrsw m5, m4, [cq+32*1] |
| pmulhrsw m3, m4, [cq-32*1] |
| pmulhrsw m4, [cq+32*0] |
| %endmacro |
| |
| INIT_ZMM avx512icl |
| INV_TXFM_8X16_FN dct, dct |
| INV_TXFM_8X16_FN dct, identity |
| INV_TXFM_8X16_FN dct, adst |
| INV_TXFM_8X16_FN dct, flipadst |
| |
| cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m3, [o(permB)] |
| vpermq m0, m3, [cq+64*0] |
| vpbroadcastd m4, [o(pw_2896x8)] |
| vpermq m1, m3, [cq+64*1] |
| vpermq m2, m3, [cq+64*2] |
| vpermq m3, m3, [cq+64*3] |
| REPX {pmulhrsw x, m4}, m0, m1, m2, m3 |
| call m(idct_16x8_internal_8bpc).main |
| vpbroadcastd m5, [o(pw_16384)] |
| punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3 |
| punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3 |
| punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3 |
| punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3 |
| REPX {pmulhrsw x, m5}, m4, m0, m2, m1 |
| punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3 |
| punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1 |
| punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3 |
| punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1 |
| punpckhdq m1, m0, m2 ; 1 5 9 13 |
| punpckldq m0, m2 ; 0 4 8 12 |
| punpckldq m2, m3, m4 ; 2 6 10 14 |
| punpckhdq m3, m4 ; 3 7 11 15 |
| jmp tx2q |
| .pass2: |
| vprord m5, [o(int16_perm)], 16 |
| vshufi32x4 m2, m2, q1320 ; 2 10 14 6 |
| vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11 |
| vshufi32x4 m1, m3, q0132 ; 9 13 7 3 |
| vpermb m9, m5, m0 |
| vpermb m7, m5, m2 |
| vpermb m8, m5, m4 |
| vpermb m0, m5, m1 |
| vextracti32x8 ym6, m9, 1 |
| vextracti32x8 ym3, m7, 1 |
| vextracti32x8 ym5, m8, 1 |
| vextracti32x8 ym1, m0, 1 |
| call .main2 |
| mova ym8, [o(gather8a)] |
| lea r3, [dstq+strideq*4] |
| pmovzxdq m9, ym8 |
| pshufd ym8, ym8, q1230 |
| vpermt2q m0, m9, m4 |
| vpermt2q m1, m9, m5 |
| vpermt2q m2, m9, m6 |
| vpermt2q m3, m9, m7 |
| .end: |
| vpbroadcastd m7, [o(pw_2048)] |
| .end2: |
| pmulhrsw m0, m7 |
| pmulhrsw m1, m7 |
| .end3: |
| pmulhrsw m2, m7 |
| pmulhrsw m3, m7 |
| .end4: |
| vpbroadcastd ym6, strided |
| kxnorb k1, k1, k1 |
| pxor m4, m4 |
| pmulld ym8, ym6 |
| kmovb k2, k1 |
| vpgatherdq m6{k1}, [dstq+ym8] |
| kmovb k1, k2 |
| vpgatherdq m7{k2}, [r3+ym8] |
| mova [cq+64*0], m4 |
| mova [cq+64*1], m4 |
| kmovb k2, k1 |
| mova [cq+64*2], m4 |
| mova [cq+64*3], m4 |
| punpcklbw m5, m6, m4 |
| punpckhbw m6, m4 |
| paddw m0, m5 |
| paddw m1, m6 |
| packuswb m0, m1 |
| vpscatterdq [dstq+ym8]{k1}, m0 |
| punpcklbw m6, m7, m4 |
| punpckhbw m7, m4 |
| paddw m2, m6 |
| paddw m3, m7 |
| packuswb m2, m3 |
| vpscatterdq [r3+ym8]{k2}, m2 |
| RET |
| ALIGN function_align |
| cglobal_label .main_fast2 ; bottom three-quarters are zero |
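; with most of the input zero the first butterfly stage collapses: each
; (in*coef1, in*coef2) pair comes from a single pmulhrsw of duplicated
; input words with a packed x8 coefficient pair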
| vpbroadcastd ym10, [o(pd_2048)] |
| vpbroadcastq ym13, [o(int_mshift)] |
| vpbroadcastd ym3, [o(pw_401_4076x8)] |
| vpbroadcastd ym5, [o(pw_799_4017x8)] |
| vpbroadcastd ym4, [o(pw_m1189_3920x8)] |
| pxor ym6, ym6 |
| punpckhwd ym2, ym0, ym0 |
| pmulhrsw ym2, ym3 ; t8a t15a |
| punpcklwd ym7, ym1, ym1 |
| pmulhrsw ym7, ym5 ; t4a t7a |
| punpckhwd ym1, ym1 |
| pmulhrsw ym4, ym1 ; t11a t12a |
| vpcmpub k7, ym13, ym10, 6 |
| punpcklwd ym9, ym6, ym0 |
| psubsw ym0, ym2, ym4 ; t11a t12a |
| paddsw ym8, ym2, ym4 ; t8a t15a |
| mova ym1, ym7 |
| jmp .main5 |
| ALIGN function_align |
| cglobal_label .main_fast ; bottom half is zero |
| vpbroadcastd ym10, [o(pd_2048)] |
| vpbroadcastq ym13, [o(int_mshift)] |
| pxor ym6, ym6 |
| punpckhwd ym8, ym0, ym0 |
| punpckhwd ym4, ym3, ym3 |
| punpckhwd ym5, ym2, ym2 |
| punpcklwd ym7, ym1, ym1 |
| punpckhwd ym1, ym1 |
| punpcklwd ym3, ym3 |
| punpcklwd ym9, ym6, ym0 |
| punpcklwd ym6, ym2 |
| vpbroadcastd ym2, [o(pw_401_4076x8)] |
| vpbroadcastd ym0, [o(pw_m2598_3166x8)] |
| vpbroadcastd ym11, [o(pw_1931_3612x8)] |
| vpbroadcastd ym12, [o(pw_m1189_3920x8)] |
| pmulhrsw ym8, ym2 ; t8a t15a |
| vpbroadcastd ym2, [o(pw_799_4017x8)] |
| pmulhrsw ym0, ym4 ; t9a t14a |
| vpbroadcastd ym4, [o(pw_m2276_3406x8)] |
| pmulhrsw ym5, ym11 ; t10a t13a |
| pmulhrsw ym1, ym12 ; t11a t12a |
| pmulhrsw ym7, ym2 ; t4a t7a |
| pmulhrsw ym3, ym4 ; t5a t6a |
| vpcmpub k7, ym13, ym10, 6 |
| jmp .main4 |
| ALIGN function_align |
| cglobal_label .main |
| WRAP_YMM IDCT16_1D_PACKED |
| ret |
| |
| INV_TXFM_8X16_FN adst, dct |
| INV_TXFM_8X16_FN adst, adst |
| INV_TXFM_8X16_FN adst, flipadst |
| INV_TXFM_8X16_FN adst, identity |
| |
| cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| call m(iadst_16x8_internal_8bpc).main_pass1 |
| vbroadcasti32x4 m6, [o(int_shuf1)] |
| vpbroadcastd m7, [o(pw_16384_m16384)] |
| punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 |
| punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 |
| pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 |
| pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 |
| .pass1_end: |
| REPX {pmulhrsw x, m7}, m3, m5, m4, m2 |
| punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 |
| punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m5 |
| punpckhqdq m3, m5 |
| jmp tx2q |
| .pass2: |
| call .main_pass2 |
| vpbroadcastd m6, [o(pw_2048)] |
| psrlq m10, 4 |
| psubw m7, m8, m6 |
| .pass2_end: |
| vpbroadcastd m5, [o(pw_2896x8)] |
| paddsw m1, m2, m4 |
| psubsw m2, m4 |
| pmulhrsw m1, m5 ; -out7 out4 out6 -out5 |
| pmulhrsw m5, m2 ; out8 -out11 -out9 out10 |
| mova ym8, [o(gather8c)] |
| lea r3, [dstq+strideq] |
| psrlq m2, m10, 4 |
| vpermi2q m2, m0, m3 ; 1 3 13 15 |
| vpermt2q m0, m10, m3 ; 0 2 12 14 |
| psrlq m3, m10, 8 |
| vpermi2q m3, m1, m5 ; 5 7 9 11 |
| psrlq m10, 12 |
| vpermt2q m1, m10, m5 ; 4 6 8 10 |
| pmulhrsw m0, m6 |
| pmulhrsw m1, m6 |
| jmp m(idct_8x16_internal_8bpc).end3 |
| ALIGN function_align |
| .main_pass1: |
| vpbroadcastd m2, [o(pw_2896x8)] |
| pmulhrsw m5, m2, [cq+64*0] |
| pmulhrsw m3, m2, [cq+64*3] |
| pmulhrsw m1, m2, [cq+64*1] |
| pmulhrsw m2, [cq+64*2] |
| movu m4, [o(permA+3)] |
| psrlq m10, m4, 4 |
| mova m6, m4 |
| vpermi2q m4, m5, m3 ; in0 in12 in2 in14 |
| vpermt2q m5, m10, m3 ; in15 in3 in13 in1 |
| vpermi2q m6, m1, m2 ; in4 in8 in6 in10 |
| vpermt2q m1, m10, m2 ; in11 in7 in9 in5 |
| jmp .main |
| ALIGN function_align |
| .main_pass2: |
| mova m4, [o(permC)] |
| psrlq m5, m4, 4 |
| vpermi2q m4, m0, m2 ; in0 in12 in2 in14 |
| psrlq m6, m5, 4 |
| vpermi2q m5, m1, m3 ; in15 in3 in13 in1 |
| psrlq m10, m6, 4 |
| vpermi2q m6, m0, m2 ; in4 in8 in6 in10 |
| vpermt2q m1, m10, m3 ; in11 in7 in9 in5 |
| .main: |
| punpcklwd m0, m4, m5 ; in0 in15 in2 in13 |
| punpckhwd m4, m5 ; in12 in3 in14 in1 |
| punpcklwd m5, m6, m1 ; in4 in11 in6 in9 |
| punpckhwd m6, m1 ; in8 in7 in10 in5 |
| cglobal_label .main2 |
| vpbroadcastd m9, [o(pd_2048)] |
| vpbroadcastq m13, [o(int_mshift)] |
| kxnorb k1, k1, k1 |
| vpcmpub k7, m13, m9, 6 ; 0x33... |
| pxor m8, m8 |
| ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 |
| ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5 |
| ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5 |
| ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5 |
| psubsw m2, m0, m6 ; t9a t8a t11a t10a |
| paddsw m0, m6 ; t1a t0a t3a t2a |
| psubsw m3, m5, m4 ; t13a t12a t15a t14a |
| paddsw m5, m4 ; t5a t4a t7a t6a |
| ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5 |
| psubw m7, m8, m7 |
| ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4 |
| vpbroadcastd m6, [o(pw_3784_m1567)] |
| vpbroadcastd m6{k1}, [o(pw_m3784_1567)] |
| psubsw m1, m0, m5 ; t5 t4 t7 t6 |
| paddsw m0, m5 ; t1 t0 t3 t2 |
| psubsw m4, m2, m3 ; t13a t12a t15a t14a |
| paddsw m2, m3 ; t9a t8a t11a t10a |
| ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a |
| ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15 |
| vbroadcasti32x4 m5, [o(deint_shuf)] |
| pshufb m0, m5 |
| pshufb m2, m5 |
| vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a |
| vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a |
| vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15 |
| vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12 |
| pshufd m2, m2, q1032 ; t7a t6a t15 t14 |
| psubsw m4, m0, m3 ; t3a t2a t11 t10 |
| paddsw m0, m3 ; -out15 out0 out14 -out1 |
| paddsw m3, m1, m2 ; out12 -out3 -out13 out2 |
| psubsw m1, m2 ; t7 t6 t15a t14a |
| punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a |
| punpcklqdq m4, m1 ; t3a t7 t11 t15a |
| ret |
| |
| INV_TXFM_8X16_FN flipadst, dct |
| INV_TXFM_8X16_FN flipadst, adst |
| INV_TXFM_8X16_FN flipadst, flipadst |
| INV_TXFM_8X16_FN flipadst, identity |
| |
| cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| call m(iadst_16x8_internal_8bpc).main_pass1 |
| vbroadcasti32x4 m6, [o(int_shuf2)] |
| vpbroadcastd m7, [o(pw_m16384_16384)] |
| punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 |
| punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 |
| pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 |
| pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 |
| jmp m(iadst_8x16_internal_8bpc).pass1_end |
| .pass2: |
| call m(iadst_8x16_internal_8bpc).main_pass2 |
| vpbroadcastd m7, [o(pw_2048)] |
| psrlq m10, 36 |
| psubw m6, m8, m7 |
| jmp m(iadst_8x16_internal_8bpc).pass2_end |
| |
| INV_TXFM_8X16_FN identity, dct |
| INV_TXFM_8X16_FN identity, adst |
| INV_TXFM_8X16_FN identity, flipadst |
| INV_TXFM_8X16_FN identity, identity |
| |
| cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m0, [o(int16_perm)] |
| vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 |
| vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 |
| vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 |
| vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 |
| vpbroadcastd m5, [o(pw_2896x8)] |
| punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 |
punpckldq m2, m4, m0 ; e0 f0 g0 h0 e1 f1 g1 h1
| punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3 |
| REPX {pmulhrsw x, m5}, m1, m2, m3, m4 |
| punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 |
| punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1 |
| punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2 |
| punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3 |
| jmp tx2q |
| .pass2: |
| vpbroadcastd m7, [o(pw_1697x16)] |
| mova ym8, [o(gather8b)] |
| lea r3, [dstq+strideq*2] |
| pmulhrsw m4, m7, m0 |
| pmulhrsw m5, m7, m1 |
| pmulhrsw m6, m7, m2 |
| pmulhrsw m7, m3 |
| REPX {paddsw x, x}, m0, m1, m2, m3 |
| paddsw m0, m4 |
| paddsw m1, m5 |
| paddsw m2, m6 |
| paddsw m3, m7 |
| jmp m(idct_8x16_internal_8bpc).end |
| |
| %macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] |
| pmovzxbw m%3, [dstq+%5] |
| %ifnum %1 |
| paddw m%3, m%1 |
| %else |
| paddw m%3, %1 |
| %endif |
| pmovzxbw m%4, [dstq+%6] |
| %ifnum %2 |
| paddw m%4, m%2 |
| %else |
| paddw m%4, %2 |
| %endif |
| packuswb m%3, m%4 |
| vpermq m%3, m%3, q3120 |
| mova [dstq+%5], xm%3 |
| vextracti32x4 [dstq+%6], m%3, 1 |
| %endmacro |
| |
| %macro INV_TXFM_16X4_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 16x4 |
| %ifidn %1_%2, dct_dct |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2 |
| %endif |
| %endmacro |
| |
| INIT_ZMM avx512icl |
| INV_TXFM_16X4_FN dct, dct |
| INV_TXFM_16X4_FN dct, adst |
| INV_TXFM_16X4_FN dct, flipadst |
| INV_TXFM_16X4_FN dct, identity |
| |
| cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova xm0, [cq+16*0] |
| mova xm1, [cq+16*1] |
| mova xm2, [cq+16*2] |
| mova xm3, [cq+16*3] |
| mova xm4, [cq+16*4] |
| mova xm5, [cq+16*5] |
| mova xm6, [cq+16*6] |
| mova xm7, [cq+16*7] |
| call m(idct_4x16_internal_8bpc).main |
| vpbroadcastd m8, [o(pw_16384)] |
| vinserti32x4 ym1, xm3, 1 ; 3 2 7 6 |
| vinserti32x4 ym5, xm7, 1 ; b a f e |
| vinserti32x4 ym0, xm2, 1 ; 0 1 4 5 |
| vinserti32x4 ym4, xm6, 1 ; 8 9 c d |
| vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e |
| vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d |
| pmulhrsw m1, m8 |
| pmulhrsw m0, m8 |
| pshufd m1, m1, q1032 |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| jmp tx2q |
| .pass2: |
| IDCT4_1D_PACKED |
| mova m2, [o(permA)] |
| jmp m(iadst_16x4_internal_8bpc).end |
| |
| INV_TXFM_16X4_FN adst, dct |
| INV_TXFM_16X4_FN adst, adst |
| INV_TXFM_16X4_FN adst, flipadst |
| INV_TXFM_16X4_FN adst, identity |
| |
| cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m0, [cq+64*0] |
| mova m1, [cq+64*1] |
| movshdup m3, [o(permB)] |
| psrlq m10, m3, 4 |
| call m(iadst_4x16_internal_8bpc).main2 |
| vpbroadcastd m6, [o(pw_16384_m16384)] |
| psrlq m0, m10, 4 |
| psrlq m10, 8 |
| .pass1_end: |
| punpcklwd ym5, ym4, ym2 |
| punpckhwd ym4, ym2 |
| vinserti32x8 m5, ym4, 1 |
| mova m1, m9 |
| vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} |
| mova m4, m9 |
| vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16} |
| psrad m1, 12 |
| psrad m4, 12 |
| packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5 |
| vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d |
| vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m0, m2 |
| punpcklwd m0, m2 |
| pmulhrsw m0, m6 |
| pmulhrsw m1, m6 |
| jmp tx2q |
| .pass2: |
| call .main |
| movu m2, [o(permA+1)] |
| .end: |
| vpbroadcastd m3, [o(pw_2048)] |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| .end2: |
| psrlq m3, m2, 4 |
| vpermi2q m2, m0, m1 |
| vpermi2q m3, m0, m1 |
| .end3: |
| lea r3, [dstq+strideq*2] |
| mova xm1, [dstq+strideq*0] |
| vinserti32x4 ym1, [dstq+strideq*1], 1 |
| vinserti32x4 m1, [r3 +strideq*0], 2 |
| vinserti32x4 m1, [r3 +strideq*1], 3 |
| pxor m4, m4 |
| mova [cq+64*0], m4 |
| mova [cq+64*1], m4 |
| punpcklbw m0, m1, m4 |
| punpckhbw m1, m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| packuswb m0, m1 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], ym0, 1 |
| vextracti32x4 [r3 +strideq*0], m0, 2 |
| vextracti32x4 [r3 +strideq*1], m0, 3 |
| RET |
| ALIGN function_align |
| .main: |
| IADST4_1D_PACKED |
| ret |
| |
| INV_TXFM_16X4_FN flipadst, dct |
| INV_TXFM_16X4_FN flipadst, adst |
| INV_TXFM_16X4_FN flipadst, flipadst |
| INV_TXFM_16X4_FN flipadst, identity |
| |
| cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m0, [cq+64*0] |
| mova m1, [cq+64*1] |
| movshdup m3, [o(permB)] |
| psrlq m10, m3, 4 |
| call m(iadst_4x16_internal_8bpc).main2 |
| vpbroadcastd m6, [o(pw_m16384_16384)] |
| psrlq m0, m10, 12 |
| psrlq m10, 16 |
| jmp m(iadst_16x4_internal_8bpc).pass1_end |
| .pass2: |
| call m(iadst_16x4_internal_8bpc).main |
| movu m2, [o(permA+2)] |
| jmp m(iadst_16x4_internal_8bpc).end |
| |
| INV_TXFM_16X4_FN identity, dct |
| INV_TXFM_16X4_FN identity, adst |
| INV_TXFM_16X4_FN identity, flipadst |
| INV_TXFM_16X4_FN identity, identity |
| |
| cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m1, [cq+64*0] |
| mova m2, [cq+64*1] |
| vpbroadcastd m3, [o(pw_1697x16)] |
| vpbroadcastd m4, [o(pw_16384)] |
| mova m5, [o(idtx_16x4p)] |
| shufps m0, m1, m2, q2020 |
| shufps m1, m2, q3131 |
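; identity4 scale: out = in + in*1697/4096 ~= sqrt(2)*in
; (the pw_16384 pmulhrsw halves the pw_1697x16 product with rounding)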
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m4 |
| paddsw m0, m2 |
| paddsw m1, m3 |
| vpermb m0, m5, m0 |
| vpermb m1, m5, m1 |
| jmp tx2q |
| .pass2: |
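; identity4 again for pass 2; pw_1697x8 yields in*1697/4096 directly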
| vpbroadcastd m3, [o(pw_1697x8)] |
| pmulhrsw m2, m3, m0 |
| pmulhrsw m3, m1 |
| paddsw m0, m2 |
| paddsw m1, m3 |
| movu m2, [o(permA+1)] |
| jmp m(iadst_16x4_internal_8bpc).end |
| |
| %macro INV_TXFM_16X8_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 16x8 |
| %ifidn %1_%2, dct_dct |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 8 |
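; each imul+sar step below scales the dc coefficient by 181/256 ~= 1/sqrt(2)
; (181 = 2896 >> 4) with rounding; .dconly2 and .dconly3 fold the final
; output rounding shift into the same step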
| .dconly: |
| imul r6d, 181 |
| add r6d, 128 |
| sar r6d, 8 |
| .dconly2: |
| imul r6d, 181 |
| add r6d, 128+256 |
| sar r6d, 8+1 |
| .dconly3: |
| imul r6d, 181 |
| lea r2, [strideq*3] |
| add r6d, 128+2048 |
| sar r6d, 8+4 |
| pxor m2, m2 |
| vpbroadcastw m3, r6d |
| .dconly_loop: |
| mova xm1, [dstq+strideq*0] |
| vinserti32x4 ym1, [dstq+strideq*1], 1 |
| vinserti32x4 m1, [dstq+strideq*2], 2 |
| vinserti32x4 m1, [dstq+r2 ], 3 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| paddw m0, m3 |
| paddw m1, m3 |
| packuswb m0, m1 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], ym0, 1 |
| vextracti32x4 [dstq+strideq*2], m0, 2 |
| vextracti32x4 [dstq+r2 ], m0, 3 |
| lea dstq, [dstq+strideq*4] |
| sub r3d, 4 |
| jg .dconly_loop |
| RET |
| %endif |
| %endmacro |
| |
| %macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd |
| vpbroadcastd m8, [o(pw_2896x8)] |
| vpermq m0, [cq+32*0], q3120 |
| add cq, 32*4 |
| vpermq m7, [cq+32*3], q%1 |
| vpermq m1, [cq-32*3], q%1 |
| vpermq m6, [cq+32*2], q3120 |
| vpermq m2, [cq-32*2], q3120 |
| vpermq m5, [cq+32*1], q%1 |
| vpermq m3, [cq-32*1], q%1 |
| vpermq m4, [cq+32*0], q3120 |
| REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 |
| %endmacro |
| |
| INV_TXFM_16X8_FN dct, dct |
| INV_TXFM_16X8_FN dct, identity |
| INV_TXFM_16X8_FN dct, adst |
| INV_TXFM_16X8_FN dct, flipadst |
| |
| cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpbroadcastd m1, [o(pw_2896x8)] |
| vpermq m0, [cq+64*0], q3120 |
| vpermq m2, [cq+64*1], q3120 |
| vpermq m4, [cq+64*2], q3120 |
| vpermq m6, [cq+64*3], q3120 |
| REPX {pmulhrsw x, m1}, m0, m2, m4, m6 |
| vextracti32x8 ym1, m0, 1 |
| vextracti32x8 ym3, m2, 1 |
| vextracti32x8 ym5, m4, 1 |
| vextracti32x8 ym7, m6, 1 |
| call m(idct_8x16_internal_8bpc).main |
| vbroadcasti32x4 m8, [o(int_shuf1)] |
| vbroadcasti32x4 m9, [o(int_shuf2)] |
| vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3 |
| vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3 |
| vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3 |
| vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3 |
| vpbroadcastd m2, [o(pw_16384)] |
| pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3 |
| pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3 |
| pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3 |
pshufb m7, m5, m9 ; k0 l0 k1 l1 k2 l2 k3 l3
| REPX {pmulhrsw x, m2}, m0, m1, m6, m7 |
| punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1 |
| punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3 |
| jmp tx2q |
| .pass2: |
| vshufi32x4 m0, m2, m4, q2020 ; 0 1 |
| vshufi32x4 m2, m4, q3131 ; 4 5 |
| vshufi32x4 m1, m3, m5, q2020 ; 2 3 |
| vshufi32x4 m3, m5, q3131 ; 6 7 |
| call .main |
| movshdup m4, [o(permC)] |
| psrlq m6, m4, 4 |
| vpermq m5, m4, q1032 |
| vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3 |
| vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1 |
| psrlq m6, m5, 4 |
| vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3 |
| vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1 |
| vpbroadcastd m6, [o(pw_2048)] |
| .end: |
| REPX {pmulhrsw x, m6}, m0, m4, m1, m5 |
| .end2: |
| lea r3, [dstq+strideq*4] |
| lea r4, [strideq*3] |
| mova xm3, [dstq+strideq*0] |
| mova xm6, [dstq+strideq*2] |
| vinserti32x4 ym3, [dstq+strideq*1], 1 |
| vinserti32x4 ym6, [dstq+r4 ], 1 |
| vinserti32x4 m3, [r3 +strideq*0], 2 |
| vinserti32x4 m6, [r3 +strideq*2], 2 |
| vinserti32x4 m3, [r3 +strideq*1], 3 |
| vinserti32x4 m6, [r3 +r4 ], 3 |
| pxor m7, m7 |
| mova [cq+64*0], m7 |
| mova [cq+64*1], m7 |
| mova [cq+64*2], m7 |
| mova [cq+64*3], m7 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| paddw m0, m2 |
| paddw m4, m3 |
| packuswb m0, m4 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], ym0, 1 |
| vextracti32x4 [r3 +strideq*0], m0, 2 |
| vextracti32x4 [r3 +strideq*1], m0, 3 |
| punpcklbw m3, m6, m7 |
| punpckhbw m6, m7 |
| paddw m1, m3 |
| paddw m5, m6 |
| packuswb m1, m5 |
| mova [dstq+strideq*2], xm1 |
| vextracti32x4 [dstq+r4 ], ym1, 1 |
| vextracti32x4 [r3 +strideq*2], m1, 2 |
| vextracti32x4 [r3 +r4 ], m1, 3 |
| RET |
| ALIGN function_align |
| cglobal_label .main |
| IDCT8_1D_PACKED |
| ret |
| |
| INV_TXFM_16X8_FN adst, dct |
| INV_TXFM_16X8_FN adst, adst |
| INV_TXFM_16X8_FN adst, flipadst |
| INV_TXFM_16X8_FN adst, identity |
| |
| cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| call m(iadst_8x16_internal_8bpc).main_pass1 |
| vpbroadcastd m7, [o(pw_16384_m16384)] |
| psrlq m10, 4 |
| .pass1_end: |
| punpcklwd m5, m4, m2 |
| punpckhwd m4, m2 |
| mova m1, m9 |
| vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} |
| mova m6, m9 |
| vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16} |
| mova m2, m9 |
| vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16} |
| vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16} |
| psrad m1, 12 |
| psrad m6, 12 |
| packssdw m1, m6 ; out8 -out7 -out9 out6 |
| psrad m2, 12 |
| psrad m9, 12 |
| packssdw m2, m9 ; -out11 out4 out10 -out5 |
| psrlq m4, m10, 4 |
| vpermi2q m4, m0, m2 |
| vpermt2q m0, m10, m2 |
| psrlq m5, m10, 8 |
| vpermi2q m5, m1, m3 |
| psrlq m10, 12 |
| vpermt2q m1, m10, m3 |
| punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3 |
| punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3 |
punpcklwd m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
| punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3 |
| punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1 |
| punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3 |
| REPX {pmulhrsw x, m7}, m2, m3, m4, m5 |
| jmp tx2q |
| .pass2: |
| vshufi32x4 m0, m2, m4, q2020 |
| vshufi32x4 m2, m4, q3131 ; 4 5 |
| vshufi32x4 m1, m3, m5, q2020 |
| vshufi32x4 m3, m5, q3131 ; 6 7 |
| pshufd m4, m0, q1032 ; 1 0 |
| pshufd m5, m1, q1032 ; 3 2 |
| call .main_pass2 |
| movshdup m4, [o(permC)] |
| pmulhrsw m0, m6 |
| pmulhrsw m1, m6 |
| psrlq m6, m4, 4 |
| mova m5, m4 |
| vpermi2q m4, m0, m2 |
| vpermt2q m0, m6, m2 |
| vpermi2q m5, m1, m3 |
| vpermt2q m1, m6, m3 |
| jmp m(idct_16x8_internal_8bpc).end2 |
| ALIGN function_align |
| .main_pass1: |
| vpbroadcastd m4, [o(pw_2896x8)] |
| pmulhrsw m3, m4, [cq+64*0] |
| pmulhrsw m1, m4, [cq+64*3] |
| pmulhrsw m2, m4, [cq+64*1] |
| pmulhrsw m4, [cq+64*2] |
| mova m5, [o(int16_perm)] |
| kxnorb k1, k1, k1 |
| vpblendmd m0{k1}, m1, m3 ; 0 7 |
| vmovdqa32 m3{k1}, m1 ; 6 1 |
| vpblendmd m1{k1}, m4, m2 ; 2 5 |
| vmovdqa32 m2{k1}, m4 ; 4 3 |
| REPX {vpermb x, m5, x}, m0, m1, m2, m3 |
| IADST8_1D_PACKED 1 |
| ret |
| ALIGN function_align |
| cglobal_label .main_pass2 |
| IADST8_1D_PACKED 2 |
| pxor m5, m5 |
| psubd m5, m6 |
| packssdw m6, m5 |
| pmulhrsw m2, m6 |
| pmulhrsw m3, m6 |
| ret |
| |
| INV_TXFM_16X8_FN flipadst, dct |
| INV_TXFM_16X8_FN flipadst, adst |
| INV_TXFM_16X8_FN flipadst, flipadst |
| INV_TXFM_16X8_FN flipadst, identity |
| |
| cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| call m(iadst_8x16_internal_8bpc).main_pass1 |
| vpbroadcastd m7, [o(pw_m16384_16384)] |
| psrlq m10, 20 |
| jmp m(iadst_16x8_internal_8bpc).pass1_end |
| .pass2: |
| vshufi32x4 m0, m2, m4, q2020 |
| vshufi32x4 m2, m4, q3131 ; 4 5 |
| vshufi32x4 m1, m3, m5, q2020 |
| vshufi32x4 m3, m5, q3131 ; 6 7 |
| pshufd m4, m0, q1032 ; 1 0 |
| pshufd m5, m1, q1032 ; 3 2 |
| call m(iadst_16x8_internal_8bpc).main_pass2 |
| movshdup m4, [o(permC)] |
| pmulhrsw m5, m6, m0 |
| pmulhrsw m0, m6, m1 |
| psrlq m1, m4, 12 |
| psrlq m4, 8 |
| mova m7, m4 |
| vpermi2q m4, m0, m3 |
| vpermt2q m0, m1, m3 |
| vpermi2q m1, m5, m2 |
| vpermt2q m5, m7, m2 |
| jmp m(idct_16x8_internal_8bpc).end2 |
| |
| INV_TXFM_16X8_FN identity, dct |
| INV_TXFM_16X8_FN identity, adst |
| INV_TXFM_16X8_FN identity, flipadst |
| INV_TXFM_16X8_FN identity, identity |
| |
| cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| vpbroadcastd m0, [o(pw_2896x8)] |
| pmulhrsw m3, m0, [cq+64*0] |
| pmulhrsw m4, m0, [cq+64*1] |
| pmulhrsw m5, m0, [cq+64*2] |
| pmulhrsw m0, [cq+64*3] |
| vpbroadcastd m7, [o(pw_1697x16)] |
| vpbroadcastd m8, [o(pw_16384)] |
| shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5 |
| shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7 |
| shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5 |
| shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7 |
| mova m9, [o(int8_permA)] |
| pmulhrsw m0, m7, m2 |
| pmulhrsw m1, m7, m3 |
| pmulhrsw m6, m7, m4 |
| pmulhrsw m7, m5 |
| REPX {pmulhrsw x, m8}, m0, m1, m6, m7 |
| paddsw m2, m0 |
| paddsw m3, m1 |
| paddsw m4, m6 |
| paddsw m5, m7 |
| REPX {vpermb x, m9, x}, m2, m3, m4, m5 |
| jmp tx2q |
| .pass2: |
| mova m7, [o(permB)] |
| vpbroadcastd m6, [o(pw_4096)] |
| vpermq m0, m7, m2 |
| vpermq m4, m7, m4 |
| vpermq m1, m7, m3 |
| vpermq m5, m7, m5 |
| jmp m(idct_16x8_internal_8bpc).end |
| |
| %macro INV_TXFM_16X16_FN 2 ; type1, type2 |
| INV_TXFM_FN %1, %2, 16x16 |
| %ifidn %1_%2, dct_dct |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 16 |
| imul r6d, 181 |
| add r6d, 128+512 |
| sar r6d, 8+2 |
| jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 |
| %endif |
| %endmacro |
| |
| INV_TXFM_16X16_FN dct, dct |
| INV_TXFM_16X16_FN dct, identity |
| INV_TXFM_16X16_FN dct, adst |
| INV_TXFM_16X16_FN dct, flipadst |
| |
| cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m7, [o(permB)] |
| vpermq m0, m7, [cq+64*0] |
| vpermq m1, m7, [cq+64*1] |
| vpermq m2, m7, [cq+64*2] |
| vpermq m3, m7, [cq+64*3] |
| vpermq m4, m7, [cq+64*4] |
| vpermq m5, m7, [cq+64*5] |
| vpermq m6, m7, [cq+64*6] |
| vpermq m7, m7, [cq+64*7] |
| call .main |
| vbroadcasti32x4 m12, [o(int_shuf1)] |
| vbroadcasti32x4 m11, [o(int_shuf2)] |
| vpbroadcastd m13, [o(pw_8192)] |
| pshufb m0, m12 |
| pshufb m8, m1, m11 |
| pshufb m2, m12 |
| pshufb m9, m3, m11 |
| pshufb m4, m12 |
| pshufb m10, m5, m11 |
| pshufb m6, m12 |
| pshufb m11, m7, m11 |
| REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11 |
| punpckhdq m1, m0, m8 |
| punpckldq m0, m8 |
| punpckhdq m3, m2, m9 |
| punpckldq m2, m9 |
| punpckhdq m5, m4, m10 |
| punpckldq m4, m10 |
| punpckhdq m7, m6, m11 |
| punpckldq m6, m11 |
| jmp tx2q |
| .pass2: |
| vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc |
| vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 |
| vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec |
| vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 |
| vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me |
| vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 |
| vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee |
| vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 |
| vshufi32x4 m2, m0, m4, q3131 ; 4 5 |
| vshufi32x4 m0, m4, q2020 ; 0 1 |
| vshufi32x4 m4, m6, m8, q2020 ; 8 9 |
| vshufi32x4 m6, m8, q3131 ; 12 13 |
| vshufi32x4 m3, m1, m5, q3131 ; 6 7 |
| vshufi32x4 m1, m5, q2020 ; 2 3 |
| vshufi32x4 m5, m7, m9, q2020 ; 10 11 |
| vshufi32x4 m7, m9, q3131 ; 14 15 |
| call .main |
| mova m8, [o(permD)] |
| psrlq m12, m8, 4 |
| psrlq m9, m8, 8 |
| psrlq m13, m8, 12 |
| mova m10, m8 |
| vpermi2q m8, m0, m2 ; 0 1 4 5 |
| vpermt2q m0, m12, m2 |
| mova m11, m9 |
| vpermi2q m9, m1, m3 ; 2 3 6 7 |
| vpermt2q m1, m13, m3 |
| vpermi2q m10, m4, m6 ; 8 9 12 13 |
| vpermt2q m4, m12, m6 |
| vpermi2q m11, m5, m7 ; 10 11 14 15 |
| vpermt2q m5, m13, m7 |
| .end: |
| vpbroadcastd m12, [o(pw_2048)] |
| .end2: |
| REPX {pmulhrsw x, m12}, m0, m1, m4, m5 |
| .end3: |
| REPX {pmulhrsw x, m12}, m8, m9, m10, m11 |
| lea r3, [strideq*3] |
| lea r4, [dstq+strideq*4] |
| lea r5, [dstq+strideq*8] |
| lea r6, [r4 +strideq*8] |
| mova xm3, [dstq+strideq*0] |
| mova xm6, [dstq+strideq*2] |
| vinserti32x4 ym3, [dstq+strideq*1], 1 |
| vinserti32x4 ym6, [dstq+r3 ], 1 |
| vinserti32x4 m3, [r4+strideq*0], 2 |
| vinserti32x4 m6, [r4+strideq*2], 2 |
| vinserti32x4 m3, [r4+strideq*1], 3 |
| vinserti32x4 m6, [r4+r3 ], 3 |
| mova xm12, [r5+strideq*0] |
| mova xm13, [r5+strideq*2] |
| vinserti32x4 ym12, [r5+strideq*1], 1 |
| vinserti32x4 ym13, [r5+r3 ], 1 |
| vinserti32x4 m12, [r6+strideq*0], 2 |
| vinserti32x4 m13, [r6+strideq*2], 2 |
| vinserti32x4 m12, [r6+strideq*1], 3 |
| vinserti32x4 m13, [r6+r3 ], 3 |
| pxor m7, m7 |
| REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 |
| punpcklbw m2, m3, m7 |
| punpckhbw m3, m7 |
| paddw m0, m2 |
| paddw m8, m3 |
| packuswb m0, m8 |
| punpcklbw m2, m6, m7 |
| punpckhbw m6, m7 |
| paddw m1, m2 |
| paddw m9, m6 |
| packuswb m1, m9 |
| punpcklbw m2, m12, m7 |
| punpckhbw m12, m7 |
| paddw m2, m4 |
| paddw m10, m12 |
| packuswb m2, m10 |
| punpcklbw m3, m13, m7 |
| punpckhbw m13, m7 |
| paddw m3, m5 |
| paddw m11, m13 |
| packuswb m3, m11 |
| mova [dstq+strideq*0], xm0 |
| vextracti32x4 [dstq+strideq*1], ym0, 1 |
| mova [dstq+strideq*2], xm1 |
| vextracti32x4 [dstq+r3 ], ym1, 1 |
| vextracti32x4 [r4+strideq*0], m0, 2 |
| vextracti32x4 [r4+strideq*1], m0, 3 |
| vextracti32x4 [r4+strideq*2], m1, 2 |
| vextracti32x4 [r4+r3 ], m1, 3 |
| mova [r5+strideq*0], xm2 |
| vextracti32x4 [r5+strideq*1], ym2, 1 |
| mova [r5+strideq*2], xm3 |
| vextracti32x4 [r5+r3 ], ym3, 1 |
| vextracti32x4 [r6+strideq*0], m2, 2 |
| vextracti32x4 [r6+strideq*1], m2, 3 |
| vextracti32x4 [r6+strideq*2], m3, 2 |
| vextracti32x4 [r6+r3 ], m3, 3 |
| RET |
| ALIGN function_align |
| cglobal_label .main_fast2 ; bottom three-quarters are zero |
| vpbroadcastd m10, [o(pd_2048)] |
| vpbroadcastq m13, [o(int_mshift)] |
| vpcmpub k7, m13, m10, 6 |
| .main_fast4: |
| vpbroadcastd m2, [o(pw_401_4076x8)] |
| vpbroadcastd m4, [o(pw_m1189_3920x8)] |
| vpbroadcastd m3, [o(pw_799_4017x8)] |
| pmulhrsw m2, m8 ; t8a t15a |
| pmulhrsw m4, m1 ; t11a t12a |
| pmulhrsw m7, m3 ; t4a t7a |
| pxor m6, m6 |
| psubsw m0, m2, m4 ; t11a t12a |
| paddsw m8, m2, m4 ; t8a t15a |
| mova m1, m7 |
| jmp .main5 |
| ALIGN function_align |
| cglobal_label .main_fast ; bottom half is zero |
| vpbroadcastd m10, [o(pd_2048)] |
| .main_fast3: |
| vpbroadcastq m13, [o(int_mshift)] |
| vpcmpub k7, m13, m10, 6 |
| .main_fast5: |
| vpbroadcastd m2, [o(pw_401_4076x8)] |
| vpbroadcastd m4, [o(pw_m2598_3166x8)] |
| vpbroadcastd m11, [o(pw_1931_3612x8)] |
| vpbroadcastd m12, [o(pw_m1189_3920x8)] |
| pmulhrsw m8, m2 ; t8a t15a |
| vpbroadcastd m2, [o(pw_799_4017x8)] |
| pmulhrsw m0, m4 ; t9a t14a |
| vpbroadcastd m4, [o(pw_m2276_3406x8)] |
| pmulhrsw m5, m11 ; t10a t13a |
| pmulhrsw m1, m12 ; t11a t12a |
| pmulhrsw m7, m2 ; t4a t7a |
| pmulhrsw m3, m4 ; t5a t6a |
| jmp .main4 |
| ALIGN function_align |
| cglobal_label .main |
| IDCT16_1D_PACKED |
| ret |
| |
| INV_TXFM_16X16_FN adst, dct |
| INV_TXFM_16X16_FN adst, adst |
| INV_TXFM_16X16_FN adst, flipadst |
| |
| cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| call .main_pass1 |
| vpbroadcastd m10, [o(pw_8192_m8192)] |
| punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 |
| punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 |
| punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 |
| punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 |
| punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 |
| punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 |
| punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 |
| punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 |
| punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 |
| punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 |
| punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 |
| punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 |
| punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 |
| punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 |
| .pass1_end: |
| REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 |
| jmp tx2q |
| .pass2: |
| call .main_pass2 |
| mova m10, [o(permD)] |
| psrlq m8, m10, 8 |
| psrlq m12, m10, 12 |
| psrlq m13, m10, 4 |
| mova m9, m8 |
| vpermi2q m8, m0, m2 ; 0 1 4 5 |
| vpermt2q m0, m12, m2 |
| vpermi2q m9, m1, m3 ; 2 3 6 7 |
| vpermt2q m1, m12, m3 |
| vpbroadcastd m12, [o(pw_2048)] |
| mov r3d, 0xff00ff00 |
| mova m11, m10 |
| vpermi2q m10, m4, m6 ; 8 9 12 13 |
| vpermt2q m4, m13, m6 |
| kmovd k1, r3d |
| vpermi2q m11, m5, m7 ; 10 11 14 15 |
| vpermt2q m5, m13, m7 |
| pxor m7, m7 |
| vpsubw m12{k1}, m7, m12 |
| jmp m(idct_16x16_internal_8bpc).end2 |
| ALIGN function_align |
| .main_pass1: |
| mova m4, [o(permB)] |
| psrlq m3, m4, 4 |
| vpermq m0, m4, [cq+64*0] |
| vpermq m7, m3, [cq+64*7] |
| vpermq m6, m4, [cq+64*6] |
| vpermq m1, m3, [cq+64*1] |
| vpermq m2, m4, [cq+64*2] |
| vpermq m5, m3, [cq+64*5] |
| vpermq m4, m4, [cq+64*4] |
| vpermq m3, m3, [cq+64*3] |
| call .main |
| vpbroadcastd m13, [o(pw_2896_2896)] |
| vpbroadcastd m12, [o(pw_m2896_2896)] |
| mova m2, m10 |
| vpdpwssd m2, m5, m13 ; -out5 |
| mova m8, m10 |
| vpdpwssd m8, m11, m13 ; out4 |
| mova m9, m10 |
| vpdpwssd m9, m5, m12 ; out10 |
| mova m5, m10 |
| vpdpwssd m5, m11, m12 ; -out11 |
| mova m11, m10 |
| vpdpwssd m11, m3, m13 ; -out7 |
| mova m14, m10 |
| vpdpwssd m14, m4, m13 ; out6 |
| mova m13, m10 |
| vpdpwssd m13, m3, m12 ; out8 |
| vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 |
| REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 |
| packssdw m2, m8 ; -out5 out4 |
| packssdw m5, m9, m5 ; out10 -out11 |
| packssdw m3, m11, m14 ; -out7 out6 |
| packssdw m4, m13, m10 ; out8 -out9 |
| ret |
| ALIGN function_align |
| .main_pass2: |
| vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc |
| vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 |
| vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec |
| vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 |
| vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me |
| vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 |
| vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee |
| vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 |
| vshufi32x4 m2, m0, m4, q3131 ; 4 5 |
| vshufi32x4 m0, m4, q2020 ; 0 1 |
| vshufi32x4 m4, m6, m8, q2020 ; 8 9 |
| vshufi32x4 m6, m8, q3131 ; 12 13 |
| vshufi32x4 m3, m1, m5, q3131 ; 6 7 |
| vshufi32x4 m1, m5, q2020 ; 2 3 |
| vshufi32x4 m5, m7, m9, q2020 ; 10 11 |
| vshufi32x4 m7, m9, q3131 ; 14 15 |
| cglobal_label .main_pass2b |
| REPX {pshufd x, x, q1032}, m1, m3, m5, m7 |
| call .main |
| vpbroadcastd m8, [o(pw_2896x8)] |
| pshufb m2, m11, m12 |
| pshufb m5, m12 |
| pshufb m3, m12 |
| pshufb m4, m12 |
| punpcklqdq m9, m5, m2 ; t15a t7 |
| punpckhqdq m5, m2 ; t14a t6 |
| shufps m2, m3, m4, q1032 ; t2a t10 |
| shufps m3, m4, q3210 ; t3a t11 |
| psubsw m4, m2, m3 ; out8 -out9 |
| paddsw m3, m2 ; -out7 out6 |
| paddsw m2, m5, m9 ; -out5 out4 |
| psubsw m5, m9 ; out10 -out11 |
| REPX {pmulhrsw x, m8}, m2, m3, m4, m5 |
| ret |
| ALIGN function_align |
| .main: |
| vpbroadcastd m10, [o(pd_2048)] |
| vpbroadcastq m13, [o(int_mshift)] |
| punpckhwd m8, m7, m0 ; in14 in1 |
| punpcklwd m0, m7 ; in0 in15 |
| punpcklwd m7, m6, m1 ; in12 in3 |
| punpckhwd m1, m6 ; in2 in13 |
| punpckhwd m6, m5, m2 ; in10 in5 |
| punpcklwd m2, m5 ; in4 in11 |
| punpcklwd m5, m4, m3 ; in8 in7 |
| punpckhwd m3, m4 ; in6 in9 |
| vpcmpub k7, m13, m10, 6 ; 0x33... |
| ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 |
| ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 |
| ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 |
| ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 |
| ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 |
| ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 |
| ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 |
| ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 |
| psubsw m4, m0, m5 ; t9a t8a |
| paddsw m0, m5 ; t1a t0a |
| psubsw m5, m1, m6 ; t11a t10a |
| paddsw m1, m6 ; t3a t2a |
| psubsw m6, m2, m7 ; t13a t12a |
| paddsw m2, m7 ; t5a t4a |
| psubsw m7, m3, m8 ; t15a t14a |
| paddsw m3, m8 ; t7a t6a |
| ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 |
| ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 |
| ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 |
| ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 |
| psubsw m8, m1, m3 ; t7 t6 |
| paddsw m1, m3 ; t3 t2 |
| psubsw m3, m0, m2 ; t5 t4 |
| paddsw m0, m2 ; t1 t0 |
| psubsw m2, m5, m7 ; t14a t15a |
| paddsw m7, m5 ; t10a t11a |
| psubsw m5, m4, m6 ; t12a t13a |
| paddsw m4, m6 ; t8a t9a |
| ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a |
| ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a |
| ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 |
| ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 |
| vbroadcasti32x4 m12, [o(deint_shuf)] |
| paddsw m6, m4, m7 ; -out1 out14 |
| psubsw m4, m7 ; t10 t11 |
| psubsw m11, m3, m8 ; t7 t6 |
| paddsw m8, m3 ; out12 -out3 |
| psubsw m3, m0, m1 ; t3a t2a |
| paddsw m0, m1 ; -out15 out0 |
| paddsw m1, m2, m5 ; -out13 out2 |
| psubsw m5, m2 ; t15a t14a |
| pshufb m0, m12 |
| pshufb m6, m12 |
| pshufb m8, m12 |
| pshufb m1, m12 |
| shufps m7, m6, m0, q1032 ; out14 -out15 |
| shufps m0, m6, m0, q3210 ; -out1 out0 |
| punpcklqdq m6, m8, m1 ; out12 -out13 |
| punpckhqdq m1, m8, m1 ; -out3 out2 |
| ret |
| |
| INV_TXFM_16X16_FN flipadst, dct |
| INV_TXFM_16X16_FN flipadst, adst |
| INV_TXFM_16X16_FN flipadst, flipadst |
| |
| cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| call m(iadst_16x16_internal_8bpc).main_pass1 |
| vpbroadcastd m10, [o(pw_m8192_8192)] |
| punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 |
| punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 |
| punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 |
| punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 |
| punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 |
| punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 |
| punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 |
| punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 |
| punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 |
| punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 |
| punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 |
| punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 |
| punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 |
| punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 |
| jmp m(iadst_16x16_internal_8bpc).pass1_end |
| .pass2: |
| call m(iadst_16x16_internal_8bpc).main_pass2 |
| mova m10, [o(permD)] |
| psrlq m8, m10, 8 |
| psrlq m12, m10, 12 |
| psrlq m13, m10, 4 |
| mova m9, m8 |
| vpermi2q m8, m7, m5 ; 0 1 4 5 |
| vpermt2q m7, m12, m5 |
| vpermi2q m9, m6, m4 ; 2 3 6 7 |
| vpermt2q m6, m12, m4 |
| vpbroadcastd m12, [o(pw_2048)] |
| mov r3d, 0x00ff00ff |
| mova m11, m10 |
| vpermi2q m10, m3, m1 ; 8 9 12 13 |
| vpermt2q m3, m13, m1 |
| kmovd k1, r3d |
| vpermi2q m11, m2, m0 ; 10 11 14 15 |
| vpermt2q m2, m13, m0 |
| pxor m0, m0 |
| vpsubw m12{k1}, m0, m12 |
| pmulhrsw m0, m7, m12 |
| pmulhrsw m1, m6, m12 |
| pmulhrsw m4, m3, m12 |
| pmulhrsw m5, m2, m12 |
| jmp m(idct_16x16_internal_8bpc).end3 |
| |
| INV_TXFM_16X16_FN identity, dct |
| INV_TXFM_16X16_FN identity, identity |
| |
| cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 |
| mova m8, [o(int16_perm)] |
| vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 |
| vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 |
| vpbroadcastd m0, [o(pw_1697x16)] |
| vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 |
| vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 |
| vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3 |
| vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3 |
| vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3 |
| vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3 |
| pmulhrsw m9, m0, m1 |
| pmulhrsw m10, m0, m2 |
| pmulhrsw m11, m0, m3 |
| pmulhrsw m12, m0, m4 |
| pmulhrsw m13, m0, m5 |
| pmulhrsw m14, m0, m6 |
| pmulhrsw m15, m0, m7 |
| pmulhrsw m0, m8 |
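; fold the 2*sqrt(2) identity16 scale and the usual pw_8192 (1/4) pass 1
; rounding into a single pavgw: out ~= (in + in*1697/4096 + 1) >> 1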
| REPX {psraw x, 1}, m9, m10, m11, m12 |
| pavgw m1, m9 |
| pavgw m2, m10 |
| pavgw m3, m11 |
| pavgw m4, m12 |
| REPX {psraw x, 1}, m13, m14, m15, m0 |
| pavgw m5, m13 |
| pavgw m6, m14 |
| pavgw m7, m15 |
| pavgw m8, m0 |
| punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 |
| punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 |
| punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1 |
| punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3 |
| punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 |
| punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 |
| jmp tx2q |
| ALIGN function_align |
| .pass2: |
| vpbroadcastd m11, [o(pw_1697x16)] |
| pmulhrsw m12, m11, m0 |
| pmulhrsw m13, m11, m1 |
| pmulhrsw m14, m11, m2 |
| pmulhrsw m15, m11, m3 |
| pmulhrsw m8, m11, m4 |
| pmulhrsw m9, m11, m5 |
| pmulhrsw m10, m11, m6 |
| pmulhrsw m11, m7 |
| REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 |
| paddsw m0, m12 |
| paddsw m1, m13 |
| paddsw m2, m14 |
| paddsw m3, m15 |
| paddsw m8, m4 |
| movu m4, [o(permD+2)] |
| paddsw m9, m5 |
| paddsw m6, m10 |
| paddsw m7, m11 |
| psrlq m12, m4, 4 |
| mova m5, m4 |
| mova m10, m4 |
| mova m11, m4 |
| vpermi2q m4, m0, m2 ; 8 9 12 13 |
| vpermt2q m0, m12, m2 ; 0 1 4 5 |
| vpermi2q m5, m1, m3 ; 10 11 14 15 |
| vpermt2q m1, m12, m3 ; 2 3 6 7 |
| vpermi2q m10, m8, m6 |
| vpermt2q m8, m12, m6 |
| vpermi2q m11, m9, m7 |
| vpermt2q m9, m12, m7 |
| jmp m(idct_16x16_internal_8bpc).end |
| |
| %macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] |
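; duplicate each word of src (punpck[lh]wd with itself) and pmulhrsw by a
; packed constant pair, yielding two rotated terms per input word; this is
; the first idct32 butterfly stage when the partner input is known to be zero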
| vpbroadcastd m%4, [o(pw_%5_%6x8)] |
| punpcklwd m%1, m%3, m%3 |
| pmulhrsw m%1, m%4 |
| vpbroadcastd m%4, [o(pw_%7_%8x8)] |
| punpckhwd m%2, m%3, m%3 |
| pmulhrsw m%2, m%4 |
| %endmacro |
| |
| cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob |
| %undef cmp |
| lea r5, [o_base] |
| test eobd, eobd |
| jz .dconly |
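; eob < 107 guarantees the right half of each 64-byte coefficient row
; is zero (see .fast), so a half-width first pass suffices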
| cmp eobd, 107 |
| jb .fast |
| mova m5, [cq+64*5] |
| mova m3, [cq+64*3] |
| mova m1, [cq+64*1] |
| mova m7, [cq+64*7] |
| mova m2, [cq+64*2] |
| mova m6, [cq+64*6] |
| mova m0, [cq+64*0] |
| mova m4, [cq+64*4] |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main |
| mova m8, [o(idct_8x32p)] |
| vpbroadcastd m9, [o(pw_8192)] |
| REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7 |
| punpckldq m8, m0, m1 ; ab |
| punpckhdq m0, m1 |
| punpckldq m1, m2, m3 ; cd |
| punpckhdq m2, m3 |
| punpckldq m3, m4, m5 ; ef |
| punpckhdq m4, m5 |
| punpckldq m5, m6, m7 ; gh |
| punpckhdq m6, m7 |
| REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6 |
| punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9 |
| punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21 |
| punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13 |
punpckhqdq m15, m0, m2 ; 28 4 24 8 7 25 15 17
| punpcklqdq m20, m3, m5 |
| punpckhqdq m16, m3, m5 |
| punpcklqdq m19, m4, m6 |
| punpckhqdq m17, m4, m6 |
| vinserti32x4 ym8, ym18, xm20, 1 |
| vshufi32x4 ym1, ym18, ym20, 0x03 |
| vinserti32x4 ym9, ym14, xm16, 1 |
| vshufi32x4 ym3, ym14, ym16, 0x03 |
| vinserti32x4 ym0, ym21, xm19, 1 |
| vshufi32x4 ym5, ym21, ym19, 0x03 |
| vinserti32x4 ym7, ym15, xm17, 1 |
| vshufi32x4 ym6, ym15, ym17, 0x03 |
| call m(idct_8x16_internal_8bpc).main2 |
| psrlq m12, [o(permB)], 60 |
| vpermt2q m14, m12, m16 |
| vpermt2q m21, m12, m19 |
| vpermt2q m15, m12, m17 |
| vpermi2q m12, m18, m20 |
| vextracti32x8 ym16, m14, 1 |
| vextracti32x8 ym19, m21, 1 |
| vextracti32x8 ym17, m15, 1 |
| vextracti32x8 ym20, m12, 1 |
| call .main2 |
| jmp .end |
| .fast: ; right half is zero |
| mova m0, [o(int16_perm)] |
| mova ym2, [cq+64*4] |
| vinserti32x8 m2, [cq+64*0], 1 |
| mova ym3, [cq+64*6] |
| vinserti32x8 m3, [cq+64*2], 1 |
| mova ym4, [cq+64*3] |
| vinserti32x8 m4, [cq+64*5], 1 |
| mova ym5, [cq+64*7] |
| vinserti32x8 m5, [cq+64*1], 1 |
| REPX {vpermb x, m0, x}, m2, m3, m4, m5 |
| call m(idct_16x8_internal_8bpc).main2 |
| vbroadcasti32x4 m4, [o(int_shuf3)] |
| vbroadcasti32x4 m5, [o(int_shuf4)] |
| pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3 |
| pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3 |
| pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3 |
| pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3 |
| vpbroadcastd m4, [o(pw_8192)] |
| psrlq m5, [o(permB)], 60 |
| punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2 |
| punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3 |
| punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2 |
| punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3 |
| REPX {pmulhrsw x, m4}, m6, m17, m2, m16 |
| vinserti32x4 ym0, ym2, xm6, 1 ; 0 2 |
| vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 |
| vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 |
| vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 |
| vpermt2q m2, m5, m6 ; 8 10 |
| vpermt2q m16, m5, m17 ; 9 11 |
| vextracti32x8 ym3, m2, 1 ; 12 14 |
| vextracti32x8 ym17, m16, 1 ; 13 15 |
| call m(idct_8x16_internal_8bpc).main_fast |
| call .main_fast |
| .end: |
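; add the residue to dst eight 8-pixel rows at a time via gather/scatter;
; vpgatherdq/vpscatterdq zero their mask register when done, so an all-ones
; mask is ping-ponged between k1 and k2 with kmovb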
| vpbroadcastd ym8, strided |
| pmulld ym8, [o(gather8d)] |
| call .main_end |
| lea r3, [dstq+strideq*4] |
| kxnorb k1, k1, k1 |
| lea r4, [dstq+strideq*8] |
| pxor m9, m9 |
| lea r1, [r3+strideq*8] |
| kmovb k2, k1 |
| vpgatherdq m12{k1}, [r0+ym8] |
| kmovb k1, k2 |
| vpgatherdq m13{k2}, [r3+ym8] |
| kmovb k2, k1 |
| vpgatherdq m14{k1}, [r4+ym8] |
| kmovb k1, k2 |
| vpgatherdq m15{k2}, [r1+ym8] |
| REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 |
| REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 |
| punpcklbw m11, m12, m9 |
| punpckhbw m12, m9 |
| paddw m0, m11 |
| paddw m1, m12 |
| packuswb m0, m1 |
| kmovb k2, k1 |
| vpscatterdq [r0+ym8]{k1}, m0 |
| punpcklbw m12, m13, m9 |
| punpckhbw m13, m9 |
| paddw m2, m12 |
| paddw m3, m13 |
| packuswb m2, m3 |
| kmovb k1, k2 |
| vpscatterdq [r3+ym8]{k2}, m2 |
| punpcklbw m13, m14, m9 |
| punpckhbw m14, m9 |
| paddw m4, m13 |
| paddw m5, m14 |
| packuswb m4, m5 |
| kmovb k2, k1 |
| vpscatterdq [r4+ym8]{k1}, m4 |
| punpcklbw m14, m15, m9 |
| punpckhbw m15, m9 |
| paddw m6, m14 |
| paddw m7, m15 |
| packuswb m6, m7 |
| vpscatterdq [r1+ym8]{k2}, m6 |
| RET |
| .dconly: |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 32 |
| imul r6d, 181 |
| add r6d, 128+512 |
| sar r6d, 8+2 |
| jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 |
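; the 32-point odd-half kernel below operates on packed 8-wide rows,
; two of which fit in a ymm register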
| INIT_YMM avx512icl |
| ALIGN function_align |
| cglobal_label .main_fast2 ; bottom three-quarters are zero |
| ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a |
| ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a |
| mova m11, m12 |
| mova m17, m20 |
| mova m15, m21 |
| mova m16, m14 |
| jmp .main4 |
| ALIGN function_align |
| cglobal_label .main_fast ; bottom half is zero |
| ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a |
| ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a |
| ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a |
| ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a |
| jmp .main3 |
| ALIGN function_align |
| cglobal_label .main |
| punpcklwd m12, m21, m14 ; in31 in1 |
| punpckhwd m14, m21 ; in3 in29 |
| punpcklwd m21, m20, m15 ; in27 in5 |
| punpckhwd m15, m20 ; in7 in25 |
| punpcklwd m20, m19, m16 ; in23 in9 |
| punpckhwd m16, m19 ; in11 in21 |
| punpcklwd m19, m18, m17 ; in19 in13 |
| punpckhwd m17, m18 ; in15 in17 |
| .main2: |
| ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a |
| ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a |
| ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a |
| ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a |
| ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a |
| ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a |
| ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a |
| ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a |
| .main3: |
| psubsw m11, m12, m17 ; t17 t30 |
| paddsw m12, m17 ; t16 t31 |
| psubsw m17, m15, m20 ; t18 t29 |
| paddsw m20, m15 ; t19 t28 |
| psubsw m15, m21, m16 ; t21 t26 |
| paddsw m21, m16 ; t20 t27 |
| psubsw m16, m14, m19 ; t22 t25 |
| paddsw m14, m19 ; t23 t24 |
| .main4: |
| ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a |
| ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a |
| ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a |
| ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a |
| vpbroadcastd m8, [o(pw_m3784_1567)] |
| psubsw m19, m12, m20 ; t19a t28a |
| paddsw m20, m12 ; t16a t31a |
| psubsw m12, m14, m21 ; t20a t27a |
| paddsw m14, m21 ; t23a t24a |
| psubsw m21, m11, m17 ; t18 t29 |
| paddsw m11, m17 ; t17 t30 |
| psubsw m17, m16, m15 ; t21 t26 |
| paddsw m16, m15 ; t22 t25 |
| ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a |
| ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28 |
| ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27 |
| ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a |
| vbroadcasti32x4 m18, [o(deint_shuf)] |
| vpbroadcastd m8, [o(pw_m2896_2896)] |
| vpbroadcastd m9, [o(pw_2896_2896)] |
| psubsw m15, m20, m14 ; t23 t24 |
| paddsw m20, m14 ; t16 t31 |
| psubsw m14, m11, m16 ; t22a t25a |
| paddsw m11, m16 ; t17a t30a |
| psubsw m16, m21, m17 ; t21 t26 |
| paddsw m21, m17 ; t18 t29 |
| psubsw m17, m19, m12 ; t20a t27a |
| paddsw m19, m12 ; t19a t28a |
| REPX {pshufb x, m18}, m20, m11, m21, m19 |
| ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a |
| ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 |
| packssdw m18, m13 ; t23a t22 |
| packssdw m12, m15 ; t24a t25 |
| ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a |
| ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 |
| packssdw m16, m13 ; t20 t21a |
| packssdw m14, m15 ; t27 t26a |
| punpcklqdq m13, m19, m21 ; t19a t18 |
| punpckhqdq m19, m21 ; t28a t29 |
| punpcklqdq m21, m20, m11 ; t16 t17a |
| punpckhqdq m20, m11 ; t31 t30a |
| INIT_ZMM avx512icl |
| mova m15, [o(permA)] |
| ret |
| cglobal_label .main_end |
| vpbroadcastd m10, [o(pw_2048)] |
| vpermt2q m0, m15, m1 ; t0 t1 t2 t3 |
| vpermt2q m20, m15, m19 ; t31 t30a t29 t28a |
| vpermt2q m2, m15, m3 ; t4 t5 t6 t7 |
| vpermt2q m14, m15, m12 ; t27 t26a t25 t24a |
| vpermt2q m4, m15, m5 ; t8 t9 t10 t11 |
| vpermt2q m18, m15, m16 ; t23a t22 t21a t20 |
| vpermt2q m6, m15, m7 ; t12 t13 t14 t15 |
| vpermt2q m13, m15, m21 ; t19a t18 t17a t16 |
| psubsw m7, m0, m20 ; out31 out30 out29 out28 |
| paddsw m0, m20 ; out0 out1 out2 out3 |
| psubsw m5, m2, m14 ; out27 out26 out25 out24 |
| paddsw m2, m14 ; out4 out5 out6 out7 |
| psubsw m3, m4, m18 ; out23 out22 out21 out20 |
| paddsw m4, m18 ; out8 out9 out10 out11 |
| psubsw m1, m6, m13 ; out19 out18 out17 out16 |
| paddsw m6, m13 ; out12 out13 out14 out15 |
| vzeroupper |
| ret |
| |
| %macro LOAD_PACKED_16X2 3 ; dst, row[1-2] |
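; load two 16-byte rows into one ymm: row %2 in the low 128 bits, row %3 in
; the high 128 bits (shufpd 0x0c keeps the low qwords of the first broadcast
; and the high qwords of the second)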
| vbroadcasti32x4 ym%1, [cq+16*%2] |
| vbroadcasti32x4 ym8, [cq+16*%3] |
| shufpd ym%1, ym8, 0x0c |
| %endmacro |
| |
| cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob |
| %undef cmp |
| test eobd, eobd |
| jz .dconly |
| lea r5, [o_base] |
| LOAD_PACKED_16X2 0, 0, 2 ; in0 in2 |
| LOAD_PACKED_16X2 1, 4, 6 ; in4 in6 |
| LOAD_PACKED_16X2 2, 8, 10 ; in8 in10 |
| LOAD_PACKED_16X2 3, 12, 14 ; in12 in14 |
| LOAD_PACKED_16X2 14, 1, 3 ; in1 in3 |
| LOAD_PACKED_16X2 15, 5, 7 ; in5 in7 |
| LOAD_PACKED_16X2 16, 9, 11 ; in9 in11 |
| LOAD_PACKED_16X2 17, 13, 15 ; in13 in15 |
| pxor m4, m4 |
| REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 |
| cmp eobd, 107 |
| jb .fast |
| LOAD_PACKED_16X2 4, 16, 18 ; in16 in18 |
| LOAD_PACKED_16X2 5, 20, 22 ; in20 in22 |
| LOAD_PACKED_16X2 6, 24, 26 ; in24 in26 |
| LOAD_PACKED_16X2 7, 28, 30 ; in28 in30 |
| call m(idct_8x16_internal_8bpc).main |
| LOAD_PACKED_16X2 18, 19, 17 ; in19 in17 |
| LOAD_PACKED_16X2 19, 23, 21 ; in23 in21 |
| LOAD_PACKED_16X2 20, 27, 25 ; in27 in25 |
| LOAD_PACKED_16X2 21, 31, 29 ; in31 in29 |
| pxor m8, m8 |
| REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 |
| call m(inv_txfm_add_dct_dct_8x32_8bpc).main |
| jmp .pass2 |
| .fast: ; bottom half is zero |
| mova ym5, ym4 |
| mova ym6, ym4 |
| mova ym7, ym4 |
| call m(idct_8x16_internal_8bpc).main |
| call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast |
| .pass2: |
| vpbroadcastd m10, [o(pw_8192)] |
| vpermt2q m0, m15, m4 ; t0 t1 t9 t8 |
vpermt2q m20, m15, m18 ; t31 t30a t22 t23a
| vpermt2q m3, m15, m7 ; t7 t6 t14 t15 |
vpermt2q m12, m15, m21 ; t24a t25 t17a t16
| vpermt2q m2, m15, m6 ; t4 t5 t13 t12 |
vpermt2q m14, m15, m13 ; t27 t26a t18 t19a
| vpermt2q m1, m15, m5 ; t3 t2 t10 t11 |
vpermt2q m19, m15, m16 ; t28a t29 t21a t20
| psubsw m8, m0, m20 ; out31 out30 out22 out23 |
| paddsw m0, m20 ; out0 out1 out9 out8 |
| paddsw m6, m3, m12 ; out7 out6 out14 out15 |
| psubsw m3, m12 ; out24 out25 out17 out16 |
| psubsw m5, m2, m14 ; out27 out26 out18 out19 |
| paddsw m4, m2, m14 ; out4 out5 out13 out12 |
| psubsw m7, m1, m19 ; out28 out29 out21 out20 |
| paddsw m2, m1, m19 ; out3 out2 out10 out11 |
| vzeroupper |
| vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 |
| vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 |
| vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27 |
| vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 |
| vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 |
| vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 |
| vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 |
| vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 |
| REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 |
| call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 |
| call .main |
| vpbroadcastd m8, [o(pw_2048)] |
| REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 |
| lea r2, [strideq*3] |
| lea r3, [dstq+strideq*4] |
| movshdup m12, [o(permD)] |
| pmovzxbw m8, [dstq+strideq*0] |
| pmovzxbw m9, [dstq+strideq*1] |
| pmovzxbw m10, [dstq+strideq*2] |
| pmovzxbw m11, [dstq+r2 ] |
| paddw m0, m8 |
| paddw m1, m9 |
| paddw m2, m10 |
| paddw m3, m11 |
| pmovzxbw m8, [r3+strideq*0] |
| pmovzxbw m9, [r3+strideq*1] |
| pmovzxbw m10, [r3+strideq*2] |
| pmovzxbw m11, [r3+r2 ] |
| paddw m4, m8 |
| paddw m5, m9 |
| paddw m6, m10 |
| paddw m7, m11 |
| packuswb m0, m1 |
| packuswb m2, m3 |
| vpermq m0, m12, m0 |
| vpermq m2, m12, m2 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| mova [dstq+strideq*2], ym2 |
| vextracti32x8 [dstq+r2 ], m2, 1 |
| packuswb m4, m5 |
| packuswb m6, m7 |
| vpermq m4, m12, m4 |
| vpermq m6, m12, m6 |
| mova [r3+strideq*0], ym4 |
| vextracti32x8 [r3+strideq*1], m4, 1 |
| mova [r3+strideq*2], ym6 |
| vextracti32x8 [r3+r2 ], m6, 1 |
| RET |
| .dconly: |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 8 |
| .dconly2: |
| imul r6d, 181 |
| add r6d, 128+512 |
| sar r6d, 8+2 |
| .dconly3: |
| imul r6d, 181 |
| add r6d, 128+2048 |
| sar r6d, 8+4 |
| pxor m2, m2 |
| vpbroadcastw m3, r6d |
| .dconly_loop: |
| mova ym1, [dstq+strideq*0] |
| vinserti32x8 m1, [dstq+strideq*1], 1 |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| paddw m0, m3 |
| paddw m1, m3 |
| packuswb m0, m1 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| lea dstq, [dstq+strideq*2] |
| sub r3d, 2 |
| jg .dconly_loop |
| RET |
| ALIGN function_align |
| cglobal_label .main |
| vpbroadcastd m10, [o(pd_2048)] |
| .main2: |
| ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a |
| ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a |
| ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3 |
| vpbroadcastd m11, [o(pw_2896_2896)] |
| vpbroadcastd m12, [o(pw_m2896_2896)] |
| ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0 |
| .main3: |
| paddsw m8, m1, m5 ; t4 |
| psubsw m1, m5 ; t5a |
| paddsw m9, m7, m3 ; t7 |
| psubsw m7, m3 ; t6a |
| ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6 |
| psubsw m5, m0, m2 ; dct4 out2 |
| paddsw m2, m0 ; dct4 out1 |
| paddsw m0, m4, m6 ; dct4 out0 |
| psubsw m4, m6 ; dct4 out3 |
| psubsw m6, m2, m1 ; out6 |
| paddsw m1, m2 ; out1 |
| paddsw m2, m5, m7 ; out2 |
| psubsw m5, m7 ; out5 |
| psubsw m7, m0, m9 ; out7 |
| paddsw m0, m9 ; out0 |
| paddsw m3, m4, m8 ; out3 |
| psubsw m4, m8 ; out4 |
| ret |
| |
| cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c |
| vpbroadcastd m7, [pw_5] |
| paddsw m0, m7, [cq+64*0] |
| paddsw m1, m7, [cq+64*1] |
| vpbroadcastd ym9, strided |
| paddsw m2, m7, [cq+64*2] |
| paddsw m3, m7, [cq+64*3] |
| paddsw m4, m7, [cq+64*4] |
| paddsw m5, m7, [cq+64*5] |
| paddsw m6, m7, [cq+64*6] |
| paddsw m7, [cq+64*7] |
| pmulld ym14, ym9, [pd_0to15] |
| lea r3, [dstq+strideq*1] |
| lea r4, [dstq+strideq*2] |
| kxnorb k1, k1, k1 |
| pxor m13, m13 |
| add r1, r4 ; dstq+strideq*3 |
| kmovb k2, k1 |
| vpgatherdq m9{k1}, [r0+ym14*4] |
| kmovb k1, k2 |
| vpgatherdq m10{k2}, [r3+ym14*4] |
| kmovb k2, k1 |
| call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 |
| REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 |
| vpgatherdq m11{k1}, [r4+ym14*4] |
| kmovb k1, k2 |
| vpgatherdq m12{k2}, [r1+ym14*4] |
| REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 |
| punpcklbw m8, m9, m13 ; 0 8 16 24 |
| punpckhbw m9, m13 ; 4 12 20 28 |
| paddw m0, m8 |
| paddw m4, m9 |
| packuswb m0, m4 |
| kmovb k2, k1 |
| vpscatterdq [r0+ym14*4]{k1}, m0 |
| punpcklbw m8, m10, m13 ; 1 9 17 25 |
| punpckhbw m10, m13 ; 5 13 21 29 |
| paddw m1, m8 |
| paddw m5, m10 |
| packuswb m1, m5 |
| kmovb k1, k2 |
| vpscatterdq [r3+ym14*4]{k2}, m1 |
| punpcklbw m8, m11, m13 ; 2 10 18 26 |
| punpckhbw m11, m13 ; 6 14 22 30 |
| paddw m2, m8 |
| paddw m6, m11 |
| packuswb m2, m6 |
| kmovb k2, k1 |
| vpscatterdq [r4+ym14*4]{k1}, m2 |
| punpcklbw m8, m12, m13 ; 3 11 19 27 |
| punpckhbw m12, m13 ; 7 15 23 31 |
| paddw m3, m8 |
| paddw m7, m12 |
| packuswb m3, m7 |
| vpscatterdq [r1+ym14*4]{k2}, m3 |
| RET |
| |
| cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c |
| vpbroadcastd m0, [pw_4096] |
| pmulhrsw m3, m0, [cq+64*0] |
| pmulhrsw m4, m0, [cq+64*4] |
| pmulhrsw m6, m0, [cq+64*1] |
| pmulhrsw m5, m0, [cq+64*5] |
| pmulhrsw m7, m0, [cq+64*2] |
| pmulhrsw m2, m0, [cq+64*6] |
| pmulhrsw m8, m0, [cq+64*3] |
| pmulhrsw m0, [cq+64*7] |
| mova m13, [int8_permA] |
| lea r3, [strideq*3] |
| lea r4, [dstq+strideq*4] |
| punpckldq m1, m3, m4 |
| punpckhdq m3, m4 |
| punpckldq m4, m6, m5 |
| punpckhdq m6, m5 |
| punpckldq m5, m7, m2 |
| punpckhdq m7, m2 |
| punpckldq m2, m8, m0 |
| punpckhdq m8, m0 |
| mova ym9, [dstq+strideq*0] |
| vinserti32x8 m9, [dstq+strideq*2], 1 |
| mova ym10, [dstq+strideq*1] |
| vinserti32x8 m10, [dstq+r3 ], 1 |
| mova ym11, [r4+strideq*0] |
| vinserti32x8 m11, [r4+strideq*2], 1 |
| mova ym12, [r4+strideq*1] |
| vinserti32x8 m12, [r4+r3 ], 1 |
| REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8 |
| pxor m13, m13 |
| REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 |
| punpcklqdq m0, m1, m4 ; a0 a2 c0 c2 |
| punpckhqdq m1, m4 ; b0 b2 d0 d2 |
| punpcklqdq m4, m5, m2 ; a1 a3 c1 c3 |
| punpckhqdq m5, m2 ; b1 b3 d1 d3 |
| punpcklqdq m2, m3, m6 ; e0 e2 g0 g2 |
| punpckhqdq m3, m6 ; f0 f2 h0 h2 |
| punpcklqdq m6, m7, m8 ; e1 e3 g1 g3 |
| punpckhqdq m7, m8 ; f1 f3 h1 h3 |
| punpcklbw m8, m9, m13 |
| punpckhbw m9, m13 |
| paddw m0, m8 |
| paddw m4, m9 |
| packuswb m0, m4 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*2], m0, 1 |
| punpcklbw m8, m10, m13 |
| punpckhbw m10, m13 |
| paddw m1, m8 |
| paddw m5, m10 |
| packuswb m1, m5 |
| mova [dstq+strideq*1], ym1 |
| vextracti32x8 [dstq+r3 ], m1, 1 |
| punpcklbw m8, m11, m13 |
| punpckhbw m11, m13 |
| paddw m2, m8 |
| paddw m6, m11 |
| packuswb m2, m6 |
| mova [r4+strideq*0], ym2 |
| vextracti32x8 [r4+strideq*2], m2, 1 |
| punpcklbw m8, m12, m13 |
| punpckhbw m12, m13 |
| paddw m3, m8 |
| paddw m7, m12 |
| packuswb m3, m7 |
| mova [r4+strideq*1], ym3 |
| vextracti32x8 [r4+r3 ], m3, 1 |
| RET |
| |
| %macro IDCT_16x32_END 3 ; src[1-2], row |
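; scale two coefficient registers by pw_2048 and add them to four rows of
; dst pixels (vpermb zero-extends the bytes to words by pulling zeros from
; the unused upper half of m8/m9), clear the matching rows of cq, then
; pack and store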
| mova xm8, [dstq+strideq*0] |
| vinserti32x4 ym8, [dstq+strideq*1], 1 |
| mova xm9, [dstq+r3 ] |
| vinserti32x4 ym9, [dstq+strideq*2], 1 |
| pmulhrsw m%1, m10 |
| pmulhrsw m%2, m10 |
| vpermb m8, m11, m8 |
| vpermb m9, m11, m9 |
| mova [cq+64*(%3*2+0)], m13 |
| mova [cq+64*(%3*2+1)], m13 |
| paddw m8, m%1 |
| paddw m9, m%2 |
| packuswb m8, m9 |
| vpermd m8, m12, m8 |
| mova [dstq+strideq*0], xm8 |
| vextracti32x4 [dstq+strideq*1], ym8, 1 |
| vextracti32x4 [dstq+strideq*2], m8, 2 |
| vextracti32x4 [dstq+r3 ], m8, 3 |
| %if %1 != 20 |
| lea dstq, [dstq+strideq*4] |
| %endif |
| %endmacro |
| |
| cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob |
| %undef cmp |
| lea r5, [o_base] |
| test eobd, eobd |
| jz .dconly |
| vpbroadcastd m15, [o(pw_2896x8)] |
| cmp eobd, 151 |
| jb .fast |
| pmulhrsw m5, m15, [cq+64*10] |
| pmulhrsw m3, m15, [cq+64* 6] |
| pmulhrsw m1, m15, [cq+64* 2] |
| pmulhrsw m7, m15, [cq+64*14] |
| pmulhrsw m2, m15, [cq+64* 4] |
| pmulhrsw m6, m15, [cq+64*12] |
| pmulhrsw m0, m15, [cq+64* 0] |
| pmulhrsw m4, m15, [cq+64* 8] |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main |
| pmulhrsw m14, m15, [cq+64* 1] |
| pmulhrsw m21, m15, [cq+64*15] |
| pmulhrsw m18, m15, [cq+64* 9] |
| pmulhrsw m17, m15, [cq+64* 7] |
| pmulhrsw m16, m15, [cq+64* 5] |
| pmulhrsw m19, m15, [cq+64*11] |
| pmulhrsw m20, m15, [cq+64*13] |
| pmulhrsw m15, [cq+64* 3] |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf |
| mova m8, [o(idct_16x32p)] |
| vpbroadcastd m9, [o(pw_16384)] |
| REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ |
| m14, m15, m16, m17, m18, m19, m20, m21 |
| punpckldq m8, m0, m1 |
| punpckhdq m0, m1 |
| punpckldq m1, m2, m3 |
| punpckhdq m2, m3 |
| REPX {pmulhrsw x, m9}, m8, m0, m1, m2 |
| punpckldq m3, m4, m5 |
| punpckhdq m4, m5 |
| punpckldq m5, m6, m7 |
| punpckhdq m6, m7 |
| REPX {pmulhrsw x, m9}, m3, m4, m5, m6 |
| punpckldq m7, m14, m15 |
| punpckhdq m14, m15 |
| punpckldq m15, m16, m17 |
| punpckhdq m16, m17 |
| REPX {pmulhrsw x, m9}, m7, m14, m15, m16 |
| punpckldq m17, m18, m19 |
| punpckhdq m18, m19 |
| punpckldq m19, m20, m21 |
| punpckhdq m20, m21 |
| REPX {pmulhrsw x, m9}, m17, m18, m19, m20 |
| punpcklqdq m21, m8, m1 |
| punpckhqdq m8, m1 |
| punpcklqdq m1, m0, m2 |
| punpckhqdq m0, m2 |
| punpcklqdq m2, m3, m5 |
| punpckhqdq m3, m5 |
| punpcklqdq m5, m4, m6 |
| punpckhqdq m4, m6 |
| punpcklqdq m6, m7, m15 |
| punpckhqdq m7, m15 |
| punpcklqdq m15, m14, m16 |
| punpckhqdq m14, m16 |
| punpcklqdq m16, m17, m19 |
| punpckhqdq m17, m19 |
| punpcklqdq m19, m18, m20 |
| punpckhqdq m18, m20 |
| vinserti32x8 m20, m21, ym2, 1 |
| vshufi32x4 m21, m2, q3232 |
| vinserti32x8 m2, m8, ym3, 1 |
| vshufi32x4 m8, m3, q3232 |
| vinserti32x8 m3, m1, ym5, 1 |
| vshufi32x4 m1, m5, q3232 |
| vinserti32x8 m5, m0, ym4, 1 |
| vshufi32x4 m0, m4, q3232 |
| vinserti32x8 m4, m6, ym16, 1 |
| vshufi32x4 m6, m16, q3232 |
| vinserti32x8 m16, m7, ym17, 1 |
| vshufi32x4 m7, m17, q3232 |
| vinserti32x8 m17, m15, ym19, 1 |
| vshufi32x4 m15, m19, q3232 |
| vinserti32x8 m19, m14, ym18, 1 |
| vshufi32x4 m14, m18, q3232 |
| vshufi32x4 m18, m21, m6, q3131 ; 27 5 |
| vshufi32x4 m21, m6, q2020 ; 31 1 |
| vshufi32x4 m6, m8, m7, q2020 ; 24 8 |
| vshufi32x4 m8, m7, q3131 ; 30 2 |
| vshufi32x4 m7, m1, m15, q2020 ; 28 4 |
| vshufi32x4 m1, m15, q3131 ; 6 26 |
| vshufi32x4 m15, m0, m14, q2020 ; 7 25 |
| vshufi32x4 m0, m14, q3131 ; 14 18 |
| vshufi32x4 m14, m20, m4, q2020 ; 3 29 |
| vshufi32x4 m20, m4, q3131 ; 23 9 |
| vshufi32x4 m9, m3, m17, q2020 ; 16 0 |
| vshufi32x4 m3, m17, q3131 ; 12 20 |
| vshufi32x4 m17, m5, m19, q2020 ; 15 17 |
| vshufi32x4 m5, m19, q3131 ; 22 10 |
| vshufi32x4 m19, m2, m16, q2020 ; 19 13 |
| vshufi32x4 m16, m2, m16, q3131 ; 11 21 |
| call m(idct_16x16_internal_8bpc).main3 |
| call .main_oddhalf |
| jmp .pass2 |
| .fast: ; right half is zero |
| mova ym8, [cq+64*15] |
| vinserti32x8 m8, [cq+64* 1], 1 |
| mova m2, [o(int16_perm)] |
| mova ym9, [cq+64* 8] |
| vinserti32x8 m9, [cq+64* 0], 1 |
| mova ym0, [cq+64* 7] |
| vinserti32x8 m0, [cq+64* 9], 1 |
| mova ym7, [cq+64*14] |
| vinserti32x8 m7, [cq+64* 2], 1 |
| mova ym1, [cq+64* 3] |
| vinserti32x8 m1, [cq+64*13], 1 |
| mova ym3, [cq+64* 6] |
| vinserti32x8 m3, [cq+64*10], 1 |
| mova ym5, [cq+64*11] |
| vinserti32x8 m5, [cq+64* 5], 1 |
| mova ym6, [cq+64*12] |
| vinserti32x8 m6, [cq+64* 4], 1 |
| REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 |
| REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 |
| call m(idct_16x16_internal_8bpc).main2 |
| vbroadcasti32x4 m8, [o(int_shuf3)] |
| vbroadcasti32x4 m9, [o(int_shuf4)] |
| vpbroadcastd m11, [o(pw_16384)] |
| pshufb m0, m8 |
| pshufb m1, m9 |
| pshufb m2, m8 |
| pshufb m3, m9 |
| REPX {pmulhrsw x, m11}, m0, m1, m2, m3 |
| pshufb m4, m8 |
| pshufb m5, m9 |
| pshufb m6, m8 |
| pshufb m7, m9 |
| REPX {pmulhrsw x, m11}, m4, m5, m6, m7 |
| punpckhdq m17, m0, m1 |
| punpckldq m0, m1 |
| punpckhdq m16, m2, m3 |
| punpckldq m2, m3 |
| punpckhdq m18, m4, m5 |
| punpckldq m4, m5 |
| punpckhdq m5, m6, m7 |
| punpckldq m6, m7 |
| vinserti32x8 m1, m0, ym2, 1 |
| vshufi32x4 m3, m0, m2, q3232 |
| vinserti32x8 m2, m4, ym6, 1 |
| vshufi32x4 m4, m6, q3232 |
| vinserti32x8 m15, m17, ym16, 1 |
| vshufi32x4 m17, m16, q3232 |
| vinserti32x8 m16, m18, ym5, 1 |
| vshufi32x4 m18, m5, q3232 |
| vshufi32x4 m0, m1, m2, q2020 ; 0 2 |
| vshufi32x4 m1, m2, q3131 ; 4 6 |
| vshufi32x4 m2, m3, m4, q2020 ; 8 10 |
| vshufi32x4 m3, m4, q3131 ; 12 14 |
| vshufi32x4 m14, m15, m16, q2020 ; 1 3 |
| vshufi32x4 m15, m16, q3131 ; 5 7 |
| vshufi32x4 m16, m17, m18, q2020 ; 9 11 |
| vshufi32x4 m17, m18, q3131 ; 13 15 |
| pxor m6, m6 |
| punpckhwd m8, m0, m0 |
| punpcklwd m9, m6, m0 |
| punpckhwd m0, m3, m3 |
| punpckhwd m5, m2, m2 |
| punpcklwd m7, m1, m1 |
| punpckhwd m1, m1 |
| punpcklwd m3, m3 |
| punpcklwd m6, m2 |
| call m(idct_16x16_internal_8bpc).main_fast5 |
| punpcklwd m21, m14, m14 |
| punpckhwd m14, m14 |
| punpcklwd m18, m15, m15 |
| punpckhwd m15, m15 |
| punpcklwd m20, m16, m16 |
| punpckhwd m16, m16 |
| punpcklwd m19, m17, m17 |
| punpckhwd m17, m17 |
| call .main_oddhalf_fast |
| .pass2: |
| vpbroadcastd m10, [o(pw_2048)] |
| mova m11, [o(end_16x32p)] |
| lea r3, [strideq*3] |
| pxor m13, m13 |
| psrld m12, m11, 8 |
| IDCT_16x32_END 0, 1, 0 |
| IDCT_16x32_END 2, 3, 1 |
| IDCT_16x32_END 4, 5, 2 |
| IDCT_16x32_END 6, 7, 3 |
| IDCT_16x32_END 14, 15, 4 |
| IDCT_16x32_END 16, 17, 5 |
| IDCT_16x32_END 18, 19, 6 |
| IDCT_16x32_END 20, 21, 7 |
| RET |
| ALIGN function_align |
| .dconly: |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 32 |
| jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly |
| ALIGN function_align |
| cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero |
| vpbroadcastd m8, [o(pw_201_4091x8)] |
| vpbroadcastd m20, [o(pw_m1380_3857x8)] |
| vpbroadcastd m9, [o(pw_995_3973x8)] |
| vpbroadcastd m16, [o(pw_m601_4052x8)] |
| pmulhrsw m21, m8 ; t16a, t31a |
| pmulhrsw m20, m15 ; t19a, t28a |
| pmulhrsw m18, m9 ; t20a, t27a |
| pmulhrsw m14, m16 ; t23a, t24a |
| mova m8, m21 |
| mova m17, m20 |
| mova m15, m18 |
| mova m16, m14 |
| jmp .main3 |
| ALIGN function_align |
| cglobal_label .main_oddhalf_fast ; bottom half is zero |
| vpbroadcastd m8, [o(pw_201_4091x8)] |
| vpbroadcastd m9, [o(pw_m2751_3035x8)] |
| vpbroadcastd m11, [o(pw_1751_3703x8)] |
| vpbroadcastd m12, [o(pw_m1380_3857x8)] |
| pmulhrsw m21, m8 ; t16a, t31a |
| vpbroadcastd m8, [o(pw_995_3973x8)] |
| pmulhrsw m17, m9 ; t17a, t30a |
| vpbroadcastd m9, [o(pw_m2106_3513x8)] |
| pmulhrsw m20, m11 ; t18a, t29a |
| vpbroadcastd m11, [o(pw_2440_3290x8)] |
| pmulhrsw m15, m12 ; t19a, t28a |
| vpbroadcastd m12, [o(pw_m601_4052x8)] |
| pmulhrsw m18, m8 ; t20a, t27a |
| pmulhrsw m16, m9 ; t21a, t26a |
| pmulhrsw m19, m11 ; t22a, t25a |
| pmulhrsw m14, m12 ; t23a, t24a |
| jmp .main2 |
| ALIGN function_align |
| cglobal_label .main_oddhalf |
| ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a |
| ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a |
| ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a |
| ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a |
| ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a |
| ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a |
| ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a |
| ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a |
| .main2: |
| psubsw m8, m21, m17 ; t17 t30 |
| paddsw m21, m17 ; t16 t31 |
| psubsw m17, m15, m20 ; t18 t29 |
| paddsw m20, m15 ; t19 t28 |
| psubsw m15, m18, m16 ; t21 t26 |
| paddsw m18, m16 ; t20 t27 |
| psubsw m16, m14, m19 ; t22 t25 |
| paddsw m14, m19 ; t23 t24 |
| .main3: |
| ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a |
| ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a |
| ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a |
| ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a |
| vpbroadcastd m11, [o(pw_m3784_1567)] |
| psubsw m19, m21, m20 ; t19a t28a |
| paddsw m21, m20 ; t16a t31a |
| psubsw m20, m14, m18 ; t20a t27a |
| paddsw m14, m18 ; t23a t24a |
| psubsw m18, m8, m17 ; t18 t29 |
| paddsw m8, m17 ; t17 t30 |
| psubsw m17, m16, m15 ; t21 t26 |
| paddsw m15, m16 ; t22 t25 |
| ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a |
| ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28 |
| ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27 |
| ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a |
| vbroadcasti32x4 m9, [o(deint_shuf)] |
| psubsw m16, m21, m14 ; t23 t24 |
| paddsw m14, m21 ; t16 t31 |
| psubsw m21, m8, m15 ; t22a t25a |
| paddsw m15, m8 ; t17a t30a |
| psubsw m8, m18, m17 ; t21 t26 |
| paddsw m18, m17 ; t18 t29 |
| paddsw m17, m19, m20 ; t19a t28a |
| psubsw m19, m20 ; t20a t27a |
| vpbroadcastd m11, [o(pw_m2896_2896)] |
| vpbroadcastd m12, [o(pw_2896_2896)] |
| REPX {pshufb x, m9}, m14, m15, m18, m17 |
| mova m9, m10 |
| vpdpwssd m9, m16, m11 |
| mova m20, m10 |
| vpdpwssd m20, m21, m11 |
| psrad m9, 12 |
| psrad m20, 12 |
| packssdw m9, m20 ; t23a t22 |
| mova m20, m10 |
| vpdpwssd m20, m16, m12 |
| mova m16, m10 |
| vpdpwssd m16, m21, m12 |
| psrad m20, 12 |
| psrad m16, 12 |
| packssdw m16, m20, m16 ; t24a t25 |
| ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a |
| ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27 |
| packssdw m11, m20 ; t27 t26a |
| packssdw m8, m21 ; t20 t21a |
| punpcklqdq m20, m14, m15 ; t16 t17a |
| punpckhqdq m14, m15 ; t31 t30a |
| punpckhqdq m15, m17, m18 ; t28a t29 |
| punpcklqdq m17, m18 ; t19a t18 |
| psubsw m21, m0, m14 ; out31 out30 |
| paddsw m0, m14 ; out0 out1 |
| psubsw m14, m7, m20 ; out16 out17 |
| paddsw m7, m20 ; out15 out14 |
| psubsw m20, m1, m15 ; out28 out29 |
| paddsw m1, m15 ; out3 out2 |
| psubsw m15, m6, m17 ; out19 out18 |
| paddsw m6, m17 ; out12 out13 |
| psubsw m17, m4, m9 ; out23 out22 |
| paddsw m4, m9 ; out8 out9 |
| psubsw m18, m3, m16 ; out24 out25 |
| paddsw m3, m16 ; out7 out6 |
| psubsw m16, m5, m8 ; out20 out21 |
| paddsw m5, m8 ; out11 out10 |
| psubsw m19, m2, m11 ; out27 out26 |
| paddsw m2, m11 ; out4 out5 |
| ret |
| |
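| ; 32x16 inverse DCT. pass 1 runs the 32-point transform with two inputs |
| ; packed per register (the "0 1" .. "14 15" annotations); pw_2896x8 |
| ; prescales for the rectangular (2:1) size, and eob < 151 implies the |
| ; bottom half of the inputs is zero so the .fast path can be used |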
| cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob |
| %undef cmp |
| lea r5, [o_base] |
| test eobd, eobd |
| jz .dconly |
| mova m21, [o(permB)] |
| vpermq m1, m21, [cq+64* 0] ; 0 1 |
| vpermq m14, m21, [cq+64* 1] ; 2 3 |
| vpermq m20, m21, [cq+64* 2] ; 4 5 |
| vpermq m15, m21, [cq+64* 3] ; 6 7 |
| vpbroadcastd m8, [o(pw_2896x8)] |
| vpermq m2, m21, [cq+64* 4] ; 8 9 |
| vpermq m16, m21, [cq+64* 5] ; 10 11 |
| vpermq m3, m21, [cq+64* 6] ; 12 13 |
| vpermq m17, m21, [cq+64* 7] ; 14 15 |
| REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17 |
| pxor m12, m12 |
| REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 |
| cmp eobd, 151 |
| jb .fast |
| vpermq m9, m21, [cq+64* 8] ; 16 17 |
| vpermq m19, m21, [cq+64* 9] ; 18 19 |
| vpermq m4, m21, [cq+64*10] ; 20 21 |
| vpermq m5, m21, [cq+64*11] ; 22 23 |
| vpermq m6, m21, [cq+64*12] ; 24 25 |
| vpermq m18, m21, [cq+64*13] ; 26 27 |
| vpermq m7, m21, [cq+64*14] ; 28 29 |
| vpermq m21, m21, [cq+64*15] ; 30 31 |
| REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21 |
| REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15 |
| punpcklwd m8, m21, m14 ; 30 2 |
| punpckhwd m21, m1 ; 31 1 |
| punpcklwd m0, m17, m19 ; 14 18 |
| punpckhwd m17, m9 ; 15 17 |
| punpcklwd m9, m1 ; 16 0 |
| punpckhwd m14, m7 ; 3 29 |
| punpcklwd m1, m15, m18 ; 6 26 |
| punpckhwd m15, m6 ; 7 25 |
| punpcklwd m6, m2 ; 24 8 |
| punpckhwd m19, m3 ; 19 13 |
| punpcklwd m3, m4 ; 12 20 |
| punpckhwd m18, m20 ; 27 5 |
| punpcklwd m7, m20 ; 28 4 |
| punpckhwd m20, m5, m2 ; 23 9 |
| punpcklwd m5, m16 ; 22 10 |
| punpckhwd m16, m4 ; 11 21 |
| call m(idct_16x16_internal_8bpc).main2 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf |
| jmp .pass2 |
| .fast: ; bottom half zero |
| punpcklwd m8, m14, m14 ; 2 |
| punpcklwd m0, m17, m17 ; 14 |
| punpcklwd m5, m16, m16 ; 10 |
| punpcklwd m9, m12, m1 ; __ 0 |
| punpckhwd m21, m1, m1 ; 1 |
| punpcklwd m1, m15, m15 ; 6 |
| punpcklwd m7, m20, m20 ; 4 |
| punpckhwd m19, m3, m3 ; 13 |
| punpcklwd m3, m3 ; 12 |
| punpcklwd m6, m12, m2 ; __ 8 |
| punpckhwd m18, m20, m20 ; 5 |
| punpckhwd m20, m2, m2 ; 9 |
| call m(idct_16x16_internal_8bpc).main_fast |
| punpckhwd m15, m15 ; 7 |
| punpckhwd m14, m14 ; 3 |
| punpckhwd m16, m16 ; 11 |
| punpckhwd m17, m17 ; 15 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| .pass2: |
| vpbroadcastd m9, [o(pw_16384)] |
| call .transpose_round |
| vshufi32x4 m16, m14, m2, q3131 ; 5 |
| vshufi32x4 m14, m2, q2020 ; 1 |
| vshufi32x4 m2, m0, m3, q3131 ; 4 |
| vshufi32x4 m0, m3, q2020 ; 0 |
| vshufi32x4 m3, m1, m18, q3131 ; 6 |
| vshufi32x4 m1, m18, q2020 ; 2 |
| vshufi32x4 m18, m20, m6, q2020 ; 9 |
| vshufi32x4 m20, m6, q3131 ; 13 |
| vshufi32x4 m6, m21, m4, q3131 ; 12 |
| vshufi32x4 m4, m21, m4, q2020 ; 8 |
| vshufi32x4 m21, m19, m7, q3131 ; 15 |
| vshufi32x4 m19, m7, q2020 ; 11 |
| vshufi32x4 m7, m5, m15, q3131 ; 14 |
| vshufi32x4 m5, m15, q2020 ; 10 |
| vshufi32x4 m15, m17, m9, q2020 ; 3 |
| vshufi32x4 m17, m9, q3131 ; 7 |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 |
| call .main_oddhalf |
| vpbroadcastd m12, [o(pw_2048)] |
| movshdup m13, [o(permD)] |
| lea r2, [strideq*3] |
| pmovzxbw m8, [dstq+strideq*0] |
| pmovzxbw m9, [dstq+strideq*1] |
| pmovzxbw m10, [dstq+strideq*2] |
| pmovzxbw m11, [dstq+r2 ] |
| REPX {pmulhrsw x, m12}, m0, m1, m2, m3 |
| lea r3, [dstq+strideq*4] |
| paddw m0, m8 |
| paddw m1, m9 |
| paddw m2, m10 |
| paddw m3, m11 |
| pmovzxbw m8, [r3+strideq*0] |
| pmovzxbw m9, [r3+strideq*1] |
| pmovzxbw m10, [r3+strideq*2] |
| pmovzxbw m11, [r3+r2 ] |
| REPX {pmulhrsw x, m12}, m4, m5, m6, m7 |
| lea r4, [dstq+strideq*8] |
| packuswb m0, m1 |
| paddw m4, m8 |
| paddw m5, m9 |
| packuswb m2, m3 |
| paddw m6, m10 |
| paddw m7, m11 |
| pmovzxbw m8, [r4+strideq*0] |
| pmovzxbw m9, [r4+strideq*1] |
| pmovzxbw m10, [r4+strideq*2] |
| pmovzxbw m11, [r4+r2 ] |
| REPX {pmulhrsw x, m12}, m14, m15, m16, m17 |
| lea r5, [r3+strideq*8] |
| packuswb m4, m5 |
| paddw m14, m8 |
| paddw m15, m9 |
| packuswb m6, m7 |
| paddw m16, m10 |
| paddw m17, m11 |
| pmovzxbw m8, [r5+strideq*0] |
| pmovzxbw m9, [r5+strideq*1] |
| pmovzxbw m10, [r5+strideq*2] |
| pmovzxbw m11, [r5+r2 ] |
| REPX {pmulhrsw x, m12}, m18, m19, m20, m21 |
| packuswb m14, m15 |
| paddw m18, m8 |
| paddw m19, m9 |
| packuswb m16, m17 |
| paddw m20, m10 |
| paddw m21, m11 |
| packuswb m18, m19 |
| packuswb m20, m21 |
| REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20 |
| mova [dstq+strideq*0], ym0 |
| vextracti32x8 [dstq+strideq*1], m0, 1 |
| mova [dstq+strideq*2], ym2 |
| vextracti32x8 [dstq+r2 ], m2, 1 |
| mova [r3+strideq*0], ym4 |
| vextracti32x8 [r3+strideq*1], m4, 1 |
| mova [r3+strideq*2], ym6 |
| vextracti32x8 [r3+r2 ], m6, 1 |
| mova [r4+strideq*0], ym14 |
| vextracti32x8 [r4+strideq*1], m14, 1 |
| mova [r4+strideq*2], ym16 |
| vextracti32x8 [r4+r2 ], m16, 1 |
| mova [r5+strideq*0], ym18 |
| vextracti32x8 [r5+strideq*1], m18, 1 |
| mova [r5+strideq*2], ym20 |
| vextracti32x8 [r5+r2 ], m20, 1 |
| RET |
| ALIGN function_align |
| .dconly: |
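| ; eobd is zero on this path, so storing it clears the dc coefficient |
| ; and the OR below just loads the row count; the two rounded multiplies |
| ; by 181 (~2896/4096 = 1/sqrt(2)) net out to dc*181*181 >> 17 ~= dc/4 |
| ; before .dconly3 broadcasts the value and adds it to dst |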
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 16 |
| imul r6d, 181 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| add r6d, 128+256 |
| sar r6d, 8+1 |
| jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 |
| ALIGN function_align |
| cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero |
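| ; with only four nonzero input registers every rotation reduces to a |
| ; pmulhrsw by a pre-rounded constant, and the register copies below |
| ; stand in for the first butterfly stage whose other operand is zero |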
| vpbroadcastd m9, [o(pw_2896x8)] |
| vpbroadcastd m2, [o(pw_4017x8)] |
| vpbroadcastd m3, [o(pw_799x8)] |
| vpbroadcastd m18, [o(pw_4076x8)] |
| vpbroadcastd m19, [o(pw_401x8)] |
| vpbroadcastd m20, [o(pw_m1189x8)] |
| vpbroadcastd m16, [o(pw_3920x8)] |
| pmulhrsw m9, m0 ; t0 |
| pmulhrsw m2, m1 ; t7a |
| pmulhrsw m1, m3 ; t4a |
| pmulhrsw m18, m14 ; t15a |
| pmulhrsw m14, m19 ; t8a |
| pmulhrsw m20, m15 ; t11a |
| pmulhrsw m15, m16 ; t12a |
| psubsw m7, m9, m2 ; idct8 out7 |
| paddsw m0, m9, m2 ; idct8 out0 |
| psubsw m4, m9, m1 ; idct8 out4 |
| paddsw m3, m9, m1 ; idct8 out3 |
| ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6 |
| mova m21, m18 |
| mova m19, m14 |
| mova m16, m15 |
| mova m8, m20 |
| psubsw m6, m9, m1 ; idct8 out6 |
| paddsw m1, m9 ; idct8 out1 |
| psubsw m5, m9, m2 ; idct8 out5 |
| paddsw m2, m9 ; idct8 out2 |
| jmp .main3 |
| ALIGN function_align |
| cglobal_label .main_oddhalf_fast ; bottom half is zero |
| vpbroadcastd m5, [o(pw_m2276x8)] |
| vpbroadcastd m11, [o(pw_3406x8)] |
| vpbroadcastd m7, [o(pw_4017x8)] |
| vpbroadcastd m12, [o(pw_799x8)] |
| vpbroadcastd m6, [o(pw_3784x8)] |
| vpbroadcastd m10, [o(pw_1567x8)] |
| vpbroadcastd m4, [o(pw_2896x8)] |
| pmulhrsw m5, m3 ; t5a |
| pmulhrsw m3, m11 ; t6a |
| pmulhrsw m7, m1 ; t7a |
| pmulhrsw m1, m12 ; t4a |
| pmulhrsw m6, m2 ; t3 |
| pmulhrsw m2, m10 ; t2 |
| pmulhrsw m4, m0 ; t0 |
| vpbroadcastd m11, [o(pw_2896_2896)] |
| vpbroadcastd m12, [o(pw_m2896_2896)] |
| vpbroadcastd m10, [o(pd_2048)] |
| mova m0, m4 ; t1 |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main3 |
| vpbroadcastd m21, [o(pw_4076x8)] |
| vpbroadcastd m8, [o(pw_401x8)] |
| vpbroadcastd m18, [o(pw_m2598x8)] |
| vpbroadcastd m9, [o(pw_3166x8)] |
| vpbroadcastd m19, [o(pw_3612x8)] |
| vpbroadcastd m11, [o(pw_1931x8)] |
| vpbroadcastd m20, [o(pw_m1189x8)] |
| vpbroadcastd m12, [o(pw_3920x8)] |
| pmulhrsw m21, m14 ; t15a |
| pmulhrsw m14, m8 ; t8a |
| pmulhrsw m18, m17 ; t9a |
| pmulhrsw m17, m9 ; t14a |
| pmulhrsw m19, m16 ; t13a |
| pmulhrsw m16, m11 ; t10a |
| pmulhrsw m20, m15 ; t11a |
| pmulhrsw m15, m12 ; t12a |
| jmp .main2 |
| ALIGN function_align |
| cglobal_label .main_oddhalf |
| ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a |
| ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a |
| ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a |
| ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a |
| .main2: |
| paddsw m8, m20, m16 ; t11 |
| psubsw m20, m16 ; t10 |
| paddsw m16, m15, m19 ; t12 |
| psubsw m15, m19 ; t13 |
| psubsw m19, m14, m18 ; t9 |
| paddsw m14, m18 ; t8 |
| psubsw m18, m21, m17 ; t14 |
| paddsw m21, m17 ; t15 |
| .main3: |
| vpbroadcastd m11, [o(pw_1567_3784)] |
| vpbroadcastd m12, [o(pw_m3784_1567)] |
| ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a |
| vpbroadcastd m11, [o(pw_m1567_m3784)] |
| ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a |
| vpbroadcastd m11, [o(pw_2896_2896)] |
| vpbroadcastd m12, [o(pw_m2896_2896)] |
| psubsw m17, m14, m8 ; t11a |
| paddsw m8, m14 ; t8a |
| paddsw m14, m18, m15 ; t9 |
| psubsw m18, m15 ; t10 |
| psubsw m15, m19, m20 ; t13 |
| paddsw m19, m20 ; t14 |
| paddsw m20, m21, m16 ; t15a |
| psubsw m16, m21, m16 ; t12a |
| ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a |
| ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12 |
| psubsw m21, m0, m20 ; out15 |
| paddsw m0, m20 ; out0 |
| psubsw m20, m1, m19 ; out14 |
| paddsw m1, m19 ; out1 |
| psubsw m19, m2, m18 ; out13 |
| paddsw m2, m18 ; out2 |
| psubsw m18, m3, m17 ; out12 |
| paddsw m3, m17 ; out3 |
| psubsw m17, m4, m16 ; out11 |
| paddsw m4, m16 ; out4 |
| psubsw m16, m5, m15 ; out10 |
| paddsw m5, m15 ; out5 |
| psubsw m15, m6, m14 ; out9 |
| paddsw m6, m14 ; out6 |
| psubsw m14, m7, m8 ; out8 |
| paddsw m7, m8 ; out7 |
| ret |
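| ; interleave the 32 packed pass-1 rows into 128-bit-lane order for the |
| ; row/column flip, applying the rounding constant the caller loaded |
| ; into m9 (pw_16384 from .pass2 above, pw_8192 from the 32x32 fast path) |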
| .transpose_round: |
| punpcklwd m8, m0, m2 |
| punpckhwd m0, m2 |
| punpcklwd m2, m1, m3 |
| punpckhwd m1, m3 |
| punpcklwd m3, m4, m6 |
| punpckhwd m4, m6 |
| punpcklwd m6, m5, m7 |
| punpckhwd m5, m7 |
| punpcklwd m7, m14, m16 |
| punpckhwd m14, m16 |
| punpcklwd m16, m15, m17 |
| punpckhwd m15, m17 |
| punpcklwd m17, m19, m21 |
| punpckhwd m19, m21 |
| punpckhwd m21, m18, m20 |
| punpcklwd m18, m20 |
| punpcklwd m20, m8, m1 |
| punpckhwd m8, m1 |
| punpcklwd m1, m0, m2 |
| punpckhwd m0, m2 |
| punpcklwd m2, m3, m5 |
| punpckhwd m3, m5 |
| punpcklwd m5, m4, m6 |
| punpckhwd m4, m6 |
| REPX {pmulhrsw x, m9}, m20, m8, m1, m0 |
| punpcklwd m6, m7, m15 |
| punpckhwd m7, m15 |
| punpcklwd m15, m14, m16 |
| punpckhwd m14, m16 |
| REPX {pmulhrsw x, m9}, m2, m3, m5, m4 |
| punpckhwd m16, m18, m19 |
| punpcklwd m18, m19 |
| punpcklwd m19, m21, m17 |
| punpckhwd m21, m17 |
| REPX {pmulhrsw x, m9}, m6, m7, m15, m14 |
| punpcklwd m17, m8, m0 ; a2 a6 aa ae |
| punpckhwd m8, m0 ; a3 a7 ab af |
| punpcklwd m0, m20, m1 ; a0 a4 a8 ac |
| punpckhwd m20, m1 ; a1 a5 a9 ad |
| REPX {pmulhrsw x, m9}, m16, m18, m19, m21 |
| punpcklwd m1, m2, m5 ; b0 b4 b8 bc |
| punpckhwd m2, m5 ; b1 b5 b9 bd |
| punpcklwd m5, m3, m4 ; b2 b6 ba be |
| punpckhwd m3, m4 ; b3 b7 bb bf |
| punpcklwd m4, m6, m15 ; c0 c4 c8 cc |
| punpckhwd m6, m15 ; c1 c5 c9 cd |
| punpcklwd m15, m7, m14 ; c2 c6 ca ce |
| punpckhwd m7, m14 ; c3 c7 cb cf |
| punpcklwd m14, m18, m19 ; d0 d4 d8 dc |
| punpckhwd m18, m19 ; d1 d5 d9 dd |
| punpcklwd m9, m16, m21 ; d2 d6 da de |
| punpckhwd m16, m21 ; d3 d7 db df |
| vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc |
| vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 |
| vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 |
| vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be |
| vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 |
| vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf |
| vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 |
| vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc |
| vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 |
| vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd |
| vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 |
| vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd |
| vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 |
| vshufi32x4 m15, m9, q3232 ; ca ce da de |
| vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 |
| vshufi32x4 m7, m16, q3232 ; cb cf db df |
| ret |
| |
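| ; identity scaling for 16x32: y = x*2896/4096 (rect2 downscale), then |
| ; out = y + (y*1697/2048)/2 = y*sqrt(2), with pw_16384 providing the |
| ; rounded halving of the correction term |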
| %macro IDTX_16x32 4 ; src/dst[1-4] |
| pmulhrsw m%1, m15, [cq+64*%1] |
| pmulhrsw m%2, m15, [cq+64*%2] |
| pmulhrsw m%3, m15, [cq+64*%3] |
| pmulhrsw m%4, m15, [cq+64*%4] |
| pmulhrsw m18, m16, m%1 |
| pmulhrsw m19, m16, m%2 |
| pmulhrsw m20, m16, m%3 |
| pmulhrsw m21, m16, m%4 |
| REPX {pmulhrsw x, m17}, m18, m19, m20, m21 |
| paddsw m%1, m18 |
| paddsw m%2, m19 |
| paddsw m%3, m20 |
| paddsw m%4, m21 |
| %endmacro |
| |
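| ; add two packed results to dst rows n, n+8, n+16 and n+24 (r3 = |
| ; stride*2, r4 = stride*3), clearing the matching cq slots with m18, |
| ; which doubles as the zero register for the byte widening |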
| %macro IDTX_16x32_STORE 2 ; src[1-2] |
| mova xm17, [dstq+r3*0] |
| vinserti128 ym17, [dstq+r3*4], 1 |
| vinserti32x4 m17, [dstq+r3*8], 2 |
| vinserti32x4 m17, [dstq+r4*8], 3 |
| mova [cq+64*(%1*2+0)], m18 |
| mova [cq+64*(%1*2+1)], m18 |
| punpcklbw m16, m17, m18 |
| punpckhbw m17, m18 |
| paddw m16, m%1 |
| paddw m17, m%2 |
| packuswb m16, m17 |
| mova [dstq+r3*0], xm16 |
| vextracti128 [dstq+r3*4], ym16, 1 |
| vextracti32x4 [dstq+r3*8], m16, 2 |
| vextracti32x4 [dstq+r4*8], m16, 3 |
| %if %1 != 7 |
| add dstq, strideq |
| %endif |
| %endmacro |
| |
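| ; 16x32 identity-identity: pure scaling, no butterflies; rows only need |
| ; the shared 2x8x8 transpose, and cq is cleared as results are stored |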
| cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c |
| vpbroadcastd m15, [pw_2896x8] |
| vpbroadcastd m16, [pw_1697x16] |
| vpbroadcastd m17, [pw_16384] |
| IDTX_16x32 0, 1, 2, 3 |
| IDTX_16x32 4, 5, 6, 7 |
| IDTX_16x32 8, 9, 10, 11 |
| IDTX_16x32 12, 13, 14, 15 |
| vpbroadcastd m16, [pw_8192] |
| call .transpose_2x8x8_round |
| lea r3, [strideq*2] |
| lea r4, [strideq*3] |
| pxor m18, m18 |
| IDTX_16x32_STORE 0, 8 |
| IDTX_16x32_STORE 1, 9 |
| IDTX_16x32_STORE 2, 10 |
| IDTX_16x32_STORE 3, 11 |
| IDTX_16x32_STORE 4, 12 |
| IDTX_16x32_STORE 5, 13 |
| IDTX_16x32_STORE 6, 14 |
| IDTX_16x32_STORE 7, 15 |
| RET |
| ALIGN function_align |
| .transpose_2x8x8_round: |
| punpckhwd m17, m4, m5 |
| punpcklwd m4, m5 |
| punpckhwd m5, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m6, m7 |
| punpcklwd m6, m7 |
| punpckhwd m7, m2, m3 |
| punpcklwd m2, m3 |
| punpckhdq m3, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m4, m6 |
| punpckhdq m4, m6 |
| punpckhdq m6, m5, m7 |
| punpckldq m5, m7 |
| punpckldq m7, m17, m1 |
| punpckhdq m17, m1 |
| REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| punpcklqdq m4, m5, m7 |
| punpckhqdq m5, m7 |
| punpckhqdq m7, m6, m17 |
| punpcklqdq m6, m17 |
| punpckhwd m17, m12, m13 |
| punpcklwd m12, m13 |
| punpckhwd m13, m8, m9 |
| punpcklwd m8, m9 |
| punpckhwd m9, m14, m15 |
| punpcklwd m14, m15 |
| punpckhwd m15, m10, m11 |
| punpcklwd m10, m11 |
| punpckhdq m11, m8, m10 |
| punpckldq m8, m10 |
| punpckldq m10, m12, m14 |
| punpckhdq m12, m14 |
| punpckhdq m14, m13, m15 |
| punpckldq m13, m15 |
| punpckldq m15, m17, m9 |
| punpckhdq m17, m9 |
| REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17 |
| punpckhqdq m9, m8, m10 |
| punpcklqdq m8, m10 |
| punpcklqdq m10, m11, m12 |
| punpckhqdq m11, m12 |
| punpcklqdq m12, m13, m15 |
| punpckhqdq m13, m15 |
| punpckhqdq m15, m14, m17 |
| punpcklqdq m14, m17 |
| ret |
| |
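| ; identity scaling for 32x16: y = x*2896/4096, a = 2*y; the permB- |
| ; derived qword permutes (m14/m16) interleave the two row halves, and |
| ; out = 2*a + a*1697/2048, ~4*x in total (the identity32 gain) |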
| %macro IDTX_32x16 4 ; dst[1-4] |
| pmulhrsw m%2, m12, [cq+32*(%1+ 0)] |
| pmulhrsw m18, m12, [cq+32*(%1+16)] |
| pmulhrsw m%4, m12, [cq+32*(%3+ 0)] |
| pmulhrsw m19, m12, [cq+32*(%3+16)] |
| REPX {paddsw x, x}, m%2, m18, m%4, m19 |
| mova m%1, m14 |
| vpermi2q m%1, m%2, m18 |
| vpermt2q m%2, m16, m18 |
| %if %3 != 14 |
| mova m%3, m14 |
| %endif |
| vpermi2q m%3, m%4, m19 |
| vpermt2q m%4, m16, m19 |
| pmulhrsw m18, m17, m%1 |
| pmulhrsw m19, m17, m%2 |
| pmulhrsw m20, m17, m%3 |
| pmulhrsw m21, m17, m%4 |
| REPX {paddsw x, x}, m%1, m%2, m%3, m%4 |
| paddsw m%1, m18 |
| paddsw m%2, m19 |
| paddsw m%3, m20 |
| paddsw m%4, m21 |
| %endmacro |
| |
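| ; %3 = 1 is the 32x32 variant: cq clearing is skipped (the caller |
| ; zeroes the buffer afterwards) and dstq always advances; each call |
| ; adds one result pair to dst rows n and n+8 |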
| %macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32 |
| mova ym19, [dstq+strideq*0] |
| vinserti32x8 m19, [dstq+strideq*8], 1 |
| %if %3 == 0 |
| mova [cq+64*(%1*2+0)], m20 |
| mova [cq+64*(%1*2+1)], m20 |
| %endif |
| punpcklbw m18, m19, m20 |
| punpckhbw m19, m20 |
| paddw m18, m%1 |
| paddw m19, m%2 |
| packuswb m18, m19 |
| mova [dstq+strideq*0], ym18 |
| vextracti32x8 [dstq+strideq*8], m18, 1 |
| %if %3 || %1 != 7 |
| add dstq, strideq |
| %endif |
| %endmacro |
| |
| cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c |
| vpbroadcastd m12, [pw_2896x8] |
| movu m14, [permB+7] |
| vpbroadcastd m17, [pw_1697x16] |
| psrlq m16, m14, 4 |
| IDTX_32x16 0, 1, 2, 3 |
| IDTX_32x16 4, 5, 6, 7 |
| IDTX_32x16 8, 9, 10, 11 |
| IDTX_32x16 12, 13, 14, 15 |
| vpbroadcastd m16, [pw_2048] |
| call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round |
| pxor m20, m20 |
| IDTX_32x16_STORE 0, 8 |
| IDTX_32x16_STORE 1, 9 |
| IDTX_32x16_STORE 2, 10 |
| IDTX_32x16_STORE 3, 11 |
| IDTX_32x16_STORE 4, 12 |
| IDTX_32x16_STORE 5, 13 |
| IDTX_32x16_STORE 6, 14 |
| IDTX_32x16_STORE 7, 15 |
| RET |
| |
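| ; final butterfly of the column pass: the even-half value (register if |
| ; %2 < 8, cq spill otherwise) is summed and differenced with m%1 to |
| ; produce a top row and its mirrored bottom row, scaled by pw_2048 |
| ; (m12) and added to dst (r3 counts down from dst+stride*28) |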
| %macro IDCT_32x32_END 4 ; src, mem, stride[1-2] |
| pmovzxbw m10, [dstq+%3] |
| pmovzxbw m11, [r3 +%4] |
| %if %2 < 8 |
| paddsw m8, m%2, m%1 |
| psubsw m9, m%2, m%1 |
| %else |
| mova m9, [cq+64*(%2*2-16)] |
| paddsw m8, m9, m%1 |
| psubsw m9, m%1 |
| %endif |
| pmulhrsw m8, m12 |
| pmulhrsw m9, m12 |
| %if %2 >= 8 |
| %if %2 == 8 |
| pxor m0, m0 |
| %endif |
| mova [cq+64*(%2*2-16)], m0 |
| mova [cq+64*(%2*2-15)], m0 |
| %endif |
| paddw m8, m10 |
| paddw m9, m11 |
| packuswb m8, m9 |
| vpermq m8, m13, m8 |
| mova [dstq+%3], ym8 |
| vextracti32x8 [r3 +%4], m8, 1 |
| %if %2 == 3 || %2 == 7 || %2 == 11 |
| add dstq, r5 |
| sub r3, r5 |
| %endif |
| %endmacro |
| |
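| ; 32x32 inverse DCT. eob < 136 means only the top-left 16x16 quadrant |
| ; of coefficients is populated, so both passes can take the half-input |
| ; fast paths; the full path spills half of the pass-1 outputs to cq to |
| ; free registers for the second half |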
| cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob |
| %undef cmp |
| lea r5, [o_base] |
| test eobd, eobd |
| jz .dconly |
| WIN64_SPILL_XMM 30 |
| cmp eobd, 136 |
| jb .fast |
| mova m5, [cq+64*20] |
| mova m3, [cq+64*12] |
| mova m1, [cq+64* 4] |
| mova m7, [cq+64*28] |
| mova m2, [cq+64* 8] |
| mova m6, [cq+64*24] |
| mova m0, [cq+64* 0] |
| mova m4, [cq+64*16] |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main |
| mova m14, [cq+64* 2] |
| mova m21, [cq+64*30] |
| mova m18, [cq+64*18] |
| mova m17, [cq+64*14] |
| mova m16, [cq+64*10] |
| mova m19, [cq+64*22] |
| mova m20, [cq+64*26] |
| mova m15, [cq+64* 6] |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf |
| mova [cq+64* 0], m14 |
| mova [cq+64* 2], m15 |
| mova [cq+64* 4], m16 |
| mova [cq+64* 6], m17 |
| mova [cq+64* 8], m18 |
| mova [cq+64*10], m19 |
| mova [cq+64*12], m20 |
| mova [cq+64*14], m21 |
| mova m22, [cq+64* 1] |
| mova m21, [cq+64*31] |
| mova m14, [cq+64*17] |
| mova m29, [cq+64*15] |
| mova m26, [cq+64* 9] |
| mova m17, [cq+64*23] |
| mova m18, [cq+64*25] |
| mova m25, [cq+64* 7] |
| mova m24, [cq+64* 5] |
| mova m19, [cq+64*27] |
| mova m16, [cq+64*21] |
| mova m27, [cq+64*11] |
| mova m28, [cq+64*13] |
| mova m15, [cq+64*19] |
| mova m20, [cq+64*29] |
| mova m23, [cq+64* 3] |
| call .main_oddhalf |
| vpbroadcastd m10, [o(pw_8192)] |
| psubsw m13, m0, m29 ; 31 |
| paddsw m0, m29 ; 0 |
| psubsw m29, m1, m28 ; 30 |
| paddsw m1, m28 ; 1 |
| psubsw m28, m2, m27 ; 29 |
| paddsw m2, m27 ; 2 |
| psubsw m27, m3, m26 ; 28 |
| paddsw m3, m26 ; 3 |
| psubsw m26, m4, m25 ; 27 |
| paddsw m4, m25 ; 4 |
| psubsw m25, m5, m24 ; 26 |
| paddsw m5, m24 ; 5 |
| psubsw m24, m6, m23 ; 25 |
| paddsw m6, m23 ; 6 |
| psubsw m23, m7, m22 ; 24 |
| paddsw m7, m22 ; 7 |
| pxor m9, m9 |
| punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 |
| punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 |
| punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 |
| punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 |
| REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 |
| punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 |
| punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 |
| punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 |
| punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 |
| REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 |
| punpckhwd m3, m23, m24 |
| punpcklwd m23, m24 |
| punpckhwd m24, m25, m26 |
| punpcklwd m25, m26 |
| REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 |
| punpckhwd m26, m27, m28 |
| punpcklwd m27, m28 |
| punpckhwd m28, m29, m13 |
| punpcklwd m29, m13 |
| REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 |
| punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 |
| punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 |
| punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 |
| punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 |
| punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 |
| punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5 |
| REPX {pmulhrsw x, m10}, m0, m4, m8, m22 |
| punpckhdq m13, m23, m25 |
| punpckldq m23, m25 |
| punpckhdq m25, m27, m29 |
| punpckldq m27, m29 |
| REPX {pmulhrsw x, m10}, m13, m23, m25, m27 |
| punpckhdq m9, m3, m24 |
| punpckldq m3, m24 |
| punpckhdq m24, m26, m28 |
| punpckldq m26, m28 |
| punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 |
| punpckhqdq m23, m27 ; d01 d09 d17 d25 |
| punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 |
| punpcklqdq m13, m25 ; d02 d10 d18 d26 |
| punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 |
| punpcklqdq m3, m26 ; d04 d12 d20 d28 |
| punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 |
| punpcklqdq m9, m24 ; d06 d14 d22 d30 |
| REPX {pmulhrsw x, m10}, m25, m3, m26 |
| mova [cq+64* 9], m23 |
| mova [cq+64*11], m27 |
| mova [cq+64*13], m25 |
| mova [cq+64*15], m26 |
| punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 |
| punpcklqdq m8, m22 ; a04 a12 a20 a28 |
| punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 |
| punpcklqdq m0, m4 ; a00 a08 a16 a24 |
| punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 |
| punpcklqdq m7, m2 ; a02 a10 a18 a26 |
| punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 |
| punpcklqdq m6, m1 ; a06 a14 a22 a30 |
| mova m2, [cq+64* 0] |
| mova m11, [cq+64* 2] |
| mova m12, [cq+64* 4] |
| mova m29, [cq+64* 6] |
| mova m27, [cq+64* 8] |
| mova m26, [cq+64*10] |
| mova m4, [cq+64*12] |
| mova m28, [cq+64*14] |
| psubsw m1, m2, m21 ; 23 |
| paddsw m2, m21 ; 8 |
| psubsw m21, m11, m20 ; 22 |
| paddsw m11, m20 ; 9 |
| psubsw m20, m12, m19 ; 21 |
| paddsw m12, m19 ; 10 |
| psubsw m19, m29, m18 ; 20 |
| paddsw m29, m18 ; 11 |
| psubsw m18, m27, m17 ; 19 |
| paddsw m27, m17 ; 12 |
| psubsw m17, m26, m16 ; 18 |
| paddsw m26, m16 ; 13 |
| paddsw m16, m4, m15 ; 14 |
| psubsw m4, m15 ; 17 |
| pmulhrsw m15, m6, m10 |
| psubsw m6, m28, m14 ; 16 |
| paddsw m28, m14 ; 15 |
| pmulhrsw m14, m7, m10 |
| punpcklwd m7, m6, m4 |
| punpckhwd m6, m4 |
| punpckhwd m4, m17, m18 |
| punpcklwd m17, m18 |
| punpckhwd m18, m19, m20 |
| punpcklwd m19, m20 |
| punpckhwd m20, m21, m1 |
| punpcklwd m21, m1 |
| punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 |
| punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3 |
| punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 |
| punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 |
| punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 |
| punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 |
| punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 |
| punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 |
| pmulhrsw m23, m10 |
| pmulhrsw m25, m10 |
| punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 |
| punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 |
| punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 |
| punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 |
| REPX {pmulhrsw x, m10}, m28, m2, m12, m27 |
| punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 |
| punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 |
| punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 |
| punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 |
| REPX {pmulhrsw x, m10}, m16, m1, m11, m29 |
| punpckhdq m26, m19, m21 |
| punpckldq m19, m21 |
| punpckhdq m21, m6, m4 |
| punpckldq m6, m4 |
| REPX {pmulhrsw x, m10}, m26, m19, m21, m6 |
| punpckhdq m4, m18, m20 |
| punpckldq m18, m20 |
| punpckhdq m20, m7, m17 |
| punpckldq m7, m17 |
| REPX {pmulhrsw x, m10}, m4, m18, m20, m7 |
| punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 |
| punpckhqdq m28, m12 ; b03 b11 b19 b27 |
| punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 |
| punpcklqdq m2, m27 ; b00 b08 b16 b24 |
| punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 |
| punpcklqdq m1, m29 ; b04 b12 b20 b28 |
| punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 |
| punpcklqdq m16, m11 ; b06 b14 b22 b30 |
| mova [cq+64* 1], m12 |
| mova [cq+64* 3], m28 |
| mova [cq+64* 5], m27 |
| mova [cq+64* 7], m29 |
| punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 |
| punpcklqdq m20, m26 ; c02 c10 c18 c26 |
| punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 |
| punpcklqdq m7, m19 ; c00 c08 c16 c24 |
| punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 |
| punpcklqdq m6, m18 ; c04 c12 c20 c28 |
| punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 |
| punpcklqdq m21, m4 ; c06 c14 c22 c30 |
| pmulhrsw m19, m9, m10 |
| vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 |
| vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 |
| vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 |
| vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 |
| vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 |
| vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 |
| vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 |
| vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 |
| vshufi32x4 m3, m1, m6, q3131 ; 12 |
| vshufi32x4 m1, m6, q2020 ; 4 |
| vshufi32x4 m6, m4, m2, q3131 ; 24 |
| vshufi32x4 m4, m2, q2020 ; 16 |
| vshufi32x4 m2, m0, m7, q3131 ; 8 |
| vshufi32x4 m0, m7, q2020 ; 0 |
| vshufi32x4 m7, m5, m8, q3131 ; 28 |
| vshufi32x4 m5, m8, q2020 ; 20 |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main |
| vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 |
| vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 |
| vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 |
| vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 |
| vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30 |
| vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14 |
| vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 |
| vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 |
| vshufi32x4 m16, m14, m20, q3131 ; 10 |
| vshufi32x4 m14, m20, q2020 ; 2 |
| vshufi32x4 m20, m18, m17, q3131 ; 26 |
| vshufi32x4 m18, m17, q2020 ; 18 |
| vshufi32x4 m17, m15, m21, q3131 ; 14 |
| vshufi32x4 m15, m21, q2020 ; 6 |
| vshufi32x4 m21, m19, m13, q3131 ; 30 |
| vshufi32x4 m19, m13, q2020 ; 22 |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf |
| mova [cq+64* 0], m14 |
| mova [cq+64* 2], m15 |
| mova [cq+64* 4], m16 |
| mova [cq+64* 6], m17 |
| mova [cq+64* 8], m18 |
| mova [cq+64*10], m19 |
| mova [cq+64*12], m20 |
| mova [cq+64*14], m21 |
| mova m15, [cq+64* 1] |
| mova m16, [cq+64* 3] |
| mova m17, [cq+64* 5] |
| mova m19, [cq+64* 7] |
| mova m20, [cq+64* 9] |
| mova m21, [cq+64*11] |
| mova m13, [cq+64*13] |
| mova m18, [cq+64*15] |
| vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 |
| vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 |
| vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 |
| vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 |
| vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 |
| vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 |
| vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 |
| vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 |
| vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 |
| vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 |
| vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11 |
| vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 |
| vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 |
| vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 |
| vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 |
| vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 |
| vshufi32x4 m18, m14, m26, q3131 ; 25 |
| vshufi32x4 m14, m26, q2020 ; 17 |
| vshufi32x4 m19, m15, m27, q3131 ; 27 |
| vshufi32x4 m15, m27, q2020 ; 19 |
| vshufi32x4 m20, m16, m28, q3131 ; 29 |
| vshufi32x4 m16, m28, q2020 ; 21 |
| vshufi32x4 m21, m17, m29, q3131 ; 31 |
| vshufi32x4 m17, m29, q2020 ; 23 |
| vshufi32x4 m26, m22, m8, q3131 ; 9 |
| vshufi32x4 m22, m8, q2020 ; 1 |
| vshufi32x4 m27, m23, m9, q3131 ; 11 |
| vshufi32x4 m23, m9, q2020 ; 3 |
| vshufi32x4 m28, m24, m11, q3131 ; 13 |
| vshufi32x4 m24, m11, q2020 ; 5 |
| vshufi32x4 m29, m25, m12, q3131 ; 15 |
| vshufi32x4 m25, m12, q2020 ; 7 |
| call .main_oddhalf |
| jmp .end |
| .fast: ; bottom/right halves are zero |
| mova m14, [o(dup16_perm)] |
| pmovzxwd m9, [cq+64* 0] |
| pmovzxwd m6, [cq+64* 8] |
| vpermb m8, m14, [cq+64* 2] |
| vpermb ym0, ym14, [cq+64*14] |
| vpermb ym5, ym14, [cq+64*10] |
| vpermb m1, m14, [cq+64* 6] |
| vpermb m7, m14, [cq+64* 4] |
| vpermb ym3, ym14, [cq+64*12] |
| pslld m9, 16 |
| pslld m6, 16 |
| call m(idct_16x16_internal_8bpc).main_fast |
| vpermb m21, m14, [cq+64* 1] |
| vpermb ym17, ym14, [cq+64*15] |
| vpermb ym20, ym14, [cq+64* 9] |
| vpermb m15, m14, [cq+64* 7] |
| vpermb m18, m14, [cq+64* 5] |
| vpermb ym16, ym14, [cq+64*11] |
| vpermb ym19, ym14, [cq+64*13] |
| vpermb m14, m14, [cq+64* 3] |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| vpbroadcastd m9, [o(pw_8192)] |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round |
| vshufi32x4 m22, m14, m2, q2020 ; 1 |
| vshufi32x4 m24, m14, m2, q3131 ; 5 |
| vshufi32x4 m23, m17, m9, q2020 ; 3 |
| vshufi32x4 m25, m17, m9, q3131 ; 7 |
| vshufi32x4 m16, m5, m15, q2020 ; 10 |
| vshufi32x4 m17, m5, m15, q3131 ; 14 |
| vshufi32x4 m14, m1, m18, q2020 ; 2 |
| vshufi32x4 m15, m1, m18, q3131 ; 6 |
| vshufi32x4 m1, m0, m3, q3131 ; 4 |
| vshufi32x4 m0, m3, q2020 ; 0 |
| vshufi32x4 m3, m21, m4, q3131 ; 12 |
| vshufi32x4 m2, m21, m4, q2020 ; 8 |
| vshufi32x4 m26, m20, m6, q2020 ; 9 |
| vshufi32x4 m28, m20, m6, q3131 ; 13 |
| vshufi32x4 m27, m19, m7, q2020 ; 11 |
| vshufi32x4 m29, m19, m7, q3131 ; 15 |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast |
| mova [cq+64* 0], m14 |
| mova [cq+64* 2], m15 |
| mova [cq+64* 4], m16 |
| mova [cq+64* 6], m17 |
| mova [cq+64* 8], m18 |
| mova [cq+64*10], m19 |
| mova [cq+64*12], m20 |
| mova [cq+64*14], m21 |
| call .main_oddhalf_fast |
| .end: |
| lea r4, [strideq*3] |
| vpbroadcastd m12, [o(pw_2048)] |
| movshdup m13, [o(permD)] |
| lea r3, [dstq+r4*8] |
| lea r5, [strideq+r4] ; stride*4 |
| add r3, r5 ; dst+stride*28 |
| IDCT_32x32_END 29, 0, strideq*0, r4 |
| IDCT_32x32_END 28, 1, strideq*1, strideq*2 |
| IDCT_32x32_END 27, 2, strideq*2, strideq*1 |
| IDCT_32x32_END 26, 3, r4 , strideq*0 |
| IDCT_32x32_END 25, 4, strideq*0, r4 |
| IDCT_32x32_END 24, 5, strideq*1, strideq*2 |
| IDCT_32x32_END 23, 6, strideq*2, strideq*1 |
| IDCT_32x32_END 22, 7, r4 , strideq*0 |
| IDCT_32x32_END 21, 8, strideq*0, r4 |
| IDCT_32x32_END 20, 9, strideq*1, strideq*2 |
| IDCT_32x32_END 19, 10, strideq*2, strideq*1 |
| IDCT_32x32_END 18, 11, r4 , strideq*0 |
| IDCT_32x32_END 17, 12, strideq*0, r4 |
| IDCT_32x32_END 16, 13, strideq*1, strideq*2 |
| IDCT_32x32_END 15, 14, strideq*2, strideq*1 |
| IDCT_32x32_END 14, 15, r4 , strideq*0 |
| RET |
| .dconly: |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 32 |
| jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 |
| ALIGN function_align |
| cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero |
| vpbroadcastd m21, [o(pw_4091x8)] |
| vpbroadcastd m8, [o(pw_201x8)] |
| vpbroadcastd m18, [o(pw_m1380x8)] |
| vpbroadcastd m9, [o(pw_3857x8)] |
| vpbroadcastd m19, [o(pw_3973x8)] |
| vpbroadcastd m11, [o(pw_995x8)] |
| vpbroadcastd m28, [o(pw_m601x8)] |
| vpbroadcastd m12, [o(pw_4052x8)] |
| pmulhrsw m21, m22 ; t31a |
| pmulhrsw m22, m8 ; t16a |
| pmulhrsw m18, m25 ; t19a |
| pmulhrsw m25, m9 ; t28a |
| pmulhrsw m19, m24 ; t27a |
| pmulhrsw m24, m11 ; t20a |
| pmulhrsw m28, m23 ; t23a |
| pmulhrsw m23, m12 ; t24a |
| mova m15, m21 |
| mova m8, m22 |
| mova m14, m18 |
| mova m27, m25 |
| mova m29, m19 |
| mova m26, m24 |
| mova m16, m28 |
| mova m20, m23 |
| jmp .main3 |
| ALIGN function_align |
| cglobal_label .main_oddhalf_fast ; bottom half is zero |
| vpbroadcastd m21, [o(pw_4091x8)] |
| vpbroadcastd m8, [o(pw_201x8)] |
| vpbroadcastd m14, [o(pw_m2751x8)] |
| vpbroadcastd m9, [o(pw_3035x8)] |
| vpbroadcastd m17, [o(pw_3703x8)] |
| vpbroadcastd m11, [o(pw_1751x8)] |
| vpbroadcastd m18, [o(pw_m1380x8)] |
| vpbroadcastd m12, [o(pw_3857x8)] |
| pmulhrsw m21, m22 ; t31a |
| vpbroadcastd m19, [o(pw_3973x8)] |
| pmulhrsw m22, m8 ; t16a |
| vpbroadcastd m8, [o(pw_995x8)] |
| pmulhrsw m14, m29 ; t30a |
| vpbroadcastd m16, [o(pw_m2106x8)] |
| pmulhrsw m29, m9 ; t17a |
| vpbroadcastd m9, [o(pw_3513x8)] |
| pmulhrsw m17, m26 ; t29a |
| vpbroadcastd m15, [o(pw_3290x8)] |
| pmulhrsw m26, m11 ; t18a |
| vpbroadcastd m11, [o(pw_2440x8)] |
| pmulhrsw m18, m25 ; t19a |
| vpbroadcastd m20, [o(pw_m601x8)] |
| pmulhrsw m25, m12 ; t28a |
| vpbroadcastd m12, [o(pw_4052x8)] |
| pmulhrsw m19, m24 ; t27a |
| pmulhrsw m24, m8 ; t20a |
| pmulhrsw m16, m27 ; t21a |
| pmulhrsw m27, m9 ; t26a |
| pmulhrsw m15, m28 ; t25a |
| pmulhrsw m28, m11 ; t22a |
| pmulhrsw m20, m23 ; t23a |
| pmulhrsw m23, m12 ; t24a |
| jmp .main2 |
| ALIGN function_align |
| cglobal_label .main_oddhalf |
| ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a |
| ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a |
| ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a |
| ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a |
| ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a |
| ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a |
| ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a |
| ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a |
| .main2: |
| psubsw m8, m22, m14 ; t17 |
| paddsw m22, m14 ; t16 |
| paddsw m14, m18, m26 ; t19 |
| psubsw m18, m26 ; t18 |
| psubsw m26, m24, m16 ; t21 |
| paddsw m24, m16 ; t20 |
| psubsw m16, m20, m28 ; t22 |
| paddsw m28, m20 ; t23 |
| psubsw m20, m23, m15 ; t25 |
| paddsw m23, m15 ; t24 |
| psubsw m15, m21, m29 ; t30 |
| paddsw m21, m29 ; t31 |
| psubsw m29, m19, m27 ; t26 |
| paddsw m19, m27 ; t27 |
| paddsw m27, m25, m17 ; t28 |
| psubsw m25, m17 ; t29 |
| .main3: |
| ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a |
| ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a |
| ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a |
| ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a |
| vpbroadcastd m12, [o(pw_m3784_1567)] |
| vpbroadcastd m11, [o(pw_1567_3784)] |
| psubsw m17, m21, m27 ; t28a |
| paddsw m21, m27 ; t31a |
| psubsw m27, m15, m25 ; t18 |
| paddsw m15, m25 ; t17 |
| psubsw m25, m20, m29 ; t21 |
| paddsw m20, m29 ; t22 |
| psubsw m29, m8, m18 ; t29 |
| paddsw m8, m18 ; t30 |
| psubsw m18, m22, m14 ; t19a |
| paddsw m22, m14 ; t16a |
| psubsw m14, m28, m24 ; t20a |
| paddsw m24, m28 ; t23a |
| paddsw m28, m16, m26 ; t25 |
| psubsw m16, m26 ; t26 |
| psubsw m26, m23, m19 ; t27a |
| paddsw m23, m19 ; t24a |
| ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a |
| ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 |
| vpbroadcastd m11, [o(pw_m1567_m3784)] |
| ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a |
| ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 |
| vpbroadcastd m12, [o(pw_m2896_2896)] |
| vpbroadcastd m11, [o(pw_2896_2896)] |
| psubsw m19, m27, m25 ; t26 |
| paddsw m27, m25 ; t29 |
| psubsw m25, m17, m26 ; t20a |
| paddsw m17, m26 ; t19a |
| paddsw m26, m18, m14 ; t28a |
| psubsw m18, m14 ; t27a |
| paddsw m14, m22, m24 ; t16 |
| psubsw m22, m24 ; t23 |
| psubsw m24, m29, m16 ; t21 |
| paddsw m16, m29 ; t18 |
| paddsw m29, m21, m23 ; t31 |
| psubsw m21, m23 ; t24 |
| psubsw m23, m15, m20 ; t22a |
| paddsw m15, m20 ; t17a |
| psubsw m20, m8, m28 ; t25a |
| paddsw m28, m8 ; t30a |
| ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27 |
| ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a |
| ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a |
| ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25 |
| ret |
| |
| %macro IDTX_32x32 2 ; dst[1-2] |
| vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which |
| vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to |
| vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements |
| vmovdqa32 ym18, [cq+64*(%2+16)] |
| vpermt2q m%1, m21, m17 |
| vpermt2q m%2, m21, m18 |
| %endmacro |
| |
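| ; 32x32 identity: two passes over 32x16 halves; btc cq, 5 flips bit 5 |
| ; of the coefficient pointer so the second iteration reads the upper |
| ; 32 bytes of each 64-byte row, and the resulting carry flag ends the |
| ; loop before the buffer is zeroed |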
| cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c |
| movu m21, [permB+7] |
| vpbroadcastd m16, [pw_8192] |
| pxor m20, m20 |
| .loop: |
| IDTX_32x32 0, 1 |
| IDTX_32x32 2, 3 |
| IDTX_32x32 4, 5 |
| IDTX_32x32 6, 7 |
| IDTX_32x32 8, 9 |
| IDTX_32x32 10, 11 |
| IDTX_32x32 12, 13 |
| IDTX_32x32 14, 15 |
| call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round |
| IDTX_32x16_STORE 0, 8, 1 |
| IDTX_32x16_STORE 1, 9, 1 |
| IDTX_32x16_STORE 2, 10, 1 |
| IDTX_32x16_STORE 3, 11, 1 |
| IDTX_32x16_STORE 4, 12, 1 |
| IDTX_32x16_STORE 5, 13, 1 |
| IDTX_32x16_STORE 6, 14, 1 |
| IDTX_32x16_STORE 7, 15, 1 |
| lea dstq, [dstq+strideq*8] |
| btc cq, 5 |
| jnc .loop |
| mov r0d, 8 |
| .zero_loop: |
| mova [cq+64*0], m20 |
| mova [cq+64*1], m20 |
| mova [cq+64*2], m20 |
| mova [cq+64*3], m20 |
| add cq, 64*4 |
| dec r0d |
| jg .zero_loop |
| RET |
| |
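| ; 16x64 inverse DCT. eob < 151 selects the right-half-zero path; the |
| ; 64-point column pass splits into an even half reusing the 16x32 code |
| ; and .main_oddhalf below for the 32 odd inputs, which are processed |
| ; as packed value pairs (t32a t63a etc.) |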
| cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob |
| %undef cmp |
| lea r5, [o_base] |
| test eobd, eobd |
| jz .dconly |
| WIN64_SPILL_XMM 30 |
| cmp eobd, 151 |
| jb .fast |
| mova m5, [cq+64*10] |
| mova m3, [cq+64* 6] |
| mova m1, [cq+64* 2] |
| mova m7, [cq+64*14] |
| mova m2, [cq+64* 4] |
| mova m6, [cq+64*12] |
| mova m0, [cq+64* 0] |
| mova m4, [cq+64* 8] |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main |
| mova m14, [cq+64* 1] |
| mova m21, [cq+64*15] |
| mova m18, [cq+64* 9] |
| mova m17, [cq+64* 7] |
| mova m16, [cq+64* 5] |
| mova m19, [cq+64*11] |
| mova m20, [cq+64*13] |
| mova m15, [cq+64* 3] |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf |
| vpbroadcastd m9, [o(pw_8192)] |
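| ; transpose four registers of packed rows at word/dword granularity, |
| ; folding the pw_8192 pass-1 rounding (m9) into the final step |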
| %macro TRANSPOSE_8x4_ROUND 4 |
| punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7 |
| punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3 |
| punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7 |
| punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3 |
| punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5 |
| punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7 |
| REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4 |
| %endmacro |
| TRANSPOSE_8x4_ROUND 0, 1, 2, 3 |
| TRANSPOSE_8x4_ROUND 4, 5, 6, 7 |
| TRANSPOSE_8x4_ROUND 14, 15, 16, 17 |
| TRANSPOSE_8x4_ROUND 18, 19, 20, 21 |
| vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4 |
| vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12 |
| vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5 |
| vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13 |
| vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6 |
| vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14 |
| vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7 |
| vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15 |
| vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4 |
| vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12 |
| vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5 |
| vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13 |
| vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6 |
| vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14 |
| vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7 |
| vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15 |
| vshufi32x4 m22, m26, m4, q2020 ; 0 1 |
| vshufi32x4 m26, m4, q3131 ; 8 9 |
| vshufi32x4 m23, m27, m5, q2020 ; 2 3 |
| vshufi32x4 m27, m5, q3131 ; 10 11 |
| vshufi32x4 m24, m28, m6, q2020 ; 4 5 |
| vshufi32x4 m28, m6, q3131 ; 12 13 |
| vshufi32x4 m25, m29, m7, q2020 ; 6 7 |
| vshufi32x4 m29, m7, q3131 ; 14 15 |
| vshufi32x4 m4, m0, m14, q2020 ; 16 17 |
| vshufi32x4 m3, m0, m14, q3131 ; 24 25 |
| vshufi32x4 m20, m1, m15, q2020 ; 18 19 |
| vshufi32x4 m19, m1, m15, q3131 ; 26 27 |
| vshufi32x4 m5, m2, m16, q2020 ; 20 21 |
| vshufi32x4 m0, m2, m16, q3131 ; 28 29 |
| vshufi32x4 m16, m8, m17, q2020 ; 22 23 |
| vshufi32x4 m17, m8, m17, q3131 ; 30 31 |
| pxor m6, m6 |
| mova [cq+64* 0], m4 |
| mova [cq+64* 2], m5 |
| mova [cq+64* 4], m3 |
| mova [cq+64* 6], m0 |
| punpcklwd m8, m24, m24 ; 4 |
| punpcklwd m0, m0 ; 28 |
| punpcklwd m5, m5 ; 20 |
| punpcklwd m1, m28, m28 ; 12 |
| punpcklwd m7, m26, m26 ; 8 |
| punpcklwd m3, m3 ; 24 |
| punpcklwd m9, m6, m22 ; __ 0 |
| punpcklwd m6, m4 ; __ 16 |
| call m(idct_16x16_internal_8bpc).main_fast3 |
| mova [cq+64* 1], m20 |
| mova [cq+64* 3], m16 |
| mova [cq+64* 5], m19 |
| mova [cq+64* 7], m17 |
| punpcklwd m21, m23, m23 ; 2 |
| punpcklwd m17, m17 ; 30 |
| punpcklwd m20, m20 ; 18 |
| punpcklwd m15, m29, m29 ; 14 |
| punpcklwd m18, m27, m27 ; 10 |
| punpcklwd m16, m16 ; 22 |
| punpcklwd m19, m19 ; 26 |
| punpcklwd m14, m25, m25 ; 6 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| mova [cq+64* 8], m14 |
| mova [cq+64* 9], m15 |
| mova [cq+64*10], m16 |
| mova [cq+64*11], m17 |
| mova [cq+64*12], m18 |
| mova [cq+64*13], m19 |
| mova [cq+64*14], m20 |
| mova [cq+64*15], m21 |
| mova m21, [cq+64* 7] |
| mova m14, [cq+64* 0] |
| mova m17, [cq+64* 3] |
| mova m18, [cq+64* 4] |
| mova m19, [cq+64* 5] |
| mova m16, [cq+64* 2] |
| mova m15, [cq+64* 1] |
| mova m20, [cq+64* 6] |
| REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ |
| m24, m19, m16, m27, m28, m15, m20, m23 |
| call .main_oddhalf |
| jmp .end |
| .fast: ; right half is zero |
| mova ym8, [cq+64*15] |
| vinserti32x8 m8, [cq+64* 1], 1 |
| mova m2, [o(int16_perm)] |
| mova ym9, [cq+64* 8] |
| vinserti32x8 m9, [cq+64* 0], 1 |
| mova ym0, [cq+64* 7] |
| vinserti32x8 m0, [cq+64* 9], 1 |
| mova ym7, [cq+64*14] |
| vinserti32x8 m7, [cq+64* 2], 1 |
| mova ym1, [cq+64* 3] |
| vinserti32x8 m1, [cq+64*13], 1 |
| mova ym3, [cq+64* 6] |
| vinserti32x8 m3, [cq+64*10], 1 |
| mova ym5, [cq+64*11] |
| vinserti32x8 m5, [cq+64* 5], 1 |
| mova ym6, [cq+64*12] |
| vinserti32x8 m6, [cq+64* 4], 1 |
| REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 |
| call m(idct_16x16_internal_8bpc).main2 |
| vbroadcasti32x4 m8, [o(int_shuf3)] |
| vbroadcasti32x4 m9, [o(int_shuf4)] |
| vpbroadcastd m11, [o(pw_8192)] |
| pshufb m0, m8 |
| pshufb m1, m9 |
| pshufb m2, m8 |
| pshufb m3, m9 |
| REPX {pmulhrsw x, m11}, m0, m1, m2, m3 |
| pshufb m4, m8 |
| pshufb m5, m9 |
| pshufb m6, m8 |
| pshufb m7, m9 |
| REPX {pmulhrsw x, m11}, m4, m5, m6, m7 |
| punpckhdq m28, m0, m1 |
| punpckldq m0, m1 |
| punpckhdq m27, m2, m3 |
| punpckldq m2, m3 |
| punpckhdq m22, m4, m5 |
| punpckldq m4, m5 |
| punpckhdq m23, m6, m7 |
| punpckldq m6, m7 |
| vinserti32x8 m14, m0, ym2, 1 |
| vshufi32x4 m15, m0, m2, q3232 |
| vinserti32x8 m2, m4, ym6, 1 |
| vshufi32x4 m4, m6, q3232 |
| vshufi32x4 m21, m14, m2, q2020 ; 0 2 |
| vshufi32x4 m14, m2, q3131 ; 4 6 |
| vshufi32x4 m18, m15, m4, q2020 ; 8 10 |
| vshufi32x4 m15, m4, q3131 ; 12 14 |
| pxor m9, m9 |
| punpcklwd m8, m14, m14 ; 4 |
| punpcklwd m1, m15, m15 ; 12 |
| punpcklwd m7, m18, m18 ; 8 |
| punpcklwd m9, m21 ; __ 0 |
| call m(idct_16x16_internal_8bpc).main_fast4 |
| punpckhwd m21, m21 ; 2 |
| punpckhwd m15, m15 ; 14 |
| punpckhwd m18, m18 ; 10 |
| punpckhwd m14, m14 ; 6 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 |
| vinserti32x8 m24, m28, ym27, 1 |
| vshufi32x4 m28, m27, q3232 |
| vinserti32x8 m27, m22, ym23, 1 |
| vshufi32x4 m22, m23, q3232 |
| vshufi32x4 m23, m24, m27, q2020 ; 1 3 |
| vshufi32x4 m24, m27, q3131 ; 5 7 |
| vshufi32x4 m27, m28, m22, q2020 ; 9 11 |
| vshufi32x4 m28, m22, q3131 ; 13 15 |
| punpcklwd m22, m23, m23 ; 1 |
| punpckhwd m29, m28, m28 ; 15 |
| punpcklwd m26, m27, m27 ; 9 |
| punpckhwd m25, m24, m24 ; 7 |
| mova [cq+64* 8], m14 |
| mova [cq+64* 9], m15 |
| mova [cq+64*10], m16 |
| mova [cq+64*11], m17 |
| punpcklwd m24, m24 ; 5 |
| punpckhwd m27, m27 ; 11 |
| punpcklwd m28, m28 ; 13 |
| punpckhwd m23, m23 ; 3 |
| mova [cq+64*12], m18 |
| mova [cq+64*13], m19 |
| mova [cq+64*14], m20 |
| mova [cq+64*15], m21 |
| call .main_oddhalf_fast |
| .end: |
| imul r6, strideq, 60 |
| mova m10, [o(end_16x32p)] |
| vpbroadcastd m11, [o(pw_2048)] |
| lea r3, [strideq*3] |
| pxor m12, m12 |
| add r6, dstq ; dst+stride*60 |
| psrldq m13, m10, 1 |
| lea r4, [strideq+r3] ; stride*4 |
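| ; for %1 < 8 the idct32/idct64 combine already happened in .main_oddhalf; |
| ; otherwise the even half is reloaded from cq and summed/differenced |
| ; with the idct64 register to form rows n and 63-n; vpermb with |
| ; end_16x32p reorders bytes and r6 walks back from dst+stride*60 |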
| %macro IDCT_16x64_END 3 ; idct32, idct64, tmp |
| %if %1 & 1 |
| %define %%s0 r3 |
| %define %%s1 strideq*2 |
| %define %%s2 strideq*1 |
| %define %%s3 strideq*0 |
| %else |
| %define %%s0 strideq*0 |
| %define %%s1 strideq*1 |
| %define %%s2 strideq*2 |
| %define %%s3 r3 |
| %if %1 |
| add dstq, r4 |
| sub r6, r4 |
| %endif |
| %endif |
| %if %1 < 8 |
| pmulhrsw m8, m11, m%1 |
| pmulhrsw m9, m11, m%2 |
| %else |
| mova m9, [cq+64*%1] |
| paddsw m8, m9, m%2 ; out 0+n, 1+n |
| psubsw m9, m%2 ; out 63-n, 62-n |
| pmulhrsw m8, m11 |
| pmulhrsw m9, m11 |
| %endif |
| mova xm29, [dstq+%%s0] |
| vinserti128 ym29, [dstq+%%s1], 1 |
| mova xm%3, [r6 +%%s3] |
| vinserti128 ym%3, [r6 +%%s2], 1 |
| vpermb m29, m10, m29 |
| vpermb m%3, m10, m%3 |
| mova [cq+64*%1], m12 |
| paddw m29, m8 |
| paddw m%3, m9 |
| packuswb m29, m%3 |
| vpermd m29, m13, m29 |
| mova [dstq+%%s0], xm29 |
| vextracti128 [dstq+%%s1], ym29, 1 |
| vextracti32x4 [r6 +%%s2], m29, 2 |
| vextracti32x4 [r6 +%%s3], m29, 3 |
| %endmacro |
| IDCT_16x64_END 0, 29, 0 |
| IDCT_16x64_END 1, 28, 28 |
| IDCT_16x64_END 2, 27, 28 |
| IDCT_16x64_END 3, 26, 28 |
| IDCT_16x64_END 4, 25, 28 |
| IDCT_16x64_END 5, 24, 28 |
| IDCT_16x64_END 6, 23, 28 |
| IDCT_16x64_END 7, 22, 28 |
| IDCT_16x64_END 8, 21, 28 |
| IDCT_16x64_END 9, 20, 28 |
| IDCT_16x64_END 10, 19, 28 |
| IDCT_16x64_END 11, 18, 28 |
| IDCT_16x64_END 12, 17, 28 |
| IDCT_16x64_END 13, 16, 28 |
| IDCT_16x64_END 14, 15, 28 |
| IDCT_16x64_END 15, 14, 28 |
| RET |
| .dconly: |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 64 |
| imul r6d, 181 |
| add r6d, 128+512 |
| sar r6d, 8+2 |
| jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 |
| ALIGN function_align |
| .main_oddhalf_fast: ; bottom three-quarters are zero |
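| ; only the first eight odd inputs are nonzero, so each packed pair is |
| ; produced by a single pmulhrsw, and the copies below replace the first |
| ; butterfly stage before joining the common .main_oddhalf2 tail |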
| vpbroadcastd m8, [o(pw_101_4095x8)] |
| vpbroadcastd m21, [o(pw_m1474_3822x8)] |
| vpbroadcastd m14, [o(pw_897_3996x8)] |
| vpbroadcastd m17, [o(pw_m700_4036x8)] |
| vpbroadcastd m18, [o(pw_501_4065x8)] |
| vpbroadcastd m19, [o(pw_m1092_3948x8)] |
| vpbroadcastd m16, [o(pw_1285_3889x8)] |
| vpbroadcastd m15, [o(pw_m301_4085x8)] |
| pmulhrsw m8, m22 ; t32a t63a |
| pmulhrsw m21, m29 ; t35a t60a |
| pmulhrsw m14, m26 ; t36a t59a |
| pmulhrsw m17, m25 ; t39a t56a |
| pmulhrsw m18, m24 ; t40a t55a |
| pmulhrsw m19, m27 ; t43a t52a |
| pmulhrsw m16, m28 ; t44a t51a |
| pmulhrsw m15, m23 ; t47a t48a |
| mova m22, m8 |
| mova m29, m21 |
| mova m26, m14 |
| mova m25, m17 |
| mova m24, m18 |
| mova m27, m19 |
| mova m28, m16 |
| mova m20, m15 |
| jmp .main_oddhalf2 |
| ALIGN function_align |
| .main_oddhalf: |
| vpbroadcastd m8, [o(pw_101_4095x8)] |
| vpbroadcastd m9, [o(pw_m2824_2967x8)] |
| vpbroadcastd m11, [o(pw_1660_3745x8)] |
| vpbroadcastd m12, [o(pw_m1474_3822x8)] |
| pmulhrsw m22, m8 ; t32a t63a |
| vpbroadcastd m8, [o(pw_897_3996x8)] |
| pmulhrsw m21, m9 ; t33a t62a |
| vpbroadcastd m9, [o(pw_m2191_3461x8)] |
| pmulhrsw m14, m11 ; t34a t61a |
| vpbroadcastd m11, [o(pw_2359_3349x8)] |
| pmulhrsw m29, m12 ; t35a t60a |
| vpbroadcastd m12, [o(pw_m700_4036x8)] |
| pmulhrsw m26, m8 ; t36a t59a |
| vpbroadcastd m8, [o(pw_501_4065x8)] |
| pmulhrsw m17, m9 ; t37a t58a |
| vpbroadcastd m9, [o(pw_m2520_3229x8)] |
| pmulhrsw m18, m11 ; t38a t57a |
| vpbroadcastd m11, [o(pw_2019_3564x8)] |
| pmulhrsw m25, m12 ; t39a t56a |
| vpbroadcastd m12, [o(pw_m1092_3948x8)] |
| pmulhrsw m24, m8 ; t40a t55a |
| vpbroadcastd m8, [o(pw_1285_3889x8)] |
| pmulhrsw m19, m9 ; t41a t54a |
| vpbroadcastd m9, [o(pw_m1842_3659x8)] |
| pmulhrsw m16, m11 ; t42a t53a |
| vpbroadcastd m11, [o(pw_2675_3102x8)] |
| pmulhrsw m27, m12 ; t43a t52a |
| vpbroadcastd m12, [o(pw_m301_4085x8)] |
| pmulhrsw m28, m8 ; t44a t51a |
| pmulhrsw m15, m9 ; t45a t50a |
| pmulhrsw m20, m11 ; t46a t49a |
| pmulhrsw m23, m12 ; t47a t48a |
| psubsw m8, m22, m21 ; t33 t62 |
| paddsw m22, m21 ; t32 t63 |
| psubsw m21, m29, m14 ; t34 t61 |
| paddsw m29, m14 ; t35 t60 |
| psubsw m14, m26, m17 ; t37 t58 |
| paddsw m26, m17 ; t36 t59 |
| psubsw m17, m25, m18 ; t38 t57 |
| paddsw m25, m18 ; t39 t56 |
| psubsw m18, m24, m19 ; t41 t54 |
| paddsw m24, m19 ; t40 t55 |
| psubsw m19, m27, m16 ; t42 t53 |
| paddsw m27, m16 ; t43 t52 |
| psubsw m16, m28, m15 ; t45 t50 |
| paddsw m28, m15 ; t44 t51 |
| psubsw m15, m23, m20 ; t46 t49 |
| paddsw m20, m23 ; t47 t48 |
| .main_oddhalf2: |
| ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a |
| ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a |
| ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a |
| ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a |
| ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a |
| ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a |
| ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a |
| ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a |
| vpbroadcastd m11, [o(pw_m4017_799)] |
| psubsw m23, m25, m26 ; t36a t59a |
| paddsw m25, m26 ; t39a t56a |
| psubsw m26, m24, m27 ; t43a t52a |
| paddsw m27, m24 ; t40a t55a |
| psubsw m24, m20, m28 ; t44a t51a |
| paddsw m20, m28 ; t47a t48a |
| psubsw m28, m8, m21 ; t34 t61 |
| paddsw m8, m21 ; t33 t62 |
| psubsw m21, m17, m14 ; t37 t58 |
| paddsw m17, m14 ; t38 t57 |
| psubsw m14, m18, m19 ; t42 t53 |
| paddsw m18, m19 ; t41 t54 |
| psubsw m19, m15, m16 ; t45 t50 |
| paddsw m15, m16 ; t46 t49 |
| psubsw m16, m22, m29 ; t35a t60a |
| paddsw m22, m29 ; t32a t63a |
| ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60 |
| ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a |
| ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59 |
| ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a |
| vpbroadcastd m11, [o(pw_m2276_3406)] |
| ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52 |
| ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a |
| ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51 |
| ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a |
| vpbroadcastd m11, [o(pw_1567_3784)] |
| vpbroadcastd m12, [o(pw_m3784_1567)] |
| psubsw m29, m22, m25 ; t39 t56 |
| paddsw m22, m25 ; t32 t63 |
| psubsw m25, m20, m27 ; t40 t55 |
| paddsw m20, m27 ; t47 t48 |
| psubsw m27, m8, m17 ; t38a t57a |
| paddsw m8, m17 ; t33a t62a |
| psubsw m17, m15, m18 ; t41a t54a |
| paddsw m15, m18 ; t46a t49a |
| paddsw m18, m16, m23 ; t35a t60a |
| psubsw m16, m23 ; t36a t59a |
| psubsw m23, m24, m26 ; t43a t52a |
| paddsw m24, m26 ; t44a t51a |
| paddsw m26, m28, m21 ; t34 t61 |
| psubsw m28, m21 ; t37 t58 |
| psubsw m21, m19, m14 ; t42 t53 |
| paddsw m19, m14 ; t45 t50 |
| ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a |
| ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57 |
| ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59 |
| ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a |
| vpbroadcastd m11, [o(pw_m1567_m3784)] |
| ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a |
| ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54 |
| ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52 |
| ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a |
| vbroadcasti32x4 m13, [o(deint_shuf)] |
| vpbroadcastd m11, [o(pw_2896_2896)] |
| vpbroadcastd m12, [o(pw_m2896_2896)] |
| paddsw m14, m22, m20 ; t32a t63a |
| psubsw m22, m20 ; t47a t48a |
| psubsw m20, m8, m15 ; t46 t49 |
| paddsw m8, m15 ; t33 t62 |
| paddsw m15, m18, m24 ; t35 t60 |
| psubsw m18, m24 ; t44 t51 |
| psubsw m24, m26, m19 ; t45a t50a |
| paddsw m26, m19 ; t34a t61a |
| REPX {pshufb x, m13}, m14, m8, m15, m26 |
| psubsw m19, m29, m25 ; t40 t55 |
| paddsw m25, m29 ; t39 t56 |
| psubsw m29, m27, m17 ; t41a t54a |
| paddsw m27, m17 ; t38a t57a |
| psubsw m17, m16, m23 ; t43a t52a |
| paddsw m16, m23 ; t36a t59a |
| psubsw m9, m28, m21 ; t42 t53 |
| paddsw m28, m21 ; t37 t58 |
| REPX {pshufb x, m13}, m25, m27, m16, m28 |
| ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48 |
| ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a |
| packssdw m21, m22 ; t47 t46a |
| packssdw m13, m23 ; t48 t49a |
| ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a |
| ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50 |
| packssdw m20, m18 ; t44a t45 |
| packssdw m22, m23 ; t51a t50 |
| ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a |
| ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54 |
| packssdw m18, m19 ; t40a t41 |
| packssdw m24, m23 ; t55a t54 |
| ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52 |
| ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a |
| packssdw m19, m17 ; t43 t42a |
| packssdw m23, m29 ; t52 t53a |
| punpcklqdq m17, m25, m27 ; t39 t38a |
| punpckhqdq m25, m27 ; t56 t57a |
| punpckhqdq m27, m15, m26 ; t60 t61a |
| punpcklqdq m15, m26 ; t35 t34a |
| punpckhqdq m26, m16, m28 ; t59a t58 |
| punpcklqdq m16, m28 ; t36a t37 |
| punpckhqdq m28, m14, m8 ; t63a t62 |
| punpcklqdq m14, m8 ; t32a t33 |
| psubsw m29, m0, m28 ; out63 out62 |
| paddsw m0, m28 ; out0 out1 |
| psubsw m28, m1, m27 ; out60 out61 |
| paddsw m1, m27 ; out3 out2 |
| psubsw m27, m2, m26 ; out59 out58 |
| paddsw m2, m26 ; out4 out5 |
| psubsw m26, m3, m25 ; out56 out57 |
| paddsw m3, m25 ; out7 out6 |
| psubsw m25, m4, m24 ; out55 out54 |
| paddsw m4, m24 ; out8 out9 |
| psubsw m24, m5, m23 ; out52 out53 |
| paddsw m5, m23 ; out11 out10 |
| psubsw m23, m6, m22 ; out51 out50 |
| paddsw m6, m22 ; out12 out13 |
| psubsw m22, m7, m13 ; out48 out49 |
| paddsw m7, m13 ; out15 out14 |
| ret |
| |
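| ; 64x16 inverse DCT; dst rows are handled a full 64 pixels at a time. |
| ; the dc-only case is handled inline: two rounded multiplies by 181 |
| ; scale dc before it is broadcast and added across whole rows in |
| ; .dconly_loop |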
| cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob |
| %undef cmp |
| lea r5, [o_base] |
| test eobd, eobd |
| jnz .normal |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 16 |
| .dconly: |
| imul r6d, 181 |
| add r6d, 128+512 |
| sar r6d, 8+2 |
| .dconly2: |
| imul r6d, 181 |
| add r6d, 128+2048 |
| sar r6d, 8+4 |
| pxor m2, m2 |
| vpbroadcastw m3, r6d |
| .dconly_loop: |
| mova m1, [dstq] |
| punpcklbw m0, m1, m2 |
| punpckhbw m1, m2 |
| paddw m0, m3 |
| paddw m1, m3 |
| packuswb m0, m1 |
| mova [dstq], m0 |
| add dstq, strideq |
| dec r3d |
| jg .dconly_loop |
| RET |
| .normal: |
| WIN64_SPILL_XMM 31 |
| mova m19, [o(dup16_perm)] |
| mova m24, [cq+64* 2] |
| mova m28, [cq+64* 6] |
| mova m26, [cq+64* 4] |
| mova m22, [cq+64* 0] |
| mova m23, [cq+64* 1] |
| mova m29, [cq+64* 7] |
| mova m27, [cq+64* 5] |
| mova m25, [cq+64* 3] |
| vpermb m8, m19, m24 ; 4 |
| vpermb m1, m19, m28 ; 12 |
| vpermb m7, m19, m26 ; 8 |
| vpermb m9, m19, m22 ; __ 0 |
| vpermb m21, m19, m23 ; 2 |
| vpermb m15, m19, m29 ; 14 |
| vpermb m18, m19, m27 ; 10 |
| vpermb m14, m19, m25 ; 6 |
| pslld m9, 16 |
| vpord m30, m19, [o(pb_32)] {1to16} |
| REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23 |
| cmp eobd, 151 |
| jb .fast |
| vpermb m0, m19, [cq+64*14] ; 28 |
| vpermb m5, m19, [cq+64*10] ; 20 |
| vpermb m3, m19, [cq+64*12] ; 24 |
| vpermb m6, m19, [cq+64* 8] ; __ 16 |
| pslld m6, 16 |
| call m(idct_16x16_internal_8bpc).main_fast |
| vpermb m17, m19, [cq+64*15] ; 30 |
| vpermb m20, m19, [cq+64* 9] ; 18 |
| vpermb m16, m19, [cq+64*11] ; 22 |
| vpermb m19, m19, [cq+64*13] ; 26 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| mova [cq+64* 0], m14 |
| mova [cq+64* 1], m15 |
| mova [cq+64* 2], m16 |
| mova [cq+64* 3], m17 |
| mova [cq+64* 4], m18 |
| mova [cq+64* 5], m19 |
| mova [cq+64* 6], m20 |
| mova [cq+64* 7], m21 |
| vpermb m21, m30, [cq+64*15] |
| vpermb m14, m30, [cq+64* 8] |
| vpermb m17, m30, [cq+64*11] |
| vpermb m18, m30, [cq+64*12] |
| vpermb m19, m30, [cq+64*13] |
| vpermb m16, m30, [cq+64*10] |
| vpermb m15, m30, [cq+64* 9] |
| vpermb m20, m30, [cq+64*14] |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf |
| jmp .end |
| .fast: ; bottom half is zero |
| call m(idct_16x16_internal_8bpc).main_fast2 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 |
| mova [cq+64* 0], m14 |
| mova [cq+64* 1], m15 |
| mova [cq+64* 2], m16 |
| mova [cq+64* 3], m17 |
| mova [cq+64* 4], m18 |
| mova [cq+64* 5], m19 |
| mova [cq+64* 6], m20 |
| mova [cq+64* 7], m21 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast |
| .end: |
| mova [cq+64* 8], m4 |
| mova [cq+64* 9], m5 |
| mova [cq+64*10], m6 |
| mova [cq+64*11], m7 |
| mova [cq+64*12], m26 |
| mova [cq+64*13], m27 |
| mova [cq+64*14], m28 |
| mova [cq+64*15], m29 |
| vpbroadcastd m13, [o(pw_8192)] |
| call .pass1_end |
| call .pass2 |
| mova [cq+64* 0], m0 |
| mova [cq+64* 1], m1 |
| mova [cq+64* 2], m2 |
| mova [cq+64* 3], m3 |
| mova [cq+64* 4], m4 |
| mova [cq+64* 5], m5 |
| mova [cq+64* 6], m6 |
| mova [cq+64* 7], m7 |
| pmulhrsw m0, m13, [cq+64* 8] |
| pmulhrsw m1, m13, [cq+64* 9] |
| pmulhrsw m2, m13, [cq+64*10] |
| pmulhrsw m3, m13, [cq+64*11] |
| vpbroadcastd m30, [o(pw_2048)] |
| pmulhrsw m4, m13, m22 |
| pmulhrsw m5, m13, m23 |
| pmulhrsw m6, m13, m24 |
| pmulhrsw m7, m13, m25 |
| pmulhrsw m22, m30, m14 |
| pmulhrsw m14, m13, m26 |
| pmulhrsw m23, m30, m15 |
| pmulhrsw m15, m13, m27 |
| pmulhrsw m24, m30, m16 |
| pmulhrsw m16, m13, m28 |
| pmulhrsw m25, m30, m17 |
| pmulhrsw m17, m13, m29 |
| pmulhrsw m26, m30, m18 |
| pmulhrsw m18, m13, [cq+64*12] |
| pmulhrsw m27, m30, m19 |
| pmulhrsw m19, m13, [cq+64*13] |
| pmulhrsw m28, m30, m20 |
| pmulhrsw m20, m13, [cq+64*14] |
| pmulhrsw m29, m30, m21 |
| pmulhrsw m21, m13, [cq+64*15] |
| call .transpose_round |
| call .pass2 |
| pxor m10, m10 |
| lea r3, [strideq*3] |
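| ; add one transformed row to the 64-pixel dst row at offset %4: the dst |
| ; bytes are widened against zero (m10), m%3 pairs with the punpckl half |
| ; and m%2 with the punpckh half, and the cq slot is cleared; rows 0-7 |
| ; reload their other half from cq, scaled by pw_2048 (m30) |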
| %macro IDCT_64x16_END 4 |
| mova m9, [dstq+%4] |
| %if %1 < 8 |
| pmulhrsw m%3, m30, [cq+64*%1] |
| %endif |
| pmulhrsw m%2, m30 |
| mova [cq+64*%1], m10 |
| punpcklbw m8, m9, m10 |
| punpckhbw m9, m10 |
| paddw m8, m%3 |
| paddw m9, m%2 |
| packuswb m8, m9 |
| mova [dstq+%4], m8 |
| %if %1 == 3 || %1 == 7 || %1 == 11 |
| lea dstq, [dstq+strideq*4] |
| %endif |
| %endmacro |
| IDCT_64x16_END 0, 0, 11, strideq*0 |
| IDCT_64x16_END 1, 1, 11, strideq*1 |
| IDCT_64x16_END 2, 2, 11, strideq*2 |
| IDCT_64x16_END 3, 3, 11, r3 |
| IDCT_64x16_END 4, 4, 11, strideq*0 |
| IDCT_64x16_END 5, 5, 11, strideq*1 |
| IDCT_64x16_END 6, 6, 11, strideq*2 |
| IDCT_64x16_END 7, 7, 11, r3 |
| IDCT_64x16_END 8, 14, 22, strideq*0 |
| IDCT_64x16_END 9, 15, 23, strideq*1 |
| IDCT_64x16_END 10, 16, 24, strideq*2 |
| IDCT_64x16_END 11, 17, 25, r3 |
| IDCT_64x16_END 12, 18, 26, strideq*0 |
| IDCT_64x16_END 13, 19, 27, strideq*1 |
| IDCT_64x16_END 14, 20, 28, strideq*2 |
| IDCT_64x16_END 15, 21, 29, r3 |
| RET |
| ALIGN function_align |
| .pass1_end: |
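; combine the rows saved in cq with the odd-half results into out16-out47,
; apply the pass-1 rounding constant in m13, then fall into the transpose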
| mova m4, [cq+64* 0] |
| mova m5, [cq+64* 1] |
| mova m6, [cq+64* 2] |
| mova m7, [cq+64* 3] |
| mova m8, [cq+64* 4] |
| mova m9, [cq+64* 5] |
| mova m11, [cq+64* 6] |
| mova m12, [cq+64* 7] |
| psubsw m29, m4, m21 ; out47 out46 |
| paddsw m4, m21 ; out16 out17 |
| psubsw m28, m5, m20 ; out44 out45 |
| paddsw m5, m20 ; out19 out18 |
| REPX {pmulhrsw x, m13}, m0, m1, m2, m3 |
| psubsw m27, m6, m19 ; out43 out42 |
| paddsw m6, m19 ; out20 out21 |
| psubsw m26, m7, m18 ; out40 out41 |
| paddsw m7, m18 ; out23 out22 |
| pmulhrsw m18, m13, m22 |
| pmulhrsw m19, m13, m23 |
| pmulhrsw m20, m13, m24 |
| pmulhrsw m21, m13, m25 |
| paddsw m25, m12, m14 ; out31 out30 |
| psubsw m14, m12, m14 ; out32 out33 |
| paddsw m24, m11, m15 ; out28 out29 |
| psubsw m15, m11, m15 ; out35 out34 |
| REPX {pmulhrsw x, m13}, m4, m5, m6, m7 |
| paddsw m23, m9, m16 ; out27 out26 |
| psubsw m16, m9, m16 ; out36 out37 |
| paddsw m22, m8, m17 ; out24 out25 |
| psubsw m17, m8, m17 ; out39 out38 |
| REPX {pmulhrsw x, m13}, m14, m15, m16, m17 |
| .transpose_round: |
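; 16x16 word transpose done as four packed 8x4 transposes (word/dword
; interleaves) followed by 128-bit lane shuffles across the blocks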
| %macro TRANSPOSE_8x4_PACKED 4 |
| punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3 |
| punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3 |
| punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3 |
| punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3 |
| punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3 |
| punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1 |
| punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3 |
| punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1 |
| punpcklwd m%3, m%4, m%2 ; 2 |
| punpckhwd m%4, m%2 ; 3 |
| punpckhwd m%2, m%1, m8 ; 1 |
| punpcklwd m%1, m8 ; 0 |
| %endmacro |
| TRANSPOSE_8x4_PACKED 0, 1, 2, 3 |
| TRANSPOSE_8x4_PACKED 18, 19, 20, 21 |
| TRANSPOSE_8x4_PACKED 4, 5, 6, 7 |
| TRANSPOSE_8x4_PACKED 14, 15, 16, 17 |
| vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03 |
| vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01 |
| vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13 |
| vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11 |
| vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23 |
| vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21 |
| vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33 |
| vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31 |
| vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03 |
| vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01 |
| vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13 |
| vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11 |
| vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23 |
| vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21 |
| vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33 |
| vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31 |
| ret |
| .pass2: |
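; collect the 16 second-pass input rows (see the lane comments) from the
; transposed blocks, then run the second pass via the 32x8/32x16 helpers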
| vshufi32x4 m7, m5, m19, q3131 ; 14 |
| vshufi32x4 m5, m19, q2020 ; 10 |
| vshufi32x4 m21, m6, m20, q3131 ; 15 |
| vshufi32x4 m19, m6, m20, q2020 ; 11 |
| vshufi32x4 m20, m4, m18, q3131 ; 13 |
| vshufi32x4 m18, m4, m18, q2020 ; 9 |
| vshufi32x4 m6, m8, m2, q3131 ; 12 |
| vshufi32x4 m4, m8, m2, q2020 ; 8 |
| vshufi32x4 m2, m0, m3, q3131 ; 4 |
| vshufi32x4 m0, m3, q2020 ; 0 |
| vshufi32x4 m3, m1, m16, q3131 ; 6 |
| vshufi32x4 m1, m16, q2020 ; 2 |
| vshufi32x4 m16, m9, m15, q3131 ; 5 |
| vshufi32x4 m14, m9, m15, q2020 ; 1 |
| vshufi32x4 m15, m11, m17, q2020 ; 3 |
| vshufi32x4 m17, m11, m17, q3131 ; 7 |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 |
| jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf |
| |
| cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob |
| lea r5, [o_base] |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob |
| vpbroadcastd m23, [o(pw_2896x8)] |
| %undef cmp |
| cmp eobd, 136 |
| jb .fast |
| pmulhrsw m5, m23, [cq+64*20] |
| pmulhrsw m3, m23, [cq+64*12] |
| pmulhrsw m1, m23, [cq+64* 4] |
| pmulhrsw m7, m23, [cq+64*28] |
| pmulhrsw m2, m23, [cq+64* 8] |
| pmulhrsw m6, m23, [cq+64*24] |
| pmulhrsw m0, m23, [cq+64* 0] |
| pmulhrsw m4, m23, [cq+64*16] |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main |
| pmulhrsw m14, m23, [cq+64* 2] |
| pmulhrsw m21, m23, [cq+64*30] |
| pmulhrsw m18, m23, [cq+64*18] |
| pmulhrsw m17, m23, [cq+64*14] |
| pmulhrsw m16, m23, [cq+64*10] |
| pmulhrsw m19, m23, [cq+64*22] |
| pmulhrsw m20, m23, [cq+64*26] |
| pmulhrsw m15, m23, [cq+64* 6] |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf |
| mova [cq+64* 0], m14 |
| mova [cq+64* 2], m15 |
| mova [cq+64* 4], m16 |
| mova [cq+64* 6], m17 |
| mova [cq+64* 8], m18 |
| mova [cq+64*10], m19 |
| mova [cq+64*12], m20 |
| mova [cq+64*14], m21 |
| pmulhrsw m22, m23, [cq+64* 1] |
| pmulhrsw m21, m23, [cq+64*31] |
| pmulhrsw m14, m23, [cq+64*17] |
| pmulhrsw m29, m23, [cq+64*15] |
| pmulhrsw m26, m23, [cq+64* 9] |
| pmulhrsw m17, m23, [cq+64*23] |
| pmulhrsw m18, m23, [cq+64*25] |
| pmulhrsw m25, m23, [cq+64* 7] |
| pmulhrsw m24, m23, [cq+64* 5] |
| pmulhrsw m19, m23, [cq+64*27] |
| pmulhrsw m16, m23, [cq+64*21] |
| pmulhrsw m27, m23, [cq+64*11] |
| pmulhrsw m28, m23, [cq+64*13] |
| pmulhrsw m15, m23, [cq+64*19] |
| pmulhrsw m20, m23, [cq+64*29] |
| pmulhrsw m23, [cq+64* 3] |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf |
| vpbroadcastd m12, [o(pw_16384)] |
| psubsw m13, m0, m29 ; 31 |
| paddsw m0, m29 ; 0 |
| psubsw m29, m1, m28 ; 30 |
| paddsw m1, m28 ; 1 |
| psubsw m28, m2, m27 ; 29 |
| paddsw m2, m27 ; 2 |
| psubsw m27, m3, m26 ; 28 |
| paddsw m3, m26 ; 3 |
| psubsw m26, m4, m25 ; 27 |
| paddsw m4, m25 ; 4 |
| psubsw m25, m5, m24 ; 26 |
| paddsw m5, m24 ; 5 |
| psubsw m24, m6, m23 ; 25 |
| paddsw m6, m23 ; 6 |
| psubsw m23, m7, m22 ; 24 |
| paddsw m7, m22 ; 7 |
| pxor m9, m9 |
| punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 |
| punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 |
| punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 |
| punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 |
| REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 |
| punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 |
| punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 |
| punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 |
| punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 |
| REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 |
| punpckhwd m3, m23, m24 |
| punpcklwd m23, m24 |
| punpckhwd m24, m25, m26 |
| punpcklwd m25, m26 |
| REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 |
| punpckhwd m26, m27, m28 |
| punpcklwd m27, m28 |
| punpckhwd m28, m29, m13 |
| punpcklwd m29, m13 |
| REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 |
| punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 |
| punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 |
| REPX {pmulhrsw x, m12}, m7, m0, m2, m4 |
| punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 |
| punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 |
| punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 |
punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5
| REPX {pmulhrsw x, m12}, m6, m8, m1, m22 |
| punpckhdq m13, m23, m25 |
| punpckldq m23, m25 |
| punpckhdq m25, m27, m29 |
| punpckldq m27, m29 |
| REPX {pmulhrsw x, m12}, m13, m23, m25, m27 |
| punpckhdq m9, m3, m24 |
| punpckldq m3, m24 |
| punpckhdq m24, m26, m28 |
| punpckldq m26, m28 |
| REPX {pmulhrsw x, m12}, m9, m3, m24, m26 |
| punpckhqdq m5, m23, m27 ; d01 d09 d17 d25 |
| punpcklqdq m23, m27 ; d00 d08 d16 d24 |
| punpcklqdq m27, m13, m25 ; d02 d10 d18 d26 |
| punpckhqdq m13, m25 ; d03 d11 d19 d27 |
| punpcklqdq m25, m3, m26 ; d04 d12 d20 d28 |
| punpckhqdq m3, m26 ; d05 d13 d21 d29 |
| punpcklqdq m26, m9, m24 ; d06 d14 d22 d30 |
| punpckhqdq m9, m24 ; d07 d15 d23 d31 |
| mova [cq+64* 3], m23 |
| mova [cq+64*13], m27 |
| mova [cq+64* 7], m25 |
| mova [cq+64*15], m26 |
| punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 |
| punpcklqdq m8, m22 ; a04 a12 a20 a28 |
| punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 |
| punpcklqdq m0, m4 ; a00 a08 a16 a24 |
| punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 |
| punpcklqdq m7, m2 ; a02 a10 a18 a26 |
| punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 |
| punpcklqdq m6, m1 ; a06 a14 a22 a30 |
| mova [cq+64* 1], m0 |
| mova [cq+64* 9], m7 |
| mova [cq+64* 5], m8 |
| mova [cq+64*11], m6 |
| mova m2, [cq+64* 0] |
| mova m11, [cq+64* 2] |
| mova m8, [cq+64* 4] |
| mova m29, [cq+64* 6] |
| mova m27, [cq+64* 8] |
| mova m26, [cq+64*10] |
| mova m4, [cq+64*12] |
| mova m28, [cq+64*14] |
| psubsw m1, m2, m21 ; 23 |
| paddsw m2, m21 ; 8 |
| psubsw m21, m11, m20 ; 22 |
| paddsw m11, m20 ; 9 |
| psubsw m20, m8, m19 ; 21 |
| paddsw m8, m19 ; 10 |
| psubsw m19, m29, m18 ; 20 |
| paddsw m29, m18 ; 11 |
| psubsw m18, m27, m17 ; 19 |
| paddsw m27, m17 ; 12 |
| psubsw m17, m26, m16 ; 18 |
| paddsw m26, m16 ; 13 |
| psubsw m16, m4, m15 ; 17 |
| paddsw m4, m15 ; 14 |
| psubsw m15, m28, m14 ; 16 |
| paddsw m28, m14 ; 15 |
| punpcklwd m14, m15, m16 |
| punpckhwd m15, m16 |
| punpckhwd m16, m17, m18 |
| punpcklwd m17, m18 |
| punpckhwd m18, m19, m20 |
| punpcklwd m19, m20 |
| punpckhwd m20, m21, m1 |
| punpcklwd m21, m1 |
| punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 |
punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3
| punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 |
| punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 |
| punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 |
| punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 |
| punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 |
| punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 |
| punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 |
| punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 |
| punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3 |
| punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1 |
| REPX {pmulhrsw x, m12}, m28, m2, m8, m27 |
| punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 |
| punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 |
| punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 |
| punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 |
| REPX {pmulhrsw x, m12}, m4, m1, m11, m29 |
| punpckhdq m26, m19, m21 |
| punpckldq m19, m21 |
| punpckhdq m21, m15, m16 |
| punpckldq m15, m16 |
| REPX {pmulhrsw x, m12}, m26, m19, m21, m15 |
| punpckhdq m16, m18, m20 |
| punpckldq m18, m20 |
| punpckhdq m20, m14, m17 |
| punpckldq m14, m17 |
| REPX {pmulhrsw x, m12}, m16, m18, m20, m14 |
| punpckhqdq m17, m28, m8 ; b03 b11 b19 b27 |
| punpcklqdq m28, m8 ; b02 b10 b18 b26 |
| punpckhqdq m8, m2, m27 ; b01 b09 b17 b25 |
| punpcklqdq m2, m27 ; b00 b08 b16 b24 |
| punpcklqdq m27, m1, m29 ; b04 b12 b20 b28 |
| punpckhqdq m1, m29 ; b05 b13 b21 b29 |
| punpcklqdq m29, m4, m11 ; b06 b14 b22 b30 |
| punpckhqdq m4, m11 ; b07 b15 b23 b31 |
| mova [cq+64* 0], m2 |
| mova [cq+64* 8], m28 |
| mova [cq+64* 4], m27 |
| mova [cq+64*10], m29 |
| punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 |
| punpcklqdq m20, m26 ; c02 c10 c18 c26 |
| punpckhqdq m26, m14, m19 ; c01 c09 c17 c25 |
| punpcklqdq m14, m19 ; c00 c08 c16 c24 |
| punpckhqdq m28, m15, m18 ; c05 c13 c21 c29 |
| punpcklqdq m15, m18 ; c04 c12 c20 c28 |
| punpckhqdq m29, m21, m16 ; c07 c15 c23 c31 |
| punpcklqdq m21, m16 ; c06 c14 c22 c30 |
| mova [cq+64* 2], m14 |
| mova [cq+64*12], m20 |
| mova [cq+64* 6], m15 |
| mova [cq+64*14], m21 |
| vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25 |
| vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09 |
| vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27 |
| vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11 |
| vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29 |
| vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13 |
| vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31 |
| vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15 |
| vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09 |
| vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25 |
| vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11 |
| vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27 |
| vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13 |
| vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29 |
| vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15 |
| vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31 |
| mov r4, rsp |
| vshufi32x4 m0, m22, m19, q2020 ; 1 |
| vshufi32x4 m1, m17, m29, q3131 ; 31 |
| vshufi32x4 m2, m14, m26, q2020 ; 17 |
| vshufi32x4 m3, m25, m18, q3131 ; 15 |
| call .main_part1 |
| vshufi32x4 m0, m25, m18, q2020 ; 7 |
| vshufi32x4 m1, m14, m26, q3131 ; 25 |
| vshufi32x4 m2, m17, m29, q2020 ; 23 |
| vshufi32x4 m3, m22, m19, q3131 ; 9 |
| call .main_part1 |
| vshufi32x4 m0, m24, m21, q2020 ; 5 |
| vshufi32x4 m1, m15, m27, q3131 ; 27 |
| vshufi32x4 m2, m16, m28, q2020 ; 21 |
| vshufi32x4 m3, m23, m20, q3131 ; 11 |
| call .main_part1 |
| vshufi32x4 m0, m23, m20, q2020 ; 3 |
| vshufi32x4 m1, m16, m28, q3131 ; 29 |
| vshufi32x4 m2, m15, m27, q2020 ; 19 |
| vshufi32x4 m3, m24, m21, q3131 ; 13 |
| call .main_part1 |
| call .main_part2 |
| mova m0, [cq+64* 1] ; a0 |
| mova m15, [cq+64* 0] ; b0 |
| mova m3, [cq+64* 2] ; c0 |
| mova m16, [cq+64* 3] ; d0 |
| mova m14, [cq+64* 5] ; a4 |
| mova m8, [cq+64* 4] ; b4 |
| mova m17, [cq+64* 6] ; c4 |
| mova m1, [cq+64* 7] ; d4 |
| vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24 |
| vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08 |
| vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24 |
| vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08 |
| vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28 |
| vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12 |
| vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28 |
| vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12 |
| vshufi32x4 m1, m0, m3, q3131 ; 8 |
| vshufi32x4 m0, m3, q2020 ; 0 |
| vshufi32x4 m3, m2, m15, q3131 ; 24 |
| vshufi32x4 m2, m15, q2020 ; 16 |
| vshufi32x4 m15, m14, m17, q3131 ; 12 |
| vshufi32x4 m14, m17, q2020 ; 4 |
| vshufi32x4 m17, m16, m8, q3131 ; 28 |
| vshufi32x4 m16, m8, q2020 ; 20 |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast |
| mova m8, [cq+64* 8] |
| mova m9, [cq+64*12] |
| mova m11, [cq+64*10] |
| mova m12, [cq+64*14] |
| mova [cq+64* 0], m14 |
| mova [cq+64* 2], m15 |
| mova [cq+64* 4], m16 |
| mova [cq+64* 6], m17 |
| mova [cq+64* 8], m18 |
| mova [cq+64*10], m19 |
| mova [cq+64*12], m20 |
| mova [cq+64*14], m21 |
| mova m22, [cq+64* 9] |
| mova m27, [cq+64*13] |
| mova m23, [cq+64*11] |
| mova m24, [cq+64*15] |
| vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26 |
| vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10 |
| vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26 |
| vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10 |
| vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30 |
| vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14 |
| vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30 |
| vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14 |
| vshufi32x4 m28, m26, m8, q3131 ; 26 |
| vshufi32x4 m26, m8, q2020 ; 18 |
| vshufi32x4 m24, m22, m9, q3131 ; 10 |
| vshufi32x4 m22, m9, q2020 ; 2 |
| vshufi32x4 m29, m27, m11, q3131 ; 30 |
| vshufi32x4 m27, m11, q2020 ; 22 |
| vshufi32x4 m25, m23, m12, q3131 ; 14 |
| vshufi32x4 m23, m12, q2020 ; 6 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast |
| jmp .end |
| .fast: ; bottom/right halves are zero |
| pmulhrsw ym9, ym23, [cq+64* 0] |
| pmulhrsw ym6, ym23, [cq+64* 8] |
| mova m14, [o(dup16_perm)] |
| pmulhrsw ym8, ym23, [cq+64* 2] |
| pmulhrsw xm0, xm23, [cq+64*14] |
| pmulhrsw xm5, xm23, [cq+64*10] |
| pmulhrsw ym1, ym23, [cq+64* 6] |
| pmulhrsw ym7, ym23, [cq+64* 4] |
| pmulhrsw xm3, xm23, [cq+64*12] |
| pmovzxwd m9, ym9 |
| pmovzxwd m6, ym6 |
| vpermb m8, m14, m8 |
| punpcklwd xm0, xm0 |
| vpermb ym5, ym14, ym5 |
| vpermb m1, m14, m1 |
| vpermb m7, m14, m7 |
| punpcklwd xm3, xm3 |
| pslld m9, 16 |
| pslld m6, 16 |
| call m(idct_16x16_internal_8bpc).main_fast |
| vpmulhrsw ym21, ym23, [cq+64* 1] |
| {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which |
| {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to |
| {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements |
| {evex}vpmulhrsw ym18, ym23, [cq+64* 5] |
| {evex}vpmulhrsw xm16, xm23, [cq+64*11] |
| {evex}vpmulhrsw xm19, xm23, [cq+64*13] |
| {evex}vpmulhrsw ym23, [cq+64* 3] |
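; (EVEX scales 8-bit displacements by the memory operand size, so the
; [cq+64*n] offsets above fit in one displacement byte instead of four)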
| vpermb m21, m14, m21 |
| punpcklwd xm17, xm17 |
| vpermb ym20, ym14, ym20 |
| vpermb m15, m14, m15 |
| vpermb m18, m14, m18 |
| vpermb ym16, ym14, ym16 |
| punpcklwd xm19, xm19 |
| vpermb m14, m14, m23 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast |
| vpbroadcastd m9, [o(pw_16384)] |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round |
| vshufi32x4 m16, m0, m3, q2020 ; 0 |
| vshufi32x4 m26, m0, m3, q3131 ; 4 |
| vshufi32x4 m0, m14, m2, q2020 ; 1 |
| vshufi32x4 m14, m2, q3131 ; 5 |
| vshufi32x4 m3, m19, m7, q3131 ; 15 |
| vshufi32x4 m19, m7, q2020 ; 11 |
| vshufi32x4 m27, m17, m9, q2020 ; 3 |
| vshufi32x4 m17, m9, q3131 ; 7 |
| vshufi32x4 m28, m20, m6, q2020 ; 9 |
| vshufi32x4 m20, m6, q3131 ; 13 |
| vshufi32x4 m22, m1, m18, q2020 ; 2 |
| vshufi32x4 m23, m1, m18, q3131 ; 6 |
| vshufi32x4 m24, m5, m15, q2020 ; 10 |
| vshufi32x4 m25, m5, m15, q3131 ; 14 |
| vshufi32x4 m15, m21, m4, q3131 ; 12 |
| vshufi32x4 m21, m21, m4, q2020 ; 8 |
| mov r4, rsp |
| call .main_part1_fast |
| mova m0, m17 |
| mova m3, m28 |
| call .main_part1_fast |
| mova m0, m14 |
| mova m3, m19 |
| call .main_part1_fast |
| mova m0, m27 |
| mova m3, m20 |
| call .main_part1_fast |
| call .main_part2 |
| mova m0, m16 |
| mova m1, m21 |
| mova m14, m26 |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 |
| mova [cq+64*14], m21 |
| mova [cq+64* 0], m14 |
| mova [cq+64* 6], m17 |
| mova [cq+64* 8], m18 |
| mova [cq+64*10], m19 |
| mova [cq+64* 4], m16 |
| mova [cq+64* 2], m15 |
| mova [cq+64*12], m20 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 |
| .end: |
| lea r4, [strideq*3] |
| vpbroadcastd m12, [o(pw_2048)] |
| movshdup m13, [o(permD)] |
| lea r5, [r4+strideq] ; stride*4 |
| lea r3, [dstq+r4*8] |
| lea r6, [strideq+r5*8] ; stride*33 |
| lea r8, [r4+r5*8] ; stride*35 |
| add r3, r5 ; dst+stride*28 |
| lea r7, [r6+strideq] ; stride*34 |
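; fold the idct64 odd half kept on the stack into the idct32 results,
; round by pw_2048 (m12) and add four reconstructed rows to dst,
; clearing the consumed coeff rows in cq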
| %macro IDCT_32x64_END 6 ; src, mem, stride[1-4] |
| %if %2 < 8 |
| paddsw m10, m%2, m%1 |
| psubsw m11, m%2, m%1 |
| %else |
| mova m11, [cq+64*(%2*2-16)] |
| paddsw m10, m11, m%1 |
| psubsw m11, m%1 |
| %endif |
| mova m9, [rsp+64*(31-%2)] |
| mova m%1, [rsp+64*%2] |
| paddsw m8, m10, m9 |
| psubsw m10, m9 |
| paddsw m9, m11, m%1 |
| pmovzxbw m0, [dstq+%3] |
| psubsw m11, m%1 |
| pmovzxbw m%1, [r3 +%4] |
| REPX {pmulhrsw x, m12}, m8, m10, m9, m11 |
| paddw m8, m0 |
| pmovzxbw m0, [r3 +%5] |
| paddw m10, m%1 |
| pmovzxbw m%1, [dstq+%6] |
| paddw m9, m0 |
| paddw m11, m%1 |
| %if %2 >= 8 |
| %if %2 == 8 |
| pxor m1, m1 |
| %endif |
| mova [cq+64*(%2*2-16)], m1 |
| mova [cq+64*(%2*2-15)], m1 |
| %endif |
| packuswb m8, m10 |
| packuswb m9, m11 |
| vpermq m8, m13, m8 |
| vpermq m9, m13, m9 |
| mova [dstq+%3], ym8 |
| vextracti32x8 [r3 +%4], m8, 1 |
| mova [r3 +%5], ym9 |
| vextracti32x8 [dstq+%6], m9, 1 |
| %if %2 == 3 || %2 == 7 || %2 == 11 |
| add dstq, r5 |
| sub r3, r5 |
| %endif |
| %endmacro |
| IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8 |
| IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6 |
| IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7 |
| IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8 |
| IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8 |
| IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6 |
| IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7 |
| IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8 |
| IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8 |
| IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6 |
| IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7 |
| IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8 |
| IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8 |
| IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6 |
| IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7 |
| IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8 |
| RET |
| .dconly: |
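; dc-only path: 181/256 ~= 1/sqrt(2), so the two scaled multiplies fold
; the gains of both passes into the dc value before the shared add loop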
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 64 |
| imul r6d, 181 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| add r6d, 128+256 |
| sar r6d, 8+1 |
| jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 |
| ALIGN function_align ; bottom three-quarters are zero |
| .main_part1_fast: |
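; with only the first input of each pair nonzero, the first butterfly
; stage reduces to register copies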
| vpbroadcastd m1, [o(idct64_mul+4*0)] |
| vpbroadcastd m8, [o(idct64_mul+4*1)] |
| vpbroadcastd m2, [o(idct64_mul+4*6)] |
| vpbroadcastd m9, [o(idct64_mul+4*7)] |
| pmulhrsw m1, m0 ; t63a |
| pmulhrsw m0, m8 ; t32a |
| pmulhrsw m2, m3 ; t60a |
| pmulhrsw m3, m9 ; t35a |
| mova m8, m0 |
| mova m7, m1 |
| mova m6, m3 |
| mova m5, m2 |
| jmp .main_part1b |
| .main_part1: |
| ; idct64 steps 1-5: |
| ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a |
| ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a |
| ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a |
| ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a |
| vpbroadcastd m7, [o(idct64_mul+4*0)] |
| vpbroadcastd m8, [o(idct64_mul+4*1)] |
| vpbroadcastd m6, [o(idct64_mul+4*2)] |
| vpbroadcastd m9, [o(idct64_mul+4*3)] |
| pmulhrsw m7, m0 ; t63a |
| vpbroadcastd m5, [o(idct64_mul+4*4)] |
| pmulhrsw m0, m8 ; t32a |
| vpbroadcastd m8, [o(idct64_mul+4*5)] |
| pmulhrsw m6, m1 ; t62a |
| vpbroadcastd m4, [o(idct64_mul+4*6)] |
| pmulhrsw m1, m9 ; t33a |
| vpbroadcastd m9, [o(idct64_mul+4*7)] |
| pmulhrsw m5, m2 ; t61a |
| pmulhrsw m2, m8 ; t34a |
| pmulhrsw m4, m3 ; t60a |
| pmulhrsw m3, m9 ; t35a |
| psubsw m8, m0, m1 ; t33 |
| paddsw m0, m1 ; t32 |
| psubsw m1, m7, m6 ; t62 |
| paddsw m7, m6 ; t63 |
| psubsw m6, m3, m2 ; t34 |
| paddsw m3, m2 ; t35 |
| psubsw m2, m4, m5 ; t61 |
| paddsw m5, m4 ; t60 |
| .main_part1b: |
| vpbroadcastd m11, [o(idct64_mul+4*8)] |
| vpbroadcastd m12, [o(idct64_mul+4*9)] |
| ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a |
| vpbroadcastd m11, [o(idct64_mul+4*10)] |
| ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a |
| vpbroadcastd m11, [o(idct64_mul+4*11)] |
| vpbroadcastd m12, [o(idct64_mul+4*12)] |
| psubsw m4, m0, m3 ; t35a |
| paddsw m0, m3 ; t32a |
| psubsw m3, m7, m5 ; t60a |
| paddsw m7, m5 ; t63a |
| psubsw m5, m1, m2 ; t34 |
| paddsw m1, m2 ; t33 |
| psubsw m2, m8, m6 ; t61 |
| paddsw m6, m8 ; t62 |
| add r5, 4*13 |
| ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60 |
| ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a |
| mova [r4+64*0], m0 |
| mova [r4+64*7], m7 |
| mova [r4+64*1], m1 |
| mova [r4+64*6], m6 |
| mova [r4+64*3], m3 |
| mova [r4+64*4], m4 |
| mova [r4+64*2], m2 |
| mova [r4+64*5], m5 |
| add r4, 64*8 |
| ret |
| .main_part2: |
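; final idct64 butterfly/rotation stages over the 32 rows written by
; main_part1; each of the four loop iterations pairs rows from both ends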
| vpbroadcastd m11, [o(pw_1567_3784 -16*13)] |
| vpbroadcastd m12, [o(pw_m3784_1567 -16*13)] |
| lea r6, [r4+64*7] |
| vpbroadcastd m17, [o(pw_m1567_m3784-16*13)] |
| vpbroadcastd m18, [o(pw_2896_2896 -16*13)] |
| vpbroadcastd m19, [o(pw_m2896_2896 -16*13)] |
| sub r5, 16*13 |
| .main_part2_loop: |
| mova m0, [r4-64*32] ; t32a |
| mova m1, [r6-64*24] ; t39a |
| mova m2, [r6-64*32] ; t63a |
| mova m3, [r4-64*24] ; t56a |
| mova m4, [r4-64*16] ; t40a |
| mova m5, [r6-64* 8] ; t47a |
| mova m6, [r6-64*16] ; t55a |
| mova m7, [r4-64* 8] ; t48a |
| psubsw m8, m0, m1 ; t39 |
| paddsw m0, m1 ; t32 |
| psubsw m1, m2, m3 ; t56 |
| paddsw m2, m3 ; t63 |
| psubsw m3, m5, m4 ; t40 |
| paddsw m5, m4 ; t47 |
| psubsw m4, m7, m6 ; t55 |
| paddsw m7, m6 ; t48 |
| ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a |
| ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a |
| psubsw m6, m2, m7 ; t48a |
| paddsw m2, m7 ; t63a |
| psubsw m7, m0, m5 ; t47a |
| paddsw m0, m5 ; t32a |
| psubsw m5, m8, m3 ; t55 |
| paddsw m8, m3 ; t56 |
| psubsw m3, m1, m4 ; t40 |
| paddsw m1, m4 ; t39 |
| ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48 |
| ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a |
| mova [r6-64* 8], m2 |
| mova [r4-64*32], m0 |
| mova [r4-64* 8], m8 |
| mova [r6-64*32], m1 |
| mova [r6-64*24], m6 |
| mova [r4-64*16], m7 |
| mova [r4-64*24], m5 |
| mova [r6-64*16], m3 |
| add r4, 64 |
| sub r6, 64 |
| cmp r4, r6 |
| jb .main_part2_loop |
| ret |
| |
| cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob |
| lea r5, [o_base] |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob |
| vpbroadcastd m23, [o(pw_2896x8)] |
| %undef cmp |
| cmp eobd, 136 |
| jb .fast |
| pmulhrsw m0, m23, [cq+64* 1] |
| pmulhrsw m1, m23, [cq+64*31] |
| pmulhrsw m2, m23, [cq+64*17] |
| pmulhrsw m3, m23, [cq+64*15] |
| vpbroadcastd m10, [o(pd_2048)] |
| mov r4, rsp |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| pmulhrsw m0, m23, [cq+64* 7] |
| pmulhrsw m1, m23, [cq+64*25] |
| pmulhrsw m2, m23, [cq+64*23] |
| pmulhrsw m3, m23, [cq+64* 9] |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| pmulhrsw m0, m23, [cq+64* 5] |
| pmulhrsw m1, m23, [cq+64*27] |
| pmulhrsw m2, m23, [cq+64*21] |
| pmulhrsw m3, m23, [cq+64*11] |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| pmulhrsw m0, m23, [cq+64* 3] |
| pmulhrsw m1, m23, [cq+64*29] |
| pmulhrsw m2, m23, [cq+64*19] |
| pmulhrsw m3, m23, [cq+64*13] |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 |
| pmulhrsw m3, m23, [cq+64*24] |
| pmulhrsw m1, m23, [cq+64* 8] |
| pmulhrsw m2, m23, [cq+64*16] |
| pmulhrsw m0, m23, [cq+64* 0] |
| pmulhrsw m14, m23, [cq+64* 4] |
| pmulhrsw m17, m23, [cq+64*28] |
| pmulhrsw m16, m23, [cq+64*20] |
| pmulhrsw m15, m23, [cq+64*12] |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast |
| pmulhrsw m22, m23, [cq+64* 2] |
| pmulhrsw m29, m23, [cq+64*30] |
| pmulhrsw m26, m23, [cq+64*18] |
| pmulhrsw m25, m23, [cq+64*14] |
| pmulhrsw m24, m23, [cq+64*10] |
| pmulhrsw m27, m23, [cq+64*22] |
| pmulhrsw m28, m23, [cq+64*26] |
| pmulhrsw m23, [cq+64* 6] |
| mova [cq+64* 0], m14 |
| mova [cq+64* 1], m15 |
| mova [cq+64* 2], m16 |
| mova [cq+64* 3], m17 |
| mova [cq+64* 4], m18 |
| mova [cq+64* 5], m19 |
| mova [cq+64* 6], m20 |
| mova [cq+64* 7], m21 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast |
| vpbroadcastd m13, [o(pw_16384)] |
| call .pass1_end_part1 |
| mova [cq+64*16], m1 |
| mova [cq+64*17], m3 |
| mova [cq+64*18], m5 |
| mova [cq+64*19], m7 |
| mova [cq+64*24], m23 |
| mova [cq+64*25], m25 |
| mova [cq+64*26], m27 |
| mova [cq+64*27], m29 |
| pmulhrsw m23, m13, m0 ; a0 |
| pmulhrsw m25, m13, m2 ; a2 |
| pmulhrsw m27, m13, m4 ; a4 |
| pmulhrsw m29, m13, m6 ; a6 |
| REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6 |
| call .pass1_end_part2 |
| mova [cq+64*20], m15 |
| mova [cq+64*21], m17 |
| mova [cq+64*22], m19 |
| mova [cq+64*23], m21 |
| mova [cq+64*28], m1 |
| mova [cq+64*29], m3 |
| mova [cq+64*30], m5 |
| mova [cq+64*31], m7 |
| REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6 |
| REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6 |
| vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01 |
| vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03 |
| vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01 |
| vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03 |
| vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41 |
| vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43 |
| vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 |
| vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43 |
| vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21 |
| vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23 |
| vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21 |
| vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23 |
| vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61 |
| vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 |
| vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61 |
| vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63 |
| vshufi32x4 m2, m3, m15, q3131 ; 8 |
| vshufi32x4 m0, m3, m15, q2020 ; 0 |
| vshufi32x4 m6, m23, m22, q3131 ; 24 |
| vshufi32x4 m4, m23, m22, q2020 ; 16 |
| vshufi32x4 m3, m1, m18, q3131 ; 12 |
| vshufi32x4 m1, m18, q2020 ; 4 |
| vshufi32x4 m7, m27, m26, q3131 ; 28 |
| vshufi32x4 m5, m27, m26, q2020 ; 20 |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main |
| vshufi32x4 m16, m14, m17, q3131 ; 10 |
| vshufi32x4 m14, m17, q2020 ; 2 |
| vshufi32x4 m17, m19, m20, q3131 ; 14 |
| vshufi32x4 m15, m19, m20, q2020 ; 6 |
| vshufi32x4 m20, m25, m24, q3131 ; 26 |
| vshufi32x4 m18, m25, m24, q2020 ; 18 |
| vshufi32x4 m21, m29, m28, q3131 ; 30 |
| vshufi32x4 m19, m29, m28, q2020 ; 22 |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf |
| pmulhrsw m22, m13, [cq+64*16] ; a1 |
| pmulhrsw m23, m13, [cq+64*20] ; c1 |
| pmulhrsw m24, m13, [cq+64*24] ; e1 |
| pmulhrsw m25, m13, [cq+64*28] ; g1 |
| pmulhrsw m26, m13, [cq+64*17] ; a3 |
| pmulhrsw m27, m13, [cq+64*21] ; c3 |
| pmulhrsw m28, m13, [cq+64*25] ; e3 |
| pmulhrsw m29, m13, [cq+64*29] ; g3 |
| mova [cq+64* 8], m14 |
| mova [cq+64* 9], m15 |
| mova [cq+64*10], m16 |
| mova [cq+64*11], m17 |
| mova [cq+64*12], m18 |
| mova [cq+64*13], m19 |
| mova [cq+64*14], m20 |
| mova [cq+64*15], m21 |
| pmulhrsw m14, m13, [cq+64*18] ; a5 |
| pmulhrsw m15, m13, [cq+64*22] ; c5 |
| pmulhrsw m16, m13, [cq+64*26] ; e5 |
| pmulhrsw m17, m13, [cq+64*30] ; g5 |
| pmulhrsw m18, m13, [cq+64*19] ; a7 |
| pmulhrsw m19, m13, [cq+64*23] ; c7 |
| pmulhrsw m20, m13, [cq+64*27] ; e7 |
| pmulhrsw m21, m13, [cq+64*31] ; g7 |
| vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11 |
| vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13 |
| vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11 |
| vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13 |
| vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31 |
| vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33 |
| vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31 |
| vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33 |
| mova [cq+64* 0], m0 |
| mova [cq+64* 1], m1 |
| mova [cq+64* 2], m2 |
| mova [cq+64* 3], m3 |
| mova [cq+64* 4], m4 |
| mova [cq+64* 5], m5 |
| mova [cq+64* 6], m6 |
| mova [cq+64* 7], m7 |
| vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51 |
| vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53 |
| vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51 |
| vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53 |
| vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71 |
| vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73 |
| vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71 |
| vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73 |
| vshufi32x4 m27, m23, m11, q3131 ; 11 m27 |
| vshufi32x4 m23, m11, q2020 ; 3 m23 |
| vshufi32x4 m19, m26, m28, q3131 ; 27 m19 |
| vshufi32x4 m15, m26, m28, q2020 ; 19 m15 |
| vshufi32x4 m29, m25, m17, q3131 ; 15 m29 |
| vshufi32x4 m25, m17, q2020 ; 7 m25 |
| vshufi32x4 m21, m18, m20, q3131 ; 31 m21 |
| vshufi32x4 m17, m18, m20, q2020 ; 23 m17 |
| vshufi32x4 m20, m14, m16, q3131 ; 29 m20 |
| vshufi32x4 m16, m14, m16, q2020 ; 21 m16 |
| vshufi32x4 m18, m22, m24, q3131 ; 25 m18 |
| vshufi32x4 m14, m22, m24, q2020 ; 17 m14 |
| vshufi32x4 m26, m8, m9, q3131 ; 9 m26 |
| vshufi32x4 m22, m8, m9, q2020 ; 1 m22 |
| vshufi32x4 m28, m12, m13, q3131 ; 13 m28 |
| vshufi32x4 m24, m12, m13, q2020 ; 5 m24 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf |
| vpbroadcastd m13, [o(pw_16384)] |
| pmulhrsw m0, m13, [r4-64*21] |
| pmulhrsw m1, m13, [r4-64*22] |
| pmulhrsw m2, m13, [r4-64*23] |
| pmulhrsw m3, m13, [r4-64*24] |
| pmulhrsw m4, m13, [r4-64*25] |
| pmulhrsw m5, m13, [r4-64*26] |
| pmulhrsw m6, m13, [r4-64*27] |
| pmulhrsw m7, m13, [r4-64*28] |
| mova [cq+64*16], m14 |
| mova [cq+64*17], m15 |
| mova [cq+64*18], m16 |
| mova [cq+64*19], m17 |
| mova [cq+64*20], m18 |
| mova [cq+64*21], m19 |
| mova [cq+64*22], m20 |
| mova [cq+64*23], m21 |
| pmulhrsw m14, m13, [r4-64*12] |
| pmulhrsw m15, m13, [r4-64*11] |
| pmulhrsw m16, m13, [r4-64*10] |
| pmulhrsw m17, m13, [r4-64* 9] |
| pmulhrsw m18, m13, [r4-64* 8] |
| pmulhrsw m19, m13, [r4-64* 7] |
| pmulhrsw m20, m13, [r4-64* 6] |
| pmulhrsw m21, m13, [r4-64* 5] |
| mova [cq+64*24], m22 |
| mova [cq+64*25], m23 |
| mova [cq+64*26], m24 |
| mova [cq+64*27], m25 |
| mova [cq+64*28], m26 |
| mova [cq+64*29], m27 |
| mova [cq+64*30], m28 |
| mova [cq+64*31], m29 |
| call .transpose_2x8x8_lo |
| mova [r4-64*12], m1 |
| mova [r4-64*11], m3 |
| mova [r4-64*10], m5 |
| mova [r4-64* 9], m7 |
| mova [r4-64* 8], m15 |
| mova [r4-64* 7], m17 |
| mova [r4-64* 6], m19 |
| mova [r4-64* 5], m21 |
| vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01 |
| vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03 |
| vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21 |
| vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23 |
| vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41 |
| vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43 |
| vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61 |
| vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63 |
| pmulhrsw m0, m13, [r4-64*20] |
| pmulhrsw m1, m13, [r4-64*19] |
| pmulhrsw m2, m13, [r4-64*18] |
| pmulhrsw m3, m13, [r4-64*17] |
| pmulhrsw m4, m13, [r4-64*16] |
| pmulhrsw m5, m13, [r4-64*15] |
| pmulhrsw m6, m13, [r4-64*14] |
| pmulhrsw m7, m13, [r4-64*13] |
| pmulhrsw m14, m13, [r4-64*29] |
| pmulhrsw m15, m13, [r4-64*30] |
| pmulhrsw m16, m13, [r4-64*31] |
| pmulhrsw m17, m13, [r4-64*32] |
| pmulhrsw m18, m13, [r4-64*33] |
| pmulhrsw m19, m13, [r4-64*34] |
| pmulhrsw m20, m13, [r4-64*35] |
| pmulhrsw m21, m13, [r4-64*36] |
| call .transpose_2x8x8_lo |
| mova [r4-64*20], m1 |
| mova [r4-64*19], m3 |
| mova [r4-64*18], m5 |
| mova [r4-64*17], m7 |
| mova [r4-64*16], m15 |
| mova [r4-64*15], m17 |
| mova [r4-64*14], m19 |
| mova [r4-64*13], m21 |
| vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41 |
| vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43 |
| vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03 |
| vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01 |
| vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21 |
| vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23 |
| vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61 |
| vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63 |
| vshufi32x4 m2, m0, m22, q3131 ; 8 |
| vshufi32x4 m0, m22, q2020 ; 0 |
| vshufi32x4 m3, m1, m26, q3131 ; 12 |
| vshufi32x4 m1, m26, q2020 ; 4 |
| vshufi32x4 m6, m4, m23, q3131 ; 24 |
| vshufi32x4 m4, m23, q2020 ; 16 |
| vshufi32x4 m7, m5, m27, q3131 ; 28 |
| vshufi32x4 m5, m27, q2020 ; 20 |
| call m(inv_txfm_add_dct_dct_32x8_8bpc).main |
| vshufi32x4 m16, m14, m24, q3131 ; 10 |
| vshufi32x4 m14, m24, q2020 ; 2 |
| vshufi32x4 m17, m15, m28, q3131 ; 14 |
| vshufi32x4 m15, m28, q2020 ; 6 |
| vshufi32x4 m20, m18, m25, q3131 ; 26 |
| vshufi32x4 m18, m25, q2020 ; 18 |
| vshufi32x4 m21, m19, m29, q3131 ; 30 |
| vshufi32x4 m19, m29, q2020 ; 22 |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf |
| mova m22, [r4-64*20] |
| mova m26, [r4-64*16] |
| mova m23, [r4-64*19] |
| mova m27, [r4-64*15] |
| mova m24, [r4-64*18] |
| mova m28, [r4-64*14] |
| mova m25, [r4-64*17] |
| mova m29, [r4-64*13] |
| mova [r4-64*20], m14 |
| mova [r4-64*19], m15 |
| mova [r4-64*18], m16 |
| mova [r4-64*17], m17 |
| mova [r4-64*16], m18 |
| mova [r4-64*15], m19 |
| mova [r4-64*14], m20 |
| mova [r4-64*13], m21 |
| mova m19, [r4-64*12] |
| mova m11, [r4-64* 8] |
| mova m20, [r4-64*11] |
| mova m12, [r4-64* 7] |
| mova m21, [r4-64*10] |
| mova m8, [r4-64* 6] |
| mova m9, [r4-64* 9] |
| mova m18, [r4-64* 5] |
| vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13 |
| vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11 |
| vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33 |
| vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31 |
| vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53 |
| vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51 |
| vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73 |
| vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71 |
| vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11 |
| vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13 |
| vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31 |
| vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33 |
| vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51 |
| vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53 |
| vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71 |
| vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73 |
| vshufi32x4 m26, m22, m27, q3131 ; 9 |
| vshufi32x4 m22, m27, q2020 ; 1 |
| vshufi32x4 m27, m23, m28, q3131 ; 11 |
| vshufi32x4 m23, m28, q2020 ; 3 |
| vshufi32x4 m28, m24, m29, q3131 ; 13 |
| vshufi32x4 m24, m29, q2020 ; 5 |
| vshufi32x4 m29, m25, m8, q3131 ; 15 |
| vshufi32x4 m25, m8, q2020 ; 7 |
| vshufi32x4 m18, m14, m19, q3131 ; 25 |
| vshufi32x4 m14, m19, q2020 ; 17 |
| vshufi32x4 m19, m15, m20, q3131 ; 27 |
| vshufi32x4 m15, m20, q2020 ; 19 |
| vshufi32x4 m20, m16, m21, q3131 ; 29 |
| vshufi32x4 m16, m21, q2020 ; 21 |
| vshufi32x4 m21, m17, m9, q3131 ; 31 |
| vshufi32x4 m17, m9, q2020 ; 23 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf |
| jmp .end |
| .fast: ; bottom/right halves are zero |
| {evex}vpmulhrsw ym8, ym23, [cq+64* 4] |
| {evex}vpmulhrsw xm1, xm23, [cq+64*12] |
| mova m28, [o(dup16_perm)] |
| {evex}vpmulhrsw ym7, ym23, [cq+64* 8] |
| vpmulhrsw ym22, ym23, [cq+64* 0] |
| vpermb m8, m28, m8 |
| vpermb ym1, ym28, ym1 |
| vpermb m7, m28, m7 |
| pmovzxwd m9, ym22 |
| pslld m9, 16 |
| call m(idct_16x16_internal_8bpc).main_fast2 |
| {evex}vpmulhrsw ym21, ym23, [cq+64* 2] |
| {evex}vpmulhrsw xm15, xm23, [cq+64*14] |
| {evex}vpmulhrsw xm18, xm23, [cq+64*10] |
| {evex}vpmulhrsw ym14, ym23, [cq+64* 6] |
| vpermb m21, m28, m21 |
| punpcklwd xm15, xm15 |
| vpermb ym18, ym28, ym18 |
| vpermb m14, m28, m14 |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 |
| vpmulhrsw ym22, ym23, [cq+64* 1] |
| {evex}vpmulhrsw xm29, xm23, [cq+64*15] |
| {evex}vpmulhrsw xm26, xm23, [cq+64* 9] |
| {evex}vpmulhrsw ym25, ym23, [cq+64* 7] |
| {evex}vpmulhrsw ym24, ym23, [cq+64* 5] |
| {evex}vpmulhrsw xm27, xm23, [cq+64*11] |
| {evex}vpmulhrsw xm8, xm23, [cq+64*13] |
| {evex}vpmulhrsw ym23, [cq+64* 3] |
| vpermb m22, m28, m22 |
| punpcklwd xm29, xm29 |
| vpermb ym26, ym28, ym26 |
| vpermb m25, m28, m25 |
| mova [cq+64* 0], m14 |
| mova [cq+64* 1], m15 |
| mova [cq+64* 2], m16 |
| mova [cq+64* 3], m17 |
| REPX {vpermb x, m28, x}, m24, m27, m23 |
| punpcklwd xm28, xm8, xm8 |
| mova [cq+64* 4], m18 |
| mova [cq+64* 5], m19 |
| mova [cq+64* 6], m20 |
| mova [cq+64* 7], m21 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast |
| mov r4, rsp |
| vpbroadcastd m13, [o(pw_16384)] |
| mova [r4+64*16], m4 |
| mova [r4+64*17], m5 |
| mova [r4+64*18], m6 |
| mova [r4+64*19], m7 |
| mova [r4+64*28], m26 |
| mova [r4+64*29], m27 |
| mova [r4+64*30], m28 |
| mova [r4+64*31], m29 |
| call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end |
| mova [r4+64*20], m22 |
| mova [r4+64*21], m23 |
| mova [r4+64*22], m24 |
| mova [r4+64*23], m25 |
| mova [r4+64*24], m26 |
| mova [r4+64*25], m27 |
| mova [r4+64*26], m28 |
| mova [r4+64*27], m29 |
| call .pass2_fast |
| mova [cq+64* 8], m14 |
| mova [cq+64* 9], m15 |
| mova [cq+64*10], m16 |
| mova [cq+64*11], m17 |
| mova [cq+64*12], m18 |
| mova [cq+64*13], m19 |
| mova [cq+64*14], m20 |
| mova [cq+64*15], m21 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast |
| mova [cq+64* 0], m0 |
| mova [cq+64* 1], m1 |
| mova [cq+64* 2], m2 |
| mova [cq+64* 3], m3 |
| mova [cq+64* 4], m4 |
| mova [cq+64* 5], m5 |
| mova [cq+64* 6], m6 |
| mova [cq+64* 7], m7 |
| pmulhrsw m0, m13, [r4+64*16] |
| pmulhrsw m1, m13, [r4+64*17] |
| pmulhrsw m2, m13, [r4+64*18] |
| pmulhrsw m3, m13, [r4+64*19] |
| pmulhrsw m4, m13, [r4+64*20] |
| pmulhrsw m5, m13, [r4+64*21] |
| pmulhrsw m6, m13, [r4+64*22] |
| pmulhrsw m7, m13, [r4+64*23] |
| mova [cq+64*16], m14 |
| mova [cq+64*17], m15 |
| mova [cq+64*18], m16 |
| mova [cq+64*19], m17 |
| mova [cq+64*20], m18 |
| mova [cq+64*21], m19 |
| mova [cq+64*22], m20 |
| mova [cq+64*23], m21 |
| pmulhrsw m14, m13, [r4+64*24] |
| pmulhrsw m15, m13, [r4+64*25] |
| pmulhrsw m16, m13, [r4+64*26] |
| pmulhrsw m17, m13, [r4+64*27] |
| pmulhrsw m18, m13, [r4+64*28] |
| pmulhrsw m19, m13, [r4+64*29] |
| pmulhrsw m20, m13, [r4+64*30] |
| pmulhrsw m21, m13, [r4+64*31] |
| mova [cq+64*24], m22 |
| mova [cq+64*25], m23 |
| mova [cq+64*26], m24 |
| mova [cq+64*27], m25 |
| mova [cq+64*28], m26 |
| mova [cq+64*29], m27 |
| mova [cq+64*30], m28 |
| mova [cq+64*31], m29 |
| call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round |
| call .pass2_fast |
| mova [r4+64*16], m14 |
| mova [r4+64*17], m15 |
| mova [r4+64*18], m16 |
| mova [r4+64*19], m17 |
| mova [r4+64*20], m18 |
| mova [r4+64*21], m19 |
| mova [r4+64*22], m20 |
| mova [r4+64*23], m21 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast |
| .end: |
| vpbroadcastd m13, [o(pw_2048)] |
| lea r5, [strideq*3] |
| pxor m12, m12 |
| lea r3, [dstq+r5*8] |
| lea r6, [strideq+r5] ; stride*4 |
| add r3, r6 ; dst+stride*28 |
| %macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi |
| mova m11, [cq+64*( %3)] ; 0 |
| mova m9, [cq+64*(31-%3)] ; 31 |
| %if %3 >= 8 |
| mova m%1, [rsp+64*(%1+16)] |
| %endif |
| mova m10, [dstq+%4] |
| paddsw m8, m11, m9 |
| psubsw m11, m9 |
| paddsw m9, m%1, m%2 |
| psubsw m%1, m%2 |
| punpcklbw m%2, m10, m12 |
| punpckhbw m10, m12 |
| pmulhrsw m8, m13 |
| pmulhrsw m9, m13 |
| paddw m8, m%2 |
| paddw m9, m10 |
| mova m10, [r3+%5] |
| pmulhrsw m11, m13 |
| pmulhrsw m%1, m13 |
| mova [cq+64*( %3)], m12 |
| mova [cq+64*(31-%3)], m12 |
| punpcklbw m%2, m10, m12 |
| punpckhbw m10, m12 |
| packuswb m8, m9 |
| paddw m11, m%2 |
| paddw m%1, m10 |
| packuswb m11, m%1 |
| mova [dstq+%4], m8 |
| mova [r3 +%5], m11 |
| %if %3 == 3 || %3 == 7 || %3 == 11 |
| add dstq, r6 |
| sub r3, r6 |
| %endif |
| %endmacro |
| IDCT_64x32_END 0, 29, 0, strideq*0, r5 |
| IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2 |
| IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1 |
| IDCT_64x32_END 3, 26, 3, r5 , strideq*0 |
| IDCT_64x32_END 4, 25, 4, strideq*0, r5 |
| IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2 |
| IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1 |
| IDCT_64x32_END 7, 22, 7, r5 , strideq*0 |
| IDCT_64x32_END 0, 21, 8, strideq*0, r5 |
| IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2 |
| IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1 |
| IDCT_64x32_END 3, 18, 11, r5 , strideq*0 |
| IDCT_64x32_END 4, 17, 12, strideq*0, r5 |
| IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2 |
| IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1 |
| IDCT_64x32_END 7, 14, 15, r5 , strideq*0 |
| RET |
| ALIGN function_align |
| .dconly: |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 32 |
| imul r6d, 181 |
| add r6d, 128 |
| sar r6d, 8 |
| imul r6d, 181 |
| add r6d, 128+256 |
| sar r6d, 8+1 |
| jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2 |
| ALIGN function_align |
| .pass1_end_part1: |
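; merge the idct32 results with the idct64 rows kept at r4 into the 64
; pass-1 outputs, produced in eight groups of eight (a-h, see comments)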
| %macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64 |
| %if %1 != %3 |
| mova m%1, [cq+64*%1] |
| %endif |
| mova m9, [r4+64*(%3-36)] ; idct64 32+n |
| mova m11, [r4+64*(-5-%3)] ; idct64 63-n |
| psubsw m8, m%1, m%2 ; idct32 31-n |
| paddsw m%1, m%2 ; idct32 0+n |
| %if %1 == %3 |
| psubsw m%2, m8, m9 ; out 32+n e |
| paddsw m8, m9 ; out 31-n d |
| psubsw m9, m%1, m11 ; out 63-n h |
| paddsw m%1, m11 ; out 0+n a |
| %else |
| paddsw m%2, m8, m9 ; out 23-n c |
| psubsw m8, m9 ; out 40+n f |
| paddsw m9, m%1, m11 ; out 8+n b |
| psubsw m%1, m11 ; out 55-n g |
| %endif |
| mova [r4+64*(%3-36)], m8 |
| mova [r4+64*(-5-%3)], m9 |
| %endmacro |
| IDCT_64x32_PASS1_END 0, 29, 0 |
| IDCT_64x32_PASS1_END 1, 28, 1 |
| IDCT_64x32_PASS1_END 2, 27, 2 |
| IDCT_64x32_PASS1_END 3, 26, 3 |
| IDCT_64x32_PASS1_END 4, 25, 4 |
| IDCT_64x32_PASS1_END 5, 24, 5 |
| IDCT_64x32_PASS1_END 6, 23, 6 |
| IDCT_64x32_PASS1_END 7, 22, 7 |
| .transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted) |
| punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3 |
| punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7 |
| punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3 |
| punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7 |
| punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3 |
| punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7 |
| punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3 |
| punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7 |
| punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5 |
| punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7 |
| punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1 |
| punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3 |
| punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3 |
| punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1 |
| punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5 |
| punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7 |
| punpckhqdq m23, m22, m27 ; 1 23 |
| punpcklqdq m22, m27 ; 0 22 |
| punpckhqdq m27, m26, m28 ; 5 27 |
| punpcklqdq m26, m28 ; 4 26 |
| punpcklqdq m28, m29, m25 ; 6 28 |
| punpckhqdq m29, m25 ; 7 29 |
| punpckhqdq m25, m24, m8 ; 3 25 |
| punpcklqdq m24, m8 ; 2 24 |
| .transpose_8x8: |
| punpckhwd m8, m4, m5 |
| punpcklwd m4, m5 |
| punpckhwd m5, m0, m1 |
| punpcklwd m0, m1 |
| punpckhwd m1, m6, m7 |
| punpcklwd m6, m7 |
| punpckhwd m7, m2, m3 |
| punpcklwd m2, m3 |
| punpckhdq m3, m0, m2 |
| punpckldq m0, m2 |
| punpckldq m2, m4, m6 |
| punpckhdq m4, m6 |
| punpckhdq m6, m5, m7 |
| punpckldq m5, m7 |
| punpckldq m7, m8, m1 |
| punpckhdq m8, m1 |
| punpckhqdq m1, m0, m2 |
| punpcklqdq m0, m2 |
| punpcklqdq m2, m3, m4 |
| punpckhqdq m3, m4 |
| punpcklqdq m4, m5, m7 |
| punpckhqdq m5, m7 |
| punpckhqdq m7, m6, m8 |
| punpcklqdq m6, m8 |
| ret |
| .pass1_end_part2: |
| IDCT_64x32_PASS1_END 0, 21, 8 |
| IDCT_64x32_PASS1_END 1, 20, 9 |
| IDCT_64x32_PASS1_END 2, 19, 10 |
| IDCT_64x32_PASS1_END 3, 18, 11 |
| IDCT_64x32_PASS1_END 4, 17, 12 |
| IDCT_64x32_PASS1_END 5, 16, 13 |
| IDCT_64x32_PASS1_END 6, 15, 14 |
| IDCT_64x32_PASS1_END 7, 14, 15 |
| .transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21 |
| punpcklwd m8, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m2, m1, m0 |
| punpckhwd m1, m0 |
| punpcklwd m0, m7, m6 |
| punpckhwd m7, m6 |
| punpcklwd m6, m5, m4 |
| punpckhwd m5, m4 |
| punpckldq m4, m7, m5 |
| punpckhdq m7, m5 |
| punpckldq m5, m8, m2 |
| punpckhdq m8, m2 |
| punpckhdq m2, m0, m6 |
| punpckldq m0, m6 |
| punpckldq m6, m3, m1 |
| punpckhdq m3, m1 |
| punpckhqdq m1, m0, m5 |
| punpcklqdq m0, m5 |
| punpckhqdq m5, m4, m6 |
| punpcklqdq m4, m6 |
| punpcklqdq m6, m7, m3 |
| punpckhqdq m7, m3 |
| punpckhqdq m3, m2, m8 |
| punpcklqdq m2, m8 |
| punpckhwd m8, m18, m19 |
| punpcklwd m18, m19 |
| punpckhwd m19, m14, m15 |
| punpcklwd m14, m15 |
| punpckhwd m15, m20, m21 |
| punpcklwd m20, m21 |
| punpckhwd m21, m16, m17 |
| punpcklwd m16, m17 |
| punpckhdq m17, m14, m16 |
| punpckldq m14, m16 |
| punpckldq m16, m18, m20 |
| punpckhdq m18, m20 |
| punpckhdq m20, m19, m21 |
| punpckldq m19, m21 |
| punpckldq m21, m8, m15 |
| punpckhdq m8, m15 |
| punpckhqdq m15, m14, m16 |
| punpcklqdq m14, m16 |
| punpcklqdq m16, m17, m18 |
| punpckhqdq m17, m18 |
| punpcklqdq m18, m19, m21 |
| punpckhqdq m19, m21 |
| punpckhqdq m21, m20, m8 |
| punpcklqdq m20, m8 |
| ret |
| .pass2_fast: |
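; gather the second-pass input rows 0-15 from the transposed half-blocks
; and tail-call the 32x16 odd-half helper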
| vshufi32x4 m24, m9, m15, q3131 ; 5 |
| vshufi32x4 m22, m9, m15, q2020 ; 1 |
| vshufi32x4 m15, m1, m16, q3131 ; 6 |
| vshufi32x4 m14, m1, m16, q2020 ; 2 |
| vshufi32x4 m1, m0, m3, q3131 ; 4 |
| vshufi32x4 m0, m3, q2020 ; 0 |
| vshufi32x4 m3, m8, m2, q3131 ; 12 |
| vshufi32x4 m2, m8, m2, q2020 ; 8 |
| vshufi32x4 m25, m11, m17, q3131 ; 7 |
| vshufi32x4 m23, m11, m17, q2020 ; 3 |
| vshufi32x4 m17, m5, m19, q3131 ; 14 |
| vshufi32x4 m16, m5, m19, q2020 ; 10 |
| vshufi32x4 m29, m6, m20, q3131 ; 15 |
| vshufi32x4 m27, m6, m20, q2020 ; 11 |
| vshufi32x4 m28, m4, m18, q3131 ; 13 |
| vshufi32x4 m26, m4, m18, q2020 ; 9 |
| jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast |
| |
| cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob |
| lea r5, [o_base] |
| test eobd, eobd |
| jz .dconly |
| PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob |
| %undef cmp |
| cmp eobd, 136 |
| jb .fast |
| mova m0, [cq+64* 1] |
| mova m1, [cq+64*31] |
| mova m2, [cq+64*17] |
| mova m3, [cq+64*15] |
| vpbroadcastd m10, [o(pd_2048)] |
| mov r4, rsp |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| mova m0, [cq+64* 7] |
| mova m1, [cq+64*25] |
| mova m2, [cq+64*23] |
| mova m3, [cq+64* 9] |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| mova m0, [cq+64* 5] |
| mova m1, [cq+64*27] |
| mova m2, [cq+64*21] |
| mova m3, [cq+64*11] |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| mova m0, [cq+64* 3] |
| mova m1, [cq+64*29] |
| mova m2, [cq+64*19] |
| mova m3, [cq+64*13] |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 |
| mova m0, [cq+64* 0] |
| mova m1, [cq+64* 8] |
| mova m2, [cq+64*16] |
| mova m3, [cq+64*24] |
| mova m14, [cq+64* 4] |
| mova m15, [cq+64*12] |
| mova m16, [cq+64*20] |
| mova m17, [cq+64*28] |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast |
| mova m22, [cq+64* 2] |
| mova m29, [cq+64*30] |
| mova m26, [cq+64*18] |
| mova m25, [cq+64*14] |
| mova m24, [cq+64*10] |
| mova m27, [cq+64*22] |
| mova m28, [cq+64*26] |
| mova m23, [cq+64* 6] |
| mova [cq+64* 0], m14 |
| mova [cq+64* 1], m15 |
| mova [cq+64* 2], m16 |
| mova [cq+64* 3], m17 |
| mova [cq+64* 4], m18 |
| mova [cq+64* 5], m19 |
| mova [cq+64* 6], m20 |
| mova [cq+64* 7], m21 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast |
| vpbroadcastd m13, [o(pw_8192)] |
| call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1 |
| mova [r4+64*36], m1 |
| mova [r4+64*37], m3 |
| mova [r4+64*38], m5 |
| mova [r4+64*39], m7 |
| mova [r4+64*44], m23 |
| mova [r4+64*45], m25 |
| mova [r4+64*46], m27 |
| mova [r4+64*47], m29 |
| pmulhrsw m23, m13, m0 ; a0 |
| pmulhrsw m25, m13, m2 ; a2 |
| pmulhrsw m27, m13, m4 ; a4 |
| pmulhrsw m29, m13, m6 ; a6 |
| call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2 |
| lea r6, [r4-64*4] |
| add r4, 64*28 |
| call .pass2_end |
| mov r4, rsp |
| mova m0, [r4+64*23] |
| mova m1, [r4+64*22] |
| mova m2, [r4+64*21] |
| mova m3, [r4+64*20] |
| mova m4, [r4+64*19] |
| mova m5, [r4+64*18] |
| mova m6, [r4+64*17] |
| mova m7, [r4+64*16] |
| mova m22, [r4+64*15] |
| mova m23, [r4+64*14] |
| mova m24, [r4+64*13] |
| mova m25, [r4+64*12] |
| mova m26, [r4+64*11] |
| mova m27, [r4+64*10] |
| mova m28, [r4+64* 9] |
| mova m29, [r4+64* 8] |
| call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi |
| vpbroadcastd m13, [o(pw_8192)] |
| mova [r4+64* 8], m1 |
| mova [r4+64* 9], m3 |
| mova [r4+64*10], m5 |
| mova [r4+64*11], m7 |
| mova [r4+64*16], m23 |
| mova [r4+64*17], m25 |
| mova [r4+64*18], m27 |
| mova [r4+64*19], m29 |
| pmulhrsw m23, m13, m0 ; b0 |
| pmulhrsw m25, m13, m2 ; b2 |
| pmulhrsw m27, m13, m4 ; b4 |
| pmulhrsw m29, m13, m6 ; b6 |
| mova m0, [r4+64*31] |
| mova m1, [r4+64*30] |
| mova m2, [r4+64*29] |
| mova m3, [r4+64*28] |
| mova m4, [r4+64*27] |
| mova m5, [r4+64*26] |
| mova m6, [r4+64*25] |
| mova m7, [r4+64*24] |
| mova m14, [r4+64* 7] |
| mova m15, [r4+64* 6] |
| mova m16, [r4+64* 5] |
| mova m17, [r4+64* 4] |
| mova m18, [r4+64* 3] |
| mova m19, [r4+64* 2] |
| mova m20, [r4+64* 1] |
| mova m21, [r4+64* 0] |
| call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo |
| mov r6, cq |
| call .pass2_end |
| jmp .end |
| .fast: ; bottom/right halves are zero |
| mova m28, [o(dup16_perm)] |
| pmovzxwd m9, [cq+64* 0] |
| vpermb m8, m28, [cq+64* 4] |
| vpermb ym1, ym28, [cq+64*12] |
| vpermb m7, m28, [cq+64* 8] |
| pslld m9, 16 |
| call m(idct_16x16_internal_8bpc).main_fast2 |
| vpermb m21, m28, [cq+64* 2] |
| vpermb ym15, ym28, [cq+64*14] |
| vpermb ym18, ym28, [cq+64*10] |
| vpermb m14, m28, [cq+64* 6] |
| call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 |
| vpermb m22, m28, [cq+64* 1] |
| vpermb ym29, ym28, [cq+64*15] |
| vpermb ym26, ym28, [cq+64* 9] |
| vpermb m25, m28, [cq+64* 7] |
| vpermb m24, m28, [cq+64* 5] |
| vpermb ym27, ym28, [cq+64*11] |
| vpermb m23, m28, [cq+64* 3] |
| vpermb ym28, ym28, [cq+64*13] |
| mova [cq+64* 0], m14 |
| mova [cq+64* 1], m15 |
| mova [cq+64* 2], m16 |
| mova [cq+64* 3], m17 |
| mova [cq+64* 4], m18 |
| mova [cq+64* 5], m19 |
| mova [cq+64* 6], m20 |
| mova [cq+64* 7], m21 |
| call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast |
| vpbroadcastd m13, [o(pw_8192)] |
| mova [cq+64*16], m4 |
| mova [cq+64*17], m5 |
| mova [cq+64*18], m6 |
| mova [cq+64*19], m7 |
| mova [cq+64*28], m26 |
| mova [cq+64*29], m27 |
| mova [cq+64*30], m28 |
| mova [cq+64*31], m29 |
| call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end |
| mova [cq+64*20], m22 |
| mova [cq+64*21], m23 |
| mova [cq+64*22], m24 |
| mova [cq+64*23], m25 |
| mova [cq+64*24], m26 |
| mova [cq+64*25], m27 |
| mova [cq+64*26], m28 |
| mova [cq+64*27], m29 |
| lea r4, [rsp+64*64] |
| lea r3, [rsp+64*32] |
| call .pass2_fast |
| pmulhrsw m0, m13, [cq+64*16] |
| pmulhrsw m1, m13, [cq+64*17] |
| pmulhrsw m2, m13, [cq+64*18] |
| pmulhrsw m3, m13, [cq+64*19] |
| pmulhrsw m4, m13, [cq+64*20] |
| pmulhrsw m5, m13, [cq+64*21] |
| pmulhrsw m6, m13, [cq+64*22] |
| pmulhrsw m7, m13, [cq+64*23] |
| pmulhrsw m14, m13, [cq+64*24] |
| pmulhrsw m15, m13, [cq+64*25] |
| pmulhrsw m16, m13, [cq+64*26] |
| pmulhrsw m17, m13, [cq+64*27] |
| pmulhrsw m18, m13, [cq+64*28] |
| pmulhrsw m19, m13, [cq+64*29] |
| pmulhrsw m20, m13, [cq+64*30] |
| pmulhrsw m21, m13, [cq+64*31] |
| call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round |
| mov r4, rsp |
| mov r3, cq |
| call .pass2_fast |
| .end: |
| vpbroadcastd m17, [o(pw_2048)] |
| lea r5, [strideq*8] |
| mov r3, dstq |
| pxor m16, m16 |
| sub r4, 64*5 ; rsp+64*31 |
| mov r6, rsp |
| .end_loop: |
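; each iteration reconstructs rows n, 31-n, 32+n and 63-n from the
; idct16/idct32/idct64 partial sums, rounds by pw_2048 (m17), adds them
; to dst and clears the consumed coeff rows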
| mova m2, [r6+64*32] ; idct16 0+n lo |
| mova m7, [r6+64*48] ; idct32 31-n lo |
| mova m6, [cq+64* 0] ; idct16 0+n hi |
| mova m0, [cq+64*16] ; idct32 31-n hi |
| mova m4, [r4+64*64] ; idct64 63-n lo |
| mova m1, [r4+64* 0] ; idct64 63-n hi |
| mova m5, [r6+64*64] ; idct64 32+n lo |
| mova m8, [r6+64* 0] ; idct64 32+n hi |
| sub r3, strideq |
| paddsw m3, m2, m7 ; idct32 0+n lo |
| mova m12, [dstq+r5*0] |
| psubsw m2, m7 ; idct32 31-n lo |
| mova m15, [r3 +r5*8] |
| paddsw m7, m6, m0 ; idct32 0+n hi |
| mova m13, [r3 +r5*4] |
| psubsw m6, m0 ; idct32 31-n hi |
| mova m14, [dstq+r5*4] |
| paddsw m0, m3, m4 ; out 0+n lo |
| add r6, 64 |
| psubsw m3, m4 ; out 63-n lo |
| sub r4, 64 |
| paddsw m4, m7, m1 ; out 0+n hi |
| mova [cq+64* 0], m16 |
| psubsw m7, m1 ; out 63-n hi |
| mova [cq+64*16], m16 |
| paddsw m1, m2, m5 ; out 31-n lo |
| add cq, 64 |
| psubsw m2, m5 ; out 32+n lo |
| paddsw m5, m6, m8 ; out 31-n hi |
| psubsw m6, m8 ; out 32+n hi |
| pmulhrsw m0, m17 |
| punpcklbw m8, m12, m16 |
| pmulhrsw m4, m17 |
| punpckhbw m12, m16 |
| pmulhrsw m3, m17 |
| punpcklbw m11, m15, m16 |
| pmulhrsw m7, m17 |
| punpckhbw m15, m16 |
| pmulhrsw m1, m17 |
| punpcklbw m9, m13, m16 |
| pmulhrsw m5, m17 |
| punpckhbw m13, m16 |
| pmulhrsw m2, m17 |
| punpcklbw m10, m14, m16 |
| pmulhrsw m6, m17 |
| punpckhbw m14, m16 |
| paddw m0, m8 |
| paddw m4, m12 |
| packuswb m0, m4 |
| paddw m3, m11 |
| paddw m7, m15 |
| packuswb m3, m7 |
| paddw m1, m9 |
| paddw m5, m13 |
| packuswb m1, m5 |
| paddw m2, m10 |
| paddw m6, m14 |
| packuswb m2, m6 |
| mova [dstq+r5*0], m0 |
| mova [r3 +r5*8], m3 |
| mova [r3 +r5*4], m1 |
| mova [dstq+r5*4], m2 |
| add dstq, strideq |
| cmp r6, r4 |
| jb .end_loop |
| RET |
| .dconly: |
| movsx r6d, word [cq] |
| mov [cq], eobd |
| or r3d, 64 |
| jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly |
| ALIGN function_align |
| .pass2_end: |
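; second pass for one half of the block: finish the lane shuffles, run
; the 32-point part into [r6], then compute the idct64 odd half via
; main_part1/main_part2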
| REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6 |
| mova [r4+64*20], m1 |
| mova [r4+64*21], m3 |
| mova [r4+64*22], m5 |
| mova [r4+64*23], m7 |
| vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01 |
| vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03 |
| vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01 |
| vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03 |
| mova [r4+64*12], m15 |
| mova [r4+64*13], m17 |
| mova [r4+64*14], m19 |
| mova [r4+64*15], m21 |
| vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41 |
| vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43 |
| vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 |
| vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43 |
| vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21 |
| vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23 |
| vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21 |
| vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23 |
| vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61 |
| vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 |
| vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63 |
| vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61 |
| vshufi32x4 m0, m1, m5, q2020 ; 0 |
| vshufi32x4 m1, m5, q3131 ; 8 |
| vshufi32x4 m2, m3, m14, q2020 ; 16 |
| vshufi32x4 m3, m14, q3131 ; 24 |
| vshufi32x4 m14, m15, m18, q2020 ; 4 |
| vshufi32x4 m15, m18, q3131 ; 12 |
| vshufi32x4 m16, m17, m19, q2020 ; 20 |
| vshufi32x4 m17, m19, q3131 ; 28 |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast |
| vshufi32x4 m24, m22, m25, q3131 ; 10 |
| vshufi32x4 m22, m25, q2020 ; 2 |
| vshufi32x4 m25, m23, m28, q3131 ; 14 |
| vshufi32x4 m23, m28, q2020 ; 6 |
| vshufi32x4 m28, m26, m27, q3131 ; 26 |
| vshufi32x4 m26, m27, q2020 ; 18 |
| vshufi32x4 m27, m29, m13, q2020 ; 22 |
| vshufi32x4 m29, m13, q3131 ; 30 |
| mova [r6+64* 0], m0 |
| mova [r6+64* 1], m1 |
| mova [r6+64* 2], m2 |
| mova [r6+64* 3], m3 |
| mova [r6+64* 4], m4 |
| mova [r6+64* 5], m5 |
| mova [r6+64* 6], m6 |
| mova [r6+64* 7], m7 |
| mova [r6+64* 8], m14 |
| mova [r6+64* 9], m15 |
| mova [r6+64*10], m16 |
| mova [r6+64*11], m17 |
| mova [r6+64*12], m18 |
| mova [r6+64*13], m19 |
| mova [r6+64*14], m20 |
| mova [r6+64*15], m21 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast |
| vpbroadcastd m13, [o(pw_8192)] ; pmulhrsw by 8192 == (x + 2) >> 2 |
| mova [r6+64*16], m29 |
| mova [r6+64*17], m28 |
| mova [r6+64*18], m27 |
| mova [r6+64*19], m26 |
| mova [r6+64*20], m25 |
| mova [r6+64*21], m24 |
| mova [r6+64*22], m23 |
| mova [r6+64*23], m22 |
| mova [r6+64*24], m21 |
| mova [r6+64*25], m20 |
| mova [r6+64*26], m19 |
| mova [r6+64*27], m18 |
| mova [r6+64*28], m17 |
| mova [r6+64*29], m16 |
| mova [r6+64*30], m15 |
| mova [r6+64*31], m14 |
| pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25 |
| pmulhrsw m16, m13, [r4+64*12] |
| pmulhrsw m17, m13, [r4+64*16] |
| pmulhrsw m18, m13, [r4+64*20] |
| pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31 |
| pmulhrsw m20, m13, [r4+64*15] |
| pmulhrsw m21, m13, [r4+64*19] |
| pmulhrsw m22, m13, [r4+64*23] |
| vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9 |
| vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25 |
| vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9 |
| vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25 |
| pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29 |
| pmulhrsw m24, m13, [r4+64*14] |
| pmulhrsw m25, m13, [r4+64*18] |
| pmulhrsw m26, m13, [r4+64*22] |
| vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15 |
| vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31 |
| vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15 |
| vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31 |
| pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27 |
| pmulhrsw m28, m13, [r4+64*13] |
| pmulhrsw m29, m13, [r4+64*17] |
| pmulhrsw m13, [r4+64*21] |
| vshufi32x4 m0, m14, m16, q2020 ; 1 |
| vshufi32x4 m1, m19, m21, q3131 ; 31 |
| vshufi32x4 m2, m15, m17, q2020 ; 17 |
| vshufi32x4 m3, m18, m20, q3131 ; 15 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| vshufi32x4 m0, m18, m20, q2020 ; 7 |
| vshufi32x4 m1, m15, m17, q3131 ; 25 |
| vshufi32x4 m2, m19, m21, q2020 ; 23 |
| vshufi32x4 m3, m14, m16, q3131 ; 9 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13 |
| vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29 |
| vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13 |
| vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29 |
| vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11 |
| vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27 |
| vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11 |
| vshufi32x4 m29, m13, q3232 ; e19 e27 g19 g27 |
| vshufi32x4 m0, m22, m24, q2020 ; 5 |
| vshufi32x4 m1, m27, m29, q3131 ; 27 |
| vshufi32x4 m2, m23, m25, q2020 ; 21 |
| vshufi32x4 m3, m26, m28, q3131 ; 11 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| vshufi32x4 m0, m26, m28, q2020 ; 3 |
| vshufi32x4 m1, m23, m25, q3131 ; 29 |
| vshufi32x4 m2, m27, m29, q2020 ; 19 |
| vshufi32x4 m3, m22, m24, q3131 ; 13 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 |
| jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 |
| ALIGN function_align |
| .pass2_fast: |
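| ; Fast path for sparse coefficient blocks: only the low-frequency quarter |
| ; of the input rows is nonzero, so each idct64 main_part1_fast call takes |
| ; just two odd inputs (1&15, 7&9, 5&11, 3&13) and the _fast/_fast2 helpers |
| ; treat the absent high-frequency rows as zero. |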
| vshufi32x4 m23, m1, m16, q3131 ; 6 |
| vshufi32x4 m22, m1, m16, q2020 ; 2 |
| vshufi32x4 m14, m0, m3, q3131 ; 4 |
| vshufi32x4 m26, m0, m3, q2020 ; 0 |
| vshufi32x4 m28, m9, m15, q3131 ; 5 |
| vshufi32x4 m0, m9, m15, q2020 ; 1 |
| vshufi32x4 m16, m11, m17, q3131 ; 7 |
| vshufi32x4 m29, m11, m17, q2020 ; 3 |
| vshufi32x4 m15, m8, m2, q3131 ; 12 |
| vshufi32x4 m27, m8, m2, q2020 ; 8 |
| vshufi32x4 m25, m5, m19, q3131 ; 14 |
| vshufi32x4 m24, m5, m19, q2020 ; 10 |
| vshufi32x4 m3, m6, m20, q3131 ; 15 |
| vshufi32x4 m19, m6, m20, q2020 ; 11 |
| vshufi32x4 m17, m4, m18, q3131 ; 13 |
| vshufi32x4 m18, m4, m18, q2020 ; 9 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast |
| mova m0, m16 |
| mova m3, m18 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast |
| mova m0, m28 |
| mova m3, m19 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast |
| mova m0, m29 |
| mova m3, m17 |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast |
| call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 |
| mova m0, m26 |
| mova m1, m27 |
| call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 |
| mova [r3+64* 0], m0 |
| mova [r3+64* 1], m1 |
| mova [r3+64* 2], m2 |
| mova [r3+64* 3], m3 |
| mova [r3+64* 4], m4 |
| mova [r3+64* 5], m5 |
| mova [r3+64* 6], m6 |
| mova [r3+64* 7], m7 |
| mova [r3+64* 8], m14 |
| mova [r3+64* 9], m15 |
| mova [r3+64*10], m16 |
| mova [r3+64*11], m17 |
| mova [r3+64*12], m18 |
| mova [r3+64*13], m19 |
| mova [r3+64*14], m20 |
| mova [r3+64*15], m21 |
| call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 |
| mova [r3+64*16], m29 |
| mova [r3+64*17], m28 |
| mova [r3+64*18], m27 |
| mova [r3+64*19], m26 |
| mova [r3+64*20], m25 |
| mova [r3+64*21], m24 |
| mova [r3+64*22], m23 |
| mova [r3+64*23], m22 |
| mova [r3+64*24], m21 |
| mova [r3+64*25], m20 |
| mova [r3+64*26], m19 |
| mova [r3+64*27], m18 |
| mova [r3+64*28], m17 |
| mova [r3+64*29], m16 |
| mova [r3+64*30], m15 |
| mova [r3+64*31], m14 |
| ret |
| |
| %endif ; ARCH_X86_64 |