| ; Copyright © 2018, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| SECTION_RODATA 16 |
| |
| pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 |
| pb_7_1: times 8 db 7, 1 |
| pb_3_1: times 8 db 3, 1 |
| pb_2_1: times 8 db 2, 1 |
| pb_m1_0: times 8 db -1, 0 |
| pb_m1_1: times 8 db -1, 1 |
| pb_m1_2: times 8 db -1, 2 |
| pb_1: times 16 db 1 |
| pb_2: times 16 db 2 |
| pb_3: times 16 db 3 |
| pb_4: times 16 db 4 |
| pb_16: times 16 db 16 |
| pb_63: times 16 db 63 |
| pb_64: times 16 db 64 |
| pb_128: times 16 db 0x80 |
| pb_129: times 16 db 0x81 |
| pb_240: times 16 db 0xf0 |
| pb_248: times 16 db 0xf8 |
| pb_254: times 16 db 0xfe |
| |
| pw_2048: times 8 dw 2048 |
| pw_4096: times 8 dw 4096 |
| |
| pd_mask: dd 1, 2, 4, 8 |
| |
| SECTION .text |
| |
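| ; ABSSUB dst, a, b, tmp: per-byte dst = abs(a - b). SSE has no packed |
| ; byte absolute-difference-to-register op, so we form both unsigned |
| ; saturating differences (each is zero wherever its first operand is |
| ; the smaller one) and OR them together. |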
| %macro ABSSUB 4 ; dst, a, b, tmp |
| psubusb %1, %2, %3 |
| psubusb %4, %3, %2 |
| por %1, %4 |
| %endmacro |
| |
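| ; Used by the wd=4/6 horizontal (dir=h) filters: the four input |
| ; registers hold p1/p0/q0/q1 for 16 rows; after the transpose each |
| ; dword is one row's [p1 p0 q0 q1] quad, stored back at dst-2. |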
| %macro TRANSPOSE_16x4_AND_WRITE_4x16 5 |
| ; transpose 16x4 |
| punpcklbw m%5, m%1, m%2 |
| punpckhbw m%1, m%2 |
| punpcklbw m%2, m%3, m%4 |
| punpckhbw m%3, m%4 |
| punpcklwd m%4, m%5, m%2 |
| punpckhwd m%5, m%2 |
| punpcklwd m%2, m%1, m%3 |
| punpckhwd m%1, m%3 |
| |
| ; write out |
| %assign %%n 0 |
| %rep 4 |
| movd [dstq+strideq *0-2], xm%4 |
| movd [dstq+strideq *4-2], xm%5 |
| movd [dstq+strideq *8-2], xm%2 |
| movd [dstq+stride3q*4-2], xm%1 |
| add dstq, strideq |
| %if %%n < 3 |
| psrldq xm%4, 4 |
| psrldq xm%5, 4 |
| psrldq xm%2, 4 |
| psrldq xm%1, 4 |
| %endif |
| %assign %%n (%%n+1) |
| %endrep |
| lea dstq, [dstq+stride3q*4] |
| %endmacro |
| |
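| ; Full 16x16 byte transpose via the usual punpck ladder (bytes -> |
| ; words -> dwords -> qwords). On x86-32 only 8 XMM registers exist, |
| ; so half of the rows live on the stack and are cycled through the |
| ; %2 scratch slot; %1 distinguishes the input transpose from the |
| ; output transpose, which use different spill offsets. |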
| %macro TRANSPOSE_16X16B 2 ; output_transpose, mem |
| %if %1 == 0 |
| mova %2, m15 ; m7 in 32-bit |
| %endif |
| |
| ; input in m0-7 |
| punpcklbw m15, m0, m1 |
| punpckhbw m0, m1 |
| punpcklbw m1, m2, m3 |
| punpckhbw m2, m3 |
| punpcklbw m3, m4, m5 |
| punpckhbw m4, m5 |
| %if ARCH_X86_64 |
| SWAP 4, 5, 7 |
| %else |
| %if %1 == 0 |
| mova m5, %2 |
| %else |
| mova m5, [esp+1*16] |
| %endif |
| mova %2, m4 |
| %endif |
| punpcklbw m4, m6, m5 |
| punpckhbw m6, m5 |
| |
| ; interleaved in m15,0,1,2,3,7,4,6 |
| punpcklwd m5, m15, m1 |
| punpckhwd m15, m1 |
| punpcklwd m1, m0, m2 |
| punpckhwd m0, m2 |
| punpcklwd m2, m3, m4 |
| punpckhwd m3, m4 |
| %if ARCH_X86_64 |
| SWAP 3, 4, 7 |
| %else |
| mova m4, %2 |
| mova %2, m3 |
| %endif |
| punpcklwd m3, m4, m6 |
| punpckhwd m4, m6 |
| |
| ; interleaved in m5,15,1,0,2,7,3,4 |
| punpckldq m6, m5, m2 |
| punpckhdq m5, m2 |
| %if ARCH_X86_64 |
| SWAP 2, 7, 5 |
| %else |
| mova m2, %2 |
| mova [esp+1*16], m5 |
| %endif |
| punpckldq m5, m15, m2 |
| punpckhdq m15, m2 |
| punpckldq m2, m1, m3 |
| punpckhdq m1, m3 |
| punpckldq m3, m0, m4 |
| punpckhdq m0, m4 |
| |
| %if ARCH_X86_32 |
| mova [esp+0*16], m6 |
| mova [esp+2*16], m5 |
| mova [esp+3*16], m15 |
| mova [esp+4*16], m2 |
| mova [esp+5*16], m1 |
| mova [esp+6*16], m3 |
| mova [esp+7*16], m0 |
| mova m8, [esp+ 8*16] |
| mova m9, [esp+ 9*16] |
| mova m10, [esp+10*16] |
| %if %1 == 0 |
| mova m11, [esp+11*16] |
| mova m12, [esp+12*16] |
| mova m13, [esp+13*16] |
| mova m14, [esp+14*16] |
| %else |
| mova m11, [esp+20*16] |
| mova m12, [esp+15*16] |
| mova m13, [esp+16*16] |
| mova m14, [esp+17*16] |
| %endif |
| %endif |
| |
| ; input in m8-m15 |
| %if ARCH_X86_64 |
| SWAP 7, 4 |
| %endif |
| punpcklbw m7, m8, m9 |
| punpckhbw m8, m9 |
| punpcklbw m9, m10, m11 |
| punpckhbw m10, m11 |
| punpcklbw m11, m12, m13 |
| punpckhbw m12, m13 |
| %if ARCH_X86_64 |
| mova m13, %2 |
| %else |
| %if %1 == 0 |
| mova m13, [esp+15*16] |
| %else |
| mova m13, [esp+18*16] |
| %endif |
| %endif |
| mova %2, m12 |
| punpcklbw m12, m14, m13 |
| punpckhbw m14, m14, m13 |
| |
| ; interleaved in m7,8,9,10,11,rsp%2,12,14 |
| punpcklwd m13, m7, m9 |
| punpckhwd m7, m9 |
| punpcklwd m9, m8, m10 |
| punpckhwd m8, m10 |
| punpcklwd m10, m11, m12 |
| punpckhwd m11, m12 |
| mova m12, %2 |
| mova %2, m11 |
| punpcklwd m11, m12, m14 |
| punpckhwd m12, m14 |
| |
| ; interleaved in m13,7,9,8,10,rsp%2,11,12 |
| punpckldq m14, m13, m10 |
| punpckhdq m13, m10 |
| punpckldq m10, m9, m11 |
| punpckhdq m9, m11 |
| punpckldq m11, m8, m12 |
| punpckhdq m8, m12 |
| mova m12, %2 |
| mova %2, m8 |
| punpckldq m8, m7, m12 |
| punpckhdq m7, m12 |
| |
| %if ARCH_X86_32 |
| mova [esp+ 8*16], m10 |
| mova [esp+ 9*16], m9 |
| mova [esp+10*16], m11 |
| SWAP 6, 1 |
| SWAP 4, 2 |
| SWAP 5, 3 |
| mova m6, [esp+0*16] |
| mova m4, [esp+1*16] |
| mova m5, [esp+2*16] |
| %endif |
| |
| ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7 |
| punpcklqdq m12, m6, m14 |
| punpckhqdq m6, m14 |
| punpcklqdq m14, m4, m13 |
| punpckhqdq m4, m13 |
| punpcklqdq m13, m5, m8 |
| punpckhqdq m5, m8 |
| %if ARCH_X86_64 |
| SWAP 8, 5 |
| %else |
| mova m8, [esp+3*16] |
| mova [esp+27*16], m5 |
| %define m15 m8 |
| %endif |
| punpcklqdq m5, m15, m7 |
| punpckhqdq m15, m7 |
| |
| %if ARCH_X86_32 |
| mova [esp+11*16], m12 |
| mova [esp+12*16], m6 |
| mova [esp+13*16], m14 |
| mova [esp+14*16], m4 |
| mova [esp+26*16], m13 |
| mova [esp+ 0*16], m5 |
| mova [esp+ 1*16], m15 |
| mova m2, [esp+ 4*16] |
| mova m10, [esp+ 8*16] |
| mova m1, [esp+ 5*16] |
| mova m9, [esp+ 9*16] |
| mova m3, [esp+ 6*16] |
| mova m11, [esp+10*16] |
| mova m0, [esp+ 7*16] |
| %endif |
| |
| punpcklqdq m7, m2, m10 |
| punpckhqdq m2, m10 |
| punpcklqdq m10, m1, m9 |
| punpckhqdq m1, m9 |
| punpcklqdq m9, m3, m11 |
| punpckhqdq m3, m11 |
| mova m11, %2 |
| %if ARCH_X86_32 |
| %define m12 m3 |
| %endif |
| mova %2, m12 |
| punpcklqdq m12, m0, m11 |
| punpckhqdq m0, m11 |
| %if %1 == 1 |
| mova m11, %2 |
| %endif |
| |
| %if ARCH_X86_64 |
| ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0 |
| SWAP 0, 11, 1, 6, 5, 8, 7, 15 |
| SWAP 2, 14, 12, 9 |
| SWAP 3, 4, 13 |
| %else |
| %if %1 == 0 |
| mova [esp+15*16], m9 |
| mova [esp+17*16], m12 |
| mova [esp+18*16], m0 |
| mova [esp+28*16], m10 |
| mova [esp+29*16], m1 |
| mova m3, [esp+0*16] |
| mova m4, [esp+1*16] |
| SWAP m5, m7 |
| SWAP m6, m2 |
| %else |
| SWAP 0, 7 |
| SWAP 3, 1, 2, 4, 6 |
| %endif |
| %endif |
| %endmacro |
| |
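| ; FILTER wd, dir: filters one 16-pixel strip (four 4px units) across |
| ; a vertical (dir=v) or horizontal (dir=h) edge. It loads the pixels |
| ; (transposing them first for dir=h), derives the E/I/H thresholds |
| ; from the per-unit filter level, builds the fm/flat8/flat16 decision |
| ; masks, then applies the narrow filter plus, where enabled, the |
| ; wd=6/8 flat and wd=16 flat16 smoothers. |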
| %macro FILTER 2 ; width [4/6/8/16], dir [h/v] |
| %if ARCH_X86_64 |
| %define %%flat8mem [rsp+0*16] |
| %define %%q2mem [rsp+1*16] |
| %define %%q3mem [rsp+2*16] |
| %else |
| %if %1 == 4 || %1 == 6 |
| %define %%p2mem [esp+ 8*16] |
| %define %%q2mem [esp+ 9*16] |
| %define %%flat8mem [esp+10*16] |
| %else |
| %ifidn %2, v |
| %define %%p2mem [esp+16*16] |
| %define %%q2mem [esp+ 1*16] |
| %define %%q3mem [esp+18*16] |
| %define %%flat8mem [esp+ 0*16] |
| %define %%flat16mem [esp+20*16] |
| %else |
| %define %%p2mem [esp+27*16] |
| %define %%q2mem [esp+28*16] |
| %define %%q3mem [esp+29*16] |
| %define %%flat8mem [esp+21*16] |
| %define %%flat16mem [esp+30*16] |
| %endif |
| %endif |
| %xdefine m12reg m12 |
| %endif |
| |
| %if ARCH_X86_32 |
| lea stride3q, [strideq*3] |
| %endif |
| ; load data |
| %ifidn %2, v |
| %if ARCH_X86_32 |
| mov mstrideq, strideq |
| neg mstrideq |
| %endif |
| %if %1 == 4 |
| lea tmpq, [dstq+mstrideq*2] |
| mova m3, [tmpq+strideq*0] ; p1 |
| mova m4, [tmpq+strideq*1] ; p0 |
| mova m5, [tmpq+strideq*2] ; q0 |
| mova m6, [tmpq+stride3q] ; q1 |
| %else |
| ; load 6-8 pixels; the remainder (for wd=16) will be read inline |
| lea tmpq, [dstq+mstrideq*4] |
| ; we load p3 later |
| %define %%p3mem [dstq+mstrideq*4] |
| %if ARCH_X86_32 |
| %define m13 m0 |
| %define m14 m1 |
| %define m15 m2 |
| %endif |
| mova m13, [tmpq+strideq*1] |
| mova m3, [tmpq+strideq*2] |
| mova m4, [tmpq+stride3q] |
| mova m5, [dstq+strideq*0] |
| mova m6, [dstq+strideq*1] |
| mova m14, [dstq+strideq*2] |
| %if %1 != 6 |
| mova m15, [dstq+stride3q] |
| %endif |
| %if ARCH_X86_32 |
| mova %%p2mem, m13 |
| mova %%q2mem, m14 |
| %define m13 %%p2mem |
| %define m14 %%q2mem |
| %if %1 != 6 |
| mova %%q3mem, m15 |
| %define m15 %%q3mem |
| %endif |
| %endif |
| %endif |
| %else ; %2 == h |
| ; load lines |
| %if %1 == 4 |
| ; transpose 4x16 |
| movd m7, [dstq+strideq*0-2] |
| movd m3, [dstq+strideq*1-2] |
| movd m4, [dstq+strideq*2-2] |
| movd m5, [dstq+stride3q -2] |
| lea tmpq, [dstq+strideq*4] |
| punpcklbw m7, m3 |
| punpcklbw m4, m5 |
| movd m3, [tmpq+strideq*0-2] |
| movd m1, [tmpq+strideq*1-2] |
| movd m5, [tmpq+strideq*2-2] |
| movd m6, [tmpq+stride3q -2] |
| lea tmpq, [tmpq+strideq*4] |
| punpcklbw m3, m1 |
| punpcklbw m5, m6 |
| movd m0, [tmpq+strideq*0-2] |
| movd m1, [tmpq+strideq*1-2] |
| punpcklbw m0, m1 |
| movd m1, [tmpq+strideq*2-2] |
| movd m2, [tmpq+stride3q -2] |
| punpcklbw m1, m2 |
| punpcklqdq m7, m0 |
| punpcklqdq m4, m1 |
| lea tmpq, [tmpq+strideq*4] |
| movd m0, [tmpq+strideq*0-2] |
| movd m1, [tmpq+strideq*1-2] |
| punpcklbw m0, m1 |
| movd m1, [tmpq+strideq*2-2] |
| movd m2, [tmpq+stride3q -2] |
| punpcklbw m1, m2 |
| punpcklqdq m3, m0 |
| punpcklqdq m5, m1 |
| ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 |
| ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 |
| ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 |
| ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 |
| punpcklwd m6, m7, m4 |
| punpckhwd m7, m4 |
| punpcklwd m4, m3, m5 |
| punpckhwd m3, m5 |
| ; xm6: A0-3,B0-3,C0-3,D0-3 |
| ; xm7: A8-11,B8-11,C8-11,D8-11 |
| ; xm4: A4-7,B4-7,C4-7,D4-7 |
| ; xm3: A12-15,B12-15,C12-15,D12-15 |
| punpckldq m5, m6, m4 |
| punpckhdq m6, m4 |
| punpckldq m4, m7, m3 |
| punpckhdq m7, m3 |
| ; xm5: A0-7,B0-7 |
| ; xm6: C0-7,D0-7 |
| ; xm4: A8-15,B8-15 |
| ; xm7: C8-15,D8-15 |
| punpcklqdq m3, m5, m4 |
| punpckhqdq m5, m5, m4 |
| punpcklqdq m4, m6, m7 |
| punpckhqdq m6, m7 |
| ; xm3: A0-15 |
| ; xm5: B0-15 |
| ; xm4: C0-15 |
| ; xm6: D0-15 |
| SWAP 4, 5 |
| %elif %1 == 6 || %1 == 8 |
| ; transpose 8x16 |
| movq m7, [dstq+strideq*0-%1/2] |
| movq m3, [dstq+strideq*1-%1/2] |
| movq m4, [dstq+strideq*2-%1/2] |
| movq m5, [dstq+stride3q -%1/2] |
| lea tmpq, [dstq+strideq*8] |
| punpcklbw m7, m3 |
| punpcklbw m4, m5 |
| movq m3, [tmpq+strideq*0-%1/2] |
| movq m1, [tmpq+strideq*1-%1/2] |
| movq m5, [tmpq+strideq*2-%1/2] |
| movq m6, [tmpq+stride3q -%1/2] |
| lea tmpq, [dstq+strideq*4] |
| punpcklbw m3, m1 |
| punpcklbw m5, m6 |
| movq m6, [tmpq+strideq*0-%1/2] |
| movq m0, [tmpq+strideq*1-%1/2] |
| movq m1, [tmpq+strideq*2-%1/2] |
| movq m2, [tmpq+stride3q -%1/2] |
| lea tmpq, [tmpq+strideq*8] |
| punpcklbw m6, m0 |
| punpcklbw m1, m2 |
| movq m2, [tmpq+strideq*2-%1/2] |
| movq m0, [tmpq+stride3q -%1/2] |
| punpcklbw m2, m0 |
| %if ARCH_X86_64 |
| SWAP m15, m2 |
| %else |
| %define m15 [esp+3*16] |
| mova m15, m2 |
| %endif |
| movq m0, [tmpq+strideq*0-%1/2] |
| movq m2, [tmpq+strideq*1-%1/2] |
| punpcklbw m0, m2 |
| ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 |
| ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 |
| ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 |
| ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 |
| ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 |
| ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 |
| ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 |
| ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 |
| punpcklwd m2, m7, m4 |
| punpckhwd m7, m4 |
| punpcklwd m4, m3, m5 |
| punpckhwd m3, m5 |
| punpcklwd m5, m6, m1 |
| punpckhwd m6, m1 |
| punpcklwd m1, m0, m15 |
| punpckhwd m0, m15 |
| %if ARCH_X86_64 |
| SWAP m15, m0 |
| %else |
| mova m15, m0 |
| %endif |
| ; xm2: A0-3,B0-3,C0-3,D0-3 |
| ; xm7: E0-3,F0-3,G0-3,H0-3 |
| ; xm4: A8-11,B8-11,C8-11,D8-11 |
| ; xm3: E8-11,F8-11,G8-11,H8-11 |
| ; xm5: A4-7,B4-7,C4-7,D4-7 |
| ; xm6: E4-7,F4-7,G4-7,H4-7 |
| ; xm1: A12-15,B12-15,C12-15,D12-15 |
| ; xm0: E12-15,F12-15,G12-15,H12-15 |
| punpckldq m0, m2, m5 |
| punpckhdq m2, m5 |
| punpckldq m5, m7, m6 |
| %if %1 != 6 |
| punpckhdq m7, m6 |
| %endif |
| punpckldq m6, m4, m1 |
| punpckhdq m4, m1 |
| punpckldq m1, m3, m15 |
| %if %1 != 6 |
| punpckhdq m3, m15 |
| %if ARCH_X86_64 |
| SWAP m15, m3 |
| %else |
| mova m15, m3 |
| %endif |
| %endif |
| ; xm0: A0-7,B0-7 |
| ; xm2: C0-7,D0-7 |
| ; xm5: E0-7,F0-7 |
| ; xm7: G0-7,H0-7 |
| ; xm6: A8-15,B8-15 |
| ; xm4: C8-15,D8-15 |
| ; xm1: E8-15,F8-15 |
| ; xm3: G8-15,H8-15 |
| punpcklqdq m3, m0, m6 |
| punpckhqdq m0, m6 |
| punpckhqdq m6, m2, m4 |
| punpcklqdq m2, m4 |
| punpcklqdq m4, m5, m1 |
| punpckhqdq m5, m1 |
| %if %1 == 8 |
| punpcklqdq m1, m7, m15 |
| punpckhqdq m7, m15 |
| ; xm3: A0-15 |
| ; xm0: B0-15 |
| ; xm2: C0-15 |
| ; xm6: D0-15 |
| ; xm4: E0-15 |
| ; xm5: F0-15 |
| ; xm1: G0-15 |
| ; xm7: H0-15 |
| %if ARCH_X86_64 |
| SWAP 11, 3, 2 |
| SWAP 13, 0 |
| SWAP 6, 5, 4 |
| SWAP 14, 1 |
| SWAP 15, 7 |
| ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15 |
| mova [rsp+21*16], m11 |
| %define %%p3mem [rsp+21*16] |
| %else |
| %define m11 [esp+26*16] |
| %define m13 [esp+27*16] |
| %define m14 [esp+28*16] |
| %define m15 [esp+29*16] |
| mova m11, m3 |
| mova m13, m0 |
| SWAP 3, 2 |
| SWAP 6, 5, 4 |
| mova m14, m1 |
| mova m15, m7 |
| %define %%p3mem [esp+26*16] |
| %endif |
| %else |
| %if ARCH_X86_64 |
| SWAP 13, 3, 0 |
| SWAP 14, 5, 6, 4, 2 |
| ; 3,0,2,6,4,5 -> 13,3,4,5,6,14 |
| %else |
| %define m13 %%p2mem |
| %define m14 %%q2mem |
| mova m13, m3 |
| mova m14, m5 |
| SWAP 3, 0 |
| SWAP 5, 6, 4, 2 |
| ; 0,2,6,4 -> 3,4,5,6 |
| %endif |
| %endif |
| %else |
| %if ARCH_X86_64 |
| mova [rsp+20*16], m12 |
| %endif |
| ; load and do a 16x16 transpose. We only use 14 pixels but we'll |
| ; need the remainder at the end for the second transpose |
| %if ARCH_X86_32 |
| %xdefine m8 m0 |
| %xdefine m9 m1 |
| %xdefine m10 m2 |
| %xdefine m11 m3 |
| %xdefine m12 m4 |
| %xdefine m13 m5 |
| %xdefine m14 m6 |
| %xdefine m15 m7 |
| lea tmpq, [dstq+strideq*8] |
| movu m8, [tmpq+strideq*0-8] |
| movu m9, [tmpq+strideq*1-8] |
| movu m10, [tmpq+strideq*2-8] |
| movu m11, [tmpq+stride3q -8] |
| lea tmpq, [tmpq+strideq*4] |
| movu m12, [tmpq+strideq*0-8] |
| movu m13, [tmpq+strideq*1-8] |
| movu m14, [tmpq+strideq*2-8] |
| movu m15, [tmpq+stride3q -8] |
| mova [esp+ 8*16], m8 |
| mova [esp+ 9*16], m9 |
| mova [esp+10*16], m10 |
| mova [esp+11*16], m11 |
| mova [esp+12*16], m12 |
| mova [esp+13*16], m13 |
| mova [esp+14*16], m14 |
| mova [esp+15*16], m15 |
| %endif |
| movu m0, [dstq+strideq*0-8] |
| movu m1, [dstq+strideq*1-8] |
| movu m2, [dstq+strideq*2-8] |
| movu m3, [dstq+stride3q -8] |
| lea tmpq, [dstq+strideq*4] |
| movu m4, [tmpq+strideq*0-8] |
| movu m5, [tmpq+strideq*1-8] |
| movu m6, [tmpq+strideq*2-8] |
| movu m7, [tmpq+stride3q -8] |
| lea tmpq, [tmpq+strideq*4] |
| %if ARCH_X86_64 |
| movu m8, [tmpq+strideq*0-8] |
| movu m9, [tmpq+strideq*1-8] |
| movu m10, [tmpq+strideq*2-8] |
| movu m11, [tmpq+stride3q -8] |
| lea tmpq, [tmpq+strideq*4] |
| movu m12, [tmpq+strideq*0-8] |
| movu m13, [tmpq+strideq*1-8] |
| movu m14, [tmpq+strideq*2-8] |
| movu m15, [tmpq+stride3q -8] |
| %endif |
| |
| %if ARCH_X86_64 |
| TRANSPOSE_16X16B 0, [rsp+11*16] |
| mova [rsp+12*16], m1 |
| mova [rsp+13*16], m2 |
| mova [rsp+14*16], m3 |
| mova [rsp+15*16], m12 |
| mova [rsp+16*16], m13 |
| mova [rsp+17*16], m14 |
| mova [rsp+18*16], m15 |
| ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 |
| SWAP 12, 4, 7 |
| SWAP 13, 5, 8 |
| SWAP 3, 6, 9 |
| SWAP 10, 14 |
| SWAP 11, 15 |
| mova [rsp+21*16], m12 |
| %define %%p3mem [rsp+21*16] |
| mova m12, [rsp+20*16] |
| %else |
| TRANSPOSE_16X16B 0, [esp+16*16] |
| %define %%p3mem [esp+26*16] |
| %define m11 %%p3mem |
| %define m13 %%p2mem |
| %define m14 %%q2mem |
| %define m15 %%q3mem |
| %endif |
| %endif ; if 4 elif 6 or 8 else 16 |
| %endif ; if v else h |
| |
| ; load L/E/I/H |
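| ; Per 4px unit, with L the unit's filter level (falling back to the |
| ; previous edge's level when zero) and the two sharpness parameters |
| ; read from lut+128/lut+136: |
| ; I = max(min((L >> sharp0) & 63, sharp1), 1) |
| ; H = L >> 4 |
| ; E = 2 * (L + 2) + I |
| ; All three are xor'd with 0x80 so that later unsigned compares can |
| ; be done with the signed pcmpgtb. |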
| %if ARCH_X86_32 |
| mov l_strideq, l_stridem |
| %endif |
| %ifidn %2, v |
| movu m1, [lq] |
| movu m0, [lq+l_strideq] |
| %else |
| %if ARCH_X86_32 |
| lea l_stride3q, [l_strideq*3] |
| %endif |
| movq xm1, [lq] |
| movq xm2, [lq+l_strideq*2] |
| movhps xm1, [lq+l_strideq] |
| movhps xm2, [lq+l_stride3q] |
| shufps m0, m1, m2, q3131 |
| shufps m1, m2, q2020 |
| %if ARCH_X86_32 |
| lea stride3q, [strideq*3] |
| %endif |
| %endif |
| |
| %if ARCH_X86_32 |
| %ifidn %2, v |
| mov lutd, lutm |
| %endif |
| %endif |
| pxor m2, m2 |
| pcmpeqb m7, m2, m0 |
| pand m1, m7 |
| por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] |
| pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1] |
| pcmpeqb m2, m0 ; !L |
| psrlq m7, m0, [lutq+128] |
| pand m7, [PIC_sym(pb_63)] |
| pminub m7, minlvl |
| pmaxub m7, [PIC_sym(pb_1)] ; I |
| pand m1, m0, [PIC_sym(pb_240)] |
| psrlq m1, 4 ; H |
| paddb m0, [PIC_sym(pb_2)] |
| paddb m0, m0 |
| paddb m0, m7 ; E |
| pxor m1, [PIC_sym(pb_128)] |
| pxor m7, [PIC_sym(pb_128)] |
| pxor m0, [PIC_sym(pb_128)] |
| SWAP 2, 7 |
| |
| %if ARCH_X86_64 |
| SWAP 0, 8 |
| SWAP 2, 10 |
| %else |
| %ifidn %2, v |
| mov mstrideq, strideq |
| neg mstrideq |
| %if %1 == 4 |
| lea tmpq, [dstq+mstrideq*2] |
| %elif %1 == 6 || %1 == 8 |
| lea tmpq, [dstq+mstrideq*4] |
| %endif |
| %endif |
| mova [esp+3*16], m0 |
| mova [esp+4*16], m2 |
| %endif |
| |
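| ; Decision masks, computed on the 0x80-biased copies: |
| ; fm: the p1/p0 and q1/q0 diffs (plus the p2/q2 and, for wd >= 8, |
| ; p3/q3 step diffs when wd > 4) <= I, and |
| ; 2*|p0-q0| + (|p1-q1| >> 1) <= E |
| ; hev: max(|p1-p0|, |q1-q0|) > H |
| ; flat8in (wd > 4): |p2-p0|, |q2-q0| (and |p3-p0|, |q3-q0| for |
| ; wd >= 8) <= 1 |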
| ABSSUB m0, m3, m4, m2 ; abs(p1-p0) |
| pmaxub m0, m7 |
| ABSSUB m2, m5, m6, m7 ; abs(q1-q0) |
| pmaxub m0, m2 |
| %if %1 == 4 |
| pxor m0, [PIC_sym(pb_128)] |
| pcmpgtb m7, m0, m1 ; hev |
| %if ARCH_X86_64 |
| SWAP 7, 11 |
| %else |
| mova [esp+5*16], m7 |
| %endif |
| %else |
| pxor m7, m0, [PIC_sym(pb_128)] |
| pcmpgtb m7, m1 ; hev |
| %if ARCH_X86_64 |
| SWAP 7, 11 |
| %else |
| mova [esp+5*16], m7 |
| %endif |
| |
| %if %1 == 6 |
| ABSSUB m1, m13, m4, m7 ; abs(p2-p0) |
| pmaxub m1, m0 |
| %else |
| mova m2, %%p3mem |
| ABSSUB m1, m2, m4, m7 ; abs(p3-p0) |
| pmaxub m1, m0 |
| ABSSUB m7, m13, m4, m2 ; abs(p2-p0) |
| pmaxub m1, m7 |
| %endif |
| ABSSUB m7, m5, m14, m2 ; abs(q2-q0) |
| pmaxub m1, m7 |
| %if %1 != 6 |
| ABSSUB m7, m5, m15, m2 ; abs(q3-q0) |
| pmaxub m1, m7 |
| %endif |
| pxor m1, [PIC_sym(pb_128)] |
| pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in |
| %if ARCH_X86_64 |
| SWAP 1, 9 |
| %else |
| mova [esp+6*16], m1 |
| %endif |
| |
| %if %1 == 6 |
| ABSSUB m7, m13, m3, m1 ; abs(p2-p1) |
| %else |
| mova m2, %%p3mem |
| ABSSUB m7, m2, m13, m1 ; abs(p3-p2) |
| ABSSUB m2, m13, m3, m1 ; abs(p2-p1) |
| pmaxub m7, m2 |
| ABSSUB m2, m14, m15, m1 ; abs(q3-q2) |
| pmaxub m7, m2 |
| %endif |
| ABSSUB m2, m14, m6, m1 ; abs(q2-q1) |
| pmaxub m7, m2 |
| %if ARCH_X86_32 |
| %define m12 m1 |
| mova m12, maskmem |
| %endif |
| pand m2, m12, mask1 |
| pcmpeqd m2, m12 |
| pand m7, m2 ; only apply fm-wide to wd>4 blocks |
| pmaxub m0, m7 |
| |
| pxor m0, [PIC_sym(pb_128)] |
| %endif ; %if %1 == 4 else |
| %if ARCH_X86_64 |
| SWAP 2, 10 |
| pcmpgtb m0, m2 |
| %else |
| pcmpgtb m0, [esp+4*16] |
| %endif |
| |
| ABSSUB m1, m3, m6, m7 ; abs(p1-q1) |
| ABSSUB m7, m4, m5, m2 ; abs(p0-q0) |
| paddusb m7, m7 |
| pand m1, [PIC_sym(pb_254)] |
| psrlq m1, 1 |
| paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1) |
| pxor m1, [PIC_sym(pb_128)] |
| %if ARCH_X86_64 |
| pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E |
| %else |
| pcmpgtb m1, [esp+3*16] |
| %endif |
| por m0, m1 |
| |
| %if %1 == 16 |
| %if ARCH_X86_64 |
| SWAP 0, 8 |
| %else |
| mova [esp+3*16], m0 |
| %endif |
| %ifidn %2, v |
| lea tmpq, [dstq+mstrideq*8] |
| mova m0, [tmpq+strideq*1] |
| %else |
| mova m0, [rsp+12*16] |
| %endif |
| ABSSUB m1, m0, m4, m2 |
| %ifidn %2, v |
| mova m0, [tmpq+strideq*2] |
| %else |
| mova m0, [rsp+13*16] |
| %endif |
| ABSSUB m2, m0, m4, m7 |
| pmaxub m1, m2 |
| %ifidn %2, v |
| mova m0, [tmpq+stride3q] |
| %else |
| mova m0, [rsp+14*16] |
| %endif |
| ABSSUB m2, m0, m4, m7 |
| pmaxub m1, m2 |
| %ifidn %2, v |
| lea tmpq, [dstq+strideq*4] |
| mova m0, [tmpq+strideq*0] |
| %else |
| mova m0, [rsp+15*16] |
| %endif |
| ABSSUB m2, m0, m5, m7 |
| pmaxub m1, m2 |
| %ifidn %2, v |
| mova m0, [tmpq+strideq*1] |
| %else |
| mova m0, [rsp+16*16] |
| %endif |
| ABSSUB m2, m0, m5, m7 |
| pmaxub m1, m2 |
| %ifidn %2, v |
| mova m0, [tmpq+strideq*2] |
| %else |
| mova m0, [rsp+17*16] |
| %endif |
| ABSSUB m2, m0, m5, m7 |
| pmaxub m1, m2 |
| pxor m1, [PIC_sym(pb_128)] |
| pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out |
| %if ARCH_X86_64 |
| por m1, m9 ; !flat8in | !flat8out |
| %else |
| por m1, [esp+6*16] |
| %define m12 m7 |
| mova m12, maskmem |
| %endif |
| pand m2, m12, mask2 |
| pcmpeqd m2, m12 |
| pandn m1, m2 ; flat16 |
| %if ARCH_X86_64 |
| pandn m2, m8, m1 ; flat16 & fm |
| %else |
| pandn m2, [esp+3*16], m1 ; flat16 & fm |
| mova %%flat16mem, m2 |
| %endif |
| SWAP 1, 2 |
| |
| pand m2, m12, mask1 |
| pcmpeqd m2, m12 |
| %if ARCH_X86_64 |
| pandn m9, m2 ; flat8in |
| pandn m2, m8, m9 |
| SWAP 2, 9 |
| %else |
| pandn m0, [esp+6*16], m2 |
| pandn m2, [esp+3*16], m0 |
| mova [esp+6*16], m2 |
| %endif |
| pand m2, m12, mask0 |
| pcmpeqd m2, m12 |
| %if ARCH_X86_64 |
| pandn m8, m2 |
| pandn m2, m9, m8 ; fm & !flat8 & !flat16 |
| SWAP 2, 8 |
| pandn m2, m1, m9 ; flat8 & !flat16 |
| SWAP 2, 9 |
| SWAP 0, 8 |
| SWAP 1, 10 |
| %else |
| pandn m0, [esp+3*16], m2 |
| pandn m2, [esp+6*16], m0 |
| SWAP 2, 0 |
| pandn m2, m1, [esp+6*16] |
| mova %%flat8mem, m2 |
| %endif |
| %elif %1 != 4 |
| %if ARCH_X86_64 |
| SWAP 1, 9 |
| %else |
| %define m12 m7 |
| mova m12, maskmem |
| mova m1, [esp+6*16] |
| %endif |
| pand m2, m12, mask1 |
| pcmpeqd m2, m12 |
| pandn m1, m2 |
| pandn m2, m0, m1 ; flat8 & fm |
| pand m1, m12, mask0 |
| pcmpeqd m1, m12 |
| pandn m0, m1 |
| pandn m1, m2, m0 ; fm & !flat8 |
| SWAP 1, 2, 0 |
| %if ARCH_X86_64 |
| SWAP 1, 9 |
| %else |
| mova %%flat8mem, m1 |
| %endif |
| %else |
| %if ARCH_X86_32 |
| %define m12 m1 |
| mova m12, maskmem |
| %endif |
| pand m2, m12, mask0 |
| pcmpeqd m2, m12 |
| pandn m0, m2 ; fm |
| %endif |
| |
| ; short filter |
| |
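| ; The pixels were xor'd with 0x80, mapping unsigned 0..255 onto |
| ; signed -128..127 so that psubsb/paddsb implement the spec's |
| ; iclip_diff() saturation directly. f1 = (f + 4) >> 3 and |
| ; f2 = (f + 3) >> 3 are formed branchlessly: pand with 0xf8 followed |
| ; by psrlq 3 is a packed unsigned byte shift (the mask stops bits |
| ; bleeding across byte lanes), and the xor/sub with 16 sign-extends |
| ; the 5-bit result back to a signed byte. |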
| mova m1, [PIC_sym(pb_128)] |
| %if ARCH_X86_64 |
| SWAP 7, 11 |
| %else |
| mova m7, [esp+5*16] |
| %endif |
| pxor m3, m1 |
| pxor m6, m1 |
| pxor m4, m1 |
| pxor m5, m1 |
| psubsb m1, m3, m6 ; iclip_diff(p1-q1) |
| pand m1, m7 ; f=iclip_diff(p1-q1)&hev |
| psubsb m2, m5, m4 |
| paddsb m1, m2 |
| paddsb m1, m2 |
| paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f) |
| mova m2, [PIC_sym(pb_16)] |
| pand m0, m1 ; f&=fm |
| paddsb m1, m0, [PIC_sym(pb_3)] |
| paddsb m0, [PIC_sym(pb_4)] |
| pand m1, [PIC_sym(pb_248)] |
| pand m0, [PIC_sym(pb_248)] |
| psrlq m1, 3 |
| psrlq m0, 3 |
| pxor m1, m2 |
| pxor m0, m2 |
| psubb m1, m2 ; f2 |
| psubb m0, m2 ; f1 |
| mova m2, [PIC_sym(pb_128)] |
| paddsb m4, m1 |
| psubsb m5, m0 |
| pxor m4, m2 |
| pxor m5, m2 |
| |
| pxor m0, m2 |
| pxor m1, m1 |
| pavgb m0, m1 ; f=(f1+1)>>1 |
| psubb m0, [PIC_sym(pb_64)] |
| pandn m7, m0 ; f&=!hev |
| paddsb m3, m7 |
| psubsb m6, m7 |
| pxor m3, m2 |
| pxor m6, m2 |
| |
| %if %1 == 16 |
| ; flat16 filter |
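| ; wd=16: a 13-tap smoother (p6..q6) whose tap weights always sum to |
| ; 16. The six p and six q outputs share one sliding sum, held as |
| ; words in m10 (low 8 pixels) and m11 (high 8): each step A..L uses |
| ; pmaddubsw to add/subtract only the byte pairs entering/leaving the |
| ; window, and pmulhrsw with pw_2048 yields the rounded (sum + 8) >> 4. |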
| %ifidn %2, v |
| lea tmpq, [dstq+mstrideq*8] |
| mova m0, [tmpq+strideq*1] ; p6 |
| mova m2, [tmpq+strideq*2] ; p5 |
| mova m7, [tmpq+stride3q] ; p4 |
| %else |
| mova m0, [rsp+12*16] |
| mova m2, [rsp+13*16] |
| mova m7, [rsp+14*16] |
| %endif |
| |
| %if ARCH_X86_64 |
| SWAP 1, 10 |
| mova %%flat8mem, m9 |
| mova %%q2mem, m14 |
| mova %%q3mem, m15 |
| SWAP 0, 8 |
| SWAP 1, 9 |
| %else |
| %ifidn %2, v |
| mova [esp+17*16], m0 |
| mova [esp+19*16], m3 |
| mova [esp+21*16], m4 |
| mova [esp+22*16], m5 |
| mova [esp+23*16], m6 |
| %xdefine m11 m3 |
| %xdefine m14 m4 |
| %xdefine m15 m5 |
| %xdefine m10 m6 |
| %define m13 %%p2mem |
| %define m8 [esp+17*16] |
| %define m9 %%flat16mem |
| %define m3 [esp+19*16] |
| %define m4 [esp+21*16] |
| %define m5 [esp+22*16] |
| %define m6 [esp+23*16] |
| %else |
| mova [esp+31*16], m0 |
| mova [esp+32*16], m3 |
| mova [esp+33*16], m4 |
| mova [esp+34*16], m5 |
| mova [esp+35*16], m6 |
| %xdefine m11 m3 |
| %xdefine m14 m4 |
| %xdefine m15 m5 |
| %xdefine m10 m6 |
| %define m13 %%p2mem |
| %define m8 [esp+31*16] |
| %define m9 %%flat16mem |
| %define m3 [esp+32*16] |
| %define m4 [esp+33*16] |
| %define m5 [esp+34*16] |
| %define m6 [esp+35*16] |
| %endif |
| %endif |
| |
| ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A |
| ; write -6 |
| mova m11, %%p3mem |
| %if ARCH_X86_64 |
| punpcklbw m14, m8, m11 |
| punpckhbw m15, m8, m11 |
| %else |
| punpcklbw m14, m0, m11 |
| punpckhbw m15, m0, m11 |
| %endif |
| %ifidn %2, v |
| mova [rsp+5*16], m11 |
| %endif |
| pmaddubsw m10, m14, [PIC_sym(pb_7_1)] |
| pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3 |
| punpcklbw m0, m2, m7 |
| punpckhbw m1, m2, m7 |
| pmaddubsw m0, [PIC_sym(pb_2)] |
| pmaddubsw m1, [PIC_sym(pb_2)] |
| paddw m10, m0 |
| paddw m11, m1 ; p6*7+p5*2+p4*2+p3 |
| punpcklbw m0, m13, m3 |
| punpckhbw m1, m13, m3 |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| pmaddubsw m1, [PIC_sym(pb_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1 |
| punpcklbw m0, m4, m5 |
| punpckhbw m1, m4, m5 |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| pmaddubsw m1, [PIC_sym(pb_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 |
| pmulhrsw m0, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m2 |
| por m0, m1 |
| %ifidn %2, v |
| mova [tmpq+strideq*2], m0 ; p5 |
| %else |
| mova [rsp+13*16], m0 |
| %endif |
| |
| ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B |
| ; write -5 |
| pmaddubsw m14, [PIC_sym(pb_m1_1)] |
| pmaddubsw m15, [PIC_sym(pb_m1_1)] |
| paddw m10, m14 |
| paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 |
| punpcklbw m0, m8, m6 |
| punpckhbw m1, m8, m6 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| mova [rsp+3*16], m0 |
| mova [rsp+4*16], m1 |
| paddw m10, m0 |
| paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 |
| pmulhrsw m0, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m7 |
| por m0, m1 |
| %ifidn %2, v |
| mova [tmpq+stride3q], m0 ; p4 |
| %else |
| mova [rsp+14*16], m0 |
| %endif |
| |
| ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C |
| ; write -4 |
| mova m14, %%q2mem |
| punpcklbw m0, m8, m13 |
| punpckhbw m1, m8, m13 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 |
| punpcklbw m0, m2, m14 |
| punpckhbw m2, m14 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m2, [PIC_sym(pb_m1_1)] |
| mova [rsp+1*16], m0 |
| paddw m10, m0 |
| paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 |
| pmulhrsw m0, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, %%p3mem |
| por m0, m1 |
| %ifidn %2, v |
| mova [tmpq+strideq*4], m0 ; p3 |
| %else |
| mova [rsp+19*16], m0 |
| %endif |
| |
| ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D |
| ; write -3 |
| mova m15, %%q3mem |
| punpcklbw m0, m8, m3 |
| punpckhbw m1, m8, m3 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 |
| punpcklbw m0, m7, m15 |
| punpckhbw m7, m15 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m7, [PIC_sym(pb_m1_1)] |
| mova [rsp+2*16], m0 |
| %if ARCH_X86_32 |
| %ifidn %2, v |
| mova [esp+24*16], m7 |
| %else |
| mova [esp+36*16], m7 |
| %endif |
| %endif |
| paddw m10, m0 |
| paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 |
| pmulhrsw m0, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m13 |
| por m0, m1 |
| mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F |
| |
| ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E |
| ; write -2 |
| punpcklbw m0, m8, m4 |
| punpckhbw m1, m8, m4 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 |
| %if ARCH_X86_64 |
| SWAP 7, 8 |
| %endif |
| %ifidn %2, v |
| mova m1, [dstq+strideq*4] ; q4 |
| mova m7, [rsp+5*16] ; (pre-filter) p3 |
| %else |
| mova m1, [rsp+15*16] |
| mova m7, %%p3mem ; (pre-filter) p3 |
| %endif |
| punpcklbw m0, m1, m7 |
| punpckhbw m1, m1, m7 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| mova [rsp+7*16], m0 |
| mova [rsp+5*16], m1 |
| psubw m10, m0 |
| psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 |
| pmulhrsw m0, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m3 |
| por m0, m1 |
| mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G |
| |
| ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F |
| ; write -1 |
| %ifidn %2, v |
| mova m7, [tmpq+strideq*1] ; p6 |
| lea tmpq, [dstq+strideq*4] |
| mova m1, [tmpq+strideq*1] ; q5 |
| %else |
| mova m7, [rsp+12*16] ; p6 |
| mova m1, [rsp+16*16] |
| %endif |
| punpcklbw m0, m7, m5 |
| punpckhbw m7, m5 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m7, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 |
| punpcklbw m7, m13, m1 |
| pmaddubsw m7, [PIC_sym(pb_m1_1)] |
| mova [rsp+9*16], m7 |
| paddw m10, m7 |
| %if ARCH_X86_64 |
| punpckhbw m13, m1 |
| mova m1, [rsp+6*16] |
| SWAP 1, 13 |
| %else |
| punpckhbw m7, m13, m1 |
| mova m1, [esp+6*16] |
| mova m13, m1 |
| SWAP 1, 7 |
| %endif |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| mova [rsp+10*16], m1 |
| paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 |
| pmulhrsw m7, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m0, m11, [PIC_sym(pw_2048)] |
| packuswb m7, m0 |
| pand m7, m9 |
| pandn m0, m9, m4 |
| por m7, m0 |
| mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H |
| |
| ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G |
| ; write +0 |
| %ifidn %2, v |
| mova m7, [tmpq+strideq*2] ; q6 |
| %else |
| mova m7, [rsp+17*16] |
| %endif |
| paddw m10, [rsp+3*16] |
| paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 |
| punpcklbw m0, m3, m7 |
| punpckhbw m1, m3, m7 |
| %if ARCH_X86_64 |
| mova m3, [rsp+8*16] |
| %endif |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| mova [rsp+3*16], m0 |
| mova [rsp+4*16], m1 |
| paddw m10, m0 |
| paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 |
| pmulhrsw m0, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m5 |
| por m0, m1 |
| %if ARCH_X86_32 |
| mova m1, [esp+8*16] |
| mova m3, m1 |
| %endif |
| mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I |
| |
| ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H |
| ; write +1 |
| paddw m10, [rsp+1*16] |
| paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 |
| punpcklbw m0, m4, m7 |
| punpckhbw m2, m4, m7 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m2, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 |
| %if ARCH_X86_64 |
| mova m4, [rsp+6*16] |
| %else |
| %define m4 [esp+6*16] |
| %endif |
| pmulhrsw m2, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m2, m1 |
| pand m2, m9 |
| pandn m1, m9, m6 |
| por m2, m1 ; don't clobber q1/m6 since we need it in J |
| |
| ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I |
| ; write +2 |
| paddw m10, [rsp+2*16] |
| %if ARCH_X86_64 |
| SWAP 7, 8 |
| paddw m11, m7 |
| %else |
| mova m8, m7 |
| %ifidn %2, v |
| paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 |
| %else |
| paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 |
| %endif |
| %endif |
| punpcklbw m0, m5, m8 |
| punpckhbw m1, m5, m8 |
| %if ARCH_X86_64 |
| mova m5, [rsp+8*16] |
| %else |
| %define m5 [esp+8*16] |
| %endif |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 |
| pmulhrsw m7, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m7, m1 |
| pand m7, m9 |
| pandn m1, m9, m14 |
| por m7, m1 ; don't clobber q2/m14 since we need it in K |
| |
| ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J |
| ; write +3 |
| psubw m10, [rsp+7*16] |
| psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 |
| punpcklbw m0, m6, m8 |
| punpckhbw m1, m6, m8 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 |
| pmulhrsw m0, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m15 |
| por m0, m1 |
| %ifidn %2, v |
| mova [tmpq+mstrideq], m0 ; q3 |
| %else |
| mova [rsp+20*16], m0 |
| %endif |
| |
| ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K |
| ; write +4 |
| paddw m10, [rsp+ 9*16] |
| paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 |
| punpcklbw m0, m14, m8 |
| punpckhbw m1, m14, m8 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 |
| pmulhrsw m0, m10, [PIC_sym(pw_2048)] |
| pmulhrsw m1, m11, [PIC_sym(pw_2048)] |
| packuswb m0, m1 |
| pand m0, m9 |
| %ifidn %2, v |
| pandn m1, m9, [tmpq+strideq*0] |
| %else |
| pandn m1, m9, [rsp+15*16] |
| %endif |
| por m0, m1 |
| %ifidn %2, v |
| mova [tmpq+strideq*0], m0 ; q4 |
| %else |
| mova [rsp+15*16], m0 |
| %endif |
| |
| ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L |
| ; write +5 |
| paddw m10, [rsp+3*16] |
| paddw m11, [rsp+4*16] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6 |
| punpcklbw m0, m15, m8 |
| punpckhbw m1, m15, m8 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m10, m0 |
| paddw m11, m1 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7 |
| pmulhrsw m10, [PIC_sym(pw_2048)] |
| pmulhrsw m11, [PIC_sym(pw_2048)] |
| packuswb m10, m11 |
| pand m10, m9 |
| %ifidn %2, v |
| pandn m11, m9, [tmpq+strideq*1] |
| %else |
| pandn m11, m9, [rsp+16*16] |
| %endif |
| por m10, m11 |
| %ifidn %2, v |
| mova [tmpq+strideq*1], m10 ; q5 |
| %else |
| mova [rsp+16*16], m10 |
| %endif |
| |
| %if ARCH_X86_64 |
| SWAP 0, 8 |
| SWAP 1, 9 |
| SWAP 14, 7 |
| %else |
| %xdefine m3 m11 |
| %xdefine m4 m14 |
| %xdefine m5 m15 |
| %xdefine m6 m10 |
| mova %%q2mem, m7 |
| %ifidn %2, v |
| mova m3, [esp+19*16] |
| %else |
| mova m3, [esp+32*16] |
| %endif |
| mova m4, [esp+ 6*16] |
| mova m5, [esp+ 8*16] |
| %endif |
| SWAP m6, m2 |
| |
| %if ARCH_X86_64 |
| mova m9, %%flat8mem |
| %endif |
| %ifidn %2, v |
| lea tmpq, [dstq+mstrideq*4] |
| %endif |
| %endif ; if %1 == 16 |
| %if %1 >= 8 |
| ; flat8 filter |
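| ; wd=8: 7-tap smoother (p3..q3), weights summing to 8, e.g. |
| ; p2' = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3 |
| ; p1' = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3 |
| ; The +4 rounding term is folded in up front (via the pb_4 |
| ; interleave), so each output is a plain psrlw by 3 of the running |
| ; sums in m2/m7. |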
| %if ARCH_X86_32 |
| %define m9 %%flat8mem |
| %define m11 m1 |
| %define m13 %%p2mem |
| %define m14 %%q2mem |
| %define m15 %%q3mem |
| %endif |
| mova m11, %%p3mem |
| punpcklbw m0, m11, m3 |
| punpcklbw m7, m13, m4 |
| pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 |
| pmaddubsw m7, [PIC_sym(pb_2_1)] |
| paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 |
| punpcklbw m7, m5, [PIC_sym(pb_4)] |
| pmaddubsw m7, [PIC_sym(pb_1)] |
| paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 |
| punpckhbw m1, m11, m3 |
| pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 |
| punpckhbw m0, m13, m4 |
| pmaddubsw m0, [PIC_sym(pb_2_1)] |
| paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 |
| punpckhbw m0, m5, [PIC_sym(pb_4)] |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 |
| psrlw m0, m2, 3 |
| psrlw m1, m7, 3 |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m13 |
| por m0, m1 ; p2 |
| %ifidn %2, v |
| mova [tmpq+strideq*1], m0 |
| %else |
| %if ARCH_X86_64 |
| SWAP 0, 10 |
| %else |
| mova [esp+2*16], m0 |
| %endif |
| %endif |
| |
| %if ARCH_X86_32 |
| mova m11, %%p3mem |
| %endif |
| punpcklbw m0, m11, m3 |
| punpckhbw m1, m11, m3 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m2, m0 |
| paddw m7, m1 |
| punpcklbw m0, m13, m6 |
| punpckhbw m1, m13, m6 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m2, m0 |
| paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 |
| psrlw m0, m2, 3 |
| psrlw m1, m7, 3 |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m3 |
| por m0, m1 ; p1 |
| %ifidn %2, v |
| mova [tmpq+strideq*2], m0 |
| %else |
| mova [rsp+0*16], m0 |
| %endif |
| |
| %if ARCH_X86_32 |
| mova m11, %%p3mem |
| %endif |
| punpcklbw m0, m11, m3 |
| punpckhbw m1, m11, m3 |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| pmaddubsw m1, [PIC_sym(pb_1)] |
| psubw m2, m0 |
| psubw m7, m1 |
| punpcklbw m0, m4, m14 |
| punpckhbw m1, m4, m14 |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| pmaddubsw m1, [PIC_sym(pb_1)] |
| paddw m2, m0 |
| paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 |
| psrlw m0, m2, 3 |
| psrlw m1, m7, 3 |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m4 |
| por m0, m1 ; p0 |
| %ifidn %2, v |
| mova [tmpq+stride3q], m0 |
| %else |
| mova [rsp+1*16], m0 |
| %endif |
| |
| punpcklbw m0, m5, m15 |
| punpckhbw m1, m5, m15 |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| pmaddubsw m1, [PIC_sym(pb_1)] |
| paddw m2, m0 |
| paddw m7, m1 |
| %if ARCH_X86_32 |
| mova m11, %%p3mem |
| %endif |
| punpcklbw m0, m11, m4 |
| punpckhbw m11, m11, m4 |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| pmaddubsw m11, [PIC_sym(pb_1)] |
| psubw m2, m0 |
| psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 |
| psrlw m0, m2, 3 |
| psrlw m11, m7, 3 |
| packuswb m0, m11 |
| pand m0, m9 |
| pandn m11, m9, m5 |
| por m11, m0 ; q0 |
| %ifidn %2, v |
| mova [dstq+strideq*0], m11 |
| %elif ARCH_X86_32 |
| mova [esp+8*16], m11 |
| %endif |
| |
| punpcklbw m0, m5, m15 |
| punpckhbw m1, m5, m15 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m2, m0 |
| paddw m7, m1 |
| punpcklbw m0, m13, m6 |
| punpckhbw m1, m13, m6 |
| pmaddubsw m0, [PIC_sym(pb_m1_1)] |
| pmaddubsw m1, [PIC_sym(pb_m1_1)] |
| paddw m2, m0 |
| paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 |
| psrlw m0, m2, 3 |
| psrlw m1, m7, 3 |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m6 |
| por m0, m1 ; q1 |
| %ifidn %2, v |
| mova [dstq+strideq*1], m0 |
| %else |
| %if ARCH_X86_64 |
| SWAP 0, 13 |
| %else |
| mova [esp+9*16], m0 |
| %endif |
| %endif |
| |
| punpcklbw m0, m3, m6 |
| punpckhbw m1, m3, m6 |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| pmaddubsw m1, [PIC_sym(pb_1)] |
| psubw m2, m0 |
| psubw m7, m1 |
| punpcklbw m0, m14, m15 |
| punpckhbw m1, m14, m15 |
| pmaddubsw m0, [PIC_sym(pb_1)] |
| pmaddubsw m1, [PIC_sym(pb_1)] |
| paddw m2, m0 |
| paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4 |
| psrlw m2, 3 |
| psrlw m7, 3 |
| packuswb m2, m7 |
| pand m2, m9 |
| pandn m7, m9, m14 |
| por m2, m7 ; q2 |
| %ifidn %2, v |
| mova [dstq+strideq*2], m2 |
| %else |
| mova m0, [rsp+0*16] |
| %if %1 == 8 |
| mova m1, [rsp+1*16] |
| mova m4, %%p3mem |
| |
| %if ARCH_X86_32 |
| %define m10 [esp+2*16] |
| %define m11 [esp+8*16] |
| %define m13 [esp+9*16] |
| %endif |
| |
| ; 16x8 transpose |
| punpcklbw m3, m4, m10 |
| punpckhbw m4, m10 |
| punpcklbw m5, m0, m1 |
| punpckhbw m0, m1 |
| punpcklbw m1, m11, m13 |
| punpckhbw m6, m11, m13 |
| punpcklbw m7, m2, m15 |
| punpckhbw m2, m15 |
| %if ARCH_X86_64 |
| SWAP 2, 15 |
| %else |
| mova m15, m2 |
| %endif |
| |
| punpcklwd m2, m3, m5 |
| punpckhwd m3, m5 |
| punpcklwd m5, m4, m0 |
| punpckhwd m4, m0 |
| punpcklwd m0, m1, m7 |
| punpckhwd m1, m7 |
| punpcklwd m7, m6, m15 |
| punpckhwd m6, m15 |
| %if ARCH_X86_64 |
| SWAP 6, 15 |
| %else |
| mova m15, m6 |
| %endif |
| |
| punpckldq m6, m2, m0 |
| punpckhdq m2, m0 |
| punpckldq m0, m3, m1 |
| punpckhdq m3, m1 |
| punpckldq m1, m5, m7 |
| punpckhdq m5, m7 |
| punpckldq m7, m4, m15 |
| punpckhdq m4, m15 |
| |
| ; write 8x16 |
| movq [dstq+strideq*0-4], xm6 |
| movhps [dstq+strideq*1-4], xm6 |
| movq [dstq+strideq*2-4], xm2 |
| movhps [dstq+stride3q -4], xm2 |
| lea dstq, [dstq+strideq*4] |
| movq [dstq+strideq*0-4], xm0 |
| movhps [dstq+strideq*1-4], xm0 |
| movq [dstq+strideq*2-4], xm3 |
| movhps [dstq+stride3q -4], xm3 |
| lea dstq, [dstq+strideq*4] |
| movq [dstq+strideq*0-4], xm1 |
| movhps [dstq+strideq*1-4], xm1 |
| movq [dstq+strideq*2-4], xm5 |
| movhps [dstq+stride3q -4], xm5 |
| lea dstq, [dstq+strideq*4] |
| movq [dstq+strideq*0-4], xm7 |
| movhps [dstq+strideq*1-4], xm7 |
| movq [dstq+strideq*2-4], xm4 |
| movhps [dstq+stride3q -4], xm4 |
| lea dstq, [dstq+strideq*4] |
| %else |
| ; 16x16 transpose and store |
| SWAP 6, 0 |
| SWAP 7, 1 |
| %if ARCH_X86_64 |
| SWAP 5, 10, 2 |
| SWAP 8, 11 |
| SWAP 9, 13 |
| mova [rsp+21*16], m12 |
| %else |
| mova [esp+10*16], m2 |
| %xdefine m8 m0 |
| %xdefine m9 m1 |
| %xdefine m10 m2 |
| %xdefine m11 m3 |
| %xdefine m12 m4 |
| %xdefine m13 m5 |
| %xdefine m14 m6 |
| %xdefine m15 m7 |
| %endif |
| mova m0, [rsp+11*16] |
| mova m1, [rsp+12*16] |
| mova m2, [rsp+13*16] |
| mova m3, [rsp+14*16] |
| mova m4, [rsp+19*16] |
| %if ARCH_X86_64 |
| mova m7, [rsp+ 1*16] |
| mova m11, [rsp+20*16] |
| mova m12, [rsp+15*16] |
| mova m13, [rsp+16*16] |
| mova m14, [rsp+17*16] |
| TRANSPOSE_16X16B 1, [rsp+18*16] |
| %else |
| mova m5, [esp+ 2*16] |
| TRANSPOSE_16X16B 1, [esp+32*16] |
| mov tmpq, dstq |
| lea dstq, [dstq+strideq*8] |
| %endif |
| movu [dstq+strideq*0-8], xm0 |
| movu [dstq+strideq*1-8], xm1 |
| movu [dstq+strideq*2-8], xm2 |
| movu [dstq+stride3q -8], xm3 |
| lea dstq, [dstq+strideq*4] |
| movu [dstq+strideq*0-8], xm4 |
| movu [dstq+strideq*1-8], xm5 |
| movu [dstq+strideq*2-8], xm6 |
| movu [dstq+stride3q -8], xm7 |
| %if ARCH_X86_64 |
| lea dstq, [dstq+strideq*4] |
| %else |
| %xdefine m8 m0 |
| %xdefine m9 m1 |
| %xdefine m10 m2 |
| %xdefine m11 m3 |
| %xdefine m12 m4 |
| %xdefine m13 m5 |
| %xdefine m14 m6 |
| %xdefine m15 m7 |
| mova m8, [esp+11*16] |
| mova m9, [esp+12*16] |
| mova m10, [esp+13*16] |
| mova m11, [esp+14*16] |
| mova m12, [esp+26*16] |
| mova m13, [esp+27*16] |
| mova m14, [esp+ 0*16] |
| mova m15, [esp+ 1*16] |
| mov dstq, tmpq |
| %endif |
| movu [dstq+strideq*0-8], xm8 |
| movu [dstq+strideq*1-8], xm9 |
| movu [dstq+strideq*2-8], xm10 |
| movu [dstq+stride3q -8], xm11 |
| lea dstq, [dstq+strideq*4] |
| movu [dstq+strideq*0-8], xm12 |
| movu [dstq+strideq*1-8], xm13 |
| movu [dstq+strideq*2-8], xm14 |
| movu [dstq+stride3q -8], xm15 |
| lea dstq, [dstq+strideq*4] |
| %if ARCH_X86_32 |
| lea dstq, [dstq+strideq*8] |
| %else |
| mova m12, [rsp+21*16] |
| %endif |
| |
| %endif ; if %1 == 8 |
| %endif ; ifidn %2, v |
| %elif %1 == 6 |
| ; flat6 filter |
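| ; wd=6 (chroma): 5-tap smoother (p2..q2), weights summing to 8: |
| ; p1' = (3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3 |
| ; p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3 |
| ; q0' = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3 |
| ; q1' = (p0 + 2 * q0 + 2 * q1 + 3 * q2 + 4) >> 3 |
| ; with the same running-sum scheme; pmulhrsw with pw_4096 performs |
| ; the rounded (sum + 4) >> 3. |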
| %if ARCH_X86_32 |
| mova [esp+3*16], m3 |
| mova [esp+4*16], m4 |
| mova [esp+5*16], m5 |
| mova [esp+6*16], m6 |
| %xdefine m8 m3 |
| %xdefine m10 m4 |
| %xdefine m11 m5 |
| %xdefine m15 m6 |
| %define m3 [esp+3*16] |
| %define m4 [esp+4*16] |
| %define m5 [esp+5*16] |
| %define m6 [esp+6*16] |
| %define m9 %%flat8mem |
| %define m13 %%p2mem |
| %define m14 %%q2mem |
| %endif |
| |
| punpcklbw m8, m13, m5 |
| punpckhbw m11, m13, m5 |
| pmaddubsw m0, m8, [PIC_sym(pb_3_1)] |
| pmaddubsw m1, m11, [PIC_sym(pb_3_1)] |
| punpcklbw m7, m4, m3 |
| punpckhbw m10, m4, m3 |
| pmaddubsw m2, m7, [PIC_sym(pb_2)] |
| pmaddubsw m15, m10, [PIC_sym(pb_2)] |
| paddw m0, m2 |
| paddw m1, m15 |
| pmulhrsw m2, m0, [PIC_sym(pw_4096)] |
| pmulhrsw m15, m1, [PIC_sym(pw_4096)] |
| packuswb m2, m15 |
| pand m2, m9 |
| pandn m15, m9, m3 |
| por m2, m15 |
| %ifidn %2, v |
| mova [tmpq+strideq*2], m2 ; p1 |
| %elif ARCH_X86_32 |
| mova [esp+11*16], m2 |
| %endif |
| |
| pmaddubsw m8, [PIC_sym(pb_m1_1)] |
| pmaddubsw m11, [PIC_sym(pb_m1_1)] |
| paddw m0, m8 |
| paddw m1, m11 |
| punpcklbw m8, m13, m6 |
| punpckhbw m11, m13, m6 |
| %if ARCH_X86_64 |
| SWAP 2, 13 |
| %endif |
| pmaddubsw m8, [PIC_sym(pb_m1_1)] |
| pmaddubsw m11, [PIC_sym(pb_m1_1)] |
| paddw m0, m8 |
| paddw m1, m11 |
| pmulhrsw m2, m0, [PIC_sym(pw_4096)] |
| pmulhrsw m15, m1, [PIC_sym(pw_4096)] |
| packuswb m2, m15 |
| pand m2, m9 |
| pandn m15, m9, m4 |
| por m2, m15 |
| %ifidn %2, v |
| mova [tmpq+stride3q], m2 ; p0 |
| %elif ARCH_X86_32 |
| mova [esp+8*16], m2 |
| %endif |
| |
| paddw m0, m8 |
| paddw m1, m11 |
| punpcklbw m8, m3, m14 |
| punpckhbw m11, m3, m14 |
| %if ARCH_X86_64 |
| SWAP 2, 14 |
| %endif |
| pmaddubsw m2, m8, [PIC_sym(pb_m1_1)] |
| pmaddubsw m15, m11, [PIC_sym(pb_m1_1)] |
| paddw m0, m2 |
| paddw m1, m15 |
| pmulhrsw m2, m0, [PIC_sym(pw_4096)] |
| pmulhrsw m15, m1, [PIC_sym(pw_4096)] |
| packuswb m2, m15 |
| pand m2, m9 |
| pandn m15, m9, m5 |
| por m2, m15 |
| %ifidn %2, v |
| mova [dstq+strideq*0], m2 ; q0 |
| %endif |
| |
| pmaddubsw m8, [PIC_sym(pb_m1_2)] |
| pmaddubsw m11, [PIC_sym(pb_m1_2)] |
| paddw m0, m8 |
| paddw m1, m11 |
| pmaddubsw m7, [PIC_sym(pb_m1_0)] |
| pmaddubsw m10, [PIC_sym(pb_m1_0)] |
| paddw m0, m7 |
| paddw m1, m10 |
| pmulhrsw m0, [PIC_sym(pw_4096)] |
| pmulhrsw m1, [PIC_sym(pw_4096)] |
| packuswb m0, m1 |
| pand m0, m9 |
| pandn m1, m9, m6 |
| por m0, m1 |
| %if ARCH_X86_32 |
| %xdefine m3 m8 |
| %xdefine m4 m10 |
| %xdefine m5 m11 |
| %xdefine m6 m15 |
| %endif |
| %ifidn %2, v |
| mova [dstq+strideq*1], m0 ; q1 |
| %else |
| %if ARCH_X86_64 |
| SWAP 3, 13 |
| SWAP 4, 14 |
| %else |
| mova m3, [esp+11*16] |
| mova m4, [esp+ 8*16] |
| %endif |
| SWAP 5, 2 |
| SWAP 6, 0 |
| TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 |
| %endif |
| %else ; if %1 == 4 |
| %ifidn %2, v |
| mova [tmpq+strideq*0], m3 ; p1 |
| mova [tmpq+strideq*1], m4 ; p0 |
| mova [tmpq+strideq*2], m5 ; q0 |
| mova [tmpq+stride3q ], m6 ; q1 |
| %else |
| TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 |
| %endif |
| %endif |
| %if ARCH_X86_32 |
| %define m12 m12reg |
| %endif |
| %endmacro |
| |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| ;; 32-bit PIC helpers ;; |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
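| ; x86-32 has no RIP-relative addressing, so a base register is |
| ; pointed at $$ (the section start) and PIC_sym(sym) rebases each |
| ; constant against it. The base shares r2 with maskq: XCHG_PIC_REG 0 |
| ; spills the base and reloads the mask pointer, XCHG_PIC_REG 1 |
| ; restores the base. |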
| %if ARCH_X86_32 |
| %define PIC_base_offset $$ |
| |
| %macro SETUP_PIC 0 ; PIC_reg |
| %define PIC_reg r2 |
| %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4) |
| LEA PIC_reg, $$ |
| %endmacro |
| |
| %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base |
| %if %1 == 0 |
| mov [esp+PIC_reg_stk_offset], PIC_reg |
| mov PIC_reg, maskm |
| %else |
| mov PIC_reg, [esp+PIC_reg_stk_offset] |
| %endif |
| %endmacro |
| |
| %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) |
| |
| %else |
| %macro XCHG_PIC_REG 1 |
| %endmacro |
| %define PIC_sym(sym) (sym) |
| %endif |
| |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| %if ARCH_X86_32 |
| %if STACK_ALIGNMENT < required_stack_alignment |
| %assign copy_args 1 |
| %else |
| %assign copy_args 0 |
| %endif |
| %endif |
| |
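| ; If the incoming stack alignment is insufficient (copy_args, set |
| ; above), x86inc realigns esp and the caller's argument slots can no |
| ; longer be addressed through it, so RELOC_ARGS copies the mask and |
| ; lut pointers plus the w/h count into the realigned frame. |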
| %macro RELOC_ARGS 1 |
| %if copy_args |
| %define maskm [esp+stack_size-gprsize*1] |
| %define l_stridem [esp+stack_size-gprsize*2] |
| %define lutm [esp+stack_size-gprsize*3] |
| %define %1m [esp+stack_size-gprsize*4] |
| mov r6d, r6m |
| mov maskm, maskd |
| mov lutm, lutd |
| mov %1m, r6d |
| %else |
| %define %1m r6m |
| %endif |
| %endmacro |
| |
| %if ARCH_X86_32 |
| %define tmpq r4 |
| %define mstrideq r5 |
| %define stride3q r6 |
| %define l_stride3q r6 |
| %endif |
| |
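| ; Entry points. lpf_{v,h}_sb_y handle luma edges (wd up to 16), |
| ; lpf_{v,h}_sb_uv chroma (wd up to 6). Each iterates over the |
| ; superblock edge in 16-pixel strips (four 4px units, hence the 0xf |
| ; unit mask, shifted left by 4 each iteration), dispatching to the |
| ; widest FILTER whose vmask bit is set for the strip. |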
| INIT_XMM ssse3 |
| %if ARCH_X86_64 |
| cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \ |
| dst, stride, mask, l, l_stride, lut, \ |
| w, stride3, mstride, tmp, mask_bits |
| %else |
| cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \ |
| dst, stride, mask, l, l_stride, lut, mask_bits |
| RELOC_ARGS w |
| SETUP_PIC |
| %define m12 m5 |
| %endif |
| shl l_strideq, 2 |
| sub lq, l_strideq |
| %if ARCH_X86_64 |
| mov mstrideq, strideq |
| neg mstrideq |
| lea stride3q, [strideq*3] |
| %else |
| mov l_stridem, l_strided |
| %endif |
| mov mask_bitsd, 0xf |
| mova m12, [PIC_sym(pd_mask)] |
| XCHG_PIC_REG 0 |
| movu m0, [maskq] |
| pxor m4, m4 |
| movd m3, [lutq+136] |
| pshufb m3, m4 |
| pshufd m2, m0, q2222 |
| pshufd m1, m0, q1111 |
| pshufd m0, m0, q0000 |
| por m1, m2 |
| por m0, m1 |
| mova [rsp+11*16], m0 |
| mova [rsp+12*16], m1 |
| mova [rsp+13*16], m2 |
| mova [rsp+14*16], m3 |
| |
| %define maskmem [esp+15*16] |
| %define mask0 [rsp+11*16] |
| %define mask1 [rsp+12*16] |
| %define mask2 [rsp+13*16] |
| %define minlvl [rsp+14*16] |
| |
| .loop: |
| test [maskq+8], mask_bitsd ; vmask[2] |
| je .no_flat16 |
| |
| %if ARCH_X86_32 |
| XCHG_PIC_REG 1 |
| mov [esp+25*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 16, v |
| jmp .end |
| |
| .no_flat16: |
| test [maskq+4], mask_bitsd ; vmask[1] |
| je .no_flat |
| |
| %if ARCH_X86_32 |
| XCHG_PIC_REG 1 |
| mov [esp+25*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 8, v |
| jmp .end |
| |
| .no_flat: |
| test [maskq+0], mask_bitsd ; vmask[0] |
| XCHG_PIC_REG 1 |
| je .no_filter |
| |
| %if ARCH_X86_32 |
| mov [esp+25*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 4, v |
| |
| .end: |
| %if ARCH_X86_32 |
| mova m12, maskmem |
| mov mask_bitsd, [esp+25*16] |
| %endif |
| .no_filter: |
| pslld m12, 4 |
| shl mask_bitsd, 4 |
| add lq, 16 |
| add dstq, 16 |
| %if ARCH_X86_64 |
| sub wd, 4 |
| %else |
| sub dword wm, 4 |
| %endif |
| XCHG_PIC_REG 0 |
| jg .loop |
| RET |
| |
| INIT_XMM ssse3 |
| %if ARCH_X86_64 |
| cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \ |
| dst, stride, mask, l, l_stride, lut, \ |
| h, stride3, l_stride3, tmp, mask_bits |
| %else |
| cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \ |
| dst, stride, mask, l, l_stride, lut, mask_bits |
| RELOC_ARGS h |
| SETUP_PIC |
| %define m12 m5 |
| %endif |
| sub lq, 4 |
| shl l_strideq, 2 |
| %if ARCH_X86_64 |
| lea stride3q, [strideq*3] |
| lea l_stride3q, [l_strideq*3] |
| %else |
| mov l_stridem, l_strided |
| %endif |
| mov mask_bitsd, 0xf |
| mova m12, [PIC_sym(pd_mask)] |
| XCHG_PIC_REG 0 |
| movu m0, [maskq] |
| pxor m4, m4 |
| movd m3, [lutq+136] |
| pshufb m3, m4 |
| pshufd m2, m0, q2222 |
| pshufd m1, m0, q1111 |
| pshufd m0, m0, q0000 |
| por m1, m2 |
| por m0, m1 |
| mova [rsp+22*16], m0 |
| mova [rsp+23*16], m1 |
| mova [rsp+24*16], m2 |
| mova [rsp+25*16], m3 |
| |
| %define maskmem [esp+37*16] |
| %define mask0 [rsp+22*16] |
| %define mask1 [rsp+23*16] |
| %define mask2 [rsp+24*16] |
| %define minlvl [rsp+25*16] |
| |
| .loop: |
| test [maskq+8], mask_bitsd ; vmask[2] |
| je .no_flat16 |
| |
| %if ARCH_X86_32 |
| XCHG_PIC_REG 1 |
| mov [esp+38*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 16, h |
| jmp .end |
| |
| .no_flat16: |
| test [maskq+4], mask_bitsd ; vmask[1] |
| je .no_flat |
| |
| %if ARCH_X86_32 |
| XCHG_PIC_REG 1 |
| mov [esp+38*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 8, h |
| jmp .end |
| |
| .no_flat: |
| test [maskq+0], mask_bitsd ; vmask[0] |
| XCHG_PIC_REG 1 |
| je .no_filter |
| |
| %if ARCH_X86_32 |
| mov [esp+38*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 4, h |
| jmp .end |
| |
| .no_filter: |
| lea dstq, [dstq+strideq*8] |
| lea dstq, [dstq+strideq*8] |
| %if ARCH_X86_32 |
| jmp .end_noload |
| .end: |
| mova m12, maskmem |
| mov l_strideq, l_stridem |
| mov mask_bitsd, [esp+38*16] |
| .end_noload: |
| %else |
| .end: |
| %endif |
| lea lq, [lq+l_strideq*4] |
| pslld m12, 4 |
| shl mask_bitsd, 4 |
| %if ARCH_X86_64 |
| sub hd, 4 |
| %else |
| sub dword hm, 4 |
| %endif |
| XCHG_PIC_REG 0 |
| jg .loop |
| RET |
| |
| INIT_XMM ssse3 |
| %if ARCH_X86_64 |
| cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \ |
| dst, stride, mask, l, l_stride, lut, \ |
| w, stride3, mstride, tmp, mask_bits |
| %else |
| cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \ |
| dst, stride, mask, l, l_stride, lut, mask_bits |
| RELOC_ARGS w |
| SETUP_PIC |
| %define m12 m4 |
| %endif |
| shl l_strideq, 2 |
| sub lq, l_strideq |
| %if ARCH_X86_64 |
| mov mstrideq, strideq |
| neg mstrideq |
| lea stride3q, [strideq*3] |
| %else |
| mov l_stridem, l_strided |
| %endif |
| mov mask_bitsd, 0xf |
| mova m12, [PIC_sym(pd_mask)] |
| XCHG_PIC_REG 0 |
| movq m0, [maskq] |
| pxor m3, m3 |
| movd m2, [lutq+136] |
| pshufb m2, m3 |
| pshufd m1, m0, q1111 |
| pshufd m0, m0, q0000 |
| por m0, m1 |
| mova [rsp+0*16], m0 |
| mova [rsp+1*16], m1 |
| mova [rsp+2*16], m2 |
| |
| %define maskmem [esp+7*16] |
| %define mask0 [rsp+0*16] |
| %define mask1 [rsp+1*16] |
| %define minlvl [rsp+2*16] |
| |
| .loop: |
| test [maskq+4], mask_bitsd ; vmask[1] |
| je .no_flat |
| |
| %if ARCH_X86_32 |
| XCHG_PIC_REG 1 |
| mov [esp+11*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 6, v |
| jmp .end |
| |
| .no_flat: |
| test [maskq+0], mask_bitsd ; vmask[0] |
| XCHG_PIC_REG 1 |
| je .no_filter |
| |
| %if ARCH_X86_32 |
| mov [esp+11*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 4, v |
| |
| .end: |
| %if ARCH_X86_32 |
| mova m12, maskmem |
| mov mask_bitsd, [esp+11*16] |
| %endif |
| .no_filter: |
| pslld m12, 4 |
| shl mask_bitsd, 4 |
| add lq, 16 |
| add dstq, 16 |
| %if ARCH_X86_64 |
| sub wd, 4 |
| %else |
| sub dword wm, 4 |
| %endif |
| XCHG_PIC_REG 0 |
| jg .loop |
| RET |
| |
| INIT_XMM ssse3 |
| %if ARCH_X86_64 |
| cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \ |
| dst, stride, mask, l, l_stride, lut, \ |
| h, stride3, l_stride3, tmp, mask_bits |
| %else |
| cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \ |
| dst, stride, mask, l, l_stride, lut, mask_bits |
| RELOC_ARGS h |
| SETUP_PIC |
| %define m12 m4 |
| %endif |
| sub lq, 4 |
| shl l_strideq, 2 |
| %if ARCH_X86_64 |
| lea stride3q, [strideq*3] |
| lea l_stride3q, [l_strideq*3] |
| %else |
| mov l_stridem, l_strided |
| %endif |
| mov mask_bitsd, 0xf |
| mova m12, [PIC_sym(pd_mask)] |
| XCHG_PIC_REG 0 |
| movq m0, [maskq] |
| pxor m3, m3 |
| movd m2, [lutq+136] |
| pshufb m2, m3 |
| pshufd m1, m0, q1111 |
| pshufd m0, m0, q0000 |
| por m0, m1 |
| mova [rsp+0*16], m0 |
| mova [rsp+1*16], m1 |
| mova [rsp+2*16], m2 |
| |
| %define maskmem [esp+7*16] |
| %define mask0 [rsp+0*16] |
| %define mask1 [rsp+1*16] |
| %define minlvl [rsp+2*16] |
| |
| .loop: |
| test [maskq+4], mask_bitsd ; vmask[1] |
| je .no_flat |
| |
| %if ARCH_X86_32 |
| XCHG_PIC_REG 1 |
| mov [esp+12*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 6, h |
| jmp .end |
| |
| .no_flat: |
| test [maskq+0], mask_bitsd ; vmask[0] |
| XCHG_PIC_REG 1 |
| je .no_filter |
| |
| %if ARCH_X86_32 |
| mov [esp+12*16], mask_bitsd |
| mova maskmem, m12 |
| %endif |
| FILTER 4, h |
| jmp .end |
| |
| .no_filter: |
| lea dstq, [dstq+strideq*8] |
| lea dstq, [dstq+strideq*8] |
| %if ARCH_X86_32 |
| jmp .end_noload |
| .end: |
| mova m12, maskmem |
| mov l_strided, l_stridem |
| mov mask_bitsd, [esp+12*16] |
| .end_noload: |
| %else |
| .end: |
| %endif |
| lea lq, [lq+l_strideq*4] |
| pslld m12, 4 |
| shl mask_bitsd, 4 |
| %if ARCH_X86_64 |
| sub hd, 4 |
| %else |
| sub dword hm, 4 |
| %endif |
| XCHG_PIC_REG 0 |
| jg .loop |
| RET |