| ; Copyright © 2018, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; Copyright © 2018, VideoLabs |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
SECTION_RODATA 16

; pshufb control vectors and coefficient layouts for the SSSE3 Wiener paths.
; wiener_init rearranges the loaded filter taps; shufA-D select the byte
; pairs fed to pmaddubsw for each tap position of the 7-tap horizontal pass.
wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
; replicates the leftmost pixels when there is no left-edge data (LR_HAVE_LEFT unset)
wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

; 24 ones followed by 8 zeros: loaded with an offset to build a blend mask
; for right-edge pixel extension (.extend_right)
pb_right_ext_mask: times 24 db 0xff
                   times 8 db 0
pb_1: times 16 db 1
pb_3: times 16 db 3
pw_256: times 8 dw 256
pw_2056: times 8 dw 2056        ; bias added to the horizontal output after >> 3
pw_m16380: times 8 dw -16380    ; offset applied to the center-tap term (SSE2 path)
pd_4096: times 4 dd 4096
pd_34816: times 4 dd 34816
pd_0xffff: times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7

; x-by-x division lookup table shared with the C code (self-guided filter)
cextern sgr_x_by_x
| |
| SECTION .text |
| |
; Emit "mov dst, src" only on x86-64 builds; expands to nothing on x86-32.
%macro movif64 2 ; dst, src
%if ARCH_X86_64
    mov %1, %2
%endif
%endmacro
| |
; Emit "mov dst, src" only on x86-32 builds; expands to nothing on x86-64.
%macro movif32 2 ; dst, src
%if ARCH_X86_32
    mov %1, %2
%endif
%endmacro
| |
%if ARCH_X86_32
; On x86-32 there is no RIP-relative addressing, so position-independent
; access to RODATA goes through a dedicated base register loaded with the
; section start ($$). PIC_sym() then rebases any symbol off that register.
%define PIC_base_offset $$

%macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
    %assign pic_reg_stk_off 4
    %xdefine PIC_reg %1
    %if %2 == 1
        mov [esp], %1               ; preserve the register we are about to clobber
    %endif
    LEA PIC_reg, PIC_base_offset
    %if %3 == 1
        XCHG_PIC_REG
    %endif
%endmacro

; Swap the live PIC register with the copy saved on the stack; the stack
; offset alternates between 4 and 0 on successive expansions.
%macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov PIC_reg, [esp+pic_reg_stk_off]
%endmacro

%define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)

%else
; x86-64: RIP-relative addressing makes the PIC register machinery unnecessary.
%macro XCHG_PIC_REG 0
%endmacro

%define PIC_sym(sym) (sym)
%endif
| |
; WIENER: expanded once per SIMD target (INIT_XMM sse2 / ssse3) to produce
; wiener_filter7_8bpc and wiener_filter5_8bpc. Intermediate filtered rows are
; kept in a ring of 384*2-byte stack slots addressed through t0-t6.
%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt, x
    %define tmpstrideq strideq
    %define base 0
    mov fltq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    movq m14, [fltq]                ; horizontal filter taps
    add lpfq, wq                    ; lpf/dst advanced to the row end; x counts up from -w
    movq m7, [fltq+16]              ; vertical filter taps
    add dstq, wq
    lea t1, [rsp+wq*2+16]
    mova m15, [pw_2056]
    neg wq
%if cpuflag(ssse3)
    pshufb m14, [wiener_init]
    mova m8, [wiener_shufA]
    pshufd m12, m14, q2222 ; x0 x0
    mova m9, [wiener_shufB]
    pshufd m13, m14, q3333 ; x1 x2
    mova m10, [wiener_shufC]
    punpcklqdq m14, m14 ; x3
    mova m11, [wiener_shufD]
%else
    mova m10, [pw_m16380]
    punpcklwd m14, m14
    pshufd m11, m14, q0000 ; x0
    pshufd m12, m14, q1111 ; x1
    pshufd m13, m14, q2222 ; x2
    pshufd m14, m14, q3333 ; x3
%endif
%else
; x86-32: only 8 XMM registers and 7 GPRs are available, so several "m" names
; and ring-buffer pointers are redirected to stack slots or RODATA below.
DECLARE_REG_TMP 4, 0, _, 5
%if cpuflag(ssse3)
    %define m10 [base+wiener_shufC]
    %define m11 [base+wiener_shufD]
    %define stk_off 96
%else
    %define m10 [base+pw_m16380]
    %define m11 [stk+96]
    %define stk_off 112
%endif
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
    %define base r6-pb_right_ext_mask-21
    %define stk esp
    %define dstq leftq
    %define edgeb byte edged
    %define edged [stk+ 8]
    %define dstmp [stk+12]
    %define hd dword [stk+16]
    %define wq [stk+20]
    %define strideq [stk+24]
    %define leftmp [stk+28]
    %define t2 [stk+32]
    %define t4 [stk+36]
    %define t5 [stk+40]
    %define t6 [stk+44]
    %define m8 [base+wiener_shufA]
    %define m9 [base+wiener_shufB]
    %define m12 [stk+48]
    %define m13 [stk+64]
    %define m14 [stk+80]
    %define m15 [base+pw_2056]
    mov r1, r6m ; flt
    mov r0, r0m ; dst
    mov r4, r4m ; w
    mov lpfq, lpfm
    mov r2, r7m ; edge
    mov r5, r5m ; h
    movq m3, [r1+ 0]
    movq m7, [r1+16]
    add r0, r4
    mov r1, r1m ; stride
    add lpfq, r4
    mov edged, r2
    mov r2, r2m ; left
    mov dstmp, r0
    lea t1, [rsp+r4*2+stk_off]
    mov hd, r5
    neg r4
    LEA r6, pb_right_ext_mask+21    ; r6 doubles as the PIC/data base pointer
    mov wq, r4
    mov strideq, r1
    mov leftmp, r2
    mov r4, r1
%if cpuflag(ssse3)
    pshufb m3, [base+wiener_init]
    pshufd m1, m3, q2222
    pshufd m2, m3, q3333
    punpcklqdq m3, m3
%else
    punpcklwd m3, m3
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
    mova m11, m0
%endif
    mova m12, m1
    mova m13, m2
    mova m14, m3
%endif
    psllw m7, 5                     ; scale vertical taps
    pshufd m6, m7, q0000 ; y0 y1
    pshufd m7, m7, q1111 ; y2 y3
; Row pipeline for filter7: prime the t1-t6 ring with the top rows (duplicated
; when LR_HAVE_TOP is absent), then run .hv per row, finishing with .v passes.
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t6, t1
    mov t5, t1
    add t1, 384*2
    call .h_top
    lea t3, [lpfq+tmpstrideq*4]
    mov lpfq, dstmp
    add t3, tmpstrideq
    mov [rsp], t3 ; below
    mov t4, t1
    add t1, 384*2
    call .h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
.main:
    lea t0, [t1+384*2]
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov lpfq, [rsp]                 ; saved "below" pointer
    call .hv_bottom
    add lpfq, strideq
    call .hv_bottom
.v1:
    ; .v is only assembled in the ssse3 expansion and shared by both targets
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    RET
.no_top:
    lea t3, [lpfq+tmpstrideq*4]
    mov lpfq, dstmp
    lea t3, [t3+tmpstrideq*2]
    mov [rsp], t3
    call .h
    mov t6, t1                      ; replicate the first row across the ring
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v3
    add t0, 384*8
    call .hv
    dec hd
    jnz .main
.v3:                                ; fall through: 3, then 2, then 1 remaining .v passes
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    jmp .v1
.extend_right:
    ; Blend loaded pixels (m4/m5) with the replicated last pixel beyond the
    ; visible width, using pb_right_ext_mask indexed by the x counter.
    movd m2, [lpfq-4]
%if ARCH_X86_64
    push r0
    lea r0, [pb_right_ext_mask+21]
    movu m0, [r0+xq+0]
    movu m1, [r0+xq+8]
    pop r0
%else
    movu m0, [r6+xq+0]
    movu m1, [r6+xq+8]
%endif
%if cpuflag(ssse3)
    pshufb m2, [base+pb_3]          ; broadcast the last pixel byte
%else
    punpcklbw m2, m2
    pshuflw m2, m2, q3333
    punpcklqdq m2, m2
%endif
    pand m4, m0
    pand m5, m1
    pandn m0, m2
    pandn m1, m2
    por m4, m0
    por m5, m1
    ret
; .h: 7-tap horizontal filter of one row into the t1 buffer (words, biased by
; pw_2056 after >> 3). Entry points: .h (reads the left[] array), .h_top
; (no left data), .h_extend_left (replicate leftmost pixel).
.h:
    %define stk esp+4 ; offset due to call
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]                ; splice 4 pixels from the left[] edge buffer
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, [base+wiener_l_shuf]
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .h_main
.h_top:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+xq-4]
.h_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp xd, -18                     ; only the last 17 pixels may need extension
    jl .h_have_right
    call .extend_right
.h_have_right:
; %%h7: the 7-tap horizontal convolution kernel for 16 pixels (m4 = low 8,
; m5 = high 8), producing words in m0/m1. SSSE3 uses pshufb+pmaddubsw pairs;
; SSE2 unpacks bytes and uses pmullw per tap.
%macro %%h7 0
%if cpuflag(ssse3)
    pshufb m0, m4, m8
    pmaddubsw m0, m12
    pshufb m1, m5, m8
    pmaddubsw m1, m12
    pshufb m2, m4, m9
    pmaddubsw m2, m13
    pshufb m3, m5, m9
    pmaddubsw m3, m13
    paddw m0, m2
    pshufb m2, m4, m10
    pmaddubsw m2, m13
    paddw m1, m3
    pshufb m3, m5, m10
    pmaddubsw m3, m13
    pshufb m4, m11
    paddw m0, m2
    pmullw m2, m14, m4
    pshufb m5, m11
    paddw m1, m3
    pmullw m3, m14, m5
    psllw m4, 7
    psllw m5, 7
    paddw m0, m2
    mova m2, [base+pw_m16380]
    paddw m1, m3
    paddw m4, m2
    paddw m5, m2
    paddsw m0, m4                   ; saturating add keeps the sum in range
    paddsw m1, m5
%else
    psrldq m0, m4, 1
    pslldq m1, m4, 1
    pxor m3, m3
    punpcklbw m0, m3
    punpckhbw m1, m3
    paddw m0, m1
    pmullw m0, m11
    psrldq m1, m4, 2
    pslldq m2, m4, 2
    punpcklbw m1, m3
    punpckhbw m2, m3
    paddw m1, m2
    pmullw m1, m12
    paddw m0, m1
    pshufd m2, m4, q0321
    punpcklbw m2, m3
    pmullw m1, m14, m2
    paddw m0, m1
    psrldq m1, m4, 3
    pslldq m4, 3
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m13
    paddw m0, m1
    psllw m2, 7
    paddw m2, m10
    paddsw m0, m2
    psrldq m1, m5, 1
    pslldq m2, m5, 1
    punpcklbw m1, m3
    punpckhbw m2, m3
    paddw m1, m2
    pmullw m1, m11
    psrldq m2, m5, 2
    pslldq m4, m5, 2
    punpcklbw m2, m3
    punpckhbw m4, m3
    paddw m2, m4
    pmullw m2, m12
    paddw m1, m2
    pshufd m4, m5, q0321
    punpcklbw m4, m3
    pmullw m2, m14, m4
    paddw m1, m2
    psrldq m2, m5, 3
    pslldq m5, 3
    punpcklbw m2, m3
    punpckhbw m5, m3
    paddw m2, m5
    pmullw m2, m13
    paddw m1, m2
    psllw m4, 7
    paddw m4, m10
    paddsw m1, m4
%endif
%endmacro
    %%h7
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15                   ; + pw_2056 bias
    paddw m1, m15
    mova [t1+xq*2+ 0], m0
    mova [t1+xq*2+16], m1
    add xq, 16
    jl .h_loop
    ret
ALIGN function_align
; .hv: horizontal filter of the new row combined with the 7-tap vertical
; filter over t1-t6 plus the fresh row, writing one output row to dst and
; rotating the ring-buffer pointers. On x86-32 the spilled t-pointers are
; staged through r2, so the mov r2,… interleavings are load-order critical.
.hv:
    add lpfq, strideq
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, [base+wiener_l_shuf]
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .hv_main
.hv_bottom:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m4, [lpfq+xq-4]
.hv_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp xd, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    %%h7
%if ARCH_X86_64
    mova m2, [t4+xq*2]
    paddw m2, [t2+xq*2]
%else
    mov r2, t4
    mova m2, [r2+xq*2]
    mov r2, t2
    paddw m2, [r2+xq*2]
    mov r2, t5
%endif
    mova m3, [t3+xq*2]
%if ARCH_X86_64
    mova m5, [t5+xq*2]
%else
    mova m5, [r2+xq*2]
    mov r2, t6
%endif
    paddw m5, [t1+xq*2]
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
%if ARCH_X86_64
    paddw m4, m0, [t6+xq*2]
%else
    paddw m4, m0, [r2+xq*2]
    mov r2, t4
%endif
    mova [t0+xq*2], m0              ; store the new row for future vertical passes
    punpcklwd m0, m2, m3
    pmaddwd m0, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m5
    pmaddwd m3, m6
    punpckhwd m4, m5
    pmaddwd m4, m6
    paddd m0, m3
    mova m3, [t3+xq*2+16]
    paddd m4, m2
%if ARCH_X86_64
    mova m2, [t4+xq*2+16]
    paddw m2, [t2+xq*2+16]
    mova m5, [t5+xq*2+16]
%else
    mova m2, [r2+xq*2+16]
    mov r2, t2
    paddw m2, [r2+xq*2+16]
    mov r2, t5
    mova m5, [r2+xq*2+16]
    mov r2, t6
%endif
    paddw m5, [t1+xq*2+16]
    packuswb m0, m4
%if ARCH_X86_64
    paddw m4, m1, [t6+xq*2+16]
%else
    paddw m4, m1, [r2+xq*2+16]
    mov dstq, dstmp
%endif
    mova [t0+xq*2+16], m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m5
    pmaddwd m3, m6
    punpckhwd m4, m5
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    packuswb m1, m2
    psrlw m0, 8
    psrlw m1, 8
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .hv_loop
    add dstq, strideq
%if ARCH_X86_64
    mov t6, t5                      ; rotate the ring buffer: oldest slot becomes t0
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
%else
    mov dstmp, dstq
    mov r1, t5
    mov r2, t4
    mov t6, r1
    mov t5, r2
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, r1
%endif
    ret
| %if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code |
| .v: |
| mov xq, wq |
| .v_loop: |
| %if ARCH_X86_64 |
| mova m1, [t4+xq*2] |
| paddw m1, [t2+xq*2] |
| %else |
| mov r2, t4 |
| mova m1, [r2+xq*2] |
| mov r2, t2 |
| paddw m1, [r2+xq*2] |
| mov r2, t6 |
| %endif |
| mova m2, [t3+xq*2] |
| mova m4, [t1+xq*2] |
| %if ARCH_X86_64 |
| paddw m3, m4, [t6+xq*2] |
| paddw m4, [t5+xq*2] |
| %else |
| paddw m3, m4, [r2+xq*2] |
| mov r2, t5 |
| paddw m4, [r2+xq*2] |
| mov r2, t4 |
| %endif |
| punpcklwd m0, m1, m2 |
| pmaddwd m0, m7 |
| punpckhwd m1, m2 |
| pmaddwd m1, m7 |
| punpcklwd m2, m3, m4 |
| pmaddwd m2, m6 |
| punpckhwd m3, m4 |
| pmaddwd m3, m6 |
| paddd m0, m2 |
| paddd m1, m3 |
| %if ARCH_X86_64 |
| mova m2, [t4+xq*2+16] |
| paddw m2, [t2+xq*2+16] |
| %else |
| mova m2, [r2+xq*2+16] |
| mov r2, t2 |
| paddw m2, [r2+xq*2+16] |
| mov r2, t6 |
| %endif |
| mova m3, [t3+xq*2+16] |
| mova m5, [t1+xq*2+16] |
| %if ARCH_X86_64 |
| paddw m4, m5, [t6+xq*2+16] |
| paddw m5, [t5+xq*2+16] |
| %else |
| paddw m4, m5, [r2+xq*2+16] |
| mov r2, t5 |
| paddw m5, [r2+xq*2+16] |
| movifnidn dstq, dstmp |
| %endif |
| packuswb m0, m1 |
| punpcklwd m1, m2, m3 |
| pmaddwd m1, m7 |
| punpckhwd m2, m3 |
| pmaddwd m2, m7 |
| punpcklwd m3, m4, m5 |
| pmaddwd m3, m6 |
| punpckhwd m4, m5 |
| pmaddwd m4, m6 |
| paddd m1, m3 |
| paddd m2, m4 |
| packuswb m1, m2 |
| psrlw m0, 8 |
| psrlw m1, 8 |
| packuswb m0, m1 |
| mova [dstq+xq], m0 |
| add xq, 16 |
| jl .v_loop |
| add dstq, strideq |
| %if ARCH_X86_64 |
| mov t6, t5 |
| mov t5, t4 |
| %else |
| mov dstmp, dstq |
| mov r1, t5 |
| mov r2, t4 |
| mov t6, r1 |
| mov t5, r2 |
| %endif |
| mov t4, t3 |
| mov t3, t2 |
| mov t2, t1 |
| ret |
| %endif |
| |
; wiener_filter5_8bpc: 5-tap variant. Same structure as filter7 but with a
; shorter t1-t4 ring (384*8 bytes of row storage) and one fewer priming row.
%if ARCH_X86_64
cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                  w, h, edge, flt, x
    mov fltq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    movq m14, [fltq]
    add lpfq, wq
    movq m7, [fltq+16]
    add dstq, wq
    mova m8, [pw_m16380]
    lea t1, [rsp+wq*2+16]
    mova m15, [pw_2056]
    neg wq
%if cpuflag(ssse3)
    pshufb m14, [wiener_init]
    mova m9, [wiener_shufB]
    pshufd m13, m14, q3333 ; x1 x2
    mova m10, [wiener_shufC]
    punpcklqdq m14, m14 ; x3
    mova m11, [wiener_shufD]
    mova m12, [wiener_l_shuf]
%else
    punpcklwd m14, m14
    pshufd m11, m14, q1111 ; x1
    pshufd m13, m14, q2222 ; x2
    pshufd m14, m14, q3333 ; x3
%endif
%else
%if cpuflag(ssse3)
    %define stk_off 80
%else
    %define m11 [stk+80]
    %define stk_off 96
%endif
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
    ; stack-slot %defines not repeated here are inherited from filter7 above
    %define stk esp
    %define leftmp [stk+28]
    %define m8 [base+pw_m16380]
    %define m12 [base+wiener_l_shuf]
    %define m14 [stk+48]
    mov r1, r6m ; flt
    mov r0, r0m ; dst
    mov r4, r4m ; w
    mov lpfq, lpfm
    mov r2, r7m ; edge
    mov r5, r5m ; h
    movq m2, [r1+ 0]
    movq m7, [r1+16]
    add r0, r4
    mov r1, r1m ; stride
    add lpfq, r4
    mov edged, r2
    mov r2, r2m ; left
    mov dstmp, r0
    lea t1, [rsp+r4*2+stk_off]
    mov hd, r5
    neg r4
    LEA r6, pb_right_ext_mask+21
    mov wq, r4
    mov strideq, r1
    mov leftmp, r2
    mov r4, r1
%if cpuflag(ssse3)
    pshufb m2, [base+wiener_init]
    pshufd m1, m2, q3333
    punpcklqdq m2, m2
%else
    punpcklwd m2, m2
    pshufd m0, m2, q1111
    pshufd m1, m2, q2222
    pshufd m2, m2, q3333
    mova m11, m0
%endif
    mova m13, m1
    mova m14, m2
%endif
    psllw m7, 5
    pshufd m6, m7, q0000 ; __ y1
    pshufd m7, m7, q1111 ; y2 y3
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t4, t1
    add t1, 384*2
    call .h_top
    lea xq, [lpfq+tmpstrideq*4]
    mov lpfq, dstmp
    mov t3, t1
    add t1, 384*2
    add xq, tmpstrideq
    mov [rsp], xq ; below
    call .h
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
.main:
    mov t0, t4
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov lpfq, [rsp]
    call .hv_bottom
    add lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea t3, [lpfq+tmpstrideq*4]
    mov lpfq, dstmp
    lea t3, [t3+tmpstrideq*2]
    mov [rsp], t3
    call .h
    mov t4, t1                      ; replicate the first row across the ring
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v2
    add t0, 384*6
    call .hv
    dec hd
    jnz .main
.v2:
    ; shared .v (assembled in the ssse3 expansion only), then rotate and run
    ; the final vertical pass
    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    add dstq, strideq
    mov t4, t3
    mov t3, t2
    mov t2, t1
    movifnidn dstmp, dstq
.v1:
    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    jmp .end
; .h: 5-tap horizontal filter of one row into t1; same entry-point scheme as
; the filter7 .h. Right-edge threshold is -17 (5-tap needs one pixel less).
.h:
    %define stk esp+4
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, m12
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .h_main
.h_top:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+xq-4]
.h_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp xd, -17
    jl .h_have_right
    ; reuse filter7's right-extension helper of the same SIMD target
    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.h_have_right:
; %%h5: the 5-tap horizontal convolution kernel (m4/m5 in, m0/m1 out),
; mirroring %%h7 minus the outermost tap pair.
%macro %%h5 0
%if cpuflag(ssse3)
    pshufb m0, m4, m9
    pmaddubsw m0, m13
    pshufb m1, m5, m9
    pmaddubsw m1, m13
    pshufb m2, m4, m10
    pmaddubsw m2, m13
    pshufb m3, m5, m10
    pmaddubsw m3, m13
    pshufb m4, m11
    paddw m0, m2
    pmullw m2, m14, m4
    pshufb m5, m11
    paddw m1, m3
    pmullw m3, m14, m5
    psllw m4, 7
    psllw m5, 7
    paddw m4, m8
    paddw m5, m8
    paddw m0, m2
    paddw m1, m3
    paddsw m0, m4
    paddsw m1, m5
%else
    psrldq m0, m4, 2
    pslldq m1, m4, 2
    pxor m3, m3
    punpcklbw m0, m3
    punpckhbw m1, m3
    paddw m0, m1
    pmullw m0, m11
    pshufd m2, m4, q0321
    punpcklbw m2, m3
    pmullw m1, m14, m2
    paddw m0, m1
    psrldq m1, m4, 3
    pslldq m4, 3
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m13
    paddw m0, m1
    psllw m2, 7
    paddw m2, m8
    paddsw m0, m2
    psrldq m1, m5, 2
    pslldq m4, m5, 2
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m11
    pshufd m4, m5, q0321
    punpcklbw m4, m3
    pmullw m2, m14, m4
    paddw m1, m2
    psrldq m2, m5, 3
    pslldq m5, 3
    punpcklbw m2, m3
    punpckhbw m5, m3
    paddw m2, m5
    pmullw m2, m13
    paddw m1, m2
    psllw m4, 7
    paddw m4, m8
    paddsw m1, m4
%endif
%endmacro
    %%h5
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
    mova [t1+xq*2+ 0], m0
    mova [t1+xq*2+16], m1
    add xq, 16
    jl .h_loop
    ret
ALIGN function_align
; .hv: horizontal filter of the new row plus the 5-tap vertical filter over
; t1-t4 and the fresh row; writes one output row and rotates t0-t4.
.hv:
    add lpfq, strideq
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, m12
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .hv_main
.hv_bottom:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m4, [lpfq+xq-4]
.hv_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp xd, -17
    jl .hv_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.hv_have_right:
    %%h5
    mova m2, [t3+xq*2]
    paddw m2, [t1+xq*2]
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
%if ARCH_X86_64
    mova m3, [t2+xq*2]
    paddw m4, m0, [t4+xq*2]
%else
    mov r2, t2
    mova m3, [r2+xq*2]
    mov r2, t4
    paddw m4, m0, [r2+xq*2]
%endif
    mova [t0+xq*2], m0
    punpcklwd m0, m2, m3
    pmaddwd m0, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m4            ; outer rows doubled: 5-tap uses __ y1 in m6
    pmaddwd m3, m6
    punpckhwd m4, m4
    pmaddwd m4, m6
    paddd m0, m3
    paddd m4, m2
    mova m2, [t3+xq*2+16]
    paddw m2, [t1+xq*2+16]
    packuswb m0, m4
%if ARCH_X86_64
    mova m3, [t2+xq*2+16]
    paddw m4, m1, [t4+xq*2+16]
%else
    paddw m4, m1, [r2+xq*2+16]
    mov r2, t2
    mova m3, [r2+xq*2+16]
    mov dstq, dstmp
%endif
    mova [t0+xq*2+16], m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m4
    pmaddwd m3, m6
    punpckhwd m4, m4
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    packuswb m1, m2
    psrlw m0, 8
    psrlw m1, 8
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .hv_loop
    add dstq, strideq
    mov t4, t3                      ; rotate ring buffer
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t4
    movifnidn dstmp, dstq
    ret
%if cpuflag(ssse3)
; .v: vertical-only drain pass for filter5; assembled once (ssse3 expansion)
; and shared by the sse2 function via mangle(...).
.v:
    mov xq, wq
.v_loop:
    mova m3, [t1+xq*2]
    paddw m1, m3, [t3+xq*2]
%if ARCH_X86_64
    mova m2, [t2+xq*2]
    paddw m3, [t4+xq*2]
%else
    mov r2, t2
    mova m2, [r2+xq*2]
    mov r2, t4
    paddw m3, [r2+xq*2]
%endif
    punpcklwd m0, m1, m2
    pmaddwd m0, m7
    punpckhwd m1, m2
    pmaddwd m1, m7
    punpcklwd m2, m3
    pmaddwd m2, m6
    punpckhwd m3, m3
    pmaddwd m3, m6
    paddd m0, m2
    paddd m1, m3
    mova m4, [t1+xq*2+16]
    paddw m2, m4, [t3+xq*2+16]
%if ARCH_X86_64
    mova m3, [t2+xq*2+16]
    paddw m4, [t4+xq*2+16]
%else
    paddw m4, [r2+xq*2+16]
    mov r2, t2
    mova m3, [r2+xq*2+16]
    mov dstq, dstmp
%endif
    packuswb m0, m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4
    pmaddwd m3, m6
    punpckhwd m4, m4
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    packuswb m1, m2
    psrlw m0, 8
    psrlw m1, 8
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .v_loop
    ret
%endif
%endmacro
| |
; Instantiate both Wiener filters for each SIMD target.
INIT_XMM sse2
WIENER

INIT_XMM ssse3
WIENER
| |
| ;;;;;;;;;;;;;;;;;;;;;;;;;; |
| ;; self-guided ;; |
| ;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
; Emulated gather: fetch four 16-bit table entries whose indices are the
; word lanes 0/2/4/6 of src, into dst. On x86-64 the table base lives in r13;
; on x86-32 it is addressed via the PIC base (sgr_x_by_x - 0xf03 offset).
%macro GATHERDD 3 ; dst, src, tmp
    movd %3d, %2
%if ARCH_X86_64
    movd %1, [r13+%3]
    pextrw %3d, %2, 2
    pinsrw %1, [r13+%3+2], 3
    pextrw %3d, %2, 4
    pinsrw %1, [r13+%3+2], 5
    pextrw %3d, %2, 6
    pinsrw %1, [r13+%3+2], 7
%else
    movd %1, [base+sgr_x_by_x-0xf03+%3]
    pextrw %3, %2, 2
    pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
    pextrw %3, %2, 4
    pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
    pextrw %3, %2, 6
    pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
%endif
%endmacro
| |
| %macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore |
| %if ARCH_X86_64 |
| %define tmp r14 |
| %else |
| %define tmp %4 |
| %endif |
| GATHERDD %1, %2, tmp |
| GATHERDD %2, %3, tmp |
| movif32 %4, %5 |
| psrld %1, 24 |
| psrld %2, 24 |
| packssdw %1, %2 |
| %endmacro |
| |
; 32-bit lane-wise multiply (low dword of the product) without SSE4 pmulld:
; combine pmullw (low 16x16 products) with pmulhuw shifted into the high half.
%macro MULLD 3 ; dst, src, tmp
    pmulhuw %3, %1, %2
    pmullw %1, %2
    pslld %3, 16
    paddd %1, %3
%endmacro
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 0, 1, 2, 3, 5 |
| %if STACK_ALIGNMENT < 16 |
| %assign extra_stack 5*16 |
| %else |
| %assign extra_stack 3*16 |
| %endif |
| cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \ |
| dst, stride, left, lpf, w |
| %if STACK_ALIGNMENT < 16 |
| %define dstm dword [esp+calloff+16*0+4*6] |
| %define stridemp dword [esp+calloff+16*0+4*7] |
| %define leftm dword [esp+calloff+16*3+4*0] |
| %define lpfm dword [esp+calloff+16*3+4*1] |
| %define w0m dword [esp+calloff+16*3+4*2] |
| %define hd dword [esp+calloff+16*3+4*3] |
| %define edgeb byte [esp+calloff+16*3+4*4] |
| %define edged dword [esp+calloff+16*3+4*4] |
| %define leftmp leftm |
| %else |
| %define w0m wm |
| %define hd dword r5m |
| %define edgeb byte r7m |
| %define edged dword r7m |
| %endif |
| %define hvsrcm dword [esp+calloff+4*0] |
| %define w1m dword [esp+calloff+4*1] |
| %define t0m dword [esp+calloff+4*2] |
| %define t2m dword [esp+calloff+4*3] |
| %define t3m dword [esp+calloff+4*4] |
| %define t4m dword [esp+calloff+4*5] |
| %define m8 [base+pb_1] |
| %define m9 [esp+calloff+16*2] |
| %define m10 [base+pd_0xf00800a4] |
| %define m11 [base+sgr_lshuf5] |
| %define m12 [base+pd_34816] |
| %define m13 [base+pb_0to15] |
| %define r10 r4 |
| %define base r6-$$ |
| %assign calloff 0 |
| %if STACK_ALIGNMENT < 16 |
| mov strideq, [rstk+stack_offset+ 8] |
| mov leftq, [rstk+stack_offset+12] |
| mov lpfq, [rstk+stack_offset+16] |
| mov wd, [rstk+stack_offset+20] |
| mov dstm, dstq |
| mov stridemp, strideq |
| mov leftm, leftq |
| mov r1, [rstk+stack_offset+24] |
| mov r2, [rstk+stack_offset+32] |
| mov lpfm, lpfq |
| mov hd, r1 |
| mov edged, r2 |
| %endif |
| %else |
| DECLARE_REG_TMP 8, 7, 9, 11, 12 |
| cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \ |
| w, h, edge, params |
| %endif |
| %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 |
| mov wd, wm |
| %endif |
| %if ARCH_X86_64 |
| mov paramsq, r6mp |
| lea r13, [sgr_x_by_x-0xf03] |
| movifnidn hd, hm |
| mov edged, r7m |
| movu m9, [paramsq] |
| add lpfq, wq |
| mova m8, [pb_1] |
| lea t1, [rsp+wq*2+20] |
| mova m10, [pd_0xf00800a4] |
| add dstq, wq |
| lea t3, [rsp+wq*4+400*12+16] |
| mova m12, [pd_34816] ; (1 << 11) + (1 << 15) |
| lea t4, [rsp+wq*2+400*20+16] |
| pshufhw m7, m9, q0000 |
| pshufb m9, [pw_256] ; s0 |
| punpckhqdq m7, m7 ; w0 |
| neg wq |
| mova m13, [pb_0to15] |
| pxor m6, m6 |
| mova m11, [sgr_lshuf5] |
| psllw m7, 4 |
| DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w |
| %define lpfm [rsp] |
| %else |
| mov r1, [rstk+stack_offset+28] ; params |
| LEA r6, $$ |
| movu m1, [r1] |
| add lpfm, wq |
| lea t1, [rsp+extra_stack+wq*2+20] |
| add dstq, wq |
| lea t3, [rsp+extra_stack+wq*4+400*12+16] |
| mov dstm, dstq |
| lea t4, [rsp+extra_stack+wq*2+400*20+16] |
| mov t3m, t3 |
| pshufhw m7, m1, q0000 |
| mov t4m, t4 |
| pshufb m1, [base+pw_256] ; s0 |
| punpckhqdq m7, m7 ; w0 |
| psllw m7, 4 |
| neg wq |
| mova m9, m1 |
| pxor m6, m6 |
| mov w1m, wd |
| sub wd, 2 |
| mov lpfq, lpfm |
| mov w0m, wd |
| %define strideq r5 |
| %endif |
| test edgeb, 4 ; LR_HAVE_TOP |
| jz .no_top |
| call .h_top |
| add lpfq, stridemp |
| movif32 t2m, t1 |
| mov t2, t1 |
| call .top_fixup |
| add t1, 400*6 |
| call .h_top |
| movif32 strideq, stridemp |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| add r10, strideq |
| mov lpfm, r10 ; below |
| movif32 t0m, t2 |
| mov t0, t2 |
| dec hd |
| jz .height1 |
| or edged, 16 |
| call .h |
| .main: |
| add lpfq, stridemp |
| movif32 t4, t4m |
| call .hv |
| call .prep_n |
| sub hd, 2 |
| jl .extend_bottom |
| .main_loop: |
| movif32 lpfq, hvsrcm |
| add lpfq, stridemp |
| %if ARCH_X86_64 |
| test hb, hb |
| %else |
| mov r4, hd |
| test r4, r4 |
| %endif |
| jz .odd_height |
| call .h |
| add lpfq, stridemp |
| call .hv |
| movif32 dstq, dstm |
| call .n0 |
| call .n1 |
| sub hd, 2 |
| movif32 t0, t0m |
| jge .main_loop |
| test edgeb, 8 ; LR_HAVE_BOTTOM |
| jz .extend_bottom |
| mov lpfq, lpfm |
| call .h_top |
| add lpfq, stridemp |
| call .hv_bottom |
| .end: |
| movif32 dstq, dstm |
| call .n0 |
| call .n1 |
| .end2: |
| RET |
| .height1: |
| movif32 t4, t4m |
| call .hv |
| call .prep_n |
| jmp .odd_height_end |
| .odd_height: |
| call .hv |
| movif32 dstq, dstm |
| call .n0 |
| call .n1 |
| .odd_height_end: |
| call .v |
| movif32 dstq, dstm |
| call .n0 |
| jmp .end2 |
| .extend_bottom: |
| call .v |
| jmp .end |
| .no_top: |
| movif32 strideq, stridemp |
| lea r10, [lpfq+strideq*4] |
| mov lpfq, dstq |
| lea r10, [r10+strideq*2] |
| mov lpfm, r10 |
| call .h |
| lea t2, [t1+400*6] |
| movif32 t2m, t2 |
| call .top_fixup |
| dec hd |
| jz .no_top_height1 |
| or edged, 16 |
| mov t0, t1 |
| mov t1, t2 |
| movif32 t0m, t0 |
| jmp .main |
| .no_top_height1: |
| movif32 t3, t3m |
| movif32 t4, t4m |
| call .v |
| call .prep_n |
| jmp .odd_height_end |
| .extend_right: |
| %assign stack_offset stack_offset+8 |
| %assign calloff 8 |
| movd m1, wd |
| movd m3, [lpfq-1] |
| pshufb m1, m6 |
| pshufb m3, m6 |
| psubb m2, m8, m1 |
| pcmpgtb m2, m13 |
| pand m5, m2 |
| pandn m2, m3 |
| por m5, m2 |
| ret |
| %assign stack_offset stack_offset-4 |
| %assign calloff 4 |
| .h: ; horizontal boxsum |
| %if ARCH_X86_64 |
| lea wq, [r4-2] |
| %else |
| %define leftq r4 |
| %endif |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| movif32 leftq, leftm |
| movddup m4, [leftq-4] |
| movif32 wq, w0m |
| mova m5, [lpfq+wq+2] |
| add leftmp, 4 |
| palignr m5, m4, 13 |
| jmp .h_main |
| .h_extend_left: |
| movif32 wq, w0m |
| mova m5, [lpfq+wq+2] |
| pshufb m5, m11 |
| jmp .h_main |
| .h_top: |
| %if ARCH_X86_64 |
| lea wq, [r4-2] |
| %endif |
| test edgeb, 1 ; LR_HAVE_LEFT |
| jz .h_extend_left |
| movif32 wq, w0m |
| .h_loop: |
| movu m5, [lpfq+wq-1] |
| .h_main: |
| test edgeb, 2 ; LR_HAVE_RIGHT |
| jnz .h_have_right |
| cmp wd, -10 |
| jl .h_have_right |
| call .extend_right |
| .h_have_right: |
| punpcklbw m4, m5, m6 |
| punpckhbw m5, m6 |
| palignr m2, m5, m4, 2 |
| paddw m0, m4, m2 |
| palignr m3, m5, m4, 6 |
| paddw m0, m3 |
| punpcklwd m1, m2, m3 |
| pmaddwd m1, m1 |
| punpckhwd m2, m3 |
| pmaddwd m2, m2 |
| palignr m5, m4, 8 |
| paddw m0, m5 |
| punpcklwd m3, m4, m5 |
| pmaddwd m3, m3 |
| paddd m1, m3 |
| punpckhwd m3, m4, m5 |
| pmaddwd m3, m3 |
| shufps m4, m5, q2121 |
| paddw m0, m4 ; sum |
| punpcklwd m5, m4, m6 |
| pmaddwd m5, m5 |
| punpckhwd m4, m6 |
| pmaddwd m4, m4 |
| paddd m2, m3 |
| test edgeb, 16 ; y > 0 |
| jz .h_loop_end |
| paddw m0, [t1+wq*2+400*0] |
| paddd m1, [t1+wq*2+400*2] |
| paddd m2, [t1+wq*2+400*4] |
| .h_loop_end: |
| paddd m1, m5 ; sumsq |
| paddd m2, m4 |
| mova [t1+wq*2+400*0], m0 |
| mova [t1+wq*2+400*2], m1 |
| mova [t1+wq*2+400*4], m2 |
| add wq, 8 |
| jl .h_loop |
| ret |
.top_fixup:
; When only one top padding row is available, double the first row's sums
; into t2 so the vertical accumulation sees two identical rows.
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    wd, w0m
%endif
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova   m0, [t1+wq*2+400*0]
    mova   m1, [t1+wq*2+400*2]
    mova   m2, [t1+wq*2+400*4]
    paddw  m0, m0              ; sum   * 2
    paddd  m1, m1              ; sumsq * 2 (low half)
    paddd  m2, m2              ; sumsq * 2 (high half)
    mova   [t2+wq*2+400*0], m0
    mova   [t2+wq*2+400*2], m1
    mova   [t2+wq*2+400*4], m2
    add    wq, 8
    jl     .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
; One combined pass: horizontal 5-tap sums for the current row, vertical
; accumulation with the two previous rows (t1/t2 ring buffer), then the
; SGR a/b computation: a*25 - b*b = p, z = min(p*s >> 20, 255), table
; lookup via GATHER_X_BY_X, and the scaled b written to t3/t4.
%if ARCH_X86_64
    lea    wq, [r4-2]          ; wq = -w-2 (r4 = -w from the prologue)
%else
    mov    hvsrcm, lpfq        ; remember the source row for re-reads
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .hv_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    add    leftmp, 4
    palignr m5, m4, 13         ; splice 3 left pixels ahead of the row
    jmp    .hv_main
.hv_extend_left:
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    pshufb m5, m11             ; replicate leftmost pixel into the pad
    jmp    .hv_main
.hv_bottom:
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    hvsrcm, lpfq
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .hv_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp    .hv_loop_start
%endif
.hv_loop:
    movif32 lpfq, hvsrcm
.hv_loop_start:
    movu   m5, [lpfq+wq-1]
.hv_main:
    test   edgeb, 2 ; LR_HAVE_RIGHT
    jnz    .hv_have_right
    cmp    wd, -10
    jl     .hv_have_right
    call   .extend_right
.hv_have_right:
    movif32 t3, hd             ; x86-32: stash h in t3 (t3m restored later)
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    ; 5-tap horizontal window, same scheme as .h above
    palignr m3, m5, m4, 2
    paddw  m0, m4, m3
    palignr m1, m5, m4, 6
    paddw  m0, m1
    punpcklwd m2, m3, m1
    pmaddwd m2, m2
    punpckhwd m3, m1
    pmaddwd m3, m3
    palignr m5, m4, 8
    paddw  m0, m5
    punpcklwd m1, m4, m5
    pmaddwd m1, m1
    paddd  m2, m1
    punpckhwd m1, m4, m5
    pmaddwd m1, m1
    shufps m4, m5, q2121
    paddw  m0, m4 ; h sum
    punpcklwd m5, m4, m6
    pmaddwd m5, m5
    punpckhwd m4, m6
    pmaddwd m4, m4
    paddd  m3, m1
    paddd  m2, m5 ; h sumsq
    paddd  m3, m4
    paddw  m1, m0, [t1+wq*2+400*0] ; add previous row (t1)
    paddd  m4, m2, [t1+wq*2+400*2]
    paddd  m5, m3, [t1+wq*2+400*4]
%if ARCH_X86_64
    test   hd, hd
%else
    test   t3, t3
%endif
    jz     .hv_last_row        ; odd-height special case below
.hv_main2:
    paddw  m1, [t2+wq*2+400*0] ; hv sum
    paddd  m4, [t2+wq*2+400*2] ; hv sumsq
    paddd  m5, [t2+wq*2+400*4]
    mova   [t0+wq*2+400*0], m0 ; store this row's h sums for the ring
    pslld  m0, m4, 4
    mova   [t0+wq*2+400*2], m2
    mova   [t0+wq*2+400*4], m3
    pslld  m2, m4, 3
    paddd  m4, m0              ; sumsq*16 + sumsq*8 + sumsq
    pslld  m0, m5, 4
    paddd  m4, m2 ; a * 25
    pslld  m2, m5, 3
    paddd  m5, m0
    paddd  m5, m2
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmaddwd m2, m0, m0 ; b * b
    pmaddwd m3, m1, m1
    psubd  m4, m2 ; p
    psubd  m5, m3
    MULLD  m4, m9, m2 ; p * s
    MULLD  m5, m9, m2
    pmaddwd m0, m10 ; b * 164
    pmaddwd m1, m10
    paddusw m4, m10            ; unsigned saturation clamps z below
    paddusw m5, m10
    psrld  m4, 20 ; min(z, 255)
    movif32 t3, t3m            ; x86-32: restore t3 pointer
    psrld  m5, 20
    GATHER_X_BY_X m3, m4, m5, t2, t2m ; x = sgr_x_by_x[z] (clobbers t2)
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD  m0, m4, m2
    MULLD  m1, m5, m2
    paddd  m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd  m1, m12
    mova   [t4+wq*2+4], m3     ; store x (a) plane
    psrld  m0, 12 ; b
    psrld  m1, 12
    mova   [t3+wq*4+ 8], m0    ; store b plane (32-bit)
    mova   [t3+wq*4+24], m1
    add    wq, 8
    jl     .hv_loop
    ; rotate the three-row ring buffer: t2 <- t1 <- t0 <- old t2
    mov    t2, t1
    mov    t1, t0
    mov    t0, t2
    movif32 t2m, t2
    movif32 t0m, t0
    ret
.hv_last_row: ; esoteric edge case for odd heights
    ; duplicate the current row's contribution instead of reading t2
    mova   [t1+wq*2+400*0], m1
    paddw  m1, m0
    mova   [t1+wq*2+400*2], m4
    paddd  m4, m2
    mova   [t1+wq*2+400*4], m5
    paddd  m5, m3
    jmp    .hv_main2
.v: ; vertical boxsum + ab
; Like .hv but with no new source row: the t1 sums are doubled and added
; to t2 to form the vertical box sums, then the same a/b computation runs.
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    wd, w0m
%endif
.v_loop:
    mova   m0, [t1+wq*2+400*0]
    mova   m2, [t1+wq*2+400*2]
    mova   m3, [t1+wq*2+400*4]
    paddw  m1, m0, [t2+wq*2+400*0]
    paddd  m4, m2, [t2+wq*2+400*2]
    paddd  m5, m3, [t2+wq*2+400*4]
    paddw  m0, m0              ; current row counted twice
    paddd  m2, m2
    paddd  m3, m3
    paddw  m1, m0 ; hv sum
    paddd  m4, m2 ; hv sumsq
    pslld  m0, m4, 4
    paddd  m5, m3
    pslld  m2, m4, 3
    paddd  m4, m0
    pslld  m0, m5, 4
    paddd  m4, m2 ; a * 25
    pslld  m2, m5, 3
    paddd  m5, m0
    paddd  m5, m2
    punpcklwd m0, m1, m6
    punpckhwd m1, m6
    pmaddwd m2, m0, m0 ; b * b
    pmaddwd m3, m1, m1
    psubd  m4, m2 ; p
    psubd  m5, m3
    MULLD  m4, m9, m2 ; p * s
    MULLD  m5, m9, m2
    pmaddwd m0, m10 ; b * 164
    pmaddwd m1, m10
    paddusw m4, m10
    paddusw m5, m10
    psrld  m4, 20 ; min(z, 255)
    psrld  m5, 20
    GATHER_X_BY_X m3, m4, m5, t2, t2m ; x = sgr_x_by_x[z] (clobbers t2)
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD  m0, m4, m2
    MULLD  m1, m5, m2
    paddd  m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd  m1, m12
    mova   [t4+wq*2+4], m3
    psrld  m0, 12 ; b
    psrld  m1, 12
    mova   [t3+wq*4+ 8], m0
    mova   [t3+wq*4+24], m1
    add    wq, 8
    jl     .v_loop
    ret
.prep_n: ; initial neighbor setup
; Builds the first 5-6-5 weighted ("565") neighbor sums of the a (t4) and
; b (t3) planes and stashes them at +400*2 / +400*4 for use by .n0/.n1.
    movif64 wq, r4
    movif32 wd, w1m
.prep_n_loop:
    movu   m0, [t4+wq*2+ 2]    ; a at x, x+1 offsets
    movu   m3, [t4+wq*2+ 4]
    movu   m1, [t3+wq*4+ 4]    ; b low half
    movu   m4, [t3+wq*4+ 8]
    movu   m2, [t3+wq*4+20]    ; b high half
    movu   m5, [t3+wq*4+24]
    paddw  m3, m0
    paddd  m4, m1
    paddd  m5, m2
    paddw  m3, [t4+wq*2+ 0]    ; + a at x-1
    paddd  m4, [t3+wq*4+ 0]
    paddd  m5, [t3+wq*4+16]
    ; weight pattern: center*5 + neighbors = (sum3)*4 + extra, i.e. 565
    paddw  m0, m3
    psllw  m3, 2
    paddd  m1, m4
    pslld  m4, 2
    paddd  m2, m5
    pslld  m5, 2
    paddw  m0, m3 ; a 565
    paddd  m1, m4 ; b 565
    paddd  m2, m5
    mova   [t4+wq*2+400*2+ 0], m0
    mova   [t3+wq*4+400*4+ 0], m1
    mova   [t3+wq*4+400*4+16], m2
    add    wq, 8
    jl     .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
; Computes this row's 565 neighbor sums, combines them with the saved
; previous-row sums, then filters: dst += ((b - a*src) >> 9) * w1,
; rounded via pmulhrsw with the pre-shifted weight in m7.
    movif64 wq, r4
    movif32 wd, w1m
.n0_loop:
    movu   m0, [t4+wq*2+ 2]
    movu   m3, [t4+wq*2+ 4]
    movu   m1, [t3+wq*4+ 4]
    movu   m4, [t3+wq*4+ 8]
    movu   m2, [t3+wq*4+20]
    movu   m5, [t3+wq*4+24]
    paddw  m3, m0
    paddd  m4, m1
    paddd  m5, m2
    paddw  m3, [t4+wq*2+ 0]
    paddd  m4, [t3+wq*4+ 0]
    paddd  m5, [t3+wq*4+16]
    paddw  m0, m3
    psllw  m3, 2
    paddd  m1, m4
    pslld  m4, 2
    paddd  m2, m5
    pslld  m5, 2
    paddw  m0, m3 ; a 565
    paddd  m1, m4 ; b 565
    paddd  m2, m5
    ; cur + prev row sums; store cur for the next odd row
    paddw  m3, m0, [t4+wq*2+400*2+ 0]
    paddd  m4, m1, [t3+wq*4+400*4+ 0]
    paddd  m5, m2, [t3+wq*4+400*4+16]
    mova   [t4+wq*2+400*2+ 0], m0
    mova   [t3+wq*4+400*4+ 0], m1
    mova   [t3+wq*4+400*4+16], m2
    movq   m0, [dstq+wq]
    punpcklbw m0, m6
    punpcklwd m1, m0, m6 ; src
    punpcklwd m2, m3, m6 ; a
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    psubd  m4, m2 ; b - a * src + (1 << 8)
    psubd  m5, m3
    psrad  m4, 9
    psrad  m5, 9
    packssdw m4, m5
    pmulhrsw m4, m7            ; * w1 with rounding (m7 pre-shifted by 4)
    paddw  m0, m4
    packuswb m0, m0            ; clamp to 8-bit pixels
    movq   [dstq+wq], m0
    add    wq, 8
    jl     .n0_loop
    add    dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
; Odd rows reuse the saved 565 sums directly (no new neighbor pass) and
; use one less shift (>> 8) since only one row's worth is accumulated.
    movif64 wq, r4
    movif32 wd, w1m
.n1_loop:
    movq   m0, [dstq+wq]
    mova   m3, [t4+wq*2+400*2+ 0]
    mova   m4, [t3+wq*4+400*4+ 0]
    mova   m5, [t3+wq*4+400*4+16]
    punpcklbw m0, m6
    punpcklwd m1, m0, m6 ; src
    punpcklwd m2, m3, m6 ; a
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    psubd  m4, m2 ; b - a * src + (1 << 7)
    psubd  m5, m3
    psrad  m4, 8
    psrad  m5, 8
    packssdw m4, m5
    pmulhrsw m4, m7
    paddw  m0, m4
    packuswb m0, m0
    movq   [dstq+wq], m0
    add    wq, 8
    jl     .n1_loop
    add    dstq, stridemp
    movif32 dstm, dstq         ; x86-32 keeps dst in memory between calls
    ret
| |
; sgr_filter_3x3_8bpc: 3x3 self-guided restoration filter, 8-bit path.
; On x86-32 most state lives on the stack (only 8 XMM regs available);
; constants m8-m13 are aliased to memory operands.  On x86-64 everything
; stays in registers.  wq is negated so loops run from -w up to 0.
%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%assign extra_stack 4*16
%else
%assign extra_stack 2*16
%endif
cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
                             dst, stride, left, lpf, w
%if STACK_ALIGNMENT < 16
; unaligned-stack ABI: spill all arguments to fixed stack slots
 %define dstm         dword [esp+calloff+16*2+4*0]
 %define stridemp     dword [esp+calloff+16*2+4*1]
 %define leftm        dword [esp+calloff+16*2+4*2]
 %define lpfm         dword [esp+calloff+16*2+4*3]
 %define w0m          dword [esp+calloff+16*2+4*4]
 %define hd           dword [esp+calloff+16*2+4*5]
 %define edgeb        byte  [esp+calloff+16*2+4*6]
 %define edged        dword [esp+calloff+16*2+4*6]
 %define leftmp leftm
%else
 %define w0m wm
 %define hd dword r5m
 %define edgeb byte r7m
 %define edged dword r7m
%endif
 %define hvsrcm dword [esp+calloff+4*0]
 %define w1m    dword [esp+calloff+4*1]
 %define t3m    dword [esp+calloff+4*2]
 %define t4m    dword [esp+calloff+4*3]
 %define  m8 [base+pb_0to15]
 %define  m9 [esp+calloff+16*1]
 %define m10 [base+pd_0xf00801c7]
 %define m11 [base+pd_34816]
 %define m12 m6
 %define m13 [base+sgr_lshuf3]
 %define base r6-$$
%assign calloff 0
%if STACK_ALIGNMENT < 16
    mov    strideq, [rstk+stack_offset+ 8]
    mov    leftq,   [rstk+stack_offset+12]
    mov    lpfq,    [rstk+stack_offset+16]
    mov    wd,      [rstk+stack_offset+20]
    mov    dstm, dstq
    mov    stridemp, strideq
    mov    leftm, leftq
    mov    r1, [rstk+stack_offset+24]
    mov    r2, [rstk+stack_offset+32]
    mov    lpfm, lpfq
    mov    hd, r1
    mov    edged, r2
%endif
%else
cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
                                                   w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    mov    wd, wm
%endif
%if ARCH_X86_64
    mov    paramsq, r6mp
    lea    r13, [sgr_x_by_x-0xf03] ; biased base for GATHER_X_BY_X lookups
    mov    hd, hm
    mov    edged, r7m
    movq   m9, [paramsq+4]
    add    lpfq, wq                ; point at the row end; index backwards
    lea    t1, [rsp+wq*2+12]
    mova   m8, [pb_0to15]
    add    dstq, wq
    lea    t3, [rsp+wq*4+400*12+8]
    mova   m10, [pd_0xf00801c7]
    lea    t4, [rsp+wq*2+400*32+8]
    mova   m11, [pd_34816]
    pshuflw m7, m9, q3333
    pshufb m9, [pw_256] ; s1
    punpcklqdq m7, m7   ; w1
    neg    wq                      ; wq = -w (loops count up to 0)
    pxor   m6, m6                  ; m6 = constant zero
    mova   m13, [sgr_lshuf3]
    psllw  m7, 4                   ; pre-shift w1 for pmulhrsw in .n0/.n1
    DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov    r1, [rstk+stack_offset+28] ; params
    LEA    r6, $$
    movq   m1, [r1+4]
    add    lpfm, wq
    lea    t1, [rsp+extra_stack+wq*2+20]
    add    dstq, wq
    lea    t3, [rsp+extra_stack+wq*4+400*12+16]
    mov    dstm, dstq
    lea    t4, [rsp+extra_stack+wq*2+400*32+16]
    mov    t3m, t3
    pshuflw m7, m1, q3333
    mov    t4m, t4
    pshufb m1, [base+pw_256] ; s1
    punpcklqdq m7, m7        ; w1
    psllw  m7, 4
    neg    wq
    mova   m9, m1
    pxor   m6, m6
    mov    w1m, wd
    sub    wd, 2                   ; w0m = -w-2 for .h-style loops
    mov    lpfq, lpfm
    mov    w0m, wd
 %define strideq r5
%endif
; Top-level row pipeline: prime the sums with the top padding rows (or
; duplicate the first row if LR_HAVE_TOP is absent), then alternate
; .hv0/.hv1 passes two rows at a time, emitting output via .n0/.n1.
    test   edgeb, 4 ; LR_HAVE_TOP
    jz     .no_top
    call   .h_top
    add    lpfq, stridemp
    mov    t2, t1
    add    t1, 400*6
    call   .h_top
    movif32 strideq, stridemp
    lea    r10, [lpfq+strideq*4]
    mov    lpfq, dstq
    add    r10, strideq
    mov    lpfm, r10 ; below
    movif32 t4, t4m
    call   .hv0
.main:
    dec    hd
    jz     .height1
    movif32 lpfq, hvsrcm
    add    lpfq, stridemp
    call   .hv1
    call   .prep_n
    sub    hd, 2
    jl     .extend_bottom
.main_loop:
    movif32 lpfq, hvsrcm
    add    lpfq, stridemp
    call   .hv0
%if ARCH_X86_64
    test   hb, hb
%else
    mov    r4, hd
    test   r4, r4
%endif
    jz     .odd_height
    movif32 lpfq, hvsrcm
    add    lpfq, stridemp
    call   .hv1
    call   .n0
    call   .n1
    sub    hd, 2
    jge    .main_loop
    test   edgeb, 8 ; LR_HAVE_BOTTOM
    jz     .extend_bottom
    mov    lpfq, lpfm              ; process the two rows below the unit
    call   .hv0_bottom
    movif32 lpfq, hvsrcm
    add    lpfq, stridemp
    call   .hv1_bottom
.end:
    call   .n0
    call   .n1
.end2:
    RET
.height1:
    call   .v1
    call   .prep_n
    jmp    .odd_height_end
.odd_height:
    call   .v1
    call   .n0
    call   .n1
.odd_height_end:
    call   .v0
    call   .v1
    call   .n0
    jmp    .end2
.extend_bottom:
    ; no bottom rows available: extend by repeating vertical-only passes
    call   .v0
    call   .v1
    jmp    .end
.no_top:
    movif32 strideq, stridemp
    lea    r10, [lpfq+strideq*4]
    mov    lpfq, dstq
    lea    r10, [r10+strideq*2]
    mov    lpfm, r10
    call   .h
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    wq, w0m
    mov    hvsrcm, lpfq
%endif
    lea    t2, [t1+400*6]
.top_fixup_loop:
    ; duplicate the first row's sums into t2 (no real top row exists)
    mova   m0, [t1+wq*2+400*0]
    mova   m1, [t1+wq*2+400*2]
    mova   m2, [t1+wq*2+400*4]
    mova   [t2+wq*2+400*0], m0
    mova   [t2+wq*2+400*2], m1
    mova   [t2+wq*2+400*4], m2
    add    wq, 8
    jl     .top_fixup_loop
    movif32 t3, t3m
    movif32 t4, t4m
    call   .v0
    jmp    .main
.extend_right:
; Pad the right edge of m5 by replicating the last valid pixel into the
; out-of-bounds lanes (used when LR_HAVE_RIGHT is absent).
%assign stack_offset stack_offset+8 ; inside a call: return addr on stack
%assign calloff 8
    movd   m0, [lpfq-1]        ; last valid source pixel
    movd   m1, wd
    mova   m3, m8              ; m8 = pb_0to15 lane indices
    pshufb m0, m6              ; broadcast edge pixel (m6 = 0)
    pshufb m1, m6              ; broadcast -remaining-width
    mova   m2, m6
    psubb  m2, m1
    pcmpgtb m2, m3             ; mask = 0xff for in-bounds lanes
    pand   m5, m2
    pandn  m2, m0
    por    m5, m2
    ret
%assign stack_offset stack_offset-4 ; restore normal call depth
%assign calloff 4
.h: ; horizontal boxsum
; 3-tap horizontal sums (16-bit) and sums of squares (32-bit) for one row,
; written to the t1 row buffer.
%if ARCH_X86_64
    lea    wq, [r4-2]          ; wq = -w-2 (r4 = -w from the prologue)
%else
 %define leftq r4
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .h_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    add    leftmp, 4
    palignr m5, m4, 14         ; splice 2 left pixels (3-tap window)
    jmp    .h_main
.h_extend_left:
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    pshufb m5, m13             ; sgr_lshuf3: replicate leftmost pixel
    jmp    .h_main
.h_top:
%if ARCH_X86_64
    lea    wq, [r4-2]
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .h_extend_left
    movif32 wq, w0m
.h_loop:
    movu   m5, [lpfq+wq]
.h_main:
    test   edgeb, 2 ; LR_HAVE_RIGHT
    jnz    .h_have_right
    cmp    wd, -9              ; near right edge? pad via extend_right
    jl     .h_have_right
    call   .extend_right
.h_have_right:
    punpcklbw m4, m5, m6       ; widen to 16 bit (m6 = 0)
    punpckhbw m5, m6
    ; 3-tap sliding window over src[x..x+2]
    palignr m0, m5, m4, 2
    paddw  m1, m4, m0
    punpcklwd m2, m4, m0
    pmaddwd m2, m2
    punpckhwd m3, m4, m0
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw  m1, m5 ; sum
    punpcklwd m4, m5, m6
    pmaddwd m4, m4
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd  m2, m4 ; sumsq
    paddd  m3, m5
    mova   [t1+wq*2+400*0], m1
    mova   [t1+wq*2+400*2], m2
    mova   [t1+wq*2+400*4], m3
    add    wq, 8
    jl     .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
; Even-row pass: 3-tap horizontal sums, vertical accumulation over three
; rows (t1 holds the running pair, t2 the older row), then the SGR a/b
; computation: a*9 - b*b = p, z = min(p*s >> 20, 255), table lookup, and
; the scaled b written to the first t3/t4 plane.
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    hvsrcm, lpfq        ; remember the source row for re-reads
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .hv0_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    add    leftmp, 4
    palignr m5, m4, 14
    jmp    .hv0_main
.hv0_extend_left:
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    pshufb m5, m13
    jmp    .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    hvsrcm, lpfq
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .hv0_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp    .hv0_loop_start
%endif
.hv0_loop:
    movif32 lpfq, hvsrcm
.hv0_loop_start:
    movu   m5, [lpfq+wq]
.hv0_main:
    test   edgeb, 2 ; LR_HAVE_RIGHT
    jnz    .hv0_have_right
    cmp    wd, -9
    jl     .hv0_have_right
    call   .extend_right
.hv0_have_right:
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    palignr m0, m5, m4, 2
    paddw  m1, m4, m0
    punpcklwd m2, m4, m0
    pmaddwd m2, m2
    punpckhwd m3, m4, m0
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw  m1, m5 ; sum
    punpcklwd m4, m5, m6
    pmaddwd m4, m4
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd  m2, m4 ; sumsq
    paddd  m3, m5
    ; vertical: cur + prev (t1), then + prev-prev (t2); rotate buffers
    paddw  m0, m1, [t1+wq*2+400*0]
    paddd  m4, m2, [t1+wq*2+400*2]
    paddd  m5, m3, [t1+wq*2+400*4]
    mova   [t1+wq*2+400*0], m1
    mova   [t1+wq*2+400*2], m2
    mova   [t1+wq*2+400*4], m3
    paddw  m1, m0, [t2+wq*2+400*0]
    paddd  m2, m4, [t2+wq*2+400*2]
    paddd  m3, m5, [t2+wq*2+400*4]
    mova   [t2+wq*2+400*0], m0
    mova   [t2+wq*2+400*2], m4
    mova   [t2+wq*2+400*4], m5
    pslld  m4, m2, 3
    pslld  m5, m3, 3
    paddd  m4, m2 ; a * 9
    paddd  m5, m3
    punpcklwd m0, m1, m6 ; b
    pmaddwd m2, m0, m0 ; b * b
    punpckhwd m1, m6
    pmaddwd m3, m1, m1
    psubd  m4, m2 ; p
    psubd  m5, m3
    MULLD  m4, m9, m12 ; p * s
    MULLD  m5, m9, m12
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    paddusw m4, m10            ; unsigned saturation clamps z below
    paddusw m5, m10
    psrld  m4, 20 ; min(z, 255)
    movif32 t3, t3m
    psrld  m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm ; x = sgr_x_by_x[z] (clobbers r0)
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD  m0, m4, m12
    MULLD  m1, m5, m12
%if ARCH_X86_32
    pxor   m6, m6              ; m12 aliases m6 on x86-32; re-zero it
%endif
    paddd  m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd  m1, m11
    mova   [t4+wq*2+4], m3
    psrld  m0, 12
    psrld  m1, 12
    mova   [t3+wq*4+ 8], m0
    mova   [t3+wq*4+24], m1
    add    wq, 8
    jl     .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
; Odd-row pass: like .hv0 but accumulates against t2 only and writes the
; a/b results to the second t3/t4 plane (+400*2 / +400*4); t1 and t2 are
; swapped afterwards.
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    hvsrcm, lpfq
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .hv1_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    add    leftmp, 4
    palignr m5, m4, 14
    jmp    .hv1_main
.hv1_extend_left:
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    pshufb m5, m13
    jmp    .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    hvsrcm, lpfq
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .hv1_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp    .hv1_loop_start
%endif
.hv1_loop:
    movif32 lpfq, hvsrcm
.hv1_loop_start:
    movu   m5, [lpfq+wq]
.hv1_main:
    test   edgeb, 2 ; LR_HAVE_RIGHT
    jnz    .hv1_have_right
    cmp    wd, -9
    jl     .hv1_have_right
    call   .extend_right
.hv1_have_right:
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    palignr m1, m5, m4, 2
    paddw  m0, m4, m1
    punpcklwd m2, m4, m1
    pmaddwd m2, m2
    punpckhwd m3, m4, m1
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw  m0, m5 ; h sum
    punpcklwd m1, m5, m6
    pmaddwd m1, m1
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd  m2, m1 ; h sumsq
    paddd  m3, m5
    paddw  m1, m0, [t2+wq*2+400*0]
    paddd  m4, m2, [t2+wq*2+400*2]
    paddd  m5, m3, [t2+wq*2+400*4]
    mova   [t2+wq*2+400*0], m0
    mova   [t2+wq*2+400*2], m2
    mova   [t2+wq*2+400*4], m3
    pslld  m2, m4, 3
    pslld  m3, m5, 3
    paddd  m4, m2 ; a * 9
    paddd  m5, m3
    punpcklwd m0, m1, m6 ; b
    pmaddwd m2, m0, m0 ; b * b
    punpckhwd m1, m6
    pmaddwd m3, m1, m1
    psubd  m4, m2 ; p
    psubd  m5, m3
    MULLD  m4, m9, m12 ; p * s
    MULLD  m5, m9, m12
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    paddusw m4, m10
    paddusw m5, m10
    psrld  m4, 20 ; min(z, 255)
    movif32 t3, t3m
    psrld  m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm ; x = sgr_x_by_x[z] (clobbers r0)
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD  m0, m4, m12
    MULLD  m1, m5, m12
%if ARCH_X86_32
    pxor   m6, m6              ; m12 aliases m6 on x86-32; re-zero it
%endif
    paddd  m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd  m1, m11
    mova   [t4+wq*2+400*2 +4], m3
    psrld  m0, 12
    psrld  m1, 12
    mova   [t3+wq*4+400*4+ 8], m0
    mova   [t3+wq*4+400*4+24], m1
    add    wq, 8
    jl     .hv1_loop
    mov    r10, t2             ; swap t1 <-> t2 for the next row pair
    mov    t2, t1
    mov    t1, r10
    ret
.v0: ; vertical boxsums + ab (even rows)
; Vertical-only even-row pass (no new source row): double the t1 sums and
; add t2, then run the a/b computation into the first t3/t4 plane.
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    wd, w0m
%endif
.v0_loop:
    mova   m0, [t1+wq*2+400*0]
    mova   m4, [t1+wq*2+400*2]
    mova   m5, [t1+wq*2+400*4]
    paddw  m0, m0              ; current row counted twice
    paddd  m4, m4
    paddd  m5, m5
    paddw  m1, m0, [t2+wq*2+400*0]
    paddd  m2, m4, [t2+wq*2+400*2]
    paddd  m3, m5, [t2+wq*2+400*4]
    mova   [t2+wq*2+400*0], m0
    mova   [t2+wq*2+400*2], m4
    mova   [t2+wq*2+400*4], m5
    pslld  m4, m2, 3
    pslld  m5, m3, 3
    paddd  m4, m2 ; a * 9
    paddd  m5, m3
    punpcklwd m0, m1, m6 ; b
    pmaddwd m2, m0, m0 ; b * b
    punpckhwd m1, m6
    pmaddwd m3, m1, m1
    psubd  m4, m2 ; p
    psubd  m5, m3
    MULLD  m4, m9, m12 ; p * s
    MULLD  m5, m9, m12
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    paddusw m4, m10
    paddusw m5, m10
    psrld  m4, 20 ; min(z, 255)
    psrld  m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm ; x = sgr_x_by_x[z] (clobbers r0)
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD  m0, m4, m12
    MULLD  m1, m5, m12
%if ARCH_X86_32
    pxor   m6, m6              ; m12 aliases m6 on x86-32; re-zero it
%endif
    paddd  m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd  m1, m11
    mova   [t4+wq*2+4], m3
    psrld  m0, 12
    psrld  m1, 12
    mova   [t3+wq*4+ 8], m0
    mova   [t3+wq*4+24], m1
    add    wq, 8
    jl     .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
; Vertical-only odd-row pass: t1 + t2 (no doubling), results to the
; second t3/t4 plane; t1 and t2 are swapped afterwards.
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    wd, w0m
%endif
.v1_loop:
    mova   m0, [t1+wq*2+400*0]
    mova   m4, [t1+wq*2+400*2]
    mova   m5, [t1+wq*2+400*4]
    paddw  m1, m0, [t2+wq*2+400*0]
    paddd  m2, m4, [t2+wq*2+400*2]
    paddd  m3, m5, [t2+wq*2+400*4]
    mova   [t2+wq*2+400*0], m0
    mova   [t2+wq*2+400*2], m4
    mova   [t2+wq*2+400*4], m5
    pslld  m4, m2, 3
    pslld  m5, m3, 3
    paddd  m4, m2 ; a * 9
    paddd  m5, m3
    punpcklwd m0, m1, m6 ; b
    pmaddwd m2, m0, m0 ; b * b
    punpckhwd m1, m6
    pmaddwd m3, m1, m1
    psubd  m4, m2 ; p
    psubd  m5, m3
    MULLD  m4, m9, m12 ; p * s
    MULLD  m5, m9, m12
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    paddusw m4, m10
    paddusw m5, m10
    psrld  m4, 20 ; min(z, 255)
    psrld  m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm ; x = sgr_x_by_x[z] (clobbers r0)
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD  m0, m4, m12
    MULLD  m1, m5, m12
%if ARCH_X86_32
    pxor   m6, m6              ; m12 aliases m6 on x86-32; re-zero it
%endif
    paddd  m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd  m1, m11
    mova   [t4+wq*2+400*2+ 4], m3
    psrld  m0, 12
    psrld  m1, 12
    mova   [t3+wq*4+400*4+ 8], m0
    mova   [t3+wq*4+400*4+24], m1
    add    wq, 8
    jl     .v1_loop
    mov    r10, t2             ; swap t1 <-> t2 for the next row pair
    mov    t2, t1
    mov    t1, r10
    ret
.prep_n: ; initial neighbor setup
; Primes the 3x3 neighbor pipeline: builds the "444" (3-tap sum * 4) and
; "343" (444 - center sum) weighted neighbor sums of a (t4) and b (t3)
; for the row pair preceding the first output row.
    movif64 wq, r4
    movif32 wd, w1m
.prep_n_loop:
    ; row 0 (even plane): a/b at x, x+1, x+2
    movu   m0, [t4+wq*2+400*0+ 4]
    movu   m1, [t3+wq*4+400*0+ 8]
    movu   m2, [t3+wq*4+400*0+24]
    movu   m3, [t4+wq*2+400*0+ 2]
    movu   m4, [t3+wq*4+400*0+ 4]
    movu   m5, [t3+wq*4+400*0+20]
    paddw  m0, [t4+wq*2+400*0+ 0]
    paddd  m1, [t3+wq*4+400*0+ 0]
    paddd  m2, [t3+wq*4+400*0+16]
    paddw  m3, m0
    paddd  m4, m1
    paddd  m5, m2
    psllw  m3, 2                ; a[-1] 444
    pslld  m4, 2                ; b[-1] 444
    pslld  m5, 2
    psubw  m3, m0               ; a[-1] 343
    psubd  m4, m1               ; b[-1] 343
    psubd  m5, m2
    mova   [t4+wq*2+400*4], m3
    mova   [t3+wq*4+400*8+ 0], m4
    mova   [t3+wq*4+400*8+16], m5
    ; row 1 (odd plane): store both the 444 and 343 forms
    movu   m0, [t4+wq*2+400*2+ 4]
    movu   m1, [t3+wq*4+400*4+ 8]
    movu   m2, [t3+wq*4+400*4+24]
    movu   m3, [t4+wq*2+400*2+ 2]
    movu   m4, [t3+wq*4+400*4+ 4]
    movu   m5, [t3+wq*4+400*4+20]
    paddw  m0, [t4+wq*2+400*2+ 0]
    paddd  m1, [t3+wq*4+400*4+ 0]
    paddd  m2, [t3+wq*4+400*4+16]
    paddw  m3, m0
    paddd  m4, m1
    paddd  m5, m2
    psllw  m3, 2                ; a[ 0] 444
    pslld  m4, 2                ; b[ 0] 444
    pslld  m5, 2
    mova   [t4+wq*2+400* 6], m3
    mova   [t3+wq*4+400*12+ 0], m4
    mova   [t3+wq*4+400*12+16], m5
    psubw  m3, m0               ; a[ 0] 343
    psubd  m4, m1               ; b[ 0] 343
    psubd  m5, m2
    mova   [t4+wq*2+400* 8], m3
    mova   [t3+wq*4+400*16+ 0], m4
    mova   [t3+wq*4+400*16+16], m5
    add    wq, 8
    jl     .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
; Computes the next row's 444/343 neighbor sums, combines them with the
; two saved rows (343 + 444 + 343 weighting across three rows), then
; filters: dst += ((b - a*src) >> 9) * w0 via pmulhrsw with m7.
    movif64 wq, r4
    movif32 wd, w1m
.n0_loop:
    ; a plane (t4, 16-bit)
    movu   m3, [t4+wq*2+400*0+4]
    movu   m1, [t4+wq*2+400*0+2]
    paddw  m3, [t4+wq*2+400*0+0]
    paddw  m1, m3
    psllw  m1, 2                ; a[ 1] 444
    psubw  m2, m1, m3           ; a[ 1] 343
    paddw  m3, m2, [t4+wq*2+400*4]
    paddw  m3, [t4+wq*2+400*6]
    mova   [t4+wq*2+400*4], m2  ; rotate the saved 343/444 rows
    mova   [t4+wq*2+400*6], m1
    ; b plane low half (t3, 32-bit)
    movu   m4, [t3+wq*4+400*0+8]
    movu   m1, [t3+wq*4+400*0+4]
    paddd  m4, [t3+wq*4+400*0+0]
    paddd  m1, m4
    pslld  m1, 2                ; b[ 1] 444
    psubd  m2, m1, m4           ; b[ 1] 343
    paddd  m4, m2, [t3+wq*4+400* 8+ 0]
    paddd  m4, [t3+wq*4+400*12+ 0]
    mova   [t3+wq*4+400* 8+ 0], m2
    mova   [t3+wq*4+400*12+ 0], m1
    ; b plane high half
    movu   m5, [t3+wq*4+400*0+24]
    movu   m1, [t3+wq*4+400*0+20]
    paddd  m5, [t3+wq*4+400*0+16]
    paddd  m1, m5
    pslld  m1, 2
    psubd  m2, m1, m5
    paddd  m5, m2, [t3+wq*4+400* 8+16]
    paddd  m5, [t3+wq*4+400*12+16]
    mova   [t3+wq*4+400* 8+16], m2
    mova   [t3+wq*4+400*12+16], m1
    movq   m0, [dstq+wq]
    punpcklbw m0, m6
    punpcklwd m1, m0, m6
    punpcklwd m2, m3, m6
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    psubd  m4, m2 ; b - a * src + (1 << 8)
    psubd  m5, m3
    psrad  m4, 9
    psrad  m5, 9
    packssdw m4, m5
    pmulhrsw m4, m7            ; * w0 with rounding (m7 pre-shifted by 4)
    paddw  m0, m4
    packuswb m0, m0            ; clamp to 8-bit pixels
    movq   [dstq+wq], m0
    add    wq, 8
    jl     .n0_loop
    add    dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
; Same as .n0 but sourcing the odd-row a/b plane (+400*2 / +400*4) and
; rotating the other half of the saved 343/444 rows.
    movif64 wq, r4
    movif32 wd, w1m
.n1_loop:
    ; a plane (t4, 16-bit)
    movu   m3, [t4+wq*2+400*2+4]
    movu   m1, [t4+wq*2+400*2+2]
    paddw  m3, [t4+wq*2+400*2+0]
    paddw  m1, m3
    psllw  m1, 2                ; a[ 1] 444
    psubw  m2, m1, m3           ; a[ 1] 343
    paddw  m3, m2, [t4+wq*2+400*6]
    paddw  m3, [t4+wq*2+400*8]
    mova   [t4+wq*2+400*6], m1
    mova   [t4+wq*2+400*8], m2
    ; b plane low half (t3, 32-bit)
    movu   m4, [t3+wq*4+400*4+8]
    movu   m1, [t3+wq*4+400*4+4]
    paddd  m4, [t3+wq*4+400*4+0]
    paddd  m1, m4
    pslld  m1, 2                ; b[ 1] 444
    psubd  m2, m1, m4           ; b[ 1] 343
    paddd  m4, m2, [t3+wq*4+400*12+ 0]
    paddd  m4, [t3+wq*4+400*16+ 0]
    mova   [t3+wq*4+400*12+ 0], m1
    mova   [t3+wq*4+400*16+ 0], m2
    ; b plane high half
    movu   m5, [t3+wq*4+400*4+24]
    movu   m1, [t3+wq*4+400*4+20]
    paddd  m5, [t3+wq*4+400*4+16]
    paddd  m1, m5
    pslld  m1, 2
    psubd  m2, m1, m5
    paddd  m5, m2, [t3+wq*4+400*12+16]
    paddd  m5, [t3+wq*4+400*16+16]
    mova   [t3+wq*4+400*12+16], m1
    mova   [t3+wq*4+400*16+16], m2
    movq   m0, [dstq+wq]
    punpcklbw m0, m6
    punpcklwd m1, m0, m6
    punpcklwd m2, m3, m6
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    psubd  m4, m2 ; b - a * src + (1 << 8)
    psubd  m5, m3
    psrad  m4, 9
    psrad  m5, 9
    packssdw m4, m5
    pmulhrsw m4, m7
    paddw  m0, m4
    packuswb m0, m0
    movq   [dstq+wq], m0
    add    wq, 8
    jl     .n1_loop
    add    dstq, stridemp
    movif32 dstm, dstq         ; x86-32 keeps dst in memory between calls
    ret
| |
; sgr_filter_mix_8bpc: combined 5x5 + 3x3 self-guided restoration (both
; passes blended with weights w0/w1).  Maintains separate sum5 and sum3
; row buffers.  On x86-32 m13-m15 and the zero register m6 are spilled to
; stack slots since only 8 XMM regs exist.
%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%assign extra_stack 10*16
%else
%assign extra_stack 8*16
%endif
cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
                             dst, stride, left, lpf, w
%if STACK_ALIGNMENT < 16
; unaligned-stack ABI: spill all arguments to fixed stack slots
 %define dstm         dword [esp+calloff+16*8+4*0]
 %define stridemp     dword [esp+calloff+16*8+4*1]
 %define leftm        dword [esp+calloff+16*8+4*2]
 %define lpfm         dword [esp+calloff+16*8+4*3]
 %define w0m          dword [esp+calloff+16*8+4*4]
 %define hd           dword [esp+calloff+16*8+4*5]
 %define edgeb        byte  [esp+calloff+16*8+4*6]
 %define edged        dword [esp+calloff+16*8+4*6]
 %define leftmp leftm
%else
 %define w0m wm
 %define hd dword r5m
 %define edgeb byte r7m
 %define edged dword r7m
%endif
 %define hvsrcm dword [esp+calloff+4*0]
 %define w1m    dword [esp+calloff+4*1]
 %define t3m    dword [esp+calloff+4*2]
 %define t4m    dword [esp+calloff+4*3]
 %xdefine m8 m6
 %define  m9 [base+pd_0xffff]
 %define m10 [base+pd_34816]
 %define m11 [base+pd_0xf00801c7]
 %define m12 [base+pd_0xf00800a4]
 %define m13 [esp+calloff+16*4]
 %define m14 [esp+calloff+16*5]
 %define m15 [esp+calloff+16*6]
 %define  m6 [esp+calloff+16*7]
 %define base r6-$$
%assign calloff 0
%if STACK_ALIGNMENT < 16
    mov    strideq, [rstk+stack_offset+ 8]
    mov    leftq,   [rstk+stack_offset+12]
    mov    lpfq,    [rstk+stack_offset+16]
    mov    wd,      [rstk+stack_offset+20]
    mov    dstm, dstq
    mov    stridemp, strideq
    mov    leftm, leftq
    mov    r1, [rstk+stack_offset+24]
    mov    r2, [rstk+stack_offset+32]
    mov    lpfm, lpfq
    mov    hd, r1
    mov    edged, r2
%endif
%else
cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
                                                    w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    mov    wd, wm
%endif
%if ARCH_X86_64
    mov    paramsq, r6mp
    lea    r13, [sgr_x_by_x-0xf03] ; biased base for GATHER_X_BY_X lookups
    movifnidn hd, hm
    mov    edged, r7m
    mova   m15, [paramsq]
    add    lpfq, wq                ; point at the row end; index backwards
    mova   m9, [pd_0xffff]
    lea    t1, [rsp+wq*2+44]
    mova   m10, [pd_34816]
    add    dstq, wq
    lea    t3, [rsp+wq*4+400*24+40]
    mova   m11, [pd_0xf00801c7]
    lea    t4, [rsp+wq*2+400*52+40]
    mova   m12, [base+pd_0xf00800a4]
    neg    wq                      ; wq = -w (loops count up to 0)
    pshuflw m13, m15, q0000
    pshuflw m14, m15, q2222
    pshufhw m15, m15, q1010
    punpcklqdq m13, m13 ; s0
    punpcklqdq m14, m14 ; s1
    punpckhqdq m15, m15 ; w0 w1
    pxor   m6, m6                  ; m6 = constant zero
    psllw  m15, 2                  ; pre-shift weights for pmulhrsw
    DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov    r1, [rstk+stack_offset+28] ; params
    LEA    r6, $$
    mova   m2, [r1]
    add    lpfm, wq
    lea    t1, [rsp+extra_stack+wq*2+52]
    add    dstq, wq
    lea    t3, [rsp+extra_stack+wq*4+400*24+48]
    mov    dstm, dstq
    lea    t4, [rsp+extra_stack+wq*2+400*52+48]
    mov    t3m, t3
    mov    t4m, t4
    neg    wq
    pshuflw m0, m2, q0000
    pshuflw m1, m2, q2222
    pshufhw m2, m2, q1010
    punpcklqdq m0, m0 ; s0
    punpcklqdq m1, m1 ; s1
    punpckhqdq m2, m2 ; w0 w1
    mov    w1m, wd
    pxor   m3, m3
    psllw  m2, 2
    mova   m13, m0                 ; spill constants to their stack slots
    mova   m14, m1
    sub    wd, 2
    mova   m15, m2
    mova   m6, m3
    mov    lpfq, lpfm
    mov    w0m, wd
 %define strideq r5
%endif
; Top-level row pipeline for the mix filter.  Reuses the 5x5 filter's
; .top_fixup helper (via mangle) to double the first row's sum5 planes.
    test   edgeb, 4 ; LR_HAVE_TOP
    jz     .no_top
    call   .h_top
    add    lpfq, stridemp
    mov    t2, t1
%if ARCH_X86_64
    call   mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
%else
    mov    wq, w0m
    call   mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
%endif
    add    t1, 400*12
    call   .h_top
    movif32 strideq, stridemp
    lea    r10, [lpfq+strideq*4]
    mov    lpfq, dstq
    add    r10, strideq
    mov    lpfm, r10 ; below
    movif32 t4, t4m
    call   .hv0
.main:
    dec    hd
    jz     .height1
    movif32 lpfq, hvsrcm
    add    lpfq, stridemp
    call   .hv1
    call   .prep_n
    sub    hd, 2
    jl     .extend_bottom
.main_loop:
    movif32 lpfq, hvsrcm
    add    lpfq, stridemp
    call   .hv0
%if ARCH_X86_64
    test   hd, hd
%else
    mov    r4, hd
    test   r4, r4
%endif
    jz     .odd_height
    movif32 lpfq, hvsrcm
    add    lpfq, stridemp
    call   .hv1
    call   .n0
    call   .n1
    sub    hd, 2
    jge    .main_loop
    test   edgeb, 8 ; LR_HAVE_BOTTOM
    jz     .extend_bottom
    mov    lpfq, lpfm              ; process the two rows below the unit
    call   .hv0_bottom
    movif32 lpfq, hvsrcm
    add    lpfq, stridemp
    call   .hv1_bottom
.end:
    call   .n0
    call   .n1
.end2:
    RET
.height1:
    call   .v1
    call   .prep_n
    jmp    .odd_height_end
.odd_height:
    call   .v1
    call   .n0
    call   .n1
.odd_height_end:
    call   .v0
    call   .v1
    call   .n0
    jmp    .end2
.extend_bottom:
    ; no bottom rows available: extend by repeating vertical-only passes
    call   .v0
    call   .v1
    jmp    .end
.no_top:
    movif32 strideq, stridemp
    lea    r10, [lpfq+strideq*4]
    mov    lpfq, dstq
    lea    r10, [r10+strideq*2]
    mov    lpfm, r10
    call   .h
%if ARCH_X86_64
    lea    wq, [r4-2]
%else
    mov    wq, w0m
    mov    hvsrcm, lpfq
%endif
    lea    t2, [t1+400*12]
.top_fixup_loop:
    ; first-row fixup: sum5 planes (400*0/2/4) are doubled, sum3 planes
    ; (400*6/8/10) are copied as-is into t2
    mova   m0, [t1+wq*2+400* 0]
    mova   m1, [t1+wq*2+400* 2]
    mova   m2, [t1+wq*2+400* 4]
    paddw  m0, m0
    mova   m3, [t1+wq*2+400* 6]
    paddd  m1, m1
    mova   m4, [t1+wq*2+400* 8]
    paddd  m2, m2
    mova   m5, [t1+wq*2+400*10]
    mova   [t2+wq*2+400* 0], m0
    mova   [t2+wq*2+400* 2], m1
    mova   [t2+wq*2+400* 4], m2
    mova   [t2+wq*2+400* 6], m3
    mova   [t2+wq*2+400* 8], m4
    mova   [t2+wq*2+400*10], m5
    add    wq, 8
    jl     .top_fixup_loop
    movif32 t3, t3m
    movif32 t4, t4m
    call   .v0
    jmp    .main
.extend_right:
; Pad the right edge of m5 by replicating the last valid pixel.  On
; x86-64, m6 (zero) is temporarily swapped into m8 by the callers, so the
; zero register here is m8; the SWAPs restore normal numbering around the
; memory-operand constants.
%assign stack_offset stack_offset+8 ; inside a call: return addr on stack
%assign calloff 8
%if ARCH_X86_64
    SWAP   m8, m6
%endif
    movd   m1, wd
    movd   m3, [lpfq-1]        ; last valid source pixel
    pshufb m1, m8              ; broadcast -remaining-width (m8 = 0 here)
    pshufb m3, m8              ; broadcast edge pixel
    psubb  m2, [base+pb_1], m1
    pcmpgtb m2, [base+pb_0to15] ; mask = 0xff for in-bounds lanes
    pand   m5, m2
    pandn  m2, m3
    por    m5, m2
%if ARCH_X86_64
    SWAP   m6, m8
%endif
    ret
%assign stack_offset stack_offset-4 ; restore normal call depth
%assign calloff 4
.h: ; horizontal boxsum
; Computes BOTH box sums per row: the 3-tap sums over src[x+1..x+3]
; (sum3, stored at t1+400*6/8/10) and the 5-tap sums over src[x..x+4]
; (sum5 = sum3 + the two outer taps, stored at t1+400*0/2/4).
%if ARCH_X86_64
    lea    wq, [r4-2]          ; wq = -w-2 (r4 = -w from the prologue)
%else
 %define leftq r4
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .h_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    add    leftmp, 4
    palignr m5, m4, 13         ; splice 3 left pixels (5-tap window)
    jmp    .h_main
.h_extend_left:
    movif32 wq, w0m
    mova   m5, [lpfq+wq+2]
    pshufb m5, [base+sgr_lshuf5] ; replicate leftmost pixel into the pad
    jmp    .h_main
.h_top:
%if ARCH_X86_64
    lea    wq, [r4-2]
%endif
    test   edgeb, 1 ; LR_HAVE_LEFT
    jz     .h_extend_left
    movif32 wq, w0m
.h_loop:
    movu   m5, [lpfq+wq-1]
.h_main:
    test   edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor   m8, m8              ; m8 aliases m6 slot; make a real zero reg
%else
    SWAP   m8, m6              ; free m6 for scratch; m8 becomes the zero
%endif
    jnz    .h_have_right
    cmp    wd, -10
    jl     .h_have_right
    call   .extend_right
.h_have_right:
    punpcklbw m4, m5, m8       ; widen to 16 bit (m8 = 0)
    punpckhbw m5, m8
    ; 3-tap part: src[x+1] + src[x+2] + src[x+3]
    palignr m3, m5, m4, 2
    palignr m0, m5, m4, 4
    paddw  m1, m3, m0
    punpcklwd m2, m3, m0
    pmaddwd m2, m2
    punpckhwd m3, m0
    pmaddwd m3, m3
    palignr m0, m5, m4, 6
    paddw  m1, m0 ; sum3
    punpcklwd m7, m0, m8
    pmaddwd m7, m7
    punpckhwd m0, m8
    pmaddwd m0, m0
%if ARCH_X86_64
    SWAP   m6, m8              ; restore the canonical zero register
%endif
    paddd  m2, m7 ; sumsq3
    ; 5-tap part: sum3 plus the two outer taps src[x] + src[x+4]
    palignr m5, m4, 8
    punpcklwd m7, m5, m4
    paddw  m8, m4, m5
    pmaddwd m7, m7
    punpckhwd m5, m4
    pmaddwd m5, m5
    paddd  m3, m0
    mova   [t1+wq*2+400* 6], m1
    mova   [t1+wq*2+400* 8], m2
    mova   [t1+wq*2+400*10], m3
    paddw  m8, m1 ; sum5
    paddd  m7, m2 ; sumsq5
    paddd  m5, m3
    mova   [t1+wq*2+400* 0], m8
    mova   [t1+wq*2+400* 2], m7
    mova   [t1+wq*2+400* 4], m5
    add    wq, 8
    jl     .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
; For an even row: compute horizontal 3/5-tap sums (same scheme as .h),
; accumulate them vertically into t1/t2, stash raw row sums in t3 for the
; odd-row pass, and derive the 3x3 surrogate coefficients (a3/b3 -> x3)
; stored to t4/t3. The 5-tap ab computation is deferred to .hv1.
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 13           ; splice 3 left pixels in front of the row
jmp .hv0_main
.hv0_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, [base+sgr_lshuf5] ; replicate the first pixel leftward
jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv0_extend_left
movif32 wq, w0m
%if ARCH_X86_32
jmp .hv0_loop_start
%endif
.hv0_loop:
movif32 lpfq, hvsrcm
.hv0_loop_start:
movu m5, [lpfq+wq-1]
.hv0_main:
test edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
pxor m8, m8                  ; m8 = 0
%else
SWAP m8, m6                  ; m6 is the zero reg on x86-64
%endif
jnz .hv0_have_right
cmp wd, -10
jl .hv0_have_right
call .extend_right            ; pad m5 with the last valid pixel
.hv0_have_right:
; --- horizontal sums, identical structure to .h ---
punpcklbw m4, m5, m8
punpckhbw m5, m8
palignr m3, m5, m4, 2
palignr m0, m5, m4, 4
movif32 t3, t3m
paddw m1, m3, m0
punpcklwd m2, m3, m0
pmaddwd m2, m2
punpckhwd m3, m0
pmaddwd m3, m3
palignr m0, m5, m4, 6
paddw m1, m0 ; h sum3
punpcklwd m7, m0, m8
pmaddwd m7, m7
punpckhwd m0, m8
%if ARCH_X86_64
SWAP m6, m8
%endif
pmaddwd m0, m0
paddd m2, m7 ; h sumsq3
palignr m5, m4, 8
punpcklwd m7, m5, m4
paddw m8, m4, m5
pmaddwd m7, m7
punpckhwd m5, m4
pmaddwd m5, m5
paddd m3, m0
paddw m8, m1 ; h sum5
paddd m7, m2 ; h sumsq5
paddd m5, m3
; stash this row's raw 5-tap sums in t3 for the following odd row (.v1/.hv1)
mova [t3+wq*4+400*8+ 8], m8
mova [t3+wq*4+400*0+ 8], m7
mova [t3+wq*4+400*0+24], m5
; vertical accumulation of the 5-tap sums into t1
paddw m8, [t1+wq*2+400* 0]
paddd m7, [t1+wq*2+400* 2]
paddd m5, [t1+wq*2+400* 4]
mova [t1+wq*2+400* 0], m8
mova [t1+wq*2+400* 2], m7
mova [t1+wq*2+400* 4], m5
; vertical accumulation of the 3-tap sums across t1 and t2 (two prior rows)
paddw m0, m1, [t1+wq*2+400* 6]
paddd m4, m2, [t1+wq*2+400* 8]
paddd m5, m3, [t1+wq*2+400*10]
mova [t1+wq*2+400* 6], m1
mova [t1+wq*2+400* 8], m2
mova [t1+wq*2+400*10], m3
paddw m1, m0, [t2+wq*2+400* 6]
paddd m2, m4, [t2+wq*2+400* 8]
paddd m3, m5, [t2+wq*2+400*10]
mova [t2+wq*2+400* 6], m0
mova [t2+wq*2+400* 8], m4
mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
pxor m7, m7                  ; zero for word->dword unpacks below
%else
SWAP m7, m6
%endif
; --- ab3: p3 = a3*9 - b3^2; z3 = min(p3*s1 >> 20, 255); x3 via LUT ---
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a3 * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0           ; b3^2
punpckhwd m1, m7
pmaddwd m3, m1, m1
%if ARCH_X86_64
SWAP m7, m6
%endif
psubd m4, m2 ; p3
psubd m5, m3
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11              ; saturating bias so the >>20 clamps z3 to 255
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm  ; x3 = sgr lookup table[z3]
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+400*2+ 4], m3  ; store x3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+400*4+ 8], m0  ; store scaled b3
mova [t3+wq*4+400*4+24], m1
add wq, 8
jl .hv0_loop
ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
; For an odd row: compute horizontal 3/5-tap sums, finish the vertical
; accumulation started by the preceding even row, and derive BOTH the 3x3
; (a3*9) and 5x5 (a5*25) surrogate coefficients. Swaps t1/t2 on exit so the
; ping-pong row buffers advance.
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
movif32 leftq, leftm
movddup m4, [leftq-4]
movif32 wq, w0m
mova m5, [lpfq+wq+2]
add leftmp, 4
palignr m5, m4, 13           ; splice 3 left pixels in front of the row
jmp .hv1_main
.hv1_extend_left:
movif32 wq, w0m
mova m5, [lpfq+wq+2]
pshufb m5, [base+sgr_lshuf5] ; replicate the first pixel leftward
jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov hvsrcm, lpfq
%endif
test edgeb, 1 ; LR_HAVE_LEFT
jz .hv1_extend_left
movif32 wq, w0m
%if ARCH_X86_32
jmp .hv1_loop_start
%endif
.hv1_loop:
movif32 lpfq, hvsrcm
.hv1_loop_start:
movu m5, [lpfq+wq-1]
.hv1_main:
test edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
pxor m8, m8                  ; m8 = 0
%else
SWAP m8, m6                  ; m6 is the zero reg on x86-64
%endif
jnz .hv1_have_right
cmp wd, -10
jl .hv1_have_right
call .extend_right            ; pad m5 with the last valid pixel
.hv1_have_right:
; --- horizontal sums (same scheme as .h, different register allocation) ---
punpcklbw m4, m5, m8
punpckhbw m5, m8
palignr m7, m5, m4, 2        ; x+1
palignr m3, m5, m4, 4        ; x+2
paddw m2, m7, m3
punpcklwd m0, m7, m3
pmaddwd m0, m0
punpckhwd m7, m3
pmaddwd m7, m7
palignr m3, m5, m4, 6        ; x+3
paddw m2, m3 ; h sum3
punpcklwd m1, m3, m8
pmaddwd m1, m1
punpckhwd m3, m8
%if ARCH_X86_64
SWAP m6, m8
%endif
pmaddwd m3, m3
paddd m0, m1 ; h sumsq3
palignr m5, m4, 8            ; x+4
punpckhwd m1, m4, m5
paddw m8, m4, m5             ; x+0 + x+4
pmaddwd m1, m1
punpcklwd m4, m5
pmaddwd m4, m4
paddd m7, m3
; vertical 3-tap accumulation (this row + the one stored in t2)
paddw m5, m2, [t2+wq*2+400* 6]
mova [t2+wq*2+400* 6], m2
paddw m8, m2 ; h sum5
paddd m2, m0, [t2+wq*2+400* 8]
paddd m3, m7, [t2+wq*2+400*10]
mova [t2+wq*2+400* 8], m0
mova [t2+wq*2+400*10], m7
paddd m4, m0 ; h sumsq5
paddd m1, m7
; --- ab3 for this row: p3 = a3*9 - b3^2 ---
pslld m0, m2, 3
pslld m7, m3, 3
paddd m2, m0 ; a3 * 9
paddd m3, m7
%if ARCH_X86_32
mova [esp+20], m8            ; spill sum5 — m8 is needed as zero/scratch below
pxor m8, m8
%else
SWAP m8, m6
%endif
punpcklwd m0, m5, m8 ; b3
pmaddwd m7, m0, m0           ; b3^2
punpckhwd m5, m8
pmaddwd m8, m5, m5
psubd m2, m7 ; p3
psubd m3, m8
MULLD m2, m14, m8 ; p3 * s1
MULLD m3, m14, m8
pmaddwd m0, m11 ; b3 * 455
pmaddwd m5, m11
paddusw m2, m11              ; saturating bias so the >>20 clamps z3 to 255
paddusw m3, m11
psrld m2, 20 ; min(z3, 255)
movif32 t3, t3m
psrld m3, 20
GATHER_X_BY_X m8, m2, m3, r0, dstm  ; x3 = lookup table[z3]
punpcklwd m2, m8, m8
punpckhwd m3, m8, m8
MULLD m0, m2, m7
MULLD m5, m3, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m5, m10
psrld m0, 12
psrld m5, 12
mova [t4+wq*2+400*4+ 4], m8  ; store x3
mova [t3+wq*4+400*8+ 8], m0  ; store scaled b3
mova [t3+wq*4+400*8+24], m5
%if ARCH_X86_32
mova m8, [esp+20]            ; reload spilled sum5
%else
SWAP m6, m8
pxor m6, m6                  ; re-establish m6 as the zero register
%endif
; --- ab5: accumulate sum5 over the 5-row window (t1 + t2 + this pair) ---
paddw m5, m8, [t2+wq*2+400*0]
paddd m2, m4, [t2+wq*2+400*2]
paddd m3, m1, [t2+wq*2+400*4]
paddw m5, [t1+wq*2+400*0]
paddd m2, [t1+wq*2+400*2]
paddd m3, [t1+wq*2+400*4]
mova [t2+wq*2+400*0], m8
pslld m0, m2, 4
mova [t2+wq*2+400*2], m4
pslld m8, m3, 4
mova [t2+wq*2+400*4], m1
pslld m4, m2, 3
paddd m2, m0                 ; a5*16 + a5
pslld m7, m3, 3
paddd m3, m8
paddd m2, m4 ; a5 * 25
paddd m3, m7
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
punpcklwd m0, m5, m7 ; b5
pmaddwd m4, m0, m0           ; b5^2
punpckhwd m5, m7
pmaddwd m1, m5, m5
%if ARCH_X86_64
SWAP m7, m6
%endif
psubd m2, m4 ; p5
psubd m3, m1
MULLD m2, m13, m7 ; p5 * s0
MULLD m3, m13, m7
pmaddwd m0, m12 ; b5 * 164
pmaddwd m5, m12
paddusw m2, m12              ; saturating bias so the >>20 clamps z5 to 255
paddusw m3, m12
psrld m2, 20 ; min(z5, 255)
psrld m3, 20
GATHER_X_BY_X m1, m2, m3, r0, dstm  ; x5 = lookup table[z5]
punpcklwd m2, m1, m1
punpckhwd m3, m1, m1
MULLD m0, m2, m7
MULLD m5, m3, m7
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m5, m10
mova [t4+wq*2+4], m1         ; store x5
psrld m0, 12
psrld m5, 12
mova [t3+wq*4+ 8], m0        ; store scaled b5
mova [t3+wq*4+24], m5
add wq, 8
jl .hv1_loop
mov r10, t2                  ; swap the t1/t2 row-sum buffers
mov t2, t1
mov t1, r10
ret
.v0: ; vertical boxsums + ab3 (even rows)
; Even-row variant used when no fresh source row is read (edge handling):
; reuses the horizontal sums already in t1, doubling them to stand in for
; the duplicated row, then computes ab3 exactly as .hv0 does.
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wd, w0m
%endif
.v0_loop:
mova m0, [t1+wq*2+400* 6]    ; sum3 of the current row
mova m4, [t1+wq*2+400* 8]
mova m5, [t1+wq*2+400*10]
paddw m0, m0                 ; row counted twice (duplicated edge row)
paddd m4, m4
paddd m5, m5
paddw m1, m0, [t2+wq*2+400* 6]  ; + previous rows' accumulation
paddd m2, m4, [t2+wq*2+400* 8]
paddd m3, m5, [t2+wq*2+400*10]
mova [t2+wq*2+400* 6], m0
mova [t2+wq*2+400* 8], m4
mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
pxor m7, m7                  ; zero for word->dword unpacks
%else
SWAP m7, m6
%endif
; --- ab3: p3 = a3*9 - b3^2; z3 = min(p3*s1 >> 20, 255) ---
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a3 * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0           ; b3^2
punpckhwd m1, m7
pmaddwd m3, m1, m1
psubd m4, m2 ; p3
psubd m5, m3
%if ARCH_X86_64
SWAP m7, m6
%endif
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11              ; saturating bias so the >>20 clamps z3 to 255
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm  ; x3 = lookup table[z3]
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+400*2+4], m3   ; store x3
psrld m0, 12
psrld m1, 12
; stash the raw 5-tap sums in t3 for the odd row, then double them in t1
mova m3, [t1+wq*2+400*0]
mova m4, [t1+wq*2+400*2]
mova m5, [t1+wq*2+400*4]
mova [t3+wq*4+400*8+ 8], m3
mova [t3+wq*4+400*0+ 8], m4
mova [t3+wq*4+400*0+24], m5
paddw m3, m3 ; cc5
paddd m4, m4
paddd m5, m5
mova [t1+wq*2+400*0], m3
mova [t1+wq*2+400*2], m4
mova [t1+wq*2+400*4], m5
mova [t3+wq*4+400*4+ 8], m0  ; store scaled b3
mova [t3+wq*4+400*4+24], m1
add wq, 8
jl .v0_loop
ret
.v1: ; vertical boxsums + ab (odd rows)
; Odd-row variant used when no fresh source row is read: combines the sums
; already staged in t1/t2/t3 and computes both ab3 and ab5, mirroring .hv1.
; Swaps t1/t2 on exit.
%if ARCH_X86_64
lea wq, [r4-2]
%else
mov wd, w0m
%endif
.v1_loop:
mova m4, [t1+wq*2+400* 6]    ; sum3 of the current row
mova m5, [t1+wq*2+400* 8]
mova m7, [t1+wq*2+400*10]
paddw m1, m4, [t2+wq*2+400* 6]  ; + previous rows' accumulation
paddd m2, m5, [t2+wq*2+400* 8]
paddd m3, m7, [t2+wq*2+400*10]
mova [t2+wq*2+400* 6], m4
mova [t2+wq*2+400* 8], m5
mova [t2+wq*2+400*10], m7
%if ARCH_X86_32
pxor m7, m7                  ; zero for word->dword unpacks
%else
SWAP m7, m6
%endif
; --- ab3 ---
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a3 + 8) >> 4) * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0           ; b3^2
punpckhwd m1, m7
pmaddwd m3, m1, m1
psubd m4, m2 ; p3
psubd m5, m3
%if ARCH_X86_64
SWAP m7, m6
%endif
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11              ; saturating bias so the >>20 clamps z3 to 255
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm  ; x3 = lookup table[z3]
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+400*4+4], m3   ; store x3
psrld m0, 12
psrld m8, m1, 12
; --- ab5: gather the 5-row sum5 window from t3/t2/t1 ---
mova m4, [t3+wq*4+400*8+ 8]
mova m5, [t3+wq*4+400*0+ 8]
mova m7, [t3+wq*4+400*0+24]
paddw m1, m4, [t2+wq*2+400*0]
paddd m2, m5, [t2+wq*2+400*2]
paddd m3, m7, [t2+wq*2+400*4]
paddw m1, [t1+wq*2+400*0]
paddd m2, [t1+wq*2+400*2]
paddd m3, [t1+wq*2+400*4]
mova [t2+wq*2+400*0], m4
mova [t2+wq*2+400*2], m5
mova [t2+wq*2+400*4], m7
pslld m4, m2, 4
mova [t3+wq*4+400*8+ 8], m0  ; store scaled b3 (interleaved with ab5 work)
pslld m5, m3, 4
mova [t3+wq*4+400*8+24], m8
pslld m7, m2, 3
paddd m2, m4                 ; a5*16 + a5
pslld m8, m3, 3
paddd m3, m5
paddd m2, m7 ; a5 * 25
paddd m3, m8
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
punpcklwd m0, m1, m7 ; b5
pmaddwd m4, m0, m0           ; b5^2
punpckhwd m1, m7
pmaddwd m5, m1, m1
psubd m2, m4 ; p5
psubd m3, m5
%if ARCH_X86_64
SWAP m7, m6
%endif
MULLD m2, m13, m7 ; p5 * s0
MULLD m3, m13, m7
pmaddwd m0, m12 ; b5 * 164
pmaddwd m1, m12
paddusw m2, m12              ; saturating bias so the >>20 clamps z5 to 255
paddusw m3, m12
psrld m2, 20 ; min(z5, 255)
psrld m3, 20
GATHER_X_BY_X m4, m2, m3, r0, dstm  ; x5 = lookup table[z5]
punpcklwd m2, m4, m4
punpckhwd m3, m4, m4
MULLD m0, m2, m7
MULLD m1, m3, m7
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+4], m4         ; store x5
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+ 8], m0        ; store scaled b5
mova [t3+wq*4+24], m1
add wq, 8
jl .v1_loop
mov r10, t2                  ; swap the t1/t2 row-sum buffers
mov t2, t1
mov t1, r10
ret
.prep_n: ; initial neighbor setup
; Pre-computes the neighbor-weighted a/b terms for the first output rows:
; the 5-tap terms get the 565 vertical weighting, the 3-tap terms get the
; 343/444 weighting, stored into the t4/t3 ring slots consumed by .n0/.n1.
movif64 wq, r4
movif32 wd, w1m
.prep_n_loop:
; --- 5-tap: horizontal [1 1 1] of x5, then *4 + center -> 565 pattern ---
movu m0, [t4+wq*2+400*0+ 2]  ; x5 shifted by +1
movu m1, [t3+wq*4+400*0+ 4]  ; b5 shifted by +1 (lo)
movu m2, [t3+wq*4+400*0+20]  ; b5 shifted by +1 (hi)
movu m7, [t4+wq*2+400*0+ 4]  ; x5 shifted by +2
movu m8, [t3+wq*4+400*0+ 8]
paddw m3, m0, [t4+wq*2+400*0+ 0]
paddd m4, m1, [t3+wq*4+400*0+ 0]
paddd m5, m2, [t3+wq*4+400*0+16]
paddw m3, m7
paddd m4, m8
movu m7, [t3+wq*4+400*0+24]
paddw m0, m3
paddd m1, m4
psllw m3, 2
pslld m4, 2
paddd m5, m7
paddd m2, m5
pslld m5, 2
paddw m0, m3 ; a5 565
paddd m1, m4 ; b5 565
paddd m2, m5
mova [t4+wq*2+400* 6+ 0], m0
mova [t3+wq*4+400*12+ 0], m1
mova [t3+wq*4+400*12+16], m2
; --- 3-tap, previous row: [1 1 1]*4 = 444, minus center sum = 343 ---
movu m0, [t4+wq*2+400*2+ 4]
movu m1, [t3+wq*4+400*4+ 8]
movu m2, [t3+wq*4+400*4+24]
movu m3, [t4+wq*2+400*2+ 2]
movu m4, [t3+wq*4+400*4+ 4]
movu m5, [t3+wq*4+400*4+20]
paddw m0, [t4+wq*2+400*2+ 0]
paddd m1, [t3+wq*4+400*4+ 0]
paddd m2, [t3+wq*4+400*4+16]
paddw m3, m0
paddd m4, m1
paddd m5, m2
psllw m3, 2 ; a3[-1] 444
pslld m4, 2 ; b3[-1] 444
pslld m5, 2
psubw m3, m0 ; a3[-1] 343
psubd m4, m1 ; b3[-1] 343
psubd m5, m2
mova [t4+wq*2+400* 8+ 0], m3
mova [t3+wq*4+400*16+ 0], m4
mova [t3+wq*4+400*16+16], m5
; --- 3-tap, current row: keep both the 444 and 343 variants ---
movu m0, [t4+wq*2+400*4+ 4]
movu m1, [t3+wq*4+400*8+ 8]
movu m2, [t3+wq*4+400*8+24]
movu m3, [t4+wq*2+400*4+ 2]
movu m4, [t3+wq*4+400*8+ 4]
movu m5, [t3+wq*4+400*8+20]
paddw m0, [t4+wq*2+400*4+ 0]
paddd m1, [t3+wq*4+400*8+ 0]
paddd m2, [t3+wq*4+400*8+16]
paddw m3, m0
paddd m4, m1
paddd m5, m2
psllw m3, 2 ; a3[ 0] 444
pslld m4, 2 ; b3[ 0] 444
pslld m5, 2
mova [t4+wq*2+400*10+ 0], m3
mova [t3+wq*4+400*20+ 0], m4
mova [t3+wq*4+400*20+16], m5
psubw m3, m0 ; a3[ 0] 343
psubd m4, m1 ; b3[ 0] 343
psubd m5, m2
mova [t4+wq*2+400*12+ 0], m3
mova [t3+wq*4+400*24+ 0], m4
mova [t3+wq*4+400*24+16], m5
add wq, 8
jl .prep_n_loop
ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
; Combines current and stored neighbor a/b terms (565 weighting for the
; 5-tap filter, 343+444+343 for the 3-tap), then produces one row of output:
; dst = src + ((w * (b - a*src)) + 4096) >> 13, packed back to bytes.
movif64 wq, r4
movif32 wd, w1m
.n0_loop:
; --- 5-tap: fresh 565-weighted a5 for this row pair ---
movu m0, [t4+wq*2+ 4]
movu m2, [t4+wq*2+ 2]
paddw m0, [t4+wq*2+ 0]       ; horizontal [1 1 1] of x5
paddw m0, m2
paddw m2, m0
psllw m0, 2
paddw m0, m2 ; a5
movu m4, [t3+wq*4+ 8]
movu m5, [t3+wq*4+24]
movu m1, [t3+wq*4+ 4]
movu m3, [t3+wq*4+20]
paddd m4, [t3+wq*4+ 0]
paddd m5, [t3+wq*4+16]
paddd m4, m1
paddd m5, m3
paddd m1, m4
paddd m3, m5
pslld m4, 2
pslld m5, 2
paddd m4, m1 ; b5
paddd m5, m3
; add previous row's 565 terms and rotate the stored slot
movu m2, [t4+wq*2+400* 6]
paddw m2, m0
mova [t4+wq*2+400* 6], m0
paddd m0, m4, [t3+wq*4+400*12+ 0]
paddd m1, m5, [t3+wq*4+400*12+16]
mova [t3+wq*4+400*12+ 0], m4
mova [t3+wq*4+400*12+16], m5
mova [rsp+16+ARCH_X86_32*4], m1  ; spill b5 high half for the output stage
; --- 3-tap: new row's 444/343, combined with the two stored rows ---
movu m3, [t4+wq*2+400*2+4]
movu m5, [t4+wq*2+400*2+2]
paddw m3, [t4+wq*2+400*2+0]
paddw m5, m3
psllw m5, 2 ; a3[ 1] 444
psubw m4, m5, m3 ; a3[ 1] 343
movu m3, [t4+wq*2+400* 8]
paddw m3, [t4+wq*2+400*10]
paddw m3, m4                 ; a3 = 343[-1] + 444[0] + 343[+1]
mova [t4+wq*2+400* 8], m4
mova [t4+wq*2+400*10], m5
movu m1, [t3+wq*4+400*4+ 8]
movu m5, [t3+wq*4+400*4+ 4]
movu m7, [t3+wq*4+400*4+24]
movu m8, [t3+wq*4+400*4+20]
paddd m1, [t3+wq*4+400*4+ 0]
paddd m7, [t3+wq*4+400*4+16]
paddd m5, m1
paddd m8, m7
pslld m5, 2 ; b3[ 1] 444
pslld m8, 2
psubd m4, m5, m1 ; b3[ 1] 343
%if ARCH_X86_32
mova [esp+52], m8            ; spill the 444 high half; m8 needed below
psubd m8, m7
%else
psubd m6, m8, m7             ; b3[ 1] 343, high half (via the spare m6)
SWAP m8, m6
%endif
paddd m1, m4, [t3+wq*4+400*16+ 0]
paddd m7, m8, [t3+wq*4+400*16+16]
paddd m1, [t3+wq*4+400*20+ 0]
paddd m7, [t3+wq*4+400*20+16]
mova [t3+wq*4+400*16+ 0], m4
mova [t3+wq*4+400*16+16], m8
mova [t3+wq*4+400*20+ 0], m5
%if ARCH_X86_32
mova m8, [esp+52]
%else
SWAP m8, m6
pxor m6, m6                  ; restore m6 as the zero register
%endif
mova [t3+wq*4+400*20+16], m8
mova [rsp+32+ARCH_X86_32*4], m7  ; spill b3 high half
; --- output: dst += (w . (b - a*src) + 4096) >> 13 ---
movq m4, [dstq+wq]           ; 8 source pixels
punpcklbw m4, m6
punpcklwd m5, m4, m6
punpcklwd m7, m2, m6
pmaddwd m7, m5 ; a5 * src
punpcklwd m8, m3, m6
pmaddwd m8, m5 ; a3 * src
punpckhwd m5, m4, m6
punpckhwd m2, m6
pmaddwd m2, m5
punpckhwd m3, m6
pmaddwd m3, m5
psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13)
psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13)
psrld m0, 9
pslld m1, 7
pand m0, m9                  ; interleave the 5- and 3-tap terms into one
pandn m8, m9, m1             ; vector (m9 = lane-select mask) for pmaddwd m15
por m0, m8
mova m1, [rsp+16+ARCH_X86_32*4]
psubd m1, m2
mova m2, [rsp+32+ARCH_X86_32*4]
psubd m2, m3
mova m3, [base+pd_4096]      ; rounding bias for the >>13
psrld m1, 9
pslld m2, 7
pand m1, m9
pandn m5, m9, m2
por m1, m5
pmaddwd m0, m15              ; m15 = packed (w0, w1) filter weights
pmaddwd m1, m15
paddd m0, m3
paddd m1, m3
psrad m0, 13
psrad m1, 13
packssdw m0, m1
paddw m0, m4                 ; add back the source pixels
packuswb m0, m0
movq [dstq+wq], m0
add wq, 8
jl .n0_loop
add dstq, stridemp
ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
; Odd-row counterpart of .n0: the 5-tap terms are reused from the slots .n0
; filled (no fresh 565 computation), only the 3-tap 343/444 chain advances;
; then the same weighted-correction output step is applied.
movif64 wq, r4
movif32 wd, w1m
.n1_loop:
; --- 3-tap: new row's 444/343 and rotate the three-row window ---
movu m3, [t4+wq*2+400*4+4]
movu m5, [t4+wq*2+400*4+2]
paddw m3, [t4+wq*2+400*4+0]
paddw m5, m3
psllw m5, 2 ; a3[ 1] 444
psubw m4, m5, m3 ; a3[ 1] 343
paddw m3, m4, [t4+wq*2+400*12]
paddw m3, [t4+wq*2+400*10]   ; a3 = 343[-1] + 444[0] + 343[+1]
mova [t4+wq*2+400*10], m5
mova [t4+wq*2+400*12], m4
movu m1, [t3+wq*4+400*8+ 8]
movu m5, [t3+wq*4+400*8+ 4]
movu m7, [t3+wq*4+400*8+24]
movu m8, [t3+wq*4+400*8+20]
paddd m1, [t3+wq*4+400*8+ 0]
paddd m7, [t3+wq*4+400*8+16]
paddd m5, m1
paddd m8, m7
pslld m5, 2 ; b3[ 1] 444
pslld m8, 2
psubd m4, m5, m1 ; b3[ 1] 343
psubd m0, m8, m7
paddd m1, m4, [t3+wq*4+400*24+ 0]
paddd m7, m0, [t3+wq*4+400*24+16]
paddd m1, [t3+wq*4+400*20+ 0]
paddd m7, [t3+wq*4+400*20+16]
mova [t3+wq*4+400*20+ 0], m5
mova [t3+wq*4+400*20+16], m8
mova [t3+wq*4+400*24+ 0], m4
mova [t3+wq*4+400*24+16], m0
; --- output: dst += (w . (b - a*src) + 4096) >> 13 ---
movq m5, [dstq+wq]           ; 8 source pixels
mova m2, [t4+wq*2+400* 6]    ; a5 stored by .n0 for this row pair
punpcklbw m5, m6             ; widen (m6 == 0)
punpcklwd m4, m5, m6
punpcklwd m8, m2, m6
pmaddwd m8, m4 ; a5 * src
punpcklwd m0, m3, m6
pmaddwd m0, m4 ; a3 * src
punpckhwd m4, m5, m6
punpckhwd m2, m6
pmaddwd m2, m4
punpckhwd m3, m6
pmaddwd m3, m4
psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13)
mova m0, [t3+wq*4+400*12+ 0] ; b5 stored by .n0
psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13)
mova m4, [t3+wq*4+400*12+16]
psubd m4, m2
psubd m7, m3
pslld m1, 7
psrld m0, 8                  ; NOTE: >>8 here vs >>9 in .n0 — the odd-row 5-tap
                             ; terms carry one less bit of accumulation
psrld m4, 8
pslld m7, 7
pandn m3, m9, m1             ; interleave 5-/3-tap terms (m9 = lane mask)
pand m0, m9
por m0, m3
pand m4, m9
pandn m2, m9, m7
por m2, m4
mova m1, [base+pd_4096]      ; rounding bias for the >>13
pmaddwd m0, m15              ; m15 = packed (w0, w1) filter weights
pmaddwd m2, m15
paddd m0, m1
paddd m2, m1
psrad m0, 13
psrad m2, 13
packssdw m0, m2
paddw m0, m5                 ; add back the source pixels
packuswb m0, m0
movq [dstq+wq], m0
add wq, 8
jl .n1_loop
add dstq, stridemp
movif32 dstm, dstq           ; keep the 32-bit spill slot in sync
ret