| ; Copyright © 2018, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; Copyright © 2018, VideoLabs |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "ext/x86/x86inc.asm" |
| |
| SECTION_RODATA 16 |
| |
| pb_right_ext_mask: times 16 db 0xff |
| times 16 db 0 |
| pb_14x0_1_2: times 14 db 0 |
| db 1, 2 |
| pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 |
| db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 |
| pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 |
| pb_0: times 16 db 0 |
| pb_2: times 16 db 2 |
| pb_3: times 16 db 3 |
| pb_4: times 16 db 4 |
| pb_15: times 16 db 15 |
| pb_0_1: times 8 db 0, 1 |
| pb_6_7: times 8 db 6, 7 |
| pb_14_15: times 8 db 14, 15 |
| pw_1: times 8 dw 1 |
| pw_16: times 8 dw 16 |
| pw_128: times 8 dw 128 |
| pw_255: times 8 dw 255 |
| pw_256: times 8 dw 256 |
| pw_2048: times 8 dw 2048 |
| pw_16380: times 8 dw 16380 |
| pw_5_6: times 4 dw 5, 6 |
| pw_0_128: times 4 dw 0, 128 |
| pd_1024: times 4 dd 1024 |
| %if ARCH_X86_32 |
| pd_256: times 4 dd 256 |
| pd_512: times 4 dd 512 |
| pd_2048: times 4 dd 2048 |
| %endif |
| pd_0xF0080029: times 4 dd 0xF0080029 |
| pd_0xF00801C7: times 4 dd 0xF00801C7
| |
| cextern sgr_x_by_x |
| |
| SECTION .text |
| |
| %if ARCH_X86_32 |
| %define PIC_base_offset $$ |
| |
| %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg |
| %assign pic_reg_stk_off 4 |
| %xdefine PIC_reg %1 |
| %if %2 == 1 |
| mov [esp], %1 |
| %endif |
| LEA PIC_reg, PIC_base_offset |
| %if %3 == 1 |
| XCHG_PIC_REG |
| %endif |
| %endmacro |
| |
| %macro XCHG_PIC_REG 0 |
| mov [esp+pic_reg_stk_off], PIC_reg |
| %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8 |
| mov PIC_reg, [esp+pic_reg_stk_off] |
| %endmacro |
| |
| %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) |
| |
| %else |
| %macro XCHG_PIC_REG 0 |
| %endmacro |
| |
| %define PIC_sym(sym) (sym) |
| %endif |
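|
| ; x86-32 lacks RIP-relative addressing, so constants are reached through a
| ; base register holding $$ (the section start); PIC_sym(sym) resolves to
| ; PIC_reg+(sym-PIC_base_offset). XCHG_PIC_REG ping-pongs PIC_reg between two
| ; 4-byte stack slots ([esp+0] and [esp+4]) so the underlying GPR can be
| ; temporarily reused and the PIC base recovered afterwards.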
| |
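| ; PALIGNR: on SSE2, emulate palignr with psrldq/pslldq/por; the temporary is
| ; the register numbered one above dst (regnumof dst + 1), which callers must
| ; keep free.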
| %macro PALIGNR 4 ; dst, src1, src2, shift |
| %if cpuflag(ssse3) |
| palignr %1, %2, %3, %4 |
| %else |
| %assign %%i regnumof%+%1 + 1 |
| %define %%tmp m %+ %%i |
| psrldq %1, %3, %4 |
| pslldq %%tmp, %2, 16-%4 |
| por %1, %%tmp |
| %endif |
| %endmacro |
| |
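| ; PMADDUBSW: on SSE2, emulate pmaddubsw by zero-extending the unsigned bytes
| ; in dst to words and using pmaddwd. Note the coefficient operand differs by
| ; ISA: ssse3 callers pass packed bytes, sse2 callers pass the same taps
| ; sign-extended to words (see the filter setup in WIENER_H below).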
| %macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero |
| %if cpuflag(ssse3) |
| pmaddubsw %1, %2 |
| %else |
| %if %5 == 1 |
| pxor %3, %3 |
| %endif |
| punpckhbw %4, %1, %3 |
| punpcklbw %1, %3 |
| pmaddwd %4, %2 |
| pmaddwd %1, %2 |
| packssdw %1, %4 |
| %endif |
| %endmacro |
| |
| ;;;;;;;;;;;;;;;;;;;;;; |
| ;; wiener ;; |
| ;;;;;;;;;;;;;;;;;;;;;; |
| |
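| ; WIENER_H computes the 7-tap horizontal pass; per output pixel, a scalar
| ; sketch of the code below:
| ;   t      = fh[0]*(s[x-3]+s[x+3]) + fh[1]*(s[x-2]+s[x+2])
| ;          + fh[2]*(s[x-1]+s[x+1]) + fh[3]*s[x]
| ;   mid[x] = ((t + 128*s[x] - 16380) >> 3) + 2048
| ; where the 128*s[x]-16380 term is added with signed saturation (paddsw) to
| ; keep the intermediate in int16 range for the vertical pass.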
| %macro WIENER_H 0 |
| %if ARCH_X86_64 |
| cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge |
| mov edged, edgem |
| movifnidn wd, wm |
| mov hd, hm |
| %else |
| cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge |
| mov r5, edgem |
| mov [esp+12], r5 |
| mov wd, wm |
| mov hd, hm |
| SETUP_PIC hd |
| %define m15 m0 |
| %define m14 m1 |
| %define m13 m2 |
| %define m12 m3 |
| %endif |
| |
| movq m15, [fhq] |
| %if cpuflag(ssse3) |
| pshufb m12, m15, [PIC_sym(pb_6_7)] |
| pshufb m13, m15, [PIC_sym(pb_4)] |
| pshufb m14, m15, [PIC_sym(pb_2)] |
| pshufb m15, m15, [PIC_sym(pb_0)] |
| %else |
| pshuflw m12, m15, q3333 |
| punpcklbw m15, m15 |
| pshufhw m13, m15, q0000 |
| pshuflw m14, m15, q2222 |
| pshuflw m15, m15, q0000 |
| punpcklqdq m12, m12 |
| punpckhqdq m13, m13 |
| punpcklqdq m14, m14 |
| punpcklqdq m15, m15 |
| psraw m13, 8 |
| psraw m14, 8 |
| psraw m15, 8 |
| %endif |
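| ; at this point m15/m14/m13 hold fh[0]/fh[1]/fh[2] and m12 holds fh[3]: as
| ; byte broadcasts for pmaddubsw on ssse3, as sign-extended words for the
| ; pmaddwd-based emulation on sse2 (m12 stays a full word for pmullw)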
| |
| %if ARCH_X86_64 |
| mova m11, [pw_2048] |
| mova m10, [pw_16380] |
| lea r11, [pb_right_ext_mask] |
| |
| DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim |
| %else |
| %define m10 [PIC_sym(pw_16380)] |
| %define m11 [PIC_sym(pw_2048)] |
| %define m12 [esp+0x14] |
| %define m13 [esp+0x24] |
| %define m14 [esp+0x34] |
| %define m15 [esp+0x44] |
| mova m12, m3 |
| mova m13, m2 |
| mova m14, m1 |
| mova m15, m0 |
| |
| DEFINE_ARGS dst, left, src, stride, x, w, h, edge |
| %define srcptrq srcq |
| %define dstptrq dstq |
| %define hd dword [esp+ 0] |
| %define edged dword [esp+12] |
| %define xlimd dword [esp+16] |
| %endif |
| |
| ; if (edge & has_right) align w to 16
| ; else use xlim = -3 so the x loop stops 3 px short of w
| test edged, 2 ; has_right |
| jnz .align |
| mov xlimd, -3 |
| jmp .loop |
| .align: |
| add wd, 15 |
| and wd, ~15 |
| %if ARCH_X86_64 |
| xor xlimd, xlimd |
| %else |
| mov xlimd, 0 |
| %endif |
| |
| ; main y loop for horizontal filter
| .loop: |
| %if ARCH_X86_64 |
| mov srcptrq, srcq |
| mov dstptrq, dstq |
| lea xd, [wq+xlimq] |
| %else |
| mov [esp+8], srcq |
| mov [esp+4], dstq |
| mov xd, xlimd |
| add xd, wd |
| %endif |
| |
| ; load left edge pixels |
| test edged, 1 ; have_left |
| jz .emu_left |
| test leftq, leftq ; left == NULL for the edge-extended bottom/top |
| jz .load_left_combined |
| movd m0, [leftq] |
| movd m1, [srcq] |
| punpckldq m0, m1 |
| pslldq m0, 9 |
| add leftq, 4 |
| jmp .left_load_done |
| .load_left_combined: |
| movq m0, [srcq-3] |
| pslldq m0, 10 |
| jmp .left_load_done |
| .emu_left: |
| movd m0, [srcq] |
| %if cpuflag(ssse3) |
| pshufb m0, [PIC_sym(pb_14x0_1_2)] |
| %else |
| pslldq m1, m0, 13 |
| punpcklbw m0, m0 |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| psrldq m0, 2 |
| por m0, m1 |
| %endif |
| |
| ; load right edge pixels |
| .left_load_done: |
| cmp xd, 16 |
| jg .main_load |
| test xd, xd |
| jg .load_and_splat |
| je .splat_right |
| |
| ; for very small images (w=[1-2]), edge-extend the pixels already loaded
| ; into m0; ugly, but this only runs in very rare cases
| %if cpuflag(ssse3) |
| add wd, wd |
| %if ARCH_X86_64 |
| pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16] |
| %else |
| pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16] |
| %endif |
| shr wd, 1 |
| %else |
| shl wd, 4 |
| pcmpeqd m2, m2 |
| movd m3, wd |
| psrldq m2, 2 |
| punpckhbw m1, m0, m0 |
| pshufhw m1, m1, q1122 |
| psllq m1, m3 |
| pand m0, m2 |
| pandn m2, m1 |
| por m0, m2 |
| shr wd, 4 |
| %endif |
| |
| ; main x loop; in the common case this enters at .main_load
| .splat_right: |
| ; no need to load new pixels, just extend them from the (possibly previously |
| ; extended) previous load into m0 |
| %if cpuflag(ssse3) |
| pshufb m1, m0, [PIC_sym(pb_15)] |
| %else |
| punpckhbw m1, m0, m0 |
| pshufhw m1, m1, q3333 |
| punpckhqdq m1, m1 |
| %endif |
| jmp .main_loop |
| .load_and_splat: |
| ; load new pixels and extend edge for right-most |
| movu m1, [srcptrq+3] |
| %if ARCH_X86_64 |
| sub r11, xq |
| movu m2, [r11+16] |
| add r11, xq |
| %else |
| sub PIC_reg, xd |
| movu m2, [PIC_sym(pb_right_ext_mask)+16] |
| add PIC_reg, xd |
| %endif |
| movd m3, [srcptrq+2+xq] |
| %if cpuflag(ssse3) |
| pshufb m3, [PIC_sym(pb_0)] |
| %else |
| punpcklbw m3, m3 |
| pshuflw m3, m3, q0000 |
| punpcklqdq m3, m3 |
| %endif |
| pand m1, m2 |
| pxor m2, [PIC_sym(pb_right_ext_mask)] |
| pand m3, m2 |
| pxor m2, [PIC_sym(pb_right_ext_mask)] |
| por m1, m3 |
| jmp .main_loop |
| .main_load: |
| ; load subsequent line |
| movu m1, [srcptrq+3] |
| .main_loop: |
| %if ARCH_X86_64 |
| PALIGNR m2, m1, m0, 10 |
| PALIGNR m3, m1, m0, 11 |
| PALIGNR m4, m1, m0, 12 |
| PALIGNR m5, m1, m0, 13 |
| PALIGNR m6, m1, m0, 14 |
| PALIGNR m7, m1, m0, 15 |
| |
| punpcklbw m0, m2, m1 |
| punpckhbw m2, m1 |
| punpcklbw m8, m3, m7 |
| punpckhbw m3, m7 |
| punpcklbw m7, m4, m6 |
| punpckhbw m4, m6 |
| PMADDUBSW m0, m15, m6, m9, 1 |
| PMADDUBSW m2, m15, m6, m9, 0 |
| PMADDUBSW m8, m14, m6, m9, 0 |
| PMADDUBSW m3, m14, m6, m9, 0 |
| PMADDUBSW m7, m13, m6, m9, 0 |
| PMADDUBSW m4, m13, m6, m9, 0 |
| paddw m0, m8 |
| paddw m2, m3 |
| %if cpuflag(ssse3) |
| pxor m6, m6 |
| %endif |
| punpcklbw m3, m5, m6 |
| punpckhbw m5, m6 |
| psllw m8, m3, 7 |
| psllw m6, m5, 7 |
| psubw m8, m10 |
| psubw m6, m10 |
| pmullw m3, m12 |
| pmullw m5, m12 |
| paddw m0, m7 |
| paddw m2, m4 |
| paddw m0, m3 |
| paddw m2, m5 |
| paddsw m0, m8 |
| paddsw m2, m6 |
| psraw m0, 3 |
| psraw m2, 3 |
| paddw m0, m11 |
| paddw m2, m11 |
| mova [dstptrq+ 0], m0 |
| mova [dstptrq+16], m2 |
| %else |
| PALIGNR m2, m1, m0, 10 |
| punpcklbw m3, m2, m1 |
| punpckhbw m2, m1 |
| PMADDUBSW m3, m15, m4, m5, 1 |
| PMADDUBSW m2, m15, m4, m5, 0 |
| PALIGNR m4, m1, m0, 11 |
| PALIGNR m5, m1, m0, 15 |
| punpcklbw m6, m4, m5 |
| punpckhbw m4, m5 |
| PMADDUBSW m6, m14, m5, m7, 1 |
| PMADDUBSW m4, m14, m5, m7, 0 |
| paddw m3, m6 |
| paddw m2, m4 |
| PALIGNR m4, m1, m0, 12 |
| PALIGNR m5, m1, m0, 14 |
| punpcklbw m6, m4, m5 |
| punpckhbw m4, m5 |
| PMADDUBSW m6, m13, m5, m7, 1 |
| PMADDUBSW m4, m13, m5, m7, 0 |
| paddw m3, m6 |
| paddw m2, m4 |
| PALIGNR m6, m1, m0, 13 |
| %if cpuflag(ssse3) |
| pxor m5, m5 |
| %endif |
| punpcklbw m4, m6, m5 |
| punpckhbw m6, m5 |
| psllw m5, m4, 7 |
| psllw m7, m6, 7 |
| psubw m5, m10 |
| psubw m7, m10 |
| pmullw m4, m12 |
| pmullw m6, m12 |
| paddw m3, m4 |
| paddw m2, m6 |
| paddsw m3, m5 |
| paddsw m2, m7 |
| psraw m3, 3 |
| psraw m2, 3 |
| paddw m3, m11 |
| paddw m2, m11 |
| mova [dstptrq+ 0], m3 |
| mova [dstptrq+16], m2 |
| %endif |
| |
| mova m0, m1 |
| add srcptrq, 16 |
| add dstptrq, 32 |
| sub xd, 16 |
| cmp xd, 16 |
| jg .main_load |
| test xd, xd |
| jg .load_and_splat |
| cmp xd, xlimd |
| jg .splat_right |
| |
| %if ARCH_X86_32 |
| mov srcq, [esp+8] |
| mov dstq, [esp+4] |
| %endif |
| add srcq, strideq |
| add dstq, 384*2 |
| dec hd |
| jg .loop |
| RET |
| %endmacro |
| |
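| ; WIENER_V computes the 7-tap vertical pass over the int16 mid[] buffer; per
| ; output pixel, a scalar sketch of the code below:
| ;   t      = fv[0]*(mid[y-3]+mid[y+3]) + fv[1]*(mid[y-2]+mid[y+2])
| ;          + fv[2]*(mid[y-1]+mid[y+1]) + (fv[3]+128)*mid[y]
| ;   dst[y] = clip_u8((t + 1024) >> 11)
| ; the +128 center adjustment comes from the pw_0_128 add during setup.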
| %macro WIENER_V 0 |
| %if ARCH_X86_64 |
| cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge |
| mov edged, edgem |
| movifnidn fvq, fvmp |
| movifnidn hd, hm |
| movq m15, [fvq] |
| pshufd m14, m15, q1111 |
| pshufd m15, m15, q0000 |
| paddw m14, [pw_0_128] |
| mova m12, [pd_1024] |
| |
| DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr |
| |
| mov ylimd, edged |
| and ylimd, 8 ; have_bottom |
| shr ylimd, 2 |
| sub ylimd, 3 |
| %else |
| cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge |
| %define ylimd [esp+12] |
| |
| mov r5d, edgem |
| and r5d, 8 |
| shr r5d, 2 |
| sub r5d, 3 |
| mov ylimd, r5d |
| mov fvq, fvmp |
| mov edged, edgem |
| |
| SETUP_PIC edged |
| |
| movq m0, [fvq] |
| pshufd m1, m0, q1111 |
| pshufd m0, m0, q0000 |
| paddw m1, [PIC_sym(pw_0_128)] |
| mova [esp+0x50], m0 |
| mova [esp+0x40], m1 |
| |
| DEFINE_ARGS dst, stride, mid, w, h, y, edge |
| %define mptrq midq |
| %define dstptrq dstq |
| %define edged dword [esp] |
| %endif |
| |
| ; main x loop for vertical filter, does one column of 8 pixels
| .loop_x: |
| mova m3, [midq] ; middle line |
| |
| ; load top pixels |
| test edged, 4 ; have_top |
| jz .emu_top |
| mova m0, [midq-384*4] |
| mova m2, [midq-384*2] |
| mova m1, m0 |
| jmp .load_bottom_pixels |
| .emu_top: |
| mova m0, m3 |
| mova m1, m3 |
| mova m2, m3 |
| |
| ; load bottom pixels |
| .load_bottom_pixels: |
| mov yd, hd |
| %if ARCH_X86_64 |
| mov mptrq, midq |
| mov dstptrq, dstq |
| add yd, ylimd |
| %else |
| mov [esp+8], midq |
| mov [esp+4], dstq |
| add yd, ylimd |
| %endif |
| jg .load_threelines |
| |
| ; the remainder here is somewhat messy but only runs in very weird |
| ; circumstances at the bottom of the image in very small blocks (h=[1-3]), |
| ; so performance is not terribly important here... |
| je .load_twolines |
| cmp yd, -1 |
| je .load_oneline |
| ; h == 1 case |
| mova m5, m3 |
| mova m4, m3 |
| mova m6, m3 |
| jmp .loop |
| .load_oneline: |
| ; h == 2 case |
| mova m4, [midq+384*2] |
| mova m5, m4 |
| mova m6, m4 |
| jmp .loop |
| .load_twolines: |
| ; h == 3 case |
| mova m4, [midq+384*2] |
| mova m5, [midq+384*4] |
| mova m6, m5 |
| jmp .loop |
| .load_threelines: |
| ; h > 3 case |
| mova m4, [midq+384*2] |
| mova m5, [midq+384*4] |
| ; third line loaded in main loop below |
| |
| ; main y loop for vertical filter |
| .loop_load: |
| ; load one line into m6. If that line is no longer available, do nothing,
| ; since m6 still holds the data from the previous line. The loop is
| ; structured so that the common case is evaluated fastest
| mova m6, [mptrq+384*6] |
| .loop: |
| %if ARCH_X86_64 |
| paddw m7, m0, m6 |
| paddw m8, m1, m5 |
| paddw m9, m2, m4 |
| punpcklwd m10, m7, m8 |
| punpckhwd m7, m8 |
| punpcklwd m11, m9, m3 |
| punpckhwd m9, m3 |
| pmaddwd m10, m15 |
| pmaddwd m7, m15 |
| pmaddwd m11, m14 |
| pmaddwd m9, m14 |
| paddd m10, m12 |
| paddd m7, m12 |
| paddd m10, m11 |
| paddd m7, m9 |
| psrad m10, 11 |
| psrad m7, 11 |
| packssdw m10, m7 |
| packuswb m10, m10 |
| movq [dstptrq], m10 |
| %else |
| mova [esp+0x30], m1 |
| mova [esp+0x20], m2 |
| mova [esp+0x10], m3 |
| paddw m0, m6 |
| paddw m1, m5 |
| paddw m2, m4 |
| punpcklwd m7, m2, m3 |
| punpckhwd m2, m3 |
| punpcklwd m3, m0, m1 |
| punpckhwd m0, m1 |
| mova m1, [esp+0x50] |
| pmaddwd m3, m1 |
| pmaddwd m0, m1 |
| mova m1, [esp+0x40] |
| pmaddwd m7, m1 |
| pmaddwd m2, m1 |
| paddd m3, [PIC_sym(pd_1024)] |
| paddd m0, [PIC_sym(pd_1024)] |
| paddd m3, m7 |
| paddd m0, m2 |
| psrad m3, 11 |
| psrad m0, 11 |
| packssdw m3, m0 |
| packuswb m3, m3 |
| movq [dstq], m3 |
| mova m1, [esp+0x30] |
| mova m2, [esp+0x20] |
| mova m3, [esp+0x10] |
| %endif |
| ; shift pixels one position |
| mova m0, m1 |
| mova m1, m2 |
| mova m2, m3 |
| mova m3, m4 |
| mova m4, m5 |
| mova m5, m6 |
| add dstptrq, strideq |
| add mptrq, 384*2 |
| dec yd |
| jg .loop_load |
| ; for the bottom pixels, continue using m6 (as extended edge) |
| cmp yd, ylimd |
| jg .loop |
| |
| %if ARCH_X86_32 |
| mov midq, [esp+8] |
| mov dstq, [esp+4] |
| %endif |
| add dstq, 8 |
| add midq, 16 |
| sub wd, 8 |
| jg .loop_x |
| RET |
| %endmacro |
| |
| INIT_XMM sse2 |
| WIENER_H |
| WIENER_V |
| |
| INIT_XMM ssse3 |
| WIENER_H |
| WIENER_V |
| |
| ;;;;;;;;;;;;;;;;;;;;;;;;;; |
| ;; self-guided ;; |
| ;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
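| ; MULLD emulates pmulld (SSE4.1) with 16-bit multiplies. The result is only
| ; exact when each dword of the second operand holds the same value in both
| ; of its 16-bit halves, which every call site below arranges. Clobbers m5.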
| %macro MULLD 2 |
| pmulhuw m5, %1, %2 |
| pmullw %1, %2 |
| pslld m5, 16 |
| paddd %1, m5 |
| %endmacro |
| |
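| ; GATHERDD emulates a 4-lane dword gather from the sgr_x_by_x table (base in
| ; r5 on x86-64, reached via PIC_sym on x86-32): lane 0 is loaded with movd,
| ; the remaining lanes are extracted with pextrw and inserted with pinsrw,
| ; building two partial vectors that are merged with por. Clobbers m5 and r6.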
| %macro GATHERDD 2 |
| mova m5, m7 |
| movd r6d, %2 |
| %if ARCH_X86_64 |
| movd %1, [r5+r6] |
| pextrw r6d, %2, 2 |
| pinsrw m5, [r5+r6+2], 3 |
| pextrw r6d, %2, 4 |
| pinsrw %1, [r5+r6+2], 5 |
| pextrw r6d, %2, 6 |
| pinsrw m5, [r5+r6+2], 7 |
| %else |
| movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6] |
| pextrw r6d, %2, 2 |
| pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3 |
| pextrw r6d, %2, 4 |
| pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5 |
| pextrw r6d, %2, 6 |
| pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7 |
| %endif |
| por %1, m5 |
| %endmacro |
| |
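| ; sgr_box3_h: horizontal 3-tap box sums; per output, a scalar sketch:
| ;   sum[x]   = s[x-1] + s[x] + s[x+1]
| ;   sumsq[x] = s[x-1]^2 + s[x]^2 + s[x+1]^2
| ; with s[] edge-extended according to the edge flags.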
| %if ARCH_X86_64 |
| cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim |
| mov xlimd, edgem |
| movifnidn xd, xm |
| mov hd, hm |
| mov edged, xlimd |
| and xlimd, 2 ; have_right |
| add xd, xlimd |
| xor xlimd, 2 ; 2*!have_right |
| %else |
| cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim |
| %define wq r0m |
| %define xlimd r1m |
| %define hd hmp |
| %define edged edgemp |
| |
| mov r6, edgem |
| and r6, 2 ; have_right |
| add xd, r6 |
| xor r6, 2 ; 2*!have_right |
| mov xlimd, r6 |
| SETUP_PIC r6, 0 |
| %endif |
| |
| jnz .no_right |
| add xd, 7 |
| and xd, ~7 |
| .no_right: |
| pxor m1, m1 |
| lea srcq, [srcq+xq] |
| lea sumq, [sumq+xq*2-2] |
| lea sumsqq, [sumsqq+xq*4-4] |
| neg xq |
| mov wq, xq |
| %if ARCH_X86_64 |
| lea r10, [pb_right_ext_mask+16] |
| %endif |
| .loop_y: |
| mov xq, wq |
| |
| ; load left |
| test edged, 1 ; have_left |
| jz .no_left |
| test leftq, leftq |
| jz .load_left_from_main |
| movd m0, [leftq] |
| pslldq m0, 12 |
| add leftq, 4 |
| jmp .expand_x |
| .no_left: |
| movd m0, [srcq+xq] |
| pshufb m0, [PIC_sym(pb_0)] |
| jmp .expand_x |
| .load_left_from_main: |
| movd m0, [srcq+xq-2] |
| pslldq m0, 14 |
| .expand_x: |
| punpckhbw m0, m1
| |
| ; when we reach this, m0 contains left two px in highest words |
| cmp xq, -8 |
| jle .loop_x |
| .partial_load_and_extend: |
| movd m3, [srcq-4] |
| pshufb m3, [PIC_sym(pb_3)] |
| movq m2, [srcq+xq] |
| punpcklbw m2, m1 |
| punpcklbw m3, m1 |
| %if ARCH_X86_64 |
| movu m4, [r10+xq*2] |
| %else |
| movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] |
| %endif |
| pand m2, m4 |
| pandn m4, m3 |
| por m2, m4 |
| jmp .loop_x_noload |
| .right_extend: |
| pshufb m2, m0, [PIC_sym(pb_14_15)] |
| jmp .loop_x_noload |
| |
| .loop_x: |
| movq m2, [srcq+xq] |
| punpcklbw m2, m1 |
| .loop_x_noload: |
| palignr m3, m2, m0, 12 |
| palignr m4, m2, m0, 14 |
| |
| punpcklwd m5, m3, m2 |
| punpckhwd m6, m3, m2 |
| paddw m3, m4 |
| punpcklwd m7, m4, m1 |
| punpckhwd m4, m1 |
| pmaddwd m5, m5 |
| pmaddwd m6, m6 |
| pmaddwd m7, m7 |
| pmaddwd m4, m4 |
| paddd m5, m7 |
| paddd m6, m4 |
| paddw m3, m2 |
| movu [sumq+xq*2], m3 |
| movu [sumsqq+xq*4+ 0], m5 |
| movu [sumsqq+xq*4+16], m6 |
| |
| mova m0, m2 |
| add xq, 8 |
| |
| ; if x <= -8 we can reload more pixels |
| ; else if x < 0 we reload and extend (this implies have_right=0) |
| ; else if x < xlimd we extend from previous load (this implies have_right=0) |
| ; else we are done |
| |
| cmp xd, -8 |
| jle .loop_x |
| test xd, xd |
| jl .partial_load_and_extend |
| cmp xd, xlimd |
| jl .right_extend |
| |
| add sumsqq, (384+16)*4 |
| add sumq, (384+16)*2 |
| add srcq, strideq |
| dec hd |
| jg .loop_y |
| RET |
| |
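| ; sgr_box3_v: vertical 3-tap box sums over the horizontal sums above, so
| ; that sum[]/sumsq[] end up holding full 3x3 box totals; works on 8-pixel
| ; columns with a 3-row sliding window.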
| %if ARCH_X86_64 |
| cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim |
| movifnidn edged, edgem |
| %else |
| cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y |
| %define sumsq_baseq dword [esp+0] |
| %define sum_baseq dword [esp+4] |
| %define ylimd dword [esp+8] |
| %define m8 [esp+12] |
| %endif |
| mov xq, -2 |
| %if ARCH_X86_64 |
| mov ylimd, edged |
| and ylimd, 8 ; have_bottom |
| shr ylimd, 2 |
| sub ylimd, 2 ; -2 if have_bottom=0, else 0 |
| mov sumsq_baseq, sumsqq |
| mov sum_baseq, sumq |
| .loop_x: |
| mov sumsqq, sumsq_baseq |
| mov sumq, sum_baseq |
| lea yd, [hd+ylimd+2] |
| %else |
| mov yd, edged |
| and yd, 8 ; have_bottom |
| shr yd, 2 |
| sub yd, 2 ; -2 if have_bottom=0, else 0 |
| mov sumsq_baseq, sumsqq |
| mov sum_baseq, sumq |
| mov ylimd, yd |
| .loop_x: |
| mov sumsqd, sumsq_baseq |
| mov sumd, sum_baseq |
| lea yd, [hd+2] |
| add yd, ylimd |
| %endif |
| lea sumsqq, [sumsqq+xq*4+4-(384+16)*4] |
| lea sumq, [sumq+xq*2+2-(384+16)*2] |
| test edged, 4 ; have_top |
| jnz .load_top |
| movu m0, [sumsqq+(384+16)*4*1] |
| movu m1, [sumsqq+(384+16)*4*1+16] |
| mova m2, m0 |
| mova m3, m1 |
| mova m4, m0 |
| mova m5, m1 |
| movu m6, [sumq+(384+16)*2*1] |
| mova m7, m6 |
| mova m8, m6 |
| jmp .loop_y_noload |
| .load_top: |
| movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left] |
| movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right] |
| movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left] |
| movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right] |
| movu m6, [sumq-(384+16)*2*1] ; l2 |
| movu m7, [sumq-(384+16)*2*0] ; l1 |
| .loop_y: |
| %if ARCH_X86_64 |
| movu m8, [sumq+(384+16)*2*1] ; l0 |
| %else |
| movu m4, [sumq+(384+16)*2*1] ; l0 |
| mova m8, m4 |
| %endif |
| movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left] |
| movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right] |
| .loop_y_noload: |
| paddd m0, m2 |
| paddd m1, m3 |
| paddw m6, m7 |
| paddd m0, m4 |
| paddd m1, m5 |
| paddw m6, m8 |
| movu [sumsqq+ 0], m0 |
| movu [sumsqq+16], m1 |
| movu [sumq], m6 |
| |
| ; shift position down by one |
| mova m0, m2 |
| mova m1, m3 |
| mova m2, m4 |
| mova m3, m5 |
| mova m6, m7 |
| mova m7, m8 |
| add sumsqq, (384+16)*4 |
| add sumq, (384+16)*2 |
| dec yd |
| jg .loop_y |
| cmp yd, ylimd |
| jg .loop_y_noload |
| add xd, 8 |
| cmp xd, wd |
| jl .loop_x |
| RET |
| |
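| ; sgr_calc_ab1: per pixel, a scalar sketch of the code below (3x3, n=9):
| ;   p  = max(a*9 - b*b, 0)
| ;   z  = (p*s + (1 << 19)) >> 20, clamped
| ;   x  = sgr_x_by_x[z]
| ;   a' = (x*b*455 + (1 << 11)) >> 12
| ;   b' = 256 - x
| ; the pd_0xF00801C7 constant folds the rounding, the table bias and the
| ; saturating clamp on z into one paddusw; 455 is its low half.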
| cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s |
| movifnidn sd, sm |
| sub aq, (384+16-1)*4 |
| sub bq, (384+16-1)*2 |
| add hd, 2 |
| %if ARCH_X86_64 |
| LEA r5, sgr_x_by_x-0xF03 |
| %else |
| SETUP_PIC r5, 0 |
| %endif |
| movd m6, sd |
| pshuflw m6, m6, q0000 |
| punpcklqdq m6, m6 |
| pxor m7, m7 |
| DEFINE_ARGS a, b, w, h, x |
| %if ARCH_X86_64 |
| mova m8, [pd_0xF00801C7] |
| mova m9, [pw_256] |
| psrld m10, m9, 13 ; pd_2048 |
| mova m11, [pb_unpcklwdw] |
| %else |
| %define m8 [PIC_sym(pd_0xF00801C7)] |
| %define m9 [PIC_sym(pw_256)] |
| %define m10 [PIC_sym(pd_2048)] |
| %define m11 [PIC_sym(pb_unpcklwdw)] |
| %endif |
| .loop_y: |
| mov xq, -2 |
| .loop_x: |
| movq m0, [bq+xq*2] |
| movq m1, [bq+xq*2+(384+16)*2] |
| punpcklwd m0, m7 |
| punpcklwd m1, m7 |
| movu m2, [aq+xq*4] |
| movu m3, [aq+xq*4+(384+16)*4] |
| pslld m4, m2, 3 |
| pslld m5, m3, 3 |
| paddd m2, m4 ; aa * 9 |
| paddd m3, m5 |
| pmaddwd m4, m0, m0 |
| pmaddwd m5, m1, m1 |
| pmaddwd m0, m8 |
| pmaddwd m1, m8 |
| psubd m2, m4 ; p = aa * 9 - bb * bb |
| psubd m3, m5 |
| MULLD m2, m6 |
| MULLD m3, m6 |
| paddusw m2, m8 |
| paddusw m3, m8 |
| psrld m2, 20 ; z |
| psrld m3, 20 |
| GATHERDD m4, m2 ; xx |
| GATHERDD m2, m3 |
| psrld m4, 24 |
| psrld m2, 24 |
| packssdw m3, m4, m2 |
| pshufb m4, m11 |
| MULLD m0, m4 |
| pshufb m2, m11 |
| MULLD m1, m2 |
| psubw m5, m9, m3 |
| paddd m0, m10 |
| paddd m1, m10 |
| psrld m0, 12 |
| psrld m1, 12 |
| movq [bq+xq*2], m5 |
| psrldq m5, 8 |
| movq [bq+xq*2+(384+16)*2], m5 |
| movu [aq+xq*4], m0 |
| movu [aq+xq*4+(384+16)*4], m1 |
| add xd, 4 |
| cmp xd, wd |
| jl .loop_x |
| add aq, (384+16)*4*2 |
| add bq, (384+16)*2*2 |
| sub hd, 2 |
| jg .loop_y |
| RET |
| |
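| ; sgr_finish_filter1: 3x3 neighbour weighting of the a/b planes; per pixel,
| ; a scalar sketch of the code below:
| ;   A    = 4*(cross incl. center of the b plane) + 3*(its diagonals)
| ;   B    = the same weighting applied to the a plane
| ;   t[x] = (A*src[x] + B + (1 << 8)) >> 9
| ; computed with running top/ctr/bottom row sums that shift down each line.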
| %if ARCH_X86_64 |
| cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ |
| tmp_base, src_base, a_base, b_base, x, y |
| movifnidn wd, wm |
| mov hd, hm |
| mova m15, [pw_16] |
| mov tmp_baseq, tq |
| mov src_baseq, srcq |
| mov a_baseq, aq |
| mov b_baseq, bq |
| xor xd, xd |
| %else |
| cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y |
| %define tmp_baseq [esp+8] |
| %define src_baseq [esp+12] |
| %define a_baseq [esp+16] |
| %define b_baseq [esp+20] |
| %define wd [esp+24] |
| %define hd [esp+28] |
| mov tmp_baseq, tq |
| mov src_baseq, srcq |
| mov a_baseq, aq |
| mov b_baseq, bq |
| mov wd, xd |
| mov hd, yd |
| xor xd, xd |
| SETUP_PIC yd, 1, 1 |
| jmp .loop_start |
| %endif |
| |
| .loop_x: |
| mov tq, tmp_baseq |
| mov srcq, src_baseq |
| mov aq, a_baseq |
| mov bq, b_baseq |
| %if ARCH_X86_32 |
| .loop_start: |
| movu m0, [bq+xq*2-(384+16)*2-2] |
| movu m2, [bq+xq*2-(384+16)*2+2] |
| mova m1, [bq+xq*2-(384+16)*2] ; b:top |
| paddw m0, m2 ; b:tl+tr |
| movu m2, [bq+xq*2-2] |
| movu m3, [bq+xq*2+2] |
| paddw m1, [bq+xq*2] ; b:top+ctr |
| paddw m2, m3 ; b:l+r |
| mova [esp+0x80], m0 |
| mova [esp+0x70], m1 |
| mova [esp+0x60], m2 |
| %endif |
| movu m0, [aq+xq*4-(384+16)*4-4] |
| movu m2, [aq+xq*4-(384+16)*4+4] |
| mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] |
| paddd m0, m2 ; a:tl+tr [first half] |
| movu m2, [aq+xq*4-(384+16)*4-4+16] |
| movu m4, [aq+xq*4-(384+16)*4+4+16] |
| mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half] |
| paddd m2, m4 ; a:tl+tr [second half] |
| movu m4, [aq+xq*4-4] |
| movu m5, [aq+xq*4+4] |
| paddd m1, [aq+xq*4] ; a:top+ctr [first half] |
| paddd m4, m5 ; a:l+r [first half] |
| movu m5, [aq+xq*4+16-4] |
| movu m6, [aq+xq*4+16+4] |
| paddd m3, [aq+xq*4+16] ; a:top+ctr [second half] |
| paddd m5, m6 ; a:l+r [second half] |
| %if ARCH_X86_64 |
| movu m6, [bq+xq*2-(384+16)*2-2] |
| movu m8, [bq+xq*2-(384+16)*2+2] |
| mova m7, [bq+xq*2-(384+16)*2] ; b:top |
| paddw m6, m8 ; b:tl+tr |
| movu m8, [bq+xq*2-2] |
| movu m9, [bq+xq*2+2] |
| paddw m7, [bq+xq*2] ; b:top+ctr |
| paddw m8, m9 ; b:l+r |
| %endif |
| |
| lea tq, [tq+xq*2] |
| lea srcq, [srcq+xq*1] |
| lea aq, [aq+xq*4+(384+16)*4] |
| lea bq, [bq+xq*2+(384+16)*2] |
| mov yd, hd |
| .loop_y: |
| %if ARCH_X86_64 |
| movu m9, [bq-2] |
| movu m10, [bq+2] |
| paddw m7, [bq] ; b:top+ctr+bottom |
| paddw m9, m10 ; b:bl+br |
| paddw m10, m7, m8 ; b:top+ctr+bottom+l+r |
| paddw m6, m9 ; b:tl+tr+bl+br |
| psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom |
| paddw m10, m6 |
| psllw m10, 2 |
| psubw m10, m6 ; aa |
| pxor m14, m14 |
| movq m12, [srcq] |
| punpcklbw m12, m14 |
| punpcklwd m6, m10, m15 |
| punpckhwd m10, m15 |
| punpcklwd m13, m12, m15 |
| punpckhwd m12, m15 |
| pmaddwd m6, m13 ; aa*src[x]+256 [first half] |
| pmaddwd m10, m12 ; aa*src[x]+256 [second half] |
| %else |
| paddd m1, [aq] ; a:top+ctr+bottom [first half] |
| paddd m3, [aq+16] ; a:top+ctr+bottom [second half] |
| mova [esp+0x50], m1 |
| mova [esp+0x40], m3 |
| mova [esp+0x30], m4 |
| movu m6, [aq-4] |
| movu m7, [aq+4] |
| paddd m1, m4 ; a:top+ctr+bottom+l+r [first half] |
| paddd m3, m5 ; a:top+ctr+bottom+l+r [second half] |
| paddd m6, m7 ; a:bl+br [first half] |
| movu m7, [aq+16-4] |
| movu m4, [aq+16+4] |
| paddd m7, m4 ; a:bl+br [second half] |
| paddd m0, m6 ; a:tl+tr+bl+br [first half] |
| paddd m2, m7 ; a:tl+tr+bl+br [second half] |
| paddd m1, m0 |
| paddd m3, m2 |
| pslld m1, 2 |
| pslld m3, 2 |
| psubd m1, m0 ; bb [first half] |
| psubd m3, m2 ; bb [second half] |
| %endif |
| |
| %if ARCH_X86_64 |
| movu m11, [aq-4] |
| movu m12, [aq+4] |
| paddd m1, [aq] ; a:top+ctr+bottom [first half] |
| paddd m11, m12 ; a:bl+br [first half] |
| movu m12, [aq+16-4] |
| movu m13, [aq+16+4] |
| paddd m3, [aq+16] ; a:top+ctr+bottom [second half] |
| paddd m12, m13 ; a:bl+br [second half] |
| paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] |
| paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half] |
| paddd m0, m11 ; a:tl+tr+bl+br [first half] |
| paddd m2, m12 ; a:tl+tr+bl+br [second half] |
| paddd m13, m0 |
| paddd m14, m2 |
| pslld m13, 2 |
| pslld m14, 2 |
| psubd m13, m0 ; bb [first half] |
| psubd m14, m2 ; bb [second half] |
| psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] |
| psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] |
| %else |
| mova m4, [esp+0x80] |
| mova [esp+0x80], m5 |
| mova m5, [esp+0x70] |
| mova [esp+0x70], m6 |
| mova m6, [esp+0x60] |
| mova [esp+0x60], m7 |
| mova [esp+0x20], m1 |
| movu m7, [bq-2] |
| movu m1, [bq+2] |
| paddw m5, [bq] ; b:top+ctr+bottom |
| paddw m7, m1 |
| paddw m1, m5, m6 ; b:top+ctr+bottom+l+r |
| paddw m4, m7 ; b:tl+tr+bl+br |
| psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom |
| paddw m1, m4 |
| psllw m1, 2 |
| psubw m1, m4 ; aa |
| movq m0, [srcq] |
| XCHG_PIC_REG |
| punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16] |
| punpcklwd m4, m1, [PIC_sym(pw_16)] |
| punpckhwd m1, [PIC_sym(pw_16)] |
| punpcklwd m2, m0, [PIC_sym(pw_16)] |
| punpckhwd m0, [PIC_sym(pw_16)] |
| XCHG_PIC_REG |
| pmaddwd m4, m2 ; aa*src[x]+256 [first half] |
| pmaddwd m1, m0 ; aa*src[x]+256 [second half] |
| %endif |
| |
| %if ARCH_X86_64 |
| paddd m6, m13 |
| paddd m10, m14 |
| psrad m6, 9 |
| psrad m10, 9 |
| packssdw m6, m10 |
| mova [tq], m6 |
| %else |
| paddd m4, [esp+0x20] |
| paddd m1, m3 |
| psrad m4, 9 |
| psrad m1, 9 |
| packssdw m4, m1 |
| mova [tq], m4 |
| %endif |
| |
| ; shift to next row |
| %if ARCH_X86_64 |
| mova m0, m4 |
| mova m2, m5 |
| mova m4, m11 |
| mova m5, m12 |
| mova m6, m8 |
| mova m8, m9 |
| %else |
| mova m1, [esp+0x50] |
| mova m3, [esp+0x40] |
| mova m0, [esp+0x30] |
| mova m2, [esp+0x80] |
| mova m4, [esp+0x70] |
| mova [esp+0x70], m5 |
| mova m5, [esp+0x60] |
| mova [esp+0x80], m6 |
| mova [esp+0x60], m7 |
| psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] |
| psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] |
| %endif |
| |
| add aq, (384+16)*4 |
| add bq, (384+16)*2 |
| add tq, 384*2 |
| add srcq, strideq |
| dec yd |
| jg .loop_y |
| add xd, 8 |
| cmp xd, wd |
| jl .loop_x |
| RET |
| |
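| ; sgr_weighted1: blend the filtered plane back into the source; per pixel,
| ; a scalar sketch with u = src[x] << 4:
| ;   dst[x] = clip_u8(src[x] + (((t[x] - u)*wt + (1 << 10)) >> 11))
| ; pmulhrsw by wt << 4 performs the multiply, rounding and shift in one step.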
| cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt |
| movifnidn hd, hm |
| %if ARCH_X86_32 |
| SETUP_PIC r6, 0 |
| %endif |
| movd m0, wtm |
| pshufb m0, [PIC_sym(pb_0_1)] |
| psllw m0, 4 |
| pxor m7, m7 |
| DEFINE_ARGS dst, stride, t, w, h, idx |
| .loop_y: |
| xor idxd, idxd |
| .loop_x: |
| mova m1, [tq+idxq*2+ 0] |
| mova m4, [tq+idxq*2+16] |
| mova m5, [dstq+idxq] |
| punpcklbw m2, m5, m7 |
| punpckhbw m5, m7 |
| psllw m3, m2, 4 |
| psllw m6, m5, 4 |
| psubw m1, m3 |
| psubw m4, m6 |
| pmulhrsw m1, m0 |
| pmulhrsw m4, m0 |
| paddw m1, m2 |
| paddw m4, m5 |
| packuswb m1, m4 |
| mova [dstq+idxq], m1 |
| add idxd, 16 |
| cmp idxd, wd |
| jl .loop_x |
| add dstq, strideq |
| add tq, 384 * 2 |
| dec hd |
| jg .loop_y |
| RET |
| |
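| ; sgr_box5_h: horizontal 5-tap box sums; per output, a scalar sketch:
| ;   sum[x]   = s[x-2] + s[x-1] + s[x] + s[x+1] + s[x+2]
| ;   sumsq[x] = the same 5-tap sum over squared pixels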
| %if ARCH_X86_64 |
| cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim |
| mov edged, edgem |
| movifnidn wd, wm |
| mov hd, hm |
| mova m10, [pb_0] |
| mova m11, [pb_0_1] |
| %else |
| cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge |
| %define edged edgemp |
| %define wd xd |
| %define wq wd |
| %define wm r5m |
| %define strideq r4m |
| SUB esp, 8 |
| SETUP_PIC sumsqd, 1, 1 |
| |
| %define m10 [PIC_sym(pb_0)] |
| %define m11 [PIC_sym(pb_0_1)] |
| %endif |
| |
| test edged, 2 ; have_right |
| jz .no_right |
| xor xlimd, xlimd |
| add wd, 2 |
| add wd, 15 |
| and wd, ~15 |
| jmp .right_done |
| .no_right: |
| mov xlimd, 3 |
| dec wd |
| .right_done: |
| pxor m1, m1 |
| lea srcq, [srcq+wq+1] |
| lea sumq, [sumq+wq*2-2] |
| lea sumsqq, [sumsqq+wq*4-4] |
| neg wq |
| %if ARCH_X86_64 |
| lea r10, [pb_right_ext_mask+16] |
| %else |
| mov wm, xd |
| %define wq wm |
| %endif |
| |
| .loop_y: |
| mov xq, wq |
| ; load left |
| test edged, 1 ; have_left |
| jz .no_left |
| test leftq, leftq |
| jz .load_left_from_main |
| movd m0, [leftq] |
| movd m2, [srcq+xq-1] |
| pslldq m2, 4 |
| por m0, m2 |
| pslldq m0, 11 |
| add leftq, 4 |
| jmp .expand_x |
| .no_left: |
| movd m0, [srcq+xq-1] |
| XCHG_PIC_REG |
| pshufb m0, m10 |
| XCHG_PIC_REG |
| jmp .expand_x |
| .load_left_from_main: |
| movd m0, [srcq+xq-4] |
| pslldq m0, 12 |
| .expand_x: |
| punpckhbw m0, m1 |
| |
| ; when we reach this, m0 contains left four px in highest words
| cmp xq, -8 |
| jle .loop_x |
| test xq, xq |
| jge .right_extend |
| .partial_load_and_extend: |
| XCHG_PIC_REG |
| movd m3, [srcq-1] |
| movq m2, [srcq+xq] |
| pshufb m3, m10 |
| punpcklbw m3, m1 |
| punpcklbw m2, m1 |
| %if ARCH_X86_64 |
| movu m4, [r10+xq*2] |
| %else |
| movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] |
| XCHG_PIC_REG |
| %endif |
| pand m2, m4 |
| pandn m4, m3 |
| por m2, m4 |
| jmp .loop_x_noload |
| .right_extend: |
| psrldq m2, m0, 14 |
| XCHG_PIC_REG |
| pshufb m2, m11 |
| XCHG_PIC_REG |
| jmp .loop_x_noload |
| |
| .loop_x: |
| movq m2, [srcq+xq] |
| punpcklbw m2, m1 |
| .loop_x_noload: |
| palignr m3, m2, m0, 8 |
| palignr m4, m2, m0, 10 |
| palignr m5, m2, m0, 12 |
| palignr m6, m2, m0, 14 |
| |
| %if ARCH_X86_64 |
| paddw m0, m3, m2 |
| punpcklwd m7, m3, m2 |
| punpckhwd m3, m2 |
| paddw m0, m4 |
| punpcklwd m8, m4, m5 |
| punpckhwd m4, m5 |
| paddw m0, m5 |
| punpcklwd m9, m6, m1 |
| punpckhwd m5, m6, m1 |
| paddw m0, m6 |
| pmaddwd m7, m7 |
| pmaddwd m3, m3 |
| pmaddwd m8, m8 |
| pmaddwd m4, m4 |
| pmaddwd m9, m9 |
| pmaddwd m5, m5 |
| paddd m7, m8 |
| paddd m3, m4 |
| paddd m7, m9 |
| paddd m3, m5 |
| movu [sumq+xq*2], m0 |
| movu [sumsqq+xq*4+ 0], m7 |
| movu [sumsqq+xq*4+16], m3 |
| %else |
| paddw m0, m3, m2 |
| paddw m0, m4 |
| paddw m0, m5 |
| paddw m0, m6 |
| movu [sumq+xq*2], m0 |
| punpcklwd m7, m3, m2 |
| punpckhwd m3, m2 |
| punpcklwd m0, m4, m5 |
| punpckhwd m4, m5 |
| punpckhwd m5, m6, m1 |
| pmaddwd m7, m7 |
| pmaddwd m3, m3 |
| pmaddwd m0, m0 |
| pmaddwd m4, m4 |
| pmaddwd m5, m5 |
| paddd m7, m0 |
| paddd m3, m4 |
| paddd m3, m5 |
| punpcklwd m0, m6, m1 |
| pmaddwd m0, m0 |
| paddd m7, m0 |
| movu [sumsqq+xq*4+ 0], m7 |
| movu [sumsqq+xq*4+16], m3 |
| %endif |
| |
| mova m0, m2 |
| add xq, 8 |
| |
| ; if x <= -8 we can reload more pixels |
| ; else if x < 0 we reload and extend (this implies have_right=0) |
| ; else if x < xlimd we extend from previous load (this implies have_right=0) |
| ; else we are done |
| |
| cmp xq, -8 |
| jle .loop_x |
| test xq, xq |
| jl .partial_load_and_extend |
| cmp xq, xlimq |
| jl .right_extend |
| |
| add sumsqq, (384+16)*4 |
| add sumq, (384+16)*2 |
| add srcq, strideq |
| dec hd |
| jg .loop_y |
| %if ARCH_X86_32 |
| ADD esp, 8 |
| %endif |
| RET |
| |
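| ; sgr_box5_v: vertical 5-tap box sums over the horizontal sums, advancing
| ; two rows per iteration; the x86-32 version runs separate passes for sumsq
| ; and sum, keeping each sliding window within the eight available XMM regs.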
| %if ARCH_X86_64 |
| cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim |
| movifnidn edged, edgem |
| mov ylimd, edged |
| %else |
| cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr |
| %define wm [esp+0] |
| %define hm [esp+4] |
| %define edgem [esp+8] |
| mov wm, xd |
| mov hm, yd |
| mov edgem, ylimd |
| %endif |
| |
| and ylimd, 8 ; have_bottom |
| shr ylimd, 2 |
| sub ylimd, 3 ; -3 if have_bottom=0, else -1 |
| mov xq, -2 |
| %if ARCH_X86_64 |
| .loop_x: |
| lea yd, [hd+ylimd+2] |
| lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] |
| lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2] |
| test edged, 4 ; have_top |
| jnz .load_top |
| movu m0, [sumsq_ptrq+(384+16)*4*1] |
| movu m1, [sumsq_ptrq+(384+16)*4*1+16] |
| mova m2, m0 |
| mova m3, m1 |
| mova m4, m0 |
| mova m5, m1 |
| mova m6, m0 |
| mova m7, m1 |
| movu m10, [sum_ptrq+(384+16)*2*1] |
| mova m11, m10 |
| mova m12, m10 |
| mova m13, m10 |
| jmp .loop_y_second_load |
| .load_top: |
| movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] |
| movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] |
| movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] |
| movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] |
| mova m2, m0 |
| mova m3, m1 |
| movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 |
| movu m12, [sum_ptrq-(384+16)*2*0] ; l2 |
| mova m11, m10 |
| .loop_y: |
| movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] |
| movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] |
| movu m13, [sum_ptrq+(384+16)*2*1] ; l1 |
| .loop_y_second_load: |
| test yd, yd |
| jle .emulate_second_load |
| movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] |
| movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] |
| movu m14, [sum_ptrq+(384+16)*2*2] ; l0 |
| .loop_y_noload: |
| paddd m0, m2 |
| paddd m1, m3 |
| paddw m10, m11 |
| paddd m0, m4 |
| paddd m1, m5 |
| paddw m10, m12 |
| paddd m0, m6 |
| paddd m1, m7 |
| paddw m10, m13 |
| paddd m0, m8 |
| paddd m1, m9 |
| paddw m10, m14 |
| movu [sumsq_ptrq+ 0], m0 |
| movu [sumsq_ptrq+16], m1 |
| movu [sum_ptrq], m10 |
| |
| ; shift position down by one |
| mova m0, m4 |
| mova m1, m5 |
| mova m2, m6 |
| mova m3, m7 |
| mova m4, m8 |
| mova m5, m9 |
| mova m10, m12 |
| mova m11, m13 |
| mova m12, m14 |
| add sumsq_ptrq, (384+16)*4*2 |
| add sum_ptrq, (384+16)*2*2 |
| sub yd, 2 |
| jge .loop_y |
| ; l1 = l0 |
| mova m6, m8 |
| mova m7, m9 |
| mova m13, m14 |
| cmp yd, ylimd |
| jg .loop_y_noload |
| add xd, 8 |
| cmp xd, wd |
| jl .loop_x |
| RET |
| .emulate_second_load: |
| mova m8, m6 |
| mova m9, m7 |
| mova m14, m13 |
| jmp .loop_y_noload |
| %else |
| .sumsq_loop_x: |
| lea yd, [ylimd+2] |
| add yd, hm |
| lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] |
| test dword edgem, 4 ; have_top |
| jnz .sumsq_load_top |
| movu m0, [sumsq_ptrq+(384+16)*4*1] |
| movu m1, [sumsq_ptrq+(384+16)*4*1+16] |
| mova m4, m0 |
| mova m5, m1 |
| mova m6, m0 |
| mova m7, m1 |
| mova [esp+0x1c], m0 |
| mova [esp+0x0c], m1 |
| jmp .sumsq_loop_y_second_load |
| .sumsq_load_top: |
| movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] |
| movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] |
| movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] |
| movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] |
| mova [esp+0x1c], m0 |
| mova [esp+0x0c], m1 |
| .sumsq_loop_y: |
| movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] |
| movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] |
| .sumsq_loop_y_second_load: |
| test yd, yd |
| jle .sumsq_emulate_second_load |
| movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] |
| movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] |
| .sumsq_loop_y_noload: |
| paddd m0, [esp+0x1c] |
| paddd m1, [esp+0x0c] |
| paddd m0, m4 |
| paddd m1, m5 |
| paddd m0, m6 |
| paddd m1, m7 |
| paddd m0, m2 |
| paddd m1, m3 |
| movu [sumsq_ptrq+ 0], m0 |
| movu [sumsq_ptrq+16], m1 |
| |
| ; shift position down by one |
| mova m0, m4 |
| mova m1, m5 |
| mova m4, m2 |
| mova m5, m3 |
| mova [esp+0x1c], m6 |
| mova [esp+0x0c], m7 |
| add sumsq_ptrq, (384+16)*4*2 |
| sub yd, 2 |
| jge .sumsq_loop_y |
| ; l1 = l0 |
| mova m6, m2 |
| mova m7, m3 |
| cmp yd, ylimd |
| jg .sumsq_loop_y_noload |
| add xd, 8 |
| cmp xd, wm |
| jl .sumsq_loop_x |
| |
| mov xd, -2 |
| .sum_loop_x: |
| lea yd, [ylimd+2] |
| add yd, hm |
| lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] |
| test dword edgem, 4 ; have_top |
| jnz .sum_load_top |
| movu m0, [sum_ptrq+(384+16)*2*1] |
| mova m1, m0 |
| mova m2, m0 |
| mova m3, m0 |
| jmp .sum_loop_y_second_load |
| .sum_load_top: |
| movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4 |
| movu m2, [sum_ptrq-(384+16)*2*0] ; l2 |
| mova m1, m0 |
| .sum_loop_y: |
| movu m3, [sum_ptrq+(384+16)*2*1] ; l1 |
| .sum_loop_y_second_load: |
| test yd, yd |
| jle .sum_emulate_second_load |
| movu m4, [sum_ptrq+(384+16)*2*2] ; l0 |
| .sum_loop_y_noload: |
| paddw m0, m1 |
| paddw m0, m2 |
| paddw m0, m3 |
| paddw m0, m4 |
| movu [sum_ptrq], m0 |
| |
| ; shift position down by one |
| mova m0, m2 |
| mova m1, m3 |
| mova m2, m4 |
| add sum_ptrq, (384+16)*2*2 |
| sub yd, 2 |
| jge .sum_loop_y |
| ; l1 = l0 |
| mova m3, m4 |
| cmp yd, ylimd |
| jg .sum_loop_y_noload |
| add xd, 8 |
| cmp xd, wm |
| jl .sum_loop_x |
| RET |
| .sumsq_emulate_second_load: |
| mova m2, m6 |
| mova m3, m7 |
| jmp .sumsq_loop_y_noload |
| .sum_emulate_second_load: |
| mova m4, m3 |
| jmp .sum_loop_y_noload |
| %endif |
| |
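| ; sgr_calc_ab2: the 5x5 (n=25) variant of sgr_calc_ab1; per pixel, a scalar
| ; sketch of the code below:
| ;   p  = max(a*25 - b*b, 0)
| ;   z  = (p*s + (1 << 19)) >> 20, clamped
| ;   x  = sgr_x_by_x[z]
| ;   a' = (x*b*41 + (1 << 9)) >> 10      ; == (x*b*164 + (1 << 11)) >> 12
| ;   b' = 256 - x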
| cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s |
| movifnidn sd, sm |
| sub aq, (384+16-1)*4 |
| sub bq, (384+16-1)*2 |
| add hd, 2 |
| %if ARCH_X86_64 |
| LEA r5, sgr_x_by_x-0xF03 |
| %else |
| SETUP_PIC r5, 0 |
| %endif |
| movd m6, sd |
| pshuflw m6, m6, q0000 |
| punpcklqdq m6, m6 |
| pxor m7, m7 |
| DEFINE_ARGS a, b, w, h, x |
| %if ARCH_X86_64 |
| mova m8, [pd_0xF0080029] |
| mova m9, [pw_256] |
| psrld m10, m9, 15 ; pd_512 |
| %else |
| %define m8 [PIC_sym(pd_0xF0080029)] |
| %define m9 [PIC_sym(pw_256)] |
| %define m10 [PIC_sym(pd_512)] |
| %endif |
| .loop_y: |
| mov xq, -2 |
| .loop_x: |
| movq m0, [bq+xq*2+0] |
| movq m1, [bq+xq*2+8] |
| punpcklwd m0, m7 |
| punpcklwd m1, m7 |
| movu m2, [aq+xq*4+ 0] |
| movu m3, [aq+xq*4+16] |
| pslld m4, m2, 3 ; aa * 8 |
| pslld m5, m3, 3 |
| paddd m2, m4 ; aa * 9 |
| paddd m3, m5 |
| paddd m4, m4 ; aa * 16 |
| paddd m5, m5 |
| paddd m2, m4 ; aa * 25 |
| paddd m3, m5 |
| pmaddwd m4, m0, m0 |
| pmaddwd m5, m1, m1 |
| psubd m2, m4 ; p = aa * 25 - bb * bb |
| psubd m3, m5 |
| MULLD m2, m6 |
| MULLD m3, m6 |
| paddusw m2, m8 |
| paddusw m3, m8 |
| psrld m2, 20 ; z |
| psrld m3, 20 |
| GATHERDD m4, m2 ; xx |
| GATHERDD m2, m3 |
| psrld m4, 24 |
| psrld m2, 24 |
| packssdw m3, m4, m2 |
| pmullw m4, m8 |
| pmullw m2, m8 |
| psubw m5, m9, m3 |
| pmaddwd m0, m4 |
| pmaddwd m1, m2 |
| paddd m0, m10 |
| paddd m1, m10 |
| psrld m0, 10 |
| psrld m1, 10 |
| movu [bq+xq*2], m5 |
| movu [aq+xq*4+ 0], m0 |
| movu [aq+xq*4+16], m1 |
| add xd, 8 |
| cmp xd, wd |
| jl .loop_x |
| add aq, (384+16)*4*2 |
| add bq, (384+16)*2*2 |
| sub hd, 2 |
| jg .loop_y |
| RET |
| |
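| ; sgr_finish_filter2: the 5x5 neighbour weighting, two rows per iteration.
| ; Each row of the b plane gets the weighted sum 6*b[x] + 5*(b[x-1]+b[x+1])
| ; (pw_5_6), and likewise for the a plane; a scalar sketch of the outputs:
| ;   even rows: A = sums of the rows above and below
| ;              t[x] = (A*src[x] + B + (1 << 8)) >> 9
| ;   odd rows:  A = the current row's sum alone
| ;              t[x] = (A*src[x] + B + (1 << 7)) >> 8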
| %if ARCH_X86_64 |
| cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ |
| tmp_base, src_base, a_base, b_base, x, y |
| movifnidn wd, wm |
| mov hd, hm |
| mov tmp_baseq, tq |
| mov src_baseq, srcq |
| mov a_baseq, aq |
| mov b_baseq, bq |
| mova m9, [pw_5_6] |
| mova m12, [pw_256] |
| psrlw m10, m12, 8 ; pw_1 |
| psrlw m11, m12, 1 ; pw_128 |
| pxor m13, m13 |
| %else |
| cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y |
| %define tmp_baseq r0m |
| %define src_baseq r1m |
| %define a_baseq r3m |
| %define b_baseq r4m |
| %define wd r5m |
| %define hd r6m |
| |
| SUB esp, 8 |
| SETUP_PIC yd |
| |
| %define m8 m5 |
| %define m9 [PIC_sym(pw_5_6)] |
| %define m10 [PIC_sym(pw_1)] |
| %define m11 [PIC_sym(pw_128)] |
| %define m12 [PIC_sym(pw_256)] |
| %define m13 m0 |
| %endif |
| xor xd, xd |
| .loop_x: |
| mov tq, tmp_baseq |
| mov srcq, src_baseq |
| mov aq, a_baseq |
| mov bq, b_baseq |
| movu m0, [aq+xq*4-(384+16)*4-4] |
| mova m1, [aq+xq*4-(384+16)*4] |
| movu m2, [aq+xq*4-(384+16)*4+4] |
| movu m3, [aq+xq*4-(384+16)*4-4+16] |
| mova m4, [aq+xq*4-(384+16)*4+16] |
| movu m5, [aq+xq*4-(384+16)*4+4+16] |
| paddd m0, m2 |
| paddd m3, m5 |
| paddd m0, m1 |
| paddd m3, m4 |
| pslld m2, m0, 2 |
| pslld m5, m3, 2 |
| paddd m2, m0 |
| paddd m5, m3 |
| paddd m0, m2, m1 ; prev_odd_b [first half] |
| paddd m1, m5, m4 ; prev_odd_b [second half] |
| movu m3, [bq+xq*2-(384+16)*2-2] |
| mova m4, [bq+xq*2-(384+16)*2] |
| movu m5, [bq+xq*2-(384+16)*2+2] |
| paddw m3, m5 |
| punpcklwd m5, m3, m4 |
| punpckhwd m3, m4 |
| pmaddwd m5, m9 |
| pmaddwd m3, m9 |
| mova m2, m5 |
| packssdw m2, m3 ; prev_odd_a |
| lea tq, [tq+xq*2] |
| lea srcq, [srcq+xq*1] |
| lea aq, [aq+xq*4+(384+16)*4] |
| lea bq, [bq+xq*2+(384+16)*2] |
| %if ARCH_X86_32 |
| mov [esp], PIC_reg |
| %endif |
| mov yd, hd |
| XCHG_PIC_REG |
| .loop_y: |
| movu m3, [aq-4] |
| mova m4, [aq] |
| movu m5, [aq+4] |
| paddd m3, m5 |
| paddd m3, m4 |
| pslld m5, m3, 2 |
| paddd m5, m3 |
| paddd m5, m4 ; cur_odd_b [first half] |
| movu m3, [aq+16-4] |
| mova m6, [aq+16] |
| movu m7, [aq+16+4] |
| paddd m3, m7 |
| paddd m3, m6 |
| pslld m7, m3, 2 |
| paddd m7, m3 |
| paddd m4, m7, m6 ; cur_odd_b [second half] |
| movu m3, [bq-2] |
| mova m6, [bq] |
| movu m7, [bq+2] |
| paddw m3, m7 |
| punpcklwd m7, m3, m6 |
| punpckhwd m3, m6 |
| pmaddwd m7, m9 |
| pmaddwd m3, m9 |
| packssdw m6, m7, m3 ; cur_odd_a |
| |
| paddd m0, m5 ; cur_even_b [first half] |
| paddd m1, m4 ; cur_even_b [second half] |
| paddw m2, m6 ; cur_even_a |
| |
| movq m3, [srcq] |
| %if ARCH_X86_64 |
| punpcklbw m3, m13 |
| %else |
| mova [td], m5 |
| pxor m7, m7 |
| punpcklbw m3, m7 |
| %endif |
| punpcklwd m7, m3, m10 |
| punpckhwd m3, m10 |
| punpcklwd m8, m2, m12 |
| punpckhwd m2, m12 |
| pmaddwd m7, m8 |
| pmaddwd m3, m2 |
| paddd m7, m0 |
| paddd m3, m1 |
| psrad m7, 9 |
| psrad m3, 9 |
| |
| %if ARCH_X86_32 |
| pxor m13, m13 |
| %endif |
| movq m8, [srcq+strideq] |
| punpcklbw m8, m13 |
| punpcklwd m0, m8, m10 |
| punpckhwd m8, m10 |
| punpcklwd m1, m6, m11 |
| punpckhwd m2, m6, m11 |
| pmaddwd m0, m1 |
| pmaddwd m8, m2 |
| %if ARCH_X86_64 |
| paddd m0, m5 |
| %else |
| paddd m0, [td] |
| %endif |
| paddd m8, m4 |
| psrad m0, 8 |
| psrad m8, 8 |
| |
| packssdw m7, m3 |
| packssdw m0, m8 |
| %if ARCH_X86_32 |
| mova m5, [td] |
| %endif |
| mova [tq+384*2*0], m7 |
| mova [tq+384*2*1], m0 |
| |
| mova m0, m5 |
| mova m1, m4 |
| mova m2, m6 |
| add aq, (384+16)*4*2 |
| add bq, (384+16)*2*2 |
| add tq, 384*2*2 |
| lea srcq, [srcq+strideq*2] |
| %if ARCH_X86_64 |
| sub yd, 2 |
| %else |
| sub dword [esp+4], 2 |
| %endif |
| jg .loop_y |
| add xd, 8 |
| cmp xd, wd |
| jl .loop_x |
| %if ARCH_X86_32 |
| ADD esp, 8 |
| %endif |
| RET |
| |
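| ; sgr_weighted2: blend both self-guided passes; per pixel, a scalar sketch
| ; with u = src[x] << 4:
| ;   dst[x] = clip_u8(src[x] +
| ;            ((wt[0]*(t1[x] - u) + wt[1]*(t2[x] - u) + (1 << 10)) >> 11))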
| cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt |
| movifnidn wd, wm |
| mov wtq, wtmp |
| %if ARCH_X86_64 |
| movifnidn hd, hm |
| mova m10, [pd_1024] |
| pxor m11, m11 |
| %else |
| SETUP_PIC hd, 0 |
| %define m10 [PIC_sym(pd_1024)] |
| %define m11 m7 |
| %endif |
| movd m0, [wtq] |
| pshufd m0, m0, 0 |
| DEFINE_ARGS dst, stride, t1, t2, w, h, idx |
| %if ARCH_X86_32 |
| %define hd hmp |
| %endif |
| |
| .loop_y: |
| xor idxd, idxd |
| .loop_x: |
| mova m1, [t1q+idxq*2+ 0] |
| mova m2, [t1q+idxq*2+16] |
| mova m3, [t2q+idxq*2+ 0] |
| mova m4, [t2q+idxq*2+16] |
| mova m6, [dstq+idxq] |
| %if ARCH_X86_32 |
| pxor m11, m11 |
| %endif |
| punpcklbw m5, m6, m11 |
| punpckhbw m6, m11 |
| psllw m7, m5, 4 |
| psubw m1, m7 |
| psubw m3, m7 |
| psllw m7, m6, 4 |
| psubw m2, m7 |
| psubw m4, m7 |
| punpcklwd m7, m1, m3 |
| punpckhwd m1, m3 |
| punpcklwd m3, m2, m4 |
| punpckhwd m2, m4 |
| pmaddwd m7, m0 |
| pmaddwd m1, m0 |
| pmaddwd m3, m0 |
| pmaddwd m2, m0 |
| paddd m7, m10 |
| paddd m1, m10 |
| paddd m3, m10 |
| paddd m2, m10 |
| psrad m7, 11 |
| psrad m1, 11 |
| psrad m3, 11 |
| psrad m2, 11 |
| packssdw m7, m1 |
| packssdw m3, m2 |
| paddw m7, m5 |
| paddw m3, m6 |
| packuswb m7, m3 |
| mova [dstq+idxq], m7 |
| add idxd, 16 |
| cmp idxd, wd |
| jl .loop_x |
| add dstq, strideq |
| add t1q, 384 * 2 |
| add t2q, 384 * 2 |
| dec hd |
| jg .loop_y |
| RET |