| ; Copyright © 2018, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; Copyright © 2018, VideoLabs |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "ext/x86/x86inc.asm" |
| |
| SECTION_RODATA 16 |
| |
| ; dav1d_obmc_masks[] with 64-x interleaved |
| obmc_masks: db 0, 0, 0, 0 |
| ; 2 @4 |
| db 45, 19, 64, 0 |
| ; 4 @8 |
| db 39, 25, 50, 14, 59, 5, 64, 0 |
| ; 8 @16 |
| db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 |
| ; 16 @32 |
| db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 |
| db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 |
| ; 32 @64 |
| db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 |
| db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 |
| db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 |
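; Each byte pair above is (m, 64-m) -- the mask interleaved with its
; complement -- and the trailing (64, 0) pairs give full weight to one of the
; two inputs. Interleaving the complement is presumably what lets the obmc
; blend weight two interleaved pixel rows with a single pmaddubsw per vector.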
| |
| warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 |
| warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 |
| warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 |
| warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 |
| blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 |
| subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 |
| db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 |
| subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 |
| subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 |
| subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 |
| bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 |
| bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 |
| |
| pb_64: times 16 db 64 |
| pw_8: times 8 dw 8 |
| pw_26: times 8 dw 26 |
| pw_34: times 8 dw 34 |
| pw_512: times 8 dw 512 |
| pw_1024: times 8 dw 1024 |
| pw_2048: times 8 dw 2048 |
| pw_6903: times 8 dw 6903 |
| pw_8192: times 8 dw 8192 |
| pd_32: times 4 dd 32 |
| pd_512: times 4 dd 512 |
pd_16384: times 4 dd 16384
| pd_32768: times 4 dd 32768 |
| pd_262144:times 4 dd 262144 |
| |
| pw_258: times 2 dw 258 |
| |
| cextern mc_subpel_filters |
| %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) |
| |
| %macro BIDIR_JMP_TABLE 1-* |
; %2 expands to the first width here (%xdefine expands at definition time, before the loop below rotates the arguments)
| %xdefine %1_table (%%table - 2*%2) |
| %xdefine %%base %1_table |
| %xdefine %%prefix mangle(private_prefix %+ _%1) |
| ; dynamically generated label |
| %%table: |
| %rep %0 - 1 ; repeat for num args |
| dd %%prefix %+ .w%2 - %%base |
| %rotate 1 |
| %endrep |
| %endmacro |
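; The -2*%2 bias shifts the table base by the offset of the first width under
; tzcnt indexing (2*%2 == tzcnt(%2)*4 for a first width of 2 or 4), so the dd
; entries can presumably be read as [base+tzcnt(w)*4] by the bidir functions
; later in the file, with the smallest width mapping to entry 0.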
| |
| BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16 |
| BIDIR_JMP_TABLE blend_ssse3, 4, 8, 16, 32 |
| BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32 |
| BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16 |
| |
| %macro BASE_JMP_TABLE 3-* |
| %xdefine %1_%2_table (%%table - %3) |
| %xdefine %%base %1_%2 |
| %%table: |
| %rep %0 - 2 |
| dw %%base %+ _w%3 - %%base |
| %rotate 1 |
| %endrep |
| %endmacro |
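; The -%3 bias works the same way for these word entries: put/prep below read
; them as [t0+wq*2+table_offset(...)] with wq = tzcnt(w), and tzcnt(%3)*2 == %3
; for the first widths 2 and 4, so the smallest width lands on entry 0.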
| |
| %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put) |
| %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep) |
| |
| BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 |
| BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 |
| |
| %macro HV_JMP_TABLE 5-* |
| %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) |
| %xdefine %%base %1_%3 |
| %assign %%types %4 |
| %if %%types & 1 |
| %xdefine %1_%2_h_%3_table (%%h - %5) |
| %%h: |
| %rep %0 - 4 |
| dw %%prefix %+ .h_w%5 - %%base |
| %rotate 1 |
| %endrep |
| %rotate 4 |
| %endif |
| %if %%types & 2 |
| %xdefine %1_%2_v_%3_table (%%v - %5) |
| %%v: |
| %rep %0 - 4 |
| dw %%prefix %+ .v_w%5 - %%base |
| %rotate 1 |
| %endrep |
| %rotate 4 |
| %endif |
| %if %%types & 4 |
| %xdefine %1_%2_hv_%3_table (%%hv - %5) |
| %%hv: |
| %rep %0 - 4 |
| dw %%prefix %+ .hv_w%5 - %%base |
| %rotate 1 |
| %endrep |
| %endif |
| %endmacro |
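; %4 is a bitmask selecting which sub-tables to emit: bit 0 = .h_w*,
; bit 1 = .v_w*, bit 2 = .hv_w*. Below, put_8tap uses 3 (h+v), prep_8tap
; uses 1 (h only) and the bilin functions use 7 (h+v+hv).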
| |
| HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 |
| HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 |
| HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 |
| HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 |
| |
| %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX |
| |
| cextern mc_warp_filter |
| |
| SECTION .text |
| |
| INIT_XMM ssse3 |
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 1 |
| %define base t0-put_ssse3 |
| %else |
| DECLARE_REG_TMP 7 |
| %define base 0 |
| %endif |
| ; |
| %macro RESTORE_DSQ_32 1 |
| %if ARCH_X86_32 |
| mov %1, dsm ; restore dsq |
| %endif |
| %endmacro |
| ; |
| cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak |
| movifnidn mxyd, r6m ; mx |
| LEA t0, put_ssse3 |
| tzcnt wd, wm |
| mov hd, hm |
| test mxyd, mxyd |
| jnz .h |
| mov mxyd, r7m ; my |
| test mxyd, mxyd |
| jnz .v |
| .put: |
| movzx wd, word [t0+wq*2+table_offset(put,)] |
| add wq, t0 |
| RESTORE_DSQ_32 t0 |
| jmp wq |
| .put_w2: |
| movzx r4d, word [srcq+ssq*0] |
| movzx r6d, word [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| mov [dstq+dsq*0], r4w |
| mov [dstq+dsq*1], r6w |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w2 |
| RET |
| .put_w4: |
| mov r4d, [srcq+ssq*0] |
| mov r6d, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| mov [dstq+dsq*0], r4d |
| mov [dstq+dsq*1], r6d |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w4 |
| RET |
| .put_w8: |
| movq m0, [srcq+ssq*0] |
| movq m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| movq [dstq+dsq*0], m0 |
| movq [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w8 |
| RET |
| .put_w16: |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| mova [dstq+dsq*0], m0 |
| mova [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w16 |
| RET |
| .put_w32: |
| movu m0, [srcq+ssq*0+16*0] |
| movu m1, [srcq+ssq*0+16*1] |
| movu m2, [srcq+ssq*1+16*0] |
| movu m3, [srcq+ssq*1+16*1] |
| lea srcq, [srcq+ssq*2] |
| mova [dstq+dsq*0+16*0], m0 |
| mova [dstq+dsq*0+16*1], m1 |
| mova [dstq+dsq*1+16*0], m2 |
| mova [dstq+dsq*1+16*1], m3 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w32 |
| RET |
| .put_w64: |
| movu m0, [srcq+16*0] |
| movu m1, [srcq+16*1] |
| movu m2, [srcq+16*2] |
| movu m3, [srcq+16*3] |
| add srcq, ssq |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| mova [dstq+16*2], m2 |
| mova [dstq+16*3], m3 |
| add dstq, dsq |
| dec hd |
| jg .put_w64 |
| RET |
| .put_w128: |
| movu m0, [srcq+16*0] |
| movu m1, [srcq+16*1] |
| movu m2, [srcq+16*2] |
| movu m3, [srcq+16*3] |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| mova [dstq+16*2], m2 |
| mova [dstq+16*3], m3 |
| movu m0, [srcq+16*4] |
| movu m1, [srcq+16*5] |
| movu m2, [srcq+16*6] |
| movu m3, [srcq+16*7] |
| mova [dstq+16*4], m0 |
| mova [dstq+16*5], m1 |
| mova [dstq+16*6], m2 |
| mova [dstq+16*7], m3 |
| add srcq, ssq |
| add dstq, dsq |
| dec hd |
| jg .put_w128 |
| RET |
| .h: |
| ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 |
| ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 |
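; How the code below realizes this: mxyd*0xff01 = (mx<<16) - (mx<<8) + mx, and
; adding 16<<8 turns the middle byte into 16-mx, so the broadcast word is
; ((16-mx) << 8) | mx. bilin_h_shuf8 arranges each byte pair as
; (src[x+1], src[x]), hence pmaddubsw gives mx*src[x+1] + (16-mx)*src[x], and
; pmulhrsw with pw_2048 ((a*2048*2 + 0x8000) >> 16) is the (a + 8) >> 4.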
| imul mxyd, 0xff01 |
| mova m4, [base+bilin_h_shuf8] |
| mova m0, [base+bilin_h_shuf4] |
| add mxyd, 16 << 8 |
| movd m5, mxyd |
| mov mxyd, r7m ; my |
| pshuflw m5, m5, q0000 |
| punpcklqdq m5, m5 |
| test mxyd, mxyd |
| jnz .hv |
| movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)] |
| mova m3, [base+pw_2048] |
| add wq, t0 |
| RESTORE_DSQ_32 t0 |
| jmp wq |
| .h_w2: |
| pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} |
| .h_w2_loop: |
| movd m0, [srcq+ssq*0] |
| movd m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpckldq m0, m1 |
| pshufb m0, m4 |
| pmaddubsw m0, m5 |
| pmulhrsw m0, m3 |
| packuswb m0, m0 |
| movd r6d, m0 |
| mov [dstq+dsq*0], r6w |
| shr r6d, 16 |
| mov [dstq+dsq*1], r6w |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w2_loop |
| RET |
| .h_w4: |
| movq m4, [srcq+ssq*0] |
| movhps m4, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| pshufb m4, m0 |
| pmaddubsw m4, m5 |
| pmulhrsw m4, m3 |
| packuswb m4, m4 |
| movd [dstq+dsq*0], m4 |
| psrlq m4, 32 |
| movd [dstq+dsq*1], m4 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w4 |
| RET |
| .h_w8: |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| movq [dstq+dsq*0], m0 |
| movhps [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w8 |
| RET |
| .h_w16: |
| movu m0, [srcq+8*0] |
| movu m1, [srcq+8*1] |
| add srcq, ssq |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| mova [dstq], m0 |
| add dstq, dsq |
| dec hd |
| jg .h_w16 |
| RET |
| .h_w32: |
| movu m0, [srcq+mmsize*0+8*0] |
| movu m1, [srcq+mmsize*0+8*1] |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| movu m1, [srcq+mmsize*1+8*0] |
| movu m2, [srcq+mmsize*1+8*1] |
| add srcq, ssq |
| pshufb m1, m4 |
| pshufb m2, m4 |
| pmaddubsw m1, m5 |
| pmaddubsw m2, m5 |
| pmulhrsw m1, m3 |
| pmulhrsw m2, m3 |
| packuswb m1, m2 |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| add dstq, dsq |
| dec hd |
| jg .h_w32 |
| RET |
| .h_w64: |
| mov r6, -16*3 |
| .h_w64_loop: |
| movu m0, [srcq+r6+16*3+8*0] |
| movu m1, [srcq+r6+16*3+8*1] |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| mova [dstq+r6+16*3], m0 |
| add r6, 16 |
| jle .h_w64_loop |
| add srcq, ssq |
| add dstq, dsq |
| dec hd |
| jg .h_w64 |
| RET |
| .h_w128: |
| mov r6, -16*7 |
| .h_w128_loop: |
| movu m0, [srcq+r6+16*7+8*0] |
| movu m1, [srcq+r6+16*7+8*1] |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| pmulhrsw m0, m3 |
| pmulhrsw m1, m3 |
| packuswb m0, m1 |
| mova [dstq+r6+16*7], m0 |
| add r6, 16 |
| jle .h_w128_loop |
| add srcq, ssq |
| add dstq, dsq |
| dec hd |
| jg .h_w128 |
| RET |
| .v: |
| movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)] |
| imul mxyd, 0xff01 |
| mova m5, [base+pw_2048] |
| add mxyd, 16 << 8 |
| add wq, t0 |
| movd m4, mxyd |
| pshuflw m4, m4, q0000 |
| punpcklqdq m4, m4 |
| RESTORE_DSQ_32 t0 |
| jmp wq |
| .v_w2: |
| movd m0, [srcq+ssq*0] |
| .v_w2_loop: |
| pinsrw m0, [srcq+ssq*1], 1 ; 0 1 |
| lea srcq, [srcq+ssq*2] |
| pshuflw m2, m0, q2301 |
| pinsrw m0, [srcq+ssq*0], 0 ; 2 1 |
| punpcklbw m1, m0, m2 |
| pmaddubsw m1, m4 |
| pmulhrsw m1, m5 |
| packuswb m1, m1 |
| movd r6d, m1 |
| mov [dstq+dsq*1], r6w |
| shr r6d, 16 |
| mov [dstq+dsq*0], r6w |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w2_loop |
| RET |
| .v_w4: |
| movd m0, [srcq+ssq*0] |
| .v_w4_loop: |
| movd m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpckldq m2, m0, m1 ; 0 1 |
| movd m0, [srcq+ssq*0] |
| punpckldq m1, m0 ; 1 2 |
| punpcklbw m1, m2 |
| pmaddubsw m1, m4 |
| pmulhrsw m1, m5 |
| packuswb m1, m1 |
| movd [dstq+dsq*0], m1 |
| psrlq m1, 32 |
| movd [dstq+dsq*1], m1 |
| ; |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w4_loop |
| RET |
| .v_w8: |
| movq m0, [srcq+ssq*0] |
| .v_w8_loop: |
| movq m3, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpcklbw m1, m3, m0 |
| movq m0, [srcq+ssq*0] |
| punpcklbw m2, m0, m3 |
| pmaddubsw m1, m4 |
| pmaddubsw m2, m4 |
| pmulhrsw m1, m5 |
| pmulhrsw m2, m5 |
| packuswb m1, m2 |
| movq [dstq+dsq*0], m1 |
| movhps [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w8_loop |
| RET |
| ; |
| %macro PUT_BILIN_V_W16 0 |
| movu m0, [srcq+ssq*0] |
| %%loop: |
| movu m3, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpcklbw m1, m3, m0 |
| punpckhbw m2, m3, m0 |
| movu m0, [srcq+ssq*0] |
| pmaddubsw m1, m4 |
| pmaddubsw m2, m4 |
| pmulhrsw m1, m5 |
| pmulhrsw m2, m5 |
| packuswb m1, m2 |
| mova [dstq+dsq*0], m1 |
| punpcklbw m1, m0, m3 |
| punpckhbw m2, m0, m3 |
| pmaddubsw m1, m4 |
| pmaddubsw m2, m4 |
| pmulhrsw m1, m5 |
| pmulhrsw m2, m5 |
| packuswb m1, m2 |
| mova [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg %%loop |
| %endmacro |
| ; |
| .v_w16: |
| PUT_BILIN_V_W16 |
| RET |
| .v_w16gt: |
| mov r4, dstq |
| mov r6, srcq |
| .v_w16gt_loop: |
| %if ARCH_X86_32 |
| mov bakm, t0q |
| RESTORE_DSQ_32 t0 |
| PUT_BILIN_V_W16 |
| mov t0q, bakm |
| %else |
| PUT_BILIN_V_W16 |
| %endif |
| mov hw, t0w |
| add r4, mmsize |
| add r6, mmsize |
| mov dstq, r4 |
| mov srcq, r6 |
| sub t0d, 1<<16 |
| jg .v_w16gt |
| RET |
| .v_w32: |
| lea t0d, [hq+(1<<16)] |
| jmp .v_w16gt |
| .v_w64: |
| lea t0d, [hq+(3<<16)] |
| jmp .v_w16gt |
| .v_w128: |
| lea t0d, [hq+(7<<16)] |
| jmp .v_w16gt |
| .hv: |
| ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 |
| ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 |
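; m6 below holds my << 11; the loop doubles the difference before pmulhw, so
; (2*d * (my << 11)) >> 16 == (my*d) >> 4 as in the formula (shifting by 12
; would overflow signed 16-bit words for my == 15, hence the comment below),
; and the final pmulhrsw with pw_2048 is again the trailing (x + 8) >> 4.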
| movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] |
| WIN64_SPILL_XMM 8 |
| shl mxyd, 11 ; can't shift by 12 due to signed overflow |
| mova m7, [base+pw_2048] |
| movd m6, mxyd |
| add wq, t0 |
| pshuflw m6, m6, q0000 |
| punpcklqdq m6, m6 |
| jmp wq |
| .hv_w2: |
| RESTORE_DSQ_32 t0 |
| movd m0, [srcq+ssq*0] |
| pshufd m0, m0, q0000 ; src[x - src_stride] |
| pshufb m0, m4 |
| pmaddubsw m0, m5 |
| .hv_w2_loop: |
| movd m1, [srcq+ssq*1] ; src[x] |
| lea srcq, [srcq+ssq*2] |
| movhps m1, [srcq+ssq*0] ; src[x + src_stride] |
| pshufd m1, m1, q3120 |
| pshufb m1, m4 |
| pmaddubsw m1, m5 ; 1 _ 2 _ |
| shufps m2, m0, m1, q1032 ; 0 _ 1 _ |
| mova m0, m1 |
| psubw m1, m2 ; src[x + src_stride] - src[x] |
| paddw m1, m1 |
pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4)
| pmulhrsw m1, m7 |
| packuswb m1, m1 |
| %if ARCH_X86_64 |
| movq r6, m1 |
| %else |
| pshuflw m1, m1, q2020 |
| movd r6d, m1 |
| %endif |
| mov [dstq+dsq*0], r6w |
| shr r6, gprsize*4 |
| mov [dstq+dsq*1], r6w |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w2_loop |
| RET |
| .hv_w4: |
| mova m4, [base+bilin_h_shuf4] |
| RESTORE_DSQ_32 t0 |
| movddup xm0, [srcq+ssq*0] |
| pshufb m0, m4 |
| pmaddubsw m0, m5 |
| .hv_w4_loop: |
| movq m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| movhps m1, [srcq+ssq*0] |
| pshufb m1, m4 |
| pmaddubsw m1, m5 ; 1 2 |
| shufps m2, m0, m1, q1032 ; 0 1 |
| mova m0, m1 |
| psubw m1, m2 |
| paddw m1, m1 |
| pmulhw m1, m6 |
| paddw m1, m2 |
| pmulhrsw m1, m7 |
| packuswb m1, m1 |
| movd [dstq+dsq*0], m1 |
| psrlq m1, 32 |
| movd [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w4_loop |
| RET |
| .hv_w8: |
| RESTORE_DSQ_32 t0 |
| movu m0, [srcq+ssq*0+8*0] |
| pshufb m0, m4 |
| pmaddubsw m0, m5 |
| .hv_w8_loop: |
| movu m2, [srcq+ssq*1+8*0] |
| lea srcq, [srcq+ssq*2] |
| pshufb m2, m4 |
| pmaddubsw m2, m5 |
| psubw m1, m2, m0 |
| paddw m1, m1 |
| pmulhw m1, m6 |
| paddw m1, m0 |
| movu m0, [srcq+ssq*0+8*0] |
| pshufb m0, m4 |
| pmaddubsw m0, m5 |
| psubw m3, m0, m2 |
| paddw m3, m3 |
| pmulhw m3, m6 |
| paddw m3, m2 |
| pmulhrsw m1, m7 |
| pmulhrsw m3, m7 |
| packuswb m1, m3 |
| movq [dstq+dsq*0], m1 |
| movhps [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w8_loop |
| RET |
| .hv_w16: |
| xor t0d, t0d |
| .hv_w16gt: |
| mov r4, dstq |
| mov r6, srcq |
| %if WIN64 |
| movaps r4m, xmm8 |
| %endif |
| .hv_w16_loop0: |
| movu m0, [srcq+8*0] |
| movu m1, [srcq+8*1] |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| .hv_w16_loop: |
| %if ARCH_X86_32 |
| %define m0tmp [dstq] |
| %else |
| %define m0tmp m8 |
| %endif |
| add srcq, ssq |
| movu m2, [srcq+8*0] |
| movu m3, [srcq+8*1] |
| pshufb m2, m4 |
| pshufb m3, m4 |
| pmaddubsw m2, m5 |
| pmaddubsw m3, m5 |
| mova m0tmp, m2 |
| psubw m2, m0 |
| paddw m2, m2 |
| pmulhw m2, m6 |
| paddw m2, m0 |
| mova m0, m3 |
| psubw m3, m1 |
| paddw m3, m3 |
| pmulhw m3, m6 |
| paddw m3, m1 |
| mova m1, m0 |
| mova m0, m0tmp |
| pmulhrsw m2, m7 |
| pmulhrsw m3, m7 |
| packuswb m2, m3 |
| mova [dstq], m2 |
| add dstq, dsmp |
| dec hd |
| jg .hv_w16_loop |
| movzx hd, t0w |
| add r4, mmsize |
| add r6, mmsize |
| mov dstq, r4 |
| mov srcq, r6 |
| sub t0d, 1<<16 |
| jg .hv_w16_loop0 |
| %if WIN64 |
| movaps xmm8, r4m |
| %endif |
| RET |
| .hv_w32: |
| lea t0d, [hq+(1<<16)] |
| jmp .hv_w16gt |
| .hv_w64: |
| lea t0d, [hq+(3<<16)] |
| jmp .hv_w16gt |
| .hv_w128: |
| lea t0d, [hq+(7<<16)] |
| jmp .hv_w16gt |
| |
| DECLARE_REG_TMP 3, 5, 6 |
| %if ARCH_X86_32 |
| %define base t2-prep_ssse3 |
| %else |
| %define base 0 |
| %endif |
| cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 |
| movifnidn mxyd, r5m ; mx |
| LEA t2, prep_ssse3 |
| tzcnt wd, wm |
| movifnidn hd, hm |
| test mxyd, mxyd |
| jnz .h |
| mov mxyd, r6m ; my |
| test mxyd, mxyd |
| jnz .v |
| .prep: |
| movzx wd, word [t2+wq*2+table_offset(prep,)] |
| add wq, t2 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .prep_w4: |
| movd m0, [srcq+strideq*0] |
| movd m1, [srcq+strideq*1] |
| movd m2, [srcq+strideq*2] |
| movd m3, [srcq+stride3q ] |
| punpckldq m0, m1 |
| punpckldq m2, m3 |
| lea srcq, [srcq+strideq*4] |
| pxor m1, m1 |
| punpcklbw m0, m1 |
| punpcklbw m2, m1 |
| psllw m0, 4 |
| psllw m2, 4 |
| mova [tmpq+mmsize*0], m0 |
| mova [tmpq+mmsize*1], m2 |
| add tmpq, 32 |
| sub hd, 4 |
| jg .prep_w4 |
| RET |
| .prep_w8: |
| movq m0, [srcq+strideq*0] |
| movq m1, [srcq+strideq*1] |
| movq m2, [srcq+strideq*2] |
| movq m3, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| pxor m4, m4 |
| punpcklbw m0, m4 |
| punpcklbw m1, m4 |
| punpcklbw m2, m4 |
| punpcklbw m3, m4 |
| psllw m0, 4 |
| psllw m1, 4 |
| psllw m2, 4 |
| psllw m3, 4 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| sub hd, 4 |
| jg .prep_w8 |
| RET |
| .prep_w16: |
| movq m0, [srcq+strideq*0+8*0] |
| movq m1, [srcq+strideq*0+8*1] |
| movq m2, [srcq+strideq*1+8*0] |
| movq m3, [srcq+strideq*1+8*1] |
| lea srcq, [srcq+strideq*2] |
| pxor m4, m4 |
| punpcklbw m0, m4 |
| punpcklbw m1, m4 |
| punpcklbw m2, m4 |
| punpcklbw m3, m4 |
| psllw m0, 4 |
| psllw m1, 4 |
| psllw m2, 4 |
| psllw m3, 4 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| sub hd, 2 |
| jg .prep_w16 |
| RET |
| .prep_w16gt: |
| mov t1q, srcq |
| mov r3q, t2q |
| .prep_w16gt_hloop: |
| movq m0, [t1q+8*0] |
| movq m1, [t1q+8*1] |
| movq m2, [t1q+8*2] |
| movq m3, [t1q+8*3] |
| pxor m4, m4 |
| punpcklbw m0, m4 |
| punpcklbw m1, m4 |
| punpcklbw m2, m4 |
| punpcklbw m3, m4 |
| psllw m0, 4 |
| psllw m1, 4 |
| psllw m2, 4 |
| psllw m3, 4 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| add t1q, 32 |
| sub r3q, 1 |
| jg .prep_w16gt_hloop |
| lea srcq, [srcq+strideq] |
| sub hd, 1 |
| jg .prep_w16gt |
| RET |
| .prep_w32: |
| mov t2q, 1 |
| jmp .prep_w16gt |
| .prep_w64: |
| mov t2q, 2 |
| jmp .prep_w16gt |
| .prep_w128: |
| mov t2q, 4 |
| jmp .prep_w16gt |
| .h: |
| ; 16 * src[x] + (mx * (src[x + 1] - src[x])) |
| ; = (16 - mx) * src[x] + mx * src[x + 1] |
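; prep keeps the 16*pixel intermediate as signed words without any rounding
; shift, so the pmaddubsw results below go to tmp as-is; the unfiltered .prep
; path above gets the same scaling from its psllw by 4.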
| imul mxyd, 0xff01 |
| mova m4, [base+bilin_h_shuf8] |
| add mxyd, 16 << 8 |
| movd xm5, mxyd |
| mov mxyd, r6m ; my |
| pshuflw m5, m5, q0000 |
| punpcklqdq m5, m5 |
| test mxyd, mxyd |
| jnz .hv |
| %if ARCH_X86_32 |
| mov t1, t2 ; save base reg for w4 |
| %endif |
| movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] |
| add wq, t2 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .h_w4: |
| %if ARCH_X86_32 |
| mova m4, [t1-prep_ssse3+bilin_h_shuf4] |
| %else |
| mova m4, [bilin_h_shuf4] |
| %endif |
| .h_w4_loop: |
| movq m0, [srcq+strideq*0] |
| movhps m0, [srcq+strideq*1] |
| movq m1, [srcq+strideq*2] |
| movhps m1, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| pshufb m0, m4 |
| pmaddubsw m0, m5 |
| pshufb m1, m4 |
| pmaddubsw m1, m5 |
| mova [tmpq+0 ], m0 |
| mova [tmpq+16], m1 |
| add tmpq, 32 |
| sub hd, 4 |
| jg .h_w4_loop |
| RET |
| .h_w8: |
| movu m0, [srcq+strideq*0] |
| movu m1, [srcq+strideq*1] |
| movu m2, [srcq+strideq*2] |
| movu m3, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pshufb m2, m4 |
| pshufb m3, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| pmaddubsw m2, m5 |
| pmaddubsw m3, m5 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| sub hd, 4 |
| jg .h_w8 |
| RET |
| .h_w16: |
| movu m0, [srcq+strideq*0+8*0] |
| movu m1, [srcq+strideq*0+8*1] |
| movu m2, [srcq+strideq*1+8*0] |
| movu m3, [srcq+strideq*1+8*1] |
| lea srcq, [srcq+strideq*2] |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pshufb m2, m4 |
| pshufb m3, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| pmaddubsw m2, m5 |
| pmaddubsw m3, m5 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| sub hd, 2 |
| jg .h_w16 |
| RET |
| .h_w16gt: |
| mov t1q, srcq |
| mov r3q, t2q |
| .h_w16gt_hloop: |
| movu m0, [t1q+8*0] |
| movu m1, [t1q+8*1] |
| movu m2, [t1q+8*2] |
| movu m3, [t1q+8*3] |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pshufb m2, m4 |
| pshufb m3, m4 |
| pmaddubsw m0, m5 |
| pmaddubsw m1, m5 |
| pmaddubsw m2, m5 |
| pmaddubsw m3, m5 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| add t1q, 32 |
| sub r3q, 1 |
| jg .h_w16gt_hloop |
| lea srcq, [srcq+strideq] |
| sub hd, 1 |
| jg .h_w16gt |
| RET |
| .h_w32: |
| mov t2q, 1 |
| jmp .h_w16gt |
| .h_w64: |
| mov t2q, 2 |
| jmp .h_w16gt |
| .h_w128: |
| mov t2q, 4 |
| jmp .h_w16gt |
| .v: |
| movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] |
| imul mxyd, 0xff01 |
| add mxyd, 16 << 8 |
| add wq, t2 |
| lea stride3q, [strideq*3] |
| movd m5, mxyd |
| pshuflw m5, m5, q0000 |
| punpcklqdq m5, m5 |
| jmp wq |
| .v_w4: |
| movd m0, [srcq+strideq*0] |
| .v_w4_loop: |
| movd m1, [srcq+strideq*1] |
| movd m2, [srcq+strideq*2] |
| movd m3, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| punpcklwd m0, m1 ; 0 1 _ _ |
| punpcklwd m1, m2 ; 1 2 _ _ |
| punpcklbw m1, m0 |
| pmaddubsw m1, m5 |
| pshufd m1, m1, q3120 |
| mova [tmpq+16*0], m1 |
| movd m0, [srcq+strideq*0] |
| punpcklwd m2, m3 ; 2 3 _ _ |
| punpcklwd m3, m0 ; 3 4 _ _ |
| punpcklbw m3, m2 |
| pmaddubsw m3, m5 |
| pshufd m3, m3, q3120 |
| mova [tmpq+16*1], m3 |
| add tmpq, 32 |
| sub hd, 4 |
| jg .v_w4_loop |
| RET |
| .v_w8: |
| movq m0, [srcq+strideq*0] |
| .v_w8_loop: |
| movq m1, [srcq+strideq*2] |
| movq m2, [srcq+strideq*1] |
| movq m3, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| shufpd m4, m0, m1, 0x0c ; 0 2 |
| movq m0, [srcq+strideq*0] |
| shufpd m2, m3, 0x0c ; 1 3 |
| shufpd m1, m0, 0x0c ; 2 4 |
| punpcklbw m3, m2, m4 |
| pmaddubsw m3, m5 |
| mova [tmpq+16*0], m3 |
| punpckhbw m3, m2, m4 |
| pmaddubsw m3, m5 |
| mova [tmpq+16*2], m3 |
| punpcklbw m3, m1, m2 |
| punpckhbw m1, m2 |
| pmaddubsw m3, m5 |
| pmaddubsw m1, m5 |
| mova [tmpq+16*1], m3 |
| mova [tmpq+16*3], m1 |
| add tmpq, 16*4 |
| sub hd, 4 |
| jg .v_w8_loop |
| RET |
| .v_w16: |
| movu m0, [srcq+strideq*0] |
| .v_w16_loop: |
| movu m1, [srcq+strideq*1] |
| movu m2, [srcq+strideq*2] |
| punpcklbw m3, m1, m0 |
| punpckhbw m4, m1, m0 |
| pmaddubsw m3, m5 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*0], m3 |
| mova [tmpq+16*1], m4 |
| punpcklbw m3, m2, m1 |
| punpckhbw m4, m2, m1 |
| pmaddubsw m3, m5 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*2], m3 |
| mova [tmpq+16*3], m4 |
| movu m3, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| movu m0, [srcq+strideq*0] |
| add tmpq, 16*8 |
| punpcklbw m1, m3, m2 |
| punpckhbw m4, m3, m2 |
| pmaddubsw m1, m5 |
| pmaddubsw m4, m5 |
| mova [tmpq-16*4], m1 |
| mova [tmpq-16*3], m4 |
| punpcklbw m1, m0, m3 |
| punpckhbw m2, m0, m3 |
| pmaddubsw m1, m5 |
| pmaddubsw m2, m5 |
| mova [tmpq-16*2], m1 |
| mova [tmpq-16*1], m2 |
| sub hd, 4 |
| jg .v_w16_loop |
| RET |
| .v_w32: |
| lea t2d, [hq+(0<<16)] |
| mov t0d, 64 |
| .v_w32_start: |
| %if ARCH_X86_64 |
| %if WIN64 |
| PUSH r7 |
| %endif |
| mov r7, tmpq |
| %endif |
| mov t1, srcq |
| .v_w32_loop_h: |
| movu m0, [srcq+strideq*0+16*0] ; 0L |
| movu m1, [srcq+strideq*0+16*1] ; 0U |
| .v_w32_loop_v: |
| movu m2, [srcq+strideq*1+16*0] ; 1L |
| movu m3, [srcq+strideq*1+16*1] ; 1U |
| lea srcq, [srcq+strideq*2] |
| punpcklbw m4, m2, m0 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*0], m4 |
| punpckhbw m4, m2, m0 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*1], m4 |
| punpcklbw m4, m3, m1 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*2], m4 |
| punpckhbw m4, m3, m1 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*3], m4 |
| add tmpq, t0q |
| movu m0, [srcq+strideq*0+16*0] ; 2L |
| movu m1, [srcq+strideq*0+16*1] ; 2U |
| punpcklbw m4, m0, m2 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*0], m4 |
| punpckhbw m4, m0, m2 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*1], m4 |
| punpcklbw m4, m1, m3 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*2], m4 |
| punpckhbw m4, m1, m3 |
| pmaddubsw m4, m5 |
| mova [tmpq+16*3], m4 |
| add tmpq, t0q |
| sub hd, 2 |
| jg .v_w32_loop_v |
| movzx hd, t2w |
| add t1, 32 |
| mov srcq, t1 |
| %if ARCH_X86_64 |
| add r7, 2*16*2 |
| mov tmpq, r7 |
| %else |
| mov tmpq, tmpmp |
| add tmpq, 2*16*2 |
| mov tmpmp, tmpq |
| %endif |
| sub t2d, 1<<16 |
| jg .v_w32_loop_h |
| %if WIN64 |
| POP r7 |
| %endif |
| RET |
| .v_w64: |
| lea t2d, [hq+(1<<16)] |
| mov t0d, 128 |
| jmp .v_w32_start |
| .v_w128: |
| lea t2d, [hq+(3<<16)] |
| mov t0d, 256 |
| jmp .v_w32_start |
| .hv: |
| ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 |
| ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) |
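; Unlike the put path, the doubling is folded into pmulhrsw here: with
; m6 = my << 11, pmulhrsw(d, m6) = (d*my*4096 + 0x8000) >> 16
; = ((my*d) + 8) >> 4, which is exactly the rounded term added back to the
; horizontal result.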
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 8 |
| movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] |
| shl mxyd, 11 |
| movd xm6, mxyd |
| add wq, t2 |
| pshuflw m6, m6, q0000 |
| punpcklqdq m6, m6 |
| %if ARCH_X86_32 |
| mov t1, t2 ; save base reg for w4 |
| %endif |
| lea stride3q, [strideq*3] |
| jmp wq |
| .hv_w4: |
| %if ARCH_X86_32 |
| mova m4, [t1-prep_ssse3+bilin_h_shuf4] |
| %else |
| mova m4, [bilin_h_shuf4] |
| %endif |
| movq m0, [srcq+strideq*0] ; 0 _ |
| punpcklqdq m0, m0 |
| pshufb m0, m4 |
| pmaddubsw m0, m5 |
| .hv_w4_loop: |
| movq m1, [srcq+strideq*1] |
| movhps m1, [srcq+strideq*2] ; 1 _ 2 _ |
| movq m2, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| movhps m2, [srcq+strideq*0] ; 3 _ 4 _ |
| pshufb m1, m4 |
| pshufb m2, m4 |
| pmaddubsw m1, m5 ; 1 + 2 + |
| shufpd m3, m0, m1, 0x01 ; 0 + 1 + |
| pmaddubsw m0, m2, m5 ; 3 + 4 + |
| shufpd m2, m1, m0, 0x01 ; 2 + 3 + |
| psubw m1, m3 |
| pmulhrsw m1, m6 |
| paddw m1, m3 |
| psubw m3, m0, m2 |
| pmulhrsw m3, m6 |
| paddw m3, m2 |
| mova [tmpq+16*0], m1 |
| mova [tmpq+16*1], m3 |
| add tmpq, 32 |
| sub hd, 4 |
| jg .hv_w4_loop |
| RET |
| .hv_w8: |
| movu m0, [srcq+strideq*0] |
| pshufb m0, m4 |
| pmaddubsw m0, m5 ; 0 + |
| .hv_w8_loop: |
| movu m1, [srcq+strideq*1] ; 1 |
| movu m2, [srcq+strideq*2] ; 2 |
| pshufb m1, m4 |
| pshufb m2, m4 |
| pmaddubsw m1, m5 ; 1 + |
| pmaddubsw m2, m5 ; 2 + |
| psubw m3, m1, m0 ; 1-0 |
| pmulhrsw m3, m6 |
| paddw m3, m0 |
| psubw m7, m2, m1 ; 2-1 |
| pmulhrsw m7, m6 |
| paddw m7, m1 |
| mova [tmpq+16*0], m3 |
| mova [tmpq+16*1], m7 |
| movu m1, [srcq+stride3q ] ; 3 |
| lea srcq, [srcq+strideq*4] |
| movu m0, [srcq+strideq*0] ; 4 |
| pshufb m1, m4 |
| pshufb m0, m4 |
| pmaddubsw m1, m5 ; 3 + |
| pmaddubsw m0, m5 ; 4 + |
| psubw m3, m1, m2 ; 3-2 |
| pmulhrsw m3, m6 |
| paddw m3, m2 |
| psubw m7, m0, m1 ; 4-3 |
| pmulhrsw m7, m6 |
| paddw m7, m1 |
| mova [tmpq+16*2], m3 |
| mova [tmpq+16*3], m7 |
| add tmpq, 16*4 |
| sub hd, 4 |
| jg .hv_w8_loop |
| RET |
| .hv_w16: |
| lea t2d, [hq+(0<<16)] |
| mov t0d, 32 |
| .hv_w16_start: |
| %if ARCH_X86_64 |
| %if WIN64 |
| PUSH r7 |
| %endif |
| mov r7, tmpq |
| %endif |
| mov t1, srcq |
| .hv_w16_loop_h: |
| movu m0, [srcq+strideq*0+8*0] ; 0L |
| movu m1, [srcq+strideq*0+8*1] ; 0U |
| pshufb m0, m4 |
| pshufb m1, m4 |
| pmaddubsw m0, m5 ; 0L + |
| pmaddubsw m1, m5 ; 0U + |
| .hv_w16_loop_v: |
| movu m2, [srcq+strideq*1+8*0] ; 1L |
| pshufb m2, m4 |
| pmaddubsw m2, m5 ; 1L + |
| psubw m3, m2, m0 ; 1L-0L |
| pmulhrsw m3, m6 |
| paddw m3, m0 |
| mova [tmpq+16*0], m3 |
| movu m3, [srcq+strideq*1+8*1] ; 1U |
| lea srcq, [srcq+strideq*2] |
| pshufb m3, m4 |
| pmaddubsw m3, m5 ; 1U + |
| psubw m0, m3, m1 ; 1U-0U |
| pmulhrsw m0, m6 |
| paddw m0, m1 |
| mova [tmpq+16*1], m0 |
| add tmpq, t0q |
| movu m0, [srcq+strideq*0+8*0] ; 2L |
| pshufb m0, m4 |
| pmaddubsw m0, m5 ; 2L + |
| psubw m1, m0, m2 ; 2L-1L |
| pmulhrsw m1, m6 |
| paddw m1, m2 |
| mova [tmpq+16*0], m1 |
| movu m1, [srcq+strideq*0+8*1] ; 2U |
| pshufb m1, m4 |
| pmaddubsw m1, m5 ; 2U + |
| psubw m2, m1, m3 ; 2U-1U |
| pmulhrsw m2, m6 |
| paddw m2, m3 |
| mova [tmpq+16*1], m2 |
| add tmpq, t0q |
| sub hd, 2 |
| jg .hv_w16_loop_v |
| movzx hd, t2w |
| add t1, 16 |
| mov srcq, t1 |
| %if ARCH_X86_64 |
| add r7, 2*16 |
| mov tmpq, r7 |
| %else |
| mov tmpq, tmpmp |
| add tmpq, 2*16 |
| mov tmpmp, tmpq |
| %endif |
| sub t2d, 1<<16 |
| jg .hv_w16_loop_h |
| %if WIN64 |
| POP r7 |
| %endif |
| RET |
| .hv_w32: |
| lea t2d, [hq+(1<<16)] |
| mov t0d, 64 |
| jmp .hv_w16_start |
| .hv_w64: |
| lea t2d, [hq+(3<<16)] |
| mov t0d, 128 |
| jmp .hv_w16_start |
| .hv_w128: |
| lea t2d, [hq+(7<<16)] |
| mov t0d, 256 |
| jmp .hv_w16_start |
| |
| ; int8_t subpel_filters[5][15][8] |
| %assign FILTER_REGULAR (0*15 << 16) | 3*15 |
| %assign FILTER_SMOOTH (1*15 << 16) | 4*15 |
| %assign FILTER_SHARP (2*15 << 16) | 3*15 |
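; The high 16 bits select the 8-tap filter set (rows 0/1/2 of subpel_filters,
; 15 entries each), the low 16 bits the 4-tap set used for w <= 4 (rows 3/4;
; sharp has no 4-tap variant and falls back to regular). put/prep_8tap add
; this to mx*0x010101, so byte 2 of the sum indexes the 8-tap row (extracted
; with "shr mxd, 16") and byte 0 the 4-tap row ("movzx mxd, mxb"); the -8 in
; the subpel_filters define near the top accounts for mx starting at 1.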
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 1, 2 |
| %elif WIN64 |
| DECLARE_REG_TMP 4, 5 |
| %else |
| DECLARE_REG_TMP 7, 8 |
| %endif |
| |
| %macro PUT_8TAP_FN 3 ; type, type_h, type_v |
| cglobal put_8tap_%1 |
| mov t0d, FILTER_%2 |
| mov t1d, FILTER_%3 |
| %ifnidn %1, sharp_smooth ; skip the jump in the last filter |
| jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX) |
| %endif |
| %endmacro |
| |
| PUT_8TAP_FN regular, REGULAR, REGULAR |
| PUT_8TAP_FN regular_sharp, REGULAR, SHARP |
| PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH |
| PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR |
| PUT_8TAP_FN smooth, SMOOTH, SMOOTH |
| PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP |
| PUT_8TAP_FN sharp_regular, SHARP, REGULAR |
| PUT_8TAP_FN sharp, SHARP, SHARP |
| PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH |
| |
| %if ARCH_X86_32 |
| %define base_reg r1 |
| %define base base_reg-put_ssse3 |
| %define W32_RESTORE_DSQ mov dsq, dsm |
| %define W32_RESTORE_SSQ mov ssq, ssm |
| %else |
| %define base_reg r8 |
| %define base 0 |
| %define W32_RESTORE_DSQ |
| %define W32_RESTORE_SSQ |
| %endif |
| |
| cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 |
| %assign org_stack_offset stack_offset |
| imul mxd, mxm, 0x010101 |
| add mxd, t0d ; 8tap_h, mx, 4tap_h |
| %if ARCH_X86_64 |
| imul myd, mym, 0x010101 |
| add myd, t1d ; 8tap_v, my, 4tap_v |
| %else |
| imul ssd, mym, 0x010101 |
| add ssd, t1d ; 8tap_v, my, 4tap_v |
| mov srcq, srcm |
| %endif |
| mov wd, wm |
| movifnidn hd, hm |
| LEA base_reg, put_ssse3 |
| test mxd, 0xf00 |
| jnz .h |
| %if ARCH_X86_32 |
| test ssd, 0xf00 |
| %else |
| test myd, 0xf00 |
| %endif |
| jnz .v |
| tzcnt wd, wd |
| movzx wd, word [base_reg+wq*2+table_offset(put,)] |
| add wq, base_reg |
| ; put_bilin mangling jump |
| %assign stack_offset org_stack_offset |
| %if ARCH_X86_32 |
| mov dsq, dsm |
| mov ssq, ssm |
| %elif WIN64 |
| pop r8 |
| %endif |
| lea r6, [ssq*3] |
| jmp wq |
| .h: |
| %if ARCH_X86_32 |
| test ssd, 0xf00 |
| %else |
| test myd, 0xf00 |
| %endif |
| jnz .hv |
| W32_RESTORE_SSQ |
| WIN64_SPILL_XMM 12 |
| cmp wd, 4 |
| jl .h_w2 |
| je .h_w4 |
| tzcnt wd, wd |
| %if ARCH_X86_64 |
| mova m10, [base+subpel_h_shufA] |
| mova m11, [base+subpel_h_shufB] |
| mova m9, [base+subpel_h_shufC] |
| %endif |
| shr mxd, 16 |
| sub srcq, 3 |
| movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] |
| movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0] |
| pshufd m5, m5, q0000 |
| movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4] |
| pshufd m6, m6, q0000 |
| mova m7, [base+pw_34] ; 2 + (8 << 2) |
| add wq, base_reg |
| jmp wq |
| .h_w2: |
| %if ARCH_X86_32 |
| and mxd, 0xff |
| %else |
| movzx mxd, mxb |
| %endif |
| dec srcq |
| mova m4, [base+subpel_h_shuf4] |
| movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] |
| pshufd m3, m3, q0000 |
| mova m5, [base+pw_34] ; 2 + (8 << 2) |
| W32_RESTORE_DSQ |
| .h_w2_loop: |
| movq m0, [srcq+ssq*0] |
| movhps m0, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| pshufb m0, m4 |
| pmaddubsw m0, m3 |
| phaddw m0, m0 |
| paddw m0, m5 ; pw34 |
| psraw m0, 6 |
| packuswb m0, m0 |
| movd r4d, m0 |
| mov [dstq+dsq*0], r4w |
| shr r4d, 16 |
| mov [dstq+dsq*1], r4w |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w2_loop |
| RET |
| .h_w4: |
| %if ARCH_X86_32 |
| and mxd, 0xff |
| %else |
| movzx mxd, mxb |
| %endif |
| dec srcq |
| movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] |
| pshufd m3, m3, q0000 |
| mova m5, [base+pw_34] ; 2 + (8 << 2) |
| mova m6, [base+subpel_h_shufA] |
| W32_RESTORE_DSQ |
| .h_w4_loop: |
| movq m0, [srcq+ssq*0] ; 1 |
| movq m1, [srcq+ssq*1] ; 2 |
| lea srcq, [srcq+ssq*2] |
| pshufb m0, m6 ; subpel_h_shufA |
| pshufb m1, m6 ; subpel_h_shufA |
| pmaddubsw m0, m3 ; subpel_filters |
| pmaddubsw m1, m3 ; subpel_filters |
| phaddw m0, m1 |
| paddw m0, m5 ; pw34 |
| psraw m0, 6 |
| packuswb m0, m0 |
| movd [dstq+dsq*0], m0 |
| psrlq m0, 32 |
| movd [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w4_loop |
| RET |
| ; |
| %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] |
| %if ARCH_X86_32 |
| pshufb %2, %1, [base+subpel_h_shufB] |
| pshufb %3, %1, [base+subpel_h_shufC] |
| pshufb %1, [base+subpel_h_shufA] |
| %else |
| pshufb %2, %1, m11; subpel_h_shufB |
| pshufb %3, %1, m9 ; subpel_h_shufC |
| pshufb %1, m10 ; subpel_h_shufA |
| %endif |
| pmaddubsw %4, %2, m5 ; subpel +0 B0 |
| pmaddubsw %2, m6 ; subpel +4 B4 |
| pmaddubsw %3, m6 ; C4 |
| pmaddubsw %1, m5 ; A0 |
| paddw %3, %4 ; C4+B0 |
| paddw %1, %2 ; A0+B4 |
| phaddw %1, %3 |
| paddw %1, m7 ; pw34 |
| psraw %1, 6 |
| %endmacro |
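; The 8 taps are split across two coefficient registers (m5 = taps 0-3,
; m6 = taps 4-7): shufA/shufB cover output pixels 0-3 (A0 + B4) and
; shufB/shufC cover pixels 4-7 (B0 + C4); phaddw then folds each pair of
; partial words so the result holds pixels 0-3 in words 0-3 and pixels 4-7
; in words 4-7, before the +34 >> 6 rounding.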
| ; |
| .h_w8: |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*1] |
| PUT_8TAP_H m0, m2, m3, m4 |
| lea srcq, [srcq+ssq*2] |
| PUT_8TAP_H m1, m2, m3, m4 |
| packuswb m0, m1 |
| %if ARCH_X86_32 |
| movq [dstq ], m0 |
| add dstq, dsm |
| movhps [dstq ], m0 |
| add dstq, dsm |
| %else |
| movq [dstq+dsq*0], m0 |
| movhps [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| %endif |
| sub hd, 2 |
| jg .h_w8 |
| RET |
| .h_w16: |
| xor r6d, r6d |
| jmp .h_start |
| .h_w32: |
| mov r6, -16*1 |
| jmp .h_start |
| .h_w64: |
| mov r6, -16*3 |
| jmp .h_start |
| .h_w128: |
| mov r6, -16*7 |
| .h_start: |
| sub srcq, r6 |
| sub dstq, r6 |
| mov r4, r6 |
| .h_loop: |
| movu m0, [srcq+r6+8*0] |
| movu m1, [srcq+r6+8*1] |
| PUT_8TAP_H m0, m2, m3, m4 |
| PUT_8TAP_H m1, m2, m3, m4 |
| packuswb m0, m1 |
| mova [dstq+r6], m0 |
| add r6, mmsize |
| jle .h_loop |
| add srcq, ssq |
| %if ARCH_X86_32 |
| add dstq, dsm |
| %else |
| add dstq, dsq |
| %endif |
| mov r6, r4 |
| dec hd |
| jg .h_loop |
| RET |
| .v: |
| %if ARCH_X86_32 |
| movzx mxd, ssb |
| shr ssd, 16 |
| cmp hd, 4 |
| cmovle ssd, mxd |
| lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3] |
| %else |
| %assign stack_offset org_stack_offset |
| WIN64_SPILL_XMM 16 |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 4 |
| cmovle myd, mxd |
| lea myq, [base_reg+myq*8+subpel_filters-put_ssse3] |
| %endif |
| tzcnt r6d, wd |
| movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] |
| mova m7, [base+pw_512] |
| psrlw m2, m7, 1 ; 0x0100 |
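; m7 = pw_512 serves two purposes: pmulhrsw with it is the final
; (x + 32) >> 6 rounding, and shifted right by one it yields the 0x0100
; pshufb pattern that broadcasts each 2-tap pair of the vertical filter below.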
| add r6, base_reg |
| %if ARCH_X86_32 |
| %define subpel0 [rsp+mmsize*0] |
| %define subpel1 [rsp+mmsize*1] |
| %define subpel2 [rsp+mmsize*2] |
| %define subpel3 [rsp+mmsize*3] |
| %assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed |
| ALLOC_STACK -mmsize*4 |
| %assign regs_used 7 |
| movd m0, [ssq+0] |
| pshufb m0, m2 |
| mova subpel0, m0 |
| movd m0, [ssq+2] |
| pshufb m0, m2 |
| mova subpel1, m0 |
| movd m0, [ssq+4] |
| pshufb m0, m2 |
| mova subpel2, m0 |
| movd m0, [ssq+6] |
| pshufb m0, m2 |
| mova subpel3, m0 |
| mov ssq, [rstk+stack_offset+gprsize*4] |
| lea ssq, [ssq*3] |
| sub srcq, ssq |
| mov ssq, [rstk+stack_offset+gprsize*4] |
| mov dsq, [rstk+stack_offset+gprsize*2] |
| %else |
| %define subpel0 m8 |
| %define subpel1 m9 |
| %define subpel2 m10 |
| %define subpel3 m11 |
| movd subpel0, [myq+0] |
| pshufb subpel0, m2 |
| movd subpel1, [myq+2] |
| pshufb subpel1, m2 |
| movd subpel2, [myq+4] |
| pshufb subpel2, m2 |
| movd subpel3, [myq+6] |
| pshufb subpel3, m2 |
| lea ss3q, [ssq*3] |
| sub srcq, ss3q |
| %endif |
| jmp r6 |
| .v_w2: |
| movd m2, [srcq+ssq*0] ; 0 |
| pinsrw m2, [srcq+ssq*1], 2 ; 0 1 |
| pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2 |
| %if ARCH_X86_32 |
| lea srcq, [srcq+ssq*2] |
| add srcq, ssq |
| pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3 |
| add srcq, ssq |
| %else |
| pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3 |
| lea srcq, [srcq+ssq*4] |
| %endif |
| movd m3, [srcq+ssq*0] ; 4 |
| movd m1, [srcq+ssq*1] ; 5 |
| movd m0, [srcq+ssq*2] ; 6 |
| %if ARCH_X86_32 |
| lea srcq, [srcq+ssq*2] |
| add srcq, ssq |
| %else |
| add srcq, ss3q |
| %endif |
| punpckldq m3, m1 ; 4 5 _ _ |
| punpckldq m1, m0 ; 5 6 _ _ |
| palignr m4, m3, m2, 4 ; 1 2 3 4 |
| punpcklbw m3, m1 ; 45 56 |
| punpcklbw m1, m2, m4 ; 01 12 |
| punpckhbw m2, m4 ; 23 34 |
| .v_w2_loop: |
| pmaddubsw m5, m1, subpel0 ; a0 b0 |
| mova m1, m2 |
| pmaddubsw m2, subpel1 ; a1 b1 |
| paddw m5, m2 |
| mova m2, m3 |
| pmaddubsw m3, subpel2 ; a2 b2 |
| paddw m5, m3 |
| movd m4, [srcq+ssq*0] ; 7 |
| punpckldq m3, m0, m4 ; 6 7 _ _ |
| movd m0, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpckldq m4, m0 ; 7 8 _ _ |
| punpcklbw m3, m4 ; 67 78 |
| pmaddubsw m4, m3, subpel3 ; a3 b3 |
| paddw m5, m4 |
| pmulhrsw m5, m7 |
| packuswb m5, m5 |
| pshuflw m5, m5, q2020 |
| movd r6d, m5 |
| mov [dstq+dsq*0], r6w |
| shr r6d, 16 |
| mov [dstq+dsq*1], r6w |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w2_loop |
| RET |
| .v_w4: |
| %if ARCH_X86_32 |
| .v_w8: |
| .v_w16: |
| .v_w32: |
| .v_w64: |
| .v_w128: |
| %endif ; ARCH_X86_32 |
| lea r6d, [wq - 4] ; horizontal loop |
| mov r4, dstq |
| %if ARCH_X86_32 |
| %if STACK_ALIGNMENT < mmsize |
| %define srcm [rsp+mmsize*4+gprsize] |
| %endif |
| mov srcm, srcq |
| %else |
| mov r7, srcq |
| %endif |
| shl r6d, (16 - 2) ; (wq / 4) << 16 |
| mov r6w, hw |
| .v_w4_loop0: |
| movd m2, [srcq+ssq*0] ; 0 |
| movhps m2, [srcq+ssq*2] ; 0 _ 2 |
| movd m3, [srcq+ssq*1] ; 1 |
| %if ARCH_X86_32 |
| lea srcq, [srcq+ssq*2] |
| add srcq, ssq |
| movhps m3, [srcq+ssq*0] ; 1 _ 3 |
| lea srcq, [srcq+ssq*1] |
| %else |
| movhps m3, [srcq+ss3q ] ; 1 _ 3 |
| lea srcq, [srcq+ssq*4] |
| %endif |
| pshufd m2, m2, q2020 ; 0 2 0 2 |
| pshufd m3, m3, q2020 ; 1 3 1 3 |
| punpckldq m2, m3 ; 0 1 2 3 |
| movd m3, [srcq+ssq*0] ; 4 |
| movd m1, [srcq+ssq*1] ; 5 |
| movd m0, [srcq+ssq*2] ; 6 |
| %if ARCH_X86_32 |
| lea srcq, [srcq+ssq*2] |
| add srcq, ssq |
| %else |
| add srcq, ss3q |
| %endif |
| punpckldq m3, m1 ; 4 5 _ _ |
| punpckldq m1, m0 ; 5 6 _ _ |
| palignr m4, m3, m2, 4 ; 1 2 3 4 |
| punpcklbw m3, m1 ; 45 56 |
| punpcklbw m1, m2, m4 ; 01 12 |
| punpckhbw m2, m4 ; 23 34 |
| .v_w4_loop: |
| pmaddubsw m5, m1, subpel0 ; a0 b0 |
| mova m1, m2 |
| pmaddubsw m2, subpel1 ; a1 b1 |
| paddw m5, m2 |
| mova m2, m3 |
| pmaddubsw m3, subpel2 ; a2 b2 |
| paddw m5, m3 |
| movd m4, [srcq+ssq*0] |
| punpckldq m3, m0, m4 ; 6 7 _ _ |
| movd m0, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpckldq m4, m0 ; 7 8 _ _ |
| punpcklbw m3, m4 ; 67 78 |
| pmaddubsw m4, m3, subpel3 ; a3 b3 |
| paddw m5, m4 |
| pmulhrsw m5, m7 |
| packuswb m5, m5 |
| movd [dstq+dsq*0], m5 |
| pshufd m5, m5, q0101 |
| movd [dstq+dsq*1], m5 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w4_loop |
| mov hw, r6w ; reset vertical loop |
| add r4, 4 |
| mov dstq, r4 |
| %if ARCH_X86_32 |
| mov srcq, srcm |
| add srcq, 4 |
| mov srcm, srcq |
| %else |
| add r7, 4 |
| mov srcq, r7 |
| %endif |
| sub r6d, 1<<16 ; horizontal-- |
| jg .v_w4_loop0 |
| RET |
| %if ARCH_X86_64 |
| .v_w8: |
| .v_w16: |
| .v_w32: |
| .v_w64: |
| .v_w128: |
| lea r6d, [wq - 8] ; horizontal loop |
| mov r4, dstq |
| mov r7, srcq |
| shl r6d, 8 - 3; (wq / 8) << 8 |
| mov r6b, hb |
| .v_w8_loop0: |
| movq m4, [srcq+ssq*0] ; 0 |
| movq m5, [srcq+ssq*1] ; 1 |
| lea srcq, [srcq+ssq*2] |
| movq m6, [srcq+ssq*0] ; 2 |
| movq m0, [srcq+ssq*1] ; 3 |
| lea srcq, [srcq+ssq*2] |
| movq m1, [srcq+ssq*0] ; 4 |
| movq m2, [srcq+ssq*1] ; 5 |
| lea srcq, [srcq+ssq*2] ; |
| movq m3, [srcq+ssq*0] ; 6 |
| shufpd m4, m0, 0x0c |
| shufpd m5, m1, 0x0c |
| punpcklbw m1, m4, m5 ; 01 |
| punpckhbw m4, m5 ; 34 |
| shufpd m6, m2, 0x0c |
| punpcklbw m2, m5, m6 ; 12 |
| punpckhbw m5, m6 ; 45 |
| shufpd m0, m3, 0x0c |
| punpcklbw m3, m6, m0 ; 23 |
| punpckhbw m6, m0 ; 56 |
| .v_w8_loop: |
| movq m12, [srcq+ssq*1] ; 8 |
| lea srcq, [srcq+ssq*2] |
| movq m13, [srcq+ssq*0] ; 9 |
| pmaddubsw m14, m1, subpel0 ; a0 |
| pmaddubsw m15, m2, subpel0 ; b0 |
| mova m1, m3 |
| mova m2, m4 |
| pmaddubsw m3, subpel1 ; a1 |
| pmaddubsw m4, subpel1 ; b1 |
| paddw m14, m3 |
| paddw m15, m4 |
| mova m3, m5 |
| mova m4, m6 |
| pmaddubsw m5, subpel2 ; a2 |
| pmaddubsw m6, subpel2 ; b2 |
| paddw m14, m5 |
| paddw m15, m6 |
| shufpd m6, m0, m12, 0x0d |
| shufpd m0, m12, m13, 0x0c |
| punpcklbw m5, m6, m0 ; 67 |
| punpckhbw m6, m0 ; 78 |
| pmaddubsw m12, m5, subpel3 ; a3 |
| pmaddubsw m13, m6, subpel3 ; b3 |
| paddw m14, m12 |
| paddw m15, m13 |
| pmulhrsw m14, m7 |
| pmulhrsw m15, m7 |
| packuswb m14, m15 |
| movq [dstq+dsq*0], xm14 |
| movhps [dstq+dsq*1], xm14 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w8_loop |
| movzx hd, r6b ; reset vertical loop |
| add r4, 8 |
| add r7, 8 |
| mov dstq, r4 |
| mov srcq, r7 |
| sub r6d, 1<<8 ; horizontal-- |
| jg .v_w8_loop0 |
| RET |
| %endif ;ARCH_X86_64 |
| %undef subpel0 |
| %undef subpel1 |
| %undef subpel2 |
| %undef subpel3 |
| .hv: |
| %assign stack_offset org_stack_offset |
| cmp wd, 4 |
| jg .hv_w8 |
| and mxd, 0xff |
| dec srcq |
| movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] |
| %if ARCH_X86_32 |
| movzx mxd, ssb |
| shr ssd, 16 |
| cmp hd, 4 |
| cmovle ssd, mxd |
| movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] |
| W32_RESTORE_SSQ |
| lea r6, [ssq*3] |
| sub srcq, r6 |
| %define base_reg r6 |
| mov r6, r1; use as new base |
| %assign regs_used 2 |
| ALLOC_STACK -mmsize*14 |
| %assign regs_used 7 |
| mov dsq, [rstk+stack_offset+gprsize*2] |
| %define subpelv0 [rsp+mmsize*0] |
| %define subpelv1 [rsp+mmsize*1] |
| %define subpelv2 [rsp+mmsize*2] |
| %define subpelv3 [rsp+mmsize*3] |
| punpcklqdq m0, m0 |
| punpcklbw m0, m0 |
| psraw m0, 8 ; sign-extend |
| pshufd m6, m0, q0000 |
| mova subpelv0, m6 |
| pshufd m6, m0, q1111 |
| mova subpelv1, m6 |
| pshufd m6, m0, q2222 |
| mova subpelv2, m6 |
| pshufd m6, m0, q3333 |
| mova subpelv3, m6 |
| %else |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 4 |
| cmovle myd, mxd |
| movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] |
| ALLOC_STACK mmsize*14, 14 |
| lea ss3q, [ssq*3] |
| sub srcq, ss3q |
| %define subpelv0 m10 |
| %define subpelv1 m11 |
| %define subpelv2 m12 |
| %define subpelv3 m13 |
| punpcklqdq m0, m0 |
| punpcklbw m0, m0 |
| psraw m0, 8 ; sign-extend |
| mova m8, [base+pw_8192] |
| mova m9, [base+pd_512] |
| pshufd m10, m0, q0000 |
| pshufd m11, m0, q1111 |
| pshufd m12, m0, q2222 |
| pshufd m13, m0, q3333 |
| %endif |
| pshufd m7, m1, q0000 |
| cmp wd, 4 |
| je .hv_w4 |
| .hv_w2: |
| mova m6, [base+subpel_h_shuf4] |
| ; |
| movq m2, [srcq+ssq*0] ; 0 |
| movhps m2, [srcq+ssq*1] ; 0 _ 1 |
| movq m0, [srcq+ssq*2] ; 2 |
| %if ARCH_X86_32 |
| %define w8192reg [base+pw_8192] |
| %define d512reg [base+pd_512] |
| lea srcq, [srcq+ssq*2] |
| add srcq, ssq |
| movhps m0, [srcq+ssq*0] ; 2 _ 3 |
| lea srcq, [srcq+ssq*1] |
| %else |
| %define w8192reg m8 |
| %define d512reg m9 |
| movhps m0, [srcq+ss3q ] ; 2 _ 3 |
| lea srcq, [srcq+ssq*4] |
| %endif |
| pshufb m2, m6 ; 0 ~ 1 ~ |
| pshufb m0, m6 ; 2 ~ 3 ~ |
| pmaddubsw m2, m7 ; subpel_filters |
| pmaddubsw m0, m7 ; subpel_filters |
| phaddw m2, m0 ; 0 1 2 3 |
| pmulhrsw m2, w8192reg |
| ; |
| movq m3, [srcq+ssq*0] ; 4 |
| movhps m3, [srcq+ssq*1] ; 4 _ 5 |
| movq m0, [srcq+ssq*2] ; 6 |
| %if ARCH_X86_32 |
| lea srcq, [srcq+ssq*2] |
| add srcq, ssq |
| %else |
| add srcq, ss3q |
| %endif |
| pshufb m3, m6 ; 4 ~ 5 ~ |
| pshufb m0, m6 ; 6 ~ |
| pmaddubsw m3, m7 ; subpel_filters |
| pmaddubsw m0, m7 ; subpel_filters |
| phaddw m3, m0 ; 4 5 6 _ |
| pmulhrsw m3, w8192reg |
| ; |
| palignr m4, m3, m2, 4; V 1 2 3 4 |
| punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2 |
| punpckhwd m2, m4 ; V 23 34 2 3 3 4 |
| pshufd m0, m3, q2121; V 5 6 5 6 |
| punpcklwd m3, m0 ; V 45 56 4 5 5 6 |
| .hv_w2_loop: |
| pmaddwd m5, m1, subpelv0; V a0 b0 |
| mova m1, m2 ; V |
| pmaddwd m2, subpelv1 ; V a1 b1 |
| paddd m5, m2 ; V |
| mova m2, m3 ; V |
| pmaddwd m3, subpelv2 ; a2 b2 |
| paddd m5, m3 ; V |
| movq m4, [srcq+ssq*0] ; V 7 |
| movhps m4, [srcq+ssq*1] ; V 7 8 |
| lea srcq, [srcq+ssq*2] ; V |
| pshufb m4, m6 |
| pmaddubsw m4, m7 |
| phaddw m4, m4 |
| pmulhrsw m4, w8192reg |
| palignr m3, m4, m0, 12 |
| mova m0, m4 |
| punpcklwd m3, m0 ; V 67 78 |
| pmaddwd m4, m3, subpelv3 ; V a3 b3 |
| paddd m5, d512reg |
| paddd m5, m4 |
| psrad m5, 10 |
| packssdw m5, m5 |
| packuswb m5, m5 |
| movd r4d, m5 |
| mov [dstq+dsq*0], r4w |
| shr r4d, 16 |
| mov [dstq+dsq*1], r4w |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w2_loop |
| RET |
| %undef w8192reg |
| %undef d512reg |
| ; |
| .hv_w4: |
| %define hv4_line_0_0 4 |
| %define hv4_line_0_1 5 |
| %define hv4_line_0_2 6 |
| %define hv4_line_0_3 7 |
| %define hv4_line_0_4 8 |
| %define hv4_line_0_5 9 |
| %define hv4_line_1_0 10 |
| %define hv4_line_1_1 11 |
| %define hv4_line_1_2 12 |
| %define hv4_line_1_3 13 |
| ; |
| %macro SAVELINE_W4 3 |
| mova [rsp+mmsize*hv4_line_%3_%2], %1 |
| %endmacro |
| %macro RESTORELINE_W4 3 |
| mova %1, [rsp+mmsize*hv4_line_%3_%2] |
| %endmacro |
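; SAVELINE_W4/RESTORELINE_W4 spill and reload the per-line registers to the
; hv4_line_* stack slots above, letting the 4-wide hv loop run the lower- and
; upper-shuffle pipelines back to back while reusing the same small set of
; xmm registers.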
| ; |
| %if ARCH_X86_32 |
| %define w8192reg [base+pw_8192] |
| %define d512reg [base+pd_512] |
| %else |
| %define w8192reg m8 |
| %define d512reg m9 |
| %endif |
| ; lower shuffle 0 1 2 3 4 |
| mova m6, [base+subpel_h_shuf4] |
| movq m5, [srcq+ssq*0] ; 0 _ _ _ |
| movhps m5, [srcq+ssq*1] ; 0 _ 1 _ |
| movq m4, [srcq+ssq*2] ; 2 _ _ _ |
| %if ARCH_X86_32 |
| lea srcq, [srcq+ssq*2] |
| add srcq, ssq |
| movhps m4, [srcq+ssq*0] ; 2 _ 3 _ |
| add srcq, ssq |
| %else |
| movhps m4, [srcq+ss3q ] ; 2 _ 3 _ |
| lea srcq, [srcq+ssq*4] |
| %endif |
| pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ |
| pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ |
| pmaddubsw m2, m7 ;H subpel_filters |
| pmaddubsw m0, m7 ;H subpel_filters |
| phaddw m2, m0 ;H 0 1 2 3 |
| pmulhrsw m2, w8192reg ;H pw_8192 |
| SAVELINE_W4 m2, 2, 0 |
| ; upper shuffle 2 3 4 5 6 |
| mova m6, [base+subpel_h_shuf4+16] |
| pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ |
| pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ |
| pmaddubsw m2, m7 ;H subpel_filters |
| pmaddubsw m0, m7 ;H subpel_filters |
| phaddw m2, m0 ;H 0 1 2 3 |
| pmulhrsw m2, w8192reg ;H pw_8192 |
| ; |
| ; lower shuffle |
| mova m6, [base+subpel_h_shuf4] |
| movq m5, [srcq+ssq*0] ; 4 _ _ _ |
| movhps m5, [srcq+ssq*1] ; 4 _ 5 _ |
| movq m4, [srcq+ssq*2] ; 6 _ _ _ |
| pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ |
| pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ |
| pmaddubsw m3, m7 ;H subpel_filters |
| pmaddubsw m0, m7 ;H subpel_filters |
| phaddw m3, m0 ;H 4 5 6 7 |
| pmulhrsw m3, w8192reg ;H pw_8192 |
| SAVELINE_W4 m3, 3, 0 |
| ; upper shuffle |
| mova m6, [base+subpel_h_shuf4+16] |
| pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ |
| pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ |
| pmaddubsw m3, m7 ;H subpel_filters |
| pmaddubsw m0, m7 ;H subpel_filters |
| phaddw m3, m0 ;H 4 5 6 7 |
| pmulhrsw m3, w8192reg ;H pw_8192 |
| ; |
| %if ARCH_X86_32 |
| lea srcq, [srcq+ssq*2] |
| add srcq, ssq |
| %else |
| add srcq, ss3q |
| %endif |
| ;process high |
| palignr m4, m3, m2, 4;V 1 2 3 4 |
| punpcklwd m1, m2, m4 ; V 01 12 |
| punpckhwd m2, m4 ; V 23 34 |
| pshufd m0, m3, q2121;V 5 6 5 6 |
| punpcklwd m3, m0 ; V 45 56 |
| SAVELINE_W4 m0, 0, 1 |
| SAVELINE_W4 m1, 1, 1 |
| SAVELINE_W4 m2, 2, 1 |
| SAVELINE_W4 m3, 3, 1 |
| ;process low |
| RESTORELINE_W4 m2, 2, 0 |
| RESTORELINE_W4 m3, 3, 0 |
| palignr m4, m3, m2, 4;V 1 2 3 4 |
| punpcklwd m1, m2, m4 ; V 01 12 |
| punpckhwd m2, m4 ; V 23 34 |
| pshufd m0, m3, q2121;V 5 6 5 6 |
| punpcklwd m3, m0 ; V 45 56 |
| .hv_w4_loop: |
| ;process low |
| pmaddwd m5, m1, subpelv0 ; V a0 b0 |
| mova m1, m2 |
| pmaddwd m2, subpelv1; V a1 b1 |
| paddd m5, m2 |
| mova m2, m3 |
| pmaddwd m3, subpelv2; V a2 b2 |
| paddd m5, m3 |
| ; |
| mova m6, [base+subpel_h_shuf4] |
| movq m4, [srcq+ssq*0] ; 7 |
| movhps m4, [srcq+ssq*1] ; 7 _ 8 _ |
| pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ |
| pmaddubsw m4, m7 ;H subpel_filters |
| phaddw m4, m4 ;H 7 8 7 8 |
| pmulhrsw m4, w8192reg ;H pw_8192 |
| palignr m3, m4, m0, 12 ; 6 7 8 7 |
| mova m0, m4 |
| punpcklwd m3, m4 ; 67 78 |
| pmaddwd m4, m3, subpelv3; a3 b3 |
| paddd m5, d512reg ; pd_512 |
| paddd m5, m4 |
| psrad m5, 10 |
| SAVELINE_W4 m0, 0, 0 |
| SAVELINE_W4 m1, 1, 0 |
| SAVELINE_W4 m2, 2, 0 |
| SAVELINE_W4 m3, 3, 0 |
| SAVELINE_W4 m5, 5, 0 |
| ;process high |
| RESTORELINE_W4 m0, 0, 1 |
| RESTORELINE_W4 m1, 1, 1 |
| RESTORELINE_W4 m2, 2, 1 |
| RESTORELINE_W4 m3, 3, 1 |
| pmaddwd m5, m1, subpelv0; V a0 b0 |
| mova m1, m2 |
| pmaddwd m2, subpelv1; V a1 b1 |
| paddd m5, m2 |
| mova m2, m3 |
| pmaddwd m3, subpelv2; V a2 b2 |
| paddd m5, m3 |
| ; |
| mova m6, [base+subpel_h_shuf4+16] |
| movq m4, [srcq+ssq*0] ; 7 |
| movhps m4, [srcq+ssq*1] ; 7 _ 8 _ |
| pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ |
| pmaddubsw m4, m7 ;H subpel_filters |
| phaddw m4, m4 ;H 7 8 7 8 |
| pmulhrsw m4, w8192reg ;H pw_8192 |
| palignr m3, m4, m0, 12 ; 6 7 8 7 |
| mova m0, m4 |
| punpcklwd m3, m4 ; 67 78 |
| pmaddwd m4, m3, subpelv3; a3 b3 |
| paddd m5, d512reg ; pd_512 |
| paddd m5, m4 |
| psrad m4, m5, 10 |
| ; |
| RESTORELINE_W4 m5, 5, 0 |
| packssdw m5, m4 ; d -> w |
| packuswb m5, m5 ; w -> b |
| pshuflw m5, m5, q3120 |
| lea srcq, [srcq+ssq*2] |
| movd [dstq+dsq*0], m5 |
| psrlq m5, 32 |
| movd [dstq+dsq*1], m5 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| SAVELINE_W4 m0, 0, 1 |
| SAVELINE_W4 m1, 1, 1 |
| SAVELINE_W4 m2, 2, 1 |
| SAVELINE_W4 m3, 3, 1 |
| RESTORELINE_W4 m0, 0, 0 |
| RESTORELINE_W4 m1, 1, 0 |
| RESTORELINE_W4 m2, 2, 0 |
| RESTORELINE_W4 m3, 3, 0 |
| jg .hv_w4_loop |
| RET |
| %undef subpelv0 |
| %undef subpelv1 |
| %undef subpelv2 |
| %undef subpelv3 |
| ; |
| .hv_w8: |
| %assign stack_offset org_stack_offset |
| %define hv8_line_1 0 |
| %define hv8_line_2 1 |
| %define hv8_line_3 2 |
| %define hv8_line_4 3 |
| %define hv8_line_6 4 |
| %macro SAVELINE_W8 2 |
| mova [rsp+hv8_line_%1*mmsize], %2 |
| %endmacro |
| %macro RESTORELINE_W8 2 |
| mova %2, [rsp+hv8_line_%1*mmsize] |
| %endmacro |
| shr mxd, 16 |
| sub srcq, 3 |
| %if ARCH_X86_32 |
| %define base_reg r1 |
| %define subpelh0 [rsp+mmsize*5] |
| %define subpelh1 [rsp+mmsize*6] |
| %define subpelv0 [rsp+mmsize*7] |
| %define subpelv1 [rsp+mmsize*8] |
| %define subpelv2 [rsp+mmsize*9] |
| %define subpelv3 [rsp+mmsize*10] |
| %define accuv0 [rsp+mmsize*11] |
| %define accuv1 [rsp+mmsize*12] |
| movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3] |
| movzx mxd, ssb |
| shr ssd, 16 |
| cmp hd, 4 |
| cmovle ssd, mxd |
| movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3] |
| mov ssq, ssmp |
| ALLOC_STACK -mmsize*13 |
| %if STACK_ALIGNMENT < 16 |
| %define srcm [rsp+mmsize*13+gprsize*1] |
| %define dsm [rsp+mmsize*13+gprsize*2] |
| mov r6, [rstk+stack_offset+gprsize*2] |
| mov dsm, r6 |
| %endif |
| pshufd m0, m1, q0000 |
| pshufd m1, m1, q1111 |
| punpcklbw m5, m5 |
| psraw m5, 8 ; sign-extend |
| pshufd m2, m5, q0000 |
| pshufd m3, m5, q1111 |
| pshufd m4, m5, q2222 |
| pshufd m5, m5, q3333 |
| mova subpelh0, m0 |
| mova subpelh1, m1 |
| mova subpelv0, m2 |
| mova subpelv1, m3 |
| mova subpelv2, m4 |
| mova subpelv3, m5 |
| lea r6, [ssq*3] |
| sub srcq, r6 |
| mov srcm, srcq |
| %else |
| ALLOC_STACK mmsize*5, 16 |
| %define subpelh0 m10 |
| %define subpelh1 m11 |
| %define subpelv0 m12 |
| %define subpelv1 m13 |
| %define subpelv2 m14 |
| %define subpelv3 m15 |
| %define accuv0 m8 |
| %define accuv1 m9 |
| movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3] |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 4 |
| cmovle myd, mxd |
| movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] |
| pshufd subpelh0, m0, q0000 |
| pshufd subpelh1, m0, q1111 |
| punpcklqdq m1, m1 |
| punpcklbw m1, m1 |
| psraw m1, 8 ; sign-extend |
| pshufd subpelv0, m1, q0000 |
| pshufd subpelv1, m1, q1111 |
| pshufd subpelv2, m1, q2222 |
| pshufd subpelv3, m1, q3333 |
| lea ss3q, [ssq*3] |
| sub srcq, ss3q |
| mov r7, srcq |
| %endif |
| lea r6d, [wq-4] |
| mov r4, dstq |
| shl r6d, (16 - 2) |
| mov r6w, hw |
| .hv_w8_loop0: |
| movu m4, [srcq+ssq*0] ; 0 = _ _ |
| movu m5, [srcq+ssq*1] ; 1 = _ _ |
| lea srcq, [srcq+ssq*2] |
| ; |
| %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] |
| %if ARCH_X86_32 |
| pshufb %3, %1, [base+subpel_h_shufB] |
| pshufb %4, %1, [base+subpel_h_shufC] |
| pshufb %1, [base+subpel_h_shufA] |
| %else |
| pshufb %3, %1, %6 ; subpel_h_shufB |
| pshufb %4, %1, %7 ; subpel_h_shufC |
| pshufb %1, %5 ; subpel_h_shufA |
| %endif |
pmaddubsw %2, %3, subpelh0 ; subpel +0 B0
pmaddubsw %4, subpelh1; subpel +4 C4
pmaddubsw %3, subpelh1; B4
pmaddubsw %1, subpelh0; A0
paddw %2, %4 ; B0+C4
paddw %1, %3 ; A0+B4
| phaddw %1, %2 |
| %endmacro |
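; Same tap split as PUT_8TAP_H above, but the rounding is left to the caller:
; the phaddw output is scaled with pw_8192 (pmulhrsw == (x + 2) >> 2) to keep
; extra horizontal precision, and the vertical pass rounds with pd_512 and
; psrad by 10.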
| ; |
| %if ARCH_X86_64 |
| mova m7, [base+subpel_h_shufA] |
| mova m8, [base+subpel_h_shufB] |
| mova m9, [base+subpel_h_shufC] |
| %endif |
| HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ |
| HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ |
| movu m6, [srcq+ssq*0] ; 2 = _ _ |
| movu m0, [srcq+ssq*1] ; 3 = _ _ |
| lea srcq, [srcq+ssq*2] |
| HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ |
| HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ |
| ; |
| mova m7, [base+pw_8192] |
| pmulhrsw m4, m7 ; H pw_8192 |
| pmulhrsw m5, m7 ; H pw_8192 |
| pmulhrsw m6, m7 ; H pw_8192 |
| pmulhrsw m0, m7 ; H pw_8192 |
| punpcklwd m1, m4, m5 ; 0 1 ~ |
| punpcklwd m2, m5, m6 ; 1 2 ~ |
| punpcklwd m3, m6, m0 ; 2 3 ~ |
| SAVELINE_W8 1, m1 |
| SAVELINE_W8 2, m2 |
| SAVELINE_W8 3, m3 |
| ; |
| mova m7, [base+subpel_h_shufA] |
| movu m4, [srcq+ssq*0] ; 4 = _ _ |
| movu m5, [srcq+ssq*1] ; 5 = _ _ |
| lea srcq, [srcq+ssq*2] |
| movu m6, [srcq+ssq*0] ; 6 = _ _ |
| HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ |
| HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ |
| HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ |
| mova m7, [base+pw_8192] |
| pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ |
| pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ |
| pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ |
| punpcklwd m4, m0, m1 ; 3 4 ~ |
| punpcklwd m5, m1, m2 ; 4 5 ~ |
| punpcklwd m6, m2, m3 ; 5 6 ~ |
| ; |
| SAVELINE_W8 6, m3 |
| RESTORELINE_W8 1, m1 |
| RESTORELINE_W8 2, m2 |
| RESTORELINE_W8 3, m3 |
| .hv_w8_loop: |
| ; m8 accu for V a |
| ; m9 accu for V b |
| SAVELINE_W8 1, m3 |
| SAVELINE_W8 2, m4 |
| SAVELINE_W8 3, m5 |
| SAVELINE_W8 4, m6 |
| %if ARCH_X86_32 |
| pmaddwd m0, m1, subpelv0 ; a0 |
| pmaddwd m7, m2, subpelv0 ; b0 |
| pmaddwd m3, subpelv1 ; a1 |
| pmaddwd m4, subpelv1 ; b1 |
| paddd m0, m3 |
| paddd m7, m4 |
| pmaddwd m5, subpelv2 ; a2 |
| pmaddwd m6, subpelv2 ; b2 |
| paddd m0, m5 |
| paddd m7, m6 |
| mova m5, [base+pd_512] |
| paddd m0, m5 ; pd_512 |
| paddd m7, m5 ; pd_512 |
| mova accuv0, m0 |
| mova accuv1, m7 |
| %else |
| pmaddwd m8, m1, subpelv0 ; a0 |
| pmaddwd m9, m2, subpelv0 ; b0 |
| pmaddwd m3, subpelv1 ; a1 |
| pmaddwd m4, subpelv1 ; b1 |
| paddd m8, m3 |
| paddd m9, m4 |
| pmaddwd m5, subpelv2 ; a2 |
| pmaddwd m6, subpelv2 ; b2 |
| paddd m8, m5 |
| paddd m9, m6 |
| mova m7, [base+pd_512] |
| paddd m8, m7 ; pd_512 |
| paddd m9, m7 ; pd_512 |
| mova m7, [base+subpel_h_shufB] |
| mova m6, [base+subpel_h_shufC] |
| mova m5, [base+subpel_h_shufA] |
| %endif |
| movu m0, [srcq+ssq*1] ; 7 |
| movu m4, [srcq+ssq*2] ; 8 |
| lea srcq, [srcq+ssq*2] |
| HV_H_W8 m0, m1, m2, m3, m5, m7, m6 |
| HV_H_W8 m4, m1, m2, m3, m5, m7, m6 |
| mova m5, [base+pw_8192] |
| pmulhrsw m0, m5 ; H pw_8192 |
| pmulhrsw m4, m5 ; H pw_8192 |
| RESTORELINE_W8 6, m6 |
| punpcklwd m5, m6, m0 ; 6 7 ~ |
| punpcklwd m6, m0, m4 ; 7 8 ~ |
| pmaddwd m1, m5, subpelv3 ; a3 |
| paddd m2, m1, accuv0 |
| pmaddwd m1, m6, subpelv3 ; b3 |
| paddd m1, m1, accuv1 ; H + V |
| psrad m2, 10 |
| psrad m1, 10 |
| packssdw m2, m1 ; d -> w |
| packuswb m2, m1 ; w -> b |
| movd [dstq+dsq*0], m2 |
| psrlq m2, 32 |
| %if ARCH_X86_32 |
| add dstq, dsm |
| movd [dstq+dsq*0], m2 |
| add dstq, dsm |
| %else |
| movd [dstq+dsq*1], m2 |
| lea dstq, [dstq+dsq*2] |
| %endif |
| sub hd, 2 |
| jle .hv_w8_outer |
| SAVELINE_W8 6, m4 |
| RESTORELINE_W8 1, m1 |
| RESTORELINE_W8 2, m2 |
| RESTORELINE_W8 3, m3 |
| RESTORELINE_W8 4, m4 |
| jmp .hv_w8_loop |
| .hv_w8_outer: |
| movzx hd, r6w |
| add r4, 4 |
| mov dstq, r4 |
| %if ARCH_X86_32 |
| mov srcq, srcm |
| add srcq, 4 |
| mov srcm, srcq |
| %else |
| add r7, 4 |
| mov srcq, r7 |
| %endif |
| sub r6d, 1<<16 |
| jg .hv_w8_loop0 |
| RET |
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 1, 2 |
| %elif WIN64 |
| DECLARE_REG_TMP 6, 4 |
| %else |
| DECLARE_REG_TMP 6, 7 |
| %endif |
| %macro PREP_8TAP_FN 3 ; type, type_h, type_v |
| cglobal prep_8tap_%1 |
| mov t0d, FILTER_%2 |
| mov t1d, FILTER_%3 |
| %ifnidn %1, sharp_smooth ; skip the jump in the last filter |
| jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX) |
| %endif |
| %endmacro |
| |
| PREP_8TAP_FN regular, REGULAR, REGULAR |
| PREP_8TAP_FN regular_sharp, REGULAR, SHARP |
| PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH |
| PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR |
| PREP_8TAP_FN smooth, SMOOTH, SMOOTH |
| PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP |
| PREP_8TAP_FN sharp_regular, SHARP, REGULAR |
| PREP_8TAP_FN sharp, SHARP, SHARP |
| PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH |
| |
| %if ARCH_X86_32 |
| %define base_reg r2 |
| %define base base_reg-prep_ssse3 |
| %define W32_RESTORE_SSQ mov strideq, stridem |
| %else |
| %define base_reg r7 |
| %define base 0 |
| %define W32_RESTORE_SSQ |
| %endif |
| |
| cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 |
| %assign org_stack_offset stack_offset |
| imul mxd, mxm, 0x010101 |
| add mxd, t0d ; 8tap_h, mx, 4tap_h |
| imul myd, mym, 0x010101 |
| add myd, t1d ; 8tap_v, my, 4tap_v |
| movsxd wq, wm |
| movifnidn srcd, srcm |
| movifnidn hd, hm |
| LEA base_reg, prep_ssse3 |
| test mxd, 0xf00 |
| jnz .h |
| test myd, 0xf00 |
| jnz .v |
| tzcnt wd, wd |
| movzx wd, word [base_reg+wq*2+table_offset(prep,)] |
| add wq, base_reg |
| movifnidn strided, stridem |
| lea r6, [strideq*3] |
| %assign stack_offset org_stack_offset |
| %if WIN64 |
| pop r8 |
| pop r7 |
| %endif |
| jmp wq |
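| ; (dispatch note: wd holds log2(w); the table stores 16-bit offsets of the |
| ; per-width entry points relative to the base label in base_reg, so the |
| ; word load plus "add wq, base_reg" forms the absolute jump target) |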
| .h: |
| test myd, 0xf00 |
| jnz .hv |
| WIN64_SPILL_XMM 12 |
| cmp wd, 4 |
| je .h_w4 |
| tzcnt wd, wd |
| %if ARCH_X86_64 |
| mova m10, [base+subpel_h_shufA] |
| mova m11, [base+subpel_h_shufB] |
| mova m9, [base+subpel_h_shufC] |
| %endif |
| shr mxd, 16 |
| sub srcq, 3 |
| movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] |
| movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0] |
| pshufd m5, m5, q0000 |
| movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4] |
| pshufd m6, m6, q0000 |
| mova m7, [base+pw_8192] |
| add wq, base_reg |
| jmp wq |
| .h_w4: |
| %if ARCH_X86_32 |
| and mxd, 0xff |
| %else |
| movzx mxd, mxb |
| %endif |
| dec srcq |
| movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] |
| pshufd m4, m4, q0000 |
| mova m6, [base+pw_8192] |
| mova m5, [base+subpel_h_shufA] |
| W32_RESTORE_SSQ |
| %if ARCH_X86_64 |
| lea stride3q, [strideq*3] |
| %endif |
| .h_w4_loop: |
| movq m0, [srcq+strideq*0] ; 0 |
| movq m1, [srcq+strideq*1] ; 1 |
| %if ARCH_X86_32 |
| lea srcq, [srcq+strideq*2] |
| movq m2, [srcq+strideq*0] ; 2 |
| movq m3, [srcq+strideq*1] ; 3 |
| lea srcq, [srcq+strideq*2] |
| %else |
| movq m2, [srcq+strideq*2] ; 2 |
| movq m3, [srcq+stride3q ] ; 3 |
| lea srcq, [srcq+strideq*4] |
| %endif |
| pshufb m0, m5 ; subpel_h_shufA |
| pshufb m1, m5 |
| pshufb m2, m5 |
| pshufb m3, m5 |
| pmaddubsw m0, m4 ; subpel_filters + 2 |
| pmaddubsw m1, m4 |
| pmaddubsw m2, m4 |
| pmaddubsw m3, m4 |
| phaddw m0, m1 |
| phaddw m2, m3 |
| pmulhrsw m0, m6 ; pw_8192 |
| pmulhrsw m2, m6 ; pw_8192 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m2 |
| add tmpq, 32 |
| sub hd, 4 |
| jg .h_w4_loop |
| RET |
| ; |
| %macro PREP_8TAP_H 4 ; dst/src, tmp[1-3] |
| %if ARCH_X86_32 |
| pshufb %2, %1, [base+subpel_h_shufB] |
| pshufb %3, %1, [base+subpel_h_shufC] |
| pshufb %1, [base+subpel_h_shufA] |
| %else |
| pshufb %2, %1, m11; subpel_h_shufB |
| pshufb %3, %1, m9 ; subpel_h_shufC |
| pshufb %1, m10 ; subpel_h_shufA |
| %endif |
| pmaddubsw %4, %2, m5 ; subpel +0 B0 |
| pmaddubsw %2, m6 ; subpel +4 B4 |
| pmaddubsw %3, m6 ; subpel +4 C4 |
| pmaddubsw %1, m5 ; subpel +0 A0 |
| paddw %3, %4 |
| paddw %1, %2 |
| phaddw %1, %3 |
| pmulhrsw %1, m7 ; 8192 |
| %endmacro |
| ; |
| .h_w8: |
| %if ARCH_X86_32 |
| mov r3, r2 |
| %define base_reg r3 |
| W32_RESTORE_SSQ |
| %endif |
| .h_w8_loop: |
| movu m0, [srcq+strideq*0] |
| movu m1, [srcq+strideq*1] |
| lea srcq, [srcq+strideq*2] |
| PREP_8TAP_H m0, m2, m3, m4 |
| PREP_8TAP_H m1, m2, m3, m4 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| add tmpq, 32 |
| sub hd, 2 |
| jg .h_w8_loop |
| RET |
| .h_w16: |
| xor r6d, r6d |
| jmp .h_start |
| .h_w32: |
| mov r6, -16*1 |
| jmp .h_start |
| .h_w64: |
| mov r6, -16*3 |
| jmp .h_start |
| .h_w128: |
| mov r6, -16*7 |
| .h_start: |
| %if ARCH_X86_32 |
| mov r3, r2 |
| %define base_reg r3 |
| %endif |
| sub srcq, r6 |
| mov r5, r6 |
| W32_RESTORE_SSQ |
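| ; (r6 starts at -(w-16) bytes and srcq was advanced by the same amount, |
| ; so [srcq+r6] walks the row in 16-px chunks; the jle below keeps looping |
| ; until the final chunk at offset 0 has been processed) |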
| .h_loop: |
| movu m0, [srcq+r6+8*0] |
| movu m1, [srcq+r6+8*1] |
| PREP_8TAP_H m0, m2, m3, m4 |
| PREP_8TAP_H m1, m2, m3, m4 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| add tmpq, 32 |
| add r6, 16 |
| jle .h_loop |
| add srcq, strideq |
| mov r6, r5 |
| dec hd |
| jg .h_loop |
| RET |
| %if ARCH_X86_32 |
| %define base_reg r2 |
| %endif |
| |
| .v: |
| %if ARCH_X86_32 |
| mov mxd, myd |
| and mxd, 0xff |
| %else |
| %assign stack_offset org_stack_offset |
| WIN64_SPILL_XMM 16 |
| movzx mxd, myb |
| %endif |
| shr myd, 16 |
| cmp hd, 4 |
| cmovle myd, mxd |
| lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3] |
| mova m2, [base+pw_512] |
| psrlw m2, m2, 1 ; 0x0100 |
| mova m7, [base+pw_8192] |
| %if ARCH_X86_32 |
| %define subpel0 [rsp+mmsize*0] |
| %define subpel1 [rsp+mmsize*1] |
| %define subpel2 [rsp+mmsize*2] |
| %define subpel3 [rsp+mmsize*3] |
| %assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed |
| ALLOC_STACK -mmsize*4 |
| %assign regs_used 7 |
| movd m0, [myq+0] |
| pshufb m0, m2 |
| mova subpel0, m0 |
| movd m0, [myq+2] |
| pshufb m0, m2 |
| mova subpel1, m0 |
| movd m0, [myq+4] |
| pshufb m0, m2 |
| mova subpel2, m0 |
| movd m0, [myq+6] |
| pshufb m0, m2 |
| mova subpel3, m0 |
| mov strideq, [rstk+stack_offset+gprsize*3] |
| lea strideq, [strideq*3] |
| sub [rstk+stack_offset+gprsize*2], strideq |
| mov strideq, [rstk+stack_offset+gprsize*3] |
| mov srcq, [rstk+stack_offset+gprsize*2] |
| %else |
| %define subpel0 m8 |
| %define subpel1 m9 |
| %define subpel2 m10 |
| %define subpel3 m11 |
| movd subpel0, [myq+0] |
| pshufb subpel0, m2 |
| movd subpel1, [myq+2] |
| pshufb subpel1, m2 |
| movd subpel2, [myq+4] |
| pshufb subpel2, m2 |
| movd subpel3, [myq+6] |
| pshufb subpel3, m2 |
| lea stride3q, [strideq*3] |
| sub srcq, stride3q |
| cmp wd, 8 |
| jg .v_w16 |
| je .v_w8 |
| %endif |
| .v_w4: |
| %if ARCH_X86_32 |
| %if STACK_ALIGNMENT < mmsize |
| %define srcm [rsp+mmsize*4+gprsize*1] |
| %define tmpm [rsp+mmsize*4+gprsize*2] |
| %endif |
| mov tmpm, tmpq |
| mov srcm, srcq |
| lea r5d, [wq - 4] ; horizontal loop |
| shl r5d, (16 - 2) ; (wq / 4) << 16 |
| mov r5w, hw |
| .v_w4_loop0: |
| %endif |
| movd m2, [srcq+strideq*0] ; 0 |
| movhps m2, [srcq+strideq*2] ; 0 _ 2 |
| movd m3, [srcq+strideq*1] ; 1 |
| %if ARCH_X86_32 |
| lea srcq, [srcq+strideq*2] |
| movhps m3, [srcq+strideq*1] ; 1 _ 3 |
| lea srcq, [srcq+strideq*2] |
| %else |
| movhps m3, [srcq+stride3q ] ; 1 _ 3 |
| lea srcq, [srcq+strideq*4] |
| %endif |
| pshufd m2, m2, q2020 ; 0 2 0 2 |
| pshufd m3, m3, q2020 ; 1 3 1 3 |
| punpckldq m2, m3 ; 0 1 2 3 |
| movd m3, [srcq+strideq*0] ; 4 |
| movd m1, [srcq+strideq*1] ; 5 |
| movd m0, [srcq+strideq*2] ; 6 |
| %if ARCH_X86_32 |
| lea srcq, [srcq+strideq*2] |
| add srcq, strideq |
| %else |
| add srcq, stride3q |
| %endif |
| punpckldq m3, m1 ; 4 5 _ _ |
| punpckldq m1, m0 ; 5 6 _ _ |
| palignr m4, m3, m2, 4 ; 1 2 3 4 |
| punpcklbw m3, m1 ; 45 56 |
| punpcklbw m1, m2, m4 ; 01 12 |
| punpckhbw m2, m4 ; 23 34 |
| .v_w4_loop: |
| pmaddubsw m5, m1, subpel0 ; a0 b0 |
| mova m1, m2 |
| pmaddubsw m2, subpel1 ; a1 b1 |
| paddw m5, m2 |
| mova m2, m3 |
| pmaddubsw m3, subpel2 ; a2 b2 |
| paddw m5, m3 |
| movd m4, [srcq+strideq*0] |
| punpckldq m3, m0, m4 ; 6 7 _ _ |
| movd m0, [srcq+strideq*1] |
| lea srcq, [srcq+strideq*2] |
| punpckldq m4, m0 ; 7 8 _ _ |
| punpcklbw m3, m4 ; 67 78 |
| pmaddubsw m4, m3, subpel3 ; a3 b3 |
| paddw m5, m4 |
| pmulhrsw m5, m7 |
| movq [tmpq+wq*0], m5 |
| movhps [tmpq+wq*2], m5 |
| lea tmpq, [tmpq+wq*4] |
| sub hd, 2 |
| jg .v_w4_loop |
| %if ARCH_X86_32 |
| mov hw, r5w ; reset vertical loop |
| mov tmpq, tmpm |
| mov srcq, srcm |
| add tmpq, 8 |
| add srcq, 4 |
| mov tmpm, tmpq |
| mov srcm, srcq |
| sub r5d, 1<<16 ; horizontal-- |
| jg .v_w4_loop0 |
| %endif |
| RET |
| |
| %if ARCH_X86_64 |
| .v_w8: |
| .v_w16: |
| lea r5d, [wq - 8] ; horizontal loop |
| mov r8, tmpq |
| mov r6, srcq |
| shl r5d, 8 - 3; (wq / 8) << 8 |
| mov r5b, hb |
| .v_w8_loop0: |
| movq m4, [srcq+strideq*0] ; 0 |
| movq m5, [srcq+strideq*1] ; 1 |
| lea srcq, [srcq+strideq*2] |
| movq m6, [srcq+strideq*0] ; 2 |
| movq m0, [srcq+strideq*1] ; 3 |
| lea srcq, [srcq+strideq*2] |
| movq m1, [srcq+strideq*0] ; 4 |
| movq m2, [srcq+strideq*1] ; 5 |
| lea srcq, [srcq+strideq*2] ; |
| movq m3, [srcq+strideq*0] ; 6 |
| shufpd m4, m0, 0x0c |
| shufpd m5, m1, 0x0c |
| punpcklbw m1, m4, m5 ; 01 |
| punpckhbw m4, m5 ; 34 |
| shufpd m6, m2, 0x0c |
| punpcklbw m2, m5, m6 ; 12 |
| punpckhbw m5, m6 ; 45 |
| shufpd m0, m3, 0x0c |
| punpcklbw m3, m6, m0 ; 23 |
| punpckhbw m6, m0 ; 56 |
| .v_w8_loop: |
| movq m12, [srcq+strideq*1] ; 8 |
| lea srcq, [srcq+strideq*2] |
| movq m13, [srcq+strideq*0] ; 9 |
| pmaddubsw m14, m1, subpel0 ; a0 |
| pmaddubsw m15, m2, subpel0 ; b0 |
| mova m1, m3 |
| mova m2, m4 |
| pmaddubsw m3, subpel1 ; a1 |
| pmaddubsw m4, subpel1 ; b1 |
| paddw m14, m3 |
| paddw m15, m4 |
| mova m3, m5 |
| mova m4, m6 |
| pmaddubsw m5, subpel2 ; a2 |
| pmaddubsw m6, subpel2 ; b2 |
| paddw m14, m5 |
| paddw m15, m6 |
| shufpd m6, m0, m12, 0x0d |
| shufpd m0, m12, m13, 0x0c |
| punpcklbw m5, m6, m0 ; 67 |
| punpckhbw m6, m0 ; 78 |
| pmaddubsw m12, m5, subpel3 ; a3 |
| pmaddubsw m13, m6, subpel3 ; b3 |
| paddw m14, m12 |
| paddw m15, m13 |
| pmulhrsw m14, m7 |
| pmulhrsw m15, m7 |
| movu [tmpq+wq*0], xm14 |
| movu [tmpq+wq*2], xm15 |
| lea tmpq, [tmpq+wq*4] |
| sub hd, 2 |
| jg .v_w8_loop |
| movzx hd, r5b ; reset vertical loop |
| add r8, 16 |
| add r6, 8 |
| mov tmpq, r8 |
| mov srcq, r6 |
| sub r5d, 1<<8 ; horizontal-- |
| jg .v_w8_loop0 |
| RET |
| %endif ;ARCH_X86_64 |
| %undef subpel0 |
| %undef subpel1 |
| %undef subpel2 |
| %undef subpel3 |
| |
| .hv: |
| %assign stack_offset org_stack_offset |
| cmp wd, 4 |
| jg .hv_w8 |
| and mxd, 0xff |
| movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] |
| %if ARCH_X86_32 |
| mov mxd, myd |
| and mxd, 0xff |
| shr myd, 16 |
| cmp hd, 4 |
| cmovle myd, mxd |
| movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] |
| mov r5, r2; use as new base |
| %define base_reg r5 |
| %assign regs_used 2 |
| ALLOC_STACK -mmsize*14 |
| %assign regs_used 7 |
| mov strideq, [rstk+stack_offset+gprsize*3] |
| lea strideq, [strideq*3 + 1] |
| sub [rstk+stack_offset+gprsize*2], strideq |
| mov strideq, [rstk+stack_offset+gprsize*3] |
| mov srcq, [rstk+stack_offset+gprsize*2] |
| %define subpelv0 [rsp+mmsize*0] |
| %define subpelv1 [rsp+mmsize*1] |
| %define subpelv2 [rsp+mmsize*2] |
| %define subpelv3 [rsp+mmsize*3] |
| punpcklbw m0, m0 |
| psraw m0, 8 ; sign-extend |
| pshufd m6, m0, q0000 |
| mova subpelv0, m6 |
| pshufd m6, m0, q1111 |
| mova subpelv1, m6 |
| pshufd m6, m0, q2222 |
| mova subpelv2, m6 |
| pshufd m6, m0, q3333 |
| mova subpelv3, m6 |
| %else |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 4 |
| cmovle myd, mxd |
| movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] |
| ALLOC_STACK mmsize*14, 14 |
| lea stride3q, [strideq*3] |
| sub srcq, stride3q |
| dec srcq |
| %define subpelv0 m10 |
| %define subpelv1 m11 |
| %define subpelv2 m12 |
| %define subpelv3 m13 |
| punpcklbw m0, m0 |
| psraw m0, 8 ; sign-extend |
| mova m8, [base+pw_8192] |
| mova m9, [base+pd_32] |
| pshufd m10, m0, q0000 |
| pshufd m11, m0, q1111 |
| pshufd m12, m0, q2222 |
| pshufd m13, m0, q3333 |
| %endif |
| pshufd m7, m1, q0000 |
| .hv_w4: |
| %define hv4_line_0_0 4 |
| %define hv4_line_0_1 5 |
| %define hv4_line_0_2 6 |
| %define hv4_line_0_3 7 |
| %define hv4_line_0_4 8 |
| %define hv4_line_0_5 9 |
| %define hv4_line_1_0 10 |
| %define hv4_line_1_1 11 |
| %define hv4_line_1_2 12 |
| %define hv4_line_1_3 13 |
| ; |
| ; |
| %if ARCH_X86_32 |
| %define w8192reg [base+pw_8192] |
| %define d32reg [base+pd_32] |
| %else |
| %define w8192reg m8 |
| %define d32reg m9 |
| %endif |
| ; lower shuffle 0 1 2 3 4 |
| mova m6, [base+subpel_h_shuf4] |
| movq m5, [srcq+strideq*0] ; 0 _ _ _ |
| movhps m5, [srcq+strideq*1] ; 0 _ 1 _ |
| movq m4, [srcq+strideq*2] ; 2 _ _ _ |
| %if ARCH_X86_32 |
| lea srcq, [srcq+strideq*2] |
| add srcq, strideq |
| movhps m4, [srcq+strideq*0] ; 2 _ 3 _ |
| add srcq, strideq |
| %else |
| movhps m4, [srcq+stride3q ] ; 2 _ 3 _ |
| lea srcq, [srcq+strideq*4] |
| %endif |
| pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ |
| pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ |
| pmaddubsw m2, m7 ;H subpel_filters |
| pmaddubsw m0, m7 ;H subpel_filters |
| phaddw m2, m0 ;H 0 1 2 3 |
| pmulhrsw m2, w8192reg ;H pw_8192 |
| SAVELINE_W4 m2, 2, 0 |
| ; upper shuffle 2 3 4 5 6 |
| mova m6, [base+subpel_h_shuf4+16] |
| pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ |
| pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ |
| pmaddubsw m2, m7 ;H subpel_filters |
| pmaddubsw m0, m7 ;H subpel_filters |
| phaddw m2, m0 ;H 0 1 2 3 |
| pmulhrsw m2, w8192reg ;H pw_8192 |
| ; |
| ; lower shuffle |
| mova m6, [base+subpel_h_shuf4] |
| movq m5, [srcq+strideq*0] ; 4 _ _ _ |
| movhps m5, [srcq+strideq*1] ; 4 _ 5 _ |
| movq m4, [srcq+strideq*2] ; 6 _ _ _ |
| pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ |
| pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ |
| pmaddubsw m3, m7 ;H subpel_filters |
| pmaddubsw m0, m7 ;H subpel_filters |
| phaddw m3, m0 ;H 4 5 6 7 |
| pmulhrsw m3, w8192reg ;H pw_8192 |
| SAVELINE_W4 m3, 3, 0 |
| ; upper shuffle |
| mova m6, [base+subpel_h_shuf4+16] |
| pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ |
| pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ |
| pmaddubsw m3, m7 ;H subpel_filters |
| pmaddubsw m0, m7 ;H subpel_filters |
| phaddw m3, m0 ;H 4 5 6 7 |
| pmulhrsw m3, w8192reg ;H pw_8192 |
| ; |
| %if ARCH_X86_32 |
| lea srcq, [srcq+strideq*2] |
| add srcq, strideq |
| %else |
| add srcq, stride3q |
| %endif |
| ;process high |
| palignr m4, m3, m2, 4;V 1 2 3 4 |
| punpcklwd m1, m2, m4 ; V 01 12 |
| punpckhwd m2, m4 ; V 23 34 |
| pshufd m0, m3, q2121;V 5 6 5 6 |
| punpcklwd m3, m0 ; V 45 56 |
| SAVELINE_W4 m0, 0, 1 |
| SAVELINE_W4 m1, 1, 1 |
| SAVELINE_W4 m2, 2, 1 |
| SAVELINE_W4 m3, 3, 1 |
| ;process low |
| RESTORELINE_W4 m2, 2, 0 |
| RESTORELINE_W4 m3, 3, 0 |
| palignr m4, m3, m2, 4;V 1 2 3 4 |
| punpcklwd m1, m2, m4 ; V 01 12 |
| punpckhwd m2, m4 ; V 23 34 |
| pshufd m0, m3, q2121;V 5 6 5 6 |
| punpcklwd m3, m0 ; V 45 56 |
| .hv_w4_loop: |
| ;process low |
| pmaddwd m5, m1, subpelv0 ; V a0 b0 |
| mova m1, m2 |
| pmaddwd m2, subpelv1; V a1 b1 |
| paddd m5, m2 |
| mova m2, m3 |
| pmaddwd m3, subpelv2; V a2 b2 |
| paddd m5, m3 |
| ; |
| mova m6, [base+subpel_h_shuf4] |
| movq m4, [srcq+strideq*0] ; 7 |
| movhps m4, [srcq+strideq*1] ; 7 _ 8 _ |
| pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ |
| pmaddubsw m4, m7 ;H subpel_filters |
| phaddw m4, m4 ;H 7 8 7 8 |
| pmulhrsw m4, w8192reg ;H pw_8192 |
| palignr m3, m4, m0, 12 ; 6 7 8 7 |
| mova m0, m4 |
| punpcklwd m3, m4 ; 67 78 |
| pmaddwd m4, m3, subpelv3; a3 b3 |
| paddd m5, d32reg ; pd_32 |
| paddd m5, m4 |
| psrad m5, 6 |
| SAVELINE_W4 m0, 0, 0 |
| SAVELINE_W4 m1, 1, 0 |
| SAVELINE_W4 m2, 2, 0 |
| SAVELINE_W4 m3, 3, 0 |
| SAVELINE_W4 m5, 5, 0 |
| ;process high |
| RESTORELINE_W4 m0, 0, 1 |
| RESTORELINE_W4 m1, 1, 1 |
| RESTORELINE_W4 m2, 2, 1 |
| RESTORELINE_W4 m3, 3, 1 |
| pmaddwd m5, m1, subpelv0; V a0 b0 |
| mova m1, m2 |
| pmaddwd m2, subpelv1; V a1 b1 |
| paddd m5, m2 |
| mova m2, m3 |
| pmaddwd m3, subpelv2; V a2 b2 |
| paddd m5, m3 |
| ; |
| mova m6, [base+subpel_h_shuf4+16] |
| movq m4, [srcq+strideq*0] ; 7 |
| movhps m4, [srcq+strideq*1] ; 7 _ 8 _ |
| pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ |
| pmaddubsw m4, m7 ;H subpel_filters |
| phaddw m4, m4 ;H 7 8 7 8 |
| pmulhrsw m4, w8192reg ;H pw_8192 |
| palignr m3, m4, m0, 12 ; 6 7 8 7 |
| mova m0, m4 |
| punpcklwd m3, m4 ; 67 78 |
| pmaddwd m4, m3, subpelv3; a3 b3 |
| paddd m5, d32reg ; pd_32 |
| paddd m5, m4 |
| psrad m4, m5, 6 |
| ; |
| RESTORELINE_W4 m5, 5, 0 |
| packssdw m5, m4 |
| pshufd m5, m5, q3120 |
| movu [tmpq], m5 |
| lea srcq, [srcq+strideq*2] |
| add tmpq, 16 |
| sub hd, 2 |
| SAVELINE_W4 m0, 0, 1 |
| SAVELINE_W4 m1, 1, 1 |
| SAVELINE_W4 m2, 2, 1 |
| SAVELINE_W4 m3, 3, 1 |
| RESTORELINE_W4 m0, 0, 0 |
| RESTORELINE_W4 m1, 1, 0 |
| RESTORELINE_W4 m2, 2, 0 |
| RESTORELINE_W4 m3, 3, 0 |
| jg .hv_w4_loop |
| RET |
| %undef subpelv0 |
| %undef subpelv1 |
| %undef subpelv2 |
| %undef subpelv3 |
| ; |
| |
| |
| .hv_w8: |
| %assign stack_offset org_stack_offset |
| %define hv8_line_1 0 |
| %define hv8_line_2 1 |
| %define hv8_line_3 2 |
| %define hv8_line_4 3 |
| %define hv8_line_6 4 |
| shr mxd, 16 |
| %if ARCH_X86_32 |
| %define base_reg r2 |
| %define subpelh0 [rsp+mmsize*5] |
| %define subpelh1 [rsp+mmsize*6] |
| %define subpelv0 [rsp+mmsize*7] |
| %define subpelv1 [rsp+mmsize*8] |
| %define subpelv2 [rsp+mmsize*9] |
| %define subpelv3 [rsp+mmsize*10] |
| %define accuv0 [rsp+mmsize*11] |
| %define accuv1 [rsp+mmsize*12] |
| movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3] |
| movzx mxd, myw |
| and mxd, 0xff |
| shr myd, 16 |
| cmp hd, 4 |
| cmovle myd, mxd |
| movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3] |
| ALLOC_STACK -mmsize*13 |
| %if STACK_ALIGNMENT < mmsize |
| mov rstk, r2m |
| %define tmpm [rsp+mmsize*13+gprsize*1] |
| %define srcm [rsp+mmsize*13+gprsize*2] |
| %define stridem [rsp+mmsize*13+gprsize*3] |
| mov stridem, rstk |
| %endif |
| mov r6, r2 |
| %define base_reg r6 |
| pshufd m0, m1, q0000 |
| pshufd m1, m1, q1111 |
| punpcklbw m5, m5 |
| psraw m5, 8 ; sign-extend |
| pshufd m2, m5, q0000 |
| pshufd m3, m5, q1111 |
| pshufd m4, m5, q2222 |
| pshufd m5, m5, q3333 |
| mova subpelh0, m0 |
| mova subpelh1, m1 |
| mova subpelv0, m2 |
| mova subpelv1, m3 |
| mova subpelv2, m4 |
| mova subpelv3, m5 |
| W32_RESTORE_SSQ |
| lea strided, [strided*3] |
| sub srcd, strided |
| sub srcd, 3 |
| mov srcm, srcd |
| W32_RESTORE_SSQ |
| %else |
| ALLOC_STACK mmsize*5, 16 |
| %define subpelh0 m10 |
| %define subpelh1 m11 |
| %define subpelv0 m12 |
| %define subpelv1 m13 |
| %define subpelv2 m14 |
| %define subpelv3 m15 |
| %define accuv0 m8 |
| %define accuv1 m9 |
| movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3] |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 4 |
| cmovle myd, mxd |
| movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3] |
| pshufd subpelh0, m0, q0000 |
| pshufd subpelh1, m0, q1111 |
| punpcklbw m1, m1 |
| psraw m1, 8 ; sign-extend |
| pshufd subpelv0, m1, q0000 |
| pshufd subpelv1, m1, q1111 |
| pshufd subpelv2, m1, q2222 |
| pshufd subpelv3, m1, q3333 |
| lea stride3q, [strideq*3] |
| sub srcq, 3 |
| sub srcq, stride3q |
| mov r6, srcq |
| %endif |
| lea r5d, [wq-4] |
| %if ARCH_X86_64 |
| mov r8, tmpq |
| %else |
| mov tmpm, tmpq |
| %endif |
| shl r5d, (16 - 2) |
| mov r5w, hw |
| .hv_w8_loop0: |
| movu m4, [srcq+strideq*0] ; 0 = _ _ |
| movu m5, [srcq+strideq*1] ; 1 = _ _ |
| lea srcq, [srcq+strideq*2] |
| %if ARCH_X86_64 |
| mova m7, [base+subpel_h_shufA] |
| mova m8, [base+subpel_h_shufB] |
| mova m9, [base+subpel_h_shufC] |
| %endif |
| HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ |
| HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ |
| movu m6, [srcq+strideq*0] ; 2 = _ _ |
| movu m0, [srcq+strideq*1] ; 3 = _ _ |
| lea srcq, [srcq+strideq*2] |
| HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ |
| HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ |
| ; |
| mova m7, [base+pw_8192] |
| pmulhrsw m4, m7 ; H pw_8192 |
| pmulhrsw m5, m7 ; H pw_8192 |
| pmulhrsw m6, m7 ; H pw_8192 |
| pmulhrsw m0, m7 ; H pw_8192 |
| punpcklwd m1, m4, m5 ; 0 1 ~ |
| punpcklwd m2, m5, m6 ; 1 2 ~ |
| punpcklwd m3, m6, m0 ; 2 3 ~ |
| SAVELINE_W8 1, m1 |
| SAVELINE_W8 2, m2 |
| SAVELINE_W8 3, m3 |
| ; |
| mova m7, [base+subpel_h_shufA] |
| movu m4, [srcq+strideq*0] ; 4 = _ _ |
| movu m5, [srcq+strideq*1] ; 5 = _ _ |
| lea srcq, [srcq+strideq*2] |
| movu m6, [srcq+strideq*0] ; 6 = _ _ |
| HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ |
| HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ |
| HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ |
| mova m7, [base+pw_8192] |
| pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ |
| pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ |
| pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ |
| punpcklwd m4, m0, m1 ; 3 4 ~ |
| punpcklwd m5, m1, m2 ; 4 5 ~ |
| punpcklwd m6, m2, m3 ; 5 6 ~ |
| ; |
| SAVELINE_W8 6, m3 |
| RESTORELINE_W8 1, m1 |
| RESTORELINE_W8 2, m2 |
| RESTORELINE_W8 3, m3 |
| .hv_w8_loop: |
| ; m8 accu for V a |
| ; m9 accu for V b |
| SAVELINE_W8 1, m3 |
| SAVELINE_W8 2, m4 |
| SAVELINE_W8 3, m5 |
| SAVELINE_W8 4, m6 |
| %if ARCH_X86_32 |
| pmaddwd m0, m1, subpelv0 ; a0 |
| pmaddwd m7, m2, subpelv0 ; b0 |
| pmaddwd m3, subpelv1 ; a1 |
| pmaddwd m4, subpelv1 ; b1 |
| paddd m0, m3 |
| paddd m7, m4 |
| pmaddwd m5, subpelv2 ; a2 |
| pmaddwd m6, subpelv2 ; b2 |
| paddd m0, m5 |
| paddd m7, m6 |
| mova m5, [base+pd_32] |
| paddd m0, m5 ; pd_32 |
| paddd m7, m5 ; pd_32 |
| mova accuv0, m0 |
| mova accuv1, m7 |
| %else |
| pmaddwd m8, m1, subpelv0 ; a0 |
| pmaddwd m9, m2, subpelv0 ; b0 |
| pmaddwd m3, subpelv1 ; a1 |
| pmaddwd m4, subpelv1 ; b1 |
| paddd m8, m3 |
| paddd m9, m4 |
| pmaddwd m5, subpelv2 ; a2 |
| pmaddwd m6, subpelv2 ; b2 |
| paddd m8, m5 |
| paddd m9, m6 |
| mova m7, [base+pd_32] |
| paddd m8, m7 ; pd_32 |
| paddd m9, m7 ; pd_32 |
| mova m7, [base+subpel_h_shufB] |
| mova m6, [base+subpel_h_shufC] |
| mova m5, [base+subpel_h_shufA] |
| %endif |
| movu m0, [srcq+strideq*1] ; 7 |
| movu m4, [srcq+strideq*2] ; 8 |
| lea srcq, [srcq+strideq*2] |
| HV_H_W8 m0, m1, m2, m3, m5, m7, m6 |
| HV_H_W8 m4, m1, m2, m3, m5, m7, m6 |
| mova m5, [base+pw_8192] |
| pmulhrsw m0, m5 ; H pw_8192 |
| pmulhrsw m4, m5 ; H pw_8192 |
| RESTORELINE_W8 6, m6 |
| punpcklwd m5, m6, m0 ; 6 7 ~ |
| punpcklwd m6, m0, m4 ; 7 8 ~ |
| pmaddwd m1, m5, subpelv3 ; a3 |
| paddd m2, m1, accuv0 |
| pmaddwd m1, m6, subpelv3 ; b3 |
| paddd m1, m1, accuv1 ; H + V |
| psrad m2, 6 |
| psrad m1, 6 |
| packssdw m2, m1 ; d -> w |
| movq [tmpq+wq*0], m2 |
| movhps [tmpq+wq*2], m2 |
| lea tmpq, [tmpq+wq*4] |
| sub hd, 2 |
| jle .hv_w8_outer |
| SAVELINE_W8 6, m4 |
| RESTORELINE_W8 1, m1 |
| RESTORELINE_W8 2, m2 |
| RESTORELINE_W8 3, m3 |
| RESTORELINE_W8 4, m4 |
| jmp .hv_w8_loop |
| .hv_w8_outer: |
| movzx hd, r5w |
| %if ARCH_X86_32 |
| add dword tmpm, 8 |
| mov tmpq, tmpm |
| mov srcq, srcm |
| add srcq, 4 |
| mov srcm, srcq |
| %else |
| add r8, 8 |
| mov tmpq, r8 |
| add r6, 4 |
| mov srcq, r6 |
| %endif |
| sub r5d, 1<<16 |
| jg .hv_w8_loop0 |
| RET |
| |
| %if ARCH_X86_32 |
| %macro SAVE_ALPHA_BETA 0 |
| mov alpham, alphad |
| mov betam, betad |
| %endmacro |
| |
| %macro SAVE_DELTA_GAMMA 0 |
| mov deltam, deltad |
| mov gammam, gammad |
| %endmacro |
| |
| %macro LOAD_ALPHA_BETA_MX 0 |
| mov mym, myd |
| mov alphad, alpham |
| mov betad, betam |
| mov mxd, mxm |
| %endmacro |
| |
| %macro LOAD_DELTA_GAMMA_MY 0 |
| mov mxm, mxd |
| mov deltad, deltam |
| mov gammad, gammam |
| mov myd, mym |
| %endmacro |
| |
| %define PIC_reg r2 |
| %define PIC_base_offset $$ |
| %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) |
| %else |
| %define SAVE_ALPHA_BETA |
| %define SAVE_DELTA_GAMMA |
| %define PIC_sym(sym) sym |
| %endif |
| |
| %if ARCH_X86_32 |
| %if STACK_ALIGNMENT < required_stack_alignment |
| %assign copy_args 8*4 |
| %else |
| %assign copy_args 0 |
| %endif |
| %endif |
| |
| %macro RELOC_ARGS 0 |
| %if copy_args |
| mov r0, r0m |
| mov r1, r1m |
| mov r2, r2m |
| mov r3, r3m |
| mov r5, r5m |
| mov dstm, r0 |
| mov dsm, r1 |
| mov srcm, r2 |
| mov ssm, r3 |
| mov mxm, r5 |
| mov r0, r6m |
| mov mym, r0 |
| %endif |
| %endmacro |
| |
| %macro BLENDHWDW 2 ; blend high words from dwords, src1, src2 |
| %if cpuflag(sse4) |
| pblendw %1, %2, 0xAA |
| %else |
| pand %2, m10 |
| por %1, %2 |
| %endif |
| %endmacro |
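| ; (BLENDHWDW keeps the low word of each dword of %1 and takes the high |
| ; word from %2: pblendw 0xAA selects the odd words from %2, and the SSE2 |
| ; fallback masks %2 with the 0xffff0000 blendmask in m10 and ORs, which |
| ; relies on %1 arriving with its high words already cleared by psrld 16) |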
| |
| %macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7 |
| ; Can be done using gathers, but that's terribly slow on many CPUs |
| %if ARCH_X86_32 |
| %define m8 m4 |
| %define m9 m5 |
| %define m14 m6 |
| %define m15 m7 |
| %define m11 m7 |
| %endif |
| %if notcpuflag(ssse3) || ARCH_X86_32 |
| pxor m11, m11 |
| %endif |
| lea tmp1d, [myq+deltaq*4] |
| lea tmp2d, [myq+deltaq*1] |
| shr myd, 10 |
| shr tmp1d, 10 |
| movq m2, [filterq+myq *8] ; a |
| movq m8, [filterq+tmp1q*8] ; e |
| lea tmp1d, [tmp2q+deltaq*4] |
| lea myd, [tmp2q+deltaq*1] |
| shr tmp2d, 10 |
| shr tmp1d, 10 |
| movq m3, [filterq+tmp2q*8] ; b |
| movq m0, [filterq+tmp1q*8] ; f |
| punpcklwd m2, m3 |
| punpcklwd m8, m0 |
| lea tmp1d, [myq+deltaq*4] |
| lea tmp2d, [myq+deltaq*1] |
| shr myd, 10 |
| shr tmp1d, 10 |
| movq m0, [filterq+myq *8] ; c |
| movq m9, [filterq+tmp1q*8] ; g |
| lea tmp1d, [tmp2q+deltaq*4] |
| lea myd, [tmp2q+gammaq] ; my += gamma |
| shr tmp2d, 10 |
| shr tmp1d, 10 |
| movq m3, [filterq+tmp2q*8] ; d |
| movq m1, [filterq+tmp1q*8] ; h |
| punpcklwd m0, m3 |
| punpcklwd m9, m1 |
| punpckldq m1, m2, m0 |
| punpckhdq m2, m0 |
| punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 |
| punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 |
| punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 |
| punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 |
| pmaddwd m0, %3 |
| pmaddwd m3, %5 |
| pmaddwd m1, %7 |
| pmaddwd m14, %9 |
| paddd m0, m3 |
| paddd m1, m14 |
| paddd m0, m1 |
| mova %1, m0 |
| %if ARCH_X86_64 |
| SWAP m3, m14 |
| %endif |
| punpckldq m0, m8, m9 |
| punpckhdq m8, m9 |
| punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8 |
| punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8 |
| punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8 |
| punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8 |
| pmaddwd m1, %4 |
| pmaddwd m14, %6 |
| pmaddwd m2, %8 |
| pmaddwd m15, %10 |
| paddd m1, m14 |
| paddd m2, m15 |
| paddd m1, m2 |
| mova %2, m1 |
| %if ARCH_X86_64 |
| SWAP m14, m3 |
| %endif |
| %endmacro |
| |
| %if ARCH_X86_64 |
| %define counterd r4d |
| %else |
| %if copy_args == 0 |
| %define counterd dword r4m |
| %else |
| %define counterd dword [esp+stack_size-4*7] |
| %endif |
| %endif |
| |
| %macro WARP_AFFINE_8X8T 0 |
| %if ARCH_X86_64 |
| cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts |
| %else |
| cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts |
| %if copy_args |
| %define tmpm [esp+stack_size-4*1] |
| %define tsm [esp+stack_size-4*2] |
| %endif |
| %endif |
| call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main |
| .loop: |
| %if ARCH_X86_32 |
| %define m12 m4 |
| %define m13 m5 |
| %define m14 m6 |
| %define m15 m7 |
| mova m12, [esp+0xC0] |
| mova m13, [esp+0xD0] |
| mova m14, [esp+0xE0] |
| mova m15, [esp+0xF0] |
| %endif |
| %if cpuflag(ssse3) |
| psrad m12, 13 |
| psrad m13, 13 |
| psrad m14, 13 |
| psrad m15, 13 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| mova m13, [PIC_sym(pw_8192)] |
| pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7 |
| pmulhrsw m14, m13 |
| %else |
| %if ARCH_X86_32 |
| %define m10 m0 |
| %endif |
| mova m10, [PIC_sym(pd_16384)] |
| paddd m12, m10 |
| paddd m13, m10 |
| paddd m14, m10 |
| paddd m15, m10 |
| psrad m12, 15 |
| psrad m13, 15 |
| psrad m14, 15 |
| psrad m15, 15 |
| packssdw m12, m13 |
| packssdw m14, m15 |
| %endif |
| mova [tmpq+tsq*0], m12 |
| mova [tmpq+tsq*2], m14 |
| dec counterd |
| jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end |
| %if ARCH_X86_32 |
| mov tmpm, tmpd |
| mov r0, [esp+0x100] |
| mov r1, [esp+0x104] |
| %endif |
| call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2 |
| lea tmpq, [tmpq+tsq*4] |
| jmp .loop |
| %endmacro |
| |
| %macro WARP_AFFINE_8X8 0 |
| %if ARCH_X86_64 |
| cglobal warp_affine_8x8, 6, 14, 16, 0x90, \ |
| dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ |
| filter, tmp1, delta, my, gamma |
| %else |
| cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \ |
| dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ |
| filter, tmp1, delta, my, gamma |
| %define alphaq r0 |
| %define alphad r0 |
| %define alpham [esp+gprsize+0x100] |
| %define betaq r1 |
| %define betad r1 |
| %define betam [esp+gprsize+0x104] |
| %define deltaq r0 |
| %define deltad r0 |
| %define deltam [esp+gprsize+0x108] |
| %define gammaq r1 |
| %define gammad r1 |
| %define gammam [esp+gprsize+0x10C] |
| %define filterq r3 |
| %define tmp1q r4 |
| %define tmp1d r4 |
| %define tmp1m [esp+gprsize+0x110] |
| %define myq r5 |
| %define myd r5 |
| %define mym r6m |
| %if copy_args |
| %define dstm [esp+stack_size-4*1] |
| %define dsm [esp+stack_size-4*2] |
| %define srcm [esp+stack_size-4*3] |
| %define ssm [esp+stack_size-4*4] |
| %define mxm [esp+stack_size-4*5] |
| %define mym [esp+stack_size-4*6] |
| %endif |
| %endif |
| call .main |
| jmp .start |
| .loop: |
| %if ARCH_X86_32 |
| mov dstm, dstd |
| mov alphad, [esp+0x100] |
| mov betad, [esp+0x104] |
| %endif |
| call .main2 |
| lea dstq, [dstq+dsq*2] |
| .start: |
| %if notcpuflag(sse4) |
| %if cpuflag(ssse3) |
| %define roundval pw_8192 |
| %else |
| %define roundval pd_262144 |
| %endif |
| %if ARCH_X86_64 |
| mova m10, [PIC_sym(roundval)] |
| %else |
| %define m10 [PIC_sym(roundval)] |
| %endif |
| %endif |
| %if ARCH_X86_32 |
| %define m12 m5 |
| %define m13 m6 |
| mova m12, [esp+0xC0] |
| mova m13, [esp+0xD0] |
| %endif |
| %if cpuflag(sse4) |
| %if ARCH_X86_32 |
| %define m11 m4 |
| pxor m11, m11 |
| %endif |
| psrad m12, 18 |
| psrad m13, 18 |
| packusdw m12, m13 |
| pavgw m12, m11 ; (x + (1 << 10)) >> 11 |
| %else |
| %if cpuflag(ssse3) |
| psrad m12, 17 |
| psrad m13, 17 |
| packssdw m12, m13 |
| pmulhrsw m12, m10 |
| %else |
| paddd m12, m10 |
| paddd m13, m10 |
| psrad m12, 19 |
| psrad m13, 19 |
| packssdw m12, m13 |
| %endif |
| %endif |
| %if ARCH_X86_32 |
| %define m14 m6 |
| %define m15 m7 |
| mova m14, [esp+0xE0] |
| mova m15, [esp+0xF0] |
| %endif |
| %if cpuflag(sse4) |
| psrad m14, 18 |
| psrad m15, 18 |
| packusdw m14, m15 |
| pavgw m14, m11 ; (x + (1 << 10)) >> 11 |
| %else |
| %if cpuflag(ssse3) |
| psrad m14, 17 |
| psrad m15, 17 |
| packssdw m14, m15 |
| pmulhrsw m14, m10 |
| %else |
| paddd m14, m10 |
| paddd m15, m10 |
| psrad m14, 19 |
| psrad m15, 19 |
| packssdw m14, m15 |
| %endif |
| %endif |
| packuswb m12, m14 |
| movq [dstq+dsq*0], m12 |
| movhps [dstq+dsq*1], m12 |
| dec counterd |
| jg .loop |
| .end: |
| RET |
| ALIGN function_align |
| .main: |
| %assign stack_offset stack_offset+gprsize |
| %if ARCH_X86_32 |
| %assign stack_size stack_size+4 |
| %if copy_args |
| %assign stack_offset stack_offset-4 |
| %endif |
| RELOC_ARGS |
| LEA PIC_reg, $$ |
| %define PIC_mem [esp+gprsize+0x114] |
| mov abcdd, abcdm |
| %if copy_args == 0 |
| mov ssd, ssm |
| mov mxd, mxm |
| %endif |
| mov PIC_mem, PIC_reg |
| mov srcd, srcm |
| %endif |
| movsx deltad, word [abcdq+2*2] |
| movsx gammad, word [abcdq+2*3] |
| lea tmp1d, [deltaq*3] |
| sub gammad, tmp1d ; gamma -= delta*3 |
| SAVE_DELTA_GAMMA |
| %if ARCH_X86_32 |
| mov abcdd, abcdm |
| %endif |
| movsx alphad, word [abcdq+2*0] |
| movsx betad, word [abcdq+2*1] |
| lea tmp1q, [ssq*3+3] |
| add mxd, 512+(64<<10) |
| lea tmp2d, [alphaq*3] |
| sub srcq, tmp1q ; src -= src_stride*3 + 3 |
| %if ARCH_X86_32 |
| mov srcm, srcd |
| mov PIC_reg, PIC_mem |
| %endif |
| sub betad, tmp2d ; beta -= alpha*3 |
| lea filterq, [PIC_sym(mc_warp_filter)] |
| %if ARCH_X86_64 |
| mov myd, r6m |
| %if cpuflag(ssse3) |
| pxor m11, m11 |
| %endif |
| %endif |
| call .h |
| psrld m2, m0, 16 |
| psrld m3, m1, 16 |
| %if ARCH_X86_32 |
| %if notcpuflag(ssse3) |
| mova [esp+gprsize+0x00], m2 |
| %endif |
| mova [esp+gprsize+0x10], m3 |
| %endif |
| call .h |
| psrld m4, m0, 16 |
| psrld m5, m1, 16 |
| %if ARCH_X86_32 |
| mova [esp+gprsize+0x20], m4 |
| mova [esp+gprsize+0x30], m5 |
| %endif |
| call .h |
| %if ARCH_X86_64 |
| %define blendmask [rsp+gprsize+0x80] |
| %else |
| %if notcpuflag(ssse3) |
| mova m2, [esp+gprsize+0x00] |
| %endif |
| mova m3, [esp+gprsize+0x10] |
| %define blendmask [esp+gprsize+0x120] |
| %define m10 m7 |
| %endif |
| pcmpeqd m10, m10 |
| pslld m10, 16 |
| mova blendmask, m10 |
| BLENDHWDW m2, m0 ; 0 |
| BLENDHWDW m3, m1 ; 2 |
| mova [rsp+gprsize+0x00], m2 |
| mova [rsp+gprsize+0x10], m3 |
| call .h |
| %if ARCH_X86_32 |
| mova m4, [esp+gprsize+0x20] |
| mova m5, [esp+gprsize+0x30] |
| %endif |
| mova m10, blendmask |
| BLENDHWDW m4, m0 ; 1 |
| BLENDHWDW m5, m1 ; 3 |
| mova [rsp+gprsize+0x20], m4 |
| mova [rsp+gprsize+0x30], m5 |
| call .h |
| %if ARCH_X86_32 |
| %if notcpuflag(ssse3) |
| mova m2, [esp+gprsize+0x00] |
| %endif |
| mova m3, [esp+gprsize+0x10] |
| %define m10 m5 |
| %endif |
| psrld m6, m2, 16 |
| psrld m7, m3, 16 |
| mova m10, blendmask |
| BLENDHWDW m6, m0 ; 2 |
| BLENDHWDW m7, m1 ; 4 |
| mova [rsp+gprsize+0x40], m6 |
| mova [rsp+gprsize+0x50], m7 |
| call .h |
| %if ARCH_X86_32 |
| mova m4, [esp+gprsize+0x20] |
| mova m5, [esp+gprsize+0x30] |
| %endif |
| psrld m2, m4, 16 |
| psrld m3, m5, 16 |
| mova m10, blendmask |
| BLENDHWDW m2, m0 ; 3 |
| BLENDHWDW m3, m1 ; 5 |
| mova [rsp+gprsize+0x60], m2 |
| mova [rsp+gprsize+0x70], m3 |
| call .h |
| %if ARCH_X86_32 |
| mova m6, [esp+gprsize+0x40] |
| mova m7, [esp+gprsize+0x50] |
| %define m10 m7 |
| %endif |
| psrld m4, m6, 16 |
| psrld m5, m7, 16 |
| mova m10, blendmask |
| BLENDHWDW m4, m0 ; 4 |
| BLENDHWDW m5, m1 ; 6 |
| %if ARCH_X86_64 |
| add myd, 512+(64<<10) |
| mova m6, m2 |
| mova m7, m3 |
| %else |
| mova [esp+gprsize+0x80], m4 |
| mova [esp+gprsize+0x90], m5 |
| add dword mym, 512+(64<<10) |
| %endif |
| mov counterd, 4 |
| SAVE_ALPHA_BETA |
| .main2: |
| call .h |
| %if ARCH_X86_32 |
| mova m6, [esp+gprsize+0x60] |
| mova m7, [esp+gprsize+0x70] |
| %define m10 m5 |
| %endif |
| psrld m6, 16 |
| psrld m7, 16 |
| mova m10, blendmask |
| BLENDHWDW m6, m0 ; 5 |
| BLENDHWDW m7, m1 ; 7 |
| %if ARCH_X86_64 |
| WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ |
| m4, m5, \ |
| [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ |
| m6, m7 |
| %else |
| mova [esp+gprsize+0xA0], m6 |
| mova [esp+gprsize+0xB0], m7 |
| LOAD_DELTA_GAMMA_MY |
| WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \ |
| [esp+gprsize+0x00], [esp+gprsize+0x10], \ |
| [esp+gprsize+0x80], [esp+gprsize+0x90], \ |
| [esp+gprsize+0x20], [esp+gprsize+0x30], \ |
| [esp+gprsize+0xA0], [esp+gprsize+0xB0] |
| LOAD_ALPHA_BETA_MX |
| %endif |
| call .h |
| mova m2, [rsp+gprsize+0x40] |
| mova m3, [rsp+gprsize+0x50] |
| %if ARCH_X86_32 |
| mova m4, [rsp+gprsize+0x80] |
| mova m5, [rsp+gprsize+0x90] |
| %define m10 m7 |
| %endif |
| mova [rsp+gprsize+0x00], m2 |
| mova [rsp+gprsize+0x10], m3 |
| mova [rsp+gprsize+0x40], m4 |
| mova [rsp+gprsize+0x50], m5 |
| psrld m4, 16 |
| psrld m5, 16 |
| mova m10, blendmask |
| BLENDHWDW m4, m0 ; 6 |
| BLENDHWDW m5, m1 ; 8 |
| %if ARCH_X86_64 |
| WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ |
| m6, m7, \ |
| [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ |
| m4, m5 |
| %else |
| mova [esp+gprsize+0x80], m4 |
| mova [esp+gprsize+0x90], m5 |
| LOAD_DELTA_GAMMA_MY |
| WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \ |
| [esp+gprsize+0x20], [esp+gprsize+0x30], \ |
| [esp+gprsize+0xA0], [esp+gprsize+0xB0], \ |
| [esp+gprsize+0x00], [esp+gprsize+0x10], \ |
| [esp+gprsize+0x80], [esp+gprsize+0x90] |
| mov mym, myd |
| mov dstd, dstm |
| mov dsd, dsm |
| mov mxd, mxm |
| %endif |
| mova m2, [rsp+gprsize+0x60] |
| mova m3, [rsp+gprsize+0x70] |
| %if ARCH_X86_32 |
| mova m6, [esp+gprsize+0xA0] |
| mova m7, [esp+gprsize+0xB0] |
| %endif |
| mova [rsp+gprsize+0x20], m2 |
| mova [rsp+gprsize+0x30], m3 |
| mova [rsp+gprsize+0x60], m6 |
| mova [rsp+gprsize+0x70], m7 |
| ret |
| ALIGN function_align |
| .h: |
| %if ARCH_X86_32 |
| %define m8 m3 |
| %define m9 m4 |
| %define m10 m5 |
| %define m14 m6 |
| %define m15 m7 |
| %endif |
| lea tmp1d, [mxq+alphaq*4] |
| lea tmp2d, [mxq+alphaq*1] |
| %if ARCH_X86_32 |
| %assign stack_offset stack_offset+4 |
| %assign stack_size stack_size+4 |
| %define PIC_mem [esp+gprsize*2+0x114] |
| mov PIC_mem, PIC_reg |
| mov srcd, srcm |
| %endif |
| movu m10, [srcq] |
| %if ARCH_X86_32 |
| add srcd, ssm |
| mov srcm, srcd |
| mov PIC_reg, PIC_mem |
| %else |
| add srcq, ssq |
| %endif |
| shr mxd, 10 |
| shr tmp1d, 10 |
| movq m1, [filterq+mxq *8] ; 0 X |
| movq m8, [filterq+tmp1q*8] ; 4 X |
| lea tmp1d, [tmp2q+alphaq*4] |
| lea mxd, [tmp2q+alphaq*1] |
| shr tmp2d, 10 |
| shr tmp1d, 10 |
| movhps m1, [filterq+tmp2q*8] ; 0 1 |
| movhps m8, [filterq+tmp1q*8] ; 4 5 |
| lea tmp1d, [mxq+alphaq*4] |
| lea tmp2d, [mxq+alphaq*1] |
| shr mxd, 10 |
| shr tmp1d, 10 |
| %if cpuflag(ssse3) |
| movq m14, [filterq+mxq *8] ; 2 X |
| movq m9, [filterq+tmp1q*8] ; 6 X |
| lea tmp1d, [tmp2q+alphaq*4] |
| lea mxd, [tmp2q+betaq] ; mx += beta |
| shr tmp2d, 10 |
| shr tmp1d, 10 |
| movhps m14, [filterq+tmp2q*8] ; 2 3 |
| movhps m9, [filterq+tmp1q*8] ; 6 7 |
| pshufb m0, m10, [PIC_sym(warp_8x8_shufA)] |
| pmaddubsw m0, m1 |
| pshufb m1, m10, [PIC_sym(warp_8x8_shufB)] |
| pmaddubsw m1, m8 |
| pshufb m15, m10, [PIC_sym(warp_8x8_shufC)] |
| pmaddubsw m15, m14 |
| pshufb m10, m10, [PIC_sym(warp_8x8_shufD)] |
| pmaddubsw m10, m9 |
| phaddw m0, m15 |
| phaddw m1, m10 |
| %else |
| %if ARCH_X86_32 |
| %define m11 m2 |
| %endif |
| pcmpeqw m0, m0 |
| psrlw m14, m0, 8 |
| psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15 |
| pand m14, m10 ; 00 02 04 06 08 10 12 14 |
| packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 |
| psrldq m9, m0, 4 |
| pshufd m0, m14, q0220 |
| pand m0, m9 |
| psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ |
| pslldq m15, m14, 12 |
| por m0, m15 ; shufA |
| psrlw m15, m0, 8 |
| psraw m11, m1, 8 |
| psllw m0, 8 |
| psllw m1, 8 |
| psrlw m0, 8 |
| psraw m1, 8 |
| pmullw m15, m11 |
| pmullw m0, m1 |
| paddw m0, m15 ; pmaddubsw m0, m1 |
| pshufd m15, m14, q0220 |
| pand m15, m9 |
| psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ |
| pslldq m1, m14, 12 |
| por m15, m1 ; shufC |
| pshufd m1, m14, q0220 |
| pand m1, m9 |
| psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ |
| pslldq m11, m14, 12 |
| por m1, m11 ; shufB |
| pshufd m10, m14, q0220 |
| pand m10, m9 |
| psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __ |
| pslldq m14, m14, 12 |
| por m10, m14 ; shufD |
| psrlw m9, m1, 8 |
| psraw m11, m8, 8 |
| psllw m1, 8 |
| psllw m8, 8 |
| psrlw m1, 8 |
| psraw m8, 8 |
| pmullw m9, m11 |
| pmullw m1, m8 |
| paddw m1, m9 ; pmaddubsw m1, m8 |
| movq m14, [filterq+mxq *8] ; 2 X |
| movq m9, [filterq+tmp1q*8] ; 6 X |
| lea tmp1d, [tmp2q+alphaq*4] |
| lea mxd, [tmp2q+betaq] ; mx += beta |
| shr tmp2d, 10 |
| shr tmp1d, 10 |
| movhps m14, [filterq+tmp2q*8] ; 2 3 |
| movhps m9, [filterq+tmp1q*8] ; 6 7 |
| psrlw m8, m15, 8 |
| psraw m11, m14, 8 |
| psllw m15, 8 |
| psllw m14, 8 |
| psrlw m15, 8 |
| psraw m14, 8 |
| pmullw m8, m11 |
| pmullw m15, m14 |
| paddw m15, m8 ; pmaddubsw m15, m14 |
| psrlw m8, m10, 8 |
| psraw m11, m9, 8 |
| psllw m10, 8 |
| psllw m9, 8 |
| psrlw m10, 8 |
| psraw m9, 8 |
| pmullw m8, m11 |
| pmullw m10, m9 |
| paddw m10, m8 ; pmaddubsw m10, m9 |
| pslld m8, m0, 16 |
| pslld m9, m1, 16 |
| pslld m14, m15, 16 |
| pslld m11, m10, 16 |
| paddw m0, m8 |
| paddw m1, m9 |
| paddw m15, m14 |
| paddw m10, m11 |
| psrad m0, 16 |
| psrad m1, 16 |
| psrad m15, 16 |
| psrad m10, 16 |
| packssdw m0, m15 ; phaddw m0, m15 |
| packssdw m1, m10 ; phaddw m1, m10 |
| %endif |
| mova m14, [PIC_sym(pw_8192)] |
| mova m9, [PIC_sym(pd_32768)] |
| pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 |
| pmaddwd m1, m14 |
| paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword |
| paddd m1, m9 |
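| ; (pmaddwd by pw_8192 sums each lo/hi word pair and scales it by 1<<13; |
| ; the pd_32768 bias supplies the rounding bit for the later use of the |
| ; upper 16 bits of each dword, which then effectively hold (lo+hi+4) >> 3) |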
| ret |
| %endmacro |
| |
| INIT_XMM sse4 |
| WARP_AFFINE_8X8 |
| WARP_AFFINE_8X8T |
| |
| INIT_XMM ssse3 |
| WARP_AFFINE_8X8 |
| WARP_AFFINE_8X8T |
| |
| INIT_XMM sse2 |
| WARP_AFFINE_8X8 |
| WARP_AFFINE_8X8T |
| |
| INIT_XMM ssse3 |
| |
| %if WIN64 |
| DECLARE_REG_TMP 6, 4 |
| %else |
| DECLARE_REG_TMP 6, 7 |
| %endif |
| |
| %macro BIDIR_FN 1 ; op |
| %1 0 |
| lea stride3q, [strideq*3] |
| jmp wq |
| .w4_loop: |
| %1_INC_PTR 2 |
| %1 0 |
| lea dstq, [dstq+strideq*4] |
| .w4: ; 4-px wide tiles |
| movd [dstq ], m0 ; store dw[0] |
| pshuflw m1, m0, q1032 ; bring dw[1] into the low dword |
| movd [dstq+strideq*1], m1 ; store dw[1] |
| punpckhqdq m0, m0 ; bring dw[3,2] into dw[1,0] |
| movd [dstq+strideq*2], m0 ; store dw[2] |
| psrlq m0, 32 ; shift dw[3] into the low dword |
| movd [dstq+stride3q ], m0 ; store dw[3] |
| sub hd, 4 |
| jg .w4_loop |
| RET |
| .w8_loop: |
| %1_INC_PTR 2 |
| %1 0 |
| lea dstq, [dstq+strideq*2] |
| .w8: |
| movq [dstq ], m0 |
| movhps [dstq+strideq*1], m0 |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| .w16_loop: |
| %1_INC_PTR 2 |
| %1 0 |
| lea dstq, [dstq+strideq] |
| .w16: |
| mova [dstq ], m0 |
| dec hd |
| jg .w16_loop |
| RET |
| .w32_loop: |
| %1_INC_PTR 4 |
| %1 0 |
| lea dstq, [dstq+strideq] |
| .w32: |
| mova [dstq ], m0 |
| %1 2 |
| mova [dstq + 16 ], m0 |
| dec hd |
| jg .w32_loop |
| RET |
| .w64_loop: |
| %1_INC_PTR 8 |
| %1 0 |
| add dstq, strideq |
| .w64: |
| %assign i 0 |
| %rep 4 |
| mova [dstq + i*16 ], m0 |
| %assign i i+1 |
| %if i < 4 |
| %1 2*i |
| %endif |
| %endrep |
| dec hd |
| jg .w64_loop |
| RET |
| .w128_loop: |
| %1_INC_PTR 16 |
| %1 0 |
| add dstq, strideq |
| .w128: |
| %assign i 0 |
| %rep 8 |
| mova [dstq + i*16 ], m0 |
| %assign i i+1 |
| %if i < 8 |
| %1 2*i |
| %endif |
| %endrep |
| dec hd |
| jg .w128_loop |
| RET |
| %endmacro |
| |
| %macro AVG 1 ; src_offset |
| ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel |
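| ; m2 holds pw_1024, so the pmulhrsw below computes |
| ; (x*1024*2 + 0x8000) >> 16 == (tmp1 + tmp2 + 16) >> 5 per lane, |
| ; which packuswb then clamps to u8 |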
| mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1 |
| paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2 |
| mova m1, [tmp1q+(%1+1)*mmsize] |
| paddw m1, [tmp2q+(%1+1)*mmsize] |
| pmulhrsw m0, m2 |
| pmulhrsw m1, m2 |
| packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit |
| %endmacro |
| |
| %macro AVG_INC_PTR 1 |
| add tmp1q, %1*mmsize |
| add tmp2q, %1*mmsize |
| %endmacro |
| |
| cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 |
| LEA r6, avg_ssse3_table |
| tzcnt wd, wm ; log2(w) via trailing-zero count |
| movifnidn hd, hm ; load h from the stack if it is not already in a register |
| movsxd wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this tile width |
| mova m2, [pw_1024+r6-avg_ssse3_table] ; rounding constant for pmulhrsw |
| add wq, r6 |
| BIDIR_FN AVG |
| |
| %macro W_AVG 1 ; src_offset |
| ; (a * weight + b * (16 - weight) + 128) >> 8 |
| ; = ((a - b) * weight + (b << 4) + 128) >> 8 |
| ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 |
| ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 |
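| ; worked check of the identity (arbitrary values a=4096, b=2048, weight=12): |
| ; exact: (4096*12 + 2048*4 + 128) >> 8 = 57472 >> 8 = 224 |
| ; pmulhw form: ((2048 * (-4 << 12)) >> 16) + 4096 + 8 = 3592, and 3592 >> 4 = 224 |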
| mova m2, [tmp1q+(%1+0)*mmsize] |
| mova m0, m2 |
| psubw m2, [tmp2q+(%1+0)*mmsize] |
| mova m3, [tmp1q+(%1+1)*mmsize] |
| mova m1, m3 |
| psubw m3, [tmp2q+(%1+1)*mmsize] |
| pmulhw m2, m4 |
| pmulhw m3, m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| pmulhrsw m0, m5 |
| pmulhrsw m1, m5 |
| packuswb m0, m1 |
| %endmacro |
| |
| %define W_AVG_INC_PTR AVG_INC_PTR |
| |
| cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 |
| LEA r6, w_avg_ssse3_table |
| tzcnt wd, wm |
| movd m4, r6m |
| movifnidn hd, hm |
| pxor m0, m0 |
| movsxd wq, dword [r6+wq*4] |
| mova m5, [pw_2048+r6-w_avg_ssse3_table] |
| pshufb m4, m0 |
| psllw m4, 12 ; (weight-16) << 12 when interpreted as signed |
| add wq, r6 |
| cmp dword r6m, 7 |
| jg .weight_gt7 |
| mov r6, tmp1q |
| psubw m0, m4 |
| mov tmp1q, tmp2q |
| mova m4, m0 ; -weight |
| mov tmp2q, r6 |
| .weight_gt7: |
| BIDIR_FN W_AVG |
| |
| %macro MASK 1 ; src_offset |
| ; (a * m + b * (64 - m) + 512) >> 10 |
| ; = ((a - b) * m + (b << 6) + 512) >> 10 |
| ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 |
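| ; worked check (arbitrary values a=4096, b=2048, m=40): |
| ; exact: (4096*40 + 2048*24 + 512) >> 10 = 213504 >> 10 = 208 |
| ; code path: ((b-a)<<1) * (-m<<9) = -4096 * -20480, >> 16 = 1280, |
| ; + 2048 + 8 = 3336, >> 4 = 208; splitting the <<10 as <<1 and <<9 keeps |
| ; both pmulhw operands within signed 16 bits |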
| mova m3, [maskq+(%1+0)*(mmsize/2)] |
| mova m0, [tmp2q+(%1+0)*mmsize] ; b |
| psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a |
| mova m6, m3 ; m |
| psubb m3, m4, m6 ; -m |
| paddw m1, m1 ; (b - a) << 1 |
| paddb m3, m3 ; -m << 1 |
| punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16) |
| pmulhw m1, m2 ; (-m * (b - a)) << 10 |
| paddw m0, m1 ; + b |
| mova m1, [tmp2q+(%1+1)*mmsize] ; b |
| psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a |
| paddw m2, m2 ; (b - a) << 1 |
| mova m6, m3 ; (-m << 1) |
| punpckhbw m3, m4, m6 ; (-m << 9) |
| pmulhw m2, m3 ; (-m * (b - a)) << 10 |
| paddw m1, m2 ; + b |
| pmulhrsw m0, m5 ; round |
| pmulhrsw m1, m5 ; round |
| packuswb m0, m1 ; interleave 16 -> 8 |
| %endmacro |
| |
| %macro MASK_INC_PTR 1 |
| add maskq, %1*mmsize/2 |
| add tmp1q, %1*mmsize |
| add tmp2q, %1*mmsize |
| %endmacro |
| |
| %if ARCH_X86_64 |
| cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3 |
| movifnidn hd, hm |
| %else |
| cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 |
| %define hd dword r5m |
| %endif |
| %define base r6-mask_ssse3_table |
| LEA r6, mask_ssse3_table |
| tzcnt wd, wm |
| movsxd wq, dword [r6+wq*4] |
| pxor m4, m4 |
| mova m5, [base+pw_2048] |
| add wq, r6 |
| mov maskq, r6m |
| BIDIR_FN MASK |
| %undef hd |
| |
| %macro W_MASK_420_B 2 ; src_offset in bytes, mask_out |
| ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] **** |
| mova m0, [tmp1q+(%1)] |
| mova m1, [tmp2q+(%1)] |
| mova m2, reg_pw_6903 |
| psubw m1, m0 |
| pabsw m%2, m1 ; abs(tmp1 - tmp2) |
| mova m3, m2 |
| psubusw m2, m%2 |
| psrlw m2, 8 ; 64 - m |
| mova m%2, m2 |
| psllw m2, 10 |
| pmulhw m1, m2 ; ((tmp2-tmp1) * ((64-m) << 10)) >> 16 |
| paddw m0, m1 ; tmp1 + ((tmp2-tmp1) * (64-m) >> 6) |
| ;**** do m1 = u16.dst[7..0], m%2 = u16.m[7..0] **** |
| mova m1, [tmp1q+(%1)+mmsize] |
| mova m2, [tmp2q+(%1)+mmsize] |
| psubw m2, m1 |
| pabsw m7, m2 ; abs(tmp1 - tmp2) |
| psubusw m3, m7 |
| psrlw m3, 8 ; 64 - m |
| phaddw m%2, m3 ; sum horizontal pairs of both (64-m) runs into one reg |
| psllw m3, 10 |
| pmulhw m2, m3 |
| %if ARCH_X86_32 |
| mova reg_pw_2048, [base+pw_2048] |
| %endif |
| paddw m1, m2 |
| pmulhrsw m0, reg_pw_2048 ; round/scale 2048 |
| pmulhrsw m1, reg_pw_2048 ; round/scale 2048 |
| packuswb m0, m1 ; concat m0 = u8.dst[15..0] |
| %endmacro |
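| ; (the 64-m step above folds offset and clamp into one constant: |
| ; (pw_6903 saturating-minus |tmp1-tmp2|) >> 8 equals |
| ; 26 - ((|tmp1-tmp2| + 8) >> 8) floored at 0, i.e. |
| ; 64 - min(38 + ((|tmp1-tmp2| + 8) >> 8), 64)) |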
| |
| %macro W_MASK_420 2 |
| W_MASK_420_B (%1*16), %2 |
| %endmacro |
| |
| %define base r6-w_mask_420_ssse3_table |
| %if ARCH_X86_64 |
| %define reg_pw_6903 m8 |
| %define reg_pw_2048 m9 |
| ; args: dst, stride, tmp1, tmp2, w, h, mask, sign |
| cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask |
| lea r6, [w_mask_420_ssse3_table] |
| mov wd, wm |
| tzcnt r7d, wd |
| movd m0, r7m ; sign |
| movifnidn hd, hm |
| movsxd r7, [r6+r7*4] |
| mova reg_pw_6903, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 |
| mova reg_pw_2048, [base+pw_2048] |
| movd m6, [base+pw_258] ; 64 * 4 + 2 |
| add r7, r6 |
| mov maskq, maskmp |
| psubw m6, m0 |
| pshuflw m6, m6, q0000 |
| punpcklqdq m6, m6 |
| W_MASK_420 0, 4 |
| jmp r7 |
| %define loop_w r7d |
| %else |
| %define reg_pw_6903 [base+pw_6903] |
| %define reg_pw_2048 m3 |
| cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask |
| tzcnt wd, wm |
| LEA r6, w_mask_420_ssse3_table |
| movd m0, r7m ; sign |
| mov maskq, r6mp |
| mov wd, [r6+wq*4] |
| movd m6, [base+pw_258] |
| add wq, r6 |
| psubw m6, m0 |
| pshuflw m6, m6, q0000 |
| punpcklqdq m6, m6 |
| W_MASK_420 0, 4 |
| jmp wd |
| %define loop_w dword r0m |
| %define hd dword r5m |
| %endif |
| .w4_loop: |
| add tmp1q, 2*16 |
| add tmp2q, 2*16 |
| W_MASK_420 0, 4 |
| lea dstq, [dstq+strideq*2] |
| add maskq, 4 |
| .w4: |
| movd [dstq ], m0 ; copy m0[0] |
| pshuflw m1, m0, q1032 |
| movd [dstq+strideq*1], m1 ; copy m0[1] |
| lea dstq, [dstq+strideq*2] |
| punpckhqdq m0, m0 |
| movd [dstq+strideq*0], m0 ; copy m0[2] |
| psrlq m0, 32 |
| movd [dstq+strideq*1], m0 ; copy m0[3] |
| psubw m1, m6, m4 ; a _ c _ |
| psrlq m4, 32 ; b _ d _ |
| psubw m1, m4 |
| psrlw m1, 2 |
| packuswb m1, m1 |
| pshuflw m1, m1, q2020 |
| movd [maskq], m1 |
| sub hd, 4 |
| jg .w4_loop |
| RET |
| .w8_loop: |
| add tmp1q, 2*16 |
| add tmp2q, 2*16 |
| W_MASK_420 0, 4 |
| lea dstq, [dstq+strideq*2] |
| add maskq, 4 |
| .w8: |
| movq [dstq ], m0 |
| movhps [dstq+strideq*1], m0 |
| psubw m0, m6, m4 |
| punpckhqdq m4, m4 |
| psubw m0, m4 |
| psrlw m0, 2 |
| packuswb m0, m0 |
| movd [maskq], m0 |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| .w16: ; w32/64/128 |
| %if ARCH_X86_32 |
| mov wd, wm ; because we altered it in 32bit setup |
| %endif |
| mov loop_w, wd ; use width as counter |
| jmp .w16ge_inner_loop_first |
| .w16ge_loop: |
| lea tmp1q, [tmp1q+wq*2] ; skip the second line (already read at offset wq*2) |
| lea tmp2q, [tmp2q+wq*2] ; skip the second line (already read at offset wq*2) |
| sub dstq, wq |
| mov loop_w, wd |
| lea dstq, [dstq+strideq*2] |
| .w16ge_inner_loop: |
| W_MASK_420_B 0, 4 |
| .w16ge_inner_loop_first: |
| mova [dstq ], m0 |
| W_MASK_420_B wq*2, 5 ; second (odd) line of the pair (offset = width * 2 bytes per coef) |
| mova [dstq+strideq*1], m0 |
| psubw m1, m6, m4 ; m6 == 64 * 4 + 2 - sign |
| psubw m1, m5 ; - odd line mask |
| psrlw m1, 2 ; >> 2 |
| packuswb m1, m1 |
| movq [maskq], m1 |
| add tmp1q, 2*16 |
| add tmp2q, 2*16 |
| add maskq, 8 |
| add dstq, 16 |
| sub loop_w, 16 |
| jg .w16ge_inner_loop |
| sub hd, 2 |
| jg .w16ge_loop |
| RET |
| |
| %undef reg_pw_6903 |
| %undef reg_pw_2048 |
| %undef dst_bak |
| %undef loop_w |
| %undef orig_w |
| %undef hd |
| |
| %macro BLEND_64M 4; a, b, mask1, mask2 |
| punpcklbw m0, %1, %2; {b;a}[7..0] |
| punpckhbw %1, %2 ; {b;a}[15..8] |
| pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16 |
| pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16 |
| pmulhrsw m0, m5 ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16 |
| pmulhrsw %1, m5 ; {(b*m[1] + (64-m[1])*a + 32) >> 6}[15..8] u16 |
| packuswb m0, %1 ; {blendpx}[15..0] u8 |
| %endmacro |
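| ; (pmulhrsw by pw_512 performs the final >> 6 with rounding: |
| ; (x*512*2 + 0x8000) >> 16 == (x + 32) >> 6, i.e. each blended pixel is |
| ; (a*(64-m) + b*m + 32) >> 6) |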
| |
| %macro BLEND 2; a, b |
| psubb m3, m4, m0 ; m3 = (64 - m) |
| punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] |
| punpckhbw m3, m0 ; {m;(64-m)}[15..8] |
| BLEND_64M %1, %2, m2, m3 |
| %endmacro |
| |
| cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask |
| %define base r6-blend_ssse3_table |
| LEA r6, blend_ssse3_table |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movifnidn maskq, maskmp |
| movsxd wq, dword [r6+wq*4] |
| mova m4, [base+pb_64] |
| mova m5, [base+pw_512] |
| add wq, r6 |
| lea r6, [dsq*3] |
| jmp wq |
| .w4: |
| movq m0, [maskq]; m |
| movd m1, [dstq+dsq*0] ; a |
| movd m6, [dstq+dsq*1] |
| punpckldq m1, m6 |
| movq m6, [tmpq] ; b |
| psubb m3, m4, m0 ; m3 = (64 - m) |
| punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] |
| punpcklbw m1, m6 ; {b;a}[7..0] |
| pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16 |
| pmulhrsw m1, m5 ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16 |
| packuswb m1, m0 ; {blendpx}[15..0] u8 |
| movd [dstq+dsq*0], m1 |
| psrlq m1, 32 |
| movd [dstq+dsq*1], m1 |
| add maskq, 8 |
| add tmpq, 8 |
| lea dstq, [dstq+dsq*2] ; dst_stride * 2 |
| sub hd, 2 |
| jg .w4 |
| RET |
| .w8: |
| mova m0, [maskq]; m |
| movq m1, [dstq+dsq*0] ; a |
| movhps m1, [dstq+dsq*1] |
| mova m6, [tmpq] ; b |
| BLEND m1, m6 |
| movq [dstq+dsq*0], m0 |
| movhps [dstq+dsq*1], m0 |
| add maskq, 16 |
| add tmpq, 16 |
| lea dstq, [dstq+dsq*2] ; dst_stride * 2 |
| sub hd, 2 |
| jg .w8 |
| RET |
| .w16: |
| mova m0, [maskq]; m |
| mova m1, [dstq] ; a |
| mova m6, [tmpq] ; b |
| BLEND m1, m6 |
| mova [dstq], m0 |
| add maskq, 16 |
| add tmpq, 16 |
| add dstq, dsq ; dst_stride |
| dec hd |
| jg .w16 |
| RET |
| .w32: |
| %assign i 0 |
| %rep 2 |
| mova m0, [maskq+16*i]; m |
| mova m1, [dstq+16*i] ; a |
| mova m6, [tmpq+16*i] ; b |
| BLEND m1, m6 |
| mova [dstq+i*16], m0 |
| %assign i i+1 |
| %endrep |
| add maskq, 32 |
| add tmpq, 32 |
| add dstq, dsq ; dst_stride |
| dec hd |
| jg .w32 |
| RET |
| |
| cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask |
| %define base r5-blend_v_ssse3_table |
| LEA r5, blend_v_ssse3_table |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, dword [r5+wq*4] |
| mova m5, [base+pw_512] |
| add wq, r5 |
| add maskq, obmc_masks-blend_v_ssse3_table |
| jmp wq |
| .w2: |
| movd m3, [maskq+4] |
| punpckldq m3, m3 |
| ; the 2-px (m, 64-m) mask pair covers 4 pixels: 2 px x 2 rows per iteration |
| .w2_loop: |
| movd m1, [dstq+dsq*0] ; a {..;a;a} |
| pinsrw m1, [dstq+dsq*1], 1 |
| movd m2, [tmpq] ; b |
| punpcklbw m0, m1, m2; {b;a}[7..0] |
| pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16 |
| pmulhrsw m0, m5 ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16 |
| packuswb m0, m1 ; {blendpx}[7..0] u8 |
| movd r3d, m0 |
| mov [dstq+dsq*0], r3w |
| shr r3d, 16 |
| mov [dstq+dsq*1], r3w |
| add tmpq, 2*2 |
| lea dstq, [dstq + dsq * 2] |
| sub hd, 2 |
| jg .w2_loop |
| RET |
| .w4: |
| movddup m3, [maskq+8] |
| ; the 4-px mask covers 8 pixels: 4 px x 2 rows per iteration |
| .w4_loop: |
| movd m1, [dstq+dsq*0] ; a |
| movd m2, [dstq+dsq*1] ; |
| punpckldq m1, m2 |
| movq m2, [tmpq] ; b |
| punpcklbw m1, m2 ; {b;a}[7..0] |
| pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16 |
| pmulhrsw m1, m5 ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16 |
| packuswb m1, m1 ; {blendpx}[7..0] u8 |
| movd [dstq], m1 |
| psrlq m1, 32 |
| movd [dstq+dsq*1], m1 |
| add tmpq, 2*4 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .w4_loop |
| RET |
| .w8: |
| mova m3, [maskq+16] |
| ; the 8-px mask covers 16 pixels: 8 px x 2 rows per iteration |
| .w8_loop: |
| movq m1, [dstq+dsq*0] ; a |
| movhps m1, [dstq+dsq*1] |
| mova m2, [tmpq]; b |
| BLEND_64M m1, m2, m3, m3 |
| movq [dstq+dsq*0], m0 |
| movhps [dstq+dsq*1], m0 |
| add tmpq, 16 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| .w16: |
| ; the 16-px mask (32 interleaved bytes) covers one 16-px row per iteration |
| mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0]) |
| mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1]) |
| .w16_loop: |
| mova m1, [dstq] ; a |
| mova m2, [tmpq] ; b |
| BLEND_64M m1, m2, m3, m4 |
| mova [dstq], m0 |
| add tmpq, 16 |
| add dstq, dsq |
| dec hd |
| jg .w16_loop |
| RET |
| .w32: |
| %if WIN64 |
| mova [rsp+8], xmm6 |
| %endif |
| mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0]) |
| mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1]) |
| mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2]) |
| ; only 24 of the 32 columns are blended below (16 via BLEND_64M plus 8 via |
| ; the movq path); the remaining 8 columns are left untouched |
| .w32_loop: |
| mova m1, [dstq+16*0] ; a |
| mova m2, [tmpq+16*0] ; b |
| BLEND_64M m1, m2, m3, m4 |
| movq m1, [dstq+16*1] ; a |
| punpcklbw m1, [tmpq+16*1] ; b |
| pmaddubsw m1, m6 |
| pmulhrsw m1, m5 |
| packuswb m1, m1 |
| mova [dstq+16*0], m0 |
| movq [dstq+16*1], m1 |
| add tmpq, 32 |
| add dstq, dsq |
| dec hd |
| jg .w32_loop |
| %if WIN64 |
| mova xmm6, [rsp+8] |
| %endif |
| RET |
| |
| cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask |
| %define base t0-blend_h_ssse3_table |
| %if ARCH_X86_32 |
| ; We need to keep the PIC pointer for w4, reload wd from stack instead |
| DECLARE_REG_TMP 6 |
| %else |
| DECLARE_REG_TMP 5 |
| mov r6d, wd |
| %endif |
| LEA t0, blend_h_ssse3_table |
| tzcnt wd, wm |
| mov hd, hm |
| movsxd wq, dword [t0+wq*4] |
| mova m5, [base+pw_512] |
| add wq, t0 |
| lea maskq, [base+obmc_masks+hq*2] |
| lea hd, [hq*3] |
| shr hd, 2 ; h * 3/4 |
| lea maskq, [maskq+hq*2] |
| neg hq |
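| ; (maskq now points h*3/4 entries past obmc_masks + h*2 and hq runs from |
| ; -(h*3/4) up to 0, so [maskq+hq*2] walks exactly the h*3/4 rows that |
| ; blend_h actually blends; the loop stops before the remaining rows) |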
| jmp wq |
| .w2: |
| movd m0, [dstq+dsq*0] |
| pinsrw m0, [dstq+dsq*1], 1 |
| movd m2, [maskq+hq*2] |
| movd m1, [tmpq] |
| punpcklwd m2, m2 |
| punpcklbw m0, m1 |
| pmaddubsw m0, m2 |
| pmulhrsw m0, m5 |
| packuswb m0, m0 |
| movd r3d, m0 |
| mov [dstq+dsq*0], r3w |
| shr r3d, 16 |
| mov [dstq+dsq*1], r3w |
| lea dstq, [dstq+dsq*2] |
| add tmpq, 2*2 |
| add hq, 2 |
| jl .w2 |
| RET |
| .w4: |
| %if ARCH_X86_32 |
| mova m3, [base+blend_shuf] |
| %else |
| mova m3, [blend_shuf] |
| %endif |
| .w4_loop: |
| movd m0, [dstq+dsq*0] |
| movd m2, [dstq+dsq*1] |
| punpckldq m0, m2 ; a |
| movq m1, [tmpq] ; b |
| movq m2, [maskq+hq*2] ; m |
| pshufb m2, m3 |
| punpcklbw m0, m1 |
| pmaddubsw m0, m2 |
| pmulhrsw m0, m5 |
| packuswb m0, m0 |
| movd [dstq+dsq*0], m0 |
| psrlq m0, 32 |
| movd [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| add tmpq, 4*2 |
| add hq, 2 |
| jl .w4_loop |
| RET |
| .w8: |
| movd m4, [maskq+hq*2] |
| punpcklwd m4, m4 |
| pshufd m3, m4, q0000 |
| pshufd m4, m4, q1111 |
| movq m1, [dstq+dsq*0] ; a |
| movhps m1, [dstq+dsq*1] |
| mova m2, [tmpq] |
| BLEND_64M m1, m2, m3, m4 |
| movq [dstq+dsq*0], m0 |
| movhps [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| add tmpq, 8*2 |
| add hq, 2 |
| jl .w8 |
| RET |
| ; w16/w32/w64/w128 |
| .w16: |
| %if ARCH_X86_32 |
| mov r6d, wm |
| %endif |
| sub dsq, r6 |
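; the inner loop advances dstq by 16 bytes per chunk (r6 bytes per row in
; total), so the stride is pre-reduced by the row width to land on the next row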
| .w16_loop0: |
| movd m3, [maskq+hq*2] |
| pshuflw m3, m3, q0000 |
punpcklqdq m3, m3 ; broadcast this row's (64-m, m) pair across the register
| mov wd, r6d |
| .w16_loop: |
| mova m1, [dstq] ; a |
| mova m2, [tmpq] ; b |
| BLEND_64M m1, m2, m3, m3 |
| mova [dstq], m0 |
| add dstq, 16 |
| add tmpq, 16 |
| sub wd, 16 |
| jg .w16_loop |
| add dstq, dsq |
| inc hq |
| jl .w16_loop0 |
| RET |
| |
| ; emu_edge args: |
| ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, |
| ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, |
| ; const pixel *ref, const ptrdiff_t ref_stride |
| ; |
; bw, bh: total size of the block to fill
; iw, ih: size of the valid source area; anything beyond it is filled on the bottom/right
; x, y:   position of the block within the source; negative offsets are filled on the top/left
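;
; the block is filled in three passes:
;   1. v_loop: copy the valid rows, splatting the edge pixel into the
;      left/right extensions
;   2. replicate the last valid row downwards into the bottom extension
;   3. replicate the first valid row upwards into the top extension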
| cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \ |
| y, dst, dstride, src, sstride, \ |
| bottomext, rightext, blk |
; we assume that the buffer (stride) is larger than the width, so we can
; safely overwrite a few bytes past the end of each row
| pxor m1, m1 |
| |
| %if ARCH_X86_64 |
| %define reg_zero r12q |
| %define reg_tmp r10 |
| %define reg_src srcq |
| %define reg_bottomext bottomextq |
| %define reg_rightext rightextq |
| %define reg_blkm r9m |
| %else |
| %define reg_zero r6 |
| %define reg_tmp r0 |
| %define reg_src r1 |
| %define reg_bottomext r0 |
| %define reg_rightext r1 |
| %define reg_blkm r2m |
| %endif |
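; on x86_32 there are not enough GPRs, so several values are kept in the
; stack argument slots (r*m) and r0/r1 double as scratch registers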
| ; |
| ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) |
| xor reg_zero, reg_zero |
| lea reg_tmp, [ihq-1] |
| cmp yq, ihq |
| cmovl reg_tmp, yq |
| test yq, yq |
| cmovl reg_tmp, reg_zero |
| %if ARCH_X86_64 |
| imul reg_tmp, sstrideq |
| add srcq, reg_tmp |
| %else |
| imul reg_tmp, sstridem |
| mov reg_src, srcm |
| add reg_src, reg_tmp |
| %endif |
| ; |
| ; ref += iclip(x, 0, iw - 1) |
| lea reg_tmp, [iwq-1] |
| cmp xq, iwq |
| cmovl reg_tmp, xq |
| test xq, xq |
| cmovl reg_tmp, reg_zero |
| add reg_src, reg_tmp |
| %if ARCH_X86_32 |
| mov srcm, reg_src |
| %endif |
| ; |
| ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) |
| %if ARCH_X86_32 |
| mov r1, r1m ; restore bh |
| %endif |
| lea reg_bottomext, [yq+bhq] |
| sub reg_bottomext, ihq |
| lea r3, [bhq-1] |
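; (lea does not modify flags, so the cmovl below still tests the sub above)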
| cmovl reg_bottomext, reg_zero |
| ; |
| |
| DEFINE_ARGS bw, bh, iw, ih, x, \ |
| topext, dst, dstride, src, sstride, \ |
| bottomext, rightext, blk |
| |
| ; top_ext = iclip(-y, 0, bh - 1) |
| neg topextq |
| cmovl topextq, reg_zero |
| cmp reg_bottomext, bhq |
| cmovge reg_bottomext, r3 |
| cmp topextq, bhq |
| cmovg topextq, r3 |
| %if ARCH_X86_32 |
mov r4m, reg_bottomext ; spill bottom_ext
| ; |
| ; right_ext = iclip(x + bw - iw, 0, bw - 1) |
| mov r0, r0m ; restore bw |
| %endif |
| lea reg_rightext, [xq+bwq] |
| sub reg_rightext, iwq |
| lea r2, [bwq-1] |
| cmovl reg_rightext, reg_zero |
| |
| DEFINE_ARGS bw, bh, iw, ih, leftext, \ |
| topext, dst, dstride, src, sstride, \ |
| bottomext, rightext, blk |
| |
| ; left_ext = iclip(-x, 0, bw - 1) |
| neg leftextq |
| cmovl leftextq, reg_zero |
| cmp reg_rightext, bwq |
| cmovge reg_rightext, r2 |
| %if ARCH_X86_32 |
mov r3m, r1 ; spill right_ext
| %endif |
| cmp leftextq, bwq |
| cmovge leftextq, r2 |
| |
| %undef reg_zero |
| %undef reg_tmp |
| %undef reg_src |
| %undef reg_bottomext |
| %undef reg_rightext |
| |
| DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ |
| topext, dst, dstride, src, sstride, \ |
| bottomext, rightext, blk |
| |
| ; center_h = bh - top_ext - bottom_ext |
| %if ARCH_X86_64 |
| lea r3, [bottomextq+topextq] |
| sub centerhq, r3 |
| %else |
| mov r1, centerhm ; restore r1 |
| sub centerhq, topextq |
| sub centerhq, r4m |
| mov r1m, centerhq |
| %endif |
| ; |
| ; blk += top_ext * PXSTRIDE(dst_stride) |
| mov r2, topextq |
| %if ARCH_X86_64 |
| imul r2, dstrideq |
| %else |
| mov r6, r6m ; restore dstq |
| imul r2, dstridem |
| %endif |
| add dstq, r2 |
mov reg_blkm, dstq ; save pointer to the first valid output row (used by the top extension)
| ; |
| ; center_w = bw - left_ext - right_ext |
| mov centerwq, bwq |
| %if ARCH_X86_64 |
| lea r3, [rightextq+leftextq] |
| sub centerwq, r3 |
| %else |
| sub centerwq, r3m |
| sub centerwq, leftextq |
| %endif |
| |
; v_loop macro: loop over the valid rows, copying each one and optionally
; extending its left/right edges
| %macro v_loop 3 ; need_left_ext, need_right_ext, suffix |
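; %1: splat the leftmost source pixel into the left extension
; %2: splat the rightmost source pixel into the right extension
; %3: label suffix to keep the instantiations distinct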
| %if ARCH_X86_64 |
| %define reg_tmp r12 |
| %else |
| %define reg_tmp r0 |
| %endif |
| .v_loop_%3: |
| %if ARCH_X86_32 |
| mov r0, r0m |
| mov r1, r1m |
| %endif |
| %if %1 |
| test leftextq, leftextq |
| jz .body_%3 |
| ; left extension |
| %if ARCH_X86_64 |
| movd m0, [srcq] |
| %else |
| mov r3, srcm |
| movd m0, [r3] |
| %endif |
pshufb m0, m1 ; m1 is zero, so this broadcasts the leftmost pixel to all bytes
| xor r3, r3 |
| .left_loop_%3: |
| mova [dstq+r3], m0 |
| add r3, mmsize |
| cmp r3, leftextq |
| jl .left_loop_%3 |
| ; body |
| .body_%3: |
| lea reg_tmp, [dstq+leftextq] |
| %endif |
| xor r3, r3 |
| .body_loop_%3: |
| %if ARCH_X86_64 |
| movu m0, [srcq+r3] |
| %else |
| mov r1, srcm |
| movu m0, [r1+r3] |
| %endif |
| %if %1 |
| movu [reg_tmp+r3], m0 |
| %else |
| movu [dstq+r3], m0 |
| %endif |
| add r3, mmsize |
| cmp r3, centerwq |
| jl .body_loop_%3 |
| %if %2 |
| ; right extension |
| %if ARCH_X86_64 |
| test rightextq, rightextq |
| %else |
| mov r1, r3m |
| test r1, r1 |
| %endif |
| jz .body_loop_end_%3 |
| %if %1 |
| add reg_tmp, centerwq |
| %else |
| lea reg_tmp, [dstq+centerwq] |
| %endif |
| %if ARCH_X86_64 |
| movd m0, [srcq+centerwq-1] |
| %else |
| mov r3, srcm |
| movd m0, [r3+centerwq-1] |
| %endif |
pshufb m0, m1 ; broadcast the rightmost valid pixel to all bytes
| xor r3, r3 |
| .right_loop_%3: |
| movu [reg_tmp+r3], m0 |
| add r3, mmsize |
| %if ARCH_X86_64 |
| cmp r3, rightextq |
| %else |
| cmp r3, r3m |
| %endif |
| jl .right_loop_%3 |
| .body_loop_end_%3: |
| %endif |
| %if ARCH_X86_64 |
| add dstq, dstrideq |
| add srcq, sstrideq |
| dec centerhq |
| jg .v_loop_%3 |
| %else |
| add dstq, dstridem |
| mov r0, sstridem |
| add srcm, r0 |
| sub dword centerhm, 1 |
| jg .v_loop_%3 |
| mov r0, r0m ; restore r0 |
| %endif |
%endmacro ; v_loop
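; the four instantiations below cover every combination of left/right extension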
| |
| test leftextq, leftextq |
| jnz .need_left_ext |
| %if ARCH_X86_64 |
| test rightextq, rightextq |
| jnz .need_right_ext |
| %else |
cmp leftextq, r3m ; leftextq is 0 here, so this checks whether right_ext (r3m) is nonzero
| jne .need_right_ext |
| %endif |
| v_loop 0, 0, 0 |
| jmp .body_done |
| |
; left extension needed; check whether the right edge needs extending too
| .need_left_ext: |
| %if ARCH_X86_64 |
| test rightextq, rightextq |
| %else |
| mov r3, r3m |
| test r3, r3 |
| %endif |
| jnz .need_left_right_ext |
| v_loop 1, 0, 1 |
| jmp .body_done |
| |
| .need_left_right_ext: |
| v_loop 1, 1, 2 |
| jmp .body_done |
| |
| .need_right_ext: |
| v_loop 0, 1, 3 |
| |
| .body_done: |
; register usage for the bottom/top edge extension below:
;   r0 - bw
;   r1 - x loop counter
;   r4 - y loop counter
;   r5 - topextq
;   r6 - dstq
;   r7 - dstrideq
;   r8 - srcq
| %if ARCH_X86_64 |
| %define reg_dstride dstrideq |
| %else |
| %define reg_dstride r2 |
| %endif |
| ; |
| ; bottom edge extension |
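; the last valid row (one stride above the current dstq) is copied into every
; bottom-extension row, one 16-byte column at a time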
| %if ARCH_X86_64 |
| test bottomextq, bottomextq |
| jz .top |
| %else |
| xor r1, r1 |
cmp r1, r4m ; bottom_ext (spilled in r4m) == 0?
| je .top |
| %endif |
| ; |
| %if ARCH_X86_64 |
| mov srcq, dstq |
| sub srcq, dstrideq |
| xor r1, r1 |
| %else |
| mov r3, dstq |
| mov reg_dstride, dstridem |
| sub r3, reg_dstride |
| mov srcm, r3 |
| %endif |
| ; |
| .bottom_x_loop: |
| %if ARCH_X86_64 |
| mova m0, [srcq+r1] |
| lea r3, [dstq+r1] |
| mov r4, bottomextq |
| %else |
| mov r3, srcm |
| mova m0, [r3+r1] |
| lea r3, [dstq+r1] |
| mov r4, r4m |
| %endif |
| ; |
| .bottom_y_loop: |
| mova [r3], m0 |
| add r3, reg_dstride |
| dec r4 |
| jg .bottom_y_loop |
| add r1, mmsize |
| cmp r1, bwq |
| jl .bottom_x_loop |
| |
| .top: |
| ; top edge extension |
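; reg_blkm still holds the pointer saved earlier (the first valid output row);
; that row is copied into every row above it, starting from the block top (dstm)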
| test topextq, topextq |
| jz .end |
| %if ARCH_X86_64 |
| mov srcq, reg_blkm |
| %else |
| mov r3, reg_blkm |
| mov reg_dstride, dstridem |
| %endif |
| mov dstq, dstm |
| xor r1, r1 |
| ; |
| .top_x_loop: |
| %if ARCH_X86_64 |
| mova m0, [srcq+r1] |
| %else |
| mov r3, reg_blkm |
| mova m0, [r3+r1] |
| %endif |
| lea r3, [dstq+r1] |
| mov r4, topextq |
| ; |
| .top_y_loop: |
| mova [r3], m0 |
| add r3, reg_dstride |
| dec r4 |
| jg .top_y_loop |
| add r1, mmsize |
| cmp r1, bwq |
| jl .top_x_loop |
| |
| .end: |
| RET |
| |
| %undef reg_dstride |
| %undef reg_blkm |
| %undef reg_tmp |