| ; Copyright © 2021, VideoLAN and dav1d authors |
| ; Copyright © 2021, Two Orioles, LLC |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| SECTION_RODATA |
| |
| ; dav1d_obmc_masks[] << 9 |
| obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 |
| dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0 |
| dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120 |
| dw 4096, 3072, 2048, 1536, 0, 0, 0, 0 |
| dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240 |
| dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608 |
| dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024 |
| |
| blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 |
| spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 |
| spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 |
| spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 |
| spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 |
| spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 |
| unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 |
| rescale_mul: dd 0, 1, 2, 3 |
| resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 |
| db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 |
| bdct_lb_q: times 8 db 0 |
| times 8 db 4 |
| times 8 db 8 |
| times 8 db 12 |
| |
| pw_2: times 8 dw 2 |
| pw_16: times 4 dw 16 |
| prep_mul: times 4 dw 16 |
| times 8 dw 4 |
| pw_64: times 8 dw 64 |
| pw_256: times 8 dw 256 |
| pw_2048: times 4 dw 2048 |
| bidir_mul: times 4 dw 2048 |
| pw_8192: times 8 dw 8192 |
| pw_27615: times 8 dw 27615 |
| pw_32766: times 8 dw 32766 |
| pw_m512: times 8 dw -512 |
| pd_63: times 4 dd 63 |
| pd_64: times 4 dd 64 |
| pd_512: times 4 dd 512 |
| pd_m524256: times 4 dd -524256 ; (-8192 << 6) + 32 |
| pd_0x3ff: times 4 dd 0x3ff |
| pd_0x4000: times 4 dd 0x4000 |
| pq_0x400000: times 2 dq 0x400000 |
| pq_0x40000000: times 2 dq 0x40000000 |
| pd_65538: times 2 dd 65538 |
| |
| put_bilin_h_rnd: times 4 dw 8 |
| times 4 dw 10 |
| s_8tap_h_rnd: times 2 dd 2 |
| times 2 dd 8 |
| put_s_8tap_v_rnd: times 2 dd 512 |
| times 2 dd 128 |
| s_8tap_h_sh: dd 2, 4 |
| put_s_8tap_v_sh: dd 10, 8 |
| bidir_rnd: times 4 dw -16400 |
| times 4 dw -16388 |
| put_8tap_h_rnd: dd 34, 34, 40, 40 |
| prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) |
| prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) |
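| ; the (8192 << shift) terms fold the -8192 prep output bias into the rounding constants |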
| |
| warp8x8_shift: dd 11, 13 |
| warp8x8_rnd1: dd 1024, 1024, 4096, 4096 |
| warp8x8_rnd2: times 4 dw 4096 |
| times 4 dw 16384 |
| warp8x8t_rnd: times 2 dd 16384 - (8192 << 15) |
| |
| %macro BIDIR_JMP_TABLE 2-* |
| %xdefine %1_%2_table (%%table - 2*%3) |
| %xdefine %%base %1_%2_table |
| %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) |
| %%table: |
| %rep %0 - 2 |
| dd %%prefix %+ .w%3 - %%base |
| %rotate 1 |
| %endrep |
| %endmacro |
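| ; table addresses are biased by the first width's entry so that indexing with tzcnt(w) hits entry 0 |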
| |
| BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 |
| BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 |
| BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 |
| BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 |
| |
| %macro BASE_JMP_TABLE 3-* |
| %xdefine %1_%2_table (%%table - %3) |
| %xdefine %%base %1_%2 |
| %%table: |
| %rep %0 - 2 |
| dw %%base %+ _w%3 - %%base |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put) |
| %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep) |
| |
| BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 |
| BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 |
| |
| %macro SCALED_JMP_TABLE 2-* |
| %xdefine %1_%2_table (%%table - %3) |
| %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) |
| %%table: |
| %rep %0 - 2 |
| dw %%base %+ .w%3 - %%base |
| %rotate 1 |
| %endrep |
| %rotate 2 |
| %%dy_1024: |
| %xdefine %1_%2_dy1_table (%%dy_1024 - %3) |
| %rep %0 - 2 |
| dw %%base %+ .dy1_w%3 - %%base |
| %rotate 1 |
| %endrep |
| %rotate 2 |
| %%dy_2048: |
| %xdefine %1_%2_dy2_table (%%dy_2048 - %3) |
| %rep %0 - 2 |
| dw %%base %+ .dy2_w%3 - %%base |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 |
| SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 |
| |
| cextern mc_subpel_filters |
| %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) |
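| ; the -8 compensates for the 1-based subpel position: mx/my 1..15 map to filter entries 0..14 |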
| |
| cextern mc_warp_filter |
| cextern resize_filter |
| |
| SECTION .text |
| |
| %if UNIX64 |
| DECLARE_REG_TMP 7 |
| %else |
| DECLARE_REG_TMP 5 |
| %endif |
| |
| INIT_XMM ssse3 |
| cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy |
| %define base t0-put_ssse3 |
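| ; t0 is set to the address of .put below, so [base+sym] resolves rodata without absolute relocations |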
| mov mxyd, r6m ; mx |
| LEA t0, put_ssse3 |
| movifnidn wd, wm |
| test mxyd, mxyd |
| jnz .h |
| mov mxyd, r7m ; my |
| test mxyd, mxyd |
| jnz .v |
| .put: |
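| ; dispatch on width: 16-bit offsets from .put, indexed by log2(w) |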
| tzcnt wd, wd |
| movzx wd, word [base+put_ssse3_table+wq*2] |
| add wq, t0 |
| movifnidn hd, hm |
| jmp wq |
| .put_w2: |
| mov r4d, [srcq+ssq*0] |
| mov r6d, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| mov [dstq+dsq*0], r4d |
| mov [dstq+dsq*1], r6d |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w2 |
| RET |
| .put_w4: |
| movq m0, [srcq+ssq*0] |
| movq m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| movq [dstq+dsq*0], m0 |
| movq [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w4 |
| RET |
| .put_w8: |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| mova [dstq+dsq*0], m0 |
| mova [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w8 |
| RET |
| .put_w16: |
| movu m0, [srcq+ssq*0+16*0] |
| movu m1, [srcq+ssq*0+16*1] |
| movu m2, [srcq+ssq*1+16*0] |
| movu m3, [srcq+ssq*1+16*1] |
| lea srcq, [srcq+ssq*2] |
| mova [dstq+dsq*0+16*0], m0 |
| mova [dstq+dsq*0+16*1], m1 |
| mova [dstq+dsq*1+16*0], m2 |
| mova [dstq+dsq*1+16*1], m3 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .put_w16 |
| RET |
| .put_w32: |
| movu m0, [srcq+16*0] |
| movu m1, [srcq+16*1] |
| movu m2, [srcq+16*2] |
| movu m3, [srcq+16*3] |
| add srcq, ssq |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| mova [dstq+16*2], m2 |
| mova [dstq+16*3], m3 |
| add dstq, dsq |
| dec hd |
| jg .put_w32 |
| RET |
| .put_w64: |
| movu m0, [srcq+16*0] |
| movu m1, [srcq+16*1] |
| movu m2, [srcq+16*2] |
| movu m3, [srcq+16*3] |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| mova [dstq+16*2], m2 |
| mova [dstq+16*3], m3 |
| movu m0, [srcq+16*4] |
| movu m1, [srcq+16*5] |
| movu m2, [srcq+16*6] |
| movu m3, [srcq+16*7] |
| add srcq, ssq |
| mova [dstq+16*4], m0 |
| mova [dstq+16*5], m1 |
| mova [dstq+16*6], m2 |
| mova [dstq+16*7], m3 |
| add dstq, dsq |
| dec hd |
| jg .put_w64 |
| RET |
| .put_w128: |
| add srcq, 16*8 |
| add dstq, 16*8 |
| .put_w128_loop: |
| movu m0, [srcq-16*8] |
| movu m1, [srcq-16*7] |
| movu m2, [srcq-16*6] |
| movu m3, [srcq-16*5] |
| mova [dstq-16*8], m0 |
| mova [dstq-16*7], m1 |
| mova [dstq-16*6], m2 |
| mova [dstq-16*5], m3 |
| movu m0, [srcq-16*4] |
| movu m1, [srcq-16*3] |
| movu m2, [srcq-16*2] |
| movu m3, [srcq-16*1] |
| mova [dstq-16*4], m0 |
| mova [dstq-16*3], m1 |
| mova [dstq-16*2], m2 |
| mova [dstq-16*1], m3 |
| movu m0, [srcq+16*0] |
| movu m1, [srcq+16*1] |
| movu m2, [srcq+16*2] |
| movu m3, [srcq+16*3] |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| mova [dstq+16*2], m2 |
| mova [dstq+16*3], m3 |
| movu m0, [srcq+16*4] |
| movu m1, [srcq+16*5] |
| movu m2, [srcq+16*6] |
| movu m3, [srcq+16*7] |
| add srcq, ssq |
| mova [dstq+16*4], m0 |
| mova [dstq+16*5], m1 |
| mova [dstq+16*6], m2 |
| mova [dstq+16*7], m3 |
| add dstq, dsq |
| dec hd |
| jg .put_w128_loop |
| RET |
| .h: |
| movd m5, mxyd |
| mov mxyd, r7m ; my |
| mova m4, [base+pw_16] |
| pshufb m5, [base+pw_256] |
| psubw m4, m5 |
| test mxyd, mxyd |
| jnz .hv |
| ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v |
| mov r6d, r8m ; bitdepth_max |
| shr r6d, 11 |
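| ; r6d = 0 for 10bpc (bitdepth_max 1023), 1 for 12bpc (4095) |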
| movddup m3, [base+put_bilin_h_rnd+r6*8] |
| movifnidn hd, hm |
| sub wd, 8 |
| jg .h_w16 |
| je .h_w8 |
| cmp wd, -4 |
| je .h_w4 |
| .h_w2: |
| movq m1, [srcq+ssq*0] |
| movhps m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| pmullw m0, m4, m1 |
| psrlq m1, 16 |
| pmullw m1, m5 |
| paddw m0, m3 |
| paddw m0, m1 |
| psrlw m0, 4 |
| movd [dstq+dsq*0], m0 |
| punpckhqdq m0, m0 |
| movd [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w2 |
| RET |
| .h_w4: |
| movq m0, [srcq+ssq*0] |
| movhps m0, [srcq+ssq*1] |
| movq m1, [srcq+ssq*0+2] |
| movhps m1, [srcq+ssq*1+2] |
| lea srcq, [srcq+ssq*2] |
| pmullw m0, m4 |
| pmullw m1, m5 |
| paddw m0, m3 |
| paddw m0, m1 |
| psrlw m0, 4 |
| movq [dstq+dsq*0], m0 |
| movhps [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w4 |
| RET |
| .h_w8: |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*0+2] |
| pmullw m0, m4 |
| pmullw m1, m5 |
| paddw m0, m3 |
| paddw m0, m1 |
| movu m1, [srcq+ssq*1] |
| movu m2, [srcq+ssq*1+2] |
| lea srcq, [srcq+ssq*2] |
| pmullw m1, m4 |
| pmullw m2, m5 |
| paddw m1, m3 |
| paddw m1, m2 |
| psrlw m0, 4 |
| psrlw m1, 4 |
| mova [dstq+dsq*0], m0 |
| mova [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w8 |
| RET |
| .h_w16: |
| lea srcq, [srcq+wq*2] |
| lea dstq, [dstq+wq*2] |
| neg wq |
| .h_w16_loop0: |
| mov r6, wq |
| .h_w16_loop: |
| movu m0, [srcq+r6*2+ 0] |
| movu m1, [srcq+r6*2+ 2] |
| pmullw m0, m4 |
| pmullw m1, m5 |
| paddw m0, m3 |
| paddw m0, m1 |
| movu m1, [srcq+r6*2+16] |
| movu m2, [srcq+r6*2+18] |
| pmullw m1, m4 |
| pmullw m2, m5 |
| paddw m1, m3 |
| paddw m1, m2 |
| psrlw m0, 4 |
| psrlw m1, 4 |
| mova [dstq+r6*2+16*0], m0 |
| mova [dstq+r6*2+16*1], m1 |
| add r6, 16 |
| jl .h_w16_loop |
| add srcq, ssq |
| add dstq, dsq |
| dec hd |
| jg .h_w16_loop0 |
| RET |
| .v: |
| shl mxyd, 11 |
| movd m5, mxyd |
| pshufb m5, [base+pw_256] |
| movifnidn hd, hm |
| cmp wd, 4 |
| jg .v_w8 |
| je .v_w4 |
| .v_w2: |
| movd m0, [srcq+ssq*0] |
| .v_w2_loop: |
| movd m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpcklqdq m2, m0, m1 |
| movd m0, [srcq+ssq*0] |
| punpcklqdq m1, m0 |
| psubw m1, m2 |
| pmulhrsw m1, m5 |
| paddw m1, m2 |
| movd [dstq+dsq*0], m1 |
| punpckhqdq m1, m1 |
| movd [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w2_loop |
| RET |
| .v_w4: |
| movq m0, [srcq+ssq*0] |
| .v_w4_loop: |
| movq m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpcklqdq m2, m0, m1 |
| movq m0, [srcq+ssq*0] |
| punpcklqdq m1, m0 |
| psubw m1, m2 |
| pmulhrsw m1, m5 |
| paddw m1, m2 |
| movq [dstq+dsq*0], m1 |
| movhps [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w4_loop |
| RET |
| .v_w8: |
| %if ARCH_X86_64 |
| %if WIN64 |
| push r7 |
| %endif |
| shl wd, 5 |
| mov r7, srcq |
| lea r6d, [wq+hq-256] |
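| ; r6d packs the remaining 8-pixel columns (256 each) with the row count in the low byte |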
| mov r4, dstq |
| %else |
| mov r6, srcq |
| %endif |
| .v_w8_loop0: |
| movu m0, [srcq+ssq*0] |
| .v_w8_loop: |
| movu m3, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| psubw m1, m3, m0 |
| pmulhrsw m1, m5 |
| paddw m1, m0 |
| movu m0, [srcq+ssq*0] |
| psubw m2, m0, m3 |
| pmulhrsw m2, m5 |
| paddw m2, m3 |
| mova [dstq+dsq*0], m1 |
| mova [dstq+dsq*1], m2 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w8_loop |
| %if ARCH_X86_64 |
| add r7, 16 |
| add r4, 16 |
| movzx hd, r6b |
| mov srcq, r7 |
| mov dstq, r4 |
| sub r6d, 1<<8 |
| %else |
| mov dstq, dstmp |
| add r6, 16 |
| mov hd, hm |
| add dstq, 16 |
| mov srcq, r6 |
| mov dstmp, dstq |
| sub wd, 8 |
| %endif |
| jg .v_w8_loop0 |
| %if WIN64 |
| pop r7 |
| %endif |
| RET |
| .hv: |
| WIN64_SPILL_XMM 8 |
| shl mxyd, 11 |
| mova m3, [base+pw_2] |
| movd m6, mxyd |
| mova m7, [base+pw_8192] |
| pshufb m6, [base+pw_256] |
| test dword r8m, 0x800 |
| jnz .hv_12bpc |
| psllw m4, 2 |
| psllw m5, 2 |
| mova m7, [base+pw_2048] |
| .hv_12bpc: |
| movifnidn hd, hm |
| cmp wd, 4 |
| jg .hv_w8 |
| je .hv_w4 |
| .hv_w2: |
| movddup m0, [srcq+ssq*0] |
| pshufhw m1, m0, q0321 |
| pmullw m0, m4 |
| pmullw m1, m5 |
| paddw m0, m3 |
| paddw m0, m1 |
| psrlw m0, 2 |
| .hv_w2_loop: |
| movq m2, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| movhps m2, [srcq+ssq*0] |
| pmullw m1, m4, m2 |
| psrlq m2, 16 |
| pmullw m2, m5 |
| paddw m1, m3 |
| paddw m1, m2 |
| psrlw m1, 2 ; 1 _ 2 _ |
| shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ |
| mova m0, m1 |
| psubw m1, m2 |
| paddw m1, m1 |
| pmulhw m1, m6 |
| paddw m1, m2 |
| pmulhrsw m1, m7 |
| movd [dstq+dsq*0], m1 |
| punpckhqdq m1, m1 |
| movd [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w2_loop |
| RET |
| .hv_w4: |
| movddup m0, [srcq+ssq*0] |
| movddup m1, [srcq+ssq*0+2] |
| pmullw m0, m4 |
| pmullw m1, m5 |
| paddw m0, m3 |
| paddw m0, m1 |
| psrlw m0, 2 |
| .hv_w4_loop: |
| movq m1, [srcq+ssq*1] |
| movq m2, [srcq+ssq*1+2] |
| lea srcq, [srcq+ssq*2] |
| movhps m1, [srcq+ssq*0] |
| movhps m2, [srcq+ssq*0+2] |
| pmullw m1, m4 |
| pmullw m2, m5 |
| paddw m1, m3 |
| paddw m1, m2 |
| psrlw m1, 2 ; 1 2 |
| shufpd m2, m0, m1, 0x01 ; 0 1 |
| mova m0, m1 |
| psubw m1, m2 |
| paddw m1, m1 |
| pmulhw m1, m6 |
| paddw m1, m2 |
| pmulhrsw m1, m7 |
| movq [dstq+dsq*0], m1 |
| movhps [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w4_loop |
| RET |
| .hv_w8: |
| %if ARCH_X86_64 |
| %if WIN64 |
| push r7 |
| %endif |
| shl wd, 5 |
| lea r6d, [wq+hq-256] |
| mov r4, srcq |
| mov r7, dstq |
| %else |
| mov r6, srcq |
| %endif |
| .hv_w8_loop0: |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*0+2] |
| pmullw m0, m4 |
| pmullw m1, m5 |
| paddw m0, m3 |
| paddw m0, m1 |
| psrlw m0, 2 |
| .hv_w8_loop: |
| movu m1, [srcq+ssq*1] |
| movu m2, [srcq+ssq*1+2] |
| lea srcq, [srcq+ssq*2] |
| pmullw m1, m4 |
| pmullw m2, m5 |
| paddw m1, m3 |
| paddw m1, m2 |
| psrlw m1, 2 |
| psubw m2, m1, m0 |
| paddw m2, m2 |
| pmulhw m2, m6 |
| paddw m2, m0 |
| pmulhrsw m2, m7 |
| mova [dstq+dsq*0], m2 |
| movu m0, [srcq+ssq*0] |
| movu m2, [srcq+ssq*0+2] |
| pmullw m0, m4 |
| pmullw m2, m5 |
| paddw m0, m3 |
| paddw m0, m2 |
| psrlw m0, 2 |
| psubw m2, m0, m1 |
| paddw m2, m2 |
| pmulhw m2, m6 |
| paddw m2, m1 |
| pmulhrsw m2, m7 |
| mova [dstq+dsq*1], m2 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w8_loop |
| %if ARCH_X86_64 |
| add r4, 16 |
| add r7, 16 |
| movzx hd, r6b |
| mov srcq, r4 |
| mov dstq, r7 |
| sub r6d, 1<<8 |
| %else |
| mov dstq, dstmp |
| add r6, 16 |
| mov hd, hm |
| add dstq, 16 |
| mov srcq, r6 |
| mov dstmp, dstq |
| sub wd, 8 |
| %endif |
| jg .hv_w8_loop0 |
| %if WIN64 |
| pop r7 |
| %endif |
| RET |
| |
| cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 |
| %define base r6-prep_ssse3 |
| movifnidn mxyd, r5m ; mx |
| LEA r6, prep_ssse3 |
| movifnidn hd, hm |
| test mxyd, mxyd |
| jnz .h |
| mov mxyd, r6m ; my |
| test mxyd, mxyd |
| jnz .v |
| .prep: |
| tzcnt wd, wd |
| movzx wd, word [base+prep_ssse3_table+wq*2] |
| mov r5d, r7m ; bitdepth_max |
| mova m5, [base+pw_8192] |
| add wq, r6 |
| shr r5d, 11 |
| movddup m4, [base+prep_mul+r5*8] |
| lea stride3q, [strideq*3] |
| jmp wq |
| .prep_w4: |
| movq m0, [srcq+strideq*0] |
| movhps m0, [srcq+strideq*1] |
| movq m1, [srcq+strideq*2] |
| movhps m1, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| pmullw m0, m4 |
| pmullw m1, m4 |
| psubw m0, m5 |
| psubw m1, m5 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| add tmpq, 16*2 |
| sub hd, 4 |
| jg .prep_w4 |
| RET |
| .prep_w8: |
| movu m0, [srcq+strideq*0] |
| movu m1, [srcq+strideq*1] |
| movu m2, [srcq+strideq*2] |
| movu m3, [srcq+stride3q ] |
| lea srcq, [srcq+strideq*4] |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| sub hd, 4 |
| jg .prep_w8 |
| RET |
| .prep_w16: |
| movu m0, [srcq+strideq*0+16*0] |
| movu m1, [srcq+strideq*0+16*1] |
| movu m2, [srcq+strideq*1+16*0] |
| movu m3, [srcq+strideq*1+16*1] |
| lea srcq, [srcq+strideq*2] |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| sub hd, 2 |
| jg .prep_w16 |
| RET |
| .prep_w32: |
| movu m0, [srcq+16*0] |
| movu m1, [srcq+16*1] |
| movu m2, [srcq+16*2] |
| movu m3, [srcq+16*3] |
| add srcq, strideq |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| add tmpq, 16*4 |
| dec hd |
| jg .prep_w32 |
| RET |
| .prep_w64: |
| movu m0, [srcq+16*0] |
| movu m1, [srcq+16*1] |
| movu m2, [srcq+16*2] |
| movu m3, [srcq+16*3] |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| movu m0, [srcq+16*4] |
| movu m1, [srcq+16*5] |
| movu m2, [srcq+16*6] |
| movu m3, [srcq+16*7] |
| add srcq, strideq |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq+16*4], m0 |
| mova [tmpq+16*5], m1 |
| mova [tmpq+16*6], m2 |
| mova [tmpq+16*7], m3 |
| add tmpq, 16*8 |
| dec hd |
| jg .prep_w64 |
| RET |
| .prep_w128: |
| movu m0, [srcq+16* 0] |
| movu m1, [srcq+16* 1] |
| movu m2, [srcq+16* 2] |
| movu m3, [srcq+16* 3] |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| mova [tmpq+16*2], m2 |
| mova [tmpq+16*3], m3 |
| movu m0, [srcq+16* 4] |
| movu m1, [srcq+16* 5] |
| movu m2, [srcq+16* 6] |
| movu m3, [srcq+16* 7] |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq+16*4], m0 |
| mova [tmpq+16*5], m1 |
| mova [tmpq+16*6], m2 |
| mova [tmpq+16*7], m3 |
| movu m0, [srcq+16* 8] |
| movu m1, [srcq+16* 9] |
| movu m2, [srcq+16*10] |
| movu m3, [srcq+16*11] |
| add tmpq, 16*16 |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq-16*8], m0 |
| mova [tmpq-16*7], m1 |
| mova [tmpq-16*6], m2 |
| mova [tmpq-16*5], m3 |
| movu m0, [srcq+16*12] |
| movu m1, [srcq+16*13] |
| movu m2, [srcq+16*14] |
| movu m3, [srcq+16*15] |
| add srcq, strideq |
| REPX {pmullw x, m4}, m0, m1, m2, m3 |
| REPX {psubw x, m5}, m0, m1, m2, m3 |
| mova [tmpq-16*4], m0 |
| mova [tmpq-16*3], m1 |
| mova [tmpq-16*2], m2 |
| mova [tmpq-16*1], m3 |
| dec hd |
| jg .prep_w128 |
| RET |
| .h: |
| movd m4, mxyd |
| mov mxyd, r6m ; my |
| mova m3, [base+pw_16] |
| pshufb m4, [base+pw_256] |
| mova m5, [base+pw_32766] |
| psubw m3, m4 |
| test dword r7m, 0x800 |
| jnz .h_12bpc |
| psllw m3, 2 |
| psllw m4, 2 |
| .h_12bpc: |
| test mxyd, mxyd |
| jnz .hv |
| sub wd, 8 |
| je .h_w8 |
| jg .h_w16 |
| .h_w4: |
| movq m0, [srcq+strideq*0] |
| movhps m0, [srcq+strideq*1] |
| movq m1, [srcq+strideq*0+2] |
| movhps m1, [srcq+strideq*1+2] |
| lea srcq, [srcq+strideq*2] |
| pmullw m0, m3 |
| pmullw m1, m4 |
| psubw m0, m5 |
| paddw m0, m1 |
| psraw m0, 2 |
| mova [tmpq], m0 |
| add tmpq, 16 |
| sub hd, 2 |
| jg .h_w4 |
| RET |
| .h_w8: |
| movu m0, [srcq+strideq*0] |
| movu m1, [srcq+strideq*0+2] |
| pmullw m0, m3 |
| pmullw m1, m4 |
| psubw m0, m5 |
| paddw m0, m1 |
| movu m1, [srcq+strideq*1] |
| movu m2, [srcq+strideq*1+2] |
| lea srcq, [srcq+strideq*2] |
| pmullw m1, m3 |
| pmullw m2, m4 |
| psubw m1, m5 |
| paddw m1, m2 |
| psraw m0, 2 |
| psraw m1, 2 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| add tmpq, 16*2 |
| sub hd, 2 |
| jg .h_w8 |
| RET |
| .h_w16: |
| lea srcq, [srcq+wq*2] |
| neg wq |
| .h_w16_loop0: |
| mov r6, wq |
| .h_w16_loop: |
| movu m0, [srcq+r6*2+ 0] |
| movu m1, [srcq+r6*2+ 2] |
| pmullw m0, m3 |
| pmullw m1, m4 |
| psubw m0, m5 |
| paddw m0, m1 |
| movu m1, [srcq+r6*2+16] |
| movu m2, [srcq+r6*2+18] |
| pmullw m1, m3 |
| pmullw m2, m4 |
| psubw m1, m5 |
| paddw m1, m2 |
| psraw m0, 2 |
| psraw m1, 2 |
| mova [tmpq+16*0], m0 |
| mova [tmpq+16*1], m1 |
| add tmpq, 16*2 |
| add r6, 16 |
| jl .h_w16_loop |
| add srcq, strideq |
| dec hd |
| jg .h_w16_loop0 |
| RET |
| .v: |
| movd m4, mxyd |
| mova m3, [base+pw_16] |
| pshufb m4, [base+pw_256] |
| mova m5, [base+pw_32766] |
| psubw m3, m4 |
| test dword r7m, 0x800 |
| jnz .v_12bpc |
| psllw m3, 2 |
| psllw m4, 2 |
| .v_12bpc: |
| cmp wd, 8 |
| je .v_w8 |
| jg .v_w16 |
| .v_w4: |
| movq m0, [srcq+strideq*0] |
| .v_w4_loop: |
| movq m2, [srcq+strideq*1] |
| lea srcq, [srcq+strideq*2] |
| punpcklqdq m1, m0, m2 ; 0 1 |
| movq m0, [srcq+strideq*0] |
| punpcklqdq m2, m0 ; 1 2 |
| pmullw m1, m3 |
| pmullw m2, m4 |
| psubw m1, m5 |
| paddw m1, m2 |
| psraw m1, 2 |
| mova [tmpq], m1 |
| add tmpq, 16 |
| sub hd, 2 |
| jg .v_w4_loop |
| RET |
| .v_w8: |
| movu m0, [srcq+strideq*0] |
| .v_w8_loop: |
| movu m2, [srcq+strideq*1] |
| lea srcq, [srcq+strideq*2] |
| pmullw m0, m3 |
| pmullw m1, m4, m2 |
| psubw m0, m5 |
| paddw m1, m0 |
| movu m0, [srcq+strideq*0] |
| psraw m1, 2 |
| pmullw m2, m3 |
| mova [tmpq+16*0], m1 |
| pmullw m1, m4, m0 |
| psubw m2, m5 |
| paddw m1, m2 |
| psraw m1, 2 |
| mova [tmpq+16*1], m1 |
| add tmpq, 16*2 |
| sub hd, 2 |
| jg .v_w8_loop |
| RET |
| .v_w16: |
| %if WIN64 |
| push r7 |
| %endif |
| mov r5, srcq |
| %if ARCH_X86_64 |
| lea r6d, [wq*4-32] |
| mov wd, wd ; zero-extend wq for 64-bit addressing below |
| lea r6d, [hq+r6*8] |
| mov r7, tmpq |
| %else |
| mov r6d, wd |
| %endif |
| .v_w16_loop0: |
| movu m0, [srcq+strideq*0] |
| .v_w16_loop: |
| movu m2, [srcq+strideq*1] |
| lea srcq, [srcq+strideq*2] |
| pmullw m0, m3 |
| pmullw m1, m4, m2 |
| psubw m0, m5 |
| paddw m1, m0 |
| movu m0, [srcq+strideq*0] |
| psraw m1, 2 |
| pmullw m2, m3 |
| mova [tmpq+wq*0], m1 |
| pmullw m1, m4, m0 |
| psubw m2, m5 |
| paddw m1, m2 |
| psraw m1, 2 |
| mova [tmpq+wq*2], m1 |
| lea tmpq, [tmpq+wq*4] |
| sub hd, 2 |
| jg .v_w16_loop |
| %if ARCH_X86_64 |
| add r5, 16 |
| add r7, 16 |
| movzx hd, r6b |
| mov srcq, r5 |
| mov tmpq, r7 |
| sub r6d, 1<<8 |
| %else |
| mov tmpq, tmpmp |
| add r5, 16 |
| mov hd, hm |
| add tmpq, 16 |
| mov srcq, r5 |
| mov tmpmp, tmpq |
| sub r6d, 8 |
| %endif |
| jg .v_w16_loop0 |
| %if WIN64 |
| pop r7 |
| %endif |
| RET |
| .hv: |
| WIN64_SPILL_XMM 7 |
| shl mxyd, 11 |
| movd m6, mxyd |
| pshufb m6, [base+pw_256] |
| cmp wd, 8 |
| je .hv_w8 |
| jg .hv_w16 |
| .hv_w4: |
| movddup m0, [srcq+strideq*0] |
| movddup m1, [srcq+strideq*0+2] |
| pmullw m0, m3 |
| pmullw m1, m4 |
| psubw m0, m5 |
| paddw m0, m1 |
| psraw m0, 2 |
| .hv_w4_loop: |
| movq m1, [srcq+strideq*1] |
| movq m2, [srcq+strideq*1+2] |
| lea srcq, [srcq+strideq*2] |
| movhps m1, [srcq+strideq*0] |
| movhps m2, [srcq+strideq*0+2] |
| pmullw m1, m3 |
| pmullw m2, m4 |
| psubw m1, m5 |
| paddw m1, m2 |
| psraw m1, 2 ; 1 2 |
| shufpd m2, m0, m1, 0x01 ; 0 1 |
| mova m0, m1 |
| psubw m1, m2 |
| pmulhrsw m1, m6 |
| paddw m1, m2 |
| mova [tmpq], m1 |
| add tmpq, 16 |
| sub hd, 2 |
| jg .hv_w4_loop |
| RET |
| .hv_w8: |
| movu m0, [srcq+strideq*0] |
| movu m1, [srcq+strideq*0+2] |
| pmullw m0, m3 |
| pmullw m1, m4 |
| psubw m0, m5 |
| paddw m0, m1 |
| psraw m0, 2 |
| .hv_w8_loop: |
| movu m1, [srcq+strideq*1] |
| movu m2, [srcq+strideq*1+2] |
| lea srcq, [srcq+strideq*2] |
| pmullw m1, m3 |
| pmullw m2, m4 |
| psubw m1, m5 |
| paddw m1, m2 |
| psraw m1, 2 |
| psubw m2, m1, m0 |
| pmulhrsw m2, m6 |
| paddw m2, m0 |
| mova [tmpq+16*0], m2 |
| movu m0, [srcq+strideq*0] |
| movu m2, [srcq+strideq*0+2] |
| pmullw m0, m3 |
| pmullw m2, m4 |
| psubw m0, m5 |
| paddw m0, m2 |
| psraw m0, 2 |
| psubw m2, m0, m1 |
| pmulhrsw m2, m6 |
| paddw m2, m1 |
| mova [tmpq+16*1], m2 |
| add tmpq, 16*2 |
| sub hd, 2 |
| jg .hv_w8_loop |
| RET |
| .hv_w16: |
| %if WIN64 |
| push r7 |
| %endif |
| mov r5, srcq |
| %if ARCH_X86_64 |
| lea r6d, [wq*4-32] |
| mov wd, wd ; zero-extend wq for 64-bit addressing below |
| lea r6d, [hq+r6*8] |
| mov r7, tmpq |
| %else |
| mov r6d, wd |
| %endif |
| .hv_w16_loop0: |
| movu m0, [srcq+strideq*0] |
| movu m1, [srcq+strideq*0+2] |
| pmullw m0, m3 |
| pmullw m1, m4 |
| psubw m0, m5 |
| paddw m0, m1 |
| psraw m0, 2 |
| .hv_w16_loop: |
| movu m1, [srcq+strideq*1] |
| movu m2, [srcq+strideq*1+2] |
| lea srcq, [srcq+strideq*2] |
| pmullw m1, m3 |
| pmullw m2, m4 |
| psubw m1, m5 |
| paddw m1, m2 |
| psraw m1, 2 |
| psubw m2, m1, m0 |
| pmulhrsw m2, m6 |
| paddw m2, m0 |
| mova [tmpq+wq*0], m2 |
| movu m0, [srcq+strideq*0] |
| movu m2, [srcq+strideq*0+2] |
| pmullw m0, m3 |
| pmullw m2, m4 |
| psubw m0, m5 |
| paddw m0, m2 |
| psraw m0, 2 |
| psubw m2, m0, m1 |
| pmulhrsw m2, m6 |
| paddw m2, m1 |
| mova [tmpq+wq*2], m2 |
| lea tmpq, [tmpq+wq*4] |
| sub hd, 2 |
| jg .hv_w16_loop |
| %if ARCH_X86_64 |
| add r5, 16 |
| add r7, 16 |
| movzx hd, r6b |
| mov srcq, r5 |
| mov tmpq, r7 |
| sub r6d, 1<<8 |
| %else |
| mov tmpq, tmpmp |
| add r5, 16 |
| mov hd, hm |
| add tmpq, 16 |
| mov srcq, r5 |
| mov tmpmp, tmpq |
| sub r6d, 8 |
| %endif |
| jg .hv_w16_loop0 |
| %if WIN64 |
| pop r7 |
| %endif |
| RET |
| |
| ; int8_t subpel_filters[5][15][8] |
| %assign FILTER_REGULAR (0*15 << 16) | 3*15 |
| %assign FILTER_SMOOTH (1*15 << 16) | 4*15 |
| %assign FILTER_SHARP (2*15 << 16) | 3*15 |
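| ; high word = 8-tap filter row, low word = 4-tap filter row used for w <= 4 (in units of 15 entries) |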
| |
| %macro FN 4 ; prefix, type, type_h, type_v |
| cglobal %1_%2_16bpc |
| mov t0d, FILTER_%3 |
| %ifidn %3, %4 |
| mov t1d, t0d |
| %else |
| mov t1d, FILTER_%4 |
| %endif |
| %ifnidn %2, regular ; skip the jump in the last filter |
| jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) |
| %endif |
| %endmacro |
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 1, 2, 6 |
| %elif WIN64 |
| DECLARE_REG_TMP 4, 5, 8 |
| %else |
| DECLARE_REG_TMP 7, 8, 8 |
| %endif |
| |
| %define PUT_8TAP_FN FN put_8tap, |
| PUT_8TAP_FN sharp, SHARP, SHARP |
| PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH |
| PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP |
| PUT_8TAP_FN smooth, SMOOTH, SMOOTH |
| PUT_8TAP_FN sharp_regular, SHARP, REGULAR |
| PUT_8TAP_FN regular_sharp, REGULAR, SHARP |
| PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR |
| PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH |
| PUT_8TAP_FN regular, REGULAR, REGULAR |
| |
| %if ARCH_X86_32 |
| cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my |
| %define mxb r0b |
| %define mxd r0 |
| %define mxq r0 |
| %define myb r1b |
| %define myd r1 |
| %define myq r1 |
| %define m8 [esp+16*0] |
| %define m9 [esp+16*1] |
| %define m10 [esp+16*2] |
| %define m11 [esp+16*3] |
| %define m12 [esp+16*4] |
| %define m13 [esp+16*5] |
| %define m14 [esp+16*6] |
| %define m15 [esp+16*7] |
| %else |
| cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my |
| %endif |
| %define base t2-put_ssse3 |
| imul mxd, mxm, 0x010101 |
| add mxd, t0d ; 8tap_h, mx, 4tap_h |
| imul myd, mym, 0x010101 |
| add myd, t1d ; 8tap_v, my, 4tap_v |
| LEA t2, put_ssse3 |
| movifnidn wd, wm |
| movifnidn srcq, srcmp |
| movifnidn ssq, ssmp |
| movifnidn hd, hm |
| test mxd, 0xf00 |
| jnz .h |
| test myd, 0xf00 |
| jnz .v |
| tzcnt wd, wd |
| movzx wd, word [base+put_ssse3_table+wq*2] |
| movifnidn dstq, dstmp |
| movifnidn dsq, dsmp |
| add wq, t2 |
| %if WIN64 |
| pop r8 |
| pop r7 |
| %endif |
| jmp wq |
| .h: |
| test myd, 0xf00 |
| jnz .hv |
| mov myd, r8m |
| movd m5, r8m |
| shr myd, 11 |
| movddup m4, [base+put_8tap_h_rnd+myq*8] |
| movifnidn dsq, dsmp |
| pshufb m5, [base+pw_256] |
| cmp wd, 4 |
| jg .h_w8 |
| movzx mxd, mxb |
| lea srcq, [srcq-2] |
| movq m3, [base+subpel_filters+mxq*8] |
| movifnidn dstq, dstmp |
| punpcklbw m3, m3 |
| psraw m3, 8 ; sign-extend |
| je .h_w4 |
| .h_w2: |
| mova m2, [base+spel_h_shuf2] |
| pshufd m3, m3, q2121 |
| .h_w2_loop: |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| pshufb m0, m2 |
| pshufb m1, m2 |
| pmaddwd m0, m3 |
| pmaddwd m1, m3 |
| phaddd m0, m1 |
| paddd m0, m4 |
| psrad m0, 6 |
| packssdw m0, m0 |
| pxor m1, m1 |
| pminsw m0, m5 |
| pmaxsw m0, m1 |
| movd [dstq+dsq*0], m0 |
| pshuflw m0, m0, q3232 |
| movd [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .h_w2_loop |
| RET |
| .h_w4: |
| WIN64_SPILL_XMM 8 |
| mova m6, [base+spel_h_shufA] |
| mova m7, [base+spel_h_shufB] |
| pshufd m2, m3, q1111 |
| pshufd m3, m3, q2222 |
| .h_w4_loop: |
| movu m1, [srcq] |
| add srcq, ssq |
| pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 |
| pshufb m1, m7 ; 2 3 3 4 4 5 5 6 |
| pmaddwd m0, m2 |
| pmaddwd m1, m3 |
| paddd m0, m4 |
| paddd m0, m1 |
| psrad m0, 6 |
| packssdw m0, m0 |
| pxor m1, m1 |
| pminsw m0, m5 |
| pmaxsw m0, m1 |
| movq [dstq], m0 |
| add dstq, dsq |
| dec hd |
| jg .h_w4_loop |
| RET |
| .h_w8: |
| %if WIN64 |
| %assign stack_offset stack_offset - stack_size_padded |
| WIN64_SPILL_XMM 12 |
| %endif |
| shr mxd, 16 |
| movq m3, [base+subpel_filters+mxq*8] |
| movifnidn dstq, dstmp |
| mova m6, [base+spel_h_shufA] |
| mova m7, [base+spel_h_shufB] |
| %if UNIX64 |
| mov wd, wd ; zero-extend wq for 64-bit addressing below |
| %endif |
| lea srcq, [srcq+wq*2] |
| punpcklbw m3, m3 |
| lea dstq, [dstq+wq*2] |
| psraw m3, 8 |
| neg wq |
| %if ARCH_X86_32 |
| ALLOC_STACK -16*4 |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| mova m8, m0 |
| mova m9, m1 |
| mova m10, m2 |
| mova m11, m3 |
| %else |
| pshufd m8, m3, q0000 |
| pshufd m9, m3, q1111 |
| pshufd m10, m3, q2222 |
| pshufd m11, m3, q3333 |
| %endif |
| .h_w8_loop0: |
| mov r6, wq |
| .h_w8_loop: |
| movu m0, [srcq+r6*2- 6] |
| movu m1, [srcq+r6*2+ 2] |
| pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 |
| pshufb m0, m7 ; 2 3 3 4 4 5 5 6 |
| pmaddwd m2, m8 ; abcd0 |
| pmaddwd m0, m9 ; abcd1 |
| pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 |
| pshufb m1, m7 ; 6 7 7 8 8 9 9 a |
| paddd m2, m4 |
| paddd m0, m2 |
| pmaddwd m2, m10, m3 ; abcd2 |
| pmaddwd m3, m8 ; efgh0 |
| paddd m0, m2 |
| pmaddwd m2, m11, m1 ; abcd3 |
| pmaddwd m1, m9 ; efgh1 |
| paddd m0, m2 |
| movu m2, [srcq+r6*2+10] |
| paddd m3, m4 |
| paddd m1, m3 |
| pshufb m3, m2, m6 ; 8 9 9 a a b b c |
| pshufb m2, m7 ; a b b c c d d e |
| pmaddwd m3, m10 ; efgh2 |
| pmaddwd m2, m11 ; efgh3 |
| paddd m1, m3 |
| paddd m1, m2 |
| psrad m0, 6 |
| psrad m1, 6 |
| packssdw m0, m1 |
| pxor m1, m1 |
| pminsw m0, m5 |
| pmaxsw m0, m1 |
| mova [dstq+r6*2], m0 |
| add r6, 8 |
| jl .h_w8_loop |
| add srcq, ssq |
| add dstq, dsq |
| dec hd |
| jg .h_w8_loop0 |
| RET |
| .v: |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 6 |
| cmovb myd, mxd |
| movq m3, [base+subpel_filters+myq*8] |
| %if STACK_ALIGNMENT < 16 |
| %xdefine rstk rsp |
| %else |
| %assign stack_offset stack_offset - stack_size_padded |
| %endif |
| %if WIN64 |
| WIN64_SPILL_XMM 15 |
| %endif |
| movd m7, r8m |
| movifnidn dstq, dstmp |
| movifnidn dsq, dsmp |
| punpcklbw m3, m3 |
| pshufb m7, [base+pw_256] |
| psraw m3, 8 ; sign-extend |
| %if ARCH_X86_32 |
| ALLOC_STACK -16*7 |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| mova m8, m0 |
| mova m9, m1 |
| mova m10, m2 |
| mova m11, m3 |
| %else |
| pshufd m8, m3, q0000 |
| pshufd m9, m3, q1111 |
| pshufd m10, m3, q2222 |
| pshufd m11, m3, q3333 |
| %endif |
| lea r6, [ssq*3] |
| sub srcq, r6 |
| cmp wd, 2 |
| jne .v_w4 |
| .v_w2: |
| movd m1, [srcq+ssq*0] |
| movd m4, [srcq+ssq*1] |
| movd m2, [srcq+ssq*2] |
| add srcq, r6 |
| movd m5, [srcq+ssq*0] |
| movd m3, [srcq+ssq*1] |
| movd m6, [srcq+ssq*2] |
| add srcq, r6 |
| movd m0, [srcq+ssq*0] |
| punpckldq m1, m4 ; 0 1 |
| punpckldq m4, m2 ; 1 2 |
| punpckldq m2, m5 ; 2 3 |
| punpckldq m5, m3 ; 3 4 |
| punpckldq m3, m6 ; 4 5 |
| punpckldq m6, m0 ; 5 6 |
| punpcklwd m1, m4 ; 01 12 |
| punpcklwd m2, m5 ; 23 34 |
| punpcklwd m3, m6 ; 45 56 |
| pxor m6, m6 |
| .v_w2_loop: |
| movd m4, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| pmaddwd m5, m8, m1 ; a0 b0 |
| mova m1, m2 |
| pmaddwd m2, m9 ; a1 b1 |
| paddd m5, m2 |
| mova m2, m3 |
| pmaddwd m3, m10 ; a2 b2 |
| paddd m5, m3 |
| punpckldq m3, m0, m4 ; 6 7 |
| movd m0, [srcq+ssq*0] |
| punpckldq m4, m0 ; 7 8 |
| punpcklwd m3, m4 ; 67 78 |
| pmaddwd m4, m11, m3 ; a3 b3 |
| paddd m5, m4 |
| psrad m5, 5 |
| packssdw m5, m5 |
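| ; clamp negatives to 0, finish the rounded >>1 via pavgw with zero, then clamp to bitdepth_max (m7) |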
| pmaxsw m5, m6 |
| pavgw m5, m6 |
| pminsw m5, m7 |
| movd [dstq+dsq*0], m5 |
| pshuflw m5, m5, q3232 |
| movd [dstq+dsq*1], m5 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w2_loop |
| RET |
| .v_w4: |
| %if ARCH_X86_32 |
| shl wd, 14 |
| %if STACK_ALIGNMENT < 16 |
| mov [esp+4*29], srcq |
| mov [esp+4*30], dstq |
| %else |
| mov srcmp, srcq |
| %endif |
| lea wd, [wq+hq-(1<<16)] |
| %else |
| shl wd, 6 |
| mov r7, srcq |
| mov r8, dstq |
| lea wd, [wq+hq-(1<<8)] |
| %endif |
| .v_w4_loop0: |
| movq m1, [srcq+ssq*0] |
| movq m2, [srcq+ssq*1] |
| movq m3, [srcq+ssq*2] |
| add srcq, r6 |
| movq m4, [srcq+ssq*0] |
| movq m5, [srcq+ssq*1] |
| movq m6, [srcq+ssq*2] |
| add srcq, r6 |
| movq m0, [srcq+ssq*0] |
| punpcklwd m1, m2 ; 01 |
| punpcklwd m2, m3 ; 12 |
| punpcklwd m3, m4 ; 23 |
| punpcklwd m4, m5 ; 34 |
| punpcklwd m5, m6 ; 45 |
| punpcklwd m6, m0 ; 56 |
| %if ARCH_X86_32 |
| jmp .v_w4_loop_start |
| .v_w4_loop: |
| mova m1, m12 |
| mova m2, m13 |
| mova m3, m14 |
| .v_w4_loop_start: |
| pmaddwd m1, m8 ; a0 |
| pmaddwd m2, m8 ; b0 |
| mova m12, m3 |
| mova m13, m4 |
| pmaddwd m3, m9 ; a1 |
| pmaddwd m4, m9 ; b1 |
| paddd m1, m3 |
| paddd m2, m4 |
| mova m14, m5 |
| mova m4, m6 |
| pmaddwd m5, m10 ; a2 |
| pmaddwd m6, m10 ; b2 |
| paddd m1, m5 |
| paddd m2, m6 |
| movq m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpcklwd m5, m0, m6 ; 67 |
| movq m0, [srcq+ssq*0] |
| pmaddwd m3, m11, m5 ; a3 |
| punpcklwd m6, m0 ; 78 |
| paddd m1, m3 |
| pmaddwd m3, m11, m6 ; b3 |
| paddd m2, m3 |
| psrad m1, 5 |
| psrad m2, 5 |
| packssdw m1, m2 |
| pxor m2, m2 |
| pmaxsw m1, m2 |
| pavgw m1, m2 |
| pminsw m1, m7 |
| movq [dstq+dsq*0], m1 |
| movhps [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w4_loop |
| %if STACK_ALIGNMENT < 16 |
| mov srcq, [esp+4*29] |
| mov dstq, [esp+4*30] |
| movzx hd, ww |
| add srcq, 8 |
| add dstq, 8 |
| mov [esp+4*29], srcq |
| mov [esp+4*30], dstq |
| %else |
| mov srcq, srcmp |
| mov dstq, dstmp |
| movzx hd, ww |
| add srcq, 8 |
| add dstq, 8 |
| mov srcmp, srcq |
| mov dstmp, dstq |
| %endif |
| sub wd, 1<<16 |
| %else |
| .v_w4_loop: |
| pmaddwd m12, m8, m1 ; a0 |
| pmaddwd m13, m8, m2 ; b0 |
| mova m1, m3 |
| mova m2, m4 |
| pmaddwd m3, m9 ; a1 |
| pmaddwd m4, m9 ; b1 |
| paddd m12, m3 |
| paddd m13, m4 |
| mova m3, m5 |
| mova m4, m6 |
| pmaddwd m5, m10 ; a2 |
| pmaddwd m6, m10 ; b2 |
| paddd m12, m5 |
| paddd m13, m6 |
| movq m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpcklwd m5, m0, m6 ; 67 |
| movq m0, [srcq+ssq*0] |
| pmaddwd m14, m11, m5 ; a3 |
| punpcklwd m6, m0 ; 78 |
| paddd m12, m14 |
| pmaddwd m14, m11, m6 ; b3 |
| paddd m13, m14 |
| psrad m12, 5 |
| psrad m13, 5 |
| packssdw m12, m13 |
| pxor m13, m13 |
| pmaxsw m12, m13 |
| pavgw m12, m13 |
| pminsw m12, m7 |
| movq [dstq+dsq*0], m12 |
| movhps [dstq+dsq*1], m12 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .v_w4_loop |
| add r7, 8 |
| add r8, 8 |
| movzx hd, wb |
| mov srcq, r7 |
| mov dstq, r8 |
| sub wd, 1<<8 |
| %endif |
| jg .v_w4_loop0 |
| RET |
| .hv: |
| %if STACK_ALIGNMENT < 16 |
| %xdefine rstk rsp |
| %else |
| %assign stack_offset stack_offset - stack_size_padded |
| %endif |
| %if ARCH_X86_32 |
| movd m4, r8m |
| mova m6, [base+pd_512] |
| pshufb m4, [base+pw_256] |
| %else |
| %if WIN64 |
| ALLOC_STACK 16*6, 16 |
| %endif |
| movd m15, r8m |
| pshufb m15, [base+pw_256] |
| %endif |
| cmp wd, 4 |
| jg .hv_w8 |
| movzx mxd, mxb |
| je .hv_w4 |
| movq m0, [base+subpel_filters+mxq*8] |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 6 |
| cmovb myd, mxd |
| movq m3, [base+subpel_filters+myq*8] |
| %if ARCH_X86_32 |
| mov dstq, dstmp |
| mov dsq, dsmp |
| mova m5, [base+spel_h_shuf2] |
| ALLOC_STACK -16*8 |
| %else |
| mova m6, [base+pd_512] |
| mova m9, [base+spel_h_shuf2] |
| %endif |
| pshuflw m0, m0, q2121 |
| pxor m7, m7 |
| punpcklbw m7, m0 |
| punpcklbw m3, m3 |
| psraw m3, 8 ; sign-extend |
| test dword r8m, 0x800 |
| jz .hv_w2_10bpc |
| psraw m7, 2 |
| psllw m3, 2 |
| .hv_w2_10bpc: |
| lea r6, [ssq*3] |
| sub srcq, 2 |
| sub srcq, r6 |
| %if ARCH_X86_32 |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| mova m9, m5 |
| mova m11, m0 |
| mova m12, m1 |
| mova m13, m2 |
| mova m14, m3 |
| mova m15, m4 |
| %else |
| pshufd m11, m3, q0000 |
| pshufd m12, m3, q1111 |
| pshufd m13, m3, q2222 |
| pshufd m14, m3, q3333 |
| %endif |
| movu m2, [srcq+ssq*0] |
| movu m3, [srcq+ssq*1] |
| movu m1, [srcq+ssq*2] |
| add srcq, r6 |
| movu m4, [srcq+ssq*0] |
| %if ARCH_X86_32 |
| REPX {pshufb x, m5}, m2, m3, m1, m4 |
| %else |
| REPX {pshufb x, m9}, m2, m3, m1, m4 |
| %endif |
| REPX {pmaddwd x, m7}, m2, m3, m1, m4 |
| phaddd m2, m3 ; 0 1 |
| phaddd m1, m4 ; 2 3 |
| movu m3, [srcq+ssq*1] |
| movu m4, [srcq+ssq*2] |
| add srcq, r6 |
| movu m0, [srcq+ssq*0] |
| %if ARCH_X86_32 |
| REPX {pshufb x, m5}, m3, m4, m0 |
| %else |
| REPX {pshufb x, m9}, m3, m4, m0 |
| %endif |
| REPX {pmaddwd x, m7}, m3, m4, m0 |
| phaddd m3, m4 ; 4 5 |
| phaddd m0, m0 ; 6 6 |
| REPX {paddd x, m6}, m2, m1, m3, m0 |
| REPX {psrad x, 10}, m2, m1, m3, m0 |
| packssdw m2, m1 ; 0 1 2 3 |
| packssdw m3, m0 ; 4 5 6 _ |
| palignr m4, m3, m2, 4 ; 1 2 3 4 |
| pshufd m5, m3, q0321 ; 5 6 _ _ |
| punpcklwd m1, m2, m4 ; 01 12 |
| punpckhwd m2, m4 ; 23 34 |
| punpcklwd m3, m5 ; 45 56 |
| .hv_w2_loop: |
| movu m4, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| movu m5, [srcq+ssq*0] |
| pshufb m4, m9 |
| pshufb m5, m9 |
| pmaddwd m4, m7 |
| pmaddwd m5, m7 |
| phaddd m4, m5 |
| pmaddwd m5, m11, m1 ; a0 b0 |
| mova m1, m2 |
| pmaddwd m2, m12 ; a1 b1 |
| paddd m5, m2 |
| mova m2, m3 |
| pmaddwd m3, m13 ; a2 b2 |
| paddd m5, m3 |
| paddd m4, m6 |
| psrad m4, 10 ; 7 8 |
| packssdw m0, m4 |
| pshufd m3, m0, q2103 |
| punpckhwd m3, m0 ; 67 78 |
| mova m0, m4 |
| pmaddwd m4, m14, m3 ; a3 b3 |
| paddd m5, m6 |
| paddd m5, m4 |
| psrad m5, 10 |
| packssdw m5, m5 |
| pxor m4, m4 |
| pminsw m5, m15 |
| pmaxsw m5, m4 |
| movd [dstq+dsq*0], m5 |
| pshuflw m5, m5, q3232 |
| movd [dstq+dsq*1], m5 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w2_loop |
| RET |
| .hv_w8: |
| shr mxd, 16 |
| .hv_w4: |
| movq m2, [base+subpel_filters+mxq*8] |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 6 |
| cmovb myd, mxd |
| movq m3, [base+subpel_filters+myq*8] |
| %if ARCH_X86_32 |
| %if STACK_ALIGNMENT < 16 |
| %xdefine rstk rsp |
| %else |
| %assign stack_offset stack_offset - stack_size_padded |
| %endif |
| mov dstq, dstmp |
| mov dsq, dsmp |
| mova m0, [base+spel_h_shufA] |
| mova m1, [base+spel_h_shufB] |
| ALLOC_STACK -16*15 |
| mova m8, m0 |
| mova m9, m1 |
| mova m14, m6 |
| %else |
| mova m8, [base+spel_h_shufA] |
| mova m9, [base+spel_h_shufB] |
| %endif |
| pxor m0, m0 |
| punpcklbw m0, m2 |
| punpcklbw m3, m3 |
| psraw m3, 8 |
| test dword r8m, 0x800 |
| jz .hv_w4_10bpc |
| psraw m0, 2 |
| psllw m3, 2 |
| .hv_w4_10bpc: |
| lea r6, [ssq*3] |
| sub srcq, 6 |
| sub srcq, r6 |
| %if ARCH_X86_32 |
| %define tmp esp+16*8 |
| shl wd, 14 |
| %if STACK_ALIGNMENT < 16 |
| mov [esp+4*61], srcq |
| mov [esp+4*62], dstq |
| %else |
| mov srcmp, srcq |
| %endif |
| mova [tmp+16*5], m4 |
| lea wd, [wq+hq-(1<<16)] |
| pshufd m1, m0, q0000 |
| pshufd m2, m0, q1111 |
| pshufd m5, m0, q2222 |
| pshufd m0, m0, q3333 |
| mova m10, m1 |
| mova m11, m2 |
| mova m12, m5 |
| mova m13, m0 |
| %else |
| %if WIN64 |
| %define tmp rsp |
| %else |
| %define tmp rsp-104 ; red zone |
| %endif |
| shl wd, 6 |
| mov r7, srcq |
| mov r8, dstq |
| lea wd, [wq+hq-(1<<8)] |
| pshufd m10, m0, q0000 |
| pshufd m11, m0, q1111 |
| pshufd m12, m0, q2222 |
| pshufd m13, m0, q3333 |
| mova [tmp+16*5], m15 |
| %endif |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| mova [tmp+16*1], m0 |
| mova [tmp+16*2], m1 |
| mova [tmp+16*3], m2 |
| mova [tmp+16*4], m3 |
| %macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] |
| pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 |
| pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 |
| pmaddwd m%3, m10 |
| pmaddwd m%1, m11 |
| paddd m%3, %5 |
| paddd m%1, m%3 |
| pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 |
| pshufb m%2, m9 ; 6 7 7 8 8 9 9 a |
| pmaddwd m%3, m12 |
| pmaddwd m%2, m13 |
| paddd m%1, m%3 |
| paddd m%1, m%2 |
| psrad m%1, %4 |
| %endmacro |
| .hv_w4_loop0: |
| %if ARCH_X86_64 |
| mova m14, [pd_512] |
| %endif |
| movu m4, [srcq+ssq*0+0] |
| movu m1, [srcq+ssq*0+8] |
| movu m5, [srcq+ssq*1+0] |
| movu m2, [srcq+ssq*1+8] |
| movu m6, [srcq+ssq*2+0] |
| movu m3, [srcq+ssq*2+8] |
| add srcq, r6 |
| PUT_8TAP_HV_H 4, 1, 0, 10 |
| PUT_8TAP_HV_H 5, 2, 0, 10 |
| PUT_8TAP_HV_H 6, 3, 0, 10 |
| movu m7, [srcq+ssq*0+0] |
| movu m2, [srcq+ssq*0+8] |
| movu m1, [srcq+ssq*1+0] |
| movu m3, [srcq+ssq*1+8] |
| PUT_8TAP_HV_H 7, 2, 0, 10 |
| PUT_8TAP_HV_H 1, 3, 0, 10 |
| movu m2, [srcq+ssq*2+0] |
| movu m3, [srcq+ssq*2+8] |
| add srcq, r6 |
| PUT_8TAP_HV_H 2, 3, 0, 10 |
| packssdw m4, m7 ; 0 3 |
| packssdw m5, m1 ; 1 4 |
| movu m0, [srcq+ssq*0+0] |
| movu m1, [srcq+ssq*0+8] |
| PUT_8TAP_HV_H 0, 1, 3, 10 |
| packssdw m6, m2 ; 2 5 |
| packssdw m7, m0 ; 3 6 |
| punpcklwd m1, m4, m5 ; 01 |
| punpckhwd m4, m5 ; 34 |
| punpcklwd m2, m5, m6 ; 12 |
| punpckhwd m5, m6 ; 45 |
| punpcklwd m3, m6, m7 ; 23 |
| punpckhwd m6, m7 ; 56 |
| %if ARCH_X86_32 |
| jmp .hv_w4_loop_start |
| .hv_w4_loop: |
| mova m1, [tmp+16*6] |
| mova m2, m15 |
| .hv_w4_loop_start: |
| mova m7, [tmp+16*1] |
| pmaddwd m1, m7 ; a0 |
| pmaddwd m2, m7 ; b0 |
| mova m7, [tmp+16*2] |
| mova [tmp+16*6], m3 |
| pmaddwd m3, m7 ; a1 |
| mova m15, m4 |
| pmaddwd m4, m7 ; b1 |
| mova m7, [tmp+16*3] |
| paddd m1, m3 |
| paddd m2, m4 |
| mova m3, m5 |
| pmaddwd m5, m7 ; a2 |
| mova m4, m6 |
| pmaddwd m6, m7 ; b2 |
| paddd m1, m5 |
| paddd m2, m6 |
| movu m7, [srcq+ssq*1+0] |
| movu m5, [srcq+ssq*1+8] |
| lea srcq, [srcq+ssq*2] |
| PUT_8TAP_HV_H 7, 5, 6, 10 |
| packssdw m0, m7 ; 6 7 |
| mova [tmp+16*0], m0 |
| movu m0, [srcq+ssq*0+0] |
| movu m5, [srcq+ssq*0+8] |
| PUT_8TAP_HV_H 0, 5, 6, 10 |
| mova m6, [tmp+16*0] |
| packssdw m7, m0 ; 7 8 |
| punpcklwd m5, m6, m7 ; 67 |
| punpckhwd m6, m7 ; 78 |
| pmaddwd m7, m5, [tmp+16*4] |
| paddd m1, m7 ; a3 |
| pmaddwd m7, m6, [tmp+16*4] |
| paddd m2, m7 ; b3 |
| psrad m1, 9 |
| psrad m2, 9 |
| packssdw m1, m2 |
| pxor m7, m7 |
| pmaxsw m1, m7 |
| pavgw m7, m1 |
| pminsw m7, [tmp+16*5] |
| movq [dstq+dsq*0], m7 |
| movhps [dstq+dsq*1], m7 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w4_loop |
| %if STACK_ALIGNMENT < 16 |
| mov srcq, [esp+4*61] |
| mov dstq, [esp+4*62] |
| add srcq, 8 |
| add dstq, 8 |
| mov [esp+4*61], srcq |
| mov [esp+4*62], dstq |
| %else |
| mov srcq, srcmp |
| mov dstq, dstmp |
| add srcq, 8 |
| add dstq, 8 |
| mov srcmp, srcq |
| mov dstmp, dstq |
| %endif |
| movzx hd, ww |
| sub wd, 1<<16 |
| %else |
| .hv_w4_loop: |
| mova m15, [tmp+16*1] |
| pmaddwd m14, m15, m1 ; a0 |
| pmaddwd m15, m2 ; b0 |
| mova m7, [tmp+16*2] |
| mova m1, m3 |
| pmaddwd m3, m7 ; a1 |
| mova m2, m4 |
| pmaddwd m4, m7 ; b1 |
| mova m7, [tmp+16*3] |
| paddd m14, m3 |
| paddd m15, m4 |
| mova m3, m5 |
| pmaddwd m5, m7 ; a2 |
| mova m4, m6 |
| pmaddwd m6, m7 ; b2 |
| paddd m14, m5 |
| paddd m15, m6 |
| movu m7, [srcq+ssq*1+0] |
| movu m5, [srcq+ssq*1+8] |
| lea srcq, [srcq+ssq*2] |
| PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] |
| packssdw m0, m7 ; 6 7 |
| mova [tmp+16*0], m0 |
| movu m0, [srcq+ssq*0+0] |
| movu m5, [srcq+ssq*0+8] |
| PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] |
| mova m6, [tmp+16*0] |
| packssdw m7, m0 ; 7 8 |
| punpcklwd m5, m6, m7 ; 67 |
| punpckhwd m6, m7 ; 78 |
| pmaddwd m7, m5, [tmp+16*4] |
| paddd m14, m7 ; a3 |
| pmaddwd m7, m6, [tmp+16*4] |
| paddd m15, m7 ; b3 |
| psrad m14, 9 |
| psrad m15, 9 |
| packssdw m14, m15 |
| pxor m7, m7 |
| pmaxsw m14, m7 |
| pavgw m7, m14 |
| pminsw m7, [tmp+16*5] |
| movq [dstq+dsq*0], m7 |
| movhps [dstq+dsq*1], m7 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .hv_w4_loop |
| add r7, 8 |
| add r8, 8 |
| movzx hd, wb |
| mov srcq, r7 |
| mov dstq, r8 |
| sub wd, 1<<8 |
| %endif |
| jg .hv_w4_loop0 |
| RET |
| %undef tmp |
| |
| %if ARCH_X86_32 |
| DECLARE_REG_TMP 2, 1, 6, 4 |
| %elif WIN64 |
| DECLARE_REG_TMP 6, 4, 7, 4 |
| %else |
| DECLARE_REG_TMP 6, 7, 7, 8 |
| %endif |
| |
| %define PREP_8TAP_FN FN prep_8tap, |
| PREP_8TAP_FN sharp, SHARP, SHARP |
| PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH |
| PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP |
| PREP_8TAP_FN smooth, SMOOTH, SMOOTH |
| PREP_8TAP_FN sharp_regular, SHARP, REGULAR |
| PREP_8TAP_FN regular_sharp, REGULAR, SHARP |
| PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR |
| PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH |
| PREP_8TAP_FN regular, REGULAR, REGULAR |
| |
| %if ARCH_X86_32 |
| cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my |
| %define mxb r0b |
| %define mxd r0 |
| %define mxq r0 |
| %define myb r2b |
| %define myd r2 |
| %define myq r2 |
| %else |
| cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my |
| %endif |
| %define base t2-prep_ssse3 |
| imul mxd, mxm, 0x010101 |
| add mxd, t0d ; 8tap_h, mx, 4tap_h |
| imul myd, mym, 0x010101 |
| add myd, t1d ; 8tap_v, my, 4tap_v |
| LEA t2, prep_ssse3 |
| movifnidn wd, wm |
| movifnidn srcq, srcmp |
| test mxd, 0xf00 |
| jnz .h |
| movifnidn hd, hm |
| test myd, 0xf00 |
| jnz .v |
| tzcnt wd, wd |
| mov myd, r7m ; bitdepth_max |
| movzx wd, word [base+prep_ssse3_table+wq*2] |
| mova m5, [base+pw_8192] |
| shr myd, 11 |
| add wq, t2 |
| movddup m4, [base+prep_mul+myq*8] |
| movifnidn ssq, ssmp |
| movifnidn tmpq, tmpmp |
| lea r6, [ssq*3] |
| %if WIN64 |
| pop r7 |
| %endif |
| jmp wq |
| .h: |
| test myd, 0xf00 |
| jnz .hv |
| movifnidn ssq, r2mp |
| movifnidn hd, r4m |
| movddup m5, [base+prep_8tap_1d_rnd] |
| cmp wd, 4 |
| jne .h_w8 |
| movzx mxd, mxb |
| movq m0, [base+subpel_filters+mxq*8] |
| mova m3, [base+spel_h_shufA] |
| mova m4, [base+spel_h_shufB] |
| movifnidn tmpq, tmpmp |
| sub srcq, 2 |
| WIN64_SPILL_XMM 8 |
| punpcklbw m0, m0 |
| psraw m0, 8 |
| test dword r7m, 0x800 |
| jnz .h_w4_12bpc |
| psllw m0, 2 |
| .h_w4_12bpc: |
| pshufd m6, m0, q1111 |
| pshufd m7, m0, q2222 |
| .h_w4_loop: |
| movu m1, [srcq+ssq*0] |
| movu m2, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 |
| pshufb m1, m4 ; 2 3 3 4 4 5 5 6 |
| pmaddwd m0, m6 |
| pmaddwd m1, m7 |
| paddd m0, m5 |
| paddd m0, m1 |
| pshufb m1, m2, m3 |
| pshufb m2, m4 |
| pmaddwd m1, m6 |
| pmaddwd m2, m7 |
| paddd m1, m5 |
| paddd m1, m2 |
| psrad m0, 4 |
| psrad m1, 4 |
| packssdw m0, m1 |
| mova [tmpq], m0 |
| add tmpq, 16 |
| sub hd, 2 |
| jg .h_w4_loop |
| RET |
| .h_w8: |
| WIN64_SPILL_XMM 11 |
| shr mxd, 16 |
| movq m2, [base+subpel_filters+mxq*8] |
| mova m4, [base+spel_h_shufA] |
| mova m6, [base+spel_h_shufB] |
| movifnidn tmpq, r0mp |
| add wd, wd |
| punpcklbw m2, m2 |
| add srcq, wq |
| psraw m2, 8 |
| add tmpq, wq |
| neg wq |
| test dword r7m, 0x800 |
| jnz .h_w8_12bpc |
| psllw m2, 2 |
| .h_w8_12bpc: |
| pshufd m7, m2, q0000 |
| %if ARCH_X86_32 |
| ALLOC_STACK -16*3 |
| pshufd m0, m2, q1111 |
| pshufd m1, m2, q2222 |
| pshufd m2, m2, q3333 |
| mova m8, m0 |
| mova m9, m1 |
| mova m10, m2 |
| %else |
| pshufd m8, m2, q1111 |
| pshufd m9, m2, q2222 |
| pshufd m10, m2, q3333 |
| %endif |
| .h_w8_loop0: |
| mov r6, wq |
| .h_w8_loop: |
| movu m0, [srcq+r6- 6] |
| movu m1, [srcq+r6+ 2] |
| pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 |
| pshufb m0, m6 ; 2 3 3 4 4 5 5 6 |
| pmaddwd m2, m7 ; abcd0 |
| pmaddwd m0, m8 ; abcd1 |
| pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 |
| pshufb m1, m6 ; 6 7 7 8 8 9 9 a |
| paddd m2, m5 |
| paddd m0, m2 |
| pmaddwd m2, m9, m3 ; abcd2 |
| pmaddwd m3, m7 ; efgh0 |
| paddd m0, m2 |
| pmaddwd m2, m10, m1 ; abcd3 |
| pmaddwd m1, m8 ; efgh1 |
| paddd m0, m2 |
| movu m2, [srcq+r6+10] |
| paddd m3, m5 |
| paddd m1, m3 |
| pshufb m3, m2, m4 ; 8 9 9 a a b b c |
| pshufb m2, m6 ; a b b c c d d e |
| pmaddwd m3, m9 ; efgh2 |
| pmaddwd m2, m10 ; efgh3 |
| paddd m1, m3 |
| paddd m1, m2 |
| psrad m0, 4 |
| psrad m1, 4 |
| packssdw m0, m1 |
| mova [tmpq+r6], m0 |
| add r6, 16 |
| jl .h_w8_loop |
| add srcq, ssq |
| sub tmpq, wq |
| dec hd |
| jg .h_w8_loop0 |
| RET |
| .v: |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 4 |
| cmove myd, mxd |
| movq m3, [base+subpel_filters+myq*8] |
| %if STACK_ALIGNMENT < 16 |
| %xdefine rstk rsp |
| %else |
| %assign stack_offset stack_offset - stack_size_padded |
| %endif |
| WIN64_SPILL_XMM 15 |
| movddup m7, [base+prep_8tap_1d_rnd] |
| movifnidn ssq, r2mp |
| movifnidn tmpq, r0mp |
| punpcklbw m3, m3 |
| psraw m3, 8 ; sign-extend |
| test dword r7m, 0x800 |
| jnz .v_12bpc |
| psllw m3, 2 |
| .v_12bpc: |
| %if ARCH_X86_32 |
| ALLOC_STACK -16*7 |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| mova m8, m0 |
| mova m9, m1 |
| mova m10, m2 |
| mova m11, m3 |
| %else |
| pshufd m8, m3, q0000 |
| pshufd m9, m3, q1111 |
| pshufd m10, m3, q2222 |
| pshufd m11, m3, q3333 |
| %endif |
| lea r6, [ssq*3] |
| sub srcq, r6 |
| mov r6d, wd |
| shl wd, 6 |
| mov r5, srcq |
| %if ARCH_X86_64 |
| mov r7, tmpq |
| %elif STACK_ALIGNMENT < 16 |
| mov [esp+4*29], tmpq |
| %endif |
| lea wd, [wq+hq-(1<<8)] |
| .v_loop0: |
| movq m1, [srcq+ssq*0] |
| movq m2, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| movq m3, [srcq+ssq*0] |
| movq m4, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| movq m5, [srcq+ssq*0] |
| movq m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| movq m0, [srcq+ssq*0] |
| punpcklwd m1, m2 ; 01 |
| punpcklwd m2, m3 ; 12 |
| punpcklwd m3, m4 ; 23 |
| punpcklwd m4, m5 ; 34 |
| punpcklwd m5, m6 ; 45 |
| punpcklwd m6, m0 ; 56 |
| %if ARCH_X86_32 |
| jmp .v_loop_start |
| .v_loop: |
| mova m1, m12 |
| mova m2, m13 |
| mova m3, m14 |
| .v_loop_start: |
| pmaddwd m1, m8 ; a0 |
| pmaddwd m2, m8 ; b0 |
| mova m12, m3 |
| mova m13, m4 |
| pmaddwd m3, m9 ; a1 |
| pmaddwd m4, m9 ; b1 |
| paddd m1, m3 |
| paddd m2, m4 |
| mova m14, m5 |
| mova m4, m6 |
| pmaddwd m5, m10 ; a2 |
| pmaddwd m6, m10 ; b2 |
| paddd m1, m5 |
| paddd m2, m6 |
| movq m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpcklwd m5, m0, m6 ; 67 |
| movq m0, [srcq+ssq*0] |
| pmaddwd m3, m11, m5 ; a3 |
| punpcklwd m6, m0 ; 78 |
| paddd m1, m7 |
| paddd m1, m3 |
| pmaddwd m3, m11, m6 ; b3 |
| paddd m2, m7 |
| paddd m2, m3 |
| psrad m1, 4 |
| psrad m2, 4 |
| packssdw m1, m2 |
| movq [tmpq+r6*0], m1 |
| movhps [tmpq+r6*2], m1 |
| lea tmpq, [tmpq+r6*4] |
| sub hd, 2 |
| jg .v_loop |
| %if STACK_ALIGNMENT < 16 |
| mov tmpq, [esp+4*29] |
| add r5, 8 |
| add tmpq, 8 |
| mov srcq, r5 |
| mov [esp+4*29], tmpq |
| %else |
| mov tmpq, tmpmp |
| add r5, 8 |
| add tmpq, 8 |
| mov srcq, r5 |
| mov tmpmp, tmpq |
| %endif |
| %else |
| .v_loop: |
| pmaddwd m12, m8, m1 ; a0 |
| pmaddwd m13, m8, m2 ; b0 |
| mova m1, m3 |
| mova m2, m4 |
| pmaddwd m3, m9 ; a1 |
| pmaddwd m4, m9 ; b1 |
| paddd m12, m3 |
| paddd m13, m4 |
| mova m3, m5 |
| mova m4, m6 |
| pmaddwd m5, m10 ; a2 |
| pmaddwd m6, m10 ; b2 |
| paddd m12, m5 |
| paddd m13, m6 |
| movq m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| punpcklwd m5, m0, m6 ; 67 |
| movq m0, [srcq+ssq*0] |
| pmaddwd m14, m11, m5 ; a3 |
| punpcklwd m6, m0 ; 78 |
| paddd m12, m7 |
| paddd m12, m14 |
| pmaddwd m14, m11, m6 ; b3 |
| paddd m13, m7 |
| paddd m13, m14 |
| psrad m12, 4 |
| psrad m13, 4 |
| packssdw m12, m13 |
| movq [tmpq+r6*0], m12 |
| movhps [tmpq+r6*2], m12 |
| lea tmpq, [tmpq+r6*4] |
| sub hd, 2 |
| jg .v_loop |
| add r5, 8 |
| add r7, 8 |
| mov srcq, r5 |
| mov tmpq, r7 |
| %endif |
| movzx hd, wb |
| sub wd, 1<<8 |
| jg .v_loop0 |
| RET |
| .hv: |
| %if STACK_ALIGNMENT < 16 |
| %xdefine rstk rsp |
| %else |
| %assign stack_offset stack_offset - stack_size_padded |
| %endif |
| movzx t3d, mxb |
| shr mxd, 16 |
| cmp wd, 4 |
| cmove mxd, t3d |
| movifnidn hd, r4m |
| movq m2, [base+subpel_filters+mxq*8] |
| movzx mxd, myb |
| shr myd, 16 |
| cmp hd, 4 |
| cmove myd, mxd |
| movq m3, [base+subpel_filters+myq*8] |
| %if ARCH_X86_32 |
| mov ssq, r2mp |
| mov tmpq, r0mp |
| mova m0, [base+spel_h_shufA] |
| mova m1, [base+spel_h_shufB] |
| mova m4, [base+prep_8tap_2d_rnd] |
| ALLOC_STACK -16*14 |
| mova m8, m0 |
| mova m9, m1 |
| mova m14, m4 |
| %else |
| %if WIN64 |
| ALLOC_STACK 16*6, 16 |
| %endif |
| mova m8, [base+spel_h_shufA] |
| mova m9, [base+spel_h_shufB] |
| %endif |
| pxor m0, m0 |
| punpcklbw m0, m2 |
| punpcklbw m3, m3 |
| psraw m0, 4 |
| psraw m3, 8 |
| test dword r7m, 0x800 |
| jz .hv_10bpc |
| psraw m0, 2 |
| .hv_10bpc: |
| lea r6, [ssq*3] |
| sub srcq, 6 |
| sub srcq, r6 |
| mov r6d, wd |
| shl wd, 6 |
| mov r5, srcq |
| %if ARCH_X86_32 |
| %define tmp esp+16*8 |
| %if STACK_ALIGNMENT < 16 |
| mov [esp+4*61], tmpq |
| %endif |
| pshufd m1, m0, q0000 |
| pshufd m2, m0, q1111 |
| pshufd m5, m0, q2222 |
| pshufd m0, m0, q3333 |
| mova m10, m1 |
| mova m11, m2 |
| mova m12, m5 |
| mova m13, m0 |
| %else |
| %if WIN64 |
| %define tmp rsp |
| %else |
| %define tmp rsp-88 ; red zone |
| %endif |
| mov r7, tmpq |
| pshufd m10, m0, q0000 |
| pshufd m11, m0, q1111 |
| pshufd m12, m0, q2222 |
| pshufd m13, m0, q3333 |
| %endif |
| lea wd, [wq+hq-(1<<8)] |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| mova [tmp+16*1], m0 |
| mova [tmp+16*2], m1 |
| mova [tmp+16*3], m2 |
| mova [tmp+16*4], m3 |
| .hv_loop0: |
| %if ARCH_X86_64 |
| mova m14, [prep_8tap_2d_rnd] |
| %endif |
| movu m4, [srcq+ssq*0+0] |
| movu m1, [srcq+ssq*0+8] |
| movu m5, [srcq+ssq*1+0] |
| movu m2, [srcq+ssq*1+8] |
| lea srcq, [srcq+ssq*2] |
| movu m6, [srcq+ssq*0+0] |
| movu m3, [srcq+ssq*0+8] |
| PUT_8TAP_HV_H 4, 1, 0, 6 |
| PUT_8TAP_HV_H 5, 2, 0, 6 |
| PUT_8TAP_HV_H 6, 3, 0, 6 |
| movu m7, [srcq+ssq*1+0] |
| movu m2, [srcq+ssq*1+8] |
| lea srcq, [srcq+ssq*2] |
| movu m1, [srcq+ssq*0+0] |
| movu m3, [srcq+ssq*0+8] |
| PUT_8TAP_HV_H 7, 2, 0, 6 |
| PUT_8TAP_HV_H 1, 3, 0, 6 |
| movu m2, [srcq+ssq*1+0] |
| movu m3, [srcq+ssq*1+8] |
| lea srcq, [srcq+ssq*2] |
| PUT_8TAP_HV_H 2, 3, 0, 6 |
| packssdw m4, m7 ; 0 3 |
| packssdw m5, m1 ; 1 4 |
| movu m0, [srcq+ssq*0+0] |
| movu m1, [srcq+ssq*0+8] |
| PUT_8TAP_HV_H 0, 1, 3, 6 |
| packssdw m6, m2 ; 2 5 |
| packssdw m7, m0 ; 3 6 |
| punpcklwd m1, m4, m5 ; 01 |
| punpckhwd m4, m5 ; 34 |
| punpcklwd m2, m5, m6 ; 12 |
| punpckhwd m5, m6 ; 45 |
| punpcklwd m3, m6, m7 ; 23 |
| punpckhwd m6, m7 ; 56 |
| %if ARCH_X86_32 |
| jmp .hv_loop_start |
| .hv_loop: |
| mova m1, [tmp+16*5] |
| mova m2, m15 |
| .hv_loop_start: |
| mova m7, [tmp+16*1] |
| pmaddwd m1, m7 ; a0 |
| pmaddwd m2, m7 ; b0 |
| mova m7, [tmp+16*2] |
| mova [tmp+16*5], m3 |
| pmaddwd m3, m7 ; a1 |
| mova m15, m4 |
| pmaddwd m4, m7 ; b1 |
| mova m7, [tmp+16*3] |
| paddd m1, m14 |
| paddd m2, m14 |
| paddd m1, m3 |
| paddd m2, m4 |
| mova m3, m5 |
| pmaddwd m5, m7 ; a2 |
| mova m4, m6 |
| pmaddwd m6, m7 ; b2 |
| paddd m1, m5 |
| paddd m2, m6 |
| movu m7, [srcq+ssq*1+0] |
| movu m5, [srcq+ssq*1+8] |
| lea srcq, [srcq+ssq*2] |
| PUT_8TAP_HV_H 7, 5, 6, 6 |
| packssdw m0, m7 ; 6 7 |
| mova [tmp+16*0], m0 |
| movu m0, [srcq+ssq*0+0] |
| movu m5, [srcq+ssq*0+8] |
| PUT_8TAP_HV_H 0, 5, 6, 6 |
| mova m6, [tmp+16*0] |
| packssdw m7, m0 ; 7 8 |
| punpcklwd m5, m6, m7 ; 67 |
| punpckhwd m6, m7 ; 78 |
| pmaddwd m7, m5, [tmp+16*4] |
| paddd m1, m7 ; a3 |
| pmaddwd m7, m6, [tmp+16*4] |
| paddd m2, m7 ; b3 |
| psrad m1, 6 |
| psrad m2, 6 |
| packssdw m1, m2 |
| movq [tmpq+r6*0], m1 |
| movhps [tmpq+r6*2], m1 |
| lea tmpq, [tmpq+r6*4] |
| sub hd, 2 |
| jg .hv_loop |
| %if STACK_ALIGNMENT < 16 |
| mov tmpq, [esp+4*61] |
| add r5, 8 |
| add tmpq, 8 |
| mov srcq, r5 |
| mov [esp+4*61], tmpq |
| %else |
| mov tmpq, tmpmp |
| add r5, 8 |
| add tmpq, 8 |
| mov srcq, r5 |
| mov tmpmp, tmpq |
| %endif |
| %else |
| .hv_loop: |
| mova m15, [tmp+16*1] |
| mova m7, [prep_8tap_2d_rnd] |
| pmaddwd m14, m15, m1 ; a0 |
| pmaddwd m15, m2 ; b0 |
| paddd m14, m7 |
| paddd m15, m7 |
| mova m7, [tmp+16*2] |
| mova m1, m3 |
| pmaddwd m3, m7 ; a1 |
| mova m2, m4 |
| pmaddwd m4, m7 ; b1 |
| mova m7, [tmp+16*3] |
| paddd m14, m3 |
| paddd m15, m4 |
| mova m3, m5 |
| pmaddwd m5, m7 ; a2 |
| mova m4, m6 |
| pmaddwd m6, m7 ; b2 |
| paddd m14, m5 |
| paddd m15, m6 |
| movu m7, [srcq+ssq*1+0] |
| movu m5, [srcq+ssq*1+8] |
| lea srcq, [srcq+ssq*2] |
| PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd] |
| packssdw m0, m7 ; 6 7 |
| mova [tmp+16*0], m0 |
| movu m0, [srcq+ssq*0+0] |
| movu m5, [srcq+ssq*0+8] |
| PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd] |
| mova m6, [tmp+16*0] |
| packssdw m7, m0 ; 7 8 |
| punpcklwd m5, m6, m7 ; 67 |
| punpckhwd m6, m7 ; 78 |
| pmaddwd m7, m5, [tmp+16*4] |
| paddd m14, m7 ; a3 |
| pmaddwd m7, m6, [tmp+16*4] |
| paddd m15, m7 ; b3 |
| psrad m14, 6 |
| psrad m15, 6 |
| packssdw m14, m15 |
| movq [tmpq+r6*0], m14 |
| movhps [tmpq+r6*2], m14 |
| lea tmpq, [tmpq+r6*4] |
| sub hd, 2 |
| jg .hv_loop |
| add r5, 8 |
| add r7, 8 |
| mov srcq, r5 |
| mov tmpq, r7 |
| %endif |
| movzx hd, wb |
| sub wd, 1<<8 |
| jg .hv_loop0 |
| RET |
| %undef tmp |
| |
| %macro movifprep 2 |
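; copy %2 into %1 only when assembling the prep variant; no-op for put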
| %if isprep |
| mov %1, %2 |
| %endif |
| %endmacro |
| |
| %macro SAVE_REG 1 |
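; stash the current r%1/r%1q/r%1d aliases (plus an argument-slot r%1m alias
; on x86-32) so they can be restored later with LOAD_REG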
| %xdefine r%1_save r%1 |
| %xdefine r%1q_save r%1q |
| %xdefine r%1d_save r%1d |
| %if ARCH_X86_32 |
| %define r%1m_save [rstk+stack_offset+(%1+1)*4] |
| %endif |
| %endmacro |
| |
| %macro LOAD_REG 1 |
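; restore the aliases stashed by SAVE_REG and drop the saved copies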
| %xdefine r%1 r%1_save |
| %xdefine r%1q r%1q_save |
| %xdefine r%1d r%1d_save |
| %if ARCH_X86_32 |
| %define r%1m r%1m_save |
| %endif |
| %undef r%1d_save |
| %undef r%1q_save |
| %undef r%1_save |
| %endmacro |
| |
| %macro REMAP_REG 2-3 |
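; make the r%1 aliases refer to r%2; on x86-32, %3 selects whether r%1m
; follows r%2m (0) or is redirected to argument %1's stack slot (nonzero)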
| %xdefine r%1 r%2 |
| %xdefine r%1q r%2q |
| %xdefine r%1d r%2d |
| %if ARCH_X86_32 |
| %if %3 == 0 |
| %xdefine r%1m r%2m |
| %else |
| %define r%1m [rstk+stack_offset+(%1+1)*4] |
| %endif |
| %endif |
| %endmacro |
| |
| %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 |
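; prep only: shift every register alias down by one (rN -> rN-1) so that
; code shared with put, whose argument list is one pointer longer, resolves
; raw rN references to the intended physical registers in both variants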
| %if isprep |
| %if ARCH_X86_64 |
| SAVE_REG 14 |
| %assign %%i 14 |
| %rep 14 |
| %assign %%j %%i-1 |
| REMAP_REG %%i, %%j |
| %assign %%i %%i-1 |
| %endrep |
| %else |
| SAVE_REG 5 |
| %assign %%i 5 |
| %rep 5 |
| %assign %%j %%i-1 |
| REMAP_REG %%i, %%j, 0 |
| %assign %%i %%i-1 |
| %endrep |
| %endif |
| %endif |
| %endmacro |
| |
| %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 |
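; undo MCT_8TAP_SCALED_REMAP_REGS_TO_PREV: shift the aliases back up and
; restore the saved top register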
| %if isprep |
| %assign %%i 1 |
| %if ARCH_X86_64 |
| %rep 13 |
| %assign %%j %%i+1 |
| REMAP_REG %%i, %%j |
| %assign %%i %%i+1 |
| %endrep |
| LOAD_REG 14 |
| %else |
| %rep 4 |
| %assign %%j %%i+1 |
| REMAP_REG %%i, %%j, 1 |
| %assign %%i %%i+1 |
| %endrep |
| LOAD_REG 5 |
| %endif |
| %endif |
| %endmacro |
| |
| %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged |
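; return with the default register mapping in place; when %1 is nonzero the
; prep remapping is re-applied afterwards so the assemble-time mapping stays
; consistent for the code that follows the RET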
| MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT |
| RET |
| %if %1 |
| MCT_8TAP_SCALED_REMAP_REGS_TO_PREV |
| %endif |
| %endmacro |
| |
| %if ARCH_X86_32 |
| %macro MC_4TAP_SCALED_H 1 ; dst_mem |
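; filter two rows horizontally with the 4-tap kernels in m13/m15: the first
; two output pixels are gathered via srcq, the last two via r4, then the
; rounded and shifted result is stored at [stk+%1]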
| movu m7, [srcq+ssq*0] |
| movu m2, [srcq+ssq*1] |
| movu m5, [r4 +ssq*0] |
| movu m6, [r4 +ssq*1] |
| lea srcq, [srcq+ssq*2] |
| lea r4, [r4 +ssq*2] |
| REPX {pshufb x, m12}, m7, m2 |
| REPX {pmaddwd x, m13}, m7, m2 |
| REPX {pshufb x, m14}, m5, m6 |
| REPX {pmaddwd x, m15}, m5, m6 |
| phaddd m7, m5 |
| phaddd m2, m6 |
| mova m5, [esp+0x00] |
| movd m6, [esp+0x10] |
| paddd m7, m5 |
| paddd m2, m5 |
| psrad m7, m6 |
| psrad m2, m6 |
| packssdw m7, m2 |
| mova [stk+%1], m7 |
| %endmacro |
| %endif |
| |
| %if ARCH_X86_64 |
| %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] |
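; horizontally filter one row of 8 output pixels, each gathered from its own
; source offset (r4/r6/r7/r9/r10/r11/r13/rX) with per-pixel 8-tap
; coefficients from [stk+0x10..0x80]; the result is rounded, shifted and
; packed into m%1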
| movu m%1, [srcq+ r4*2] |
| movu m%2, [srcq+ r6*2] |
| movu m%3, [srcq+ r7*2] |
| movu m%4, [srcq+ r9*2] |
| movu m%5, [srcq+r10*2] |
| movu m%6, [srcq+r11*2] |
| movu m%7, [srcq+r13*2] |
| movu m%8, [srcq+ rX*2] |
| add srcq, ssq |
| pmaddwd m%1, [stk+0x10] |
| pmaddwd m%2, [stk+0x20] |
| pmaddwd m%3, [stk+0x30] |
| pmaddwd m%4, [stk+0x40] |
| pmaddwd m%5, [stk+0x50] |
| pmaddwd m%6, [stk+0x60] |
| pmaddwd m%7, [stk+0x70] |
| pmaddwd m%8, [stk+0x80] |
| phaddd m%1, m%2 |
| phaddd m%3, m%4 |
| phaddd m%5, m%6 |
| phaddd m%7, m%8 |
| phaddd m%1, m%3 |
| phaddd m%5, m%7 |
| paddd m%1, hround |
| paddd m%5, hround |
| psrad m%1, m12 |
| psrad m%5, m12 |
| packssdw m%1, m%5 |
| %endmacro |
| %else |
| %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets |
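; x86-32 version of the above: the pixel offsets live at [stk+0..28] and the
; per-pixel coefficients at [stk+%1+...]; the packed row is optionally
; stored at [stk+%2]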
| %if %3 == 1 |
| mov r0, [stk+ 0] |
| mov rX, [stk+ 4] |
| mov r4, [stk+ 8] |
| mov r5, [stk+12] |
| %endif |
| movu m0, [srcq+r0*2] |
| movu m1, [srcq+rX*2] |
| movu m2, [srcq+r4*2] |
| movu m3, [srcq+r5*2] |
| mov r0, [stk+16] |
| mov rX, [stk+20] |
| mov r4, [stk+24] |
| mov r5, [stk+28] |
| pmaddwd m0, [stk+%1+0x00] |
| pmaddwd m1, [stk+%1+0x10] |
| pmaddwd m2, [stk+%1+0x20] |
| pmaddwd m3, [stk+%1+0x30] |
| phaddd m0, m1 |
| phaddd m2, m3 |
| movu m4, [srcq+r0*2] |
| movu m5, [srcq+rX*2] |
| movu m6, [srcq+r4*2] |
| movu m7, [srcq+r5*2] |
| add srcq, ssq |
| pmaddwd m4, [stk+%1+0xa0] |
| pmaddwd m5, [stk+%1+0xb0] |
| pmaddwd m6, [stk+%1+0xc0] |
| pmaddwd m7, [stk+%1+0xd0] |
| phaddd m4, m5 |
| phaddd m6, m7 |
| phaddd m0, m2 |
| phaddd m4, m6 |
| paddd m0, hround |
| paddd m4, hround |
| psrad m0, m12 |
| psrad m4, m12 |
| packssdw m0, m4 |
| %if %2 != 0 |
| mova [stk+%2], m0 |
| %endif |
| %endmacro |
| %endif |
| |
| %macro MC_8TAP_SCALED 1 |
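; instantiated once with "put" and once with "prep": emits
; %1_8tap_scaled_16bpc with per-width code paths plus .dy1/.dy2 fast paths
; for vertical steps of exactly one (dy == 1024) and two (dy == 2048) rows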
| %ifidn %1, put |
| %assign isput 1 |
| %assign isprep 0 |
| %if ARCH_X86_64 |
| %if required_stack_alignment <= STACK_ALIGNMENT |
| cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax |
| %else |
| cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax |
| %endif |
| %else ; ARCH_X86_32 |
| %if required_stack_alignment <= STACK_ALIGNMENT |
| cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax |
| %else |
| cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax |
| %endif |
| %endif |
| %xdefine base_reg r12 |
| %else ; prep |
| %assign isput 0 |
| %assign isprep 1 |
| %if ARCH_X86_64 |
| %if required_stack_alignment <= STACK_ALIGNMENT |
| cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax |
| %xdefine tmp_stridem r14q |
| %else |
| cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax |
| %define tmp_stridem qword [stk+0x138] |
| %endif |
| %xdefine base_reg r11 |
| %else ; ARCH_X86_32 |
| %if required_stack_alignment <= STACK_ALIGNMENT |
| cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax |
| %else |
| cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax |
| %endif |
| %define tmp_stridem dword [stk+0x138] |
| %endif |
| %endif |
| %if ARCH_X86_32 |
| mov [esp+0x1f0], t0d |
| mov [esp+0x1f4], t1d |
| %if isput && required_stack_alignment > STACK_ALIGNMENT |
| mov dstd, dstm |
| mov dsd, dsm |
| mov srcd, srcm |
| mov ssd, ssm |
| mov hd, hm |
| mov r4, mxm |
| %define r0m [esp+0x200] |
| %define dsm [esp+0x204] |
| %define dsmp dsm |
| %define r1m dsm |
| %define r2m [esp+0x208] |
| %define ssm [esp+0x20c] |
| %define r3m ssm |
| %define hm [esp+0x210] |
| %define mxm [esp+0x214] |
| mov r0m, dstd |
| mov dsm, dsd |
| mov r2m, srcd |
| mov ssm, ssd |
| mov hm, hd |
| mov r0, mym |
| mov r1, dxm |
| mov r2, dym |
| %define mym [esp+0x218] |
| %define dxm [esp+0x21c] |
| %define dym [esp+0x220] |
| mov mxm, r4 |
| mov mym, r0 |
| mov dxm, r1 |
| mov dym, r2 |
| tzcnt wd, wm |
| %endif |
| %if isput |
| mov r3, pxmaxm |
| %define pxmaxm r3 |
| %else |
| mov r2, pxmaxm |
| %endif |
| %if isprep && required_stack_alignment > STACK_ALIGNMENT |
| %xdefine base_reg r5 |
| %else |
| %xdefine base_reg r6 |
| %endif |
| %endif |
| LEA base_reg, %1_8tap_scaled_16bpc_ssse3 |
| %xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3 |
| %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT |
| tzcnt wd, wm |
| %endif |
| %if ARCH_X86_64 |
| %if isput |
| mov r7d, pxmaxm |
| %endif |
| %else |
| %define m8 m0 |
| %define m9 m1 |
| %define m14 m4 |
| %define m15 m3 |
| %endif |
| movd m8, dxm |
| movd m14, mxm |
| %if isput |
| movd m15, pxmaxm |
| %endif |
| pshufd m8, m8, q0000 |
| pshufd m14, m14, q0000 |
| %if isput |
| pshuflw m15, m15, q0000 |
| punpcklqdq m15, m15 |
| %endif |
| %if isprep |
| %if UNIX64 |
| mov r5d, t0d |
| DECLARE_REG_TMP 5, 7 |
| %endif |
| %if ARCH_X86_64 |
| mov r6d, pxmaxm |
| %endif |
| %endif |
| %if ARCH_X86_64 |
| mov dyd, dym |
| %endif |
| %if isput |
| %if WIN64 |
| mov r8d, hm |
| DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 |
| %define hm r5m |
| %define dxm r8m |
| %elif ARCH_X86_64 |
| DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 |
| %define hm r6m |
| %else |
| %endif |
| %if ARCH_X86_64 |
| %if required_stack_alignment > STACK_ALIGNMENT |
| %define dsm [rsp+0x138] |
| %define rX r1 |
| %define rXd r1d |
| %else |
| %define dsm dsq |
| %define rX r14 |
| %define rXd r14d |
| %endif |
| %else |
| %define rX r1 |
| %endif |
| %else ; prep |
| %if WIN64 |
| mov r7d, hm |
| DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 |
| %define hm r4m |
| %define dxm r7m |
| %elif ARCH_X86_64 |
| DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 |
| %xdefine hm r7m |
| %endif |
| MCT_8TAP_SCALED_REMAP_REGS_TO_PREV |
| %if ARCH_X86_64 |
| %define rX r14 |
| %define rXd r14d |
| %else |
| %define rX r3 |
| %endif |
| %endif |
| %if ARCH_X86_64 |
| shr r7d, 11 |
| mova m10, [base+pd_0x3ff] |
| movddup m11, [base+s_8tap_h_rnd+r7*8] |
| movd m12, [base+s_8tap_h_sh+r7*4] |
| %if isput |
| movddup m13, [base+put_s_8tap_v_rnd+r7*8] |
| movd m7, [base+put_s_8tap_v_sh+r7*4] |
| %define pxmaxm [rsp] |
| mova pxmaxm, m15 |
| punpcklqdq m12, m7 |
| %endif |
| lea ss3q, [ssq*3] |
| movzx r7d, t1b |
| shr t1d, 16 |
| cmp hd, 6 |
| cmovs t1d, r7d |
| sub srcq, ss3q |
| %else |
| %define m10 [base+pd_0x3ff] |
| %define m11 [esp+0x00] |
| %define m12 [esp+0x10] |
| shr r3, 11 |
| movddup m1, [base+s_8tap_h_rnd+r3*8] |
| movd m2, [base+s_8tap_h_sh+r3*4] |
| %if isput |
| %define m13 [esp+0x20] |
| %define pxmaxm [esp+0x30] |
| %define stk esp+0x40 |
| movddup m5, [base+put_s_8tap_v_rnd+r3*8] |
| movd m6, [base+put_s_8tap_v_sh+r3*4] |
| mova pxmaxm, m15 |
| punpcklqdq m2, m6 |
| mova m13, m5 |
| %else |
| %define m13 [base+pd_m524256] |
| %endif |
| mov ssd, ssm |
| mova m11, m1 |
| mova m12, m2 |
| MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT |
| mov r1, [esp+0x1f4] |
| lea r0, [ssd*3] |
| movzx r2, r1b |
| shr r1, 16 |
| cmp dword hm, 6 |
| cmovs r1, r2 |
| mov [esp+0x1f4], r1 |
| %if isprep |
| mov r1, r1m |
| %endif |
| mov r2, r2m |
| sub srcq, r0 |
| MCT_8TAP_SCALED_REMAP_REGS_TO_PREV |
| %define ss3q r0 |
| %define myd r4 |
| %define dyd dword dym |
| %define hd dword hm |
| %endif |
| cmp dyd, 1024 |
| je .dy1 |
| cmp dyd, 2048 |
| je .dy2 |
| movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] |
| add wq, base_reg |
| jmp wq |
| %if isput |
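; w == 2 (put only): both output pixels get their own horizontal filter
; phase from mx+dx*[0-1]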
| .w2: |
| %if ARCH_X86_64 |
| mov myd, mym |
| movzx t0d, t0b |
| sub srcq, 2 |
| movd m15, t0d |
| %else |
| movzx r4, byte [esp+0x1f0] |
| sub srcq, 2 |
| movd m15, r4 |
| %endif |
| pxor m9, m9 |
| punpckldq m9, m8 |
| paddd m14, m9 ; mx+dx*[0-1] |
| %if ARCH_X86_64 |
| mova m9, [base+pd_0x4000] |
| %endif |
| pshufd m15, m15, q0000 |
| pand m8, m14, m10 |
| psrld m8, 6 |
| paddd m15, m8 |
| movd r4d, m15 |
| pshufd m15, m15, q0321 |
| %if ARCH_X86_64 |
| movd r6d, m15 |
| %else |
| movd r3d, m15 |
| %endif |
| mova m5, [base+bdct_lb_q] |
| mova m6, [base+spel_s_shuf2] |
| movd m15, [base+subpel_filters+r4*8+2] |
| %if ARCH_X86_64 |
| movd m7, [base+subpel_filters+r6*8+2] |
| %else |
| movd m7, [base+subpel_filters+r3*8+2] |
| %endif |
| pxor m2, m2 |
| pcmpeqd m8, m2 |
| psrld m14, 10 |
| paddd m14, m14 |
| %if ARCH_X86_32 |
| mov r3, r3m |
| pshufb m14, m5 |
| paddb m14, m6 |
| mova [stk], m14 |
| SWAP m5, m0 |
| SWAP m6, m3 |
| %define m15 m6 |
| %endif |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*1] |
| movu m2, [srcq+ssq*2] |
| movu m3, [srcq+ss3q ] |
| lea srcq, [srcq+ssq*4] |
| punpckldq m15, m7 |
| %if ARCH_X86_64 |
| pshufb m14, m5 |
| paddb m14, m6 |
| pand m9, m8 |
| pandn m8, m15 |
| SWAP m15, m8 |
| por m15, m9 |
| movu m4, [srcq+ssq*0] |
| movu m5, [srcq+ssq*1] |
| movu m6, [srcq+ssq*2] |
| movu m7, [srcq+ss3q ] |
| lea srcq, [srcq+ssq*4] |
| %else |
| pand m7, m5, [base+pd_0x4000] |
| pandn m5, m15 |
| por m5, m7 |
| %define m15 m5 |
| %endif |
| punpcklbw m15, m15 |
| psraw m15, 8 |
| REPX {pshufb x, m14}, m0, m1, m2, m3 |
| REPX {pmaddwd x, m15}, m0, m1, m2, m3 |
| %if ARCH_X86_64 |
| REPX {pshufb x, m14}, m4, m5, m6, m7 |
| REPX {pmaddwd x, m15}, m4, m5, m6, m7 |
| phaddd m0, m1 |
| phaddd m2, m3 |
| phaddd m4, m5 |
| phaddd m6, m7 |
| REPX {paddd x, m11}, m0, m2, m4, m6 |
| REPX {psrad x, m12}, m0, m2, m4, m6 |
| packssdw m0, m2 ; 0 1 2 3 |
| packssdw m4, m6 ; 4 5 6 7 |
| SWAP m1, m4 |
| %else |
| mova [stk+0x10], m15 |
| phaddd m0, m1 |
| phaddd m2, m3 |
| movu m1, [srcq+ssq*0] |
| movu m7, [srcq+ssq*1] |
| movu m6, [srcq+ssq*2] |
| movu m3, [srcq+ss3q ] |
| lea srcq, [srcq+ssq*4] |
| REPX {pshufb x, m14}, m1, m7, m6, m3 |
| REPX {pmaddwd x, m15}, m1, m7, m6, m3 |
| phaddd m1, m7 |
| phaddd m6, m3 |
| REPX {paddd x, m11}, m0, m2, m1, m6 |
| REPX {psrad x, m12}, m0, m2, m1, m6 |
| packssdw m0, m2 |
| packssdw m1, m6 |
| %define m14 [stk+0x00] |
| %define m15 [stk+0x10] |
| %endif |
| palignr m2, m1, m0, 4 ; 1 2 3 4 |
| punpcklwd m3, m0, m2 ; 01 12 |
| punpckhwd m0, m2 ; 23 34 |
| pshufd m5, m1, q0321 ; 5 6 7 _ |
| punpcklwd m2, m1, m5 ; 45 56 |
| punpckhwd m4, m1, m5 ; 67 __ |
| %if ARCH_X86_32 |
| mov myd, mym |
| mov r0, r0m |
| mova [stk+0x20], m3 |
| mova [stk+0x30], m0 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m4 |
| %endif |
| .w2_loop: |
| and myd, 0x3ff |
| %if ARCH_X86_64 |
| mov r6d, 64 << 24 |
| mov r4d, myd |
| shr r4d, 6 |
| lea r4d, [t1+r4] |
| cmovnz r6q, [base+subpel_filters+r4*8] |
| movq m10, r6q |
| punpcklbw m10, m10 |
| psraw m10, 8 |
| pshufd m7, m10, q0000 |
| pshufd m8, m10, q1111 |
| pmaddwd m5, m3, m7 |
| pmaddwd m6, m0, m8 |
| pshufd m9, m10, q2222 |
| pshufd m10, m10, q3333 |
| pmaddwd m7, m2, m9 |
| pmaddwd m8, m4, m10 |
| paddd m5, m6 |
| paddd m7, m8 |
| %else |
| mov r1, [esp+0x1f4] |
| xor r3, r3 |
| mov r5, myd |
| shr r5, 6 |
| lea r1, [r1+r5] |
| mov r5, 64 << 24 |
| cmovnz r3, [base+subpel_filters+r1*8+4] |
| cmovnz r5, [base+subpel_filters+r1*8+0] |
| movd m6, r3 |
| movd m7, r5 |
| punpckldq m7, m6 |
| punpcklbw m7, m7 |
| psraw m7, 8 |
| pshufd m5, m7, q0000 |
| pshufd m6, m7, q1111 |
| pmaddwd m3, m5 |
| pmaddwd m0, m6 |
| pshufd m5, m7, q2222 |
| pshufd m7, m7, q3333 |
| pmaddwd m2, m5 |
| pmaddwd m4, m7 |
| paddd m3, m0 |
| paddd m2, m4 |
| SWAP m5, m3 |
| SWAP m7, m2 |
| %define m8 m3 |
| %endif |
| paddd m5, m13 |
| pshufd m6, m12, q1032 |
| pxor m8, m8 |
| paddd m5, m7 |
| psrad m5, m6 |
| packssdw m5, m5 |
| pmaxsw m5, m8 |
| pminsw m5, pxmaxm |
| movd [dstq], m5 |
| add dstq, dsmp |
| dec hd |
| jz .ret |
| %if ARCH_X86_64 |
| add myd, dyd |
| %else |
| add myd, dym |
| %endif |
| test myd, ~0x3ff |
| %if ARCH_X86_32 |
| SWAP m3, m5 |
| SWAP m2, m7 |
| mova m3, [stk+0x20] |
| mova m0, [stk+0x30] |
| mova m2, [stk+0x40] |
| mova m4, [stk+0x50] |
| %endif |
| jz .w2_loop |
| %if ARCH_X86_32 |
| mov r3, r3m |
| %endif |
| movu m5, [srcq] |
| test myd, 0x400 |
| jz .w2_skip_line |
| add srcq, ssq |
| shufps m3, m0, q1032 ; 01 12 |
| shufps m0, m2, q1032 ; 23 34 |
| shufps m2, m4, q1032 ; 45 56 |
| pshufb m5, m14 |
| pmaddwd m5, m15 |
| phaddd m5, m5 |
| paddd m5, m11 |
| psrad m5, m12 |
| packssdw m5, m5 |
| palignr m4, m5, m1, 12 |
| punpcklqdq m1, m4, m4 ; 6 7 6 7 |
| punpcklwd m4, m1, m5 ; 67 __ |
| %if ARCH_X86_32 |
| mova [stk+0x20], m3 |
| mova [stk+0x30], m0 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m4 |
| %endif |
| jmp .w2_loop |
| .w2_skip_line: |
| movu m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| mova m3, m0 ; 01 12 |
| mova m0, m2 ; 23 34 |
| pshufb m5, m14 |
| pshufb m6, m14 |
| pmaddwd m5, m15 |
| pmaddwd m6, m15 |
| phaddd m5, m6 |
| paddd m5, m11 |
| psrad m5, m12 |
| packssdw m5, m5 ; 6 7 6 7 |
| punpckhqdq m1, m5 ; 4 5 6 7 |
| pshufd m5, m1, q0321 ; 5 6 7 _ |
| punpcklwd m2, m1, m5 ; 45 56 |
| punpckhwd m4, m1, m5 ; 67 __ |
| %if ARCH_X86_32 |
| mova [stk+0x20], m3 |
| mova [stk+0x30], m0 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m4 |
| %endif |
| jmp .w2_loop |
| %endif |
| INIT_XMM ssse3 |
| .w4: |
| %if ARCH_X86_64 |
| mov myd, mym |
| mova [rsp+0x10], m11 |
| mova [rsp+0x20], m12 |
| %if isput |
| mova [rsp+0x30], m13 |
| %endif |
| movzx t0d, t0b |
| sub srcq, 2 |
| movd m15, t0d |
| %else |
| %define m8 m0 |
| %xdefine m14 m4 |
| %define m15 m3 |
| movzx r4, byte [esp+0x1f0] |
| sub srcq, 2 |
| movd m15, r4 |
| %endif |
| pmaddwd m8, [base+rescale_mul] |
| %if ARCH_X86_64 |
| mova m9, [base+pd_0x4000] |
| %else |
| %define m9 [base+pd_0x4000] |
| %endif |
| pshufd m15, m15, q0000 |
| paddd m14, m8 ; mx+dx*[0-3] |
| pand m0, m14, m10 |
| psrld m0, 6 |
| paddd m15, m0 |
| pshufd m7, m15, q1032 |
| %if ARCH_X86_64 |
| movd r4d, m15 |
| movd r11d, m7 |
| pshufd m15, m15, q0321 |
| pshufd m7, m7, q0321 |
| movd r6d, m15 |
| movd r13d, m7 |
| mova m10, [base+bdct_lb_q+ 0] |
| mova m11, [base+bdct_lb_q+16] |
| movd m13, [base+subpel_filters+ r4*8+2] |
| movd m2, [base+subpel_filters+ r6*8+2] |
| movd m15, [base+subpel_filters+r11*8+2] |
| movd m4, [base+subpel_filters+r13*8+2] |
| %else |
| movd r0, m15 |
| movd r4, m7 |
| pshufd m15, m15, q0321 |
| pshufd m7, m7, q0321 |
| movd rX, m15 |
| movd r5, m7 |
| mova m5, [base+bdct_lb_q+ 0] |
| mova m6, [base+bdct_lb_q+16] |
| movd m1, [base+subpel_filters+r0*8+2] |
| movd m2, [base+subpel_filters+rX*8+2] |
| movd m3, [base+subpel_filters+r4*8+2] |
| movd m7, [base+subpel_filters+r5*8+2] |
| movifprep r3, r3m |
| SWAP m4, m7 |
| %define m10 m5 |
| %define m11 m6 |
| %define m12 m1 |
| %define m13 m1 |
| %endif |
| psrld m14, 10 |
| paddd m14, m14 |
| punpckldq m13, m2 |
| punpckldq m15, m4 |
| punpcklqdq m13, m15 |
| pxor m2, m2 |
| pcmpeqd m0, m2 |
| %if ARCH_X86_64 |
| pand m9, m0 |
| %else |
| pand m2, m9, m0 |
| %define m9 m2 |
| SWAP m7, m4 |
| %endif |
| pandn m0, m13 |
| %if ARCH_X86_64 |
| SWAP m13, m0 |
| %else |
| %define m13 m0 |
| %endif |
| por m13, m9 |
| punpckhbw m15, m13, m13 |
| punpcklbw m13, m13 |
| psraw m15, 8 |
| psraw m13, 8 |
| pshufb m12, m14, m10 |
| pshufb m14, m11 |
| mova m10, [base+spel_s_shuf2] |
| movd r4d, m14 |
| shr r4d, 24 |
| %if ARCH_X86_32 |
| mova [stk+0x20], m13 |
| mova [stk+0x30], m15 |
| pxor m2, m2 |
| %endif |
| pshufb m7, m14, m2 |
| psubb m14, m7 |
| paddb m12, m10 |
| paddb m14, m10 |
| %if ARCH_X86_64 |
| lea r6, [r4+ssq*1] |
| lea r11, [r4+ssq*2] |
| lea r13, [r4+ss3q ] |
| movu m7, [srcq+ssq*0] |
| movu m9, [srcq+ssq*1] |
| movu m8, [srcq+ssq*2] |
| movu m10, [srcq+ss3q ] |
| movu m1, [srcq+r4 ] |
| movu m3, [srcq+r6 ] |
| movu m2, [srcq+r11 ] |
| movu m4, [srcq+r13 ] |
| lea srcq, [srcq+ssq*4] |
| REPX {pshufb x, m12}, m7, m9, m8, m10 |
| REPX {pmaddwd x, m13}, m7, m9, m8, m10 |
| REPX {pshufb x, m14}, m1, m2, m3, m4 |
| REPX {pmaddwd x, m15}, m1, m2, m3, m4 |
| mova m5, [rsp+0x10] |
| movd xm6, [rsp+0x20] |
| phaddd m7, m1 |
| phaddd m9, m3 |
| phaddd m8, m2 |
| phaddd m10, m4 |
| movu m1, [srcq+ssq*0] |
| movu m2, [srcq+ssq*1] |
| movu m3, [srcq+ssq*2] |
| movu m4, [srcq+ss3q ] |
| REPX {paddd x, m5}, m7, m9, m8, m10 |
| REPX {psrad x, xm6}, m7, m9, m8, m10 |
| packssdw m7, m9 ; 0 1 |
| packssdw m8, m10 ; 2 3 |
| movu m0, [srcq+r4 ] |
| movu m9, [srcq+r6 ] |
| movu m10, [srcq+r11 ] |
| movu m11, [srcq+r13 ] |
| lea srcq, [srcq+ssq*4] |
| REPX {pshufb x, m12}, m1, m2, m3, m4 |
| REPX {pmaddwd x, m13}, m1, m2, m3, m4 |
| REPX {pshufb x, m14}, m0, m9, m10, m11 |
| REPX {pmaddwd x, m15}, m0, m9, m10, m11 |
| phaddd m1, m0 |
| phaddd m2, m9 |
| phaddd m3, m10 |
| phaddd m4, m11 |
| REPX {paddd x, m5}, m1, m2, m3, m4 |
| REPX {psrad x, xm6}, m1, m2, m3, m4 |
| packssdw m1, m2 ; 4 5 |
| packssdw m3, m4 ; 6 7 |
| SWAP m9, m1 |
| shufps m4, m7, m8, q1032 ; 1 2 |
| shufps m5, m8, m9, q1032 ; 3 4 |
| shufps m6, m9, m3, q1032 ; 5 6 |
| pshufd m10, m3, q1032 ; 7 _ |
| punpcklwd m0, m7, m4 ; 01 |
| punpckhwd m7, m4 ; 12 |
| punpcklwd m1, m8, m5 ; 23 |
| punpckhwd m8, m5 ; 34 |
| punpcklwd m2, m9, m6 ; 45 |
| punpckhwd m9, m6 ; 56 |
| punpcklwd m3, m10 ; 67 |
| mova [rsp+0x40], m7 |
| mova [rsp+0x50], m8 |
| mova [rsp+0x60], m9 |
| %else |
| mova [stk+0x00], m12 |
| mova [stk+0x10], m14 |
| add r4, srcq |
| MC_4TAP_SCALED_H 0x40 ; 0 1 |
| MC_4TAP_SCALED_H 0x50 ; 2 3 |
| MC_4TAP_SCALED_H 0x60 ; 4 5 |
| MC_4TAP_SCALED_H 0x70 ; 6 7 |
| mova m4, [stk+0x40] |
| mova m5, [stk+0x50] |
| mova m6, [stk+0x60] |
| mova m7, [stk+0x70] |
| mov [stk+0xc0], r4 |
| shufps m1, m4, m5, q1032 ; 1 2 |
| shufps m2, m5, m6, q1032 ; 3 4 |
| shufps m3, m6, m7, q1032 ; 5 6 |
| pshufd m0, m7, q1032 ; 7 _ |
| mova [stk+0xb0], m0 |
| punpcklwd m0, m4, m1 ; 01 |
| punpckhwd m4, m1 ; 12 |
| punpcklwd m1, m5, m2 ; 23 |
| punpckhwd m5, m2 ; 34 |
| punpcklwd m2, m6, m3 ; 45 |
| punpckhwd m6, m3 ; 56 |
| punpcklwd m3, m7, [stk+0xb0] ; 67 |
| mov myd, mym |
| mov r0, r0m |
| mova [stk+0x40], m0 ; 01 |
| mova [stk+0x50], m1 ; 23 |
| mova [stk+0x60], m2 ; 45 |
| mova [stk+0x70], m3 ; 67 |
| mova [stk+0x80], m4 ; 12 |
| mova [stk+0x90], m5 ; 34 |
| mova [stk+0xa0], m6 ; 56 |
| %define m12 [stk+0x00] |
| %define m14 [stk+0x10] |
| %define m13 [stk+0x20] |
| %define m15 [stk+0x30] |
| %define hrnd_mem [esp+0x00] |
| %define hsh_mem [esp+0x10] |
| %if isput |
| %define vrnd_mem [esp+0x20] |
| %else |
| %define vrnd_mem [base+pd_m524256] |
| %endif |
| %endif |
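; per-row vertical loop: rebuild the 8-tap vertical filter from the current
; subpel phase (myd), apply it to the stacked row pairs, and fetch fresh
; rows whenever myd steps past the next source row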
| .w4_loop: |
| and myd, 0x3ff |
| %if ARCH_X86_64 |
| mov r11d, 64 << 24 |
| mov r13d, myd |
| shr r13d, 6 |
| lea r13d, [t1+r13] |
| cmovnz r11q, [base+subpel_filters+r13*8] |
| movq m9, r11q |
| punpcklbw m9, m9 |
| psraw m9, 8 |
| pshufd m7, m9, q0000 |
| pshufd m8, m9, q1111 |
| pmaddwd m4, m0, m7 |
| pmaddwd m5, m1, m8 |
| pshufd m7, m9, q2222 |
| pshufd m9, m9, q3333 |
| pmaddwd m6, m2, m7 |
| pmaddwd m8, m3, m9 |
| %if isput |
| movd m9, [rsp+0x28] |
| %define vrnd_mem [rsp+0x30] |
| %else |
| %define vrnd_mem [base+pd_m524256] |
| %endif |
| paddd m4, m5 |
| paddd m6, m8 |
| paddd m4, m6 |
| paddd m4, vrnd_mem |
| %else |
| mov mym, myd |
| mov r5, [esp+0x1f4] |
| xor r3, r3 |
| shr r4, 6 |
| lea r5, [r5+r4] |
| mov r4, 64 << 24 |
| cmovnz r4, [base+subpel_filters+r5*8+0] |
| cmovnz r3, [base+subpel_filters+r5*8+4] |
| movd m7, r4 |
| movd m6, r3 |
| punpckldq m7, m6 |
| punpcklbw m7, m7 |
| psraw m7, 8 |
| pshufd m4, m7, q0000 |
| pshufd m5, m7, q1111 |
| pshufd m6, m7, q2222 |
| pshufd m7, m7, q3333 |
| pmaddwd m0, m4 |
| pmaddwd m1, m5 |
| pmaddwd m2, m6 |
| pmaddwd m3, m7 |
| %if isput |
| movd m4, [esp+0x18] |
| %endif |
| paddd m0, m1 |
| paddd m2, m3 |
| paddd m0, vrnd_mem |
| paddd m0, m2 |
| SWAP m4, m0 |
| %define m9 m0 |
| %endif |
| %if isput |
| pxor m5, m5 |
| psrad m4, m9 |
| packssdw m4, m4 |
| pmaxsw m4, m5 |
| pminsw m4, pxmaxm |
| movq [dstq], m4 |
| add dstq, dsmp |
| %else |
| psrad m4, 6 |
| packssdw m4, m4 |
| movq [tmpq], m4 |
| add tmpq, 8 |
| %endif |
| dec hd |
| jz .ret |
| %if ARCH_X86_64 |
| add myd, dyd |
| test myd, ~0x3ff |
| jz .w4_loop |
| mova m8, [rsp+0x10] |
| movd m9, [rsp+0x20] |
| movu m4, [srcq] |
| movu m5, [srcq+r4] |
| test myd, 0x400 |
| jz .w4_skip_line |
| mova m0, [rsp+0x40] |
| mova [rsp+0x40], m1 |
| mova m1, [rsp+0x50] |
| mova [rsp+0x50], m2 |
| mova m2, [rsp+0x60] |
| mova [rsp+0x60], m3 |
| pshufb m4, m12 |
| pshufb m5, m14 |
| pmaddwd m4, m13 |
| pmaddwd m5, m15 |
| phaddd m4, m5 |
| paddd m4, m8 |
| psrad m4, m9 |
| packssdw m4, m4 |
| punpcklwd m3, m10, m4 |
| mova m10, m4 |
| add srcq, ssq |
| jmp .w4_loop |
| .w4_skip_line: |
| movu m6, [srcq+ssq*1] |
| movu m7, [srcq+r6] |
| mova m0, [rsp+0x50] |
| mova m11, [rsp+0x60] |
| pshufb m4, m12 |
| pshufb m6, m12 |
| pshufb m5, m14 |
| pshufb m7, m14 |
| pmaddwd m4, m13 |
| pmaddwd m6, m13 |
| pmaddwd m5, m15 |
| pmaddwd m7, m15 |
| mova [rsp+0x40], m0 |
| mova [rsp+0x50], m11 |
| phaddd m4, m5 |
| phaddd m6, m7 |
| paddd m4, m8 |
| paddd m6, m8 |
| psrad m4, m9 |
| psrad m6, m9 |
| packssdw m4, m6 |
| punpcklwd m9, m10, m4 |
| mova [rsp+0x60], m9 |
| pshufd m10, m4, q1032 |
| mova m0, m1 |
| mova m1, m2 |
| mova m2, m3 |
| punpcklwd m3, m4, m10 |
| lea srcq, [srcq+ssq*2] |
| jmp .w4_loop |
| %else |
| SWAP m0, m4 |
| mov myd, mym |
| mov r3, r3m |
| add myd, dym |
| test myd, ~0x3ff |
| jnz .w4_next_line |
| mova m0, [stk+0x40] |
| mova m1, [stk+0x50] |
| mova m2, [stk+0x60] |
| mova m3, [stk+0x70] |
| jmp .w4_loop |
| .w4_next_line: |
| mov r5, [stk+0xc0] |
| movu m4, [srcq] |
| movu m5, [r5] |
| test myd, 0x400 |
| jz .w4_skip_line |
| add [stk+0xc0], ssq |
| mova m0, [stk+0x80] |
| mova m3, [stk+0x50] |
| mova [stk+0x40], m0 |
| mova [stk+0x80], m3 |
| mova m1, [stk+0x90] |
| mova m6, [stk+0x60] |
| mova [stk+0x50], m1 |
| mova [stk+0x90], m6 |
| mova m2, [stk+0xa0] |
| mova m7, [stk+0x70] |
| mova [stk+0x60], m2 |
| mova [stk+0xa0], m7 |
| pshufb m4, m12 |
| pshufb m5, m14 |
| pmaddwd m4, m13 |
| pmaddwd m5, m15 |
| phaddd m4, m5 |
| paddd m4, hrnd_mem |
| psrad m4, hsh_mem |
| packssdw m4, m4 |
| punpcklwd m3, [stk+0xb0], m4 |
| mova [stk+0xb0], m4 |
| mova [stk+0x70], m3 |
| add srcq, ssq |
| jmp .w4_loop |
| .w4_skip_line: |
| movu m6, [srcq+ssq*1] |
| movu m7, [r5 +ssq*1] |
| lea r5, [r5 +ssq*2] |
| mov [stk+0xc0], r5 |
| mova m0, [stk+0x50] |
| mova m1, [stk+0x60] |
| mova m2, [stk+0x70] |
| mova m3, [stk+0x90] |
| pshufb m4, m12 |
| pshufb m6, m12 |
| pshufb m5, m14 |
| pshufb m7, m14 |
| pmaddwd m4, m13 |
| pmaddwd m6, m13 |
| pmaddwd m5, m15 |
| pmaddwd m7, m15 |
| mova [stk+0x40], m0 |
| mova [stk+0x50], m1 |
| mova [stk+0x60], m2 |
| mova [stk+0x80], m3 |
| phaddd m4, m5 |
| phaddd m6, m7 |
| mova m5, [stk+0xa0] |
| mova m7, [stk+0xb0] |
| paddd m4, hrnd_mem |
| paddd m6, hrnd_mem |
| psrad m4, hsh_mem |
| psrad m6, hsh_mem |
| packssdw m4, m6 |
| punpcklwd m7, m4 |
| pshufd m6, m4, q1032 |
| mova [stk+0x90], m5 |
| mova [stk+0xa0], m7 |
| mova [stk+0xb0], m6 |
| punpcklwd m3, m4, m6 |
| mova [stk+0x70], m3 |
| lea srcq, [srcq+ssq*2] |
| jmp .w4_loop |
| %endif |
| INIT_XMM ssse3 |
| %if ARCH_X86_64 |
| %define stk rsp+0x20 |
| %endif |
| .w8: |
| mov dword [stk+0xf0], 1 |
| movifprep tmp_stridem, 16 |
| jmp .w_start |
| .w16: |
| mov dword [stk+0xf0], 2 |
| movifprep tmp_stridem, 32 |
| jmp .w_start |
| .w32: |
| mov dword [stk+0xf0], 4 |
| movifprep tmp_stridem, 64 |
| jmp .w_start |
| .w64: |
| mov dword [stk+0xf0], 8 |
| movifprep tmp_stridem, 128 |
| jmp .w_start |
| .w128: |
| mov dword [stk+0xf0], 16 |
| movifprep tmp_stridem, 256 |
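; shared setup for w >= 8: [stk+0xf0] holds the number of 8-pixel column
; slices; .hloop filters one slice at a time and .hloop_prep advances to the
; next one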
| .w_start: |
| %if ARCH_X86_64 |
| %ifidn %1, put |
| movifnidn dsm, dsq |
| %endif |
| mova [rsp+0x10], m11 |
| %define hround m11 |
| shr t0d, 16 |
| movd m15, t0d |
| %if isprep |
| mova m13, [base+pd_m524256] |
| %endif |
| %else |
| %define hround [esp+0x00] |
| %define m12 [esp+0x10] |
| %define m10 [base+pd_0x3ff] |
| %define m8 m0 |
| %xdefine m14 m4 |
| %define m15 m3 |
| %if isprep |
| %define ssq ssm |
| %endif |
| mov r4, [esp+0x1f0] |
| shr r4, 16 |
| movd m15, r4 |
| mov r0, r0m |
| mov myd, mym |
| %endif |
| sub srcq, 6 |
| pslld m7, m8, 2 ; dx*4 |
| pmaddwd m8, [base+rescale_mul] ; dx*[0-3] |
| pshufd m15, m15, q0000 |
| paddd m14, m8 ; mx+dx*[0-3] |
| mova [stk+0x100], m7 |
| mova [stk+0x120], m15 |
| mov [stk+0x0f8], srcq |
| mov [stk+0x130], r0q ; dstq / tmpq |
| %if ARCH_X86_64 && UNIX64 |
| mov hm, hd |
| %elif ARCH_X86_32 |
| mov r5, hm |
| mov [stk+0x0f4], myd |
| mov [stk+0x134], r5 |
| %endif |
| jmp .hloop |
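; advance to the next 8-pixel column slice, or return once all slices are done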
| .hloop_prep: |
| dec dword [stk+0x0f0] |
| jz .ret |
| %if ARCH_X86_64 |
| add qword [stk+0x130], 16 |
| mov hd, hm |
| %else |
| add dword [stk+0x130], 16 |
| mov myd, [stk+0x0f4] |
| mov r5, [stk+0x134] |
| mov r0, [stk+0x130] |
| %endif |
| mova m7, [stk+0x100] |
| mova m14, [stk+0x110] |
| %if ARCH_X86_64 |
| mova m10, [base+pd_0x3ff] |
| mova m11, [rsp+0x10] |
| %endif |
| mova m15, [stk+0x120] |
| mov srcq, [stk+0x0f8] |
| %if ARCH_X86_64 |
| mov r0q, [stk+0x130] ; dstq / tmpq |
| %else |
| mov mym, myd |
| mov hm, r5 |
| mov r0m, r0 |
| mov r3, r3m |
| %endif |
| paddd m14, m7 |
| .hloop: |
| %if ARCH_X86_64 |
| mova m9, [base+pq_0x40000000] |
| %else |
| %define m9 [base+pq_0x40000000] |
| %endif |
| pxor m1, m1 |
| psrld m2, m14, 10 |
| mova [stk], m2 |
| pand m6, m14, m10 |
| psrld m6, 6 |
| paddd m5, m15, m6 |
| pcmpeqd m6, m1 |
| pshufd m2, m5, q1032 |
| %if ARCH_X86_64 |
| movd r4d, m5 |
| movd r6d, m2 |
| pshufd m5, m5, q0321 |
| pshufd m2, m2, q0321 |
| movd r7d, m5 |
| movd r9d, m2 |
| movq m0, [base+subpel_filters+r4*8] |
| movq m1, [base+subpel_filters+r6*8] |
| movhps m0, [base+subpel_filters+r7*8] |
| movhps m1, [base+subpel_filters+r9*8] |
| %else |
| movd r0, m5 |
| movd rX, m2 |
| pshufd m5, m5, q0321 |
| pshufd m2, m2, q0321 |
| movd r4, m5 |
| movd r5, m2 |
| movq m0, [base+subpel_filters+r0*8] |
| movq m1, [base+subpel_filters+rX*8] |
| movhps m0, [base+subpel_filters+r4*8] |
| movhps m1, [base+subpel_filters+r5*8] |
| %endif |
| paddd m14, m7 ; mx+dx*[4-7] |
| pand m5, m14, m10 |
| psrld m5, 6 |
| paddd m15, m5 |
| pxor m2, m2 |
| pcmpeqd m5, m2 |
| mova [stk+0x110], m14 |
| pshufd m4, m15, q1032 |
| %if ARCH_X86_64 |
| movd r10d, m15 |
| movd r11d, m4 |
| pshufd m15, m15, q0321 |
| pshufd m4, m4, q0321 |
| movd r13d, m15 |
| movd rXd, m4 |
| movq m2, [base+subpel_filters+r10*8] |
| movq m3, [base+subpel_filters+r11*8] |
| movhps m2, [base+subpel_filters+r13*8] |
| movhps m3, [base+subpel_filters+ rX*8] |
| psrld m14, 10 |
| movq r11, m14 |
| punpckhqdq m14, m14 |
| movq rX, m14 |
| mov r10d, r11d |
| shr r11, 32 |
| mov r13d, rXd |
| shr rX, 32 |
| mov r4d, [stk+ 0] |
| mov r6d, [stk+ 4] |
| mov r7d, [stk+ 8] |
| mov r9d, [stk+12] |
| pshufd m4, m6, q1100 |
| pshufd m6, m6, q3322 |
| pshufd m14, m5, q1100 |
| pshufd m5, m5, q3322 |
| pand m7, m9, m4 |
| pand m8, m9, m6 |
| pand m15, m9, m14 |
| pand m9, m9, m5 |
| pandn m4, m0 |
| pandn m6, m1 |
| pandn m14, m2 |
| pandn m5, m3 |
| por m7, m4 |
| por m8, m6 |
| por m15, m14 |
| por m9, m5 |
| punpcklbw m0, m7, m7 |
| punpckhbw m7, m7 |
| punpcklbw m1, m8, m8 |
| punpckhbw m8, m8 |
| psraw m0, 8 |
| psraw m7, 8 |
| psraw m1, 8 |
| psraw m8, 8 |
| punpcklbw m2, m15, m15 |
| punpckhbw m15, m15 |
| punpcklbw m3, m9, m9 |
| punpckhbw m9, m9 |
| psraw m2, 8 |
| psraw m15, 8 |
| psraw m3, 8 |
| psraw m9, 8 |
| mova [stk+0x10], m0 |
| mova [stk+0x20], m7 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m8 |
| mova [stk+0x50], m2 |
| mova [stk+0x60], m15 |
| mova [stk+0x70], m3 |
| mova [stk+0x80], m9 |
| MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 |
| mova [stk+0x90], m1 |
| MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 |
| mova [stk+0xa0], m2 |
| MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 |
| mova [stk+0xb0], m3 |
| MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 |
| mova [stk+0xc0], m4 |
| MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 |
| mova [stk+0xd0], m5 |
| MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 |
| MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 |
| MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 |
| mova m5, [stk+0xd0] |
| mova m1, [stk+0x90] |
| mova m2, [stk+0xa0] |
| mova m3, [stk+0xb0] |
| mova m9, [stk+0xc0] |
| mov myd, mym |
| mov dyd, dym |
| punpcklwd m4, m5, m6 ; 45a |
| punpckhwd m5, m6 ; 45b |
| punpcklwd m6, m7, m8 ; 67a |
| punpckhwd m7, m8 ; 67b |
| punpcklwd m0, m1, m2 ; 01a |
| punpckhwd m1, m2 ; 01b |
| punpcklwd m2, m3, m9 ; 23a |
| punpckhwd m3, m9 ; 23b |
| mova [stk+0x90], m4 |
| mova [stk+0xa0], m5 |
| mova [stk+0xb0], m6 |
| mova [stk+0xc0], m7 |
| %define hround [rsp+0x10] |
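; vertical pass for the current slice: the 01/23 row pairs stay in m0-m3,
; the 45/67 pairs are spilled to [stk+0x90..0xc0]; the vertical filter is
; rebuilt from myd every row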
| .vloop: |
| and myd, 0x3ff |
| mov r6d, 64 << 24 |
| mov r4d, myd |
| shr r4d, 6 |
| lea r4d, [t1+r4] |
| cmovnz r6q, [base+subpel_filters+r4*8] |
| movq m11, r6q |
| punpcklbw m11, m11 |
| psraw m11, 8 |
| pshufd m5, m11, q0000 |
| pshufd m7, m11, q1111 |
| pshufd m10, m11, q2222 |
| pshufd m11, m11, q3333 |
| pmaddwd m4, m5, m0 |
| pmaddwd m5, m5, m1 |
| pmaddwd m6, m7, m2 |
| pmaddwd m7, m7, m3 |
| paddd m4, m13 |
| paddd m5, m13 |
| paddd m4, m6 |
| paddd m5, m7 |
| pmaddwd m6, [stk+0x90], m10 |
| pmaddwd m7, [stk+0xa0], m10 |
| pmaddwd m8, [stk+0xb0], m11 |
| pmaddwd m9, [stk+0xc0], m11 |
| paddd m4, m6 |
| paddd m5, m7 |
| %if isput |
| pshufd m6, m12, q1032 |
| %endif |
| paddd m4, m8 |
| paddd m5, m9 |
| %else |
| movd r0, m15 |
| movd rX, m4 |
| pshufd m15, m15, q0321 |
| pshufd m4, m4, q0321 |
| movd r4, m15 |
| movd r5, m4 |
| mova m14, [stk+0x110] |
| movq m2, [base+subpel_filters+r0*8] |
| movq m3, [base+subpel_filters+rX*8] |
| movhps m2, [base+subpel_filters+r4*8] |
| movhps m3, [base+subpel_filters+r5*8] |
| psrld m14, 10 |
| mova [stk+16], m14 |
| mov r0, [stk+ 0] |
| mov rX, [stk+ 4] |
| mov r4, [stk+ 8] |
| mov r5, [stk+12] |
| mova [stk+0x20], m0 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| pshufd m4, m6, q1100 |
| pshufd m6, m6, q3322 |
| pshufd m7, m5, q1100 |
| pshufd m5, m5, q3322 |
| pand m0, m9, m4 |
| pand m1, m9, m6 |
| pand m2, m9, m7 |
| pand m3, m9, m5 |
| pandn m4, [stk+0x20] |
| pandn m6, [stk+0x30] |
| pandn m7, [stk+0x40] |
| pandn m5, [stk+0x50] |
| por m0, m4 |
| por m1, m6 |
| por m2, m7 |
| por m3, m5 |
| punpcklbw m4, m0, m0 |
| punpckhbw m0, m0 |
| punpcklbw m5, m1, m1 |
| punpckhbw m1, m1 |
| psraw m4, 8 |
| psraw m0, 8 |
| psraw m5, 8 |
| psraw m1, 8 |
| punpcklbw m6, m2, m2 |
| punpckhbw m2, m2 |
| punpcklbw m7, m3, m3 |
| punpckhbw m3, m3 |
| psraw m6, 8 |
| psraw m2, 8 |
| psraw m7, 8 |
| psraw m3, 8 |
| mova [stk+0x0a0], m4 |
| mova [stk+0x0b0], m0 |
| mova [stk+0x0c0], m5 |
| mova [stk+0x0d0], m1 |
| mova [stk+0x140], m6 |
| mova [stk+0x150], m2 |
| mova [stk+0x160], m7 |
| mova [stk+0x170], m3 |
| MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 |
| MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 |
| MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 |
| MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 |
| MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 |
| MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 |
| MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 |
| MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 |
| mova m5, [stk+0x60] |
| mova m6, [stk+0x70] |
| mova m7, [stk+0x80] |
| mova m0, [stk+0x90] |
| mov myd, mym |
| punpcklwd m4, m5, m6 ; 45a |
| punpckhwd m5, m6 ; 45b |
| punpcklwd m6, m7, m0 ; 67a |
| punpckhwd m7, m0 ; 67b |
| mova [stk+0x60], m4 |
| mova [stk+0x70], m5 |
| mova [stk+0x80], m6 |
| mova [stk+0x90], m7 |
| mova m1, [stk+0x20] |
| mova m2, [stk+0x30] |
| mova m3, [stk+0x40] |
| mova m4, [stk+0x50] |
| punpcklwd m0, m1, m2 ; 01a |
| punpckhwd m1, m2 ; 01b |
| punpcklwd m2, m3, m4 ; 23a |
| punpckhwd m3, m4 ; 23b |
| mova [stk+0x20], m0 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| .vloop: |
| mov r0, r0m |
| mov r5, [esp+0x1f4] |
| and myd, 0x3ff |
| mov mym, myd |
| xor r3, r3 |
| shr r4, 6 |
| lea r5, [r5+r4] |
| mov r4, 64 << 24 |
| cmovnz r4, [base+subpel_filters+r5*8+0] |
| cmovnz r3, [base+subpel_filters+r5*8+4] |
| movd m7, r4 |
| movd m6, r3 |
| punpckldq m7, m6 |
| punpcklbw m7, m7 |
| psraw m7, 8 |
| pshufd m4, m7, q0000 |
| pshufd m5, m7, q1111 |
| pmaddwd m0, m4 |
| pmaddwd m1, m4 |
| pmaddwd m2, m5 |
| pmaddwd m3, m5 |
| pshufd m6, m7, q2222 |
| pshufd m7, m7, q3333 |
| paddd m0, m2 |
| paddd m1, m3 |
| pmaddwd m2, [stk+0x60], m6 |
| pmaddwd m3, [stk+0x70], m6 |
| pmaddwd m4, [stk+0x80], m7 |
| pmaddwd m5, [stk+0x90], m7 |
| %if isput |
| movd m6, [esp+0x18] |
| %endif |
| paddd m0, m2 |
| paddd m1, m3 |
| paddd m0, vrnd_mem |
| paddd m1, vrnd_mem |
| paddd m4, m0 |
| paddd m5, m1 |
| %endif |
| %ifidn %1, put |
| psrad m4, m6 |
| psrad m5, m6 |
| packssdw m4, m5 |
| pxor m7, m7 |
| pmaxsw m4, m7 |
| pminsw m4, pxmaxm |
| mova [dstq], m4 |
| add dstq, dsm |
| %else |
| psrad m4, 6 |
| psrad m5, 6 |
| packssdw m4, m5 |
| mova [tmpq], m4 |
| add tmpq, tmp_stridem |
| %endif |
| dec hd |
| jz .hloop_prep |
| %if ARCH_X86_64 |
| add myd, dyd |
| test myd, ~0x3ff |
| jz .vloop |
| test myd, 0x400 |
| mov [stk+0x140], myd |
| mov r4d, [stk+ 0] |
| mov r6d, [stk+ 4] |
| mov r7d, [stk+ 8] |
| mov r9d, [stk+12] |
| jz .skip_line |
| mova m14, [base+unpckw] |
| movu m8, [srcq+r10*2] |
| movu m9, [srcq+r11*2] |
| movu m10, [srcq+r13*2] |
| movu m11, [srcq+ rX*2] |
| movu m4, [srcq+ r4*2] |
| movu m5, [srcq+ r6*2] |
| movu m6, [srcq+ r7*2] |
| movu m7, [srcq+ r9*2] |
| add srcq, ssq |
| mov myd, [stk+0x140] |
| mov dyd, dym |
| pshufd m15, m14, q1032 |
| pshufb m0, m14 ; 0a 1a |
| pshufb m1, m14 ; 0b 1b |
| pshufb m2, m15 ; 3a 2a |
| pshufb m3, m15 ; 3b 2b |
| pmaddwd m8, [stk+0x50] |
| pmaddwd m9, [stk+0x60] |
| pmaddwd m10, [stk+0x70] |
| pmaddwd m11, [stk+0x80] |
| pmaddwd m4, [stk+0x10] |
| pmaddwd m5, [stk+0x20] |
| pmaddwd m6, [stk+0x30] |
| pmaddwd m7, [stk+0x40] |
| phaddd m8, m9 |
| phaddd m10, m11 |
| mova m11, hround |
| phaddd m4, m5 |
| phaddd m6, m7 |
| phaddd m8, m10 |
| phaddd m4, m6 |
| paddd m4, m11 |
| paddd m8, m11 |
| psrad m4, m12 |
| psrad m8, m12 |
| packssdw m4, m8 |
| pshufb m5, [stk+0x90], m14 ; 4a 5a |
| pshufb m6, [stk+0xa0], m14 ; 4b 5b |
| pshufb m7, [stk+0xb0], m15 ; 7a 6a |
| pshufb m8, [stk+0xc0], m15 ; 7b 6b |
| punpckhwd m0, m2 ; 12a |
| punpckhwd m1, m3 ; 12b |
| punpcklwd m2, m5 ; 34a |
| punpcklwd m3, m6 ; 34b |
| punpckhwd m5, m7 ; 56a |
| punpckhwd m6, m8 ; 56b |
| punpcklwd m7, m4 ; 78a |
| punpckhqdq m4, m4 |
| punpcklwd m8, m4 ; 78b |
| mova [stk+0x90], m5 |
| mova [stk+0xa0], m6 |
| mova [stk+0xb0], m7 |
| mova [stk+0xc0], m8 |
| jmp .vloop |
| .skip_line: |
| MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11 |
| MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11 |
| mov myd, [stk+0x140] |
| mov dyd, dym |
| mova m0, m2 ; 01a |
| mova m1, m3 ; 01b |
| mova m2, [stk+0x90] ; 23a |
| mova m3, [stk+0xa0] ; 23b |
| mova m5, [stk+0xb0] ; 45a |
| mova m6, [stk+0xc0] ; 45b |
| punpcklwd m7, m4, m8 ; 67a |
| punpckhwd m4, m8 ; 67b |
| mova [stk+0x90], m5 |
| mova [stk+0xa0], m6 |
| mova [stk+0xb0], m7 |
| mova [stk+0xc0], m4 |
| %else |
| mov r0m, r0 |
| mov myd, mym |
| mov r3, r3m |
| add myd, dym |
| test myd, ~0x3ff |
| mov mym, myd |
| jnz .next_line |
| mova m0, [stk+0x20] |
| mova m1, [stk+0x30] |
| mova m2, [stk+0x40] |
| mova m3, [stk+0x50] |
| jmp .vloop |
| .next_line: |
| test myd, 0x400 |
| mov r0, [stk+ 0] |
| mov rX, [stk+ 4] |
| mov r4, [stk+ 8] |
| mov r5, [stk+12] |
| jz .skip_line |
| MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 |
| mova m7, [base+unpckw] |
| pshufd m4, m7, q1032 |
| pshufb m0, [stk+0x20], m7 ; 0a 1a |
| pshufb m1, [stk+0x30], m7 ; 0b 1b |
| pshufb m2, [stk+0x40], m4 ; 3a 2a |
| pshufb m3, [stk+0x50], m4 ; 3b 2b |
| pshufb m5, [stk+0x60], m7 ; 4a 5a |
| pshufb m6, [stk+0x70], m7 ; 4b 5b |
| pshufb m7, [stk+0x80], m4 ; 7a 6a |
| punpckhwd m0, m2 ; 12a |
| punpckhwd m1, m3 ; 12b |
| punpcklwd m2, m5 ; 34a |
| punpcklwd m3, m6 ; 34b |
| mova [stk+0x20], m0 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| punpckhwd m5, m7 ; 56a |
| mova [stk+0x60], m5 |
| pshufb m5, [stk+0x90], m4 ; 7b 6b |
| punpcklwd m7, [stk+0xe0] ; 78a |
| punpckhwd m6, m5 ; 56b |
| mova [stk+0x70], m6 |
| movq m6, [stk+0xe8] |
| mova [stk+0x80], m7 |
| punpcklwd m5, m6 |
| mov myd, mym |
| mova [stk+0x90], m5 |
| jmp .vloop |
| .skip_line: |
| MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 |
| MC_8TAP_SCALED_H 0xa0, 0 ; 9 |
| mova m7, [stk+0xe0] |
| mova m2, [stk+0x60] ; 23a |
| mova m3, [stk+0x70] ; 23b |
| mova m4, [stk+0x80] ; 45a |
| mova m5, [stk+0x90] ; 45b |
| punpcklwd m6, m7, m0 ; 67a |
| punpckhwd m7, m0 ; 67b |
| mova m0, [stk+0x40] ; 01a |
| mova m1, [stk+0x50] ; 01b |
| mov myd, mym |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| mova [stk+0x60], m4 |
| mova [stk+0x70], m5 |
| mova [stk+0x80], m6 |
| mova [stk+0x90], m7 |
| mova [stk+0x20], m0 |
| mova [stk+0x30], m1 |
| %endif |
| jmp .vloop |
| INIT_XMM ssse3 |
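; dy == 1024: the vertical filter phase is the same for every output row,
; so the vertical coefficients are computed once instead of per row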
| .dy1: |
| movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] |
| add wq, base_reg |
| jmp wq |
| %if isput |
| .dy1_w2: |
| %if ARCH_X86_64 |
| mov myd, mym |
| movzx t0d, t0b |
| sub srcq, 2 |
| movd m15, t0d |
| %else |
| %define m8 m0 |
| %define m9 m1 |
| %define m14 m4 |
| %define m15 m3 |
| %define m11 [esp+0x00] |
| %define m12 [esp+0x10] |
| %define m13 [esp+0x20] |
| movzx r5, byte [esp+0x1f0] |
| sub srcq, 2 |
| movd m15, r5 |
| mov r1, r1m |
| %endif |
| pxor m9, m9 |
| punpckldq m9, m8 |
| paddd m14, m9 ; mx+dx*[0-1] |
| %if ARCH_X86_64 |
| mova m9, [base+pd_0x4000] |
| %endif |
| pshufd m15, m15, q0000 |
| pand m8, m14, m10 |
| psrld m8, 6 |
| paddd m15, m8 |
| movd r4d, m15 |
| pshufd m15, m15, q0321 |
| %if ARCH_X86_64 |
| movd r6d, m15 |
| %else |
| movd r3d, m15 |
| %endif |
| mova m5, [base+bdct_lb_q] |
| mova m6, [base+spel_s_shuf2] |
| movd m15, [base+subpel_filters+r4*8+2] |
| %if ARCH_X86_64 |
| movd m7, [base+subpel_filters+r6*8+2] |
| %else |
| movd m7, [base+subpel_filters+r3*8+2] |
| %endif |
| pxor m2, m2 |
| pcmpeqd m8, m2 |
| psrld m14, 10 |
| paddd m14, m14 |
| %if ARCH_X86_32 |
| mov r3, r3m |
| pshufb m14, m5 |
| paddb m14, m6 |
| mova [stk], m14 |
| SWAP m5, m0 |
| SWAP m6, m3 |
| %define m15 m6 |
| %endif |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*1] |
| movu m2, [srcq+ssq*2] |
| movu m3, [srcq+ss3q ] |
| lea srcq, [srcq+ssq*4] |
| punpckldq m15, m7 |
| %if ARCH_X86_64 |
| pshufb m14, m5 |
| paddb m14, m6 |
| pand m9, m8 |
| pandn m8, m15 |
| SWAP m15, m8 |
| por m15, m9 |
| movu m4, [srcq+ssq*0] |
| movu m5, [srcq+ssq*1] |
| movu m6, [srcq+ssq*2] |
| add srcq, ss3q |
| shr myd, 6 |
| mov r4d, 64 << 24 |
| lea myd, [t1+myq] |
| cmovnz r4q, [base+subpel_filters+myq*8] |
| %else |
| pand m7, m5, [base+pd_0x4000] |
| pandn m5, m15 |
| por m5, m7 |
| %define m15 m5 |
| mov myd, mym |
| mov r5, [esp+0x1f4] |
| xor r3, r3 |
| shr myd, 6 |
| lea r5, [r5+myd] |
| mov r4, 64 << 24 |
| cmovnz r4, [base+subpel_filters+r5*8+0] |
| cmovnz r3, [base+subpel_filters+r5*8+4] |
| mov [stk+0x20], r3 |
| mov r3, r3m |
| %endif |
| punpcklbw m15, m15 |
| psraw m15, 8 |
| REPX {pshufb x, m14}, m0, m1, m2, m3 |
| REPX {pmaddwd x, m15}, m0, m1, m2, m3 |
| %if ARCH_X86_64 |
| REPX {pshufb x, m14}, m4, m5, m6 |
| REPX {pmaddwd x, m15}, m4, m5, m6 |
| phaddd m0, m1 |
| phaddd m2, m3 |
| phaddd m4, m5 |
| phaddd m6, m6 |
| REPX {paddd x, m11}, m0, m2, m4, m6 |
| REPX {psrad x, m12}, m0, m2, m4, m6 |
| packssdw m0, m2 ; 0 1 2 3 |
| packssdw m4, m6 ; 4 5 6 |
| SWAP m1, m4 |
| movq m10, r4 |
| %else |
| mova [stk+0x10], m15 |
| phaddd m0, m1 |
| phaddd m2, m3 |
| movu m1, [srcq+ssq*0] |
| movu m7, [srcq+ssq*1] |
| movu m6, [srcq+ssq*2] |
| add srcq, ss3q |
| REPX {pshufb x, m14}, m1, m7, m6 |
| REPX {pmaddwd x, m15}, m1, m7, m6 |
| %define m14 [stk+0x00] |
| %define m15 [stk+0x10] |
| phaddd m1, m7 |
| phaddd m6, m6 |
| REPX {paddd x, m11}, m0, m2, m1, m6 |
| REPX {psrad x, m12}, m0, m2, m1, m6 |
| packssdw m0, m2 |
| packssdw m1, m6 |
| %define m8 m6 |
| %define m9 m4 |
| %define m10 m5 |
| movd m10, r4 |
| movd m9, [stk+0x20] |
| punpckldq m10, m9 |
| %endif |
| punpcklbw m10, m10 |
| psraw m10, 8 |
| pshufd m7, m10, q0000 |
| pshufd m8, m10, q1111 |
| pshufd m9, m10, q2222 |
| pshufd m10, m10, q3333 |
| %if ARCH_X86_32 |
| mova [stk+0x50], m7 |
| mova [stk+0x60], m8 |
| mova [stk+0x70], m9 |
| mova [stk+0x80], m10 |
| %define m7 [stk+0x50] |
| %define m8 [stk+0x60] |
| %define m9 [stk+0x70] |
| %define m10 [stk+0x80] |
| %endif |
| palignr m2, m1, m0, 4 ; 1 2 3 4 |
| punpcklwd m3, m0, m2 ; 01 12 |
| punpckhwd m0, m2 ; 23 34 |
| pshufd m4, m1, q2121 ; 5 6 5 6 |
| punpcklwd m2, m1, m4 ; 45 56 |
| %if ARCH_X86_32 |
| mov r0, r0m |
| %endif |
| .dy1_w2_loop: |
| movu m1, [srcq+ssq*0] |
| movu m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| pmaddwd m5, m3, m7 |
| mova m3, m0 |
| pmaddwd m0, m8 |
| pshufb m1, m14 |
| pshufb m6, m14 |
| pmaddwd m1, m15 |
| pmaddwd m6, m15 |
| phaddd m1, m6 |
| paddd m1, m11 |
| psrad m1, m12 |
| packssdw m1, m1 |
| paddd m5, m0 |
| mova m0, m2 |
| pmaddwd m2, m9 |
| paddd m5, m2 |
| palignr m2, m1, m4, 12 |
| punpcklwd m2, m1 ; 67 78 |
| pmaddwd m4, m2, m10 |
| paddd m5, m13 |
| paddd m5, m4 |
| pxor m6, m6 |
| mova m4, m1 |
| pshufd m1, m12, q1032 |
| psrad m5, m1 |
| packssdw m5, m5 |
| pmaxsw m5, m6 |
| pminsw m5, pxmaxm |
| movd [dstq+dsq*0], m5 |
| pshuflw m5, m5, q1032 |
| movd [dstq+dsq*1], m5 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .dy1_w2_loop |
| RET |
| %endif |
| INIT_XMM ssse3 |
| .dy1_w4: |
| %if ARCH_X86_64 |
| mov myd, mym |
| mova [rsp+0x10], m11 |
| mova [rsp+0x20], m12 |
| %if isput |
| mova [rsp+0x30], m13 |
| %define vrnd_mem [rsp+0x30] |
| %define stk rsp+0x40 |
| %else |
| %define vrnd_mem [base+pd_m524256] |
| %define stk rsp+0x30 |
| %endif |
| movzx t0d, t0b |
| sub srcq, 2 |
| movd m15, t0d |
| %else |
| %define m10 [base+pd_0x3ff] |
| %define m9 [base+pd_0x4000] |
| %define m8 m0 |
| %xdefine m14 m4 |
| %define m15 m3 |
| %if isprep |
| %define ssq r3 |
| %endif |
| movzx r5, byte [esp+0x1f0] |
| sub srcq, 2 |
| movd m15, r5 |
| %endif |
| pmaddwd m8, [base+rescale_mul] |
| %if ARCH_X86_64 |
| mova m9, [base+pd_0x4000] |
| %endif |
| pshufd m15, m15, q0000 |
| paddd m14, m8 ; mx+dx*[0-3] |
| pand m0, m14, m10 |
| psrld m0, 6 |
| paddd m15, m0 |
| pshufd m7, m15, q1032 |
| %if ARCH_X86_64 |
| movd r4d, m15 |
| movd r11d, m7 |
| pshufd m15, m15, q0321 |
| pshufd m7, m7, q0321 |
| movd r6d, m15 |
| movd r13d, m7 |
| mova m10, [base+bdct_lb_q+ 0] |
| mova m11, [base+bdct_lb_q+16] |
| movd m13, [base+subpel_filters+ r4*8+2] |
| movd m2, [base+subpel_filters+ r6*8+2] |
| movd m15, [base+subpel_filters+r11*8+2] |
| movd m4, [base+subpel_filters+r13*8+2] |
| %else |
| movd r0, m15 |
| movd r4, m7 |
| pshufd m15, m15, q0321 |
| pshufd m7, m7, q0321 |
| movd rX, m15 |
| movd r5, m7 |
| mova m5, [base+bdct_lb_q+ 0] |
| mova m6, [base+bdct_lb_q+16] |
| movd m1, [base+subpel_filters+r0*8+2] |
| movd m2, [base+subpel_filters+rX*8+2] |
| movd m3, [base+subpel_filters+r4*8+2] |
| movd m7, [base+subpel_filters+r5*8+2] |
| SWAP m4, m7 |
| %if isprep |
| mov r3, r3m |
| %endif |
| %define m10 m5 |
| %define m11 m6 |
| %define m12 m1 |
| %define m13 m1 |
| %endif |
| psrld m14, 10 |
| paddd m14, m14 |
| punpckldq m13, m2 |
| punpckldq m15, m4 |
| punpcklqdq m13, m15 |
| pxor m2, m2 |
| pcmpeqd m0, m2 |
| %if ARCH_X86_64 |
| pand m9, m0 |
| %else |
| pand m2, m9, m0 |
| %define m9 m2 |
| SWAP m7, m4 |
| %endif |
| pandn m0, m13 |
| %if ARCH_X86_64 |
| SWAP m13, m0 |
| %else |
| %define m13 m0 |
| %endif |
| por m13, m9 |
| punpckhbw m15, m13, m13 |
| punpcklbw m13, m13 |
| psraw m15, 8 |
| psraw m13, 8 |
| pshufb m12, m14, m10 |
| pshufb m14, m11 |
| mova m10, [base+spel_s_shuf2] |
| movd r4d, m14 |
| shr r4d, 24 |
| %if ARCH_X86_32 |
| mova [stk+0x40], m13 |
| mova [stk+0x50], m15 |
| pxor m2, m2 |
| %endif |
| pshufb m7, m14, m2 |
| psubb m14, m7 |
| paddb m12, m10 |
| paddb m14, m10 |
| %if ARCH_X86_64 |
| lea r6, [r4+ssq*1] |
| lea r11, [r4+ssq*2] |
| lea r13, [r4+ss3q ] |
| movu m7, [srcq+ssq*0] |
| movu m9, [srcq+ssq*1] |
| movu m8, [srcq+ssq*2] |
| movu m10, [srcq+ss3q ] |
| movu m1, [srcq+r4 ] |
| movu m3, [srcq+r6 ] |
| movu m2, [srcq+r11 ] |
| movu m4, [srcq+r13 ] |
| lea srcq, [srcq+ssq*4] |
| REPX {pshufb x, m12}, m7, m9, m8, m10 |
| REPX {pmaddwd x, m13}, m7, m9, m8, m10 |
| REPX {pshufb x, m14}, m1, m3, m2, m4 |
| REPX {pmaddwd x, m15}, m1, m3, m2, m4 |
| mova m5, [rsp+0x10] |
| movd xm6, [rsp+0x20] |
| phaddd m7, m1 |
| phaddd m9, m3 |
| phaddd m8, m2 |
| phaddd m10, m4 |
| movu m1, [srcq+ssq*0] |
| movu m2, [srcq+ssq*1] |
| movu m3, [srcq+ssq*2] |
| REPX {paddd x, m5}, m7, m9, m8, m10 |
| REPX {psrad x, xm6}, m7, m9, m8, m10 |
| packssdw m7, m9 ; 0 1 |
| packssdw m8, m10 ; 2 3 |
| movu m0, [srcq+r4 ] |
| movu m9, [srcq+r6 ] |
| movu m10, [srcq+r11 ] |
| add srcq, ss3q |
| REPX {pshufb x, m12}, m1, m2, m3 |
| REPX {pmaddwd x, m13}, m1, m2, m3 |
| REPX {pshufb x, m14}, m0, m9, m10 |
| REPX {pmaddwd x, m15}, m0, m9, m10 |
| phaddd m1, m0 |
| phaddd m2, m9 |
| phaddd m3, m10 |
| shr myd, 6 |
| mov r13d, 64 << 24 |
| lea myd, [t1+myq] |
| cmovnz r13q, [base+subpel_filters+myq*8] |
| REPX {paddd x, m5}, m1, m2, m3 |
| REPX {psrad x, xm6}, m1, m2, m3 |
| packssdw m1, m2 ; 4 5 |
| packssdw m3, m3 ; 6 6 |
| SWAP m9, m1 |
| shufps m4, m7, m8, q1032 ; 1 2 |
| shufps m5, m8, m9, q1032 ; 3 4 |
| shufps m6, m9, m3, q1032 ; 5 6 |
| punpcklwd m0, m7, m4 ; 01 |
| punpckhwd m7, m4 ; 12 |
| punpcklwd m1, m8, m5 ; 23 |
| punpckhwd m8, m5 ; 34 |
| punpcklwd m2, m9, m6 ; 45 |
| punpckhwd m9, m6 ; 56 |
| movq m10, r13 |
| mova [stk+0x00], m1 |
| mova [stk+0x10], m8 |
| mova [stk+0x20], m2 |
| mova [stk+0x30], m9 |
| mova [stk+0x40], m3 |
| %define hrnd_mem [rsp+0x10] |
| %define hsh_mem [rsp+0x20] |
| %define vsh_mem [rsp+0x28] |
| %if isput |
| %define vrnd_mem [rsp+0x30] |
| %else |
| %define vrnd_mem [base+pd_m524256] |
| %endif |
| %else |
| mova [stk+0x20], m12 |
| mova [stk+0x30], m14 |
| add r4, srcq |
| MC_4TAP_SCALED_H 0x60 ; 0 1 |
| MC_4TAP_SCALED_H 0x70 ; 2 3 |
| MC_4TAP_SCALED_H 0x80 ; 4 5 |
| movu m7, [srcq] |
| movu m2, [r4] |
| add srcq, ssq |
| add r4, ssq |
| mov [stk+0xb0], r4 |
| pshufb m7, m12 |
| pshufb m2, m14 |
| pmaddwd m7, m13 |
| pmaddwd m2, m15 |
| phaddd m7, m2 |
| paddd m7, [esp+0x00] |
| psrad m7, [esp+0x10] |
| packssdw m7, m7 ; 6 6 |
| mova m4, [stk+0x60] |
| mova m5, [stk+0x70] |
| mova m6, [stk+0x80] |
| mov myd, mym |
| mov rX, [esp+0x1f4] |
| xor r5, r5 |
| shr myd, 6 |
| lea rX, [rX+myd] |
| mov r4, 64 << 24 |
| cmovnz r4, [base+subpel_filters+rX*8+0] |
| cmovnz r5, [base+subpel_filters+rX*8+4] |
| mov r3, r3m |
| shufps m1, m4, m5, q1032 ; 1 2 |
| shufps m2, m5, m6, q1032 ; 3 4 |
| shufps m3, m6, m7, q1032 ; 5 6 |
| mova [stk+0xa0], m7 |
| punpcklwd m0, m4, m1 ; 01 |
| punpckhwd m4, m1 ; 12 |
| punpcklwd m1, m5, m2 ; 23 |
| punpckhwd m5, m2 ; 34 |
| punpcklwd m2, m6, m3 ; 45 |
| punpckhwd m6, m3 ; 56 |
| movd m7, r4 |
| movd m3, r5 |
| mov r0, r0m |
| %if isput |
| mov r1, r1m |
| %endif |
| mov r4, [stk+0xb0] |
| mova [stk+0xc0], m4 ; 12 |
| mova [stk+0x60], m1 ; 23 |
| mova [stk+0x70], m2 ; 45 |
| mova [stk+0x80], m5 ; 34 |
| mova [stk+0x90], m6 ; 56 |
| %define m12 [stk+0x20] |
| %define m14 [stk+0x30] |
| %define m13 [stk+0x40] |
| %define m15 [stk+0x50] |
| %define hrnd_mem [esp+0x00] |
| %define hsh_mem [esp+0x10] |
| %define vsh_mem [esp+0x18] |
| %if isput |
| %define vrnd_mem [esp+0x20] |
| %else |
| %define vrnd_mem [base+pd_m524256] |
| %endif |
| %define m10 m7 |
| punpckldq m10, m3 |
| %endif |
| punpcklbw m10, m10 |
| psraw m10, 8 |
| pshufd m3, m10, q0000 |
| pshufd m4, m10, q1111 |
| pshufd m5, m10, q2222 |
| pshufd m10, m10, q3333 |
| %if ARCH_X86_32 |
| %xdefine m8 m3 |
| %xdefine m9 m6 |
| %xdefine m11 m5 |
| %xdefine m6 m4 |
| mova [stk+0x100], m3 |
| mova [stk+0x110], m4 |
| mova [stk+0x120], m5 |
| mova [stk+0x130], m10 |
| %define m3 [stk+0x100] |
| %define m4 [stk+0x110] |
| %define m5 [stk+0x120] |
| %define m10 [stk+0x130] |
| mova m7, [stk+0xc0] |
| mova m8, [stk+0x80] |
| %endif |
| .dy1_w4_loop: |
| movu m11, [srcq+ssq*0] |
| movu m6, [srcq+ssq*1] |
| pmaddwd m0, m3 |
| pmaddwd m7, m3 |
| pmaddwd m1, m4 |
| pmaddwd m8, m4 |
| pmaddwd m2, m5 |
| pmaddwd m9, m5 |
| paddd m1, m0 |
| paddd m8, m7 |
| %if ARCH_X86_64 |
| movu m0, [srcq+r4] |
| movu m7, [srcq+r6] |
| %else |
| movu m0, [r4+ssq*0] |
| movu m7, [r4+ssq*1] |
| lea r4, [r4+ssq*2] |
| %endif |
| lea srcq, [srcq+ssq*2] |
| paddd m1, m2 |
| paddd m8, m9 |
| pshufb m11, m12 |
| pshufb m6, m12 |
| pmaddwd m11, m13 |
| pmaddwd m6, m13 |
| pshufb m0, m14 |
| pshufb m7, m14 |
| pmaddwd m0, m15 |
| pmaddwd m7, m15 |
| phaddd m11, m0 |
| phaddd m6, m7 |
| paddd m11, hrnd_mem |
| paddd m6, hrnd_mem |
| psrad m11, hsh_mem |
| psrad m6, hsh_mem |
| packssdw m11, m6 ; 7 8 |
| %if ARCH_X86_64 |
| shufps m9, [stk+0x40], m11, q1032 ; 6 7 |
| mova m0, [stk+0x00] |
| mova [stk+0x40], m11 |
| %else |
| shufps m9, [stk+0xa0], m11, q1032 ; 6 7 |
| mova m0, [stk+0x60] |
| mova [stk+0xa0], m11 |
| %endif |
| punpcklwd m2, m9, m11 ; 67 |
| punpckhwd m9, m11 ; 78 |
| pmaddwd m6, m2, m10 |
| pmaddwd m7, m9, m10 |
| %if isput |
| movd m11, vsh_mem |
| %endif |
| paddd m1, vrnd_mem |
| paddd m8, vrnd_mem |
| paddd m1, m6 |
| paddd m8, m7 |
| %if ARCH_X86_64 |
| mova m7, [stk+0x10] |
| %else |
| mova m7, [stk+0x80] |
| %endif |
| %if isput |
| psrad m1, m11 |
| psrad m8, m11 |
| %else |
| psrad m1, 6 |
| psrad m8, 6 |
| %endif |
| packssdw m1, m8 |
| %if ARCH_X86_64 |
| mova m8, [stk+0x30] |
| %else |
| mova m8, [stk+0x90] |
| %endif |
| %if isput |
| pxor m6, m6 |
| pmaxsw m1, m6 |
| pminsw m1, pxmaxm |
| movq [dstq+dsq*0], m1 |
| movhps [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| %else |
| mova [tmpq], m1 |
| add tmpq, 16 |
| %endif |
| %if ARCH_X86_64 |
| mova m1, [stk+0x20] |
| mova [stk+0x10], m8 |
| mova [stk+0x00], m1 |
| mova [stk+0x20], m2 |
| mova [stk+0x30], m9 |
| %else |
| mova m1, [stk+0x70] |
| mova [stk+0x80], m8 |
| mova [stk+0x60], m1 |
| mova [stk+0x70], m2 |
| mova [stk+0x90], m9 |
| %endif |
| sub hd, 2 |
| jg .dy1_w4_loop |
| MC_8TAP_SCALED_RET ; why not jz .ret? |
| INIT_XMM ssse3 |
| .dy1_w8: |
| mov dword [stk+0xf0], 1 |
| movifprep tmp_stridem, 16 |
| jmp .dy1_w_start |
| .dy1_w16: |
| mov dword [stk+0xf0], 2 |
| movifprep tmp_stridem, 32 |
| jmp .dy1_w_start |
| .dy1_w32: |
| mov dword [stk+0xf0], 4 |
| movifprep tmp_stridem, 64 |
| jmp .dy1_w_start |
| .dy1_w64: |
| mov dword [stk+0xf0], 8 |
| movifprep tmp_stridem, 128 |
| jmp .dy1_w_start |
| .dy1_w128: |
| mov dword [stk+0xf0], 16 |
| movifprep tmp_stridem, 256 |
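; shared setup for the dy1 w >= 8 paths, analogous to .w_start above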
| .dy1_w_start: |
| mov myd, mym |
| %if ARCH_X86_64 |
| %ifidn %1, put |
| movifnidn dsm, dsq |
| %endif |
| mova [rsp+0x10], m11 |
| mova [rsp+0x20], m12 |
| %define hround m11 |
| %if isput |
| mova [rsp+0x30], m13 |
| %else |
| mova m13, [base+pd_m524256] |
| %endif |
| shr t0d, 16 |
| shr myd, 6 |
| mov r4d, 64 << 24 |
| lea myd, [t1+myq] |
| cmovnz r4q, [base+subpel_filters+myq*8] |
| movd m15, t0d |
| %else |
| %define hround [esp+0x00] |
| %define m12 [esp+0x10] |
| %define m10 [base+pd_0x3ff] |
| %define m8 m0 |
| %xdefine m14 m4 |
| %xdefine m15 m3 |
| %if isprep |
| %define ssq ssm |
| %endif |
| mov r5, [esp+0x1f0] |
| mov r3, [esp+0x1f4] |
| shr r5, 16 |
| movd m15, r5 |
| xor r5, r5 |
| shr myd, 6 |
| lea r3, [r3+myd] |
| mov r4, 64 << 24 |
| cmovnz r4, [base+subpel_filters+r3*8+0] |
| cmovnz r5, [base+subpel_filters+r3*8+4] |
| mov r0, r0m |
| mov r3, r3m |
| %endif |
| sub srcq, 6 |
| pslld m7, m8, 2 ; dx*4 |
| pmaddwd m8, [base+rescale_mul] ; dx*[0-3] |
| pshufd m15, m15, q0000 |
| paddd m14, m8 ; mx+dx*[0-3] |
| %if ARCH_X86_64 |
| movq m3, r4q |
| %else |
| movd m5, r4 |
| movd m6, r5 |
| punpckldq m5, m6 |
| SWAP m3, m5 |
| %endif |
| punpcklbw m3, m3 |
| psraw m3, 8 |
| mova [stk+0x100], m7 |
| mova [stk+0x120], m15 |
| mov [stk+0x0f8], srcq |
| mov [stk+0x130], r0q ; dstq / tmpq |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| %if ARCH_X86_64 |
| mova [stk+0x140], m0 |
| mova [stk+0x150], m1 |
| mova [stk+0x160], m2 |
| mova [stk+0x170], m3 |
| %if UNIX64 |
| mov hm, hd |
| %endif |
| %else |
| mova [stk+0x180], m0 |
| mova [stk+0x190], m1 |
| mova [stk+0x1a0], m2 |
| mova [stk+0x1b0], m3 |
| SWAP m5, m3 |
| mov r5, hm |
| mov [stk+0x134], r5 |
| %endif |
| jmp .dy1_hloop |
| .dy1_hloop_prep: |
| dec dword [stk+0x0f0] |
| jz .ret |
| %if ARCH_X86_64 |
| add qword [stk+0x130], 16 |
| mov hd, hm |
| %else |
| add dword [stk+0x130], 16 |
| mov r5, [stk+0x134] |
| mov r0, [stk+0x130] |
| %endif |
| mova m7, [stk+0x100] |
| mova m14, [stk+0x110] |
| %if ARCH_X86_64 |
| mova m10, [base+pd_0x3ff] |
| mova m11, [rsp+0x10] |
| %endif |
| mova m15, [stk+0x120] |
| mov srcq, [stk+0x0f8] |
| %if ARCH_X86_64 |
| mov r0q, [stk+0x130] ; dstq / tmpq |
| %else |
| mov hm, r5 |
| mov r0m, r0 |
| mov r3, r3m |
| %endif |
| paddd m14, m7 |
| .dy1_hloop: |
| %if ARCH_X86_64 |
| mova m9, [base+pq_0x40000000] |
| %else |
| %define m9 [base+pq_0x40000000] |
| %endif |
| pxor m1, m1 |
| psrld m2, m14, 10 |
| mova [stk], m2 |
| pand m6, m14, m10 |
| psrld m6, 6 |
| paddd m5, m15, m6 |
| pcmpeqd m6, m1 |
| pshufd m2, m5, q1032 |
| %if ARCH_X86_64 |
| movd r4d, m5 |
| movd r6d, m2 |
| pshufd m5, m5, q0321 |
| pshufd m2, m2, q0321 |
| movd r7d, m5 |
| movd r9d, m2 |
| movq m0, [base+subpel_filters+r4*8] |
| movq m1, [base+subpel_filters+r6*8] |
| movhps m0, [base+subpel_filters+r7*8] |
| movhps m1, [base+subpel_filters+r9*8] |
| %else |
| movd r0, m5 |
| movd rX, m2 |
| pshufd m5, m5, q0321 |
| pshufd m2, m2, q0321 |
| movd r4, m5 |
| movd r5, m2 |
| movq m0, [base+subpel_filters+r0*8] |
| movq m1, [base+subpel_filters+rX*8] |
| movhps m0, [base+subpel_filters+r4*8] |
| movhps m1, [base+subpel_filters+r5*8] |
| %endif |
| paddd m14, m7 ; mx+dx*[4-7] |
| pand m5, m14, m10 |
| psrld m5, 6 |
| paddd m15, m5 |
| pxor m2, m2 |
| pcmpeqd m5, m2 |
| mova [stk+0x110], m14 |
| pshufd m4, m15, q1032 |
| %if ARCH_X86_64 |
| movd r10d, m15 |
| movd r11d, m4 |
| pshufd m15, m15, q0321 |
| pshufd m4, m4, q0321 |
| movd r13d, m15 |
| movd rXd, m4 |
| movq m2, [base+subpel_filters+r10*8] |
| movq m3, [base+subpel_filters+r11*8] |
| movhps m2, [base+subpel_filters+r13*8] |
| movhps m3, [base+subpel_filters+ rX*8] |
| psrld m14, 10 |
| movq r11, m14 |
| punpckhqdq m14, m14 |
| movq rX, m14 |
| mov r10d, r11d |
| shr r11, 32 |
| mov r13d, rXd |
| shr rX, 32 |
| mov r4d, [stk+ 0] |
| mov r6d, [stk+ 4] |
| mov r7d, [stk+ 8] |
| mov r9d, [stk+12] |
| pshufd m4, m6, q1100 |
| pshufd m6, m6, q3322 |
| pshufd m14, m5, q1100 |
| pshufd m5, m5, q3322 |
| pand m7, m9, m4 |
| pand m8, m9, m6 |
| pand m15, m9, m14 |
| pand m9, m9, m5 |
| pandn m4, m0 |
| pandn m6, m1 |
| pandn m14, m2 |
| pandn m5, m3 |
| por m7, m4 |
| por m8, m6 |
| por m15, m14 |
| por m9, m5 |
| punpcklbw m0, m7, m7 |
| punpckhbw m7, m7 |
| punpcklbw m1, m8, m8 |
| punpckhbw m8, m8 |
| psraw m0, 8 |
| psraw m7, 8 |
| psraw m1, 8 |
| psraw m8, 8 |
| punpcklbw m2, m15, m15 |
| punpckhbw m15, m15 |
| punpcklbw m3, m9, m9 |
| punpckhbw m9, m9 |
| psraw m2, 8 |
| psraw m15, 8 |
| psraw m3, 8 |
| psraw m9, 8 |
| mova [stk+0x10], m0 |
| mova [stk+0x20], m7 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m8 |
| mova [stk+0x50], m2 |
| mova [stk+0x60], m15 |
| mova [stk+0x70], m3 |
| mova [stk+0x80], m9 |
| MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 |
| mova [stk+0x90], m1 |
| MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 |
| mova [stk+0xa0], m2 |
| MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 |
| mova [stk+0xb0], m3 |
| MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 |
| mova [stk+0xc0], m4 |
| MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 |
| mova [stk+0xd0], m5 |
| MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 |
| MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 |
| MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 |
| mova m5, [stk+0xd0] |
| mova m1, [stk+0x90] |
| mova m2, [stk+0xa0] |
| mova m3, [stk+0xb0] |
| mova m9, [stk+0xc0] |
| punpcklwd m4, m5, m6 ; 45a |
| punpckhwd m5, m6 ; 45b |
| punpcklwd m6, m7, m8 ; 67a |
| punpckhwd m7, m8 ; 67b |
| punpcklwd m0, m1, m2 ; 01a |
| punpckhwd m1, m2 ; 01b |
| punpcklwd m2, m3, m9 ; 23a |
| punpckhwd m3, m9 ; 23b |
| mova m10, [stk+0x140] |
| mova m11, [stk+0x150] |
| mova m14, [stk+0x160] |
| mova m15, [stk+0x170] |
| mova [stk+0x90], m4 |
| mova [stk+0xa0], m5 |
| mova [stk+0xb0], m6 |
| mova [stk+0xc0], m7 |
| %define hround [rsp+0x10] |
| %define shift [rsp+0x20] |
| %if isput |
| %define vround [rsp+0x30] |
| %else |
| %define vround [base+pd_m524256] |
| %endif |
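| ; .dy1_vloop: vertical 8-tap MAC per output row; the 01/23 row pairs live in m0-m3 and |
| ; the 45/67 pairs on the stack, and after each row the window is shifted down by one |
| ; source row (dy1 = one source row per output row). |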
| .dy1_vloop: |
| pmaddwd m4, m0, m10 |
| pmaddwd m5, m1, m10 |
| pmaddwd m6, m2, m11 |
| pmaddwd m7, m3, m11 |
| paddd m4, m13 |
| paddd m5, m13 |
| paddd m4, m6 |
| paddd m5, m7 |
| pmaddwd m6, [stk+0x90], m14 |
| pmaddwd m7, [stk+0xa0], m14 |
| pmaddwd m8, [stk+0xb0], m15 |
| pmaddwd m9, [stk+0xc0], m15 |
| paddd m4, m6 |
| paddd m5, m7 |
| %if isput |
| pshufd m6, m12, q1032 |
| %endif |
| paddd m4, m8 |
| paddd m5, m9 |
| %else |
| movd r0, m15 |
| movd rX, m4 |
| pshufd m15, m15, q0321 |
| pshufd m4, m4, q0321 |
| movd r4, m15 |
| movd r5, m4 |
| mova m14, [stk+0x110] |
| movq m2, [base+subpel_filters+r0*8] |
| movq m3, [base+subpel_filters+rX*8] |
| movhps m2, [base+subpel_filters+r4*8] |
| movhps m3, [base+subpel_filters+r5*8] |
| psrld m14, 10 |
| mova [stk+16], m14 |
| mov r0, [stk+ 0] |
| mov rX, [stk+ 4] |
| mov r4, [stk+ 8] |
| mov r5, [stk+12] |
| mova [stk+0x20], m0 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| pshufd m4, m6, q1100 |
| pshufd m6, m6, q3322 |
| pshufd m7, m5, q1100 |
| pshufd m5, m5, q3322 |
| pand m0, m9, m4 |
| pand m1, m9, m6 |
| pand m2, m9, m7 |
| pand m3, m9, m5 |
| pandn m4, [stk+0x20] |
| pandn m6, [stk+0x30] |
| pandn m7, [stk+0x40] |
| pandn m5, [stk+0x50] |
| por m0, m4 |
| por m1, m6 |
| por m2, m7 |
| por m3, m5 |
| punpcklbw m4, m0, m0 |
| punpckhbw m0, m0 |
| punpcklbw m5, m1, m1 |
| punpckhbw m1, m1 |
| psraw m4, 8 |
| psraw m0, 8 |
| psraw m5, 8 |
| psraw m1, 8 |
| punpcklbw m6, m2, m2 |
| punpckhbw m2, m2 |
| punpcklbw m7, m3, m3 |
| punpckhbw m3, m3 |
| psraw m6, 8 |
| psraw m2, 8 |
| psraw m7, 8 |
| psraw m3, 8 |
| mova [stk+0x0a0], m4 |
| mova [stk+0x0b0], m0 |
| mova [stk+0x0c0], m5 |
| mova [stk+0x0d0], m1 |
| mova [stk+0x140], m6 |
| mova [stk+0x150], m2 |
| mova [stk+0x160], m7 |
| mova [stk+0x170], m3 |
| MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 |
| MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 |
| MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 |
| MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 |
| MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 |
| MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 |
| MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 |
| MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 |
| mova m5, [stk+0x60] |
| mova m6, [stk+0x70] |
| mova m7, [stk+0x80] |
| mova m0, [stk+0x90] |
| mov r0, r0m |
| punpcklwd m4, m5, m6 ; 45a |
| punpckhwd m5, m6 ; 45b |
| punpcklwd m6, m7, m0 ; 67a |
| punpckhwd m7, m0 ; 67b |
| mova [stk+0x60], m4 |
| mova [stk+0x70], m5 |
| mova [stk+0x80], m6 |
| mova [stk+0x90], m7 |
| mova m1, [stk+0x20] |
| mova m2, [stk+0x30] |
| mova m3, [stk+0x40] |
| mova m4, [stk+0x50] |
| punpcklwd m0, m1, m2 ; 01a |
| punpckhwd m1, m2 ; 01b |
| punpcklwd m2, m3, m4 ; 23a |
| punpckhwd m3, m4 ; 23b |
| mova m4, [stk+0x180] |
| mova m5, [stk+0x190] |
| mova m6, [stk+0x1a0] |
| mova m7, [stk+0x1b0] |
| mova [stk+0x20], m0 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| .dy1_vloop: |
| pmaddwd m0, m4 |
| pmaddwd m1, m4 |
| pmaddwd m2, m5 |
| pmaddwd m3, m5 |
| paddd m0, m2 |
| paddd m1, m3 |
| pmaddwd m2, [stk+0x60], m6 |
| pmaddwd m3, [stk+0x70], m6 |
| pmaddwd m4, [stk+0x80], m7 |
| pmaddwd m5, [stk+0x90], m7 |
| %if isput |
| movd m6, [esp+0x18] |
| %endif |
| paddd m0, m2 |
| paddd m1, m3 |
| paddd m0, vrnd_mem |
| paddd m1, vrnd_mem |
| paddd m4, m0 |
| paddd m5, m1 |
| %endif |
| %ifidn %1, put |
| psrad m4, m6 |
| psrad m5, m6 |
| packssdw m4, m5 |
| pxor m7, m7 |
| pmaxsw m4, m7 |
| pminsw m4, pxmaxm |
| mova [dstq], m4 |
| add dstq, dsm |
| %else |
| psrad m4, 6 |
| psrad m5, 6 |
| packssdw m4, m5 |
| mova [tmpq], m4 |
| add tmpq, tmp_stridem |
| %endif |
| dec hd |
| jz .dy1_hloop_prep |
| %if ARCH_X86_64 |
| movu m8, [srcq+r10*2] |
| movu m9, [srcq+r11*2] |
| movu m12, [srcq+r13*2] |
| movu m13, [srcq+ rX*2] |
| movu m4, [srcq+ r4*2] |
| movu m5, [srcq+ r6*2] |
| movu m6, [srcq+ r7*2] |
| movu m7, [srcq+ r9*2] |
| add srcq, ssq |
| pmaddwd m8, [stk+0x50] |
| pmaddwd m9, [stk+0x60] |
| pmaddwd m12, [stk+0x70] |
| pmaddwd m13, [stk+0x80] |
| pmaddwd m4, [stk+0x10] |
| pmaddwd m5, [stk+0x20] |
| pmaddwd m6, [stk+0x30] |
| pmaddwd m7, [stk+0x40] |
| phaddd m8, m9 |
| phaddd m12, m13 |
| mova m9, [base+unpckw] |
| mova m13, hround |
| phaddd m4, m5 |
| phaddd m6, m7 |
| phaddd m8, m12 |
| phaddd m4, m6 |
| pshufd m5, m9, q1032 |
| pshufb m0, m9 ; 0a 1a |
| pshufb m1, m9 ; 0b 1b |
| pshufb m2, m5 ; 3a 2a |
| pshufb m3, m5 ; 3b 2b |
| mova m12, shift |
| paddd m4, m13 |
| paddd m8, m13 |
| psrad m4, m12 |
| psrad m8, m12 |
| packssdw m4, m8 |
| pshufb m6, [stk+0x90], m9 ; 4a 5a |
| pshufb m7, [stk+0xa0], m9 ; 4b 5b |
| pshufb m8, [stk+0xb0], m5 ; 7a 6a |
| pshufb m13, [stk+0xc0], m5 ; 7b 6b |
| punpckhwd m0, m2 ; 12a |
| punpckhwd m1, m3 ; 12b |
| punpcklwd m2, m6 ; 34a |
| punpcklwd m3, m7 ; 34b |
| punpckhwd m6, m8 ; 56a |
| punpckhwd m7, m13 ; 56b |
| punpcklwd m8, m4 ; 78a |
| punpckhqdq m4, m4 |
| punpcklwd m13, m4 ; 78b |
| mova [stk+0x90], m6 |
| mova [stk+0xa0], m7 |
| mova [stk+0xb0], m8 |
| mova [stk+0xc0], m13 |
| mova m13, vround |
| %else |
| mov r0m, r0 |
| mov r3, r3m |
| mov r0, [stk+ 0] |
| mov rX, [stk+ 4] |
| mov r4, [stk+ 8] |
| mov r5, [stk+12] |
| MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 |
| mova m7, [base+unpckw] |
| pshufd m4, m7, q1032 |
| pshufb m0, [stk+0x20], m7 ; 0a 1a |
| pshufb m1, [stk+0x30], m7 ; 0b 1b |
| pshufb m2, [stk+0x40], m4 ; 3a 2a |
| pshufb m3, [stk+0x50], m4 ; 3b 2b |
| pshufb m5, [stk+0x60], m7 ; 4a 5a |
| pshufb m6, [stk+0x70], m7 ; 4b 5b |
| pshufb m7, [stk+0x80], m4 ; 7a 6a |
| punpckhwd m0, m2 ; 12a |
| punpckhwd m1, m3 ; 12b |
| punpcklwd m2, m5 ; 34a |
| punpcklwd m3, m6 ; 34b |
| mova [stk+0x20], m0 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| punpckhwd m5, m7 ; 56a |
| mova [stk+0x60], m5 |
| pshufb m5, [stk+0x90], m4 ; 7b 6b |
| punpcklwd m7, [stk+0xe0] ; 78a |
| mova m4, [stk+0x180] |
| punpckhwd m6, m5 ; 56b |
| mova [stk+0x70], m6 |
| movq m6, [stk+0xe8] |
| mova [stk+0x80], m7 |
| mova m7, [stk+0x1b0] |
| punpcklwd m5, m6 |
| mova m6, [stk+0x1a0] |
| mova [stk+0x90], m5 |
| mova m5, [stk+0x190] |
| mov r0, r0m |
| %endif |
| jmp .dy1_vloop |
| INIT_XMM ssse3 |
| %if ARCH_X86_64 |
| %define stk rsp+0x20 |
| %endif |
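| ; .dy2: vertical scaling path taken when the step is exactly two source rows per |
| ; output row; the horizontal setup mirrors the .dy1 path above. |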
| .dy2: |
| movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] |
| add wq, base_reg |
| jmp wq |
| %if isput |
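| ; .dy2_w2: 2-pixel-wide output (only the put variant has a w == 2 case, hence the guard) |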
| .dy2_w2: |
| %if ARCH_X86_64 |
| mov myd, mym |
| mova [rsp+0x10], m13 |
| %define vrnd_mem [rsp+0x10] |
| movzx t0d, t0b |
| sub srcq, 2 |
| movd m15, t0d |
| %else |
| %define m8 m0 |
| %define m9 m1 |
| %define m14 m4 |
| %define m15 m3 |
| %define m11 [esp+0x00] |
| %define m12 [esp+0x10] |
| %define vrnd_mem [esp+0x20] |
| mov r1, r1m |
| movzx r5, byte [esp+0x1f0] |
| sub srcq, 2 |
| movd m15, r5 |
| %endif |
| pxor m9, m9 |
| punpckldq m9, m8 |
| paddd m14, m9 ; mx+dx*[0-1] |
| %if ARCH_X86_64 |
| mova m9, [base+pd_0x4000] |
| %endif |
| pshufd m15, m15, q0000 |
| pand m8, m14, m10 |
| psrld m8, 6 |
| paddd m15, m8 |
| movd r4d, m15 |
| pshufd m15, m15, q0321 |
| %if ARCH_X86_64 |
| movd r6d, m15 |
| %else |
| movd r3d, m15 |
| %endif |
| mova m5, [base+bdct_lb_q] |
| mova m6, [base+spel_s_shuf2] |
| movd m15, [base+subpel_filters+r4*8+2] |
| %if ARCH_X86_64 |
| movd m7, [base+subpel_filters+r6*8+2] |
| %else |
| movd m7, [base+subpel_filters+r3*8+2] |
| %endif |
| pxor m2, m2 |
| pcmpeqd m8, m2 |
| psrld m14, 10 |
| paddd m14, m14 |
| %if ARCH_X86_32 |
| mov r3, r3m |
| pshufb m14, m5 |
| paddb m14, m6 |
| mova [stk], m14 |
| SWAP m5, m0 |
| SWAP m6, m3 |
| %define m15 m6 |
| %endif |
| movu m0, [srcq+ssq*0] |
| movu m1, [srcq+ssq*2] |
| movu m2, [srcq+ssq*4] |
| punpckldq m15, m7 |
| %if ARCH_X86_64 |
| pshufb m14, m5 |
| paddb m14, m6 |
| pand m9, m8 |
| pandn m8, m15 |
| SWAP m15, m8 |
| por m15, m9 |
| movu m4, [srcq+ssq*1] |
| movu m5, [srcq+ss3q ] |
| lea srcq, [srcq+ssq*4] |
| movu m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| shr myd, 6 |
| mov r4d, 64 << 24 |
| lea myd, [t1+myq] |
| cmovnz r4q, [base+subpel_filters+myq*8] |
| %else |
| pand m7, m5, [base+pd_0x4000] |
| pandn m5, m15 |
| por m5, m7 |
| %define m15 m5 |
| mov myd, mym |
| mov r5, [esp+0x1f4] |
| xor r3, r3 |
| shr myd, 6 |
| lea r5, [r5+myd] |
| mov r4, 64 << 24 |
| cmovnz r4, [base+subpel_filters+r5*8+0] |
| cmovnz r3, [base+subpel_filters+r5*8+4] |
| mov [stk+0x20], r3 |
| mov r3, r3m |
| %endif |
| punpcklbw m15, m15 |
| psraw m15, 8 |
| REPX {pshufb x, m14}, m0, m1, m2 |
| REPX {pmaddwd x, m15}, m0, m1, m2 |
| %if ARCH_X86_64 |
| REPX {pshufb x, m14}, m4, m5, m6 |
| REPX {pmaddwd x, m15}, m4, m5, m6 |
| phaddd m0, m1 |
| phaddd m1, m2 |
| phaddd m4, m5 |
| phaddd m5, m6 |
| REPX {paddd x, m11}, m0, m1, m4, m5 |
| REPX {psrad x, m12}, m0, m1, m4, m5 |
| packssdw m0, m1 ; 0 2 2 4 |
| packssdw m4, m5 ; 1 3 3 5 |
| SWAP m2, m4 |
| movq m10, r4 |
| %else |
| mova [stk+0x10], m15 |
| phaddd m0, m1 |
| phaddd m1, m2 |
| movu m2, [srcq+ssq*1] |
| movu m7, [srcq+ss3q ] |
| lea srcq, [srcq+ssq*4] |
| movu m6, [srcq+ssq*1] |
| lea srcq, [srcq+ssq*2] |
| REPX {pshufb x, m14}, m2, m7, m6 |
| REPX {pmaddwd x, m15}, m2, m7, m6 |
| %define m14 [stk+0x00] |
| %define m15 [stk+0x10] |
| phaddd m2, m7 |
| phaddd m7, m6 |
| REPX {paddd x, m11}, m0, m1, m2, m7 |
| REPX {psrad x, m12}, m0, m1, m2, m7 |
| packssdw m0, m1 |
| packssdw m2, m7 |
| %define m8 m6 |
| %define m9 m4 |
| %define m10 m5 |
| movd m10, r4 |
| movd m9, [stk+0x20] |
| punpckldq m10, m9 |
| %endif |
| punpcklbw m10, m10 |
| psraw m10, 8 |
| pshufd m7, m10, q0000 |
| pshufd m8, m10, q1111 |
| pshufd m9, m10, q2222 |
| pshufd m10, m10, q3333 |
| %if ARCH_X86_32 |
| mova [stk+0x50], m7 |
| mova [stk+0x60], m8 |
| mova [stk+0x70], m9 |
| mova [stk+0x80], m10 |
| %xdefine m13 m7 |
| %define m7 [stk+0x50] |
| %define m8 [stk+0x60] |
| %define m9 [stk+0x70] |
| %define m10 [stk+0x80] |
| %endif |
| punpcklwd m1, m0, m2 ; 01 23 |
| punpckhwd m3, m0, m2 ; 23 45 |
| %if ARCH_X86_32 |
| mov r4, r0m |
| %define dstq r4 |
| mova [stk+0x20], m3 |
| mova [stk+0x30], m0 |
| %endif |
| .dy2_w2_loop: |
| movu m4, [srcq+ssq*0] |
| movu m5, [srcq+ssq*1] |
| movu m6, [srcq+ssq*2] |
| movu m13, [srcq+ss3q ] |
| lea srcq, [srcq+ssq*4] |
| pmaddwd m3, m8 |
| REPX {pshufb x, m14}, m4, m5, m6, m13 |
| REPX {pmaddwd x, m15}, m4, m5, m6, m13 |
| phaddd m4, m5 |
| phaddd m6, m13 |
| pmaddwd m5, m1, m7 |
| paddd m4, m11 |
| paddd m6, m11 |
| psrad m4, m12 |
| psrad m6, m12 |
| packssdw m4, m6 ; 6 7 8 9 |
| paddd m5, m3 |
| pshufd m3, m4, q2200 |
| pshufd m4, m4, q3311 |
| palignr m3, m0, 12 ; 4 6 6 8 |
| palignr m4, m2, 12 ; 5 7 7 9 |
| mova m0, m3 |
| mova m2, m4 |
| punpcklwd m1, m3, m4 |
| punpckhwd m3, m4 |
| pmaddwd m6, m1, m9 |
| pmaddwd m4, m3, m10 |
| paddd m5, vrnd_mem |
| paddd m6, m4 |
| paddd m5, m6 |
| pshufd m4, m12, q1032 |
| pxor m6, m6 |
| psrad m5, m4 |
| packssdw m5, m5 |
| pmaxsw m5, m6 |
| pminsw m5, pxmaxm |
| movd [dstq+dsq*0], m5 |
| pshuflw m5, m5, q1032 |
| movd [dstq+dsq*1], m5 |
| lea dstq, [dstq+dsq*2] |
| sub hd, 2 |
| jg .dy2_w2_loop |
| RET |
| %endif |
| INIT_XMM ssse3 |
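| ; .dy2_w4: 4-pixel-wide output; the horizontal filter is reduced to its middle four |
| ; taps (dword load at subpel_filters+...*8+2). |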
| .dy2_w4: |
| %if ARCH_X86_64 |
| mov myd, mym |
| mova [rsp+0x10], m11 |
| mova [rsp+0x20], m12 |
| %if isput |
| mova [rsp+0x30], m13 |
| %define vrnd_mem [rsp+0x30] |
| %define stk rsp+0x40 |
| %else |
| %define vrnd_mem [base+pd_m524256] |
| %define stk rsp+0x30 |
| %endif |
| movzx t0d, t0b |
| sub srcq, 2 |
| movd m15, t0d |
| %else |
| %define m10 [base+pd_0x3ff] |
| %define m9 [base+pd_0x4000] |
| %define m8 m0 |
| %xdefine m14 m4 |
| %define m15 m3 |
| %if isprep |
| %define ssq r3 |
| %endif |
| movzx r5, byte [esp+0x1f0] |
| sub srcq, 2 |
| movd m15, r5 |
| %endif |
| pmaddwd m8, [base+rescale_mul] |
| %if ARCH_X86_64 |
| mova m9, [base+pd_0x4000] |
| %endif |
| pshufd m15, m15, q0000 |
| paddd m14, m8 ; mx+dx*[0-3] |
| pand m0, m14, m10 |
| psrld m0, 6 |
| paddd m15, m0 |
| pshufd m7, m15, q1032 |
| %if ARCH_X86_64 |
| movd r4d, m15 |
| movd r11d, m7 |
| pshufd m15, m15, q0321 |
| pshufd m7, m7, q0321 |
| movd r6d, m15 |
| movd r13d, m7 |
| mova m10, [base+bdct_lb_q+ 0] |
| mova m11, [base+bdct_lb_q+16] |
| movd m13, [base+subpel_filters+ r4*8+2] |
| movd m2, [base+subpel_filters+ r6*8+2] |
| movd m15, [base+subpel_filters+r11*8+2] |
| movd m4, [base+subpel_filters+r13*8+2] |
| %else |
| movd r1, m15 |
| movd r4, m7 |
| pshufd m15, m15, q0321 |
| pshufd m7, m7, q0321 |
| movd r3, m15 |
| movd r5, m7 |
| mova m5, [base+bdct_lb_q+ 0] |
| mova m6, [base+bdct_lb_q+16] |
| movd m1, [base+subpel_filters+r1*8+2] |
| movd m2, [base+subpel_filters+r3*8+2] |
| movd m3, [base+subpel_filters+r4*8+2] |
| movd m7, [base+subpel_filters+r5*8+2] |
| SWAP m4, m7 |
| mov r3, r3m |
| %if isprep |
| lea ss3q, [ssq*3] |
| %endif |
| %define m10 m5 |
| %define m11 m6 |
| %define m12 m1 |
| %define m13 m1 |
| %endif |
| psrld m14, 10 |
| paddd m14, m14 |
| punpckldq m13, m2 |
| punpckldq m15, m4 |
| punpcklqdq m13, m15 |
| pxor m2, m2 |
| pcmpeqd m0, m2 |
| %if ARCH_X86_64 |
| pand m9, m0 |
| %else |
| pand m2, m9, m0 |
| %define m9 m2 |
| SWAP m7, m4 |
| %endif |
| pandn m0, m13 |
| %if ARCH_X86_64 |
| SWAP m13, m0 |
| %else |
| %define m13 m0 |
| %endif |
| por m13, m9 |
| punpckhbw m15, m13, m13 |
| punpcklbw m13, m13 |
| psraw m15, 8 |
| psraw m13, 8 |
| pshufb m12, m14, m10 |
| pshufb m14, m11 |
| mova m10, [base+spel_s_shuf2] |
| movd r4d, m14 |
| shr r4d, 24 |
| %if ARCH_X86_32 |
| mova [stk+0x40], m13 |
| mova [stk+0x50], m15 |
| pxor m2, m2 |
| %endif |
| pshufb m7, m14, m2 |
| psubb m14, m7 |
| paddb m12, m10 |
| paddb m14, m10 |
| %if ARCH_X86_64 |
| lea r6, [r4+ssq*1] |
| lea r11, [r4+ssq*2] |
| lea r13, [r4+ss3q ] |
| movu m1, [srcq+ssq*0] |
| movu m8, [srcq+ssq*2] |
| movu m9, [srcq+ssq*1] |
| movu m10, [srcq+ss3q ] |
| movu m7, [srcq+r4 ] |
| movu m2, [srcq+r11 ] |
| movu m3, [srcq+r6 ] |
| movu m4, [srcq+r13 ] |
| lea srcq, [srcq+ssq*4] |
| REPX {pshufb x, m12}, m1, m9, m8, m10 |
| REPX {pmaddwd x, m13}, m1, m9, m8, m10 |
| REPX {pshufb x, m14}, m7, m3, m2, m4 |
| REPX {pmaddwd x, m15}, m7, m3, m2, m4 |
| mova m5, [rsp+0x10] |
| movd xm6, [rsp+0x20] |
| phaddd m1, m7 |
| phaddd m8, m2 |
| phaddd m9, m3 |
| phaddd m10, m4 |
| movu m2, [srcq+ssq*0] |
| movu m3, [srcq+ssq*1] |
| REPX {paddd x, m5}, m1, m9, m8, m10 |
| REPX {psrad x, xm6}, m1, m9, m8, m10 |
| packssdw m1, m8 ; 0 2 |
| packssdw m9, m10 ; 1 3 |
| movu m0, [srcq+r4 ] |
| movu m8, [srcq+r6 ] |
| lea srcq, [srcq+ssq*2] |
| REPX {pshufb x, m12}, m2, m3 |
| REPX {pmaddwd x, m13}, m2, m3 |
| REPX {pshufb x, m14}, m0, m8 |
| REPX {pmaddwd x, m15}, m0, m8 |
| phaddd m2, m0 |
| phaddd m3, m8 |
| shr myd, 6 |
| mov r9d, 64 << 24 |
| lea myd, [t1+myq] |
| cmovnz r9q, [base+subpel_filters+myq*8] |
| REPX {paddd x, m5}, m2, m3 |
| REPX {psrad x, xm6}, m2, m3 |
| packssdw m2, m3 ; 4 5 |
| pshufd m3, m2, q1032 ; 5 _ |
| punpcklwd m0, m1, m9 ; 01 |
| punpckhwd m1, m9 ; 23 |
| punpcklwd m2, m3 ; 45 |
| movq m10, r9 |
| %define hrnd_mem [rsp+0x10] |
| %define hsh_mem [rsp+0x20] |
| %define vsh_mem [rsp+0x28] |
| %if isput |
| %define vrnd_mem [rsp+0x30] |
| %else |
| %define vrnd_mem [base+pd_m524256] |
| %endif |
| %else |
| mova [stk+0x20], m12 |
| mova [stk+0x30], m14 |
| add r4, srcq |
| MC_4TAP_SCALED_H 0x60 ; 0 1 |
| MC_4TAP_SCALED_H 0x70 ; 2 3 |
| MC_4TAP_SCALED_H 0x80 ; 4 5 |
| mov [stk+0xe0], r4 |
| mova m3, [base+spel_s_shuf8] |
| mova m0, [stk+0x60] |
| mova m1, [stk+0x70] |
| mova m2, [stk+0x80] |
| mov myd, mym |
| mov rX, [esp+0x1f4] |
| xor r5, r5 |
| shr myd, 6 |
| lea rX, [rX+myd] |
| mov r4, 64 << 24 |
| cmovnz r4, [base+subpel_filters+rX*8+0] |
| cmovnz r5, [base+subpel_filters+rX*8+4] |
| mov r3, r3m |
| pshufb m0, m3 ; 01 |
| pshufb m1, m3 ; 23 |
| pshufb m2, m3 ; 45 |
| movd m7, r4 |
| movd m4, r5 |
| mov r5, r0m |
| %if isput |
| mov r1, r1m |
| %endif |
| mov r4, [stk+0xe0] |
| %define dstq r5 |
| %define tmpq r5 |
| %define m12 [stk+0x20] |
| %define m14 [stk+0x30] |
| %define m13 [stk+0x40] |
| %define m15 [stk+0x50] |
| %define hrnd_mem [esp+0x00] |
| %define hsh_mem [esp+0x10] |
| %define vsh_mem [esp+0x18] |
| %if isput |
| %define vrnd_mem [esp+0x20] |
| %else |
| %define vrnd_mem [base+pd_m524256] |
| %endif |
| %define m10 m7 |
| punpckldq m10, m4 |
| %endif |
| punpcklbw m10, m10 |
| psraw m10, 8 |
| pshufd m3, m10, q0000 |
| pshufd m4, m10, q1111 |
| pshufd m5, m10, q2222 |
| pshufd m10, m10, q3333 |
| %if ARCH_X86_32 |
| %xdefine m8 m3 |
| %xdefine m9 m6 |
| %xdefine m11 m5 |
| %xdefine m6 m4 |
| mova [stk+0x100], m3 |
| mova [stk+0x110], m4 |
| mova [stk+0x120], m5 |
| mova [stk+0x130], m10 |
| %define m3 [stk+0x100] |
| %define m4 [stk+0x110] |
| %define m5 [stk+0x120] |
| %define m10 [stk+0x130] |
| %endif |
| .dy2_w4_loop: |
| pmaddwd m8, m0, m3 |
| pmaddwd m9, m1, m3 |
| mova m0, m2 |
| pmaddwd m1, m4 |
| pmaddwd m11, m2, m4 |
| paddd m8, vrnd_mem |
| paddd m9, vrnd_mem |
| pmaddwd m2, m5 |
| paddd m8, m1 |
| paddd m9, m11 |
| paddd m8, m2 |
| movu m6, [srcq+ssq*0] |
| movu m1, [srcq+ssq*2] |
| %if ARCH_X86_64 |
| movu m11, [srcq+r4 ] |
| movu m2, [srcq+r11] |
| %else |
| movu m11, [r4+ssq*0] |
| movu m2, [r4+ssq*2] |
| %endif |
| pshufb m6, m12 |
| pshufb m1, m12 |
| pmaddwd m6, m13 |
| pmaddwd m1, m13 |
| pshufb m11, m14 |
| pshufb m2, m14 |
| pmaddwd m11, m15 |
| pmaddwd m2, m15 |
| phaddd m6, m11 |
| phaddd m1, m2 |
| paddd m6, hrnd_mem |
| paddd m1, hrnd_mem |
| psrad m6, hsh_mem |
| psrad m1, hsh_mem |
| movu m7, [srcq+ssq*1] |
| movu m11, [srcq+ss3q ] |
| packssdw m6, m1 ; 6 8 |
| %if ARCH_X86_64 |
| movu m2, [srcq+r6 ] |
| movu m1, [srcq+r13] |
| %else |
| movu m2, [r4+ssq*1] |
| movu m1, [r4+ss3q ] |
| %endif |
| pshufb m7, m12 |
| pshufb m11, m12 |
| pmaddwd m7, m13 |
| pmaddwd m11, m13 |
| pshufb m2, m14 |
| pshufb m1, m14 |
| pmaddwd m2, m15 |
| pmaddwd m1, m15 |
| phaddd m7, m2 |
| phaddd m11, m1 |
| paddd m7, hrnd_mem |
| paddd m11, hrnd_mem |
| psrad m7, hsh_mem |
| psrad m11, hsh_mem |
| packssdw m7, m11 ; 7 9 |
| %if ARCH_X86_32 |
| lea r4, [r4+ssq*4] |
| %endif |
| lea srcq, [srcq+ssq*4] |
| punpcklwd m1, m6, m7 ; 67 |
| punpckhwd m6, m7 ; 89 |
| mova m2, m6 |
| pmaddwd m11, m1, m5 |
| pmaddwd m7, m1, m10 |
| pmaddwd m6, m10 |
| paddd m9, m11 |
| %if isput |
| movd m11, vsh_mem |
| %endif |
| paddd m8, m7 |
| paddd m9, m6 |
| %if isput |
| psrad m8, m11 |
| psrad m9, m11 |
| packssdw m8, m9 |
| pxor m7, m7 |
| pmaxsw m8, m7 |
| pminsw m8, pxmaxm |
| movq [dstq+dsq*0], m8 |
| movhps [dstq+dsq*1], m8 |
| lea dstq, [dstq+dsq*2] |
| %else |
| psrad m8, 6 |
| psrad m9, 6 |
| packssdw m8, m9 |
| mova [tmpq], m8 |
| add tmpq, 16 |
| %endif |
| sub hd, 2 |
| jg .dy2_w4_loop |
| MC_8TAP_SCALED_RET ; why not jz .ret? |
| INIT_XMM ssse3 |
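| ; w8..w128 share .dy2_w_start below: [stk+0xf0] holds the number of 8-column tiles |
| ; and tmp_stridem the prep output stride in bytes. |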
| .dy2_w8: |
| mov dword [stk+0xf0], 1 |
| movifprep tmp_stridem, 16 |
| jmp .dy2_w_start |
| .dy2_w16: |
| mov dword [stk+0xf0], 2 |
| movifprep tmp_stridem, 32 |
| jmp .dy2_w_start |
| .dy2_w32: |
| mov dword [stk+0xf0], 4 |
| movifprep tmp_stridem, 64 |
| jmp .dy2_w_start |
| .dy2_w64: |
| mov dword [stk+0xf0], 8 |
| movifprep tmp_stridem, 128 |
| jmp .dy2_w_start |
| .dy2_w128: |
| mov dword [stk+0xf0], 16 |
| movifprep tmp_stridem, 256 |
| .dy2_w_start: |
| mov myd, mym |
| %if ARCH_X86_64 |
| %ifidn %1, put |
| movifnidn dsm, dsq |
| %endif |
| mova [rsp+0x10], m11 |
| mova [rsp+0x20], m12 |
| %define hround m11 |
| %if isput |
| mova [rsp+0x30], m13 |
| %else |
| mova m13, [base+pd_m524256] |
| %endif |
| shr t0d, 16 |
| shr myd, 6 |
| mov r4d, 64 << 24 |
| lea myd, [t1+myq] |
| cmovnz r4q, [base+subpel_filters+myq*8] |
| movd m15, t0d |
| %else |
| %define hround [esp+0x00] |
| %define m12 [esp+0x10] |
| %define m10 [base+pd_0x3ff] |
| %define m8 m0 |
| %xdefine m14 m4 |
| %xdefine m15 m3 |
| %if isput |
| %define dstq r0 |
| %else |
| %define tmpq r0 |
| %define ssq ssm |
| %endif |
| mov r5, [esp+0x1f0] |
| mov r3, [esp+0x1f4] |
| shr r5, 16 |
| movd m15, r5 |
| xor r5, r5 |
| shr myd, 6 |
| lea r3, [r3+myd] |
| mov r4, 64 << 24 |
| cmovnz r4, [base+subpel_filters+r3*8+0] |
| cmovnz r5, [base+subpel_filters+r3*8+4] |
| mov r0, r0m |
| mov r3, r3m |
| %endif |
| sub srcq, 6 |
| pslld m7, m8, 2 ; dx*4 |
| pmaddwd m8, [base+rescale_mul] ; dx*[0-3] |
| pshufd m15, m15, q0000 |
| paddd m14, m8 ; mx+dx*[0-3] |
| %if ARCH_X86_64 |
| movq m3, r4q |
| %else |
| movd m5, r4 |
| movd m6, r5 |
| punpckldq m5, m6 |
| SWAP m3, m5 |
| %endif |
| punpcklbw m3, m3 |
| psraw m3, 8 |
| mova [stk+0x100], m7 |
| mova [stk+0x120], m15 |
| mov [stk+0x0f8], srcq |
| mov [stk+0x130], r0q ; dstq / tmpq |
| pshufd m0, m3, q0000 |
| pshufd m1, m3, q1111 |
| pshufd m2, m3, q2222 |
| pshufd m3, m3, q3333 |
| %if ARCH_X86_64 |
| mova [stk+0x140], m0 |
| mova [stk+0x150], m1 |
| mova [stk+0x160], m2 |
| mova [stk+0x170], m3 |
| %if UNIX64 |
| mov hm, hd |
| %endif |
| %else |
| mova [stk+0x180], m0 |
| mova [stk+0x190], m1 |
| mova [stk+0x1a0], m2 |
| mova [stk+0x1b0], m3 |
| SWAP m5, m3 |
| mov r5, hm |
| mov [stk+0x134], r5 |
| %endif |
| jmp .dy2_hloop |
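| ; .dy2_hloop_prep: same tile-advance logic as .dy1_hloop_prep above. |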
| .dy2_hloop_prep: |
| dec dword [stk+0x0f0] |
| jz .ret |
| %if ARCH_X86_64 |
| add qword [stk+0x130], 16 |
| mov hd, hm |
| %else |
| add dword [stk+0x130], 16 |
| mov r5, [stk+0x134] |
| mov r0, [stk+0x130] |
| %endif |
| mova m7, [stk+0x100] |
| mova m14, [stk+0x110] |
| %if ARCH_X86_64 |
| mova m10, [base+pd_0x3ff] |
| mova m11, [rsp+0x10] |
| %endif |
| mova m15, [stk+0x120] |
| mov srcq, [stk+0x0f8] |
| %if ARCH_X86_64 |
| mov r0q, [stk+0x130] ; dstq / tmpq |
| %else |
| mov hm, r5 |
| mov r0m, r0 |
| mov r3, r3m |
| %endif |
| paddd m14, m7 |
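| ; .dy2_hloop: per-tile horizontal pass, same per-column filter selection scheme as |
| ; .dy1_hloop. |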
| .dy2_hloop: |
| %if ARCH_X86_64 |
| mova m9, [base+pq_0x40000000] |
| %else |
| %define m9 [base+pq_0x40000000] |
| %endif |
| pxor m1, m1 |
| psrld m2, m14, 10 |
| mova [stk], m2 |
| pand m6, m14, m10 |
| psrld m6, 6 |
| paddd m5, m15, m6 |
| pcmpeqd m6, m1 |
| pshufd m2, m5, q1032 |
| %if ARCH_X86_64 |
| movd r4d, m5 |
| movd r6d, m2 |
| pshufd m5, m5, q0321 |
| pshufd m2, m2, q0321 |
| movd r7d, m5 |
| movd r9d, m2 |
| movq m0, [base+subpel_filters+r4*8] |
| movq m1, [base+subpel_filters+r6*8] |
| movhps m0, [base+subpel_filters+r7*8] |
| movhps m1, [base+subpel_filters+r9*8] |
| %else |
| movd r0, m5 |
| movd rX, m2 |
| pshufd m5, m5, q0321 |
| pshufd m2, m2, q0321 |
| movd r4, m5 |
| movd r5, m2 |
| movq m0, [base+subpel_filters+r0*8] |
| movq m1, [base+subpel_filters+rX*8] |
| movhps m0, [base+subpel_filters+r4*8] |
| movhps m1, [base+subpel_filters+r5*8] |
| %endif |
| paddd m14, m7 ; mx+dx*[4-7] |
| pand m5, m14, m10 |
| psrld m5, 6 |
| paddd m15, m5 |
| pxor m2, m2 |
| pcmpeqd m5, m2 |
| mova [stk+0x110], m14 |
| pshufd m4, m15, q1032 |
| %if ARCH_X86_64 |
| movd r10d, m15 |
| movd r11d, m4 |
| pshufd m15, m15, q0321 |
| pshufd m4, m4, q0321 |
| movd r13d, m15 |
| movd rXd, m4 |
| movq m2, [base+subpel_filters+r10*8] |
| movq m3, [base+subpel_filters+r11*8] |
| movhps m2, [base+subpel_filters+r13*8] |
| movhps m3, [base+subpel_filters+ rX*8] |
| psrld m14, 10 |
| movq r11, m14 |
| punpckhqdq m14, m14 |
| movq rX, m14 |
| mov r10d, r11d |
| shr r11, 32 |
| mov r13d, rXd |
| shr rX, 32 |
| mov r4d, [stk+ 0] |
| mov r6d, [stk+ 4] |
| mov r7d, [stk+ 8] |
| mov r9d, [stk+12] |
| pshufd m4, m6, q1100 |
| pshufd m6, m6, q3322 |
| pshufd m14, m5, q1100 |
| pshufd m5, m5, q3322 |
| pand m7, m9, m4 |
| pand m8, m9, m6 |
| pand m15, m9, m14 |
| pand m9, m9, m5 |
| pandn m4, m0 |
| pandn m6, m1 |
| pandn m14, m2 |
| pandn m5, m3 |
| por m7, m4 |
| por m8, m6 |
| por m15, m14 |
| por m9, m5 |
| punpcklbw m0, m7, m7 |
| punpckhbw m7, m7 |
| punpcklbw m1, m8, m8 |
| punpckhbw m8, m8 |
| psraw m0, 8 |
| psraw m7, 8 |
| psraw m1, 8 |
| psraw m8, 8 |
| punpcklbw m2, m15, m15 |
| punpckhbw m15, m15 |
| punpcklbw m3, m9, m9 |
| punpckhbw m9, m9 |
| psraw m2, 8 |
| psraw m15, 8 |
| psraw m3, 8 |
| psraw m9, 8 |
| mova [stk+0x10], m0 |
| mova [stk+0x20], m7 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m8 |
| mova [stk+0x50], m2 |
| mova [stk+0x60], m15 |
| mova [stk+0x70], m3 |
| mova [stk+0x80], m9 |
| MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 |
| mova [stk+0x90], m1 |
| MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 |
| mova [stk+0xa0], m2 |
| MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 |
| mova [stk+0xb0], m3 |
| MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 |
| mova [stk+0xc0], m4 |
| MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 |
| mova [stk+0xd0], m5 |
| MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 |
| MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 |
| MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 |
| mova m5, [stk+0xd0] |
| mova m1, [stk+0x90] |
| mova m2, [stk+0xa0] |
| mova m3, [stk+0xb0] |
| mova m9, [stk+0xc0] |
| punpcklwd m4, m5, m6 ; 45a |
| punpckhwd m5, m6 ; 45b |
| punpcklwd m6, m7, m8 ; 67a |
| punpckhwd m7, m8 ; 67b |
| punpcklwd m0, m1, m2 ; 01a |
| punpckhwd m1, m2 ; 01b |
| punpcklwd m2, m3, m9 ; 23a |
| punpckhwd m3, m9 ; 23b |
| mova m10, [stk+0x140] |
| mova m11, [stk+0x150] |
| mova m14, [stk+0x160] |
| mova m15, [stk+0x170] |
| mova [stk+0x90], m4 |
| mova [stk+0xa0], m5 |
| mova [stk+0xb0], m6 |
| mova [stk+0xc0], m7 |
| %define hround [rsp+0x10] |
| %define shift [rsp+0x20] |
| %if isput |
| %define vround [rsp+0x30] |
| %else |
| %define vround [base+pd_m524256] |
| %endif |
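| ; .dy2_vloop: same structure as .dy1_vloop, but two new source rows are filtered per |
| ; output row, so the row-pair window shifts down by two. |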
| .dy2_vloop: |
| pmaddwd m4, m0, m10 |
| pmaddwd m5, m1, m10 |
| pmaddwd m6, m2, m11 |
| pmaddwd m7, m3, m11 |
| paddd m4, m13 |
| paddd m5, m13 |
| paddd m4, m6 |
| paddd m5, m7 |
| pmaddwd m6, [stk+0x90], m14 |
| pmaddwd m7, [stk+0xa0], m14 |
| pmaddwd m8, [stk+0xb0], m15 |
| pmaddwd m9, [stk+0xc0], m15 |
| paddd m4, m6 |
| paddd m5, m7 |
| %if isput |
| pshufd m6, m12, q1032 |
| %endif |
| paddd m4, m8 |
| paddd m5, m9 |
| %else |
| movd r0, m15 |
| movd rX, m4 |
| pshufd m15, m15, q0321 |
| pshufd m4, m4, q0321 |
| movd r4, m15 |
| movd r5, m4 |
| mova m14, [stk+0x110] |
| movq m2, [base+subpel_filters+r0*8] |
| movq m3, [base+subpel_filters+rX*8] |
| movhps m2, [base+subpel_filters+r4*8] |
| movhps m3, [base+subpel_filters+r5*8] |
| psrld m14, 10 |
| mova [stk+16], m14 |
| mov r0, [stk+ 0] |
| mov rX, [stk+ 4] |
| mov r4, [stk+ 8] |
| mov r5, [stk+12] |
| mova [stk+0x20], m0 |
| mova [stk+0x30], m1 |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| pshufd m4, m6, q1100 |
| pshufd m6, m6, q3322 |
| pshufd m7, m5, q1100 |
| pshufd m5, m5, q3322 |
| pand m0, m9, m4 |
| pand m1, m9, m6 |
| pand m2, m9, m7 |
| pand m3, m9, m5 |
| pandn m4, [stk+0x20] |
| pandn m6, [stk+0x30] |
| pandn m7, [stk+0x40] |
| pandn m5, [stk+0x50] |
| por m0, m4 |
| por m1, m6 |
| por m2, m7 |
| por m3, m5 |
| punpcklbw m4, m0, m0 |
| punpckhbw m0, m0 |
| punpcklbw m5, m1, m1 |
| punpckhbw m1, m1 |
| psraw m4, 8 |
| psraw m0, 8 |
| psraw m5, 8 |
| psraw m1, 8 |
| punpcklbw m6, m2, m2 |
| punpckhbw m2, m2 |
| punpcklbw m7, m3, m3 |
| punpckhbw m3, m3 |
| psraw m6, 8 |
| psraw m2, 8 |
| psraw m7, 8 |
| psraw m3, 8 |
| mova [stk+0x0a0], m4 |
| mova [stk+0x0b0], m0 |
| mova [stk+0x0c0], m5 |
| mova [stk+0x0d0], m1 |
| mova [stk+0x140], m6 |
| mova [stk+0x150], m2 |
| mova [stk+0x160], m7 |
| mova [stk+0x170], m3 |
| MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 |
| MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 |
| MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 |
| MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 |
| MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 |
| MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 |
| MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 |
| MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 |
| mova m5, [stk+0x60] |
| mova m6, [stk+0x70] |
| mova m7, [stk+0x80] |
| mova m0, [stk+0x90] |
| mov r0, r0m |
| punpcklwd m4, m5, m6 ; 45a |
| punpckhwd m5, m6 ; 45b |
| punpcklwd m6, m7, m0 ; 67a |
| punpckhwd m7, m0 ; 67b |
| mova [stk+0x60], m4 |
| mova [stk+0x70], m5 |
| mova [stk+0x80], m6 |
| mova [stk+0x90], m7 |
| mova m1, [stk+0x20] |
| mova m2, [stk+0x30] |
| mova m3, [stk+0x40] |
| mova m4, [stk+0x50] |
| punpcklwd m0, m1, m2 ; 01a |
| punpckhwd m1, m2 ; 01b |
| punpcklwd m2, m3, m4 ; 23a |
| punpckhwd m3, m4 ; 23b |
| mova m4, [stk+0x180] |
| mova m5, [stk+0x190] |
| mova m6, [stk+0x1a0] |
| mova m7, [stk+0x1b0] |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| .dy2_vloop: |
| pmaddwd m0, m4 |
| pmaddwd m1, m4 |
| pmaddwd m2, m5 |
| pmaddwd m3, m5 |
| paddd m0, m2 |
| paddd m1, m3 |
| pmaddwd m2, [stk+0x60], m6 |
| pmaddwd m3, [stk+0x70], m6 |
| pmaddwd m4, [stk+0x80], m7 |
| pmaddwd m5, [stk+0x90], m7 |
| %if isput |
| movd m6, [esp+0x18] |
| %endif |
| paddd m0, m2 |
| paddd m1, m3 |
| paddd m0, vrnd_mem |
| paddd m1, vrnd_mem |
| paddd m4, m0 |
| paddd m5, m1 |
| %endif |
| %ifidn %1, put |
| psrad m4, m6 |
| psrad m5, m6 |
| packssdw m4, m5 |
| pxor m7, m7 |
| pmaxsw m4, m7 |
| pminsw m4, pxmaxm |
| mova [dstq], m4 |
| add dstq, dsm |
| %else |
| psrad m4, 6 |
| psrad m5, 6 |
| packssdw m4, m5 |
| mova [tmpq], m4 |
| add tmpq, tmp_stridem |
| %endif |
| dec hd |
| jz .dy2_hloop_prep |
| %if ARCH_X86_64 |
| MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1 |
| mova [stk+0xd0], m4 |
| MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1 |
| mova m4, [stk+0xd0] |
| mova m0, m2 ; 01a |
| mova m1, m3 ; 01b |
| mova m2, [stk+0x90] ; 23a |
| mova m3, [stk+0xa0] ; 23b |
| mova m5, [stk+0xb0] ; 45a |
| mova m6, [stk+0xc0] ; 45b |
| punpcklwd m7, m4, m8 ; 67a |
| punpckhwd m4, m8 ; 67b |
| mova [stk+0x90], m5 |
| mova [stk+0xa0], m6 |
| mova [stk+0xb0], m7 |
| mova [stk+0xc0], m4 |
| %else |
| mov r0m, r0 |
| mov r3, r3m |
| MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8 |
| MC_8TAP_SCALED_H 0xa0, 0 ; 9 |
| mova m7, [stk+0xe0] |
| mova m2, [stk+0x60] ; 23a |
| mova m3, [stk+0x70] ; 23b |
| mova m4, [stk+0x80] ; 45a |
| mova m5, [stk+0x90] ; 45b |
| punpcklwd m6, m7, m0 ; 67a |
| punpckhwd m7, m0 ; 67b |
| mova m0, [stk+0x40] ; 01a |
| mova m1, [stk+0x50] ; 01b |
| mova [stk+0x40], m2 |
| mova [stk+0x50], m3 |
| mova [stk+0x60], m4 |
| mova [stk+0x70], m5 |
| mova m4, [stk+0x180] |
| mova m5, [stk+0x190] |
| mova [stk+0x80], m6 |
| mova [stk+0x90], m7 |
| mova m6, [stk+0x1a0] |
| mova m7, [stk+0x1b0] |
| mov r0, r0m |
| %endif |
| jmp .dy2_vloop |
| INIT_XMM ssse3 |
| .ret: |
| MC_8TAP_SCALED_RET 0 |
| %if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT |
| %define r0m [rstk+stack_offset+ 4] |
| %define r1m [rstk+stack_offset+ 8] |
| %define r2m [rstk+stack_offset+12] |
| %define r3m [rstk+stack_offset+16] |
| %endif |
| %undef isput |
| %undef isprep |
| %endmacro |
| |
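| ; Scaled bilinear MC is implemented on top of the 8tap_scaled code: a fixed filter-table |
| ; offset pair is loaded into t0d/t1d and control tail-jumps into the matching |
| ; 8tap_scaled entry point. |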
| %macro BILIN_SCALED_FN 1 |
| cglobal %1_bilin_scaled_16bpc |
| mov t0d, (5*15 << 16) | 5*15 |
| mov t1d, (5*15 << 16) | 5*15 |
| jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) |
| %endmacro |
| |
| %if WIN64 |
| DECLARE_REG_TMP 6, 5 |
| %elif ARCH_X86_64 |
| DECLARE_REG_TMP 6, 8 |
| %else |
| DECLARE_REG_TMP 1, 2 |
| %endif |
| BILIN_SCALED_FN put |
| FN put_8tap_scaled, sharp, SHARP, SHARP |
| FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH |
| FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP |
| FN put_8tap_scaled, smooth, SMOOTH, SMOOTH |
| FN put_8tap_scaled, sharp_regular, SHARP, REGULAR |
| FN put_8tap_scaled, regular_sharp, REGULAR, SHARP |
| FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR |
| FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH |
| FN put_8tap_scaled, regular, REGULAR, REGULAR |
| MC_8TAP_SCALED put |
| |
| %if WIN64 |
| DECLARE_REG_TMP 5, 4 |
| %elif ARCH_X86_64 |
| DECLARE_REG_TMP 6, 7 |
| %else |
| DECLARE_REG_TMP 1, 2 |
| %endif |
| BILIN_SCALED_FN prep |
| FN prep_8tap_scaled, sharp, SHARP, SHARP |
| FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH |
| FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP |
| FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH |
| FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR |
| FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP |
| FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR |
| FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH |
| FN prep_8tap_scaled, regular, REGULAR, REGULAR |
| MC_8TAP_SCALED prep |
| |
| %if ARCH_X86_64 |
| DECLARE_REG_TMP 6 |
| %else |
| DECLARE_REG_TMP 2 |
| %endif |
| |
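| ; warp_affine_8x8t writes the 16-bit intermediate (prep) output, rounding with |
| ; warp8x8t_rnd and a >> 15; warp_affine_8x8 below writes final pixels, scaling with |
| ; warp8x8_rnd2 and clamping to pixel_max. Both share the .main/.main2/.main3 helpers. |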
| %if ARCH_X86_64 |
| ; warp8x8t spills one less xmm register than warp8x8 on WIN64; compensate for that |
| ; by allocating 16 bytes more stack space so that stack offsets match up. |
| %if WIN64 && STACK_ALIGNMENT == 16 |
| %assign stksz 16*14 |
| %else |
| %assign stksz 16*13 |
| %endif |
| cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \ |
| mx, tmp, alpha, beta, \ |
| filter, my, gamma, cnt |
| %assign stack_size_padded_8x8t stack_size_padded |
| %else |
| cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ |
| filter, mx, my |
| %define m8 [esp+16*13] |
| %define m9 [esp+16*14] |
| %define cntd dword [esp+4*63] |
| %define dstq tmpq |
| %define dsq 0 |
| %if STACK_ALIGNMENT < 16 |
| %define dstm [esp+4*65] |
| %define dsm [esp+4*66] |
| %else |
| %define dstm r0m |
| %define dsm r1m |
| %endif |
| %endif |
| %define base filterq-$$ |
| mov t0d, r7m |
| LEA filterq, $$ |
| shr t0d, 11 |
| %if ARCH_X86_64 |
| movddup m8, [base+warp8x8t_rnd] |
| %else |
| movddup m1, [base+warp8x8t_rnd] |
| mov r1, r1m |
| add r1, r1 |
| mova m8, m1 |
| mov r1m, r1 ; ds *= 2 |
| %endif |
| call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main |
| jmp .start |
| .loop: |
| %if ARCH_X86_64 |
| lea dstq, [dstq+dsq*4] |
| %else |
| add dstq, dsm |
| mov dstm, dstq |
| %endif |
| call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2 |
| .start: |
| %if ARCH_X86_32 |
| mov dstq, dstm |
| %endif |
| paddd m1, m8 |
| paddd m2, m8 |
| psrad m1, 15 |
| psrad m2, 15 |
| packssdw m1, m2 |
| mova [dstq+dsq*0], m1 |
| call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3 |
| %if ARCH_X86_32 |
| mov dstq, dstm |
| add dstq, dsm |
| %endif |
| paddd m1, m8 |
| paddd m2, m8 |
| psrad m1, 15 |
| psrad m2, 15 |
| packssdw m1, m2 |
| mova [dstq+dsq*2], m1 |
| dec cntd |
| jg .loop |
| RET |
| |
| %if ARCH_X86_64 |
| cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ |
| mx, tmp, alpha, beta, \ |
| filter, my, gamma, cnt |
| ASSERT stack_size_padded == stack_size_padded_8x8t |
| %else |
| cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ |
| filter, mx, my |
| %endif |
| mov t0d, r7m |
| LEA filterq, $$ |
| shr t0d, 11 |
| %if ARCH_X86_64 |
| movddup m8, [base+warp8x8_rnd2+t0*8] |
| movd m9, r7m ; pixel_max |
| pshufb m9, [base+pw_256] |
| %else |
| movddup m1, [base+warp8x8_rnd2+t0*8] |
| movd m2, r7m ; pixel_max |
| pshufb m2, [base+pw_256] |
| mova m8, m1 |
| mova m9, m2 |
| %endif |
| call .main |
| jmp .start |
| .loop: |
| %if ARCH_X86_64 |
| lea dstq, [dstq+dsq*2] |
| %else |
| add dstq, dsm |
| mov dstm, dstq |
| %endif |
| call .main2 |
| .start: |
| %if ARCH_X86_32 |
| mov dstq, dstm |
| %endif |
| psrad m1, 16 |
| psrad m2, 16 |
| packssdw m1, m2 |
| pmaxsw m1, m6 |
| pmulhrsw m1, m8 |
| pminsw m1, m9 |
| mova [dstq+dsq*0], m1 |
| call .main3 |
| %if ARCH_X86_32 |
| mov dstq, dstm |
| add dstq, dsm |
| %endif |
| psrad m1, 16 |
| psrad m2, 16 |
| packssdw m1, m2 |
| pmaxsw m1, m6 |
| pmulhrsw m1, m8 |
| pminsw m1, m9 |
| mova [dstq+dsq*1], m1 |
| dec cntd |
| jg .loop |
| RET |
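| ; .main: read the affine parameters (alpha/beta/gamma/delta), horizontally filter the |
| ; first seven source rows via .h and stash them as interleaved row pairs on the stack; |
| ; .main2/.main3 each filter one more row and run the vertical pass (WARP_V). |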
| ALIGN function_align |
| .main: |
| ; Stack args offset by one (r4m -> r5m etc.) due to call |
| %if WIN64 |
| mov deltaq, r5m |
| mov mxd, r6m |
| %endif |
| movd m0, [base+warp8x8_shift+t0*4] |
| movddup m7, [base+warp8x8_rnd1+t0*8] |
| add filterq, mc_warp_filter-$$ |
| %if ARCH_X86_64 |
| movsx alphad, word [deltaq+2*0] |
| movsx betad, word [deltaq+2*1] |
| movsx gammad, word [deltaq+2*2] |
| movsx deltad, word [deltaq+2*3] |
| lea tmpq, [ssq*3] |
| add mxd, 512+(64<<10) |
| sub srcq, tmpq ; src -= ss*3 |
| imul tmpd, alphad, -7 |
| mov myd, r7m |
| add betad, tmpd ; beta -= alpha*7 |
| imul tmpd, gammad, -7 |
| add myd, 512+(64<<10) |
| mov cntd, 4 |
| add deltad, tmpd ; delta -= gamma*7 |
| %else |
| %if STACK_ALIGNMENT < 16 |
| %assign stack_offset stack_offset - gprsize |
| %endif |
| mov r3d, r5m ; abcd |
| %if STACK_ALIGNMENT < 16 |
| mov r0, r1m ; dst |
| mov r1, r2m ; ds |
| mov [esp+gprsize+4*65], r0 |
| mov [esp+gprsize+4*66], r1 |
| %endif |
| movsx alphad, word [r3+2*0] |
| movsx r2d, word [r3+2*1] |
| movsx gammad, word [r3+2*2] |
| movsx r3d, word [r3+2*3] |
| imul r5d, alphad, -7 |
| add r2d, r5d ; beta -= alpha*7 |
| imul r5d, gammad, -7 |
| mov [esp+gprsize+4*60], r2d |
| add r3d, r5d ; delta -= gamma*7 |
| mov [esp+gprsize+4*61], r3d |
| mov r3d, r4m ; ss |
| mov srcq, r3m |
| mov mxd, r6m |
| mov myd, r7m |
| mov dword [esp+gprsize+4*63], 4 ; cnt |
| mov [esp+gprsize+4*62], r3 |
| lea r3, [r3*3] |
| add mxd, 512+(64<<10) |
| add myd, 512+(64<<10) |
| sub srcq, r3 ; src -= ss*3 |
| %if STACK_ALIGNMENT < 16 |
| %assign stack_offset stack_offset + gprsize |
| %endif |
| %endif |
| mova [rsp+gprsize], m0 |
| pxor m6, m6 |
| call .h |
| mova m5, m0 |
| call .h |
| punpcklwd m1, m5, m0 ; 01 |
| punpckhwd m5, m0 |
| mova [rsp+gprsize+16* 1], m1 |
| mova [rsp+gprsize+16* 4], m5 |
| mova m5, m0 |
| call .h |
| punpcklwd m1, m5, m0 ; 12 |
| punpckhwd m5, m0 |
| mova [rsp+gprsize+16* 7], m1 |
| mova [rsp+gprsize+16*10], m5 |
| mova m5, m0 |
| call .h |
| punpcklwd m1, m5, m0 ; 23 |
| punpckhwd m5, m0 |
| mova [rsp+gprsize+16* 2], m1 |
| mova [rsp+gprsize+16* 5], m5 |
| mova m5, m0 |
| call .h |
| punpcklwd m1, m5, m0 ; 34 |
| punpckhwd m5, m0 |
| mova [rsp+gprsize+16* 8], m1 |
| mova [rsp+gprsize+16*11], m5 |
| mova m5, m0 |
| call .h |
| punpcklwd m1, m5, m0 ; 45 |
| punpckhwd m5, m0 |
| mova [rsp+gprsize+16* 3], m1 |
| mova [rsp+gprsize+16* 6], m5 |
| mova m5, m0 |
| call .h |
| punpcklwd m1, m5, m0 ; 56 |
| punpckhwd m5, m0 |
| mova [rsp+gprsize+16* 9], m1 |
| mova [rsp+gprsize+16*12], m5 |
| mova m5, m0 |
| .main2: |
| call .h |
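| ; WARP_V: vertical 8-tap pass for one output row. The six arguments are the stack slots |
| ; holding the interleaved row pairs (01/23/45, low and high column halves); per-column |
| ; filters are selected from my, which steps by gamma per column and by delta per row. |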
| %macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h |
| lea tmpd, [myq+gammaq] |
| shr myd, 10 |
| movq m4, [filterq+myq*8] ; a |
| lea myd, [tmpq+gammaq] |
| shr tmpd, 10 |
| movq m2, [filterq+tmpq*8] ; b |
| lea tmpd, [myq+gammaq] |
| shr myd, 10 |
| movq m3, [filterq+myq*8] ; c |
| lea myd, [tmpq+gammaq] |
| shr tmpd, 10 |
| movq m1, [filterq+tmpq*8] ; d |
| lea tmpd, [myq+gammaq] |
| shr myd, 10 |
| punpcklwd m4, m2 |
| punpcklwd m3, m1 |
| punpckldq m2, m4, m3 |
| punpckhdq m4, m3 |
| punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 |
| pmaddwd m1, [rsp+gprsize+16*%1] |
| punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 |
| mova m2, [rsp+gprsize+16*%2] |
| pmaddwd m3, m2 |
| mova [rsp+gprsize+16*%1], m2 |
| paddd m1, m3 |
| punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 |
| mova m2, [rsp+gprsize+16*%3] |
| pmaddwd m3, m2 |
| mova [rsp+gprsize+16*%2], m2 |
| paddd m1, m3 |
| punpcklwd m3, m5, m0 ; 67 |
| punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 |
| pmaddwd m2, m3 |
| mova [rsp+gprsize+16*%3], m3 |
| paddd m1, m2 |
| movq m4, [filterq+myq*8] ; e |
| lea myd, [tmpq+gammaq] |
| shr tmpd, 10 |
| movq m3, [filterq+tmpq*8] ; f |
| lea tmpd, [myq+gammaq] |
| shr myd, 10 |
| movq m2, [filterq+myq*8] ; g |
| %if ARCH_X86_64 |
| lea myd, [tmpq+deltaq] ; my += delta |
| %else |
| mov myd, [esp+gprsize+4*61] |
| add myd, tmpd |
| %endif |
| shr tmpd, 10 |
| punpcklwd m4, m3 |
| movq m3, [filterq+tmpq*8] ; h |
| punpcklwd m2, m3 |
| punpckldq m3, m4, m2 |
| punpckhdq m4, m2 |
| punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 |
| pmaddwd m2, [rsp+gprsize+16*%4] |
| punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 |
| mova m3, [rsp+gprsize+16*%5] |
| pmaddwd m6, m3 |
| mova [rsp+gprsize+16*%4], m3 |
| pxor m3, m3 |
| paddd m2, m6 |
| punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 |
| mova m6, [rsp+gprsize+16*%6] |
| pmaddwd m3, m6 |
| mova [rsp+gprsize+16*%5], m6 |
| punpckhwd m5, m0 |
| pxor m6, m6 |
| paddd m2, m3 |
| punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 |
| pmaddwd m3, m5 |
| mova [rsp+gprsize+16*%6], m5 |
| mova m5, m0 |
| paddd m2, m3 |
| %endmacro |
| WARP_V 1, 2, 3, 4, 5, 6 |
| ret |
| .main3: |
| call .h |
| WARP_V 7, 8, 9, 10, 11, 12 |
| ret |
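| ; .h: horizontal 8-tap pass for one source row; each output column uses a filter |
| ; selected from mx, stepping by alpha per column and by beta per row. The result is |
| ; rounded with warp8x8_rnd1 and shifted by the warp8x8_shift value saved on the stack. |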
| ALIGN function_align |
| .h: |
| lea tmpd, [mxq+alphaq] |
| shr mxd, 10 |
| movq m3, [filterq+mxq*8] |
| punpcklbw m0, m6, m3 |
| movu m3, [srcq-6] |
| pmaddwd m0, m3 ; 0 |
| lea mxd, [tmpq+alphaq] |
| shr tmpd, 10 |
| movq m3, [filterq+tmpq*8] |
| punpcklbw m2, m6, m3 |
| movu m3, [srcq-4] |
| pmaddwd m2, m3 ; 1 |
| lea tmpd, [mxq+alphaq] |
| shr mxd, 10 |
| movq m3, [filterq+mxq*8] |
| phaddd m0, m2 ; 0 1 |
| punpcklbw m2, m6, m3 |
| movu m3, [srcq-2] |
| pmaddwd m2, m3 ; 2 |
| lea mxd, [tmpq+alphaq] |
| shr tmpd, 10 |
| movq m3, [filterq+tmpq*8] |
| punpcklbw m1, m6, m3 |
| movu m3, [srcq+0] |
| pmaddwd m1, m3 ; 3 |
| lea tmpd, [mxq+alphaq] |
| shr mxd, 10 |
| movq m3, [filterq+mxq*8] |
| phaddd m2, m1 ; 2 3 |
| punpcklbw m1, m6, m3 |
| movu m3, [srcq+2] |
| pmaddwd m1, m3 ; 4 |
| lea mxd, [tmpq+alphaq] |
| shr tmpd, 10 |
| movq m3, [filterq+tmpq*8] |
| phaddd m0, m2 ; 0 1 2 3 |
| punpcklbw m2, m6, m3 |
| movu m3, [srcq+4] |
| pmaddwd m2, m3 ; 5 |
| lea tmpd, [mxq+alphaq] |
| shr mxd, 10 |
| movq m3, [filterq+mxq*8] |
| phaddd m1, m2 ; 4 5 |
| punpcklbw m2, m6, m3 |
| movu m3, [srcq+6] |
| pmaddwd m2, m3 ; 6 |
| %if ARCH_X86_64 |
| lea mxd, [tmpq+betaq] ; mx += beta |
| %else |
| mov mxd, [esp+gprsize*2+4*60] |
| add mxd, tmpd |
| %endif |
| shr tmpd, 10 |
| movq m3, [filterq+tmpq*8] |
| punpcklbw m4, m6, m3 |
| movu m3, [srcq+8] |
| %if ARCH_X86_64 |
| add srcq, ssq |
| %else |
| add srcq, [esp+gprsize*2+4*62] |
| %endif |
| pmaddwd m3, m4 ; 7 |
| phaddd m2, m3 ; 6 7 |
| phaddd m1, m2 ; 4 5 6 7 |
| paddd m0, m7 |
| paddd m1, m7 |
| psrad m0, [rsp+gprsize*2] |
| psrad m1, [rsp+gprsize*2] |
| packssdw m0, m1 |
| ret |
| |
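| ; BIDIR_FN: shared store loop for the two-reference compound functions below (avg, |
| ; w_avg, mask); each .main call yields 16 pixels in m0/m1, and the per-width branches |
| ; lay them out across rows (w4/w8) or across a row (w16..w128). |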
| %macro BIDIR_FN 0 |
| call .main |
| jmp wq |
| .w4_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| .w4: |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| movq [dstq+strideq*0], m1 |
| movhps [dstq+strideq*1], m1 |
| sub hd, 4 |
| jg .w4_loop |
| .ret: |
| RET |
| .w8_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| .w8: |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| sub hd, 2 |
| jne .w8_loop |
| RET |
| .w16_loop: |
| call .main |
| add dstq, strideq |
| .w16: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| dec hd |
| jg .w16_loop |
| RET |
| .w32_loop: |
| call .main |
| add dstq, strideq |
| .w32: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| call .main |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m1 |
| dec hd |
| jg .w32_loop |
| RET |
| .w64_loop: |
| call .main |
| add dstq, strideq |
| .w64: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| call .main |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m1 |
| call .main |
| mova [dstq+16*4], m0 |
| mova [dstq+16*5], m1 |
| call .main |
| mova [dstq+16*6], m0 |
| mova [dstq+16*7], m1 |
| dec hd |
| jg .w64_loop |
| RET |
| .w128_loop: |
| call .main |
| add dstq, strideq |
| .w128: |
| mova [dstq+16* 0], m0 |
| mova [dstq+16* 1], m1 |
| call .main |
| mova [dstq+16* 2], m0 |
| mova [dstq+16* 3], m1 |
| call .main |
| mova [dstq+16* 4], m0 |
| mova [dstq+16* 5], m1 |
| call .main |
| mova [dstq+16* 6], m0 |
| mova [dstq+16* 7], m1 |
| call .main |
| mova [dstq+16* 8], m0 |
| mova [dstq+16* 9], m1 |
| call .main |
| mova [dstq+16*10], m0 |
| mova [dstq+16*11], m1 |
| call .main |
| mova [dstq+16*12], m0 |
| mova [dstq+16*13], m1 |
| call .main |
| mova [dstq+16*14], m0 |
| mova [dstq+16*15], m1 |
| dec hd |
| jg .w128_loop |
| RET |
| %endmacro |
| |
| %if UNIX64 |
| DECLARE_REG_TMP 7 |
| %else |
| DECLARE_REG_TMP 5 |
| %endif |
| |
| cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h |
| %define base r6-avg_ssse3_table |
| LEA r6, avg_ssse3_table |
| tzcnt wd, wm |
| mov t0d, r6m ; pixel_max |
| movsxd wq, [r6+wq*4] |
| shr t0d, 11 |
| movddup m2, [base+bidir_rnd+t0*8] |
| movddup m3, [base+bidir_mul+t0*8] |
| movifnidn hd, hm |
| add wq, r6 |
| BIDIR_FN |
| ALIGN function_align |
| .main: |
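| ; avg: saturating add of the two intermediates, clamp to the bidir rounding constant, |
| ; subtract it, then scale down with a high multiply by bidir_mul; the constants are |
| ; selected per bitdepth via pixel_max >> 11. |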
| mova m0, [tmp1q+16*0] |
| paddsw m0, [tmp2q+16*0] |
| mova m1, [tmp1q+16*1] |
| paddsw m1, [tmp2q+16*1] |
| add tmp1q, 16*2 |
| add tmp2q, 16*2 |
| pmaxsw m0, m2 |
| pmaxsw m1, m2 |
| psubsw m0, m2 |
| psubsw m1, m2 |
| pmulhw m0, m3 |
| pmulhw m1, m3 |
| ret |
| |
| cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h |
| %define base r6-w_avg_ssse3_table |
| LEA r6, w_avg_ssse3_table |
| tzcnt wd, wm |
| mov t0d, r6m ; weight |
| movd m6, r7m ; pixel_max |
| movddup m5, [base+pd_65538] |
| movsxd wq, [r6+wq*4] |
| pshufb m6, [base+pw_256] |
| add wq, r6 |
| lea r6d, [t0-16] |
| shl t0d, 16 |
| sub t0d, r6d ; 16-weight, weight |
| paddw m5, m6 |
| mov r6d, t0d |
| shl t0d, 2 |
| test dword r7m, 0x800 |
| cmovnz r6d, t0d |
| movifnidn hd, hm |
| movd m4, r6d |
| pslld m5, 7 |
| pxor m7, m7 |
| pshufd m4, m4, q0000 |
| BIDIR_FN |
| ALIGN function_align |
| .main: |
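| ; w_avg: interleave tmp2/tmp1 words and pmaddwd against the packed (16-weight, weight) |
| ; pair in m4, add the bias/rounding constant in m5, shift right by 8, then clamp the |
| ; result to [0, pixel_max]. |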
| mova m2, [tmp1q+16*0] |
| mova m0, [tmp2q+16*0] |
| punpckhwd m3, m0, m2 |
| punpcklwd m0, m2 |
| mova m2, [tmp1q+16*1] |
| mova m1, [tmp2q+16*1] |
| add tmp1q, 16*2 |
| add tmp2q, 16*2 |
| pmaddwd m3, m4 |
| pmaddwd m0, m4 |
| paddd m3, m5 |
| paddd m0, m5 |
| psrad m3, 8 |
| psrad m0, 8 |
| packssdw m0, m3 |
| punpckhwd m3, m1, m2 |
| punpcklwd m1, m2 |
| pmaddwd m3, m4 |
| pmaddwd m1, m4 |
| paddd m3, m5 |
| paddd m1, m5 |
| psrad m3, 8 |
| psrad m1, 8 |
| packssdw m1, m3 |
| pminsw m0, m6 |
| pminsw m1, m6 |
| pmaxsw m0, m7 |
| pmaxsw m1, m7 |
| ret |
| |
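| ; mask: blend the two intermediates with an explicit per-pixel 6-bit mask (0..64) read |
| ; from maskq, then apply the same rounding/scaling as avg. |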
| %if ARCH_X86_64 |
| cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask |
| %else |
| cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask |
| %define hd dword r5m |
| %define m8 [base+pw_64] |
| %endif |
| %define base r6-mask_ssse3_table |
| LEA r6, mask_ssse3_table |
| tzcnt wd, wm |
| mov t0d, r7m ; pixel_max |
| shr t0d, 11 |
| movsxd wq, [r6+wq*4] |
| movddup m6, [base+bidir_rnd+t0*8] |
| movddup m7, [base+bidir_mul+t0*8] |
| %if ARCH_X86_64 |
| mova m8, [base+pw_64] |
| movifnidn hd, hm |
| %endif |
| add wq, r6 |
| mov maskq, r6mp |
| BIDIR_FN |
| ALIGN function_align |
| .main: |
| movq m3, [maskq+8*0] |
| mova m0, [tmp1q+16*0] |
| mova m4, [tmp2q+16*0] |
| pxor m5, m5 |
| punpcklbw m3, m5 |
| punpckhwd m2, m0, m4 |
| punpcklwd m0, m4 |
| psubw m1, m8, m3 |
| punpckhwd m4, m3, m1 ; m, 64-m |
| punpcklwd m3, m1 |
| pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) |
| pmaddwd m0, m3 |
| movq m3, [maskq+8*1] |
| mova m1, [tmp1q+16*1] |
| mova m4, [tmp2q+16*1] |
| add maskq, 8*2 |
| add tmp1q, 16*2 |
| add tmp2q, 16*2 |
| psrad m2, 5 |
| psrad m0, 5 |
| packssdw m0, m2 |
| punpcklbw m3, m5 |
| punpckhwd m2, m1, m4 |
| punpcklwd m1, m4 |
| psubw m5, m8, m3 |
| punpckhwd m4, m3, m5 ; m, 64-m |
| punpcklwd m3, m5 |
| pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) |
| pmaddwd m1, m3 |
| psrad m2, 5 |
| psrad m1, 5 |
| packssdw m1, m2 |
| pmaxsw m0, m6 |
| pmaxsw m1, m6 |
| psubsw m0, m6 |
| psubsw m1, m6 |
| pmulhw m0, m7 |
| pmulhw m1, m7 |
| ret |
| |
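| ; w_mask_420: like mask, but the blend mask is derived from abs(tmp1 - tmp2) (see |
| ; W_MASK in .main) and is also written out 2x2-downsampled for the chroma planes; |
| ; m7 holds 2 - sign, the rounding term of that 2x2 average. |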
| cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask |
| %define base t0-w_mask_420_ssse3_table |
| LEA t0, w_mask_420_ssse3_table |
| tzcnt wd, wm |
| mov r6d, r8m ; pixel_max |
| movd m0, r7m ; sign |
| shr r6d, 11 |
| movsxd wq, [t0+wq*4] |
| %if ARCH_X86_64 |
| mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 |
| mova m9, [base+pw_64] |
| movddup m10, [base+bidir_rnd+r6*8] |
| movddup m11, [base+bidir_mul+r6*8] |
| %else |
| mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 |
| mova m2, [base+pw_64] |
| movddup m3, [base+bidir_rnd+r6*8] |
| movddup m4, [base+bidir_mul+r6*8] |
| ALLOC_STACK -16*4 |
| mova [rsp+16*0], m1 |
| mova [rsp+16*1], m2 |
| mova [rsp+16*2], m3 |
| mova [rsp+16*3], m4 |
| %define m8 [rsp+gprsize+16*0] |
| %define m9 [rsp+gprsize+16*1] |
| %define m10 [rsp+gprsize+16*2] |
| %define m11 [rsp+gprsize+16*3] |
| %endif |
| movd m7, [base+pw_2] |
| psubw m7, m0 |
| pshufb m7, [base+pw_256] |
| add wq, t0 |
| movifnidn hd, r5m |
| mov maskq, r6mp |
| call .main |
| jmp wq |
| .w4_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| add maskq, 4 |
| .w4: |
| movq [dstq+strideq*0], m0 |
| phaddw m2, m3 |
| movhps [dstq+strideq*1], m0 |
| phaddd m2, m2 |
| lea dstq, [dstq+strideq*2] |
| paddw m2, m7 |
| movq [dstq+strideq*0], m1 |
| psrlw m2, 2 |
| movhps [dstq+strideq*1], m1 |
| packuswb m2, m2 |
| movd [maskq], m2 |
| sub hd, 4 |
| jg .w4_loop |
| RET |
| .w8_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| add maskq, 4 |
| .w8: |
| mova [dstq+strideq*0], m0 |
| paddw m2, m3 |
| phaddw m2, m2 |
| mova [dstq+strideq*1], m1 |
| paddw m2, m7 |
| psrlw m2, 2 |
| packuswb m2, m2 |
| movd [maskq], m2 |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| .w16_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| add maskq, 8 |
| .w16: |
| mova [dstq+strideq*1+16*0], m2 |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*1+16*1], m3 |
| mova [dstq+strideq*0+16*1], m1 |
| call .main |
| paddw m2, [dstq+strideq*1+16*0] |
| paddw m3, [dstq+strideq*1+16*1] |
| mova [dstq+strideq*1+16*0], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*1], m1 |
| paddw m2, m7 |
| psrlw m2, 2 |
| packuswb m2, m2 |
| movq [maskq], m2 |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| .w32_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| add maskq, 16 |
| .w32: |
| mova [dstq+strideq*1+16*0], m2 |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*1+16*1], m3 |
| mova [dstq+strideq*0+16*1], m1 |
| call .main |
| mova [dstq+strideq*0+16*2], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*3], m2 |
| mova [dstq+strideq*0+16*3], m1 |
| call .main |
| paddw m2, [dstq+strideq*1+16*0] |
| paddw m3, [dstq+strideq*1+16*1] |
| mova [dstq+strideq*1+16*0], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*2], m2 |
| mova [dstq+strideq*1+16*1], m1 |
| call .main |
| phaddw m2, m3 |
| paddw m3, m7, [dstq+strideq*1+16*2] |
| paddw m2, [dstq+strideq*1+16*3] |
| mova [dstq+strideq*1+16*2], m0 |
| paddw m2, m7 |
| psrlw m3, 2 |
| psrlw m2, 2 |
| mova [dstq+strideq*1+16*3], m1 |
| packuswb m3, m2 |
| mova [maskq], m3 |
| sub hd, 2 |
| jg .w32_loop |
| RET |
| .w64_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| add maskq, 16*2 |
| .w64: |
| mova [dstq+strideq*1+16*1], m2 |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*1+16*2], m3 |
| mova [dstq+strideq*0+16*1], m1 |
| call .main |
| mova [dstq+strideq*1+16*3], m2 |
| mova [dstq+strideq*0+16*2], m0 |
| mova [dstq+strideq*1+16*4], m3 |
| mova [dstq+strideq*0+16*3], m1 |
| call .main |
| mova [dstq+strideq*1+16*5], m2 |
| mova [dstq+strideq*0+16*4], m0 |
| mova [dstq+strideq*1+16*6], m3 |
| mova [dstq+strideq*0+16*5], m1 |
| call .main |
| mova [dstq+strideq*0+16*6], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*7], m2 |
| mova [dstq+strideq*0+16*7], m1 |
| call .main |
| paddw m2, [dstq+strideq*1+16*1] |
| paddw m3, [dstq+strideq*1+16*2] |
| mova [dstq+strideq*1+16*0], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*2], m2 |
| mova [dstq+strideq*1+16*1], m1 |
| call .main |
| paddw m2, [dstq+strideq*1+16*3] |
| paddw m3, [dstq+strideq*1+16*4] |
| phaddw m2, m3 |
| paddw m3, m7, [dstq+strideq*1+16*2] |
| mova [dstq+strideq*1+16*2], m0 |
| paddw m2, m7 |
| psrlw m3, 2 |
| psrlw m2, 2 |
| mova [dstq+strideq*1+16*3], m1 |
| packuswb m3, m2 |
| mova [maskq+16*0], m3 |
| call .main |
| paddw m2, [dstq+strideq*1+16*5] |
| paddw m3, [dstq+strideq*1+16*6] |
| mova [dstq+strideq*1+16*4], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*6], m2 |
| mova [dstq+strideq*1+16*5], m1 |
| call .main |
| phaddw m2, m3 |
| paddw m3, m7, [dstq+strideq*1+16*6] |
| paddw m2, [dstq+strideq*1+16*7] |
| mova [dstq+strideq*1+16*6], m0 |
| paddw m2, m7 |
| psrlw m3, 2 |
| psrlw m2, 2 |
| mova [dstq+strideq*1+16*7], m1 |
| packuswb m3, m2 |
| mova [maskq+16*1], m3 |
| sub hd, 2 |
| jg .w64_loop |
| RET |
| .w128_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| add maskq, 16*4 |
| .w128: |
| mova [dstq+strideq*1+16* 1], m2 |
| mova [dstq+strideq*0+16* 0], m0 |
| mova [dstq+strideq*1+16* 2], m3 |
| mova [dstq+strideq*0+16* 1], m1 |
| call .main |
| mova [dstq+strideq*1+16* 3], m2 |
| mova [dstq+strideq*0+16* 2], m0 |
| mova [dstq+strideq*1+16* 4], m3 |
| mova [dstq+strideq*0+16* 3], m1 |
| call .main |
| mova [dstq+strideq*1+16* 5], m2 |
| mova [dstq+strideq*0+16* 4], m0 |
| mova [dstq+strideq*1+16* 6], m3 |
| mova [dstq+strideq*0+16* 5], m1 |
| call .main |
| mova [dstq+strideq*1+16* 7], m2 |
| mova [dstq+strideq*0+16* 6], m0 |
| mova [dstq+strideq*1+16* 8], m3 |
| mova [dstq+strideq*0+16* 7], m1 |
| call .main |
| mova [dstq+strideq*1+16* 9], m2 |
| mova [dstq+strideq*0+16* 8], m0 |
| mova [dstq+strideq*1+16*10], m3 |
| mova [dstq+strideq*0+16* 9], m1 |
| call .main |
| mova [dstq+strideq*1+16*11], m2 |
| mova [dstq+strideq*0+16*10], m0 |
| mova [dstq+strideq*1+16*12], m3 |
| mova [dstq+strideq*0+16*11], m1 |
| call .main |
| mova [dstq+strideq*1+16*13], m2 |
| mova [dstq+strideq*0+16*12], m0 |
| mova [dstq+strideq*1+16*14], m3 |
| mova [dstq+strideq*0+16*13], m1 |
| call .main |
| mova [dstq+strideq*0+16*14], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*15], m2 |
| mova [dstq+strideq*0+16*15], m1 |
| call .main |
| paddw m2, [dstq+strideq*1+16* 1] |
| paddw m3, [dstq+strideq*1+16* 2] |
| mova [dstq+strideq*1+16* 0], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16* 2], m2 |
| mova [dstq+strideq*1+16* 1], m1 |
| call .main |
| paddw m2, [dstq+strideq*1+16* 3] |
| paddw m3, [dstq+strideq*1+16* 4] |
| phaddw m2, m3 |
| paddw m3, m7, [dstq+strideq*1+16* 2] |
| mova [dstq+strideq*1+16* 2], m0 |
| paddw m2, m7 |
| psrlw m3, 2 |
| psrlw m2, 2 |
| mova [dstq+strideq*1+16* 3], m1 |
| packuswb m3, m2 |
| mova [maskq+16*0], m3 |
| call .main |
| paddw m2, [dstq+strideq*1+16* 5] |
| paddw m3, [dstq+strideq*1+16* 6] |
| mova [dstq+strideq*1+16* 4], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16* 6], m2 |
| mova [dstq+strideq*1+16* 5], m1 |
| call .main |
| paddw m2, [dstq+strideq*1+16* 7] |
| paddw m3, [dstq+strideq*1+16* 8] |
| phaddw m2, m3 |
| paddw m3, m7, [dstq+strideq*1+16* 6] |
| mova [dstq+strideq*1+16* 6], m0 |
| paddw m2, m7 |
| psrlw m3, 2 |
| psrlw m2, 2 |
| mova [dstq+strideq*1+16* 7], m1 |
| packuswb m3, m2 |
| mova [maskq+16*1], m3 |
| call .main |
| paddw m2, [dstq+strideq*1+16* 9] |
| paddw m3, [dstq+strideq*1+16*10] |
| mova [dstq+strideq*1+16* 8], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*10], m2 |
| mova [dstq+strideq*1+16* 9], m1 |
| call .main |
| paddw m2, [dstq+strideq*1+16*11] |
| paddw m3, [dstq+strideq*1+16*12] |
| phaddw m2, m3 |
| paddw m3, m7, [dstq+strideq*1+16*10] |
| mova [dstq+strideq*1+16*10], m0 |
| paddw m2, m7 |
| psrlw m3, 2 |
| psrlw m2, 2 |
| mova [dstq+strideq*1+16*11], m1 |
| packuswb m3, m2 |
| mova [maskq+16*2], m3 |
| call .main |
| paddw m2, [dstq+strideq*1+16*13] |
| paddw m3, [dstq+strideq*1+16*14] |
| mova [dstq+strideq*1+16*12], m0 |
| phaddw m2, m3 |
| mova [dstq+strideq*1+16*14], m2 |
| mova [dstq+strideq*1+16*13], m1 |
| call .main |
| phaddw m2, m3 |
| paddw m3, m7, [dstq+strideq*1+16*14] |
| paddw m2, [dstq+strideq*1+16*15] |
| mova [dstq+strideq*1+16*14], m0 |
| paddw m2, m7 |
| psrlw m3, 2 |
| psrlw m2, 2 |
| mova [dstq+strideq*1+16*15], m1 |
| packuswb m3, m2 |
| mova [maskq+16*3], m3 |
| sub hd, 2 |
| jg .w128_loop |
| RET |
| ALIGN function_align |
| .main: |
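| ; W_MASK loads 8 pixels from each intermediate buffer and derives a per-pixel |
| ; weight m in [38, 64] that grows with abs(tmp1 - tmp2): via the pw_27615 |
| ; constant, 64-m = (27615 - abs(diff)) >> 10, saturating at 0. The output is |
| ; the weighted average of tmp1 (weight m) and tmp2 (weight 64-m), converted |
| ; back to pixel range with the bidir_rnd/bidir_mul constants; register %2 |
| ; keeps the raw weights so the callers can also build the mask plane. |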
| %macro W_MASK 2 ; dst/tmp_offset, mask |
| mova m%1, [tmp1q+16*%1] |
| mova m%2, [tmp2q+16*%1] |
| punpcklwd m4, m%2, m%1 |
| punpckhwd m5, m%2, m%1 |
| psubsw m%1, m%2 |
| pabsw m%1, m%1 |
| psubusw m6, m8, m%1 |
| psrlw m6, 10 ; 64-m |
| psubw m%2, m9, m6 ; m |
| punpcklwd m%1, m6, m%2 |
| punpckhwd m6, m%2 |
| pmaddwd m%1, m4 |
| pmaddwd m6, m5 |
| psrad m%1, 5 |
| psrad m6, 5 |
| packssdw m%1, m6 |
| pmaxsw m%1, m10 |
| psubsw m%1, m10 |
| pmulhw m%1, m11 |
| %endmacro |
| W_MASK 0, 2 |
| W_MASK 1, 3 |
| add tmp1q, 16*2 |
| add tmp2q, 16*2 |
| ret |
| |
| cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask |
| %define base t0-w_mask_422_ssse3_table |
| LEA t0, w_mask_422_ssse3_table |
| tzcnt wd, wm |
| mov r6d, r8m ; pixel_max |
| movd m7, r7m ; sign |
| shr r6d, 11 |
| movsxd wq, [t0+wq*4] |
| %if ARCH_X86_64 |
| mova m8, [base+pw_27615] |
| mova m9, [base+pw_64] |
| movddup m10, [base+bidir_rnd+r6*8] |
| movddup m11, [base+bidir_mul+r6*8] |
| %else |
| mova m1, [base+pw_27615] |
| mova m2, [base+pw_64] |
| movddup m3, [base+bidir_rnd+r6*8] |
| movddup m4, [base+bidir_mul+r6*8] |
| ALLOC_STACK -16*4 |
| mova [rsp+16*0], m1 |
| mova [rsp+16*1], m2 |
| mova [rsp+16*2], m3 |
| mova [rsp+16*3], m4 |
| %endif |
| pxor m0, m0 |
| add wq, t0 |
| pshufb m7, m0 |
| movifnidn hd, r5m |
| mov maskq, r6mp |
| call .main |
| jmp wq |
| .w4_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| .w4: |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| movq [dstq+strideq*0], m1 |
| movhps [dstq+strideq*1], m1 |
| sub hd, 4 |
| jg .w4_loop |
| .end: |
| RET |
| .w8_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| .w8: |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| sub hd, 2 |
| jg .w8_loop |
| .w8_end: |
| RET |
| .w16_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| .w16: |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*0+16*1], m1 |
| call .main |
| mova [dstq+strideq*1+16*0], m0 |
| mova [dstq+strideq*1+16*1], m1 |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| .w32_loop: |
| call .main |
| add dstq, strideq |
| .w32: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| call .main |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m1 |
| dec hd |
| jg .w32_loop |
| RET |
| .w64_loop: |
| call .main |
| add dstq, strideq |
| .w64: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| call .main |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m1 |
| call .main |
| mova [dstq+16*4], m0 |
| mova [dstq+16*5], m1 |
| call .main |
| mova [dstq+16*6], m0 |
| mova [dstq+16*7], m1 |
| dec hd |
| jg .w64_loop |
| RET |
| .w128_loop: |
| call .main |
| add dstq, strideq |
| .w128: |
| mova [dstq+16* 0], m0 |
| mova [dstq+16* 1], m1 |
| call .main |
| mova [dstq+16* 2], m0 |
| mova [dstq+16* 3], m1 |
| call .main |
| mova [dstq+16* 4], m0 |
| mova [dstq+16* 5], m1 |
| call .main |
| mova [dstq+16* 6], m0 |
| mova [dstq+16* 7], m1 |
| call .main |
| mova [dstq+16* 8], m0 |
| mova [dstq+16* 9], m1 |
| call .main |
| mova [dstq+16*10], m0 |
| mova [dstq+16*11], m1 |
| call .main |
| mova [dstq+16*12], m0 |
| mova [dstq+16*13], m1 |
| call .main |
| mova [dstq+16*14], m0 |
| mova [dstq+16*15], m1 |
| dec hd |
| jg .w128_loop |
| RET |
| ALIGN function_align |
| .main: |
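| ; the 4:2:2 mask halves the horizontal resolution: each stored byte is |
| ; (m0 + m1 + 1 - sign) >> 1 for a pair of horizontally adjacent weights, |
| ; computed with phaddw (pair sum), psubb (sign) and pavgb against zero |
| ; (round-to-nearest halving) |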
| W_MASK 0, 2 |
| W_MASK 1, 3 |
| phaddw m2, m3 |
| add tmp1q, 16*2 |
| add tmp2q, 16*2 |
| packuswb m2, m2 |
| pxor m3, m3 |
| psubb m2, m7 |
| pavgb m2, m3 |
| movq [maskq], m2 |
| add maskq, 8 |
| ret |
| |
| cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask |
| %define base t0-w_mask_444_ssse3_table |
| LEA t0, w_mask_444_ssse3_table |
| tzcnt wd, wm |
| mov r6d, r8m ; pixel_max |
| shr r6d, 11 |
| movsxd wq, [t0+wq*4] |
| %if ARCH_X86_64 |
| mova m8, [base+pw_27615] |
| mova m9, [base+pw_64] |
| movddup m10, [base+bidir_rnd+r6*8] |
| movddup m11, [base+bidir_mul+r6*8] |
| %else |
| mova m1, [base+pw_27615] |
| mova m2, [base+pw_64] |
| movddup m3, [base+bidir_rnd+r6*8] |
| movddup m7, [base+bidir_mul+r6*8] |
| ALLOC_STACK -16*3 |
| mova [rsp+16*0], m1 |
| mova [rsp+16*1], m2 |
| mova [rsp+16*2], m3 |
| %define m11 m7 |
| %endif |
| add wq, t0 |
| movifnidn hd, r5m |
| mov maskq, r6mp |
| call .main |
| jmp wq |
| .w4_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| .w4: |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| movq [dstq+strideq*0], m1 |
| movhps [dstq+strideq*1], m1 |
| sub hd, 4 |
| jg .w4_loop |
| .end: |
| RET |
| .w8_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| .w8: |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| sub hd, 2 |
| jg .w8_loop |
| .w8_end: |
| RET |
| .w16_loop: |
| call .main |
| lea dstq, [dstq+strideq*2] |
| .w16: |
| mova [dstq+strideq*0+16*0], m0 |
| mova [dstq+strideq*0+16*1], m1 |
| call .main |
| mova [dstq+strideq*1+16*0], m0 |
| mova [dstq+strideq*1+16*1], m1 |
| sub hd, 2 |
| jg .w16_loop |
| RET |
| .w32_loop: |
| call .main |
| add dstq, strideq |
| .w32: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| call .main |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m1 |
| dec hd |
| jg .w32_loop |
| RET |
| .w64_loop: |
| call .main |
| add dstq, strideq |
| .w64: |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| call .main |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m1 |
| call .main |
| mova [dstq+16*4], m0 |
| mova [dstq+16*5], m1 |
| call .main |
| mova [dstq+16*6], m0 |
| mova [dstq+16*7], m1 |
| dec hd |
| jg .w64_loop |
| RET |
| .w128_loop: |
| call .main |
| add dstq, strideq |
| .w128: |
| mova [dstq+16* 0], m0 |
| mova [dstq+16* 1], m1 |
| call .main |
| mova [dstq+16* 2], m0 |
| mova [dstq+16* 3], m1 |
| call .main |
| mova [dstq+16* 4], m0 |
| mova [dstq+16* 5], m1 |
| call .main |
| mova [dstq+16* 6], m0 |
| mova [dstq+16* 7], m1 |
| call .main |
| mova [dstq+16* 8], m0 |
| mova [dstq+16* 9], m1 |
| call .main |
| mova [dstq+16*10], m0 |
| mova [dstq+16*11], m1 |
| call .main |
| mova [dstq+16*12], m0 |
| mova [dstq+16*13], m1 |
| call .main |
| mova [dstq+16*14], m0 |
| mova [dstq+16*15], m1 |
| dec hd |
| jg .w128_loop |
| RET |
| ALIGN function_align |
| .main: |
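| ; the 4:4:4 mask keeps full resolution: the weights produced by W_MASK are |
| ; stored directly, packed down to one byte per pixel |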
| W_MASK 0, 2 |
| W_MASK 1, 3 |
| packuswb m2, m3 |
| add tmp1q, 16*2 |
| add tmp2q, 16*2 |
| mova [maskq], m2 |
| add maskq, 16 |
| ret |
| |
| ; (a * (64 - m) + b * m + 32) >> 6 |
| ; = (((b - a) * m + 32) >> 6) + a |
| ; = (((b - a) * (m << 9) + 16384) >> 15) + a |
| ; except m << 9 overflows int16_t when m == 64 (which is possible), |
| ; but if we negate m it works out (-64 << 9 == -32768). |
| ; = (((a - b) * (m * -512) + 16384) >> 15) + a |
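| ; |
| ; As a scalar C sketch of the identity above (illustrative only, not the |
| ; dav1d C code), with a = dst pixel, b = tmp pixel: |
| ; |
| ; int blend_px(int a, int b, int m /* 0..64 */) { |
| ; // pmulhrsw computes (x*y + 16384) >> 15, so m is pre-multiplied |
| ; // by -512 (pw_m512) to keep the 16-bit factor in range |
| ; return a + (((a - b) * (m * -512) + 16384) >> 15); |
| ; } |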
| cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3 |
| %define base r6-blend_ssse3_table |
| LEA r6, blend_ssse3_table |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r6+wq*4] |
| movifnidn maskq, maskmp |
| mova m7, [base+pw_m512] |
| add wq, r6 |
| lea stride3q, [strideq*3] |
| pxor m6, m6 |
| jmp wq |
| .w4: |
| mova m5, [maskq] |
| movq m0, [dstq+strideq*0] |
| movhps m0, [dstq+strideq*1] |
| movq m1, [dstq+strideq*2] |
| movhps m1, [dstq+stride3q ] |
| psubw m2, m0, [tmpq+16*0] |
| psubw m3, m1, [tmpq+16*1] |
| add maskq, 16 |
| add tmpq, 32 |
| punpcklbw m4, m5, m6 |
| punpckhbw m5, m6 |
| pmullw m4, m7 |
| pmullw m5, m7 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| paddw m0, m2 |
| paddw m1, m3 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| movq [dstq+strideq*2], m1 |
| movhps [dstq+stride3q ], m1 |
| lea dstq, [dstq+strideq*4] |
| sub hd, 4 |
| jg .w4 |
| RET |
| .w8: |
| mova m5, [maskq] |
| mova m0, [dstq+strideq*0] |
| mova m1, [dstq+strideq*1] |
| psubw m2, m0, [tmpq+16*0] |
| psubw m3, m1, [tmpq+16*1] |
| add maskq, 16 |
| add tmpq, 32 |
| punpcklbw m4, m5, m6 |
| punpckhbw m5, m6 |
| pmullw m4, m7 |
| pmullw m5, m7 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8 |
| RET |
| .w16: |
| mova m5, [maskq] |
| mova m0, [dstq+16*0] |
| mova m1, [dstq+16*1] |
| psubw m2, m0, [tmpq+16*0] |
| psubw m3, m1, [tmpq+16*1] |
| add maskq, 16 |
| add tmpq, 32 |
| punpcklbw m4, m5, m6 |
| punpckhbw m5, m6 |
| pmullw m4, m7 |
| pmullw m5, m7 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| add dstq, strideq |
| dec hd |
| jg .w16 |
| RET |
| .w32: |
| mova m5, [maskq+16*0] |
| mova m0, [dstq+16*0] |
| mova m1, [dstq+16*1] |
| psubw m2, m0, [tmpq+16*0] |
| psubw m3, m1, [tmpq+16*1] |
| punpcklbw m4, m5, m6 |
| punpckhbw m5, m6 |
| pmullw m4, m7 |
| pmullw m5, m7 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| mova m5, [maskq+16*1] |
| mova m0, [dstq+16*2] |
| mova m1, [dstq+16*3] |
| psubw m2, m0, [tmpq+16*2] |
| psubw m3, m1, [tmpq+16*3] |
| add maskq, 32 |
| add tmpq, 64 |
| punpcklbw m4, m5, m6 |
| punpckhbw m5, m6 |
| pmullw m4, m7 |
| pmullw m5, m7 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova [dstq+16*2], m0 |
| mova [dstq+16*3], m1 |
| add dstq, strideq |
| dec hd |
| jg .w32 |
| RET |
| |
| cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h |
| %define base r5-blend_v_ssse3_table |
| LEA r5, blend_v_ssse3_table |
| tzcnt wd, wm |
| movifnidn hd, hm |
| movsxd wq, [r5+wq*4] |
| add wq, r5 |
| jmp wq |
| .w2: |
| movd m4, [base+obmc_masks+2*2] |
| .w2_loop: |
| movd m0, [dstq+strideq*0] |
| movd m2, [tmpq+4*0] |
| movd m1, [dstq+strideq*1] |
| movd m3, [tmpq+4*1] |
| add tmpq, 4*2 |
| psubw m2, m0 |
| psubw m3, m1 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| movd [dstq+strideq*0], m0 |
| movd [dstq+strideq*1], m1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w2_loop |
| RET |
| .w4: |
| movddup m2, [base+obmc_masks+4*2] |
| .w4_loop: |
| movq m0, [dstq+strideq*0] |
| movhps m0, [dstq+strideq*1] |
| mova m1, [tmpq] |
| add tmpq, 8*2 |
| psubw m1, m0 |
| pmulhrsw m1, m2 |
| paddw m0, m1 |
| movq [dstq+strideq*0], m0 |
| movhps [dstq+strideq*1], m0 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w4_loop |
| RET |
| .w8: |
| mova m4, [base+obmc_masks+8*2] |
| .w8_loop: |
| mova m0, [dstq+strideq*0] |
| mova m2, [tmpq+16*0] |
| mova m1, [dstq+strideq*1] |
| mova m3, [tmpq+16*1] |
| add tmpq, 16*2 |
| psubw m2, m0 |
| psubw m3, m1 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m4 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova [dstq+strideq*0], m0 |
| mova [dstq+strideq*1], m1 |
| lea dstq, [dstq+strideq*2] |
| sub hd, 2 |
| jg .w8_loop |
| RET |
| .w16: |
| mova m4, [base+obmc_masks+16*2] |
| movq m5, [base+obmc_masks+16*3] |
| .w16_loop: |
| mova m0, [dstq+16*0] |
| mova m2, [tmpq+16*0] |
| mova m1, [dstq+16*1] |
| mova m3, [tmpq+16*1] |
| add tmpq, 16*2 |
| psubw m2, m0 |
| psubw m3, m1 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| add dstq, strideq |
| dec hd |
| jg .w16_loop |
| RET |
| .w32: |
| %if WIN64 |
| movaps [rsp+8], m6 |
| %endif |
| mova m4, [base+obmc_masks+16*4] |
| mova m5, [base+obmc_masks+16*5] |
| mova m6, [base+obmc_masks+16*6] |
| .w32_loop: |
| mova m0, [dstq+16*0] |
| mova m2, [tmpq+16*0] |
| mova m1, [dstq+16*1] |
| mova m3, [tmpq+16*1] |
| psubw m2, m0 |
| psubw m3, m1 |
| pmulhrsw m2, m4 |
| pmulhrsw m3, m5 |
| paddw m0, m2 |
| mova m2, [dstq+16*2] |
| paddw m1, m3 |
| mova m3, [tmpq+16*2] |
| add tmpq, 16*4 |
| psubw m3, m2 |
| pmulhrsw m3, m6 |
| paddw m2, m3 |
| mova [dstq+16*0], m0 |
| mova [dstq+16*1], m1 |
| mova [dstq+16*2], m2 |
| add dstq, strideq |
| dec hd |
| jg .w32_loop |
| %if WIN64 |
| movaps m6, [rsp+8] |
| %endif |
| RET |
| |
| %macro BLEND_H_ROW 2-3 0 ; dst_off, tmp_off, inc_tmp |
| mova m0, [dstq+16*(%1+0)] |
| mova m2, [tmpq+16*(%2+0)] |
| mova m1, [dstq+16*(%1+1)] |
| mova m3, [tmpq+16*(%2+1)] |
| %if %3 |
| add tmpq, 16*%3 |
| %endif |
| psubw m2, m0 |
| psubw m3, m1 |
| pmulhrsw m2, m5 |
| pmulhrsw m3, m5 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova [dstq+16*(%1+0)], m0 |
| mova [dstq+16*(%1+1)], m1 |
| %endmacro |
| |
| cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask |
| %define base r6-blend_h_ssse3_table |
| LEA r6, blend_h_ssse3_table |
| tzcnt wd, wm |
| mov hd, hm |
| movsxd wq, [r6+wq*4] |
| movddup m4, [base+blend_shuf] |
| lea maskq, [base+obmc_masks+hq*2] |
| lea hd, [hq*3] |
| add wq, r6 |
| shr hd, 2 ; h * 3/4 |
| lea maskq, [maskq+hq*2] |
| neg hq |
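| ; blending covers only the top h*3/4 rows; hq runs from -(h*3/4) up to 0 |
| ; while [maskq+hq*2] walks the obmc_masks entries for this block height |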
| jmp wq |
| .w2: |
| movd m0, [dstq+dsq*0] |
| movd m2, [dstq+dsq*1] |
| movd m3, [maskq+hq*2] |
| movq m1, [tmpq] |
| add tmpq, 4*2 |
| punpckldq m0, m2 |
| punpcklwd m3, m3 |
| psubw m1, m0 |
| pmulhrsw m1, m3 |
| paddw m0, m1 |
| movd [dstq+dsq*0], m0 |
| psrlq m0, 32 |
| movd [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| add hq, 2 |
| jl .w2 |
| RET |
| .w4: |
| mova m3, [base+blend_shuf] |
| .w4_loop: |
| movq m0, [dstq+dsq*0] |
| movhps m0, [dstq+dsq*1] |
| movd m2, [maskq+hq*2] |
| mova m1, [tmpq] |
| add tmpq, 8*2 |
| psubw m1, m0 |
| pshufb m2, m3 |
| pmulhrsw m1, m2 |
| paddw m0, m1 |
| movq [dstq+dsq*0], m0 |
| movhps [dstq+dsq*1], m0 |
| lea dstq, [dstq+dsq*2] |
| add hq, 2 |
| jl .w4_loop |
| RET |
| .w8: |
| movddup m5, [base+blend_shuf+8] |
| %if WIN64 |
| movaps [rsp+ 8], m6 |
| movaps [rsp+24], m7 |
| %endif |
| .w8_loop: |
| movd m7, [maskq+hq*2] |
| mova m0, [dstq+dsq*0] |
| mova m2, [tmpq+16*0] |
| mova m1, [dstq+dsq*1] |
| mova m3, [tmpq+16*1] |
| add tmpq, 16*2 |
| pshufb m6, m7, m4 |
| psubw m2, m0 |
| pshufb m7, m5 |
| psubw m3, m1 |
| pmulhrsw m2, m6 |
| pmulhrsw m3, m7 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova [dstq+dsq*0], m0 |
| mova [dstq+dsq*1], m1 |
| lea dstq, [dstq+dsq*2] |
| add hq, 2 |
| jl .w8_loop |
| %if WIN64 |
| movaps m6, [rsp+ 8] |
| movaps m7, [rsp+24] |
| %endif |
| RET |
| .w16: |
| movd m5, [maskq+hq*2] |
| pshufb m5, m4 |
| BLEND_H_ROW 0, 0, 2 |
| add dstq, dsq |
| inc hq |
| jl .w16 |
| RET |
| .w32: |
| movd m5, [maskq+hq*2] |
| pshufb m5, m4 |
| BLEND_H_ROW 0, 0 |
| BLEND_H_ROW 2, 2, 4 |
| add dstq, dsq |
| inc hq |
| jl .w32 |
| RET |
| .w64: |
| movd m5, [maskq+hq*2] |
| pshufb m5, m4 |
| BLEND_H_ROW 0, 0 |
| BLEND_H_ROW 2, 2 |
| BLEND_H_ROW 4, 4 |
| BLEND_H_ROW 6, 6, 8 |
| add dstq, dsq |
| inc hq |
| jl .w64 |
| RET |
| .w128: |
| movd m5, [maskq+hq*2] |
| pshufb m5, m4 |
| BLEND_H_ROW 0, 0 |
| BLEND_H_ROW 2, 2 |
| BLEND_H_ROW 4, 4 |
| BLEND_H_ROW 6, 6, 16 |
| BLEND_H_ROW 8, -8 |
| BLEND_H_ROW 10, -6 |
| BLEND_H_ROW 12, -4 |
| BLEND_H_ROW 14, -2 |
| add dstq, dsq |
| inc hq |
| jl .w128 |
| RET |
| |
| ; emu_edge args: |
| ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, |
| ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, |
| ; const pixel *ref, const ptrdiff_t ref_stride |
| ; |
| ; bw, bh: total size of the block to fill |
| ; iw, ih: size of the valid source area -> smaller than the block fills bottom/right |
| ; x, y: position of the block within the source -> negative values fill top/left |
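| ; |
| ; as a scalar sketch of the intended result (illustrative only; iclip(v, lo, hi) |
| ; clamps v to [lo, hi], strides given in pixels): |
| ; |
| ; for (int yy = 0; yy < bh; yy++) |
| ; for (int xx = 0; xx < bw; xx++) |
| ; dst[yy * dst_stride + xx] = |
| ; ref[iclip(y + yy, 0, ih - 1) * ref_stride + iclip(x + xx, 0, iw - 1)]; |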
| cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \ |
| y, dst, dstride, src, sstride, \ |
| bottomext, rightext, blk |
| ; we assume that the buffer stride is larger than the block width, so we can |
| ; safely overwrite a few bytes past the end of each row |
| |
| %if ARCH_X86_64 |
| %define reg_zero r12q |
| %define reg_tmp r10 |
| %define reg_src srcq |
| %define reg_bottomext bottomextq |
| %define reg_rightext rightextq |
| %define reg_blkm r9m |
| %else |
| %define reg_zero r6 |
| %define reg_tmp r0 |
| %define reg_src r1 |
| %define reg_bottomext r0 |
| %define reg_rightext r1 |
| %define reg_blkm r2m |
| %endif |
| ; |
| ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) |
| xor reg_zero, reg_zero |
| lea reg_tmp, [ihq-1] |
| cmp yq, ihq |
| cmovs reg_tmp, yq |
| test yq, yq |
| cmovs reg_tmp, reg_zero |
| %if ARCH_X86_64 |
| imul reg_tmp, sstrideq |
| add srcq, reg_tmp |
| %else |
| imul reg_tmp, sstridem |
| mov reg_src, srcm |
| add reg_src, reg_tmp |
| %endif |
| ; |
| ; ref += iclip(x, 0, iw - 1) |
| lea reg_tmp, [iwq-1] |
| cmp xq, iwq |
| cmovs reg_tmp, xq |
| test xq, xq |
| cmovs reg_tmp, reg_zero |
| lea reg_src, [reg_src+reg_tmp*2] |
| %if ARCH_X86_32 |
| mov srcm, reg_src |
| %endif |
| ; |
| ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) |
| %if ARCH_X86_32 |
| mov r1, r1m ; restore bh |
| %endif |
| lea reg_bottomext, [yq+bhq] |
| sub reg_bottomext, ihq |
| lea r3, [bhq-1] |
| cmovs reg_bottomext, reg_zero |
| ; |
| |
| DEFINE_ARGS bw, bh, iw, ih, x, \ |
| topext, dst, dstride, src, sstride, \ |
| bottomext, rightext, blk |
| |
| ; top_ext = iclip(-y, 0, bh - 1) |
| neg topextq |
| cmovs topextq, reg_zero |
| cmp reg_bottomext, bhq |
| cmovns reg_bottomext, r3 |
| cmp topextq, bhq |
| cmovg topextq, r3 |
| %if ARCH_X86_32 |
| mov r4m, reg_bottomext |
| ; |
| ; right_ext = iclip(x + bw - iw, 0, bw - 1) |
| mov r0, r0m ; restore bw |
| %endif |
| lea reg_rightext, [xq+bwq] |
| sub reg_rightext, iwq |
| lea r2, [bwq-1] |
| cmovs reg_rightext, reg_zero |
| |
| DEFINE_ARGS bw, bh, iw, ih, leftext, \ |
| topext, dst, dstride, src, sstride, \ |
| bottomext, rightext, blk |
| |
| ; left_ext = iclip(-x, 0, bw - 1) |
| neg leftextq |
| cmovs leftextq, reg_zero |
| cmp reg_rightext, bwq |
| cmovns reg_rightext, r2 |
| %if ARCH_X86_32 |
| mov r3m, r1 |
| %endif |
| cmp leftextq, bwq |
| cmovns leftextq, r2 |
| |
| %undef reg_zero |
| %undef reg_tmp |
| %undef reg_src |
| %undef reg_bottomext |
| %undef reg_rightext |
| |
| DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ |
| topext, dst, dstride, src, sstride, \ |
| bottomext, rightext, blk |
| |
| ; center_h = bh - top_ext - bottom_ext |
| %if ARCH_X86_64 |
| lea r3, [bottomextq+topextq] |
| sub centerhq, r3 |
| %else |
| mov r1, centerhm ; restore r1 |
| sub centerhq, topextq |
| sub centerhq, r4m |
| mov r1m, centerhq |
| %endif |
| ; |
| ; blk += top_ext * PXSTRIDE(dst_stride) |
| mov r2, topextq |
| %if ARCH_X86_64 |
| imul r2, dstrideq |
| %else |
| mov r6, r6m ; restore dstq |
| imul r2, dstridem |
| %endif |
| add dstq, r2 |
| mov reg_blkm, dstq ; save pointer for ext |
| ; |
| ; center_w = bw - left_ext - right_ext |
| mov centerwq, bwq |
| %if ARCH_X86_64 |
| lea r3, [rightextq+leftextq] |
| sub centerwq, r3 |
| %else |
| sub centerwq, r3m |
| sub centerwq, leftextq |
| %endif |
| |
| ; v_loop macro |
| %macro v_loop 3 ; need_left_ext, need_right_ext, suffix |
| %if ARCH_X86_64 |
| %define reg_tmp r12 |
| %else |
| %define reg_tmp r0 |
| %endif |
| .v_loop_%3: |
| %if ARCH_X86_32 |
| mov r0, r0m |
| mov r1, r1m |
| %endif |
| %if %1 |
| ; left extension |
| %if ARCH_X86_64 |
| movd m0, [srcq] |
| %else |
| mov r3, srcm |
| movd m0, [r3] |
| %endif |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| xor r3, r3 |
| .left_loop_%3: |
| mova [dstq+r3*2], m0 |
| add r3, mmsize/2 |
| cmp r3, leftextq |
| jl .left_loop_%3 |
| ; body |
| lea reg_tmp, [dstq+leftextq*2] |
| %endif |
| xor r3, r3 |
| .body_loop_%3: |
| %if ARCH_X86_64 |
| movu m0, [srcq+r3*2] |
| %else |
| mov r1, srcm |
| movu m0, [r1+r3*2] |
| %endif |
| %if %1 |
| movu [reg_tmp+r3*2], m0 |
| %else |
| movu [dstq+r3*2], m0 |
| %endif |
| add r3, mmsize/2 |
| cmp r3, centerwq |
| jl .body_loop_%3 |
| %if %2 |
| ; right extension |
| %if %1 |
| lea reg_tmp, [reg_tmp+centerwq*2] |
| %else |
| lea reg_tmp, [dstq+centerwq*2] |
| %endif |
| %if ARCH_X86_64 |
| movd m0, [srcq+centerwq*2-2] |
| %else |
| mov r3, srcm |
| movd m0, [r3+centerwq*2-2] |
| %endif |
| pshuflw m0, m0, q0000 |
| punpcklqdq m0, m0 |
| xor r3, r3 |
| .right_loop_%3: |
| movu [reg_tmp+r3*2], m0 |
| add r3, mmsize/2 |
| %if ARCH_X86_64 |
| cmp r3, rightextq |
| %else |
| cmp r3, r3m |
| %endif |
| jl .right_loop_%3 |
| %endif |
| %if ARCH_X86_64 |
| add dstq, dstrideq |
| add srcq, sstrideq |
| dec centerhq |
| jg .v_loop_%3 |
| %else |
| add dstq, dstridem |
| mov r0, sstridem |
| add srcm, r0 |
| sub dword centerhm, 1 |
| jg .v_loop_%3 |
| mov r0, r0m ; restore r0 |
| %endif |
| %endmacro ; v_loop macro |
| |
| test leftextq, leftextq |
| jnz .need_left_ext |
| %if ARCH_X86_64 |
| test rightextq, rightextq |
| jnz .need_right_ext |
| %else |
| cmp leftextq, r3m ; leftextq == 0 here, so this tests rightext |
| jne .need_right_ext |
| %endif |
| v_loop 0, 0, 0 |
| jmp .body_done |
| |
| ; left/right extensions |
| .need_left_ext: |
| %if ARCH_X86_64 |
| test rightextq, rightextq |
| %else |
| mov r3, r3m |
| test r3, r3 |
| %endif |
| jnz .need_left_right_ext |
| v_loop 1, 0, 1 |
| jmp .body_done |
| |
| .need_left_right_ext: |
| v_loop 1, 1, 2 |
| jmp .body_done |
| |
| .need_right_ext: |
| v_loop 0, 1, 3 |
| |
| .body_done: |
| ; r0: bw |
| ; r1: x loop counter |
| ; r4: y loop counter |
| ; r5: topextq |
| ; r6: dstq |
| ; r7: dstrideq |
| ; r8: srcq |
| %if ARCH_X86_64 |
| %define reg_dstride dstrideq |
| %else |
| %define reg_dstride r2 |
| %endif |
| ; |
| ; bottom edge extension |
| %if ARCH_X86_64 |
| test bottomextq, bottomextq |
| jz .top |
| %else |
| xor r1, r1 |
| cmp r1, r4m |
| je .top |
| %endif |
| ; |
| %if ARCH_X86_64 |
| mov srcq, dstq |
| sub srcq, dstrideq |
| xor r1, r1 |
| %else |
| mov r3, dstq |
| mov reg_dstride, dstridem |
| sub r3, reg_dstride |
| mov srcm, r3 |
| %endif |
| ; |
| .bottom_x_loop: |
| %if ARCH_X86_64 |
| mova m0, [srcq+r1*2] |
| lea r3, [dstq+r1*2] |
| mov r4, bottomextq |
| %else |
| mov r3, srcm |
| mova m0, [r3+r1*2] |
| lea r3, [dstq+r1*2] |
| mov r4, r4m |
| %endif |
| ; |
| .bottom_y_loop: |
| mova [r3], m0 |
| add r3, reg_dstride |
| dec r4 |
| jg .bottom_y_loop |
| add r1, mmsize/2 |
| cmp r1, bwq |
| jl .bottom_x_loop |
| |
| .top: |
| ; top edge extension |
| test topextq, topextq |
| jz .end |
| %if ARCH_X86_64 |
| mov srcq, reg_blkm |
| %else |
| mov r3, reg_blkm |
| mov reg_dstride, dstridem |
| %endif |
| mov dstq, dstm |
| xor r1, r1 |
| ; |
| .top_x_loop: |
| %if ARCH_X86_64 |
| mova m0, [srcq+r1*2] |
| %else |
| mov r3, reg_blkm |
| mova m0, [r3+r1*2] |
| %endif |
| lea r3, [dstq+r1*2] |
| mov r4, topextq |
| ; |
| .top_y_loop: |
| mova [r3], m0 |
| add r3, reg_dstride |
| dec r4 |
| jg .top_y_loop |
| add r1, mmsize/2 |
| cmp r1, bwq |
| jl .top_x_loop |
| |
| .end: |
| RET |
| |
| %undef reg_dstride |
| %undef reg_blkm |
| %undef reg_tmp |
| |
| %macro SCRATCH 3 |
| %if ARCH_X86_32 |
| mova [rsp+%3*mmsize], m%1 |
| %define m%2 [rsp+%3*mmsize] |
| %else |
| SWAP %1, %2 |
| %endif |
| %endmacro |
| |
| %if ARCH_X86_64 |
| cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \ |
| dst_w, h, src_w, dx, mx0, pxmax |
| %elif STACK_ALIGNMENT >= 16 |
| cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \ |
| dst_w, h, src_w, dx, mx0, pxmax |
| %else |
| cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \ |
| dst_w, h, src_w, dx, mx0, pxmax |
| %endif |
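| ; horizontal scaling only; source positions are tracked in 14-bit fixed point. |
| ; A rough scalar sketch of the per-pixel position handling in .loop_x below |
| ; (illustrative only, using the original src_w/mx0 arguments): |
| ; |
| ; int pos = mx0 - (4 << 14) + x * dx; // window centered on the 8 taps |
| ; int clipped = iclip(pos, 0, (src_w - 8) << 14); // keep the 8-tap load in bounds |
| ; int src_x = clipped >> 14; // base column of the load |
| ; int emu = (pos - clipped) >> 14; // resize_shuf offset for edge replication |
| ; int phase = (pos >> 8) & 63; // one of 64 filters in resize_filter |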
| movifnidn dstq, dstmp |
| movifnidn srcq, srcmp |
| %if STACK_ALIGNMENT >= 16 |
| movifnidn dst_wd, dst_wm |
| %endif |
| %if ARCH_X86_64 |
| movifnidn hd, hm |
| %endif |
| sub dword mx0m, 4<<14 |
| sub dword src_wm, 8 |
| movd m4, pxmaxm |
| movd m7, dxm |
| movd m6, mx0m |
| movd m5, src_wm |
| punpcklwd m4, m4 |
| pshufd m4, m4, q0000 |
| pshufd m7, m7, q0000 |
| pshufd m6, m6, q0000 |
| pshufd m5, m5, q0000 |
| mova [rsp+16*3*ARCH_X86_32], m4 |
| %if ARCH_X86_64 |
| DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x |
| LEA r7, $$ |
| %define base r7-$$ |
| %else |
| DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x |
| %define hd dword r5m |
| %if STACK_ALIGNMENT >= 16 |
| LEA r6, $$ |
| %define base r6-$$ |
| %else |
| LEA r4, $$ |
| %define base r4-$$ |
| %endif |
| %endif |
| %if ARCH_X86_64 |
| mova m12, [base+pd_64] |
| mova m11, [base+pd_63] |
| %else |
| %define m12 [base+pd_64] |
| %define m11 [base+pd_63] |
| %endif |
| pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] |
| pslld m7, 2 ; dx*4 |
| pslld m5, 14 |
| paddd m6, m4 ; mx+[0..3]*dx |
| SCRATCH 7, 15, 0 |
| SCRATCH 6, 14, 1 |
| SCRATCH 5, 13, 2 |
| pxor m1, m1 |
| .loop_y: |
| xor xd, xd |
| mova m0, m14 ; per-line working version of mx |
| .loop_x: |
| pcmpgtd m1, m0 |
| pandn m1, m0 |
| psrad m2, m0, 8 ; filter offset (unmasked) |
| pcmpgtd m3, m13, m1 |
| pand m1, m3 |
| pandn m3, m13 |
| por m1, m3 |
| psubd m3, m0, m1 ; pshufb offset |
| psrad m1, 14 ; clipped src_x offset |
| psrad m3, 14 ; pshufb edge_emu offset |
| pand m2, m11 ; filter offset (masked) |
| ; load source pixels |
| %if ARCH_X86_64 |
| movd r8d, m1 |
| pshuflw m1, m1, q3232 |
| movd r9d, m1 |
| punpckhqdq m1, m1 |
| movd r10d, m1 |
| psrlq m1, 32 |
| movd r11d, m1 |
| movu m4, [srcq+r8*2] |
| movu m5, [srcq+r9*2] |
| movu m6, [srcq+r10*2] |
| movu m7, [srcq+r11*2] |
| ; if no emulation is required, we don't need to shuffle or emulate edges |
| packssdw m3, m3 |
| movq r11, m3 |
| test r11, r11 |
| jz .filter |
| movsx r8, r11w |
| sar r11, 16 |
| movsx r9, r11w |
| sar r11, 16 |
| movsx r10, r11w |
| sar r11, 16 |
| movu m1, [base+resize_shuf+8+r8*2] |
| movu m3, [base+resize_shuf+8+r9*2] |
| movu m8, [base+resize_shuf+8+r10*2] |
| movu m9, [base+resize_shuf+8+r11*2] |
| pshufb m4, m1 |
| pshufb m5, m3 |
| pshufb m6, m8 |
| pshufb m7, m9 |
| .filter: |
| movd r8d, m2 |
| pshuflw m2, m2, q3232 |
| movd r9d, m2 |
| punpckhqdq m2, m2 |
| movd r10d, m2 |
| psrlq m2, 32 |
| movd r11d, m2 |
| movq m8, [base+resize_filter+r8*8] |
| movq m2, [base+resize_filter+r9*8] |
| pxor m9, m9 |
| punpcklbw m1, m9, m8 |
| punpcklbw m3, m9, m2 |
| psraw m1, 8 |
| psraw m3, 8 |
| movq m10, [base+resize_filter+r10*8] |
| movq m2, [base+resize_filter+r11*8] |
| punpcklbw m8, m9, m10 |
| punpcklbw m9, m2 |
| psraw m8, 8 |
| psraw m9, 8 |
| pmaddwd m4, m1 |
| pmaddwd m5, m3 |
| pmaddwd m6, m8 |
| pmaddwd m7, m9 |
| phaddd m4, m5 |
| %else |
| movd r3, m1 |
| pshuflw m1, m1, q3232 |
| movd r1, m1 |
| punpckhqdq m1, m1 |
| movu m4, [srcq+r3*2] |
| movu m5, [srcq+r1*2] |
| movd r3, m1 |
| psrlq m1, 32 |
| movd r1, m1 |
| movu m6, [srcq+r3*2] |
| movu m7, [srcq+r1*2] |
| ; if no emulation is required, we don't need to shuffle or emulate edges |
| pxor m1, m1 |
| pcmpeqb m1, m3 |
| pmovmskb r3d, m1 |
| cmp r3d, 0xffff |
| je .filter |
| movd r3, m3 |
| movu m1, [base+resize_shuf+8+r3*2] |
| pshuflw m3, m3, q3232 |
| movd r1, m3 |
| pshufb m4, m1 |
| movu m1, [base+resize_shuf+8+r1*2] |
| punpckhqdq m3, m3 |
| movd r3, m3 |
| pshufb m5, m1 |
| movu m1, [base+resize_shuf+8+r3*2] |
| psrlq m3, 32 |
| movd r1, m3 |
| pshufb m6, m1 |
| movu m1, [base+resize_shuf+8+r1*2] |
| pshufb m7, m1 |
| .filter: |
| mova [esp+4*16], m6 |
| mova [esp+5*16], m7 |
| movd r3, m2 |
| pshuflw m2, m2, q3232 |
| movd r1, m2 |
| movq m6, [base+resize_filter+r3*8] |
| movq m7, [base+resize_filter+r1*8] |
| pxor m3, m3 |
| punpcklbw m1, m3, m6 |
| punpcklbw m3, m7 |
| psraw m1, 8 |
| psraw m3, 8 |
| pmaddwd m4, m1 |
| pmaddwd m5, m3 |
| punpckhqdq m2, m2 |
| movd r3, m2 |
| psrlq m2, 32 |
| movd r1, m2 |
| phaddd m4, m5 |
| movq m2, [base+resize_filter+r3*8] |
| movq m5, [base+resize_filter+r1*8] |
| mova m6, [esp+4*16] |
| mova m7, [esp+5*16] |
| pxor m3, m3 |
| punpcklbw m1, m3, m2 |
| punpcklbw m3, m5 |
| psraw m1, 8 |
| psraw m3, 8 |
| pmaddwd m6, m1 |
| pmaddwd m7, m3 |
| %endif |
| phaddd m6, m7 |
| phaddd m4, m6 |
| pxor m1, m1 |
| psubd m2, m12, m4 |
| psrad m2, 7 |
| packssdw m2, m2 |
| pmaxsw m2, m1 |
| pminsw m2, [rsp+16*3*ARCH_X86_32] |
| movq [dstq+xq*2], m2 |
| paddd m0, m15 |
| add xd, 4 |
| %if STACK_ALIGNMENT >= 16 |
| cmp xd, dst_wd |
| %else |
| cmp xd, dst_wm |
| %endif |
| jl .loop_x |
| add dstq, dst_stridemp |
| add srcq, src_stridemp |
| dec hd |
| jg .loop_y |
| RET |