| ; Copyright © 2018, VideoLAN and dav1d authors |
| ; Copyright © 2018, Two Orioles, LLC |
| ; Copyright © 2019, VideoLabs |
| ; All rights reserved. |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions are met: |
| ; |
| ; 1. Redistributions of source code must retain the above copyright notice, this |
| ; list of conditions and the following disclaimer. |
| ; |
| ; 2. Redistributions in binary form must reproduce the above copyright notice, |
| ; this list of conditions and the following disclaimer in the documentation |
| ; and/or other materials provided with the distribution. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "config.asm" |
| %include "ext/x86/x86inc.asm" |
| |
| SECTION_RODATA 16 |
| |
| %macro DUP8 1-* |
| %rep %0 |
| times 8 db %1 |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
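| ; scaled reciprocals used by cdef_dir to weight the squared partial sums by |
| ; their element count (840/n resp. 420/n, bottoming out at 105); the sse4 |
| ; table is stored as dwords for pmulld, the ssse3/sse2 one as duplicated |
| ; words for the pmullw/pmulhuw emulation in MULLD |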
| div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105 |
| dd 420, 210, 140, 105, 105, 105, 105, 105 |
| div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 |
| dw 168, 168, 140, 140, 120, 120, 105, 105 |
| dw 420, 420, 210, 210, 140, 140, 105, 105 |
| dw 105, 105, 105, 105, 105, 105, 105, 105 |
| const shufw_6543210x, \ |
| db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 |
| shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 |
| pw_8: times 8 dw 8 |
| pw_128: times 8 dw 128 |
| pw_256: times 8 dw 256 |
| pw_2048: times 8 dw 2048 |
| pw_0x7FFF: times 8 dw 0x7FFF |
| pw_0x8000: times 8 dw 0x8000 |
| tap_table: ; masks for 8-bit shift emulation |
| DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 |
| ; weights: pri_taps {4, 2} / {3, 3}, sec_taps {2, 1} |
| DUP8 4, 2, 3, 3, 2, 1 |
| ; tap offsets (y*16 + x, in words) for each direction |
| db -1 * 16 + 1, -2 * 16 + 2 |
| db 0 * 16 + 1, -1 * 16 + 2 |
| db 0 * 16 + 1, 0 * 16 + 2 |
| db 0 * 16 + 1, 1 * 16 + 2 |
| db 1 * 16 + 1, 2 * 16 + 2 |
| db 1 * 16 + 0, 2 * 16 + 1 |
| db 1 * 16 + 0, 2 * 16 + 0 |
| db 1 * 16 + 0, 2 * 16 - 1 |
| ; the last 6 are repeats of the first 6 so we don't need to & 7 |
| db -1 * 16 + 1, -2 * 16 + 2 |
| db 0 * 16 + 1, -1 * 16 + 2 |
| db 0 * 16 + 1, 0 * 16 + 2 |
| db 0 * 16 + 1, 1 * 16 + 2 |
| db 1 * 16 + 1, 2 * 16 + 2 |
| db 1 * 16 + 0, 2 * 16 + 1 |
| |
| SECTION .text |
| |
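| ; mov that is only emitted on x86-32 (x86-64 has enough registers to keep |
| ; the value live) |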
| %macro movif32 2 |
| %if ARCH_X86_32 |
| mov %1, %2 |
| %endif |
| %endmacro |
| |
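| ; zero-extend 8 packed bytes to words (4 when %3 == 1); without sse4, or |
| ; when %3 == 1, fall back to movq/movd + punpcklbw with m7, which must |
| ; contain zero |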
| %macro PMOVZXBW 2-3 0 ; %3 = half |
| %if cpuflag(sse4) && %3 == 0 |
| pmovzxbw %1, %2 |
| %else |
| %if %3 == 1 |
| movd %1, %2 |
| %else |
| movq %1, %2 |
| %endif |
| punpcklbw %1, m7 |
| %endif |
| %endmacro |
| |
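| ; broadcast the low byte of %1 to the whole register; on ssse3+ %2 is used |
| ; as an all-zero pshufb mask, older targets emulate it with punpck/pshuflw |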
| %macro PSHUFB_0 2 |
| %if cpuflag(ssse3) |
| pshufb %1, %2 |
| %else |
| punpcklbw %1, %1 |
| pshuflw %1, %1, q0000 |
| punpcklqdq %1, %1 |
| %endif |
| %endmacro |
| |
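| ; broadcast a 64-bit load to both halves of the register (movddup on ssse3, |
| ; movq + punpcklqdq otherwise) |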
| %macro MOVDDUP 2 |
| %if cpuflag(ssse3) |
| movddup %1, %2 |
| %else |
| movq %1, %2 |
| punpcklqdq %1, %1 |
| %endif |
| %endmacro |
| |
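| ; load the two pixels at +off/-off for one tap from the padded block in stk, |
| ; optionally fold them into the clipping max/min (m7/m8), then add |
| ; constrain(p - px) * tap to the running sum in m0 |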
| %macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax |
| ; load p0/p1 |
| movsx offq, byte [dirq+kq+%1+14*8] ; off1 |
| %if %6 == 4 |
| movq m5, [stkq+offq*2+32*0] ; p0 |
| movhps m5, [stkq+offq*2+32*1] |
| %else |
| movu m5, [stkq+offq*2+32*0] ; p0 |
| %endif |
| neg offq ; -off1 |
| %if %6 == 4 |
| movq m6, [stkq+offq*2+32*0] ; p1 |
| movhps m6, [stkq+offq*2+32*1] |
| %else |
| movu m6, [stkq+offq*2+32*0] ; p1 |
| %endif |
| %if %7 |
| %if cpuflag(sse4) |
| ; out of bounds values are set to a value that is both a large unsigned |
| ; value and a negative signed value. |
| ; use signed max and unsigned min to remove them |
| pmaxsw m7, m5 |
| pminuw m8, m5 |
| pmaxsw m7, m6 |
| pminuw m8, m6 |
| %else |
| pcmpeqw m3, m14, m5 |
| pminsw m8, m5 ; min after p0 |
| pandn m3, m5 |
| pmaxsw m7, m3 ; max after p0 |
| pcmpeqw m3, m14, m6 |
| pminsw m8, m6 ; min after p1 |
| pandn m3, m6 |
| pmaxsw m7, m3 ; max after p1 |
| %endif |
| %endif |
| |
| ; accumulate sum[m0] over p0/p1 |
| psubw m5, m4 ; diff_p0(p0 - px) |
| psubw m6, m4 ; diff_p1(p1 - px) |
| packsswb m5, m6 ; convert pixel diff to 8-bit |
| %if cpuflag(ssse3) |
| pshufb m5, m13 ; group diffs p0 and p1 into pairs |
| pabsb m6, m5 |
| psignb m3, %5, m5 |
| %else |
| movlhps m6, m5 |
| punpckhbw m6, m5 |
| pxor m5, m5 |
| pcmpgtb m5, m6 |
| paddb m6, m5 |
| pxor m6, m5 |
| paddb m3, %5, m5 |
| pxor m3, m5 |
| %endif |
| pand m9, %3, m6 ; emulate 8-bit shift |
| psrlw m9, %2 |
| psubusb m5, %4, m9 |
| pminub m5, m6 ; constrain(diff_p) |
| %if cpuflag(ssse3) |
| pmaddubsw m5, m3 ; constrain(diff_p) * taps |
| %else |
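| ; emulate pmaddubsw: multiply the low and high byte of each word |
| ; separately and add the two products |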
| psrlw m9, m5, 8 |
| psraw m6, m3, 8 |
| psllw m5, 8 |
| psllw m3, 8 |
| pmullw m9, m6 |
| pmulhw m5, m3 |
| paddw m5, m9 |
| %endif |
| paddw m0, m5 |
| %endmacro |
| |
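| ; zero-extend four rows of the source block (plus the two pixels of right |
| ; padding; only used when have_right is set) into the 16-bit line buffer, |
| ; one 32-byte slot per row |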
| %macro LOAD_BODY 3 ; dst, src, block_width |
| %if %3 == 4 |
| PMOVZXBW m0, [%2+strideq*0] |
| PMOVZXBW m1, [%2+strideq*1] |
| PMOVZXBW m2, [%2+strideq*2] |
| PMOVZXBW m3, [%2+stride3q] |
| mova [%1+32*0], m0 |
| mova [%1+32*1], m1 |
| mova [%1+32*2], m2 |
| mova [%1+32*3], m3 |
| %else |
| movu m0, [%2+strideq*0] |
| movu m1, [%2+strideq*1] |
| movu m2, [%2+strideq*2] |
| movu m3, [%2+stride3q] |
| punpcklbw m4, m0, m7 |
| punpckhbw m0, m7 |
| mova [%1+32*0+ 0], m4 |
| mova [%1+32*0+16], m0 |
| punpcklbw m4, m1, m7 |
| punpckhbw m1, m7 |
| mova [%1+32*1+ 0], m4 |
| mova [%1+32*1+16], m1 |
| punpcklbw m4, m2, m7 |
| punpckhbw m2, m7 |
| mova [%1+32*2+ 0], m4 |
| mova [%1+32*2+16], m2 |
| punpcklbw m4, m3, m7 |
| punpckhbw m3, m7 |
| mova [%1+32*3+ 0], m4 |
| mova [%1+32*3+16], m3 |
| %endif |
| %endmacro |
| |
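| ; round the accumulated sum ((sum - (sum < 0) + 8) >> 4), add it to px, |
| ; optionally clamp to the tracked min/max, and store two rows (w == 4) or |
| ; one row (w == 8) |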
| %macro CDEF_FILTER_END 2 ; w, minmax |
| pxor m6, m6 |
| pcmpgtw m6, m0 |
| paddw m0, m6 |
| %if cpuflag(ssse3) |
| pmulhrsw m0, m15 |
| %else |
| paddw m0, m15 |
| psraw m0, 4 |
| %endif |
| paddw m4, m0 |
| %if %2 |
| pminsw m4, m7 |
| pmaxsw m4, m8 |
| %endif |
| packuswb m4, m4 |
| %if %1 == 4 |
| movd [dstq+strideq*0], m4 |
| psrlq m4, 32 |
| movd [dstq+strideq*1], m4 |
| add stkq, 32*2 |
| lea dstq, [dstq+strideq*2] |
| %else |
| movq [dstq], m4 |
| add stkq, 32 |
| add dstq, strideq |
| %endif |
| %endmacro |
| |
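| ; cdef_filter_WxH: the block is expanded to 16 bits in a stack buffer (px, |
| ; 32 bytes per line) with a 2-pixel border; unavailable border pixels are |
| ; filled with OUT_OF_BOUNDS_MEM so they can be recognized and excluded from |
| ; the min/max clipping, then the primary and secondary taps are accumulated |
| ; for every pixel along the requested filter direction |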
| %macro CDEF_FILTER 2 ; w, h |
| %if ARCH_X86_64 |
| cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \ |
| dst, stride, left, top, bot, pri, dst4, edge, \ |
| stride3 |
| %define px rsp+3*16+2*32 |
| %define base 0 |
| %else |
| cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ |
| dst, stride, left, edge, stride3 |
| %define topq r2 |
| %define botq r2 |
| %define dst4q r2 |
| LEA r5, tap_table |
| %define px esp+7*16+2*32 |
| %define base r5-tap_table |
| %endif |
| mov edged, r9m |
| %if cpuflag(sse4) |
| %define OUT_OF_BOUNDS_MEM [base+pw_0x8000] |
| %else |
| %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF] |
| %endif |
| mova m6, OUT_OF_BOUNDS_MEM |
| pxor m7, m7 |
| |
| ; prepare pixel buffers - body/right |
| %if %2 == 8 |
| lea dst4q, [dstq+strideq*4] |
| %endif |
| lea stride3q, [strideq*3] |
| test edgeb, 2 ; have_right |
| jz .no_right |
| LOAD_BODY px, dstq, %1 |
| %if %2 == 8 |
| LOAD_BODY px+4*32, dst4q, %1 |
| %endif |
| jmp .body_done |
| .no_right: |
| PMOVZXBW m0, [dstq+strideq*0], %1 == 4 |
| PMOVZXBW m1, [dstq+strideq*1], %1 == 4 |
| PMOVZXBW m2, [dstq+strideq*2], %1 == 4 |
| PMOVZXBW m3, [dstq+stride3q ], %1 == 4 |
| mova [px+32*0], m0 |
| mova [px+32*1], m1 |
| mova [px+32*2], m2 |
| mova [px+32*3], m3 |
| movd [px+32*0+%1*2], m6 |
| movd [px+32*1+%1*2], m6 |
| movd [px+32*2+%1*2], m6 |
| movd [px+32*3+%1*2], m6 |
| %if %2 == 8 |
| PMOVZXBW m0, [dst4q+strideq*0], %1 == 4 |
| PMOVZXBW m1, [dst4q+strideq*1], %1 == 4 |
| PMOVZXBW m2, [dst4q+strideq*2], %1 == 4 |
| PMOVZXBW m3, [dst4q+stride3q ], %1 == 4 |
| mova [px+32*4], m0 |
| mova [px+32*5], m1 |
| mova [px+32*6], m2 |
| mova [px+32*7], m3 |
| movd [px+32*4+%1*2], m6 |
| movd [px+32*5+%1*2], m6 |
| movd [px+32*6+%1*2], m6 |
| movd [px+32*7+%1*2], m6 |
| %endif |
| .body_done: |
| |
| ; top |
| movifnidn topq, r3mp |
| test edgeb, 4 ; have_top |
| jz .no_top |
| test edgeb, 1 ; have_left |
| jz .top_no_left |
| test edgeb, 2 ; have_right |
| jz .top_no_right |
| %if %1 == 4 |
| PMOVZXBW m0, [topq+strideq*0-2] |
| PMOVZXBW m1, [topq+strideq*1-2] |
| %else |
| movu m0, [topq+strideq*0-4] |
| movu m1, [topq+strideq*1-4] |
| punpckhbw m2, m0, m7 |
| punpcklbw m0, m7 |
| punpckhbw m3, m1, m7 |
| punpcklbw m1, m7 |
| movu [px-32*2+8], m2 |
| movu [px-32*1+8], m3 |
| %endif |
| movu [px-32*2-%1], m0 |
| movu [px-32*1-%1], m1 |
| jmp .top_done |
| .top_no_right: |
| %if %1 == 4 |
| PMOVZXBW m0, [topq+strideq*0-%1] |
| PMOVZXBW m1, [topq+strideq*1-%1] |
| movu [px-32*2-8], m0 |
| movu [px-32*1-8], m1 |
| %else |
| movu m0, [topq+strideq*0-%1] |
| movu m1, [topq+strideq*1-%1] |
| punpckhbw m2, m0, m7 |
| punpcklbw m0, m7 |
| punpckhbw m3, m1, m7 |
| punpcklbw m1, m7 |
| mova [px-32*2-16], m0 |
| mova [px-32*2+ 0], m2 |
| mova [px-32*1-16], m1 |
| mova [px-32*1+ 0], m3 |
| %endif |
| movd [px-32*2+%1*2], m6 |
| movd [px-32*1+%1*2], m6 |
| jmp .top_done |
| .top_no_left: |
| test edgeb, 2 ; have_right |
| jz .top_no_left_right |
| %if %1 == 4 |
| PMOVZXBW m0, [topq+strideq*0] |
| PMOVZXBW m1, [topq+strideq*1] |
| %else |
| movu m0, [topq+strideq*0] |
| movu m1, [topq+strideq*1] |
| punpckhbw m2, m0, m7 |
| punpcklbw m0, m7 |
| punpckhbw m3, m1, m7 |
| punpcklbw m1, m7 |
| movd [px-32*2+16], m2 |
| movd [px-32*1+16], m3 |
| %endif |
| movd [px-32*2- 4], m6 |
| movd [px-32*1- 4], m6 |
| mova [px-32*2+ 0], m0 |
| mova [px-32*1+ 0], m1 |
| jmp .top_done |
| .top_no_left_right: |
| PMOVZXBW m0, [topq+strideq*0], %1 == 4 |
| PMOVZXBW m1, [topq+strideq*1], %1 == 4 |
| movd [px-32*2-4], m6 |
| movd [px-32*1-4], m6 |
| mova [px-32*2+0], m0 |
| mova [px-32*1+0], m1 |
| movd [px-32*2+%1*2], m6 |
| movd [px-32*1+%1*2], m6 |
| jmp .top_done |
| .no_top: |
| movu [px-32*2- 4], m6 |
| movu [px-32*1- 4], m6 |
| %if %1 == 8 |
| movq [px-32*2+12], m6 |
| movq [px-32*1+12], m6 |
| %endif |
| .top_done: |
| |
| ; left |
| test edgeb, 1 ; have_left |
| jz .no_left |
| movifnidn leftq, leftmp |
| %if %2 == 4 |
| movq m0, [leftq] |
| %else |
| movu m0, [leftq] |
| %endif |
| %if %2 == 4 |
| punpcklbw m0, m7 |
| %else |
| punpckhbw m1, m0, m7 |
| punpcklbw m0, m7 |
| movhlps m3, m1 |
| movd [px+32*4-4], m1 |
| movd [px+32*6-4], m3 |
| psrlq m1, 32 |
| psrlq m3, 32 |
| movd [px+32*5-4], m1 |
| movd [px+32*7-4], m3 |
| %endif |
| movhlps m2, m0 |
| movd [px+32*0-4], m0 |
| movd [px+32*2-4], m2 |
| psrlq m0, 32 |
| psrlq m2, 32 |
| movd [px+32*1-4], m0 |
| movd [px+32*3-4], m2 |
| jmp .left_done |
| .no_left: |
| movd [px+32*0-4], m6 |
| movd [px+32*1-4], m6 |
| movd [px+32*2-4], m6 |
| movd [px+32*3-4], m6 |
| %if %2 == 8 |
| movd [px+32*4-4], m6 |
| movd [px+32*5-4], m6 |
| movd [px+32*6-4], m6 |
| movd [px+32*7-4], m6 |
| %endif |
| .left_done: |
| |
| ; bottom |
| movifnidn botq, r4mp |
| test edgeb, 8 ; have_bottom |
| jz .no_bottom |
| test edgeb, 1 ; have_left |
| jz .bottom_no_left |
| test edgeb, 2 ; have_right |
| jz .bottom_no_right |
| %if %1 == 4 |
| PMOVZXBW m0, [botq+strideq*0-(%1/2)] |
| PMOVZXBW m1, [botq+strideq*1-(%1/2)] |
| %else |
| movu m0, [botq+strideq*0-4] |
| movu m1, [botq+strideq*1-4] |
| punpckhbw m2, m0, m7 |
| punpcklbw m0, m7 |
| punpckhbw m3, m1, m7 |
| punpcklbw m1, m7 |
| movu [px+32*(%2+0)+8], m2 |
| movu [px+32*(%2+1)+8], m3 |
| %endif |
| movu [px+32*(%2+0)-%1], m0 |
| movu [px+32*(%2+1)-%1], m1 |
| jmp .bottom_done |
| .bottom_no_right: |
| %if %1 == 4 |
| PMOVZXBW m0, [botq+strideq*0-4] |
| PMOVZXBW m1, [botq+strideq*1-4] |
| movu [px+32*(%2+0)-8], m0 |
| movu [px+32*(%2+1)-8], m1 |
| %else |
| movu m0, [botq+strideq*0-8] |
| movu m1, [botq+strideq*1-8] |
| punpckhbw m2, m0, m7 |
| punpcklbw m0, m7 |
| punpckhbw m3, m1, m7 |
| punpcklbw m1, m7 |
| mova [px+32*(%2+0)-16], m0 |
| mova [px+32*(%2+0)+ 0], m2 |
| mova [px+32*(%2+1)-16], m1 |
| mova [px+32*(%2+1)+ 0], m3 |
| movd [px+32*(%2-1)+16], m6 ; restore the OOB marker clobbered by the first mova above |
| %endif |
| movd [px+32*(%2+0)+%1*2], m6 |
| movd [px+32*(%2+1)+%1*2], m6 |
| jmp .bottom_done |
| .bottom_no_left: |
| test edgeb, 2 ; have_right |
| jz .bottom_no_left_right |
| %if %1 == 4 |
| PMOVZXBW m0, [botq+strideq*0] |
| PMOVZXBW m1, [botq+strideq*1] |
| %else |
| movu m0, [botq+strideq*0] |
| movu m1, [botq+strideq*1] |
| punpckhbw m2, m0, m7 |
| punpcklbw m0, m7 |
| punpckhbw m3, m1, m7 |
| punpcklbw m1, m7 |
| mova [px+32*(%2+0)+16], m2 |
| mova [px+32*(%2+1)+16], m3 |
| %endif |
| mova [px+32*(%2+0)+ 0], m0 |
| mova [px+32*(%2+1)+ 0], m1 |
| movd [px+32*(%2+0)- 4], m6 |
| movd [px+32*(%2+1)- 4], m6 |
| jmp .bottom_done |
| .bottom_no_left_right: |
| PMOVZXBW m0, [botq+strideq*0], %1 == 4 |
| PMOVZXBW m1, [botq+strideq*1], %1 == 4 |
| mova [px+32*(%2+0)+ 0], m0 |
| mova [px+32*(%2+1)+ 0], m1 |
| movd [px+32*(%2+0)+%1*2], m6 |
| movd [px+32*(%2+1)+%1*2], m6 |
| movd [px+32*(%2+0)- 4], m6 |
| movd [px+32*(%2+1)- 4], m6 |
| jmp .bottom_done |
| .no_bottom: |
| movu [px+32*(%2+0)- 4], m6 |
| movu [px+32*(%2+1)- 4], m6 |
| %if %1 == 8 |
| movq [px+32*(%2+0)+12], m6 |
| movq [px+32*(%2+1)+12], m6 |
| %endif |
| .bottom_done: |
| |
| ; actual filter |
| %if ARCH_X86_64 |
| DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec |
| mova m13, [shufb_lohi] |
| %if cpuflag(ssse3) |
| mova m15, [pw_2048] |
| %else |
| mova m15, [pw_8] |
| %endif |
| mova m14, m6 |
| %else |
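| ; x86-32 only has xmm0-7, so the high registers used above are aliased to |
| ; low registers or memory here |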
| DEFINE_ARGS dst, pridmp, sec, damping, pri, tap |
| %xdefine m8 m1 |
| %xdefine m9 m2 |
| %xdefine m10 m0 |
| %xdefine m13 [base+shufb_lohi] |
| %xdefine m14 OUT_OF_BOUNDS_MEM |
| %if cpuflag(ssse3) |
| %xdefine m15 [base+pw_2048] |
| %else |
| %xdefine m15 [base+pw_8] |
| %endif |
| %endif |
| movifnidn prid, r5m |
| movifnidn secd, r6m |
| mov dampingd, r8m |
| movif32 [esp+0x3C], r1d |
| test prid, prid |
| jz .sec_only |
| movd m1, r5m |
| bsr pridmpd, prid |
| test secd, secd |
| jz .pri_only |
| movd m10, r6m |
| tzcnt secd, secd |
| and prid, 1 |
| sub pridmpd, dampingd |
| sub secd, dampingd |
| xor dampingd, dampingd |
| add prid, prid |
| neg pridmpd |
| cmovs pridmpd, dampingd |
| neg secd |
| PSHUFB_0 m1, m7 |
| PSHUFB_0 m10, m7 |
| %if ARCH_X86_64 |
| DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec |
| lea tapq, [tap_table] |
| MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask |
| MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask |
| mov [rsp+0x00], pridmpq ; pri_shift |
| mov [rsp+0x10], secq ; sec_shift |
| DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off |
| %else |
| MOVDDUP m2, [tapq+pridmpq*8] |
| MOVDDUP m3, [tapq+secq*8] |
| mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw |
| mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP |
| mov [esp+0x00], pridmpd |
| mov [esp+0x30], secd |
| DEFINE_ARGS dst, stride, dir, stk, pri, tap, h |
| %define offq dstq |
| %define kd strided |
| %define kq strideq |
| mova [esp+0x10], m2 |
| mova [esp+0x40], m3 |
| mova [esp+0x20], m1 |
| mova [esp+0x50], m10 |
| %endif |
| mov dird, r7m |
| lea stkq, [px] |
| lea priq, [tapq+8*8+priq*8] ; pri_taps |
| mov hd, %1*%2/8 |
| lea dirq, [tapq+dirq*2] |
| .v_loop: |
| movif32 [esp+0x38], dstd |
| mov kd, 1 |
| %if %1 == 4 |
| movq m4, [stkq+32*0] |
| movhps m4, [stkq+32*1] |
| %else |
| mova m4, [stkq+32*0] ; px |
| %endif |
| pxor m0, m0 ; sum |
| mova m7, m4 ; max |
| mova m8, m4 ; min |
| .k_loop: |
| MOVDDUP m2, [priq+kq*8] |
| %if ARCH_X86_64 |
| ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1 |
| MOVDDUP m2, [tapq+12*8+kq*8] |
| ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1 |
| ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1 |
| %else |
| ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1 |
| MOVDDUP m2, [tapq+12*8+kq*8] |
| ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 |
| MOVDDUP m2, [tapq+12*8+kq*8] |
| ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 |
| %endif |
| dec kd |
| jge .k_loop |
| movif32 dstq, [esp+0x38] |
| movif32 strideq, [esp+0x3C] |
| CDEF_FILTER_END %1, 1 |
| dec hd |
| jg .v_loop |
| RET |
| |
| .pri_only: |
| %if ARCH_X86_64 |
| DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap |
| lea tapq, [tap_table] |
| %else |
| DEFINE_ARGS dst, pridmp, zero, damping, pri, tap |
| %endif |
| and prid, 1 |
| xor zerod, zerod |
| sub dampingd, pridmpd |
| cmovs dampingd, zerod |
| add prid, prid |
| PSHUFB_0 m1, m7 |
| MOVDDUP m7, [tapq+dampingq*8] |
| mov [rsp+0x00], dampingq |
| %if ARCH_X86_64 |
| DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off |
| %else |
| mov [rsp+0x04], zerod |
| DEFINE_ARGS dst, stride, dir, stk, pri, tap, h |
| %endif |
| mov dird, r7m |
| lea stkq, [px] |
| lea priq, [tapq+8*8+priq*8] |
| mov hd, %1*%2/8 |
| lea dirq, [tapq+dirq*2] |
| .pri_v_loop: |
| movif32 [esp+0x38], dstd |
| mov kd, 1 |
| %if %1 == 4 |
| movq m4, [stkq+32*0] |
| movhps m4, [stkq+32*1] |
| %else |
| mova m4, [stkq+32*0] |
| %endif |
| pxor m0, m0 |
| .pri_k_loop: |
| MOVDDUP m2, [priq+kq*8] |
| ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0 |
| dec kd |
| jge .pri_k_loop |
| movif32 dstq, [esp+0x38] |
| movif32 strideq, [esp+0x3C] |
| CDEF_FILTER_END %1, 0 |
| dec hd |
| jg .pri_v_loop |
| RET |
| |
| .sec_only: |
| %if ARCH_X86_64 |
| DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec |
| %else |
| DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero |
| %endif |
| movd m1, r6m |
| tzcnt secd, secd |
| mov dird, r7m |
| xor zerod, zerod |
| sub dampingd, secd |
| cmovs dampingd, zerod |
| PSHUFB_0 m1, m7 |
| %if ARCH_X86_64 |
| lea tapq, [tap_table] |
| %else |
| mov [rsp+0x04], zerod |
| %endif |
| mov [rsp+0x00], dampingq |
| MOVDDUP m7, [tapq+dampingq*8] |
| lea dirq, [tapq+dirq*2] |
| %if ARCH_X86_64 |
| DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k |
| %else |
| DEFINE_ARGS dst, stride, off, stk, dir, tap, h |
| %endif |
| lea stkq, [px] |
| mov hd, %1*%2/8 |
| .sec_v_loop: |
| mov kd, 1 |
| %if %1 == 4 |
| movq m4, [stkq+32*0] |
| movhps m4, [stkq+32*1] |
| %else |
| mova m4, [stkq+32*0] |
| %endif |
| pxor m0, m0 |
| .sec_k_loop: |
| MOVDDUP m2, [tapq+12*8+kq*8] |
| ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0 |
| %if ARCH_X86_32 |
| MOVDDUP m2, [tapq+12*8+kq*8] |
| %endif |
| ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0 |
| dec kd |
| jge .sec_k_loop |
| movif32 strideq, [esp+0x3C] |
| CDEF_FILTER_END %1, 0 |
| dec hd |
| jg .sec_v_loop |
| RET |
| %endmacro |
| |
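| ; packed 32-bit multiply; without sse4, pmulld is emulated with a |
| ; pmullw/pmulhuw pair, which needs the word-duplicated div_table layout to |
| ; handle the high half of each dword |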
| %macro MULLD 2 |
| %if cpuflag(sse4) |
| pmulld %1, %2 |
| %else |
| %if ARCH_X86_32 |
| %define m15 m1 |
| %endif |
| pmulhuw m15, %1, %2 |
| pmullw %1, %2 |
| pslld m15, 16 |
| paddd %1, m15 |
| %endif |
| %endmacro |
| |
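| ; cdef_dir_8bpc: sum the 8x8 block along the 8 candidate directions, square |
| ; and scale the partial sums with div_table to get a cost per direction, |
| ; return the direction with the largest cost and write |
| ; (best_cost - cost[dir ^ 4]) >> 10 to *var |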
| %macro CDEF_DIR 0 |
| %if ARCH_X86_64 |
| cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var |
| lea r6, [strideq*3] |
| movq m1, [srcq+strideq*0] |
| movhps m1, [srcq+strideq*1] |
| movq m3, [srcq+strideq*2] |
| movhps m3, [srcq+r6 ] |
| lea srcq, [srcq+strideq*4] |
| movq m5, [srcq+strideq*0] |
| movhps m5, [srcq+strideq*1] |
| movq m7, [srcq+strideq*2] |
| movhps m7, [srcq+r6 ] |
| |
| pxor m8, m8 |
| psadbw m9, m1, m8 |
| psadbw m2, m3, m8 |
| psadbw m4, m5, m8 |
| psadbw m6, m7, m8 |
| packssdw m9, m2 |
| packssdw m4, m6 |
| packssdw m9, m4 |
| |
| punpcklbw m0, m1, m8 |
| punpckhbw m1, m8 |
| punpcklbw m2, m3, m8 |
| punpckhbw m3, m8 |
| punpcklbw m4, m5, m8 |
| punpckhbw m5, m8 |
| punpcklbw m6, m7, m8 |
| punpckhbw m7, m8 |
| cglobal_label .main |
| mova m8, [pw_128] |
| psubw m0, m8 |
| psubw m1, m8 |
| psubw m2, m8 |
| psubw m3, m8 |
| psubw m4, m8 |
| psubw m5, m8 |
| psubw m6, m8 |
| psubw m7, m8 |
| psllw m8, 3 |
| psubw m9, m8 ; partial_sum_hv[0] |
| |
| paddw m8, m0, m1 |
| paddw m10, m2, m3 |
| paddw m8, m4 |
| paddw m10, m5 |
| paddw m8, m6 |
| paddw m10, m7 |
| paddw m8, m10 ; partial_sum_hv[1] |
| |
| pmaddwd m8, m8 |
| pmaddwd m9, m9 |
| phaddd m9, m8 |
| SWAP m8, m9 |
| MULLD m8, [div_table%+SUFFIX+48] |
| |
| pslldq m9, m1, 2 |
| psrldq m10, m1, 14 |
| pslldq m11, m2, 4 |
| psrldq m12, m2, 12 |
| pslldq m13, m3, 6 |
| psrldq m14, m3, 10 |
| paddw m9, m0 |
| paddw m10, m12 |
| paddw m11, m13 |
| paddw m10, m14 ; partial_sum_diag[0] top/right half |
| paddw m9, m11 ; partial_sum_diag[0] top/left half |
| pslldq m11, m4, 8 |
| psrldq m12, m4, 8 |
| pslldq m13, m5, 10 |
| psrldq m14, m5, 6 |
| paddw m9, m11 |
| paddw m10, m12 |
| paddw m9, m13 |
| paddw m10, m14 |
| pslldq m11, m6, 12 |
| psrldq m12, m6, 4 |
| pslldq m13, m7, 14 |
| psrldq m14, m7, 2 |
| paddw m9, m11 |
| paddw m10, m12 |
| paddw m9, m13 ; partial_sum_diag[0][0-7] |
| paddw m10, m14 ; partial_sum_diag[0][8-14,zero] |
| pshufb m10, [shufw_6543210x] |
| punpckhwd m11, m9, m10 |
| punpcklwd m9, m10 |
| pmaddwd m11, m11 |
| pmaddwd m9, m9 |
| MULLD m11, [div_table%+SUFFIX+16] |
| MULLD m9, [div_table%+SUFFIX+0] |
| paddd m9, m11 ; cost[0a-d] |
| |
| pslldq m10, m0, 14 |
| psrldq m11, m0, 2 |
| pslldq m12, m1, 12 |
| psrldq m13, m1, 4 |
| pslldq m14, m2, 10 |
| psrldq m15, m2, 6 |
| paddw m10, m12 |
| paddw m11, m13 |
| paddw m10, m14 |
| paddw m11, m15 |
| pslldq m12, m3, 8 |
| psrldq m13, m3, 8 |
| pslldq m14, m4, 6 |
| psrldq m15, m4, 10 |
| paddw m10, m12 |
| paddw m11, m13 |
| paddw m10, m14 |
| paddw m11, m15 |
| pslldq m12, m5, 4 |
| psrldq m13, m5, 12 |
| pslldq m14, m6, 2 |
| psrldq m15, m6, 14 |
| paddw m10, m12 |
| paddw m11, m13 |
| paddw m10, m14 |
| paddw m11, m15 ; partial_sum_diag[1][8-14,zero] |
| paddw m10, m7 ; partial_sum_diag[1][0-7] |
| pshufb m11, [shufw_6543210x] |
| punpckhwd m12, m10, m11 |
| punpcklwd m10, m11 |
| pmaddwd m12, m12 |
| pmaddwd m10, m10 |
| MULLD m12, [div_table%+SUFFIX+16] |
| MULLD m10, [div_table%+SUFFIX+0] |
| paddd m10, m12 ; cost[4a-d] |
| phaddd m9, m10 ; cost[0a/b,4a/b] |
| |
| paddw m10, m0, m1 |
| paddw m11, m2, m3 |
| paddw m12, m4, m5 |
| paddw m13, m6, m7 |
| phaddw m0, m4 |
| phaddw m1, m5 |
| phaddw m2, m6 |
| phaddw m3, m7 |
| |
| ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) |
| pslldq m4, m11, 2 |
| psrldq m5, m11, 14 |
| pslldq m6, m12, 4 |
| psrldq m7, m12, 12 |
| pslldq m14, m13, 6 |
| psrldq m15, m13, 10 |
| paddw m4, m10 |
| paddw m5, m7 |
| paddw m4, m6 |
| paddw m5, m15 ; partial_sum_alt[3] right |
| paddw m4, m14 ; partial_sum_alt[3] left |
| pshuflw m6, m5, q3012 |
| punpckhwd m5, m4 |
| punpcklwd m4, m6 |
| pmaddwd m5, m5 |
| pmaddwd m4, m4 |
| MULLD m5, [div_table%+SUFFIX+48] |
| MULLD m4, [div_table%+SUFFIX+32] |
| paddd m4, m5 ; cost[7a-d] |
| |
| pslldq m5, m10, 6 |
| psrldq m6, m10, 10 |
| pslldq m7, m11, 4 |
| psrldq m10, m11, 12 |
| pslldq m11, m12, 2 |
| psrldq m12, 14 |
| paddw m5, m7 |
| paddw m6, m10 |
| paddw m5, m11 |
| paddw m6, m12 |
| paddw m5, m13 |
| pshuflw m7, m6, q3012 |
| punpckhwd m6, m5 |
| punpcklwd m5, m7 |
| pmaddwd m6, m6 |
| pmaddwd m5, m5 |
| MULLD m6, [div_table%+SUFFIX+48] |
| MULLD m5, [div_table%+SUFFIX+32] |
| paddd m5, m6 ; cost[5a-d] |
| |
| pslldq m6, m1, 2 |
| psrldq m7, m1, 14 |
| pslldq m10, m2, 4 |
| psrldq m11, m2, 12 |
| pslldq m12, m3, 6 |
| psrldq m13, m3, 10 |
| paddw m6, m0 |
| paddw m7, m11 |
| paddw m6, m10 |
| paddw m7, m13 ; partial_sum_alt[3] right |
| paddw m6, m12 ; partial_sum_alt[3] left |
| pshuflw m10, m7, q3012 |
| punpckhwd m7, m6 |
| punpcklwd m6, m10 |
| pmaddwd m7, m7 |
| pmaddwd m6, m6 |
| MULLD m7, [div_table%+SUFFIX+48] |
| MULLD m6, [div_table%+SUFFIX+32] |
| paddd m6, m7 ; cost[1a-d] |
| |
| pshufd m0, m0, q1032 |
| pshufd m1, m1, q1032 |
| pshufd m2, m2, q1032 |
| pshufd m3, m3, q1032 |
| |
| pslldq m10, m0, 6 |
| psrldq m11, m0, 10 |
| pslldq m12, m1, 4 |
| psrldq m13, m1, 12 |
| pslldq m14, m2, 2 |
| psrldq m2, 14 |
| paddw m10, m12 |
| paddw m11, m13 |
| paddw m10, m14 |
| paddw m11, m2 |
| paddw m10, m3 |
| pshuflw m12, m11, q3012 |
| punpckhwd m11, m10 |
| punpcklwd m10, m12 |
| pmaddwd m11, m11 |
| pmaddwd m10, m10 |
| MULLD m11, [div_table%+SUFFIX+48] |
| MULLD m10, [div_table%+SUFFIX+32] |
| paddd m10, m11 ; cost[3a-d] |
| |
| phaddd m9, m8 ; cost[0,4,2,6] |
| phaddd m6, m10 |
| phaddd m5, m4 |
| phaddd m6, m5 ; cost[1,3,5,7] |
| pshufd m4, m9, q3120 |
| |
| ; now find the best cost |
| %if cpuflag(sse4) |
| pmaxsd m9, m6 |
| pshufd m0, m9, q1032 |
| pmaxsd m0, m9 |
| pshufd m1, m0, q2301 |
| pmaxsd m0, m1 ; best cost |
| %else |
| pcmpgtd m0, m9, m6 |
| pand m9, m0 |
| pandn m0, m6 |
| por m9, m0 |
| pshufd m1, m9, q1032 |
| pcmpgtd m0, m9, m1 |
| pand m9, m0 |
| pandn m0, m1 |
| por m9, m0 |
| pshufd m1, m9, q2301 |
| pcmpgtd m0, m9, m1 |
| pand m9, m0 |
| pandn m0, m1 |
| por m0, m9 |
| %endif |
| |
| ; get direction and variance |
| punpckhdq m1, m4, m6 |
| punpckldq m4, m6 |
| psubd m2, m0, m1 |
| psubd m3, m0, m4 |
| %if WIN64 |
| WIN64_RESTORE_XMM |
| %define tmp rsp+stack_offset+8 |
| %else |
| %define tmp rsp-40 |
| %endif |
| mova [tmp+0x00], m2 ; emulate ymm in stack |
| mova [tmp+0x10], m3 |
| pcmpeqd m1, m0 ; compute best cost mask |
| pcmpeqd m4, m0 |
| packssdw m4, m1 |
| pmovmskb eax, m4 ; get byte-idx from mask |
| tzcnt eax, eax |
| mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm |
| shr eax, 1 ; get direction by converting byte-idx to word-idx |
| shr r1d, 10 |
| mov [varq], r1d |
| %else |
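| ; x86-32 version: the first four rows are spilled to the stack, the last |
| ; four stay in m4-m7 |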
| cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3 |
| %define base r2-shufw_6543210x |
| LEA r2, shufw_6543210x |
| pxor m0, m0 |
| lea stride3q, [strideq*3] |
| movq m5, [srcq+strideq*0] |
| movhps m5, [srcq+strideq*1] |
| movq m7, [srcq+strideq*2] |
| movhps m7, [srcq+stride3q] |
| mova m1, [base+pw_128] |
| psadbw m2, m5, m0 |
| psadbw m3, m7, m0 |
| packssdw m2, m3 |
| punpcklbw m4, m5, m0 |
| punpckhbw m5, m0 |
| punpcklbw m6, m7, m0 |
| punpckhbw m7, m0 |
| psubw m4, m1 |
| psubw m5, m1 |
| psubw m6, m1 |
| psubw m7, m1 |
| |
| mova [esp+0x00], m4 |
| mova [esp+0x10], m5 |
| mova [esp+0x20], m6 |
| mova [esp+0x50], m7 |
| |
| lea srcq, [srcq+strideq*4] |
| movq m5, [srcq+strideq*0] |
| movhps m5, [srcq+strideq*1] |
| movq m7, [srcq+strideq*2] |
| movhps m7, [srcq+stride3q] |
| psadbw m3, m5, m0 |
| psadbw m0, m7 |
| packssdw m3, m0 |
| pxor m0, m0 |
| punpcklbw m4, m5, m0 |
| punpckhbw m5, m0 |
| punpcklbw m6, m7, m0 |
| punpckhbw m7, m0 |
| cglobal_label .main |
| psubw m4, m1 |
| psubw m5, m1 |
| psubw m6, m1 |
| psubw m7, m1 |
| packssdw m2, m3 |
| psllw m1, 3 |
| psubw m2, m1 ; partial_sum_hv[0] |
| pmaddwd m2, m2 |
| |
| mova m3, [esp+0x50] |
| mova m0, [esp+0x00] |
| paddw m0, [esp+0x10] |
| paddw m1, m3, [esp+0x20] |
| paddw m0, m4 |
| paddw m1, m5 |
| paddw m0, m6 |
| paddw m1, m7 |
| paddw m0, m1 ; partial_sum_hv[1] |
| pmaddwd m0, m0 |
| |
| phaddd m2, m0 |
| MULLD m2, [base+div_table%+SUFFIX+48] |
| mova [esp+0x30], m2 |
| |
| mova m1, [esp+0x10] |
| pslldq m0, m1, 2 |
| psrldq m1, 14 |
| paddw m0, [esp+0x00] |
| pslldq m2, m3, 6 |
| psrldq m3, 10 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova m3, [esp+0x20] |
| pslldq m2, m3, 4 |
| psrldq m3, 12 |
| paddw m0, m2 ; partial_sum_diag[0] top/left half |
| paddw m1, m3 ; partial_sum_diag[0] top/right half |
| pslldq m2, m4, 8 |
| psrldq m3, m4, 8 |
| paddw m0, m2 |
| paddw m1, m3 |
| pslldq m2, m5, 10 |
| psrldq m3, m5, 6 |
| paddw m0, m2 |
| paddw m1, m3 |
| pslldq m2, m6, 12 |
| psrldq m3, m6, 4 |
| paddw m0, m2 |
| paddw m1, m3 |
| pslldq m2, m7, 14 |
| psrldq m3, m7, 2 |
| paddw m0, m2 ; partial_sum_diag[0][0-7] |
| paddw m1, m3 ; partial_sum_diag[0][8-14,zero] |
| mova m3, [esp+0x50] |
| pshufb m1, [base+shufw_6543210x] |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| pmaddwd m2, m2 |
| pmaddwd m0, m0 |
| MULLD m2, [base+div_table%+SUFFIX+16] |
| MULLD m0, [base+div_table%+SUFFIX+ 0] |
| paddd m0, m2 ; cost[0a-d] |
| mova [esp+0x40], m0 |
| |
| mova m1, [esp+0x00] |
| pslldq m0, m1, 14 |
| psrldq m1, 2 |
| paddw m0, m7 |
| pslldq m2, m3, 8 |
| psrldq m3, 8 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova m3, [esp+0x20] |
| pslldq m2, m3, 10 |
| psrldq m3, 6 |
| paddw m0, m2 |
| paddw m1, m3 |
| mova m3, [esp+0x10] |
| pslldq m2, m3, 12 |
| psrldq m3, 4 |
| paddw m0, m2 |
| paddw m1, m3 |
| pslldq m2, m4, 6 |
| psrldq m3, m4, 10 |
| paddw m0, m2 |
| paddw m1, m3 |
| pslldq m2, m5, 4 |
| psrldq m3, m5, 12 |
| paddw m0, m2 |
| paddw m1, m3 |
| pslldq m2, m6, 2 |
| psrldq m3, m6, 14 |
| paddw m0, m2 ; partial_sum_diag[1][0-7] |
| paddw m1, m3 ; partial_sum_diag[1][8-14,zero] |
| mova m3, [esp+0x50] |
| pshufb m1, [base+shufw_6543210x] |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| pmaddwd m2, m2 |
| pmaddwd m0, m0 |
| MULLD m2, [base+div_table%+SUFFIX+16] |
| MULLD m0, [base+div_table%+SUFFIX+ 0] |
| paddd m0, m2 ; cost[4a-d] |
| phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] |
| phaddd m1, [esp+0x30] ; cost[0,4,2,6] |
| mova [esp+0x30], m1 |
| |
| phaddw m0, [esp+0x00], m4 |
| phaddw m1, [esp+0x10], m5 |
| paddw m4, m5 |
| mova m2, [esp+0x20] |
| paddw m5, m2, m3 |
| phaddw m2, m6 |
| paddw m6, m7 |
| phaddw m3, m7 |
| mova m7, [esp+0x00] |
| paddw m7, [esp+0x10] |
| mova [esp+0x00], m0 |
| mova [esp+0x10], m1 |
| mova [esp+0x20], m2 |
| |
| pslldq m1, m4, 4 |
| pslldq m2, m6, 6 |
| pslldq m0, m5, 2 |
| paddw m1, m2 |
| paddw m0, m7 |
| psrldq m2, m5, 14 |
| paddw m0, m1 ; partial_sum_alt[3] left |
| psrldq m1, m4, 12 |
| paddw m1, m2 |
| psrldq m2, m6, 10 |
| paddw m1, m2 ; partial_sum_alt[3] right |
| pshuflw m1, m1, q3012 |
| punpckhwd m2, m0, m1 |
| punpcklwd m0, m1 |
| pmaddwd m2, m2 |
| pmaddwd m0, m0 |
| MULLD m2, [base+div_table%+SUFFIX+48] |
| MULLD m0, [base+div_table%+SUFFIX+32] |
| paddd m0, m2 ; cost[7a-d] |
| mova [esp+0x40], m0 |
| |
| pslldq m0, m7, 6 |
| psrldq m7, 10 |
| pslldq m1, m5, 4 |
| psrldq m5, 12 |
| pslldq m2, m4, 2 |
| psrldq m4, 14 |
| paddw m0, m6 |
| paddw m7, m5 |
| paddw m0, m1 |
| paddw m7, m4 |
| paddw m0, m2 |
| pshuflw m2, m7, q3012 |
| punpckhwd m7, m0 |
| punpcklwd m0, m2 |
| pmaddwd m7, m7 |
| pmaddwd m0, m0 |
| MULLD m7, [base+div_table%+SUFFIX+48] |
| MULLD m0, [base+div_table%+SUFFIX+32] |
| paddd m0, m7 ; cost[5a-d] |
| mova [esp+0x50], m0 |
| |
| mova m7, [esp+0x10] |
| mova m2, [esp+0x20] |
| pslldq m0, m7, 2 |
| psrldq m7, 14 |
| pslldq m4, m2, 4 |
| psrldq m2, 12 |
| pslldq m5, m3, 6 |
| psrldq m6, m3, 10 |
| paddw m0, [esp+0x00] |
| paddw m7, m2 |
| paddw m4, m5 |
| paddw m7, m6 ; partial_sum_alt[3] right |
| paddw m0, m4 ; partial_sum_alt[3] left |
| pshuflw m2, m7, q3012 |
| punpckhwd m7, m0 |
| punpcklwd m0, m2 |
| pmaddwd m7, m7 |
| pmaddwd m0, m0 |
| MULLD m7, [base+div_table%+SUFFIX+48] |
| MULLD m0, [base+div_table%+SUFFIX+32] |
| paddd m0, m7 ; cost[1a-d] |
| SWAP m0, m4 |
| |
| pshufd m0, [esp+0x00], q1032 |
| pshufd m1, [esp+0x10], q1032 |
| pshufd m2, [esp+0x20], q1032 |
| pshufd m3, m3, q1032 |
| mova [esp+0x00], m4 |
| |
| pslldq m4, m0, 6 |
| psrldq m0, 10 |
| pslldq m5, m1, 4 |
| psrldq m1, 12 |
| pslldq m6, m2, 2 |
| psrldq m2, 14 |
| paddw m4, m3 |
| paddw m0, m1 |
| paddw m5, m6 |
| paddw m0, m2 |
| paddw m4, m5 |
| pshuflw m2, m0, q3012 |
| punpckhwd m0, m4 |
| punpcklwd m4, m2 |
| pmaddwd m0, m0 |
| pmaddwd m4, m4 |
| MULLD m0, [base+div_table%+SUFFIX+48] |
| MULLD m4, [base+div_table%+SUFFIX+32] |
| paddd m4, m0 ; cost[3a-d] |
| |
| mova m1, [esp+0x00] |
| mova m2, [esp+0x50] |
| mova m0, [esp+0x30] ; cost[0,4,2,6] |
| phaddd m1, m4 |
| phaddd m2, [esp+0x40] ; cost[1,3,5,7] |
| phaddd m1, m2 |
| pshufd m2, m0, q3120 |
| |
| ; now find the best cost |
| %if cpuflag(sse4) |
| pmaxsd m0, m1 |
| pshufd m3, m0, q1032 |
| pmaxsd m3, m0 |
| pshufd m0, m3, q2301 |
| pmaxsd m0, m3 |
| %else |
| pcmpgtd m3, m0, m1 |
| pand m0, m3 |
| pandn m3, m1 |
| por m0, m3 |
| pshufd m4, m0, q1032 |
| pcmpgtd m3, m0, m4 |
| pand m0, m3 |
| pandn m3, m4 |
| por m0, m3 |
| pshufd m4, m0, q2301 |
| pcmpgtd m3, m0, m4 |
| pand m0, m3 |
| pandn m3, m4 |
| por m0, m3 |
| %endif |
| |
| ; get direction and variance |
| mov vard, varm |
| punpckhdq m3, m2, m1 |
| punpckldq m2, m1 |
| psubd m1, m0, m3 |
| psubd m4, m0, m2 |
| mova [esp+0x00], m1 ; emulate ymm in stack |
| mova [esp+0x10], m4 |
| pcmpeqd m3, m0 ; compute best cost mask |
| pcmpeqd m2, m0 |
| packssdw m2, m3 |
| pmovmskb eax, m2 ; get byte-idx from mask |
| tzcnt eax, eax |
| mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm |
| shr eax, 1 ; get direction by converting byte-idx to word-idx |
| shr r1d, 10 |
| mov [vard], r1d |
| %endif |
| |
| RET |
| %endmacro |
| |
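| ; cdef_dir uses pshufb and phaddd unconditionally, so only ssse3 and sse4 |
| ; variants of it are built |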
| INIT_XMM sse4 |
| CDEF_FILTER 8, 8 |
| CDEF_FILTER 4, 8 |
| CDEF_FILTER 4, 4 |
| CDEF_DIR |
| |
| INIT_XMM ssse3 |
| CDEF_FILTER 8, 8 |
| CDEF_FILTER 4, 8 |
| CDEF_FILTER 4, 4 |
| CDEF_DIR |
| |
| INIT_XMM sse2 |
| CDEF_FILTER 8, 8 |
| CDEF_FILTER 4, 8 |
| CDEF_FILTER 4, 4 |