| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| %include "vpx_ports/x86_abi_support.asm" |
| |
; LF_ABS: per-byte absolute difference |%1 - %2| for 16 unsigned bytes.
; Uses the classic unsigned-saturating trick: one of (v2 -us v1) and
; (v1 -us v2) saturates to zero, so OR-ing them yields the true difference.
; Clobbers scratch1.
%macro LF_ABS 2
; %1 value not preserved
; %2 value preserved
; output in %1
movdqa scratch1, %2 ; v2

psubusb scratch1, %1 ; v2 - v1 (saturates to 0 where v1 >= v2)
psubusb %1, %2 ; v1 - v2 (saturates to 0 where v2 >= v1)
por %1, scratch1 ; abs(v2 - v1)
%endmacro
| |
; LF_FILTER_HEV_MASK: build the loop-filter mask and the high-edge-variance
; (hev) mask for one edge.
; %1..%8: p3 p2 p1 p0 q0 q1 q2 q3 (all clobbered; callers reload afterwards)
; %9 (optional): abs(p1 - p0) already computed by the previous invocation
;     (the previous edge's abs(q2 - q3) is this edge's abs(p1 - p0));
;     when present the p-side variance is reused instead of recomputed.
; Outputs:
;   %1 = mask: 0xff in byte lanes where every neighbour difference is within
;        [limit] AND 2*|p0-q0| + |p1-q1|/2 is within [blimit]; else 0x00.
;   %5 = ~hev: 0x00 in byte lanes where max(|p1-p0|, |q1-q0|) > [thresh].
; Clobbers scratch1 (via LF_ABS) and scratch2.
%macro LF_FILTER_HEV_MASK 8-9

LF_ABS %1, %2 ; abs(p3 - p2)
LF_ABS %2, %3 ; abs(p2 - p1)
pmaxub %1, %2 ; accumulate mask
%if %0 == 8
movdqa scratch2, %3 ; save p1
LF_ABS scratch2, %4 ; abs(p1 - p0)
%endif
LF_ABS %4, %5 ; abs(p0 - q0)
LF_ABS %5, %6 ; abs(q0 - q1)
%if %0 == 8
pmaxub %5, scratch2 ; accumulate hev
%else
pmaxub %5, %9 ; accumulate hev, reusing abs(p1 - p0) from the caller
%endif
pmaxub %1, %5 ; accumulate mask

LF_ABS %3, %6 ; abs(p1 - q1)
LF_ABS %6, %7 ; abs(q1 - q2)
pmaxub %1, %6 ; accumulate mask
LF_ABS %7, %8 ; abs(q2 - q3)
pmaxub %1, %7 ; accumulate mask

paddusb %4, %4 ; 2 * abs(p0 - q0)
pand %3, [GLOBAL(tfe)] ; clear per-byte lsb so the word shift below stays per-byte
psrlw %3, 1 ; abs(p1 - q1) / 2 (no psrlb instruction exists)
paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

psubusb %1, [limit] ; nonzero where any difference exceeds limit
psubusb %4, [blimit] ; nonzero where edge activity exceeds blimit
por %1, %4
pcmpeqb %1, zero ; mask (0xff = filter this lane)

psubusb %5, [thresh]
pcmpeqb %5, zero ; ~hev (0xff = NOT high edge variance)
%endmacro
| |
; LF_FILTER: apply the VP8 normal (inner-edge, 4-tap) loop filter in place.
; %1-%4: p1 p0 q0 q1, replaced by the filtered pixel values.
; %5: filter mask from LF_FILTER_HEV_MASK (destroyed).
; %6: ~hev mask (read only).
; Pixels are biased by 0x80 (t80) to work in signed-byte space; signed
; arithmetic right shifts of bytes are emulated with psrlw plus mask/sign
; fixups, since SSE2 has no per-byte shift. Clobbers scratch1, scratch2.
%macro LF_FILTER 6
; %1-%4: p1-q1
; %5: mask
; %6: hev

movdqa scratch2, %6 ; save hev

pxor %1, [GLOBAL(t80)] ; ps1 (unsigned -> signed bias)
pxor %4, [GLOBAL(t80)] ; qs1
movdqa scratch1, %1
psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
pandn scratch2, scratch1 ; vp8_filter &= hev (~hev has 0xff where NOT hev)

pxor %2, [GLOBAL(t80)] ; ps0
pxor %3, [GLOBAL(t80)] ; qs0
movdqa scratch1, %3
psubsb scratch1, %2 ; qs0 - ps0
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0), i.e. 3 * (qs0 - ps0)
pand %5, scratch2 ; &= mask

movdqa scratch2, %5
paddsb %5, [GLOBAL(t4)] ; Filter1 = clamp(vp8_filter + 4)
paddsb scratch2, [GLOBAL(t3)] ; Filter2 = clamp(vp8_filter + 3)

; Filter1 >> 3 (signed): psrlw by 3, keep low 5 bits of each byte (t1f),
; then OR back the sign bits (te0) where the byte was negative.
movdqa scratch1, zero
pcmpgtb scratch1, %5 ; 0xff where Filter1 < 0
psrlw %5, 3
pand scratch1, [GLOBAL(te0)]
pand %5, [GLOBAL(t1f)]
por %5, scratch1

psubsb %3, %5 ; qs0 - Filter1
pxor %3, [GLOBAL(t80)] ; back to unsigned

; Filter2 >> 3 (same signed-shift emulation)
movdqa scratch1, zero
pcmpgtb scratch1, scratch2
psrlw scratch2, 3
pand scratch1, [GLOBAL(te0)]
pand scratch2, [GLOBAL(t1f)]
por scratch2, scratch1

paddsb %2, scratch2 ; ps0 + Filter2
pxor %2, [GLOBAL(t80)] ; back to unsigned

; outer tap adjustments: vp8_filter = (Filter1 + 1) >> 1, only where ~hev
paddsb %5, [GLOBAL(t1)]
movdqa scratch1, zero
pcmpgtb scratch1, %5 ; 0xff where negative
psrlw %5, 1
pand scratch1, [GLOBAL(t80)] ; sign bit for the emulated >> 1
pand %5, [GLOBAL(t7f)]
por %5, scratch1
pand %5, %6 ; vp8_filter &= ~hev

psubsb %4, %5 ; qs1 - vp8_filter
pxor %4, [GLOBAL(t80)] ; back to unsigned

paddsb %1, %5 ; ps1 + vp8_filter
pxor %1, [GLOBAL(t80)] ; back to unsigned
%endmacro
| |
;void vp8_loop_filter_bh_y_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh
;)
; Filters the three horizontal block-internal edges of a 16x16 luma
; macroblock in place (the edges between rows 3|4, 7|8 and 11|12 relative
; to src_ptr). Each edge reads p3..q3 (8 rows) and writes p1..q1 (4 rows).
; abs(p1 - p0) of the next edge equals abs(q2 - q3) of the current one, so
; that value is kept in a register and passed as the optional 9th macro arg.
global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
sym(vp8_loop_filter_bh_y_sse2):

%if LIBVPX_YASM_WIN64
%define src rcx ; src_ptr
%define stride rdx ; src_pixel_step
%define blimit r8
%define limit r9
%define thresh r10

%define spp rax
%define stride3 r11
%define stride5 r12
%define stride7 r13

push rbp
mov rbp, rsp
SAVE_XMM 11 ; Win64 ABI: xmm6-xmm15 are callee-saved; xmm6-xmm11 used here
push r12
push r13
mov thresh, arg(4)
%else
%define src rdi ; src_ptr
%define stride rsi ; src_pixel_step
%define blimit rdx
%define limit rcx
%define thresh r8

%define spp rax
%define stride3 r9
%define stride5 r10
%define stride7 r11
%endif

%define scratch1 xmm5
%define scratch2 xmm6
%define zero xmm7

; i<n> addresses row n (src + n * stride). Rows are split between src and
; spp (= src + stride) plus stride3/5/7 so every operand fits the one
; base + scaled-index addressing form.
%define i0 [src]
%define i1 [spp]
%define i2 [src + 2 * stride]
%define i3 [spp + 2 * stride]
%define i4 [src + 4 * stride]
%define i5 [spp + 4 * stride]
%define i6 [src + 2 * stride3]
%define i7 [spp + 2 * stride3]
%define i8 [src + 8 * stride]
%define i9 [spp + 8 * stride]
%define i10 [src + 2 * stride5]
%define i11 [spp + 2 * stride5]
%define i12 [src + 4 * stride3]
%define i13 [spp + 4 * stride3]
%define i14 [src + 2 * stride7]
%define i15 [spp + 2 * stride7]

; prep work
lea spp, [src + stride]
lea stride3, [stride + 2 * stride]
lea stride5, [stride3 + 2 * stride]
lea stride7, [stride3 + 4 * stride]
pxor zero, zero

; first edge: p3..q3 = rows 0..7
; load the first set into registers
movdqa xmm0, i0
movdqa xmm1, i1
movdqa xmm2, i2
movdqa xmm3, i3
movdqa xmm4, i4
movdqa xmm8, i5
movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
movdqa xmm10, i7
LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

; reload p1..q1 (clobbered by the mask macro); xmm0 = mask, xmm4 = ~hev
movdqa xmm1, i2
movdqa xmm2, i3
movdqa xmm3, i4
movdqa xmm8, i5
LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
movdqa i2, xmm1
movdqa i3, xmm2

; second set: p3..q3 = rows 4..11; rows 4/5 come from the filtered regs
movdqa i4, xmm3
movdqa i5, xmm8

movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm2, i8
movdqa xmm4, i9
movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
movdqa xmm11, i11
LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

; xmm3 = mask, xmm2 = ~hev
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm4, i8
movdqa xmm8, i9
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
movdqa i6, xmm0
movdqa i7, xmm1

; last set: p3..q3 = rows 8..15; rows 8/9 come from the filtered regs
movdqa i8, xmm4
movdqa i9, xmm8

movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm2, i12
movdqa xmm3, i13
movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
movdqa xmm11, i15
LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

; xmm4 = mask, xmm2 = ~hev
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm3, i12
movdqa xmm8, i13
LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
movdqa i10, xmm0
movdqa i11, xmm1
movdqa i12, xmm3
movdqa i13, xmm8

%if LIBVPX_YASM_WIN64
pop r13
pop r12
RESTORE_XMM
pop rbp
%endif

ret
| |
| |
;void vp8_loop_filter_bv_y_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh
;)
; Filters the three vertical block-internal edges of a 16x16 luma
; macroblock (between columns 3|4, 7|8, 11|12). Strategy: transpose the
; 16x16 tile onto a 256-byte stack buffer so columns become rows, run the
; same horizontal-edge filter as vp8_loop_filter_bh_y_sse2 on the buffer,
; then transpose back and store. s<n> = source row n, i<n> = stack row n.

global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
sym(vp8_loop_filter_bv_y_sse2):

%if LIBVPX_YASM_WIN64
%define src rcx ; src_ptr
%define stride rdx ; src_pixel_step
%define blimit r8
%define limit r9
%define thresh r10

%define spp rax
%define stride3 r11
%define stride5 r12
%define stride7 r13

push rbp
mov rbp, rsp
SAVE_XMM 15 ; Win64 ABI: all of callee-saved xmm6-xmm15 are used below
push r12
push r13
mov thresh, arg(4)
%else
%define src rdi
%define stride rsi
%define blimit rdx
%define limit rcx
%define thresh r8

%define spp rax
%define stride3 r9
%define stride5 r10
%define stride7 r11
%endif

%define scratch1 xmm5
%define scratch2 xmm6
%define zero xmm7

; s<n> addresses source row n (same addressing scheme as the bh variant)
%define s0 [src]
%define s1 [spp]
%define s2 [src + 2 * stride]
%define s3 [spp + 2 * stride]
%define s4 [src + 4 * stride]
%define s5 [spp + 4 * stride]
%define s6 [src + 2 * stride3]
%define s7 [spp + 2 * stride3]
%define s8 [src + 8 * stride]
%define s9 [spp + 8 * stride]
%define s10 [src + 2 * stride5]
%define s11 [spp + 2 * stride5]
%define s12 [src + 4 * stride3]
%define s13 [spp + 4 * stride3]
%define s14 [src + 2 * stride7]
%define s15 [spp + 2 * stride7]

; i<n> is row n of the transposed tile on the (16-aligned) stack buffer
%define i0 [rsp]
%define i1 [rsp + 16]
%define i2 [rsp + 32]
%define i3 [rsp + 48]
%define i4 [rsp + 64]
%define i5 [rsp + 80]
%define i6 [rsp + 96]
%define i7 [rsp + 112]
%define i8 [rsp + 128]
%define i9 [rsp + 144]
%define i10 [rsp + 160]
%define i11 [rsp + 176]
%define i12 [rsp + 192]
%define i13 [rsp + 208]
%define i14 [rsp + 224]
%define i15 [rsp + 240]

ALIGN_STACK 16, rax ; movdqa on the buffer requires 16-byte alignment

; reserve stack space
%define temp_storage 0 ; size is 256 (16*16)
%define stack_size 256
sub rsp, stack_size

; prep work
lea spp, [src + stride]
lea stride3, [stride + 2 * stride]
lea stride5, [stride3 + 2 * stride]
lea stride7, [stride3 + 4 * stride]

; TRANSPOSE IN: interleave bytes/words/dwords/qwords of rows 8-f and 0-7
; (comments give the source coordinates row,col landing in each lane)
; 8-f
movdqa xmm0, s8
movdqa xmm1, xmm0
punpcklbw xmm0, s9 ; 80 90
punpckhbw xmm1, s9 ; 88 98

movdqa xmm2, s10
movdqa xmm3, xmm2
punpcklbw xmm2, s11 ; a0 b0
punpckhbw xmm3, s11 ; a8 b8

movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 80 90 a0 b0
punpckhwd xmm4, xmm2 ; 84 94 a4 b4

movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 88 98 a8 b8
punpckhwd xmm2, xmm3 ; 8c 9c ac bc

; using xmm[0124]
; work on next 4 rows

movdqa xmm3, s12
movdqa xmm5, xmm3
punpcklbw xmm3, s13 ; c0 d0
punpckhbw xmm5, s13 ; c8 d8

movdqa xmm6, s14
movdqa xmm7, xmm6
punpcklbw xmm6, s15 ; e0 f0
punpckhbw xmm7, s15 ; e8 f8

movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
punpckhwd xmm8, xmm6 ; c4 d4 e4 f4

movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
punpckhwd xmm6, xmm7 ; cc dc ec fc

; pull the third and fourth sets together

movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2

movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6

movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa

movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe

; save the calculations. we only have 15 registers ...
movdqa i0, xmm0
movdqa i1, xmm7
movdqa i2, xmm4
movdqa i3, xmm3
movdqa i4, xmm1
movdqa i5, xmm8
movdqa i6, xmm2
movdqa i7, xmm5

; 0-7
movdqa xmm0, s0
movdqa xmm1, xmm0
punpcklbw xmm0, s1 ; 00 10
punpckhbw xmm1, s1 ; 08 18

movdqa xmm2, s2
movdqa xmm3, xmm2
punpcklbw xmm2, s3 ; 20 30
punpckhbw xmm3, s3 ; 28 38

movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 00 10 20 30
punpckhwd xmm4, xmm2 ; 04 14 24 34

movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 08 18 28 38
punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c

; using xmm[0124]
; work on next 4 rows

movdqa xmm3, s4
movdqa xmm5, xmm3
punpcklbw xmm3, s5 ; 40 50
punpckhbw xmm5, s5 ; 48 58

movdqa xmm6, s6
movdqa xmm7, xmm6
punpcklbw xmm6, s7 ; 60 70
punpckhbw xmm7, s7 ; 68 78

movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; 40 50 60 70
punpckhwd xmm8, xmm6 ; 44 54 64 74

movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; 48 58 68 78
punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c

; pull the first two sets together

movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72

movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76

movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a

movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
; final combination: join columns 0-7 halves with columns 8-f halves

movdqa xmm6, xmm0
punpcklqdq xmm0, i0
punpckhqdq xmm6, i0

movdqa xmm9, xmm7
punpcklqdq xmm7, i1
punpckhqdq xmm9, i1

movdqa xmm10, xmm4
punpcklqdq xmm4, i2
punpckhqdq xmm10, i2

movdqa xmm11, xmm3
punpcklqdq xmm3, i3
punpckhqdq xmm11, i3

movdqa xmm12, xmm1
punpcklqdq xmm1, i4
punpckhqdq xmm12, i4

movdqa xmm13, xmm8
punpcklqdq xmm8, i5
punpckhqdq xmm13, i5

movdqa xmm14, xmm2
punpcklqdq xmm2, i6
punpckhqdq xmm14, i6

movdqa xmm15, xmm5
punpcklqdq xmm5, i7
punpckhqdq xmm15, i7

movdqa i0, xmm0
movdqa i1, xmm6
movdqa i2, xmm7
movdqa i3, xmm9
movdqa i4, xmm4
movdqa i5, xmm10
movdqa i6, xmm3
movdqa i7, xmm11
movdqa i8, xmm1
movdqa i9, xmm12
movdqa i10, xmm8
movdqa i11, xmm13
movdqa i12, xmm2
movdqa i13, xmm14
movdqa i14, xmm5
movdqa i15, xmm15

; TRANSPOSED DATA AVAILABLE ON THE STACK
; i<n> now holds source column n; filter exactly as in the bh variant.
; Rows i0..i7 are still live in registers from the transpose:
; xmm0=i0 xmm6=i1 xmm7=i2 xmm9=i3 xmm4=i4 xmm10=i5 xmm3=i6 xmm11=i7

movdqa xmm12, xmm6
movdqa xmm13, xmm7

pxor zero, zero ; zero (xmm7) was used as transpose scratch; reset it

; first edge: p3..q3 = i0..i7
LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

; xmm0 = mask, xmm4 = ~hev
movdqa xmm1, i2
movdqa xmm2, i3
movdqa xmm8, i4
movdqa xmm9, i5
LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
movdqa i2, xmm1
movdqa i3, xmm2

; second set: p3..q3 = i4..i11 (i4/i5 from the freshly filtered regs)
movdqa i4, xmm8
movdqa i5, xmm9

movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm2, i8
movdqa xmm4, i9
movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
movdqa xmm11, i11
LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

; xmm8 = mask, xmm2 = ~hev
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm3, i8
movdqa xmm4, i9
LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
movdqa i6, xmm0
movdqa i7, xmm1

; last set: p3..q3 = i8..i15 (i8/i9 from the freshly filtered regs)
movdqa i8, xmm3
movdqa i9, xmm4

movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm2, i12
movdqa xmm8, i13
movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
movdqa xmm11, i15
LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

; xmm3 = mask, xmm2 = ~hev
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm4, i12
movdqa xmm8, i13
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
movdqa i10, xmm0
movdqa i11, xmm1
movdqa i12, xmm4
movdqa i13, xmm8


; RESHUFFLE AND WRITE OUT
; transpose the stack buffer back; identical shuffle network, with the
; stack rows as input and the original source rows as output.
; 8-f
movdqa xmm0, i8
movdqa xmm1, xmm0
punpcklbw xmm0, i9 ; 80 90
punpckhbw xmm1, i9 ; 88 98

movdqa xmm2, i10
movdqa xmm3, xmm2
punpcklbw xmm2, i11 ; a0 b0
punpckhbw xmm3, i11 ; a8 b8

movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 80 90 a0 b0
punpckhwd xmm4, xmm2 ; 84 94 a4 b4

movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 88 98 a8 b8
punpckhwd xmm2, xmm3 ; 8c 9c ac bc

; using xmm[0124]
; work on next 4 rows

movdqa xmm3, i12
movdqa xmm5, xmm3
punpcklbw xmm3, i13 ; c0 d0
punpckhbw xmm5, i13 ; c8 d8

movdqa xmm6, i14
movdqa xmm7, xmm6
punpcklbw xmm6, i15 ; e0 f0
punpckhbw xmm7, i15 ; e8 f8

movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
punpckhwd xmm8, xmm6 ; c4 d4 e4 f4

movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
punpckhwd xmm6, xmm7 ; cc dc ec fc

; pull the third and fourth sets together

movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2

movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6

movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa

movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe

; save the calculations. we only have 15 registers ...
movdqa i8, xmm0
movdqa i9, xmm7
movdqa i10, xmm4
movdqa i11, xmm3
movdqa i12, xmm1
movdqa i13, xmm8
movdqa i14, xmm2
movdqa i15, xmm5

; 0-7
movdqa xmm0, i0
movdqa xmm1, xmm0
punpcklbw xmm0, i1 ; 00 10
punpckhbw xmm1, i1 ; 08 18

movdqa xmm2, i2
movdqa xmm3, xmm2
punpcklbw xmm2, i3 ; 20 30
punpckhbw xmm3, i3 ; 28 38

movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 00 10 20 30
punpckhwd xmm4, xmm2 ; 04 14 24 34

movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 08 18 28 38
punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c

; using xmm[0124]
; work on next 4 rows

movdqa xmm3, i4
movdqa xmm5, xmm3
punpcklbw xmm3, i5 ; 40 50
punpckhbw xmm5, i5 ; 48 58

movdqa xmm6, i6
movdqa xmm7, xmm6
punpcklbw xmm6, i7 ; 60 70
punpckhbw xmm7, i7 ; 68 78

movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; 40 50 60 70
punpckhwd xmm8, xmm6 ; 44 54 64 74

movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; 48 58 68 78
punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c

; pull the first two sets together

movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72

movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76

movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a

movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
; final combination

movdqa xmm6, xmm0
punpcklqdq xmm0, i8
punpckhqdq xmm6, i8

movdqa xmm9, xmm7
punpcklqdq xmm7, i9
punpckhqdq xmm9, i9

movdqa xmm10, xmm4
punpcklqdq xmm4, i10
punpckhqdq xmm10, i10

movdqa xmm11, xmm3
punpcklqdq xmm3, i11
punpckhqdq xmm11, i11

movdqa xmm12, xmm1
punpcklqdq xmm1, i12
punpckhqdq xmm12, i12

movdqa xmm13, xmm8
punpcklqdq xmm8, i13
punpckhqdq xmm13, i13

movdqa xmm14, xmm2
punpcklqdq xmm2, i14
punpckhqdq xmm14, i14

movdqa xmm15, xmm5
punpcklqdq xmm5, i15
punpckhqdq xmm15, i15

; store the re-transposed rows back to the source image
movdqa s0, xmm0
movdqa s1, xmm6
movdqa s2, xmm7
movdqa s3, xmm9
movdqa s4, xmm4
movdqa s5, xmm10
movdqa s6, xmm3
movdqa s7, xmm11
movdqa s8, xmm1
movdqa s9, xmm12
movdqa s10, xmm8
movdqa s11, xmm13
movdqa s12, xmm2
movdqa s13, xmm14
movdqa s14, xmm5
movdqa s15, xmm15

; free stack space
add rsp, stack_size

; un-ALIGN_STACK
pop rsp

%if LIBVPX_YASM_WIN64
pop r13
pop r12
RESTORE_XMM
pop rbp
%endif

ret
| |
SECTION_RODATA
; Byte-broadcast constants for the filter arithmetic. te0/t1f and t80/t7f
; implement per-byte signed shifts (>>3 and >>1) on top of psrlw, since
; SSE2 has no per-byte shift; tfe masks lsbs for an unsigned per-byte >>1;
; t80 toggles between unsigned and signed pixel representation; t1/t3/t4
; are the rounding constants from the VP8 normal-filter definition.
align 16
te0: ; 0xe0 = sign bits to OR back in after emulated signed >> 3
times 16 db 0xe0
align 16
t7f: ; 0x7f = low-7-bit mask for emulated signed >> 1
times 16 db 0x7f
align 16
tfe: ; 0xfe = clear per-byte lsb before psrlw 1 (unsigned per-byte >> 1)
times 16 db 0xfe
align 16
t1f: ; 0x1f = low-5-bit mask for emulated signed >> 3
times 16 db 0x1f
align 16
t80: ; 0x80 = unsigned<->signed bias; also sign bit for emulated >> 1
times 16 db 0x80
align 16
t1: ; +1 rounding for the outer-tap (Filter1 + 1) >> 1
times 16 db 0x01
align 16
t3: ; +3 rounding for Filter2
times 16 db 0x03
align 16
t4: ; +4 rounding for Filter1
times 16 db 0x04