| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| %include "vpx_ports/x86_abi_support.asm" |
| |
| ; void vp10_temporal_filter_apply_sse2 | arg |
| ; (unsigned char *frame1, | 0 |
| ; unsigned int stride, | 1 |
| ; unsigned char *frame2, | 2 |
| ; unsigned int block_width, | 3 |
| ; unsigned int block_height, | 4 |
| ; int strength, | 5 |
| ; int filter_weight, | 6 |
| ; unsigned int *accumulator, | 7 |
| ; unsigned short *count) | 8 |
| global sym(vp10_temporal_filter_apply_sse2) PRIVATE |
| sym(vp10_temporal_filter_apply_sse2): |
| |
| push rbp |
| mov rbp, rsp |
| SHADOW_ARGS_TO_STACK 9 |
| SAVE_XMM 7 |
| GET_GOT rbx |
| push rsi |
| push rdi |
| ALIGN_STACK 16, rax |
| %define block_width 0 |
| %define block_height 16 |
| %define strength 32 |
| %define filter_weight 48 |
| %define rounding_bit 64 |
| %define rbp_backup 80 |
| %define stack_size 96 |
| sub rsp, stack_size |
| mov [rsp + rbp_backup], rbp |
| ; end prolog |
| |
| mov edx, arg(3) |
| mov [rsp + block_width], rdx |
| mov edx, arg(4) |
| mov [rsp + block_height], rdx |
| movd xmm6, arg(5) |
| movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read |
| |
| ; calculate the rounding bit outside the loop |
| ; 0x8000 >> (16 - strength) |
| mov rdx, 16 |
| sub rdx, arg(5) ; 16 - strength |
| movq xmm4, rdx ; can't use rdx w/ shift |
| movdqa xmm5, [GLOBAL(_const_top_bit)] |
| psrlw xmm5, xmm4 |
| movdqa [rsp + rounding_bit], xmm5 |
| |
| mov rsi, arg(0) ; src/frame1 |
| mov rdx, arg(2) ; predictor frame |
| mov rdi, arg(7) ; accumulator |
| mov rax, arg(8) ; count |
| |
| ; dup the filter weight and store for later |
| movd xmm0, arg(6) ; filter_weight |
| pshuflw xmm0, xmm0, 0 |
| punpcklwd xmm0, xmm0 |
| movdqa [rsp + filter_weight], xmm0 |
| |
| mov rbp, arg(1) ; stride |
| pxor xmm7, xmm7 ; zero for extraction |
| |
| mov rcx, [rsp + block_width] |
| imul rcx, [rsp + block_height] |
| add rcx, rdx |
| cmp dword ptr [rsp + block_width], 8 |
| jne .temporal_filter_apply_load_16 |
| |
| .temporal_filter_apply_load_8: |
| movq xmm0, [rsi] ; first row |
| lea rsi, [rsi + rbp] ; += stride |
| punpcklbw xmm0, xmm7 ; src[ 0- 7] |
| movq xmm1, [rsi] ; second row |
| lea rsi, [rsi + rbp] ; += stride |
| punpcklbw xmm1, xmm7 ; src[ 8-15] |
| jmp .temporal_filter_apply_load_finished |
| |
| .temporal_filter_apply_load_16: |
| movdqa xmm0, [rsi] ; src (frame1) |
| lea rsi, [rsi + rbp] ; += stride |
| movdqa xmm1, xmm0 |
| punpcklbw xmm0, xmm7 ; src[ 0- 7] |
| punpckhbw xmm1, xmm7 ; src[ 8-15] |
| |
| .temporal_filter_apply_load_finished: |
| movdqa xmm2, [rdx] ; predictor (frame2) |
| movdqa xmm3, xmm2 |
| punpcklbw xmm2, xmm7 ; pred[ 0- 7] |
| punpckhbw xmm3, xmm7 ; pred[ 8-15] |
| |
| ; modifier = src_byte - pixel_value |
| psubw xmm0, xmm2 ; src - pred[ 0- 7] |
| psubw xmm1, xmm3 ; src - pred[ 8-15] |
| |
| ; modifier *= modifier |
| pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 |
| pmullw xmm1, xmm1 ; modifer[ 8-15]^2 |
| |
| ; modifier *= 3 |
| pmullw xmm0, [GLOBAL(_const_3w)] |
| pmullw xmm1, [GLOBAL(_const_3w)] |
| |
| ; modifer += 0x8000 >> (16 - strength) |
| paddw xmm0, [rsp + rounding_bit] |
| paddw xmm1, [rsp + rounding_bit] |
| |
| ; modifier >>= strength |
| psrlw xmm0, [rsp + strength] |
| psrlw xmm1, [rsp + strength] |
| |
| ; modifier = 16 - modifier |
| ; saturation takes care of modifier > 16 |
| movdqa xmm3, [GLOBAL(_const_16w)] |
| movdqa xmm2, [GLOBAL(_const_16w)] |
| psubusw xmm3, xmm1 |
| psubusw xmm2, xmm0 |
| |
| ; modifier *= filter_weight |
| pmullw xmm2, [rsp + filter_weight] |
| pmullw xmm3, [rsp + filter_weight] |
| |
| ; count |
| movdqa xmm4, [rax] |
| movdqa xmm5, [rax+16] |
| ; += modifier |
| paddw xmm4, xmm2 |
| paddw xmm5, xmm3 |
| ; write back |
| movdqa [rax], xmm4 |
| movdqa [rax+16], xmm5 |
| lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) |
| |
| ; load and extract the predictor up to shorts |
| pxor xmm7, xmm7 |
| movdqa xmm0, [rdx] |
| lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) |
| movdqa xmm1, xmm0 |
| punpcklbw xmm0, xmm7 ; pred[ 0- 7] |
| punpckhbw xmm1, xmm7 ; pred[ 8-15] |
| |
| ; modifier *= pixel_value |
| pmullw xmm0, xmm2 |
| pmullw xmm1, xmm3 |
| |
| ; expand to double words |
| movdqa xmm2, xmm0 |
| punpcklwd xmm0, xmm7 ; [ 0- 3] |
| punpckhwd xmm2, xmm7 ; [ 4- 7] |
| movdqa xmm3, xmm1 |
| punpcklwd xmm1, xmm7 ; [ 8-11] |
| punpckhwd xmm3, xmm7 ; [12-15] |
| |
| ; accumulator |
| movdqa xmm4, [rdi] |
| movdqa xmm5, [rdi+16] |
| movdqa xmm6, [rdi+32] |
| movdqa xmm7, [rdi+48] |
| ; += modifier |
| paddd xmm4, xmm0 |
| paddd xmm5, xmm2 |
| paddd xmm6, xmm1 |
| paddd xmm7, xmm3 |
| ; write back |
| movdqa [rdi], xmm4 |
| movdqa [rdi+16], xmm5 |
| movdqa [rdi+32], xmm6 |
| movdqa [rdi+48], xmm7 |
| lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) |
| |
| cmp rdx, rcx |
| je .temporal_filter_apply_epilog |
| pxor xmm7, xmm7 ; zero for extraction |
| cmp dword ptr [rsp + block_width], 16 |
| je .temporal_filter_apply_load_16 |
| jmp .temporal_filter_apply_load_8 |
| |
| .temporal_filter_apply_epilog: |
| ; begin epilog |
| mov rbp, [rsp + rbp_backup] |
| add rsp, stack_size |
| pop rsp |
| pop rdi |
| pop rsi |
| RESTORE_GOT |
| RESTORE_XMM |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
| SECTION_RODATA |
| align 16 |
| _const_3w: |
| times 8 dw 3 |
| align 16 |
| _const_top_bit: |
| times 8 dw 1<<15 |
| align 16 |
| _const_16w |
| times 8 dw 16 |