| ; |
| ; Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| %define private_prefix vp9 |
| |
| %include "third_party/x86inc/x86inc.asm" |
| |
| SECTION .text |
| ALIGN 16 |
| |
| ; |
| ; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff, |
| ; intptr_t block_size, int64_t *ssz) |
| ; |
| |
;-----------------------------------------------------------------------------
; int64_t vp9_highbd_block_error_8bit(const int32_t *coeff,
;                                     const int32_t *dqcoeff,
;                                     intptr_t block_size, int64_t *ssz)
;
; Returns (in rax, or edx:eax on x86-32) the sum of squared differences
; between the unquantized (uqc) and dequantized (dqc) coefficients, and
; stores the sum of squared unquantized coefficients through *ssz.
; block_size is assumed to be a positive multiple of 16 (one loop iteration
; consumes 16 int32 coefficients per array). Coefficients are assumed to
; fit in int16 (8-bit depth path), hence the packssdw narrowing below.
; Inputs must be 16-byte aligned (mova loads).
;-----------------------------------------------------------------------------
INIT_XMM avx
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
  vzeroupper

  ; If only one iteration is required, then handle this as a special case.
  ; It is the most frequent case, so we can have a significant gain here
  ; by not setting up a loop and accumulators.
  cmp    sizeq, 16
  jne   .generic

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Common case of size == 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  ; Load input vectors, narrowing each pair of int32 vectors to one int16
  ; vector (values are known to fit in 16 bits for 8-bit depth).
  mova      xm0, [dqcq]
  packssdw  xm0, [dqcq+16]
  mova      xm2, [uqcq]
  packssdw  xm2, [uqcq+16]

  mova      xm1, [dqcq+32]
  packssdw  xm1, [dqcq+48]
  mova      xm3, [uqcq+32]
  packssdw  xm3, [uqcq+48]

  ; Compute the errors (psubw writes xm0/xm1 only, so xm2/xm3 still hold
  ; the unquantized coefficients needed for the ssz sums below).
  psubw    xm0, xm2
  psubw    xm1, xm3

  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  pmaddwd  xm2, xm2
  pmaddwd  xm3, xm3

  pmaddwd  xm0, xm0
  pmaddwd  xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32bits
  paddd    xm2, xm3
  paddd    xm0, xm1

  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
  pxor     xm5, xm5

  pblendw  xm3, xm5, xm2, 0x33 ; Zero extended  low of a pair of 32 bits
  psrlq    xm2, 32             ; Zero extended high of a pair of 32 bits

  pblendw  xm1, xm5, xm0, 0x33 ; Zero extended  low of a pair of 32 bits
  psrlq    xm0, 32             ; Zero extended high of a pair of 32 bits

  paddq    xm2, xm3
  paddq    xm0, xm1

  psrldq   xm3, xm2, 8
  psrldq   xm1, xm0, 8

  paddq    xm2, xm3
  paddq    xm0, xm1

  ; Store the return value
%if ARCH_X86_64
  movq    rax, xm0
  movq [sszq], xm2
%else
  movd    eax, xm0
  pextrd  edx, xm0, 1          ; 64-bit return in edx:eax on x86-32
  movq [sszd], xm2
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, speculative low precision
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ALIGN 16
.generic:
  pxor     xm4, xm4                ; sse accumulator
  pxor     xm5, xm5                ; overflow detection register for xm4
  pxor     xm6, xm6                ; ssz accumulator
  pxor     xm7, xm7                ; overflow detection register for xm6
  lea     uqcq, [uqcq+sizeq*4]     ; point past the arrays so a negative
  lea     dqcq, [dqcq+sizeq*4]     ; index can count up towards zero
  neg    sizeq

  ; Push the negative size as the high precision code might need it
  push   sizeq

.loop:
  ; Load input vectors
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  add      sizeq, 16               ; sets ZF consumed by jnz below; the
                                   ; vector ops in between leave EFLAGS alone

  ; Compute the squared errors.
  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  psubw    xm0, xm2
  pmaddwd  xm2, xm2
  pmaddwd  xm0, xm0

  psubw    xm1, xm3
  pmaddwd  xm3, xm3
  pmaddwd  xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32bits
  paddd    xm2, xm3
  paddd    xm0, xm1

  ; We accumulate using 32 bit arithmetic, but detect potential overflow
  ; by checking if the MSB of the accumulators have ever been a set bit.
  ; If yes, we redo the whole compute at the end on higher precision, but
  ; this happens extremely rarely, so we still achieve a net gain.
  paddd    xm4, xm0
  paddd    xm6, xm2
  por      xm5, xm4                ; OR in the accumulator for overflow detection
  por      xm7, xm6                ; OR in the accumulator for overflow detection

  jnz .loop                        ; until sizeq reaches zero

  ; Add pairs horizontally (still only on 32 bits)
  phaddd   xm4, xm4
  por      xm5, xm4                ; OR in the accumulator for overflow detection
  phaddd   xm6, xm6
  por      xm7, xm6                ; OR in the accumulator for overflow detection

  ; Check for possibility of overflow by testing if bit 32 of each dword lane
  ; have ever been set. If they were not, then there was no overflow and the
  ; final sum will fit in 32 bits. If overflow happened, then
  ; we redo the whole computation on higher precision.
  por      xm7, xm5
  pmovmskb r4, xm7
  test     r4, 0x8888              ; pmovmskb bits 3/7/11/15 = dword sign bits
  jnz .highprec

  phaddd   xm4, xm4
  phaddd   xm6, xm6
  pmovzxdq xm4, xm4
  pmovzxdq xm6, xm6

  ; Restore stack
  pop    sizeq

  ; Store the return value
%if ARCH_X86_64
  movq    rax, xm4
  movq [sszq], xm6
%else
  movd    eax, xm4
  pextrd  edx, xm4, 1
  movq [sszd], xm6
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, high precision case
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.highprec:
  pxor     xm4, xm4                ; sse accumulator
  pxor     xm5, xm5                ; dedicated zero register
  pxor     xm6, xm6                ; ssz accumulator
  pop    sizeq                     ; reload the negative size saved above

.loophp:
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  add      sizeq, 16               ; ZF from here drives jnz .loophp

  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)

  psubw    xm0, xm2
  pmaddwd  xm2, xm2
  pmaddwd  xm0, xm0

  psubw    xm1, xm3
  pmaddwd  xm3, xm3
  pmaddwd  xm1, xm1

  ; accumulate in 64bit: interleave each dword sum with zero (xm5) to
  ; widen to qwords, then add into the 64-bit accumulators
  punpckldq xm7, xm0, xm5
  punpckhdq xm0, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm2, xm5
  punpckhdq xm2, xm5
  paddq     xm6, xm7

  punpckldq xm7, xm1, xm5
  punpckhdq xm1, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm3, xm5
  punpckhdq xm3, xm5
  paddq     xm6, xm7

  paddq    xm4, xm0
  paddq    xm4, xm1
  paddq    xm6, xm2
  paddq    xm6, xm3

  jnz .loophp

  ; Accumulate horizontally
  movhlps  xm5, xm4
  movhlps  xm7, xm6
  paddq    xm4, xm5
  paddq    xm6, xm7

  ; Store the return value
%if ARCH_X86_64
  movq    rax, xm4
  movq [sszq], xm6
%else
  movd    eax, xm4
  pextrd  edx, xm4, 1
  movq [sszd], xm6
%endif
  RET