| ; Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| ; Use of this source code is governed by a BSD-style license that can be |
| ; found in the LICENSE file. |
| |
| %include "x86inc.asm" |
| |
| ; |
| ; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM |
| ; processors. |
| ; |
| SECTION_TEXT |
| ; Enable every instruction set named in the comment above. SSE2 was missing |
| ; from this list (SSE3 was listed twice) although the macros below rely on |
| ; SSE2 instructions such as movdqa, pshufd, psrldq and pshuflw. |
| CPU SSE, SSE2, SSE3, SSSE3 |
| |
| ; |
| ; XMM registers representing constants. We must not use these registers as |
| ; destination operands. |
| ; for (int i = 0; i < 16; i += 4) { |
| ; xmm7.b[i] = 25; xmm7.b[i+1] = 2; xmm7.b[i+2] = 66; xmm7.b[i+3] = 0; |
| ; xmm6.b[i] = 0; xmm6.b[i+1] = 127; xmm6.b[i+2] = 0; xmm6.b[i+3] = 0; |
| ; xmm5.b[i] = 112; xmm5.b[i+1] = -74; xmm5.b[i+2] = -38; xmm5.b[i+3] = 0; |
| ; xmm4.b[i] = -18; xmm4.b[i+1] = -94; xmm4.b[i+2] = 112; xmm4.b[i+3] = 0; |
| ; } |
| ; |
| ; Weights for the first partial Y sum (see CALC_Y). |
| %define XMM_CONST_Y0 xmm7 |
| ; Weights for the second partial Y sum (see CALC_Y). |
| %define XMM_CONST_Y1 xmm6 |
| ; Weights for U (passed to CALC_UV as %3). |
| %define XMM_CONST_U xmm5 |
| ; Weights for V (passed to CALC_UV as %3). |
| %define XMM_CONST_V xmm4 |
| ; Added word-wise by CALC_Y and CALC_UV as the rounding/offset constant 128. |
| ; Its contents are not listed in the comment above and, like the other |
| ; constants, it is loaded outside this section. |
| %define XMM_CONST_128 xmm3 |
| |
| ; |
| ; LOAD_XMM %1 (xmm), %2 (imm32) |
| ; Loads an immediate value to an XMM register, replicating it into all four |
| ; 32-bit lanes. |
| ; %1.d[0] = %1.d[1] = %1.d[2] = %1.d[3] = %2; |
| ; Clobbers: TEMPd (TEMP is declared outside this section). |
| ; |
| %macro LOAD_XMM 2 |
| ; movd cannot take an immediate operand, so stage the value in TEMPd first. |
| mov TEMPd, %2 |
| movd %1, TEMPd |
| ; A shuffle selector of 0 copies dword 0 into every dword of %1. |
| pshufd %1, %1, 00000000B |
| %endmacro |
| |
| ; |
| ; UNPACKRGB %1 (xmm), %2 (imm8) |
| ; Unpacks one RGB pixel in the specified XMM register. |
| ; for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1]; |
| ; %1.b[%2] = 0; |
| ; for (int i = %2 - 1; i >= 0; --i) %1.b[i] = %1.b[i]; |
| ; In other words, a zero byte is inserted at byte offset %2: bytes below %2 |
| ; stay in place, bytes at or above %2 move up by one, and the old byte 15 is |
| ; discarded. |
| ; Clobbers: xmm1, so %1 must not be xmm1. |
| ; |
| %macro UNPACKRGB 2 |
| ; xmm1 = %1 with bytes [0, %2) cleared (shift them out, then shift back). |
| movdqa xmm1, %1 |
| psrldq xmm1, %2 |
| pslldq xmm1, %2 |
| ; XOR-ing removes the upper part from %1, leaving only bytes [0, %2). |
| pxor %1, xmm1 |
| ; Move the upper part up by one byte; byte %2 of xmm1 becomes 0. |
| pslldq xmm1, 1 |
| ; Merge the untouched low bytes with the shifted upper bytes. |
| por %1, xmm1 |
| %endmacro |
| |
| ; |
| ; READ_ARGB %1 (xmm), %2 (imm) |
| ; Read the specified number of ARGB (or RGB) pixels from the source and store |
| ; them to the destination xmm register. If the input format is RGB, we read RGB |
| ; pixels and convert them to ARGB pixels. (For this case, the alpha values of |
| ; the output pixels become 0.) |
| ; %2 must be 1, 2, or 4; any other value is an assembly-time error. |
| ; Pixels are read from ARGBq + WIDTHq * PIXELSIZE * 2. |
| ; When PIXELSIZE == 3 this macro clobbers TEMP and xmm1, so %1 must not be |
| ; xmm1. WIDTHq is preserved in every path. |
| ; |
| %macro READ_ARGB 2 |
| |
| %if PIXELSIZE == 4 |
| |
| ; Read ARGB pixels from the source. (This macro assumes the input buffer may |
| ; not be aligned to a 16-byte boundary.) |
| %if %2 == 1 |
| movd %1, DWORD [ARGBq + WIDTHq * 4 * 2] |
| %elif %2 == 2 |
| movq %1, QWORD [ARGBq + WIDTHq * 4 * 2] |
| %elif %2 == 4 |
| movdqu %1, DQWORD [ARGBq + WIDTHq * 4 * 2] |
| %else |
| %error unsupported number of pixels. |
| %endif |
| |
| %elif PIXELSIZE == 3 |
| |
| ; Read RGB pixels from the source and convert them to ARGB pixels. |
| %if %2 == 1 |
| ; Read one RGB pixel and convert it to one ARGB pixel. |
| ; Save the WIDTH register to xmm1. (This macro needs to break it, because |
| ; TEMP is used to assemble the pixel and cannot also hold the byte offset.) |
| MOVq xmm1, WIDTHq |
| |
| ; First assemble the three source bytes in TEMPd (byte 2 in bits 16-23, |
| ; bytes 0-1 in bits 0-15, bits 24-31 zero), then copy TEMPd to the |
| ; destination xmm register. |
| lea WIDTHq, [WIDTHq + WIDTHq * 2] |
| movzx TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2] |
| shl TEMPd, 16 |
| mov TEMPw, WORD [ARGBq + WIDTHq * 2] |
| movd %1, TEMPd |
| |
| ; Restore the WIDTH register. |
| MOVq WIDTHq, xmm1 |
| %elif %2 == 2 |
| ; Read two RGB pixels and convert them to two ARGB pixels. |
| ; Read six bytes from the source to the destination xmm register. |
| mov TEMPq, WIDTHq |
| lea TEMPq, [TEMPq + TEMPq * 2] |
| movd %1, DWORD [ARGBq + TEMPq * 2] |
| ; Source bytes 4-5 must land in word 2 (bytes 4-5) of %1 so that the six |
| ; bytes stay contiguous; word 3 would leave a two-byte hole and UNPACKRGB |
| ; would then build the second pixel from the wrong bytes. |
| pinsrw %1, WORD [ARGBq + TEMPq * 2 + 4], 2 |
| |
| ; Fill the alpha values of these RGB pixels with 0 and convert them to two |
| ; ARGB pixels. |
| UNPACKRGB %1, 3 |
| %elif %2 == 4 |
| ; Read four RGB pixels and convert them to four ARGB pixels. |
| ; Read twelve bytes from the source to the destination xmm register. |
| mov TEMPq, WIDTHq |
| lea TEMPq, [TEMPq + TEMPq * 2] |
| movq %1, QWORD [ARGBq + TEMPq * 2] |
| movd xmm1, DWORD [ARGBq + TEMPq * 2 + 8] |
| ; %1.d[0..1] keep source bytes 0-7, %1.d[2] = xmm1.d[0] (source bytes 8-11) |
| ; and %1.d[3] = xmm1.d[1], which movd has zeroed. |
| shufps %1, xmm1, 01000100B |
| |
| ; Fill the alpha values of these RGB pixels with 0 and convert them to four |
| ; ARGB pixels. Each call inserts one zero byte, so the next pixel's alpha |
| ; position moves up by four bytes; the fourth alpha byte (15) is already 0. |
| UNPACKRGB %1, 3 |
| UNPACKRGB %1, 4 + 3 |
| UNPACKRGB %1, 4 + 4 + 3 |
| %else |
| %error unsupported number of pixels. |
| %endif |
| |
| %else |
| %error unsupported PIXELSIZE value. |
| %endif |
| |
| %endmacro |
| |
| ; |
| ; CALC_Y %1 (xmm), %2 (xmm) |
| ; Calculates four Y values from four ARGB pixels stored in %2. |
| ; %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16); |
| ; %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16); |
| ; %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16); |
| ; %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16); |
| ; %2 is left unchanged. Clobbers: xmm2. %1 must differ from %2 (and neither |
| ; may be xmm2) because %2 is read again after %1 has been overwritten. |
| ; |
| %macro CALC_Y 2 |
| ; To avoid signed saturation, we divide this conversion formula into two |
| ; formulae and store their results into two XMM registers %1 and xmm2. |
| ; (pmaddubsw takes its weights as signed bytes, so the G weight 129 is split |
| ; into 2 + 127.) |
| ; %1.w[0] = 25 * %2.b[0] + 2 * %2.b[1] + 66 * %2.b[2] + 0 * %2.b[3]; |
| ; %1.w[1] = 25 * %2.b[4] + 2 * %2.b[5] + 66 * %2.b[6] + 0 * %2.b[7]; |
| ; %1.w[2] = 25 * %2.b[8] + 2 * %2.b[9] + 66 * %2.b[10] + 0 * %2.b[11]; |
| ; %1.w[3] = 25 * %2.b[12] + 2 * %2.b[13] + 66 * %2.b[14] + 0 * %2.b[15]; |
| ; xmm2.w[0] = 0 * %2.b[0] + 127 * %2.b[1] + 0 * %2.b[2] + 0 * %2.b[3]; |
| ; xmm2.w[1] = 0 * %2.b[4] + 127 * %2.b[5] + 0 * %2.b[6] + 0 * %2.b[7]; |
| ; xmm2.w[2] = 0 * %2.b[8] + 127 * %2.b[9] + 0 * %2.b[10] + 0 * %2.b[11]; |
| ; xmm2.w[3] = 0 * %2.b[12] + 127 * %2.b[13] + 0 * %2.b[14] + 0 * %2.b[15]; |
| movdqa %1, %2 |
| pmaddubsw %1, XMM_CONST_Y0 |
| phaddsw %1, %1 |
| movdqa xmm2, %2 |
| pmaddubsw xmm2, XMM_CONST_Y1 |
| phaddsw xmm2, xmm2 |
| |
| ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16); |
| ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16); |
| ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16); |
| ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16); |
| ; The combined sum can reach 220 * 255 + 128 = 56228, which is above the |
| ; signed 16-bit range but still fits in an unsigned word; hence the wrapping |
| ; paddw and the logical shift psrlw. |
| paddw %1, xmm2 |
| movdqa xmm2, XMM_CONST_128 |
| paddw %1, xmm2 |
| psrlw %1, 8 |
| ; Derive the + 16 offset from the 128 constant (128 >> 3 == 16), assuming |
| ; XMM_CONST_128 holds 128 in every word as the + 128 step above implies. |
| psrlw xmm2, 3 |
| paddw %1, xmm2 |
| ; ToByte: pack words to bytes with unsigned saturation. |
| packuswb %1, %1 |
| %endmacro |
| |
| ; |
| ; INIT_UV %1 (r32), %2 (reg), %3 (imm) |
| ; Loads the byte(s) currently stored at [%2 + WIDTHq], zero-extended, into |
| ; %1. Code is emitted only when SUBSAMPLING == 1 and LINE == 1; for every |
| ; other configuration the macro expands to nothing. |
| ; %3 is the number of pixels being processed: one byte is loaded for 1 or 2 |
| ; pixels, two bytes for 4 pixels. |
| ; (The loaded value is presumably handed to CALC_UV as its %4 operand - |
| ; confirm against the including template.) |
| ; |
| %macro INIT_UV 3 |
| |
| %if SUBSAMPLING == 1 |
| %if LINE == 1 |
| %if %3 == 4 |
| movzx %1, WORD [%2 + WIDTHq] |
| %elif %3 == 1 || %3 == 2 |
| movzx %1, BYTE [%2 + WIDTHq] |
| %else |
| %error unsupported number of pixels. |
| %endif |
| %endif |
| %endif |
| |
| %endmacro |
| |
| ; |
| ; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32) |
| ; Calculates two U (or V) values from four ARGB pixels stored in %2. |
| ; With S(i) denoting the weighted sum for pixel i, |
| ; S(i) = 112 * B(i) - 74 * G(i) - 38 * R(i) if %3 == XMM_CONST_U |
| ; S(i) = -18 * B(i) - 94 * G(i) + 112 * R(i) if %3 == XMM_CONST_V |
| ; the results are |
| ; if (SUBSAMPLING) { |
| ; %1.b[0] = ToByte(((S(0) + S(1) + 1) / 2 + 128) / 256 + 128); |
| ; %1.b[1] = ToByte(((S(2) + S(3) + 1) / 2 + 128) / 256 + 128); |
| ; } else { |
| ; %1.b[0] = ToByte((S(0) + 128) / 256 + 128); |
| ; %1.b[1] = ToByte((S(2) + 128) / 256 + 128); |
| ; } |
| ; When SUBSAMPLING == 1 and LINE == 1, each result byte is then replaced by |
| ; its rounded average with the corresponding byte of %4. |
| ; %2 is left unchanged. Clobbers: xmm2 (only when SUBSAMPLING == 1), so %1 |
| ; must not be xmm2 in that configuration. |
| ; |
| %macro CALC_UV 4 |
| ; for (int i = 0; i < 4; ++i) { |
| ; %1.w[i] = 0; |
| ; for (int j = 0; j < 4; ++j) |
| ; %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j]; |
| ; } |
| movdqa %1, %2 |
| pmaddubsw %1, %3 |
| phaddsw %1, %1 |
| |
| %if SUBSAMPLING == 1 |
| ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2; |
| ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2; |
| ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2; |
| ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2; |
| ; The sums are signed, but pavgw averages unsigned words: a pair with |
| ; opposite signs (e.g. -100 and 100) would average to 0x8000 instead of 0. |
| ; Flipping the sign bit adds 0x8000 modulo 2^16, which maps the signed range |
| ; onto the unsigned range in order; average there, then flip the bit back. |
| ; xmm2 = 0x8000 in every word. |
| pcmpeqw xmm2, xmm2 |
| psllw xmm2, 15 |
| pxor %1, xmm2 |
| ; xmm2 = %1 with words 0<->1 and 2<->3 swapped. |
| pshuflw xmm2, %1, 10110001B |
| pavgw %1, xmm2 |
| ; Rebuild the 0x8000 mask (xmm2 was reused above) and remove the bias. |
| pcmpeqw xmm2, xmm2 |
| psllw xmm2, 15 |
| pxor %1, xmm2 |
| %endif |
| |
| ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128); |
| ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128); |
| ; Gather words 0 and 2 into words 0 and 1 before scaling. |
| pshuflw %1, %1, 10001000B |
| paddw %1, XMM_CONST_128 |
| ; Arithmetic shift: the sums are signed. |
| psraw %1, 8 |
| paddw %1, XMM_CONST_128 |
| packuswb %1, %1 |
| |
| %if SUBSAMPLING == 1 && LINE == 1 |
| ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2; |
| ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2; |
| movd xmm2, %4 |
| pavgb %1, xmm2 |
| %endif |
| %endmacro |
| |
| ; |
| ; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb, |
| ; uint8* y, |
| ; uint8* u, |
| ; uint8* v, |
| ; int width); |
| ; |
| ; Template configuration: |
| ; PIXELSIZE 4 - READ_ARGB loads four-byte ARGB pixels directly. |
| ; SUBSAMPLING 0 - CALC_UV does not average horizontally adjacent pixels. |
| ; LINE 0 - INIT_UV expands to nothing and CALC_UV skips the blend with |
| ; its %4 operand. |
| ; SYMBOL is presumably used by the included template as the name of the |
| ; generated function - confirm against convert_rgb_to_yuv_ssse3.inc. |
| ; |
| %define SYMBOL ConvertARGBToYUVRow_SSSE3 |
| %define PIXELSIZE 4 |
| %define SUBSAMPLING 0 |
| %define LINE 0 |
| %include "convert_rgb_to_yuv_ssse3.inc" |
| |
| ; |
| ; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb, |
| ; uint8* y, |
| ; uint8* u, |
| ; uint8* v, |
| ; int width); |
| ; |
| ; Template configuration (these %defines replace the values set for the |
| ; previous instantiation): |
| ; PIXELSIZE 3 - READ_ARGB loads three-byte RGB pixels and expands them to |
| ; ARGB with alpha 0. |
| ; SUBSAMPLING 0 - CALC_UV does not average horizontally adjacent pixels. |
| ; LINE 0 - INIT_UV expands to nothing and CALC_UV skips the blend with |
| ; its %4 operand. |
| ; |
| %define SYMBOL ConvertRGBToYUVRow_SSSE3 |
| %define PIXELSIZE 3 |
| %define SUBSAMPLING 0 |
| %define LINE 0 |
| %include "convert_rgb_to_yuv_ssse3.inc" |
| |
| ; |
| ; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb, |
| ; uint8* y, |
| ; uint8* u, |
| ; uint8* v, |
| ; int width); |
| ; |
| ; Template configuration (these %defines replace the values set for the |
| ; previous instantiation): |
| ; PIXELSIZE 4 - READ_ARGB loads four-byte ARGB pixels directly. |
| ; SUBSAMPLING 1 - CALC_UV averages each pair of horizontally adjacent pixels. |
| ; LINE 0 - INIT_UV expands to nothing and CALC_UV skips the blend with |
| ; its %4 operand. |
| ; |
| %define SYMBOL ConvertARGBToYUVEven_SSSE3 |
| %define PIXELSIZE 4 |
| %define SUBSAMPLING 1 |
| %define LINE 0 |
| %include "convert_rgb_to_yuv_ssse3.inc" |
| |
| ; |
| ; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb, |
| ; uint8* y, |
| ; uint8* u, |
| ; uint8* v, |
| ; int width); |
| ; |
| ; Template configuration (these %defines replace the values set for the |
| ; previous instantiation): |
| ; PIXELSIZE 4 - READ_ARGB loads four-byte ARGB pixels directly. |
| ; SUBSAMPLING 1 - CALC_UV averages each pair of horizontally adjacent pixels. |
| ; LINE 1 - INIT_UV loads the bytes currently in the U/V buffer and |
| ; CALC_UV averages the new values with them (presumably the |
| ; values produced for the preceding even line - confirm |
| ; against the caller). |
| ; |
| %define SYMBOL ConvertARGBToYUVOdd_SSSE3 |
| %define PIXELSIZE 4 |
| %define SUBSAMPLING 1 |
| %define LINE 1 |
| %include "convert_rgb_to_yuv_ssse3.inc" |
| |
| ; |
| ; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb, |
| ; uint8* y, |
| ; uint8* u, |
| ; uint8* v, |
| ; int width); |
| ; |
| ; Template configuration (these %defines replace the values set for the |
| ; previous instantiation): |
| ; PIXELSIZE 3 - READ_ARGB loads three-byte RGB pixels and expands them to |
| ; ARGB with alpha 0. |
| ; SUBSAMPLING 1 - CALC_UV averages each pair of horizontally adjacent pixels. |
| ; LINE 0 - INIT_UV expands to nothing and CALC_UV skips the blend with |
| ; its %4 operand. |
| ; |
| %define SYMBOL ConvertRGBToYUVEven_SSSE3 |
| %define PIXELSIZE 3 |
| %define SUBSAMPLING 1 |
| %define LINE 0 |
| %include "convert_rgb_to_yuv_ssse3.inc" |
| |
| ; |
| ; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb, |
| ; uint8* y, |
| ; uint8* u, |
| ; uint8* v, |
| ; int width); |
| ; |
| ; Template configuration (these %defines replace the values set for the |
| ; previous instantiation): |
| ; PIXELSIZE 3 - READ_ARGB loads three-byte RGB pixels and expands them to |
| ; ARGB with alpha 0. |
| ; SUBSAMPLING 1 - CALC_UV averages each pair of horizontally adjacent pixels. |
| ; LINE 1 - INIT_UV loads the bytes currently in the U/V buffer and |
| ; CALC_UV averages the new values with them (presumably the |
| ; values produced for the preceding even line - confirm |
| ; against the caller). |
| ; |
| %define SYMBOL ConvertRGBToYUVOdd_SSSE3 |
| %define PIXELSIZE 3 |
| %define SUBSAMPLING 1 |
| %define LINE 1 |
| %include "convert_rgb_to_yuv_ssse3.inc" |