; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "x86inc.asm"

;
; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
; processors.
;
  SECTION_TEXT
  ; Declare every instruction-set extension named above. SSE2 must be listed:
  ; the macros below use SSE2 instructions such as movdqa, pshufd, psrldq and
  ; packuswb, in addition to the SSSE3 pmaddubsw and phaddsw.
  CPU       SSE, SSE2, SSE3, SSSE3

;
; Aliases for the XMM registers that hold conversion constants. These registers
; are read-only for the macros in this file: never use them as destination
; operands.
;
; Byte layout of the coefficient registers (signed bytes; the same four bytes
; repeat in every dword, i.e. for i = 0, 4, 8, 12):
;
;   register   b[i]   b[i+1]   b[i+2]   b[i+3]   used as
;   xmm7         25        2       66        0   XMM_CONST_Y0
;   xmm6          0      127        0        0   XMM_CONST_Y1
;   xmm5        112      -74      -38        0   XMM_CONST_U
;   xmm4        -18      -94      112        0   XMM_CONST_V
;
; XMM_CONST_128 (xmm3) is added with paddw wherever the formulas below add 128,
; so it is expected to hold 128 in each 16-bit word. NOTE(review): the code
; that loads these registers is not in this file -- confirm in the .inc.
;
%define XMM_CONST_128   xmm3
%define XMM_CONST_V     xmm4
%define XMM_CONST_U     xmm5
%define XMM_CONST_Y1    xmm6
%define XMM_CONST_Y0    xmm7

;
; LOAD_XMM %1 (xmm), %2 (imm32)
; Broadcasts a 32-bit immediate value to all four dwords of an XMM register.
;   for (int i = 0; i < 4; ++i) %1.d[i] = %2;
; Clobbers TEMPd.
;
%macro LOAD_XMM 2
  ; Stage the immediate in a general-purpose register; movd then places it in
  ; dword 0 of %1 and clears the upper dwords.
  mov       TEMPd, %2
  movd      %1, TEMPd
  ; Shuffle control 0 selects dword 0 for every destination dword.
  pshufd    %1, %1, 0
%endmacro

;
; UNPACKRGB %1 (xmm), %2 (imm8)
; Unpacks one RGB pixel in the specified XMM register by inserting a zero byte
; at byte offset %2. Bytes %2..14 move up by one position (the original byte 15
; is discarded) and bytes 0..%2-1 are left unchanged.
;   for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
;   %1.b[%2] = 0;
; Clobbers xmm1, so %1 must not be xmm1.
;
%macro UNPACKRGB 2
  ; xmm1 = %1 with bytes 0..%2-1 cleared, i.e. only the bytes that must move.
  movdqa    xmm1, %1
  psrldq    xmm1, %2
  pslldq    xmm1, %2
  ; Remove those bytes from %1 (x ^ x == 0), keeping only bytes 0..%2-1.
  pxor      %1, xmm1
  ; Shift the removed bytes up by one (byte %2 becomes 0) and merge them back.
  pslldq    xmm1, 1
  por       %1, xmm1
%endmacro

;
; READ_ARGB %1 (xmm), %2 (imm)
; Read the specified number of ARGB (or RGB) pixels from the source and store
; them to the destination xmm register as 4-byte pixels. If the input format is
; RGB (PIXELSIZE == 3), we read RGB pixels and convert them to ARGB pixels.
; (For this case, the alpha values of the output pixels become 0.)
;
;   %1  destination XMM register. For PIXELSIZE == 3 it must not be xmm1,
;       which is used as scratch.
;   %2  number of pixels to read: 1, 2, or 4 (anything else is an assembly
;       error).
;
; The source address is ARGB + WIDTH * 2 * PIXELSIZE, i.e. the WIDTH register
; is used as an index in units of two pixels.
;
; Clobbers (PIXELSIZE == 3 only): xmm1, and the TEMP register.
;
%macro READ_ARGB 2

%if PIXELSIZE == 4

  ; Read ARGB pixels from the source. (This macro assumes the input buffer may
  ; not be aligned to a 16-byte boundary.)
%if %2 == 1
  movd      %1, DWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 2
  movq      %1, QWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 4
  movdqu    %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
%else
%error unsupported number of pixels.
%endif

%elif PIXELSIZE == 3

  ; Read RGB pixels from the source and convert them to ARGB pixels.
%if %2 == 1
  ; Read one RGB pixel and convert it to one ARGB pixel.
  ; Save the WIDTH register to xmm1. (This macro needs to break it, because
  ; TEMP is needed for the pixel data and cannot hold the scaled index.)
  MOVq      xmm1, WIDTHq

  ; Read three bytes from the source into TEMPd (byte 2 first, shifted into
  ; bits 16..23, then bytes 0..1 into the low word), and copy the result to
  ; the destination xmm register. Bits 24..31 stay 0, which is the alpha.
  lea       WIDTHq, [WIDTHq + WIDTHq * 2]
  movzx     TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
  shl       TEMPd, 16
  mov       TEMPw, WORD [ARGBq + WIDTHq * 2]
  movd      %1, TEMPd

  ; Restore the WIDTH register.
  MOVq      WIDTHq, xmm1
%elif %2 == 2
  ; Read two RGB pixels and convert them to two ARGB pixels.
  ; Read six bytes from the source to the destination xmm register: bytes 0..3
  ; with movd, then bytes 4..5 into word 2 (xmm bytes 4..5) so that the six
  ; source bytes are contiguous in %1 before unpacking.
  mov       TEMPq, WIDTHq
  lea       TEMPq, [TEMPq + TEMPq * 2]
  movd      %1, DWORD [ARGBq + TEMPq * 2]
  pinsrw    %1, WORD [ARGBq + TEMPq * 2 + 4], 2

  ; Fill the alpha values of these RGB pixels with 0 and convert them to two
  ; ARGB pixels. (Byte 7, the second alpha, is already 0 after movd.)
  UNPACKRGB %1, 3
%elif %2 == 4
  ; Read four RGB pixels and convert them to four ARGB pixels.
  ; Read twelve bytes from the source to the destination xmm register: bytes
  ; 0..7 with movq, bytes 8..11 via xmm1, merged by shufps so that
  ; %1.d[2] = xmm1.d[0] and %1.d[3] = xmm1.d[1] (= 0).
  mov       TEMPq, WIDTHq
  lea       TEMPq, [TEMPq + TEMPq * 2]
  movq      %1, QWORD [ARGBq + TEMPq * 2]
  movd      xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
  shufps    %1, xmm1, 01000100B

  ; Fill the alpha values of these RGB pixels with 0 and convert them to four
  ; ARGB pixels. Each step inserts the zero alpha byte of one pixel; the last
  ; alpha (byte 15) is already 0.
  UNPACKRGB %1, 3
  UNPACKRGB %1, 4 + 3
  UNPACKRGB %1, 4 + 4 + 3
%else
%error unsupported number of pixels.
%endif

%else
%error unsupported PIXELSIZE value.
%endif

%endmacro

;
; CALC_Y %1 (xmm), %2 (xmm)
; Calculates four Y values from four ARGB pixels stored in %2.
;   %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16);
;   %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16);
;   %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16);
;   %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16);
; ToByte is the unsigned saturation performed by packuswb.
;
; Clobbers xmm2. %1 must be a different register from both %2 and xmm2, because
; %2 is read again after %1 has been written, and xmm2 is overwritten while %1
; holds a partial result. %2 itself is only read.
;
%macro CALC_Y 2
  ; To avoid signed saturation, we divide this conversion formula into two
  ; formulae and store their results into two XMM registers %1 and xmm2.
  ; (The G coefficient 129 does not fit in a signed byte, so it is split into
  ; 2 in XMM_CONST_Y0 and 127 in XMM_CONST_Y1.)
  ; %1.w[0]   = 25  * %2.b[0]  + 2   * %2.b[1]  + 66  * %2.b[2]  + 0 * %2.b[3];
  ; %1.w[1]   = 25  * %2.b[4]  + 2   * %2.b[5]  + 66  * %2.b[6]  + 0 * %2.b[7];
  ; %1.w[2]   = 25  * %2.b[8]  + 2   * %2.b[9]  + 66  * %2.b[10] + 0 * %2.b[11];
  ; %1.w[3]   = 25  * %2.b[12] + 2   * %2.b[13] + 66  * %2.b[14] + 0 * %2.b[15];
  ; xmm2.w[0] = 0   * %2.b[0]  + 127 * %2.b[1]  + 0   * %2.b[2]  + 0 * %2.b[3];
  ; xmm2.w[1] = 0   * %2.b[4]  + 127 * %2.b[5]  + 0   * %2.b[6]  + 0 * %2.b[7];
  ; xmm2.w[2] = 0   * %2.b[8]  + 127 * %2.b[9]  + 0   * %2.b[10] + 0 * %2.b[11];
  ; xmm2.w[3] = 0   * %2.b[12] + 127 * %2.b[13] + 0   * %2.b[14] + 0 * %2.b[15];
  ; pmaddubsw multiplies unsigned pixel bytes by signed coefficient bytes and
  ; adds adjacent pairs; phaddsw then adds the two pair sums of each pixel.
  movdqa    %1, %2
  pmaddubsw %1, XMM_CONST_Y0
  phaddsw   %1, %1
  movdqa    xmm2, %2
  pmaddubsw xmm2, XMM_CONST_Y1
  phaddsw   xmm2, xmm2

  ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16);
  ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16);
  ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16);
  ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16);
  ; The combined sum is treated as an unsigned word (hence the logical shift
  ; psrlw). The offset 16 is derived from the 128 constant: 128 >> 3 == 16.
  paddw     %1, xmm2
  movdqa    xmm2, XMM_CONST_128
  paddw     %1, xmm2
  psrlw     %1, 8
  psrlw     xmm2, 3
  paddw     %1, xmm2
  packuswb  %1, %1
%endmacro

;
; INIT_UV %1 (r32), %2 (reg), %3 (imm)
; When SUBSAMPLING == 1 and LINE == 1, loads bytes from [%2 + WIDTH] into %1,
; zero-extended to 32 bits: two bytes when %3 (the number of pixels) is 4, one
; byte when %3 is 1 or 2. Any other %3 is an assembly error.
; For every other SUBSAMPLING/LINE combination this macro expands to nothing
; (and %3 is not checked).
; NOTE(review): the loaded value is presumably what CALC_UV receives as %4;
; the call sites are in the included .inc -- confirm there.
;
%macro INIT_UV 3

%if SUBSAMPLING == 1 && LINE == 1
%if %3 == 4
  movzx     %1, WORD [%2 + WIDTHq]
%elif %3 == 1 || %3 == 2
  movzx     %1, BYTE [%2 + WIDTHq]
%else
%error unsupported number of pixels.
%endif
%endif

%endmacro

;
; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
; Calculates two U (or V) values from four ARGB pixels stored in %2, using the
; signed byte coefficients stored in %3.
;
; Let C(i) be the weighted sum for pixel i:
;   if %3 == XMM_CONST_U: C(i) = 112 * B(i) - 74 * G(i) - 38  * R(i);
;   if %3 == XMM_CONST_V: C(i) = -18 * B(i) - 94 * G(i) + 112 * R(i);
; Then:
;   if (SUBSAMPLING) {
;     %1.b[0] = ToByte((Avg(C(0), C(1)) + 128) / 256 + 128);
;     %1.b[1] = ToByte((Avg(C(2), C(3)) + 128) / 256 + 128);
;   } else {
;     %1.b[0] = ToByte((C(0) + 128) / 256 + 128);
;     %1.b[1] = ToByte((C(2) + 128) / 256 + 128);
;   }
;   if (SUBSAMPLING && LINE == 1) {
;     %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
;     %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
;   }
; where Avg(a, b) = (a + b + 1) / 2 on signed values, "/ 256" is an arithmetic
; right shift by 8, and ToByte is the unsigned saturation done by packuswb.
;
; Clobbers xmm2 when SUBSAMPLING == 1, so %1 must not be xmm2. %1 must also be
; a different register from %3. %2 is only read.
;
%macro CALC_UV 4
  ; for (int i = 0; i < 4; ++i) {
  ;   %1.w[i] = 0;
  ;   for (int j = 0; j < 4; ++j)
  ;     %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
  ; }
  movdqa    %1, %2
  pmaddubsw %1, %3
  phaddsw   %1, %1

%if SUBSAMPLING == 1
  ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
  ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
  ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
  ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
  ; The sums are signed, but pavgw is an unsigned average: used directly it is
  ; off by 0x8000 whenever the two words have opposite signs. Flip the sign
  ; bit of every word (adds a bias of 0x8000, mapping signed order to unsigned
  ; order), average, and flip it back. xmm2 = 0x8000 in each word is built
  ; from all-ones shifted left by 15, so no extra constant register is needed.
  pcmpeqw   xmm2, xmm2
  psllw     xmm2, 15
  pxor      %1, xmm2
  pshuflw   xmm2, %1, 10110001B
  pavgw     %1, xmm2
  pcmpeqw   xmm2, xmm2
  psllw     xmm2, 15
  pxor      %1, xmm2
%endif

  ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
  ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  ; (pshuflw first gathers words 0 and 2 into words 0 and 1.)
  pshuflw   %1, %1, 10001000B
  paddw     %1, XMM_CONST_128
  psraw     %1, 8
  paddw     %1, XMM_CONST_128
  packuswb  %1, %1

%if SUBSAMPLING == 1 && LINE == 1
  ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
  ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
  movd      xmm2, %4
  pavgb     %1, xmm2
%endif
%endmacro

;
; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           int width);
;
; Template parameters consumed by the macros above and by the included file:
;   PIXELSIZE   = 4: READ_ARGB reads 4-byte ARGB pixels directly.
;   SUBSAMPLING = 0: CALC_UV does not average neighbouring pixels.
;   LINE        = 0: INIT_UV expands to nothing.
;
%define PIXELSIZE       4
%define SUBSAMPLING     0
%define LINE            0
%define SYMBOL          ConvertARGBToYUVRow_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          int width);
;
; Template parameters consumed by the macros above and by the included file:
;   PIXELSIZE   = 3: READ_ARGB reads 3-byte RGB pixels and widens them to
;                    4-byte pixels with a zero alpha.
;   SUBSAMPLING = 0: CALC_UV does not average neighbouring pixels.
;   LINE        = 0: INIT_UV expands to nothing.
;
%define PIXELSIZE       3
%define SUBSAMPLING     0
%define LINE            0
%define SYMBOL          ConvertRGBToYUVRow_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
;                                            uint8* y,
;                                            uint8* u,
;                                            uint8* v,
;                                            int width);
;
; Template parameters consumed by the macros above and by the included file:
;   PIXELSIZE   = 4: READ_ARGB reads 4-byte ARGB pixels directly.
;   SUBSAMPLING = 1: CALC_UV averages each pair of horizontally adjacent
;                    pixels before producing a U/V value.
;   LINE        = 0: INIT_UV expands to nothing and CALC_UV skips the final
;                    averaging with its %4 argument.
;
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            0
%define SYMBOL          ConvertARGBToYUVEven_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           int width);
;
; Template parameters consumed by the macros above and by the included file:
;   PIXELSIZE   = 4: READ_ARGB reads 4-byte ARGB pixels directly.
;   SUBSAMPLING = 1: CALC_UV averages each pair of horizontally adjacent
;                    pixels before producing a U/V value.
;   LINE        = 1: INIT_UV loads bytes from the address it is given, and
;                    CALC_UV averages its results with the bytes passed in %4.
;
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            1
%define SYMBOL          ConvertARGBToYUVOdd_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           int width);
;
; Template parameters consumed by the macros above and by the included file:
;   PIXELSIZE   = 3: READ_ARGB reads 3-byte RGB pixels and widens them to
;                    4-byte pixels with a zero alpha.
;   SUBSAMPLING = 1: CALC_UV averages each pair of horizontally adjacent
;                    pixels before producing a U/V value.
;   LINE        = 0: INIT_UV expands to nothing and CALC_UV skips the final
;                    averaging with its %4 argument.
;
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            0
%define SYMBOL          ConvertRGBToYUVEven_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          int width);
;
; Template parameters consumed by the macros above and by the included file:
;   PIXELSIZE   = 3: READ_ARGB reads 3-byte RGB pixels and widens them to
;                    4-byte pixels with a zero alpha.
;   SUBSAMPLING = 1: CALC_UV averages each pair of horizontally adjacent
;                    pixels before producing a U/V value.
;   LINE        = 1: INIT_UV loads bytes from the address it is given, and
;                    CALC_UV averages its results with the bytes passed in %4.
;
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            1
%define SYMBOL          ConvertRGBToYUVOdd_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"
