// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "build/build_config.h"
#include "media/base/simd/convert_rgb_to_yuv.h"
#include "media/base/simd/yuv_to_rgb_table.h"
#if defined(COMPILER_MSVC)
#include <intrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#endif
namespace media {
#define FIX_SHIFT 12
#define FIX(x) ((x) * (1 << FIX_SHIFT))
// Convenience macro that converts a coefficient to fixed point and casts it
// to int16.
#define INT16_FIX(x) static_cast<int16>(FIX(x))
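// Fixed-point ITU-R BT.601 (limited-range) coefficients, laid out to match
// the BGRA byte order of RGB32 pixels in memory:
//   Y =  0.257*R + 0.504*G + 0.098*B + 16
//   U = -0.148*R - 0.291*G + 0.439*B + 128
//   V =  0.439*R - 0.368*G - 0.071*B + 128
// Each {B, G, R, 0} coefficient row is stored twice so that one
// _mm_madd_epi16 processes two pixels at a time.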
SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
-INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
-INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
};
#undef INT16_FIX
// This is the final offset for the conversion from signed YUV values to
// unsigned values: 16 is added to the Y components, and the 128 added to the
// U and V components is obtained below by shifting this value left by 3.
SIMD_ALIGNED(const int32 kYOffset[4]) = {16, 16, 16, 16};
static inline int Clamp(int value) {
if (value < 0)
return 0;
if (value > 255)
return 255;
return value;
}
static inline int RGBToY(int r, int g, int b) {
int y = ConvertRGBAToYUV_kTable[0] * b +
ConvertRGBAToYUV_kTable[1] * g +
ConvertRGBAToYUV_kTable[2] * r;
y >>= FIX_SHIFT;
return Clamp(y + 16);
}
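// The |shift| parameter of RGBToU()/RGBToV() divides out the number of pixels
// whose channels were accumulated into r, g and b: 0 for a single pixel,
// 1 for two pixels, and 2 for a 2x2 block of four.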
static inline int RGBToU(int r, int g, int b, int shift) {
int u = ConvertRGBAToYUV_kTable[8] * b +
ConvertRGBAToYUV_kTable[9] * g +
ConvertRGBAToYUV_kTable[10] * r;
u >>= FIX_SHIFT + shift;
return Clamp(u + 128);
}
static inline int RGBToV(int r, int g, int b, int shift) {
int v = ConvertRGBAToYUV_kTable[16] * b +
ConvertRGBAToYUV_kTable[17] * g +
ConvertRGBAToYUV_kTable[18] * r;
v >>= FIX_SHIFT + shift;
return Clamp(v + 128);
}
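// Reads one BGRA pixel from |rgb_buf| (the alpha byte is skipped), adds its
// channels to the running sums used for chroma averaging, and writes the
// corresponding Y value to |y_buf|.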
#define CONVERT_Y(rgb_buf, y_buf) \
b = *rgb_buf++; \
g = *rgb_buf++; \
r = *rgb_buf++; \
++rgb_buf; \
sum_b += b; \
sum_g += g; \
sum_r += r; \
*y_buf++ = RGBToY(r, g, b);
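// The ConvertRGBToYUV_VnHm() helpers below convert an n-row by m-column block
// of RGB32 pixels: every pixel produces its own Y sample, while the summed
// channels of the whole block are averaged into a single U and V sample.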
static inline void ConvertRGBToYUV_V2H2(const uint8* rgb_buf_1,
const uint8* rgb_buf_2,
uint8* y_buf_1,
uint8* y_buf_2,
uint8* u_buf,
uint8* v_buf) {
int sum_b = 0;
int sum_g = 0;
int sum_r = 0;
int r, g, b;
CONVERT_Y(rgb_buf_1, y_buf_1);
CONVERT_Y(rgb_buf_1, y_buf_1);
CONVERT_Y(rgb_buf_2, y_buf_2);
CONVERT_Y(rgb_buf_2, y_buf_2);
*u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2);
*v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2);
}
static inline void ConvertRGBToYUV_V2H1(const uint8* rgb_buf_1,
const uint8* rgb_buf_2,
uint8* y_buf_1,
uint8* y_buf_2,
uint8* u_buf,
uint8* v_buf) {
int sum_b = 0;
int sum_g = 0;
int sum_r = 0;
int r, g, b;
CONVERT_Y(rgb_buf_1, y_buf_1);
CONVERT_Y(rgb_buf_2, y_buf_2);
*u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
*v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
}
static inline void ConvertRGBToYUV_V1H2(const uint8* rgb_buf,
uint8* y_buf,
uint8* u_buf,
uint8* v_buf) {
int sum_b = 0;
int sum_g = 0;
int sum_r = 0;
int r, g, b;
CONVERT_Y(rgb_buf, y_buf);
CONVERT_Y(rgb_buf, y_buf);
*u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
*v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
}
static inline void ConvertRGBToYUV_V1H1(const uint8* rgb_buf,
uint8* y_buf,
uint8* u_buf,
uint8* v_buf) {
int sum_b = 0;
int sum_g = 0;
int sum_r = 0;
int r, g, b;
CONVERT_Y(rgb_buf, y_buf);
*u_buf++ = RGBToU(r, g, b, 0);
*v_buf++ = RGBToV(r, g, b, 0);
}
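// Converts two rows of RGB32 pixels to YUV, four pixels per iteration, using
// SSE2 integer arithmetic; the trailing 1-3 pixels of each row pair are
// handled with the scalar helpers above. U and V are subsampled 2x2.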
static void ConvertRGB32ToYUVRow_SSE2(const uint8* rgb_buf_1,
const uint8* rgb_buf_2,
uint8* y_buf_1,
uint8* y_buf_2,
uint8* u_buf,
uint8* v_buf,
int width) {
while (width >= 4) {
// Names for the Y pixels:
// Row 1: a b c d
// Row 2: e f g h
//
// First row: 4 pixels.
__m128i rgb_row_1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(rgb_buf_1));
__m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1);
__m128i y_table = _mm_load_si128(
reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable));
__m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1);
rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table);
__m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1);
rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table);
// Do a shuffle so that, for the four pixels a..d, one register gathers the
// (B*cB + G*cG) halves of the multiply-add results and the other gathers the
// (R*cR + A*0) halves; adding them gives the full Y dot product per pixel.
__m128i bg_abcd = _mm_castps_si128(
_mm_shuffle_ps(
_mm_castsi128_ps(rgb_c_d),
_mm_castsi128_ps(rgb_a_b),
(3 << 6) | (1 << 4) | (3 << 2) | 1));
__m128i r_abcd = _mm_castps_si128(
_mm_shuffle_ps(
_mm_castsi128_ps(rgb_c_d),
_mm_castsi128_ps(rgb_a_b),
(2 << 6) | (2 << 2)));
__m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd);
// Shift down back to the 8-bit range.
__m128i y_offset = _mm_load_si128(
reinterpret_cast<const __m128i*>(kYOffset));
y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT);
y_abcd = _mm_add_epi32(y_abcd, y_offset);
y_abcd = _mm_packs_epi32(y_abcd, y_abcd);
y_abcd = _mm_packus_epi16(y_abcd, y_abcd);
*reinterpret_cast<uint32*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd);
y_buf_1 += 4;
// Second row: 4 pixels.
__m128i rgb_row_2 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(rgb_buf_2));
__m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2);
__m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2);
__m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2);
// Add two rows together.
__m128i rgb_ae_bf =
_mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f);
__m128i rgb_cg_dh =
_mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h);
// Multiply add like the previous row.
rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table);
rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table);
__m128i bg_efgh = _mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
_mm_castsi128_ps(rgb_e_f),
(3 << 6) | (1 << 4) | (3 << 2) | 1));
__m128i r_efgh = _mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
_mm_castsi128_ps(rgb_e_f),
(2 << 6) | (2 << 2)));
__m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh);
y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT);
y_efgh = _mm_add_epi32(y_efgh, y_offset);
y_efgh = _mm_packs_epi32(y_efgh, y_efgh);
y_efgh = _mm_packus_epi16(y_efgh, y_efgh);
*reinterpret_cast<uint32*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh);
y_buf_2 += 4;
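// Rearrange the per-column row sums so that adding the two shuffled registers
// below yields the 2x2 block sums (a+b+e+f) and (c+d+g+h).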
__m128i rgb_ae_cg = _mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
_mm_castsi128_ps(rgb_ae_bf),
(3 << 6) | (2 << 4) | (3 << 2) | 2));
__m128i rgb_bf_dh = _mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
_mm_castsi128_ps(rgb_ae_bf),
(1 << 6) | (1 << 2)));
// This is a 2x2 subsampling for 2 pixels.
__m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh);
// Do a multiply add with U table.
__m128i u_a_b = _mm_madd_epi16(
rgb_abef_cdgh,
_mm_load_si128(
reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8)));
u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)),
_mm_shuffle_epi32(u_a_b, (2 << 2)));
// Right shift by 14: 12 bits from the fixed point and 2 from the 2x2
// subsampling.
u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2);
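// kYOffset holds 16 in every lane; shifting it left by 3 produces the 128
// offset used for the chroma components.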
__m128i uv_offset = _mm_slli_epi32(y_offset, 3);
u_a_b = _mm_add_epi32(u_a_b, uv_offset);
u_a_b = _mm_packs_epi32(u_a_b, u_a_b);
u_a_b = _mm_packus_epi16(u_a_b, u_a_b);
*reinterpret_cast<uint16*>(u_buf) = _mm_extract_epi16(u_a_b, 0);
u_buf += 2;
__m128i v_a_b = _mm_madd_epi16(
rgb_abef_cdgh,
_mm_load_si128(
reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16)));
v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)),
_mm_shuffle_epi32(v_a_b, (2 << 2)));
v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2);
v_a_b = _mm_add_epi32(v_a_b, uv_offset);
v_a_b = _mm_packs_epi32(v_a_b, v_a_b);
v_a_b = _mm_packus_epi16(v_a_b, v_a_b);
*reinterpret_cast<uint16*>(v_buf) = _mm_extract_epi16(v_a_b, 0);
v_buf += 2;
rgb_buf_1 += 16;
rgb_buf_2 += 16;
// Move forward by 4 pixels.
width -= 4;
}
// Just use C code to convert the remaining pixels.
if (width >= 2) {
ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
rgb_buf_1 += 8;
rgb_buf_2 += 8;
y_buf_1 += 2;
y_buf_2 += 2;
++u_buf;
++v_buf;
width -= 2;
}
if (width)
ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
}
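// Converts an RGB32 frame to planar YUV with 4:2:0 subsampling. Rows are
// processed in pairs by ConvertRGB32ToYUVRow_SSE2(); if the height is odd,
// the last row is converted with the scalar helpers and its chroma is
// averaged horizontally only.
//
// Illustrative call (a sketch; the buffer sizes and strides assume a packed
// layout with an even width and height):
//   ConvertRGB32ToYUV_SSE2(rgb, y, u, v, width, height,
//                          width * 4,   // rgbstride in bytes
//                          width,       // ystride in bytes
//                          width / 2);  // uvstride in bytes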
extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
uint8* yplane,
uint8* uplane,
uint8* vplane,
int width,
int height,
int rgbstride,
int ystride,
int uvstride) {
while (height >= 2) {
ConvertRGB32ToYUVRow_SSE2(rgbframe,
rgbframe + rgbstride,
yplane,
yplane + ystride,
uplane,
vplane,
width);
rgbframe += 2 * rgbstride;
yplane += 2 * ystride;
uplane += uvstride;
vplane += uvstride;
height -= 2;
}
if (!height)
return;
// Handle the last row.
while (width >= 2) {
ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
rgbframe += 8;
yplane += 2;
++uplane;
++vplane;
width -= 2;
}
if (width)
ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
}
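// Scalar implementation with the same interface and output as
// ConvertRGB32ToYUV_SSE2(); it serves as a reference for checking the SIMD
// path.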
void ConvertRGB32ToYUV_SSE2_Reference(const uint8* rgbframe,
uint8* yplane,
uint8* uplane,
uint8* vplane,
int width,
int height,
int rgbstride,
int ystride,
int uvstride) {
while (height >= 2) {
int i = 0;
// Convert a 2x2 block.
while (i + 2 <= width) {
ConvertRGBToYUV_V2H2(rgbframe + i * 4,
rgbframe + rgbstride + i * 4,
yplane + i,
yplane + ystride + i,
uplane + i / 2,
vplane + i / 2);
i += 2;
}
// Convert the last pixel of the two rows.
if (i < width) {
ConvertRGBToYUV_V2H1(rgbframe + i * 4,
rgbframe + rgbstride + i * 4,
yplane + i,
yplane + ystride + i,
uplane + i / 2,
vplane + i / 2);
}
rgbframe += 2 * rgbstride;
yplane += 2 * ystride;
uplane += uvstride;
vplane += uvstride;
height -= 2;
}
if (!height)
return;
// Handle the last row.
while (width >= 2) {
ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
rgbframe += 8;
yplane += 2;
++uplane;
++vplane;
width -= 2;
}
// Handle the last pixel in the last row.
if (width)
ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
}
} // namespace media