third_party/libwebp/src/dsp/neon.h - cobalt - Git at Google

 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  NEON common code.

 #ifndef WEBP_DSP_NEON_H_
 #define WEBP_DSP_NEON_H_

 #include "src/dsp/dsp.h"

 #if defined(WEBP_USE_NEON)

 #include <arm_neon.h>

 // Right now, some intrinsics functions seem slower, so we disable them
 // everywhere except newer clang/gcc or aarch64 where the inline assembly is
 // incompatible.
 #if LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 9) || WEBP_AARCH64
 #define WEBP_USE_INTRINSICS   // use intrinsics when possible
 #endif

 #define INIT_VECTOR2(v, a, b) do {  \
   v.val[0] = a;                     \
   v.val[1] = b;                     \
 } while (0)

 #define INIT_VECTOR3(v, a, b, c) do {  \
   v.val[0] = a;                        \
   v.val[1] = b;                        \
   v.val[2] = c;                        \
 } while (0)

 #define INIT_VECTOR4(v, a, b, c, d) do {  \
   v.val[0] = a;                           \
   v.val[1] = b;                           \
   v.val[2] = c;                           \
   v.val[3] = d;                           \
 } while (0)

 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
 #if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64)
 #define WORK_AROUND_GCC
 #endif

 static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
   uint64x2x2_t row01, row23;

   row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
   row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
   row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
   row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
   // Transpose 64-bit values (there's no vswp equivalent)
   {
     const uint64x1_t row0h = vget_high_u64(row01.val[0]);
     const uint64x1_t row2l = vget_low_u64(row23.val[0]);
     const uint64x1_t row1h = vget_high_u64(row01.val[1]);
     const uint64x1_t row3l = vget_low_u64(row23.val[1]);
     row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
     row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
     row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
     row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
   }
   {
     const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
                                         vreinterpretq_s32_u64(row01.val[1]));
     const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
                                         vreinterpretq_s32_u64(row23.val[1]));
     int32x4x4_t out;
     out.val[0] = out01.val[0];
     out.val[1] = out01.val[1];
     out.val[2] = out23.val[0];
     out.val[3] = out23.val[1];
     return out;
   }
 }

 #if 0     // Useful debug macro.
 #include <stdio.h>
 #define PRINT_REG(REG, SIZE) do {                       \
   int i;                                                \
   printf("%s \t[%d]: 0x", #REG, SIZE);                  \
   if (SIZE == 8) {                                      \
     uint8_t _tmp[8];                                    \
     vst1_u8(_tmp, (REG));                               \
     for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]);   \
   } else if (SIZE == 16) {                              \
     uint16_t _tmp[4];                                   \
     vst1_u16(_tmp, (REG));                              \
     for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]);   \
   }                                                     \
   printf("\n");                                         \
 } while (0)
 #endif

 #endif  // WEBP_USE_NEON
 #endif  // WEBP_DSP_NEON_H_
	// Copyright 2014 Google Inc. All Rights Reserved.
	//
	// Use of this source code is governed by a BSD-style license
	// that can be found in the COPYING file in the root of the source
	// tree. An additional intellectual property rights grant can be found
	// in the file PATENTS. All contributing project authors may
	// be found in the AUTHORS file in the root of the source tree.
	// -----------------------------------------------------------------------------
	//
	// NEON common code.

	#ifndef WEBP_DSP_NEON_H_
	#define WEBP_DSP_NEON_H_

	#include "src/dsp/dsp.h"

	#if defined(WEBP_USE_NEON)

	#include <arm_neon.h>

	// Right now, some intrinsics functions seem slower, so we disable them
	// everywhere except newer clang/gcc or aarch64 where the inline assembly is
	// incompatible.
	#if LOCAL_CLANG_PREREQ(3, 8) \|\| LOCAL_GCC_PREREQ(4, 9) \|\| WEBP_AARCH64
	#define WEBP_USE_INTRINSICS // use intrinsics when possible
	#endif

	#define INIT_VECTOR2(v, a, b) do { \
	v.val[0] = a; \
	v.val[1] = b; \
	} while (0)

	#define INIT_VECTOR3(v, a, b, c) do { \
	v.val[0] = a; \
	v.val[1] = b; \
	v.val[2] = c; \
	} while (0)

	#define INIT_VECTOR4(v, a, b, c, d) do { \
	v.val[0] = a; \
	v.val[1] = b; \
	v.val[2] = c; \
	v.val[3] = d; \
	} while (0)

	// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
	// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
	// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
	#if !(LOCAL_CLANG_PREREQ(3, 8) \|\| LOCAL_GCC_PREREQ(4, 8) \|\| WEBP_AARCH64)
	#define WORK_AROUND_GCC
	#endif

	static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
	uint64x2x2_t row01, row23;

	row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
	row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
	row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
	row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
	// Transpose 64-bit values (there's no vswp equivalent)
	{
	const uint64x1_t row0h = vget_high_u64(row01.val[0]);
	const uint64x1_t row2l = vget_low_u64(row23.val[0]);
	const uint64x1_t row1h = vget_high_u64(row01.val[1]);
	const uint64x1_t row3l = vget_low_u64(row23.val[1]);
	row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
	row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
	row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
	row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
	}
	{
	const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
	vreinterpretq_s32_u64(row01.val[1]));
	const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
	vreinterpretq_s32_u64(row23.val[1]));
	int32x4x4_t out;
	out.val[0] = out01.val[0];
	out.val[1] = out01.val[1];
	out.val[2] = out23.val[0];
	out.val[3] = out23.val[1];
	return out;
	}
	}

	#if 0 // Useful debug macro.
	#include <stdio.h>
	#define PRINT_REG(REG, SIZE) do { \
	int i; \
	printf("%s \t[%d]: 0x", #REG, SIZE); \
	if (SIZE == 8) { \
	uint8_t _tmp[8]; \
	vst1_u8(_tmp, (REG)); \
	for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]); \
	} else if (SIZE == 16) { \
	uint16_t _tmp[4]; \
	vst1_u16(_tmp, (REG)); \
	for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]); \
	} \
	printf("\n"); \
	} while (0)
	#endif

	#endif // WEBP_USE_NEON
	#endif // WEBP_DSP_NEON_H_