|  | // Copyright 2014 Google Inc. All Rights Reserved. | 
|  | // | 
|  | // Use of this source code is governed by a BSD-style license | 
|  | // that can be found in the COPYING file in the root of the source | 
|  | // tree. An additional intellectual property rights grant can be found | 
|  | // in the file PATENTS. All contributing project authors may | 
|  | // be found in the AUTHORS file in the root of the source tree. | 
|  | // ----------------------------------------------------------------------------- | 
|  | // | 
|  | //  NEON common code. | 
|  |  | 
|  | #ifndef WEBP_DSP_NEON_H_ | 
|  | #define WEBP_DSP_NEON_H_ | 
|  |  | 
|  | #include <arm_neon.h> | 
|  |  | 
|  | #include "src/dsp/dsp.h" | 
|  |  | 
|  | // Right now, some intrinsics functions seem slower, so we disable them | 
|  | // everywhere except newer clang/gcc or aarch64 where the inline assembly is | 
|  | // incompatible. | 
|  | #if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__) | 
|  | #define WEBP_USE_INTRINSICS   // use intrinsics when possible | 
|  | #endif | 
|  |  | 
|  | #define INIT_VECTOR2(v, a, b) do {  \ | 
|  | v.val[0] = a;                     \ | 
|  | v.val[1] = b;                     \ | 
|  | } while (0) | 
|  |  | 
|  | #define INIT_VECTOR3(v, a, b, c) do {  \ | 
|  | v.val[0] = a;                        \ | 
|  | v.val[1] = b;                        \ | 
|  | v.val[2] = c;                        \ | 
|  | } while (0) | 
|  |  | 
|  | #define INIT_VECTOR4(v, a, b, c, d) do {  \ | 
|  | v.val[0] = a;                           \ | 
|  | v.val[1] = b;                           \ | 
|  | v.val[2] = c;                           \ | 
|  | v.val[3] = d;                           \ | 
|  | } while (0) | 
|  |  | 
|  | // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 | 
|  | // crash ("internal compiler error: in immed_double_const, at emit-rtl."). | 
|  | // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) | 
|  | #if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) | 
|  | #define WORK_AROUND_GCC | 
|  | #endif | 
|  |  | 
|  | static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) { | 
|  | uint64x2x2_t row01, row23; | 
|  |  | 
|  | row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); | 
|  | row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); | 
|  | row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); | 
|  | row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); | 
|  | // Transpose 64-bit values (there's no vswp equivalent) | 
|  | { | 
|  | const uint64x1_t row0h = vget_high_u64(row01.val[0]); | 
|  | const uint64x1_t row2l = vget_low_u64(row23.val[0]); | 
|  | const uint64x1_t row1h = vget_high_u64(row01.val[1]); | 
|  | const uint64x1_t row3l = vget_low_u64(row23.val[1]); | 
|  | row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); | 
|  | row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); | 
|  | row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); | 
|  | row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); | 
|  | } | 
|  | { | 
|  | const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), | 
|  | vreinterpretq_s32_u64(row01.val[1])); | 
|  | const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), | 
|  | vreinterpretq_s32_u64(row23.val[1])); | 
|  | int32x4x4_t out; | 
|  | out.val[0] = out01.val[0]; | 
|  | out.val[1] = out01.val[1]; | 
|  | out.val[2] = out23.val[0]; | 
|  | out.val[3] = out23.val[1]; | 
|  | return out; | 
|  | } | 
|  | } | 
|  |  | 
|  | #if 0     // Useful debug macro. | 
|  | #include <stdio.h> | 
|  | #define PRINT_REG(REG, SIZE) do {                       \ | 
|  | int i;                                                \ | 
|  | printf("%s \t[%d]: 0x", #REG, SIZE);                  \ | 
|  | if (SIZE == 8) {                                      \ | 
|  | uint8_t _tmp[8];                                    \ | 
|  | vst1_u8(_tmp, (REG));                               \ | 
|  | for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]);   \ | 
|  | } else if (SIZE == 16) {                              \ | 
|  | uint16_t _tmp[4];                                   \ | 
|  | vst1_u16(_tmp, (REG));                              \ | 
|  | for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]);   \ | 
|  | }                                                     \ | 
|  | printf("\n");                                         \ | 
|  | } while (0) | 
|  | #endif | 
|  |  | 
|  | #endif  // WEBP_DSP_NEON_H_ |