| /* crc32_simd.c | 
 |  * | 
 |  * Copyright 2017 The Chromium Authors. All rights reserved. | 
 |  * Use of this source code is governed by a BSD-style license that can be | 
 |  * found in the Chromium source repository LICENSE file. | 
 |  */ | 
 |  | 
 | #include "crc32_simd.h" | 
 |  | 
 | #if defined(CRC32_SIMD_SSE42_PCLMUL) | 
 |  | 
 | /* | 
 * crc32_sse42_simd_(): compute the crc32 of the buffer, whose length must
 * be at least 64 bytes and a multiple of 16. Based on:
 |  * | 
 |  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" | 
 |  *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 | 
 |  */ | 
 |  | 
 | #include <emmintrin.h> | 
 | #include <smmintrin.h> | 
 | #include <wmmintrin.h> | 
 |  | 
 | uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */ | 
 |     const unsigned char *buf, | 
 |     z_size_t len, | 
 |     uint32_t crc) | 
 | { | 
 |     /* | 
 |      * Definitions of the bit-reflected domain constants k1,k2,k3, etc and | 
 |      * the CRC32+Barrett polynomials given at the end of the paper. | 
 |      */ | 
 |     static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; | 
 |     static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; | 
 |     static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; | 
 |     static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; | 
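    /*
     * Each k constant is a precomputed remainder x^t mod P(x) in the
     * bit-reflected domain: k1k2 fold a 128-bit chunk across one 64-byte
     * (512-bit) stride, k3k4 across one 16-byte (128-bit) stride, and k5 is
     * used in the final 96-to-64-bit fold. poly holds the reflected
     * polynomial P' and the Barrett constant (u' in the paper); the zero in
     * k5k0 is unused padding.
     */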
 |  | 
 |     __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; | 
 |  | 
 |     /* | 
 |      * There's at least one block of 64. | 
 |      */ | 
 |     x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); | 
 |     x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); | 
 |     x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); | 
 |     x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); | 
 |  | 
    /* Seed the computation by XOR-ing the incoming CRC into the low 32 bits
     * of the first 16-byte chunk. */
    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
 |  | 
 |     x0 = _mm_load_si128((__m128i *)k1k2); | 
 |  | 
 |     buf += 64; | 
 |     len -= 64; | 
 |  | 
 |     /* | 
 |      * Parallel fold blocks of 64, if any. | 
 |      */ | 
 |     while (len >= 64) | 
 |     { | 
 |         x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | 
 |         x6 = _mm_clmulepi64_si128(x2, x0, 0x00); | 
 |         x7 = _mm_clmulepi64_si128(x3, x0, 0x00); | 
 |         x8 = _mm_clmulepi64_si128(x4, x0, 0x00); | 
 |  | 
 |         x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | 
 |         x2 = _mm_clmulepi64_si128(x2, x0, 0x11); | 
 |         x3 = _mm_clmulepi64_si128(x3, x0, 0x11); | 
 |         x4 = _mm_clmulepi64_si128(x4, x0, 0x11); | 
 |  | 
 |         y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); | 
 |         y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); | 
 |         y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); | 
 |         y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); | 
 |  | 
 |         x1 = _mm_xor_si128(x1, x5); | 
 |         x2 = _mm_xor_si128(x2, x6); | 
 |         x3 = _mm_xor_si128(x3, x7); | 
 |         x4 = _mm_xor_si128(x4, x8); | 
 |  | 
 |         x1 = _mm_xor_si128(x1, y5); | 
 |         x2 = _mm_xor_si128(x2, y6); | 
 |         x3 = _mm_xor_si128(x3, y7); | 
 |         x4 = _mm_xor_si128(x4, y8); | 
 |  | 
 |         buf += 64; | 
 |         len -= 64; | 
 |     } | 
 |  | 
 |     /* | 
 |      * Fold into 128-bits. | 
 |      */ | 
 |     x0 = _mm_load_si128((__m128i *)k3k4); | 
 |  | 
 |     x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | 
 |     x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | 
 |     x1 = _mm_xor_si128(x1, x2); | 
 |     x1 = _mm_xor_si128(x1, x5); | 
 |  | 
 |     x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | 
 |     x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | 
 |     x1 = _mm_xor_si128(x1, x3); | 
 |     x1 = _mm_xor_si128(x1, x5); | 
 |  | 
 |     x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | 
 |     x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | 
 |     x1 = _mm_xor_si128(x1, x4); | 
 |     x1 = _mm_xor_si128(x1, x5); | 
 |  | 
 |     /* | 
 |      * Single fold blocks of 16, if any. | 
 |      */ | 
 |     while (len >= 16) | 
 |     { | 
 |         x2 = _mm_loadu_si128((__m128i *)buf); | 
 |  | 
 |         x5 = _mm_clmulepi64_si128(x1, x0, 0x00); | 
 |         x1 = _mm_clmulepi64_si128(x1, x0, 0x11); | 
 |         x1 = _mm_xor_si128(x1, x2); | 
 |         x1 = _mm_xor_si128(x1, x5); | 
 |  | 
 |         buf += 16; | 
 |         len -= 16; | 
 |     } | 
 |  | 
 |     /* | 
 |      * Fold 128-bits to 64-bits. | 
 |      */ | 
 |     x2 = _mm_clmulepi64_si128(x1, x0, 0x10); | 
 |     x3 = _mm_setr_epi32(~0, 0, ~0, 0); | 
 |     x1 = _mm_srli_si128(x1, 8); | 
 |     x1 = _mm_xor_si128(x1, x2); | 
 |  | 
 |     x0 = _mm_loadl_epi64((__m128i*)k5k0); | 
 |  | 
 |     x2 = _mm_srli_si128(x1, 4); | 
 |     x1 = _mm_and_si128(x1, x3); | 
 |     x1 = _mm_clmulepi64_si128(x1, x0, 0x00); | 
 |     x1 = _mm_xor_si128(x1, x2); | 
 |  | 
 |     /* | 
 |      * Barret reduce to 32-bits. | 
 |      */ | 
 |     x0 = _mm_load_si128((__m128i*)poly); | 
 |  | 
 |     x2 = _mm_and_si128(x1, x3); | 
 |     x2 = _mm_clmulepi64_si128(x2, x0, 0x10); | 
 |     x2 = _mm_and_si128(x2, x3); | 
 |     x2 = _mm_clmulepi64_si128(x2, x0, 0x00); | 
 |     x1 = _mm_xor_si128(x1, x2); | 
 |  | 
 |     /* | 
 |      * Return the crc32. | 
 |      */ | 
 |     return _mm_extract_epi32(x1, 1); | 
 | } | 
 |  | 
 | #elif defined(CRC32_ARMV8_CRC32) | 
 |  | 
/* CRC32 checksums using the ARMv8-A CRC32 instructions (part of the optional
 * CRC extension, distinct from the crypto extension).
 *
 * TODO: implement a version using the PMULL instruction.
 */
 |  | 
 | #if defined(__clang__) | 
 | /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an | 
 |  * armv8 target, which is incompatible with ThinLTO optimizations on Android. | 
 |  * (Namely, mixing and matching different module-level targets makes ThinLTO | 
 |  * warn, and Android defaults to armv7-a. This restriction does not apply to | 
 |  * function-level `target`s, however.) | 
 |  * | 
 * Since we only need four crc intrinsics, and since clang's implementations
 * of those are just wrappers around compiler builtins, it's simplest to #define
 |  * those builtins directly. If this #define list grows too much (or we depend on | 
 |  * an intrinsic that isn't a trivial wrapper), we may have to find a better way | 
 |  * to go about this. | 
 |  * | 
 |  * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized | 
 |  * feature for this target (ignoring feature)." This appears to be a harmless | 
 |  * bug in clang. | 
 |  */ | 
 | #define __crc32b __builtin_arm_crc32b | 
 | #define __crc32d __builtin_arm_crc32d | 
 | #define __crc32w __builtin_arm_crc32w | 
 | #define __crc32cw __builtin_arm_crc32cw | 
 |  | 
/* On AArch64 the base architecture already implies ARMv8, so only the "crc"
 * feature needs enabling; 32-bit builds must also raise the architecture to
 * armv8-a. */
#if defined(__aarch64__)
 | #define TARGET_ARMV8_WITH_CRC __attribute__((target("crc"))) | 
 | #else  // !defined(__aarch64__) | 
 | #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc"))) | 
 | #endif  // defined(__aarch64__) | 
 |  | 
 | #elif defined(__GNUC__) | 
 | /* For GCC, we are setting CRC extensions at module level, so ThinLTO is not | 
 |  * allowed. We can just include arm_acle.h. | 
 |  */ | 
 | #include <arm_acle.h> | 
 | #define TARGET_ARMV8_WITH_CRC | 
#else  // !defined(__clang__) && !defined(__GNUC__)
 | #error ARM CRC32 SIMD extensions only supported for Clang and GCC | 
 | #endif | 
 |  | 
 | TARGET_ARMV8_WITH_CRC | 
 | uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc, | 
 |                                           const unsigned char *buf, | 
 |                                           z_size_t len) | 
 | { | 
 |     uint32_t c = (uint32_t) ~crc; | 
 |  | 
    /* Consume bytes one at a time until buf is 8-byte aligned, so the
     * 64-bit reads below are aligned. */
    while (len && ((uintptr_t)buf & 7)) {
 |         c = __crc32b(c, *buf++); | 
 |         --len; | 
 |     } | 
 |  | 
 |     const uint64_t *buf8 = (const uint64_t *)buf; | 
 |  | 
    /* Unrolled by 8: each iteration folds 64 bytes through eight __crc32d
     * steps, which form a serial dependency chain through c. */
    while (len >= 64) {
 |         c = __crc32d(c, *buf8++); | 
 |         c = __crc32d(c, *buf8++); | 
 |         c = __crc32d(c, *buf8++); | 
 |         c = __crc32d(c, *buf8++); | 
 |  | 
 |         c = __crc32d(c, *buf8++); | 
 |         c = __crc32d(c, *buf8++); | 
 |         c = __crc32d(c, *buf8++); | 
 |         c = __crc32d(c, *buf8++); | 
 |         len -= 64; | 
 |     } | 
 |  | 
 |     while (len >= 8) { | 
 |         c = __crc32d(c, *buf8++); | 
 |         len -= 8; | 
 |     } | 
 |  | 
 |     buf = (const unsigned char *)buf8; | 
 |  | 
 |     while (len--) { | 
 |         c = __crc32b(c, *buf++); | 
 |     } | 
 |  | 
 |     return ~c; | 
 | } | 
 |  | 
 | TARGET_ARMV8_WITH_CRC | 
 | Pos ZLIB_INTERNAL insert_string_arm(deflate_state *const s, const Pos str) | 
 | { | 
 |     Pos ret; | 
 |     unsigned *ip, val, h = 0; | 
 |  | 
 |     ip = (unsigned *)&s->window[str]; | 
 |     val = *ip; | 
 |  | 
    /* At compression levels >= 6, hash only the low 3 bytes (MIN_MATCH);
     * lower levels hash all 4 bytes read above. */
    if (s->level >= 6)
        val &= 0xFFFFFF;
 |  | 
    /* We use CRC32C (Castagnoli) to ensure that the compressed output
     * matches between Intel and ARM builds.
     * Unlike the GZIP format's data-integrity check, whose polynomial is
     * fixed by the spec (https://tools.ietf.org/html/rfc1952#page-11), the
     * CRC here serves only as a hash function for the hash table used while
     * performing compression.
     */
 |     h = __crc32cw(h, val); | 
 |  | 
    /* Standard deflate hash-chain insertion: head[] records the most recent
     * window position seen for this hash; prev[] links earlier positions
     * with the same hash so the match finder can walk the chain. */
    ret = s->head[h & s->hash_mask];
    s->head[h & s->hash_mask] = str;
    s->prev[str & s->w_mask] = ret;
 |     return ret; | 
 | } | 
 |  | 
 | #endif |