/* blob: e65e87bad2e0765eda51707301560daaba3e43d7 */
/* Copyright 2018 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#ifndef __SLIDE_HASH__NEON__
#define __SLIDE_HASH__NEON__
#include "deflate.h"
/*
For Starboard we want to check NEON availability at runtime, and this means
we do not want to inline functions that use NEON instructions in the rest of
the code. This should be compiled separately.
*/
#if defined(STARBOARD)
/* Out-of-line declaration used when STARBOARD is defined: only the prototype
 * is exposed here so that NEON code is not inlined into callers.
 *
 * hash:      table of hash_size entries to update.
 * hash_size: number of entries in the table.
 * w_size:    value applied to every entry.
 *
 * NOTE(review): the Starboard definition is not visible in this header;
 * presumably it matches the inline version in the non-Starboard branch
 * (saturating subtraction of w_size from each entry) -- confirm.
 */
void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
const uInt hash_size,
const ush w_size);
/* Out-of-line declaration used when STARBOARD is defined: only the prototype
 * is exposed here so that NEON code is not inlined into callers.
 *
 * head:      hash table of hash_size entries.
 * prev:      second table passed alongside head.
 * w_size:    window size used for the update.
 * hash_size: number of entries in head.
 *
 * NOTE(review): the Starboard definition is not visible in this header;
 * presumably it matches the inline version in the non-Starboard branch
 * (updates head, and prev unless FASTEST is defined) -- confirm.
 */
void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
const unsigned short w_size,
const uInt hash_size);
#else
#include <arm_neon.h>
/* Subtracts w_size from every one of the hash_size entries starting at hash,
 * clamping at zero, using NEON saturating arithmetic.
 *
 * hash:      first entry of the table; updated in place.
 * hash_size: number of 16-bit entries. The loop ends only on an exact match
 *            with the end pointer, so this must be a multiple of 16 entries.
 * w_size:    value subtracted from each entry.
 */
inline static void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
                                                        const uInt hash_size,
                                                        const ush w_size)
{
    /* A 128-bit NEON 'Q' register carries eight 16-bit lanes. For further
     * details, check: ARM DHT 0002A, section 1.3.2 NEON Registers.
     */
    const size_t lanes = sizeof(uint16x8_t) / sizeof(uint16_t);
    const uint16x8_t window = vdupq_n_u16(w_size);
    Posf *const last = hash + hash_size;

    /* Two registers are handled per pass. Unrolling the operation yielded a
     * compression performance boost in both ARMv7 (from 11.7% to 13.4%) and
     * ARMv8 (from 3.7% to 7.5%) for HTML4 content. For full benchmarking
     * data, check: http://crbug.com/863257.
     */
    while (hash != last) {
        Posf *const upper = hash + lanes;

        /* The 'q' in vqsubq_u16 makes the subtraction saturate to zero,
         * standing in for the scalar form:
         * (m >= wsize ? m - wsize : NIL).
         */
        const uint16x8_t lo = vqsubq_u16(vld1q_u16(hash), window);
        const uint16x8_t hi = vqsubq_u16(vld1q_u16(upper), window);

        vst1q_u16(hash, lo);
        vst1q_u16(upper, hi);

        hash = upper + lanes;
    }
}
/* Rebases the hash tables with the NEON helper.
 *
 * head:      hash table; all hash_size entries are updated.
 * prev:      second table; its first w_size entries are updated, unless
 *            FASTEST is defined, in which case it is left untouched.
 * w_size:    value subtracted (saturating at zero) from every entry.
 * hash_size: number of entries in head.
 */
inline static void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
                                                 const unsigned short w_size,
                                                 const uInt hash_size)
{
    /*
     * SIMD implementation for hash table rebase assumes:
     * 1. hash chain offset (Pos) is 2 bytes.
     * 2. hash table size is multiple of 32 bytes.
     * #1 should be true as Pos is defined as "ush"
     * #2 should be true as hash_bits are greater than 7
     */
    const size_t size = hash_size * sizeof(head[0]);

    Assert(sizeof(Pos) == 2, "Wrong Pos size.");
    /* '%' and '*' share the same precedence and associate left to right, so
     * the divisor has to be parenthesized: without the parentheses the check
     * is (size % 16) * 2 == 0, which only verifies a 16-byte multiple while
     * the helper consumes 32 bytes per iteration.
     */
    Assert((size % (sizeof(uint16x8_t) * 2)) == 0, "Hash table size error.");

    neon_slide_hash_update(head, hash_size, w_size);
#ifndef FASTEST
    neon_slide_hash_update(prev, w_size, w_size);
#endif
}
#endif
#endif