blob: a5240d7d547fcb3fcc627229a978ff13b638cde6 [file] [log] [blame]
/* Copyright 2018 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#include "slide_hash_neon.h"
#if defined(STARBOARD)
#include <arm_neon.h>
/* Rebases a table of 16-bit positions in place: every entry becomes
 * (entry - w_size), saturating at zero when the entry is smaller than w_size.
 *
 *   hash      - first entry of the table; updated in place.
 *   hash_size - number of entries (not bytes) in the table.
 *   w_size    - amount subtracted from every entry.
 *
 * The bulk of the table is processed 16 entries at a time with NEON; any
 * entries left over when hash_size is not a multiple of 16 are handled by a
 * scalar loop, so the function never touches memory outside the table.
 */
void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
                                          const uInt hash_size,
                                          const ush w_size)
{
    /* A NEON 'Q' register holds 128 bits, so one load brings in 8 x 16-bit
     * values. For further details, check:
     * ARM DHT 0002A, section 1.3.2 NEON Registers.
     */
    const size_t chunk = sizeof(uint16x8_t) / sizeof(uint16_t);
    /* Unrolling the operation yielded a compression performance boost in both
     * ARMv7 (from 11.7% to 13.4%) and ARMv8 (from 3.7% to 7.5%) for HTML4
     * content. For full benchmarking data, check: http://crbug.com/863257.
     */
    const size_t stride = 2 * chunk;
    const uint16x8_t v = vdupq_n_u16(w_size);
    size_t remaining = hash_size;

    /* Bound the loop by the number of entries still to process rather than
     * by pointer equality: stepping 'stride' entries at a time can jump over
     * the end pointer when hash_size is not a multiple of 'stride'.
     */
    for (; remaining >= stride; remaining -= stride, hash += stride) {
        uint16x8_t m_low = vld1q_u16(hash);
        uint16x8_t m_high = vld1q_u16(hash + chunk);
        /* The first 'q' in vqsubq_u16 makes these subtracts saturate to
         * zero, replacing the ternary operator expression in the original
         * code: (m >= wsize ? m - wsize : NIL).
         */
        m_low = vqsubq_u16(m_low, v);
        m_high = vqsubq_u16(m_high, v);
        vst1q_u16(hash, m_low);
        vst1q_u16(hash + chunk, m_high);
    }

    /* Scalar tail for the (normally zero) entries that do not fill a whole
     * stride. Uses 0 as the saturated value, matching vqsubq_u16 above.
     */
    for (; remaining != 0; --remaining, ++hash) {
        const ush m = *hash;
        *hash = (ush)(m >= w_size ? m - w_size : 0);
    }
}
/* Rebases the deflate hash tables by w_size using the NEON update routine.
 *
 *   head      - hash head table with hash_size entries; updated in place.
 *   prev      - hash chain table with w_size entries; updated in place
 *               unless FASTEST is defined, in which case it is left alone.
 *   w_size    - amount subtracted from every entry, and also the number of
 *               entries in prev.
 *   hash_size - number of entries in head.
 */
void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
                                   const unsigned short w_size,
                                   const uInt hash_size)
{
    /*
     * SIMD implementation for hash table rebase assumes:
     * 1. hash chain offset (Pos) is 2 bytes.
     * 2. hash table size is multiple of 32 bytes.
     * #1 should be true as Pos is defined as "ush"
     * #2 should be true as hash_bits are greater than 7
     */
    Assert(sizeof(Pos) == 2, "Wrong Pos size.");
    /* The divisor must be parenthesised: '%' and '*' share precedence and
     * associate left to right, so "size % sizeof(uint16x8_t) * 2" would only
     * verify a 16-byte multiple instead of the 32 bytes consumed per
     * iteration of the unrolled update loop.
     */
    Assert((hash_size * sizeof(head[0])) % (sizeof(uint16x8_t) * 2) == 0,
           "Hash table size error.");
    neon_slide_hash_update(head, hash_size, w_size);
#ifndef FASTEST
    /* prev holds w_size entries and goes through the same unrolled loop, so
     * it is subject to the same 32-byte granularity requirement.
     */
    Assert((w_size * sizeof(prev[0])) % (sizeof(uint16x8_t) * 2) == 0,
           "Window size error.");
    neon_slide_hash_update(prev, w_size, w_size);
#endif
}
#endif