/*
 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "neon-compat.h"

#include <arm_neon.h>

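/* These two routines are the Neon implementations behind the progressive
 * Huffman encoder's SIMD dispatch; their prototypes are picked up from
 * ../jsimd.h (included above.)
 */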

/* Data preparation for encode_mcu_AC_first().
 *
 * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
 * found in jcphuff.c.
 */

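/* On return, values[0..DCTSIZE2-1] hold the absolute values of the Sl input
 * coefficients after the point transform (zero-padded), and
 * values[DCTSIZE2..2*DCTSIZE2-1] hold the corresponding diff values.
 * zerobits is a bitmap in which bit k is set if coefficient k (in scan
 * order) is nonzero after the point transform; it occupies one size_t word
 * on 64-bit builds and two on 32-bit builds.
 */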
void jsimd_encode_mcu_AC_first_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *values, size_t *zerobits)
{
  JCOEF *values_ptr = values;
  JCOEF *diff_values_ptr = values + DCTSIZE2;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

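  /* Process 16 coefficients per iteration.  The coefficients are gathered
   * lane by lane because jpeg_natural_order_start[] walks the block in scan
   * (zigzag) order, so consecutive coefficients are not contiguous in
   * memory.
   */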
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
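    /* vshlq_s16() with a negative shift count shifts right.  The operands
     * are non-negative here, so this implements the point transform: integer
     * division by 2^Al, rounding toward zero.
     */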
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values: the transformed value itself for a non-negative
     * coefficient, or its one's complement for a negative coefficient (the
     * form in which the magnitude bits of a negative value are emitted.)
     */
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
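    /* Gather the remaining 1-7 coefficients; lanes past the end of the scan
     * keep their zero initialization.  Each case falls through to the next.
     */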
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs1);
    vst1q_s16(values_ptr + DCTSIZE, coefs2);
    vst1q_s16(diff_values_ptr, diff1);
    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));

    /* Compute diff values. */
    int16x8_t diff = veorq_s16(coefs, sign_coefs);

    /* Store transformed coefficients and diff values. */
    vst1q_s16(values_ptr, coefs);
    vst1q_s16(diff_values_ptr, diff);
    values_ptr += 8;
    diff_values_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in the values and diff_values blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(values_ptr, vdupq_n_s16(0));
    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
    values_ptr += 8;
    diff_values_ptr += 8;
  }

  /* Construct zerobits bitmap.  A set bit means that the corresponding
   * coefficient != 0.
   */
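  /* Each row comparison below yields eight bytes of 0xFF (coefficient == 0)
   * or 0x00.  ANDing with bitmap_mask leaves at most one distinct bit per
   * byte, so the three rounds of pairwise addition behave as bitwise OR and
   * collapse each row to a single byte.  The eight resulting bytes form a
   * 64-bit map of the zero coefficients, which is inverted before storing.
   */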
  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);

  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  *zerobits = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  zerobits[0] = ~bitmap0;
  zerobits[1] = ~bitmap1;
#endif
}


/* Data preparation for encode_mcu_AC_refine().
 *
 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 * found in jcphuff.c.
 */

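/* On return, absvalues[0..DCTSIZE2-1] hold the absolute values of the Sl
 * input coefficients after the point transform (zero-padded.)  bits receives
 * the zerobits bitmap followed by the signbits bitmap (one size_t word each
 * on 64-bit builds; two words each on 32-bit builds.)  The return value is
 * the EOB position: the scan index of the last coefficient with absolute
 * value 1, or 0 if there is no such coefficient.
 */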
int jsimd_encode_mcu_AC_refine_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   JCOEF *absvalues, size_t *bits)
{
  /* Temporary storage buffers for data used to compute the signbits bitmap
   * and the end-of-block (EOB) position
   */
  uint8_t coef_sign_bits[64];
  uint8_t coef_eq1_bits[64];

  JCOEF *absvalues_ptr = absvalues;
  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
  uint8_t *eq1_bits_ptr = coef_eq1_bits;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
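    /* (Each stored sign byte is 0xFF for a negative coefficient and 0x00
     * otherwise.)
     */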

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs1);
    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs);

    /* Compute absolute value of coefficients and apply point transform Al. */
    int16x8_t abs_coefs = vabsq_s16(coefs);
    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
    vst1q_s16(absvalues_ptr, coefs);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq1);

    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
  }

  /* Construct zerobits bitmap. */
  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);

  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap0;
  bits[1] = ~bitmap1;
#endif

  /* Construct signbits bitmap. */
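  /* The buffered sign bytes are reduced with the same mask-and-pairwise-add
   * scheme as above.  Because the result is inverted before it is stored, a
   * set bit in the signbits bitmap means that the corresponding coefficient
   * is non-negative.
   */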
  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);

  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store signbits bitmap. */
  bits[1] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store signbits bitmap. */
  bits[2] = ~bitmap0;
  bits[3] = ~bitmap1;
#endif

  /* Construct bitmap to find EOB position (the index of the last coefficient
   * equal to 1.)
   */
  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);

  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
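  /* The index of the highest set bit in this bitmap is the scan position of
   * the last coefficient with absolute value 1; it is recovered below with a
   * count-leading-zeros operation.
   */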

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);

  /* Return EOB position. */
  if (bitmap == 0) {
    /* EOB position is defined to be 0 if all coefficients != 1. */
    return 0;
  } else {
    return 63 - BUILTIN_CLZLL(bitmap);
  }
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);

  /* Return EOB position. */
  if (bitmap0 == 0 && bitmap1 == 0) {
    return 0;
  } else if (bitmap1 != 0) {
    return 63 - BUILTIN_CLZ(bitmap1);
  } else {
    return 31 - BUILTIN_CLZ(bitmap0);
  }
#endif
}