third_party/libjpeg-turbo/simd/arm/jquanti-neon.c - cobalt - Git at Google

 /*
  * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
  * arising from the use of this software.
  *
  * Permission is granted to anyone to use this software for any purpose,
  * including commercial applications, and to alter it and redistribute it
  * freely, subject to the following restrictions:
  *
  * 1. The origin of this software must not be misrepresented; you must not
  *    claim that you wrote the original software. If you use this software
  *    in a product, an acknowledgment in the product documentation would be
  *    appreciated but is not required.
  * 2. Altered source versions must be plainly marked as such, and must not be
  *    misrepresented as being the original software.
  * 3. This notice may not be removed or altered from any source distribution.
  */

 #define JPEG_INTERNALS
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"

 #include <arm_neon.h>


 /* After downsampling, the resulting sample values are in the range [0, 255],
  * but the Discrete Cosine Transform (DCT) operates on values centered around
  * 0.
  *
  * To prepare sample values for the DCT, load samples into a DCT workspace,
  * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
  * are also widened from 8- to 16-bit.
  *
  * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
  */

 void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
                          DCTELEM *workspace)
 {
   uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
   uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
   uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
   uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
   uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
   uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
   uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
   uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);

   int16x8_t row0 =
     vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
   int16x8_t row1 =
     vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
   int16x8_t row2 =
     vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
   int16x8_t row3 =
     vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
   int16x8_t row4 =
     vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
   int16x8_t row5 =
     vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
   int16x8_t row6 =
     vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
   int16x8_t row7 =
     vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));

   vst1q_s16(workspace + 0 * DCTSIZE, row0);
   vst1q_s16(workspace + 1 * DCTSIZE, row1);
   vst1q_s16(workspace + 2 * DCTSIZE, row2);
   vst1q_s16(workspace + 3 * DCTSIZE, row3);
   vst1q_s16(workspace + 4 * DCTSIZE, row4);
   vst1q_s16(workspace + 5 * DCTSIZE, row5);
   vst1q_s16(workspace + 6 * DCTSIZE, row6);
   vst1q_s16(workspace + 7 * DCTSIZE, row7);
 }


 /* After the DCT, the resulting array of coefficient values needs to be divided
  * by an array of quantization values.
  *
  * To avoid a slow division operation, the DCT coefficients are multiplied by
  * the (scaled) reciprocals of the quantization values and then right-shifted.
  *
  * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
  */

 void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
                          DCTELEM *workspace)
 {
   JCOEFPTR out_ptr = coef_block;
   UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
   UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
   DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
   int i;

   for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
     /* Load DCT coefficients. */
     int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
     int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
     int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
     int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
     /* Load reciprocals of quantization values. */
     uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
     uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
     uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
     uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
     uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
     uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
     uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
     uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
     int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
     int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
     int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
     int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);

     /* Extract sign from coefficients. */
     int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
     int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
     int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
     int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
     /* Get absolute value of DCT coefficients. */
     uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
     uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
     uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
     uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
     /* Add correction. */
     abs_row0 = vaddq_u16(abs_row0, corr0);
     abs_row1 = vaddq_u16(abs_row1, corr1);
     abs_row2 = vaddq_u16(abs_row2, corr2);
     abs_row3 = vaddq_u16(abs_row3, corr3);

     /* Multiply DCT coefficients by quantization reciprocals. */
     int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
                                                        vget_low_u16(recip0)));
     int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
                                                        vget_high_u16(recip0)));
     int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
                                                        vget_low_u16(recip1)));
     int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
                                                        vget_high_u16(recip1)));
     int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
                                                        vget_low_u16(recip2)));
     int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
                                                        vget_high_u16(recip2)));
     int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
                                                        vget_low_u16(recip3)));
     int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
                                                        vget_high_u16(recip3)));
     /* Narrow back to 16-bit. */
     row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
     row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
     row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
     row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));

     /* Since VSHR only supports an immediate as its second argument, negate the
      * shift value and shift left.
      */
     row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
                                            vnegq_s16(shift0)));
     row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
                                            vnegq_s16(shift1)));
     row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
                                            vnegq_s16(shift2)));
     row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
                                            vnegq_s16(shift3)));

     /* Restore sign to original product. */
     row0 = veorq_s16(row0, sign_row0);
     row0 = vsubq_s16(row0, sign_row0);
     row1 = veorq_s16(row1, sign_row1);
     row1 = vsubq_s16(row1, sign_row1);
     row2 = veorq_s16(row2, sign_row2);
     row2 = vsubq_s16(row2, sign_row2);
     row3 = veorq_s16(row3, sign_row3);
     row3 = vsubq_s16(row3, sign_row3);

     /* Store quantized coefficients to memory. */
     vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
     vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
     vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
     vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
   }
 }
	/*
	* jquanti-neon.c - sample data conversion and quantization (Arm Neon)
	*
	* Copyright (C) 2020, Arm Limited. All Rights Reserved.
	*
	* This software is provided 'as-is', without any express or implied
	* warranty. In no event will the authors be held liable for any damages
	* arising from the use of this software.
	*
	* Permission is granted to anyone to use this software for any purpose,
	* including commercial applications, and to alter it and redistribute it
	* freely, subject to the following restrictions:
	*
	* 1. The origin of this software must not be misrepresented; you must not
	* claim that you wrote the original software. If you use this software
	* in a product, an acknowledgment in the product documentation would be
	* appreciated but is not required.
	* 2. Altered source versions must be plainly marked as such, and must not be
	* misrepresented as being the original software.
	* 3. This notice may not be removed or altered from any source distribution.
	*/

	#define JPEG_INTERNALS
	#include "../../jinclude.h"
	#include "../../jpeglib.h"
	#include "../../jsimd.h"
	#include "../../jdct.h"
	#include "../../jsimddct.h"
	#include "../jsimd.h"

	#include <arm_neon.h>


	/* After downsampling, the resulting sample values are in the range [0, 255],
	* but the Discrete Cosine Transform (DCT) operates on values centered around
	* 0.
	*
	* To prepare sample values for the DCT, load samples into a DCT workspace,
	* subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
	* are also widened from 8- to 16-bit.
	*
	* The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
	*/

	void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
	DCTELEM *workspace)
	{
	uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
	uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
	uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
	uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
	uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
	uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
	uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
	uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);

	int16x8_t row0 =
	vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
	int16x8_t row1 =
	vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
	int16x8_t row2 =
	vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
	int16x8_t row3 =
	vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
	int16x8_t row4 =
	vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
	int16x8_t row5 =
	vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
	int16x8_t row6 =
	vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
	int16x8_t row7 =
	vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));

	vst1q_s16(workspace + 0 * DCTSIZE, row0);
	vst1q_s16(workspace + 1 * DCTSIZE, row1);
	vst1q_s16(workspace + 2 * DCTSIZE, row2);
	vst1q_s16(workspace + 3 * DCTSIZE, row3);
	vst1q_s16(workspace + 4 * DCTSIZE, row4);
	vst1q_s16(workspace + 5 * DCTSIZE, row5);
	vst1q_s16(workspace + 6 * DCTSIZE, row6);
	vst1q_s16(workspace + 7 * DCTSIZE, row7);
	}


	/* After the DCT, the resulting array of coefficient values needs to be divided
	* by an array of quantization values.
	*
	* To avoid a slow division operation, the DCT coefficients are multiplied by
	* the (scaled) reciprocals of the quantization values and then right-shifted.
	*
	* The equivalent scalar C function quantize() can be found in jcdctmgr.c.
	*/

	void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
	DCTELEM *workspace)
	{
	JCOEFPTR out_ptr = coef_block;
	UDCTELEM recip_ptr = (UDCTELEM )divisors;
	UDCTELEM corr_ptr = (UDCTELEM )divisors + DCTSIZE2;
	DCTELEM shift_ptr = divisors + 3 DCTSIZE2;
	int i;

	for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
	/* Load DCT coefficients. */
	int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
	int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
	int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
	int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
	/* Load reciprocals of quantization values. */
	uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
	uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
	uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
	uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
	uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
	uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
	uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
	uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
	int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
	int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
	int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
	int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);

	/* Extract sign from coefficients. */
	int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
	int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
	int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
	int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
	/* Get absolute value of DCT coefficients. */
	uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
	uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
	uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
	uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
	/* Add correction. */
	abs_row0 = vaddq_u16(abs_row0, corr0);
	abs_row1 = vaddq_u16(abs_row1, corr1);
	abs_row2 = vaddq_u16(abs_row2, corr2);
	abs_row3 = vaddq_u16(abs_row3, corr3);

	/* Multiply DCT coefficients by quantization reciprocals. */
	int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
	vget_low_u16(recip0)));
	int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
	vget_high_u16(recip0)));
	int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
	vget_low_u16(recip1)));
	int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
	vget_high_u16(recip1)));
	int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
	vget_low_u16(recip2)));
	int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
	vget_high_u16(recip2)));
	int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
	vget_low_u16(recip3)));
	int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
	vget_high_u16(recip3)));
	/* Narrow back to 16-bit. */
	row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
	row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
	row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
	row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));

	/* Since VSHR only supports an immediate as its second argument, negate the
	* shift value and shift left.
	*/
	row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
	vnegq_s16(shift0)));
	row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
	vnegq_s16(shift1)));
	row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
	vnegq_s16(shift2)));
	row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
	vnegq_s16(shift3)));

	/* Restore sign to original product. */
	row0 = veorq_s16(row0, sign_row0);
	row0 = vsubq_s16(row0, sign_row0);
	row1 = veorq_s16(row1, sign_row1);
	row1 = vsubq_s16(row1, sign_row1);
	row2 = veorq_s16(row2, sign_row2);
	row2 = vsubq_s16(row2, sign_row2);
	row3 = veorq_s16(row3, sign_row3);
	row3 = vsubq_s16(row3, sign_row3);

	/* Store quantized coefficients to memory. */
	vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
	vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
	vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
	vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
	}
	}