| /* |
| * jdsample-neon.c - upsampling (Arm Neon) |
| * |
| * Copyright (C) 2020, Arm Limited. All Rights Reserved. |
| * Copyright (C) 2020, D. R. Commander. All Rights Reserved. |
| * |
| * This software is provided 'as-is', without any express or implied |
| * warranty. In no event will the authors be held liable for any damages |
| * arising from the use of this software. |
| * |
| * Permission is granted to anyone to use this software for any purpose, |
| * including commercial applications, and to alter it and redistribute it |
| * freely, subject to the following restrictions: |
| * |
| * 1. The origin of this software must not be misrepresented; you must not |
| * claim that you wrote the original software. If you use this software |
| * in a product, an acknowledgment in the product documentation would be |
| * appreciated but is not required. |
| * 2. Altered source versions must be plainly marked as such, and must not be |
| * misrepresented as being the original software. |
| * 3. This notice may not be removed or altered from any source distribution. |
| */ |
| |
| #define JPEG_INTERNALS |
| #include "../../jinclude.h" |
| #include "../../jpeglib.h" |
| #include "../../jsimd.h" |
| #include "../../jdct.h" |
| #include "../../jsimddct.h" |
| #include "../jsimd.h" |
| |
| #include <arm_neon.h> |
| |
| |
/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            |         |         |         |
 *            | p0   p1 | p2   p3 | p4   p5 |
 *            |         |         |         |
 *            +---------+---------+---------+
 *
 * Samples s0-s2 were created by averaging the original pixel component values
 * centered at positions p0-p5 above. To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each row.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1. For example:
 *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
 *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
 * When computing the first and last pixel component values in the row, there
 * is no adjacent sample to blend, so:
 *     p0(upsampled) = s0
 *     p5(upsampled) = s2
 */
| |
/* Fancy h2v1 upsampling (Arm Neon): doubles the width of each row.
 *
 * max_v_samp_factor  number of rows to process
 * downsampled_width  number of samples in each input row
 * input_data         input rows, indexed 0 .. max_v_samp_factor - 1
 * output_data_ptr    points to the array of output rows; each output row
 *                    receives 2 * downsampled_width values
 *
 * Output 0 and output (2 * downsampled_width - 1) of every row are plain
 * copies of the first and last input sample.  Every other output is
 * (3 * containing sample + neighboring sample + bias) >> 2, with a bias of 2
 * at odd output positions and a bias of 1 at even output positions.
 *
 * The 16-byte loads and 32-byte interleaved stores may run past the logical
 * end of a row; this relies on the padded row allocation described under
 * "Creation of 2-D sample arrays" in jmemmgr.c.
 */
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  const uint8x8_t weight3 = vdup_n_u8(3);
  const uint16x8_t bias1 = vdupq_n_u16(1);
  int row;

  for (row = 0; row < max_v_samp_factor; row++) {
    JSAMPROW src = input_data[row];
    JSAMPROW dst = output_data[row];
    uint8x16_t left, right;
    uint8x8_t left_lo, left_hi, right_lo, right_hi;
    /* 16-bit weighted sums, split into low ('_lo') and high ('_hi') halves.
     * 'odd' sums feed odd output positions (3 * left + right); 'even' sums
     * feed even output positions (3 * right + left + 1).
     */
    uint16x8_t odd_lo, odd_hi, even_lo, even_hi;
    uint8x16x2_t pixels;
    unsigned col, store_at;

    /* Leftmost output of the row: there is no neighbor to blend with. */
    dst[0] = (JSAMPLE)GETJSAMPLE(src[0]);

    /* Pipeline prologue: form the weighted sums for the first 16 sample
     * pairs.  They are narrowed and stored one step later.
     */
    left = vld1q_u8(src);
    right = vld1q_u8(src + 1);
    left_lo = vget_low_u8(left);
    left_hi = vget_high_u8(left);
    right_lo = vget_low_u8(right);
    right_hi = vget_high_u8(right);
    odd_lo = vmlal_u8(vmovl_u8(right_lo), left_lo, weight3);
    odd_hi = vmlal_u8(vmovl_u8(right_hi), left_hi, weight3);
    even_lo = vaddq_u16(vmlal_u8(vmovl_u8(left_lo), right_lo, weight3), bias1);
    even_hi = vaddq_u16(vmlal_u8(vmovl_u8(left_hi), right_hi, weight3), bias1);

    /* The first store lands at offset 1 because output 0 is already written.
     * Every later store lands at (2 * col - 1), which keeps the accesses
     * inside the padded sample buffers without a scalar tail loop.
     */
    store_at = 1;
    col = 16;
    while (col < downsampled_width) {
      /* Start the next group: fetch its samples before anything else. */
      left = vld1q_u8(src + col - 1);
      right = vld1q_u8(src + col);

      /* Finish the previous group: divide by 4 and narrow to 8 bits.  The
       * rounding shift supplies the bias of 2 for the odd positions; the even
       * sums already carry their bias of 1, so they use a truncating shift.
       */
      pixels.val[0] = vcombine_u8(vrshrn_n_u16(odd_lo, 2),
                                  vrshrn_n_u16(odd_hi, 2));
      pixels.val[1] = vcombine_u8(vshrn_n_u16(even_lo, 2),
                                  vshrn_n_u16(even_hi, 2));

      /* Weighted sums for the group just loaded. */
      left_lo = vget_low_u8(left);
      left_hi = vget_high_u8(left);
      right_lo = vget_low_u8(right);
      right_hi = vget_high_u8(right);
      odd_lo = vmlal_u8(vmovl_u8(right_lo), left_lo, weight3);
      odd_hi = vmlal_u8(vmovl_u8(right_hi), left_hi, weight3);
      even_lo =
        vaddq_u16(vmlal_u8(vmovl_u8(left_lo), right_lo, weight3), bias1);
      even_hi =
        vaddq_u16(vmlal_u8(vmovl_u8(left_hi), right_hi, weight3), bias1);

      /* Interleave and store the previous group's 32 output values. */
      vst2q_u8(dst + store_at, pixels);
      store_at = 2 * col - 1;
      col += 16;
    }

    /* Pipeline epilogue: narrow and store the final group. */
    pixels.val[0] = vcombine_u8(vrshrn_n_u16(odd_lo, 2),
                                vrshrn_n_u16(odd_hi, 2));
    pixels.val[1] = vcombine_u8(vshrn_n_u16(even_lo, 2),
                                vshrn_n_u16(even_hi, 2));
    vst2q_u8(dst + store_at, pixels);

    /* Rightmost output of the row: again no neighbor to blend with. */
    dst[2 * downsampled_width - 1] = GETJSAMPLE(src[downsampled_width - 1]);
  }
}
| |
| |
/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            | p0   p1 | p2   p3 | p4   p5 |
 *       sA   |         |         |         |
 *            | p6   p7 | p8   p9 | p10  p11|
 *            +---------+---------+---------+
 *            | p12  p13| p14  p15| p16  p17|
 *       sB   |         |         |         |
 *            | p18  p19| p20  p21| p22  p23|
 *            +---------+---------+---------+
 *            | p24  p25| p26  p27| p28  p29|
 *       sC   |         |         |         |
 *            | p30  p31| p32  p33| p34  p35|
 *            +---------+---------+---------+
 *
 * Samples s0A-s2C were created by averaging the original pixel component
 * values centered at positions p0-p35 above. To approximate one of those
 * original pixel component values, we proportionally blend the sample
 * containing the pixel center with the nearest neighboring samples in each
 * row, column, and diagonal.
 *
 * An upsampled pixel component value is computed by first blending the sample
 * containing the pixel center with the nearest neighboring samples in the
 * same column, in the ratio 3:1, and then blending each column sum with the
 * nearest neighboring column sum, in the ratio 3:1. For example:
 *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
 *                      1/4 * (3/4 * s0B + 1/4 * s0A)
 *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
 * When computing the first and last pixel component values in the row, there
 * is no horizontally adjacent sample to blend, so:
 *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
 *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
 * When computing the first and last pixel component values in the column,
 * there is no vertically adjacent sample to blend, so:
 *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
 *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
 * When computing the corner pixel component values, there is no adjacent
 * sample to blend, so:
 *     p0(upsampled) = s0A
 *     p35(upsampled) = s2C
 */
| |
/* Blend 16 horizontally adjacent sample pairs against the rows above and
 * below, and write the 32 interleaved output values they determine to each of
 * the two output rows.
 *
 * above, cur, below    point at the left-hand sample of the first pair in the
 *                      previous, current and next input row; the right-hand
 *                      samples are read starting one element further on
 * out_upper, out_lower point at the first of the 32 values to write in the
 *                      upper and lower output row
 *
 * Vertical pass:   colsum = 3 * cur + (above or below)
 * Horizontal pass: first value of each output pair
 *                    = (3 * left colsum + right colsum + 7) >> 4
 *                  second value of each output pair
 *                    = (3 * right colsum + left colsum + 8) >> 4
 */
static inline void h2v2_fancy_blend16_neon(JSAMPROW above, JSAMPROW cur,
                                           JSAMPROW below, JSAMPROW out_upper,
                                           JSAMPROW out_lower)
{
  const uint8x8_t weight3_u8 = vdup_n_u8(3);
  const uint16x8_t weight3_u16 = vdupq_n_u16(3);
  const uint16x8_t bias7 = vdupq_n_u16(7);
  uint8x16x2_t upper, lower;

  /* Vertical pass, left-hand sample of each pair.  The widening multiply
   * doubles the lane width, so every sum is kept as a low ('_lo') and a high
   * ('_hi') half.  '_up' sums blend with the row above, '_dn' sums with the
   * row below.
   */
  uint8x16_t left_above = vld1q_u8(above);
  uint8x16_t left_cur = vld1q_u8(cur);
  uint8x16_t left_below = vld1q_u8(below);
  uint8x8_t left_cur_lo = vget_low_u8(left_cur);
  uint8x8_t left_cur_hi = vget_high_u8(left_cur);
  uint16x8_t left_up_lo =
    vmlal_u8(vmovl_u8(vget_low_u8(left_above)), left_cur_lo, weight3_u8);
  uint16x8_t left_up_hi =
    vmlal_u8(vmovl_u8(vget_high_u8(left_above)), left_cur_hi, weight3_u8);
  uint16x8_t left_dn_lo =
    vmlal_u8(vmovl_u8(vget_low_u8(left_below)), left_cur_lo, weight3_u8);
  uint16x8_t left_dn_hi =
    vmlal_u8(vmovl_u8(vget_high_u8(left_below)), left_cur_hi, weight3_u8);

  /* Vertical pass, right-hand sample of each pair. */
  uint8x16_t right_above = vld1q_u8(above + 1);
  uint8x16_t right_cur = vld1q_u8(cur + 1);
  uint8x16_t right_below = vld1q_u8(below + 1);
  uint8x8_t right_cur_lo = vget_low_u8(right_cur);
  uint8x8_t right_cur_hi = vget_high_u8(right_cur);
  uint16x8_t right_up_lo =
    vmlal_u8(vmovl_u8(vget_low_u8(right_above)), right_cur_lo, weight3_u8);
  uint16x8_t right_up_hi =
    vmlal_u8(vmovl_u8(vget_high_u8(right_above)), right_cur_hi, weight3_u8);
  uint16x8_t right_dn_lo =
    vmlal_u8(vmovl_u8(vget_low_u8(right_below)), right_cur_lo, weight3_u8);
  uint16x8_t right_dn_hi =
    vmlal_u8(vmovl_u8(vget_high_u8(right_below)), right_cur_hi, weight3_u8);

  /* Horizontal pass, first value of each output pair: the left column sum
   * dominates.  The bias of 7 is added here and the shift below truncates.
   */
  uint16x8_t up_first_lo =
    vaddq_u16(vmlaq_u16(right_up_lo, left_up_lo, weight3_u16), bias7);
  uint16x8_t up_first_hi =
    vaddq_u16(vmlaq_u16(right_up_hi, left_up_hi, weight3_u16), bias7);
  uint16x8_t dn_first_lo =
    vaddq_u16(vmlaq_u16(right_dn_lo, left_dn_lo, weight3_u16), bias7);
  uint16x8_t dn_first_hi =
    vaddq_u16(vmlaq_u16(right_dn_hi, left_dn_hi, weight3_u16), bias7);

  /* Horizontal pass, second value of each output pair: the right column sum
   * dominates.  The rounding shift below supplies the bias of 8.
   */
  uint16x8_t up_second_lo = vmlaq_u16(left_up_lo, right_up_lo, weight3_u16);
  uint16x8_t up_second_hi = vmlaq_u16(left_up_hi, right_up_hi, weight3_u16);
  uint16x8_t dn_second_lo = vmlaq_u16(left_dn_lo, right_dn_lo, weight3_u16);
  uint16x8_t dn_second_hi = vmlaq_u16(left_dn_hi, right_dn_hi, weight3_u16);

  /* Divide by 16, narrow to 8 bits, and pair the halves back up. */
  upper.val[0] = vcombine_u8(vshrn_n_u16(up_first_lo, 4),
                             vshrn_n_u16(up_first_hi, 4));
  upper.val[1] = vcombine_u8(vrshrn_n_u16(up_second_lo, 4),
                             vrshrn_n_u16(up_second_hi, 4));
  lower.val[0] = vcombine_u8(vshrn_n_u16(dn_first_lo, 4),
                             vshrn_n_u16(dn_first_hi, 4));
  lower.val[1] = vcombine_u8(vrshrn_n_u16(dn_second_lo, 4),
                             vrshrn_n_u16(dn_second_hi, 4));

  /* Interleaving stores: first, second, first, second, ... */
  vst2q_u8(out_upper, upper);
  vst2q_u8(out_lower, lower);
}


/* Fancy h2v2 upsampling (Arm Neon): doubles both the width and the height.
 *
 * max_v_samp_factor  number of output rows to produce (two per input row)
 * downsampled_width  number of samples in each input row
 * input_data         input rows; rows (inrow - 1) and (inrow + 1) are read as
 *                    vertical neighbors, so row -1 is accessed for inrow 0
 * output_data_ptr    points to the array of output rows; each output row
 *                    receives 2 * downsampled_width values
 *
 * The first and last value of every output row have no horizontal neighbor
 * and are computed in scalar code from a single column sum.
 *
 * The vector loads and stores may run past the logical end of a row; this
 * relies on the padded row allocation described under "Creation of 2-D sample
 * arrays" in jmemmgr.c.
 */
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  int inrow, outrow;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor;
       inrow++, outrow += 2) {
    JSAMPROW above = input_data[inrow - 1];
    JSAMPROW cur = input_data[inrow];
    JSAMPROW below = input_data[inrow + 1];
    JSAMPROW out_upper = output_data[outrow];
    JSAMPROW out_lower = output_data[outrow + 1];
    unsigned col;
    int colsum;

    /* Leftmost value of each output row: vertical blend only, rounded with a
     * bias of 8.
     */
    colsum = GETJSAMPLE(cur[0]) * 3 + GETJSAMPLE(above[0]);
    out_upper[0] = (JSAMPLE)((colsum * 4 + 8) >> 4);
    colsum = GETJSAMPLE(cur[0]) * 3 + GETJSAMPLE(below[0]);
    out_lower[0] = (JSAMPLE)((colsum * 4 + 8) >> 4);

    /* First 16 sample pairs.  Output 0 is already written, so the vector
     * results start at output offset 1.
     */
    h2v2_fancy_blend16_neon(above, cur, below, out_upper + 1, out_lower + 1);

    /* Remaining groups.  Each one starts at sample (col - 1) and writes from
     * output offset (2 * col - 1).
     */
    for (col = 16; col < downsampled_width; col += 16) {
      h2v2_fancy_blend16_neon(above + col - 1, cur + col - 1, below + col - 1,
                              out_upper + 2 * col - 1,
                              out_lower + 2 * col - 1);
    }

    /* Rightmost value of each output row: vertical blend only, with a bias
     * of 7.
     */
    colsum = GETJSAMPLE(cur[downsampled_width - 1]) * 3 +
             GETJSAMPLE(above[downsampled_width - 1]);
    out_upper[2 * downsampled_width - 1] = (JSAMPLE)((colsum * 4 + 7) >> 4);
    colsum = GETJSAMPLE(cur[downsampled_width - 1]) * 3 +
             GETJSAMPLE(below[downsampled_width - 1]);
    out_lower[2 * downsampled_width - 1] = (JSAMPLE)((colsum * 4 + 7) >> 4);
  }
}
| |
| |
/* The diagram below shows a column of samples produced by h1v2 downsampling
 * (or by losslessly rotating or transposing an h2v1-downsampled image.)
 *
 *            +---------+
 *            |   p0    |
 *     sA     |         |
 *            |   p1    |
 *            +---------+
 *            |   p2    |
 *     sB     |         |
 *            |   p3    |
 *            +---------+
 *            |   p4    |
 *     sC     |         |
 *            |   p5    |
 *            +---------+
 *
 * Samples sA-sC were created by averaging the original pixel component values
 * centered at positions p0-p5 above. To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each
 * column.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1. For example:
 *     p1(upsampled) = 3/4 * sA + 1/4 * sB
 *     p2(upsampled) = 3/4 * sB + 1/4 * sA
 * When computing the first and last pixel component values in the column,
 * there is no adjacent sample to blend, so:
 *     p0(upsampled) = sA
 *     p5(upsampled) = sC
 */
| |
/* Fancy h1v2 upsampling (Arm Neon): doubles the height, keeps the width.
 *
 * max_v_samp_factor  number of output rows to produce (two per input row)
 * downsampled_width  number of samples processed in each row
 * input_data         input rows; rows (inrow - 1) and (inrow + 1) are read as
 *                    vertical neighbors, so row -1 is accessed for inrow 0
 * output_data_ptr    points to the array of output rows
 *
 * For every input row two output rows are written:
 *   upper = (3 * current + above + 1) >> 2
 *   lower = (3 * current + below + 2) >> 2
 *
 * Samples are handled 16 at a time with no scalar tail; this relies on the
 * row buffers being padded as described under "Creation of 2-D sample arrays"
 * in jmemmgr.c.
 */
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  const uint8x8_t weight3 = vdup_n_u8(3);
  const uint16x8_t bias1 = vdupq_n_u16(1);
  int inrow, outrow;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor;
       inrow++, outrow += 2) {
    JSAMPROW above = input_data[inrow - 1];
    JSAMPROW cur = input_data[inrow];
    JSAMPROW below = input_data[inrow + 1];
    JSAMPROW dst_upper = output_data[outrow];
    JSAMPROW dst_lower = output_data[outrow + 1];
    unsigned col = 0;

    while (col < downsampled_width) {
      /* Fetch 16 samples from each of the three input rows. */
      uint8x16_t va = vld1q_u8(above + col);
      uint8x16_t vc = vld1q_u8(cur + col);
      uint8x16_t vb = vld1q_u8(below + col);
      uint8x8_t cur_lo = vget_low_u8(vc);
      uint8x8_t cur_hi = vget_high_u8(vc);

      /* 16-bit sums: neighbor + 3 * current, as low and high halves. */
      uint16x8_t upper_lo =
        vmlal_u8(vmovl_u8(vget_low_u8(va)), cur_lo, weight3);
      uint16x8_t upper_hi =
        vmlal_u8(vmovl_u8(vget_high_u8(va)), cur_hi, weight3);
      uint16x8_t lower_lo =
        vmlal_u8(vmovl_u8(vget_low_u8(vb)), cur_lo, weight3);
      uint16x8_t lower_hi =
        vmlal_u8(vmovl_u8(vget_high_u8(vb)), cur_hi, weight3);
      uint8x16_t upper, lower;

      /* The upper row takes a bias of 1 and a truncating shift; the lower row
       * gets its bias of 2 from the rounding shift.
       */
      upper_lo = vaddq_u16(upper_lo, bias1);
      upper_hi = vaddq_u16(upper_hi, bias1);
      upper = vcombine_u8(vshrn_n_u16(upper_lo, 2), vshrn_n_u16(upper_hi, 2));
      lower = vcombine_u8(vrshrn_n_u16(lower_lo, 2),
                          vrshrn_n_u16(lower_hi, 2));

      vst1q_u8(dst_upper + col, upper);
      vst1q_u8(dst_lower + col, lower);
      col += 16;
    }
  }
}
| |
| |
/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            |         |         |
 *            | p0   p1 | p2   p3 |
 *            |         |         |
 *            +---------+---------+
 *
 * Samples s0 and s1 were created by averaging the original pixel component
 * values centered at positions p0-p3 above. To approximate those original
 * pixel component values, we duplicate the samples horizontally:
 *     p0(upsampled) = p1(upsampled) = s0
 *     p2(upsampled) = p3(upsampled) = s1
 */
| |
/* Plain h2v1 upsampling (Arm Neon): each input sample is written twice in a
 * row, doubling the width.
 *
 * max_v_samp_factor  number of rows to process
 * output_width       number of values required in each output row
 * input_data         input rows, indexed 0 .. max_v_samp_factor - 1
 * output_data_ptr    points to the array of output rows
 *
 * Output is produced 32 values at a time with no tail handling when
 * output_width is not a multiple of 32; this relies on the row buffers being
 * padded as described under "Creation of 2-D sample arrays" in jmemmgr.c.
 */
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  int row;

  for (row = 0; row < max_v_samp_factor; row++) {
    JSAMPROW src = input_data[row];
    JSAMPROW dst = output_data[row];
    unsigned col = 0;

    while (2 * col < output_width) {
      uint8x16x2_t doubled;

      /* Both lanes of the pair hold the same 16 samples, so the interleaving
       * store emits every sample twice in succession.
       */
      doubled.val[0] = vld1q_u8(src + col);
      doubled.val[1] = doubled.val[0];
      vst2q_u8(dst + 2 * col, doubled);
      col += 16;
    }
  }
}
| |
| |
/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            | p0   p1 | p2   p3 |
 *       sA   |         |         |
 *            | p4   p5 | p6   p7 |
 *            +---------+---------+
 *            | p8   p9 | p10  p11|
 *       sB   |         |         |
 *            | p12  p13| p14  p15|
 *            +---------+---------+
 *
 * Samples s0A-s1B were created by averaging the original pixel component
 * values centered at positions p0-p15 above. To approximate those original
 * pixel component values, we duplicate the samples both horizontally and
 * vertically:
 *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
 *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
 *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
 *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
 */
| |
/* Plain h2v2 upsampling (Arm Neon): each input sample is written twice in a
 * row, and each doubled row is written to two consecutive output rows.
 *
 * max_v_samp_factor  number of output rows to produce (two per input row)
 * output_width       number of values required in each output row
 * input_data         input rows, one consumed per pair of output rows
 * output_data_ptr    points to the array of output rows
 *
 * Output is produced 32 values at a time with no tail handling when
 * output_width is not a multiple of 32; this relies on the row buffers being
 * padded as described under "Creation of 2-D sample arrays" in jmemmgr.c.
 */
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  int inrow = 0;
  int outrow = 0;

  while (outrow < max_v_samp_factor) {
    JSAMPROW src = input_data[inrow];
    JSAMPROW dst_upper = output_data[outrow];
    JSAMPROW dst_lower = output_data[outrow + 1];
    unsigned col;

    for (col = 0; 2 * col < output_width; col += 16) {
      uint8x16x2_t doubled;

      /* Both lanes of the pair hold the same 16 samples, so the interleaving
       * store emits every sample twice in succession.
       */
      doubled.val[0] = vld1q_u8(src + col);
      doubled.val[1] = doubled.val[0];
      /* The same 32 values go to both output rows. */
      vst2q_u8(dst_upper + 2 * col, doubled);
      vst2q_u8(dst_lower + 2 * col, doubled);
    }

    outrow += 2;
    inrow++;
  }
}