/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <stdlib.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

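/* Returns the rounded average of the 64 pixels in an 8x8 block: the rows
 * are reduced with horizontal-add instructions and the total is divided
 * by 64 with rounding (__msa_srari_w by 6). */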
uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
  v4u32 sum = { 0 };

  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
  sum0 += sum4;

  sum = __msa_hadd_u_w(sum0, sum0);
  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
  sum = __msa_hadd_u_w(sum0, sum0);
  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
  sum_out = __msa_copy_u_w((v4i32)sum, 0);

  return sum_out;
}

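/* Returns the rounded average of the 16 pixels in a 4x4 block: the four
 * 32-bit rows are gathered into one vector, reduced with horizontal adds,
 * and divided by 16 with rounding (__msa_srari_w by 4). */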
uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  uint32_t src0, src1, src2, src3;
  v16u8 vec = { 0 };
  v8u16 sum0;
  v4u32 sum1;
  v2u64 sum2;

  LW4(src, src_stride, src0, src1, src2, src3);
  INSERT_W4_UB(src0, src1, src2, src3, vec);

  sum0 = __msa_hadd_u_h(vec, vec);
  sum1 = __msa_hadd_u_w(sum0, sum0);
  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
  sum1 = __msa_hadd_u_w(sum0, sum0);
  sum2 = __msa_hadd_u_d(sum1, sum1);
  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
  sum_out = __msa_copy_u_w((v4i32)sum1, 0);

  return sum_out;
}

#if !CONFIG_VP9_HIGHBITDEPTH
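/* 8x8 Hadamard transform: three BUTTERFLY_8 stages implement an 8-point
 * Hadamard butterfly along one dimension; the block is then transposed
 * and the same network is applied again to complete the 2-D transform. */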
void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
                          int16_t *dst) {
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
}

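/* 16x16 Hadamard transform built from four 8x8 transforms: each 8x8
 * quadrant is transformed into its own 64-coefficient block of dst, and
 * the four blocks are then combined with one more butterfly stage, with
 * a >> 1 to keep the result within 16-bit range. */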
void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride,
                            int16_t *dst) {
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;

  /* Load the top 8 rows: left half (columns 0-7) into src0-src7, right
   * half (columns 8-15) into src8-src15. */
  LD_SH2(src, 8, src0, src8);
  src += src_stride;
  LD_SH2(src, 8, src1, src9);
  src += src_stride;
  LD_SH2(src, 8, src2, src10);
  src += src_stride;
  LD_SH2(src, 8, src3, src11);
  src += src_stride;
  LD_SH2(src, 8, src4, src12);
  src += src_stride;
  LD_SH2(src, 8, src5, src13);
  src += src_stride;
  LD_SH2(src, 8, src6, src14);
  src += src_stride;
  LD_SH2(src, 8, src7, src15);
  src += src_stride;

  /* First butterfly stage of the top-left and top-right quadrants. */
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);

  /* Finish the 8x8 transform of the top-left quadrant and store it to the
   * first 64-coefficient block.  src11 happens to be free at this point
   * and serves as a scratch register for the transpose output. */
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src11, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8);

  /* Finish the 8x8 transform of the top-right quadrant into res0-res7.
   * Its store to dst + 64 is deferred into the middle of the next batch
   * of loads, likely a scheduling choice to overlap memory operations. */
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
                     src9, src10, src11, src12, src13, src14, src15);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
                     res1, res2, res3, res4, res5, res6, res7);

  /* Load the bottom 8 rows. */
  LD_SH2(src, 8, src0, src8);
  src += src_stride;
  LD_SH2(src, 8, src1, src9);
  src += src_stride;
  LD_SH2(src, 8, src2, src10);
  src += src_stride;
  LD_SH2(src, 8, src3, src11);
  src += src_stride;

  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8);

  LD_SH2(src, 8, src4, src12);
  src += src_stride;
  LD_SH2(src, 8, src5, src13);
  src += src_stride;
  LD_SH2(src, 8, src6, src14);
  src += src_stride;
  LD_SH2(src, 8, src7, src15);
  src += src_stride;

  /* First butterfly stage of the bottom-left and bottom-right quadrants. */
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);

  /* Finish the bottom-left 8x8 and store it to dst + 2 * 64. */
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8);

  /* Finish the bottom-right 8x8 and store it to dst + 3 * 64. */
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
                     src9, src10, src11, src12, src13, src14, src15);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
                     res1, res2, res3, res4, res5, res6, res7);
  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8);

  /* Combine the four 8x8 blocks with one more butterfly stage.  Each
   * LD_SH4 with stride 64 picks up one vector from each block; the >> 1
   * keeps the combined coefficients within 16-bit range. */
  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);

  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);

  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);

  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);

  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);

  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);

  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);

  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);

  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
}

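/* Returns the sum of absolute values of the `length` coefficients in
 * `data`.  Vector paths cover the block sizes the encoder uses (16, 64,
 * 256, 1024); any other length falls back to scalar code. */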
int vpx_satd_msa(const int16_t *data, int length) {
  int i, satd;
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
  v8i16 src8, src9, src10, src11, src12, src13, src14, src15;
  v8i16 zero = { 0 };
  v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h;
  v4u32 tmp0_w = { 0 };

  if (16 == length) {
    LD_SH2(data, 8, src0, src1);
    tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
    tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
    tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
    tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
    satd = HADD_UW_U32(tmp0_w);
  } else if (64 == length) {
    LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);

    tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
    tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
    tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
    tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
    tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
    tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
    tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
    tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);

    tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
    tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
    tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
    tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
    tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
    tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
    tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
    tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);

    satd = HADD_UW_U32(tmp0_w);
  } else if (256 == length) {
    for (i = 0; i < 2; ++i) {
      LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
      data += 8 * 8;
      LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
      data += 8 * 8;

      tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
      tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
      tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
      tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
      tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
      tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
      tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
      tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);

      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);

      tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
      tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
      tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
      tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
      tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
      tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
      tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
      tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);

      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
    }

    satd = HADD_UW_U32(tmp0_w);
  } else if (1024 == length) {
    for (i = 0; i < 8; ++i) {
      LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
      data += 8 * 8;
      LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
      data += 8 * 8;

      tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
      tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
      tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
      tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
      tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
      tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
      tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
      tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);

      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);

      tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
      tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
      tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
      tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
      tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
      tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
      tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
      tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);

      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
    }

    satd = HADD_UW_U32(tmp0_w);
  } else {
    satd = 0;

    for (i = 0; i < length; ++i) {
      satd += abs(data[i]);
    }
  }

  return satd;
}
#endif  // !CONFIG_VP9_HIGHBITDEPTH

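/* Fills hbuf[16] with a per-column projection of the 16-pixel-wide block:
 * each of the 16 columns is summed over `height` rows and divided by
 * height / 2 (shifts of 3, 4 and 5 for heights 16, 32 and 64; the scalar
 * fallback divides by norm_factor = height >> 1). */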
void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
                         const int ref_stride, const int height) {
  int i;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v8i16 hbuf_r = { 0 };
  v8i16 hbuf_l = { 0 };
  v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
  v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;

  if (16 == height) {
    for (i = 2; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    SRA_2V(hbuf_r, hbuf_l, 3);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else if (32 == height) {
    for (i = 2; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    SRA_2V(hbuf_r, hbuf_l, 4);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else if (64 == height) {
    for (i = 4; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    SRA_2V(hbuf_r, hbuf_l, 5);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else {
    const int norm_factor = height >> 1;
    int cnt;

    for (cnt = 0; cnt < 16; cnt++) {
      hbuf[cnt] = 0;
    }

    for (i = 0; i < height; ++i) {
      for (cnt = 0; cnt < 16; cnt++) {
        hbuf[cnt] += ref[cnt];
      }

      ref += ref_stride;
    }

    for (cnt = 0; cnt < 16; cnt++) {
      hbuf[cnt] /= norm_factor;
    }
  }
}

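/* Returns the sum of the first `width` pixels of a single row of ref. */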
int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
  int16_t sum;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 ref0_h;

  if (16 == width) {
    ref0 = LD_UB(ref);
    ref0_h = __msa_hadd_u_h(ref0, ref0);
    sum = HADD_UH_U32(ref0_h);
  } else if (32 == width) {
    LD_UB2(ref, 16, ref0, ref1);
    ref0_h = __msa_hadd_u_h(ref0, ref0);
    ref0_h += __msa_hadd_u_h(ref1, ref1);
    sum = HADD_UH_U32(ref0_h);
  } else if (64 == width) {
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref0_h = __msa_hadd_u_h(ref0, ref0);
    ref0_h += __msa_hadd_u_h(ref1, ref1);
    ref0_h += __msa_hadd_u_h(ref2, ref2);
    ref0_h += __msa_hadd_u_h(ref3, ref3);
    sum = HADD_UH_U32(ref0_h);
  } else {
    int idx;

    sum = 0;
    for (idx = 0; idx < width; ++idx) {
      sum += ref[idx];
    }
  }

  return sum;
}

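/* Returns a variance measure of the difference between two vectors of
 * length 4 << bwl: sse - mean^2 / n, where n = 4 << bwl, so the division
 * is the shift by (bwl + 2) at the end of the function. */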
int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) {
  int sse, mean, var;
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2;
  v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m;
  v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m;
  v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m;
  v4i32 res_l7_m, mean_v;
  v2i64 sse_v;

  if (2 == bwl) {
    LD_SH2(src, 8, src0, src1);
    LD_SH2(ref, 8, ref0, ref1);

    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;

    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);

    mean = HADD_SW_S32(mean_v);
  } else if (3 == bwl) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    LD_SH4(ref, 8, ref0, ref1, ref2, ref3);

    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);

    mean = HADD_SW_S32(mean_v);
  } else if (4 == bwl) {
    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);

    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v += res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);

    mean = HADD_SW_S32(mean_v);
  } else {
    int i;
    const int width = 4 << bwl;

    sse = 0;
    mean = 0;

    for (i = 0; i < width; ++i) {
      const int diff = ref[i] - src[i];

      mean += diff;
      sse += diff * diff;
    }
  }

  var = sse - ((mean * mean) >> (bwl + 2));

  return var;
}

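/* Computes the minimum and maximum absolute difference between two 8x8
 * blocks.  The per-lane min/max vectors are reduced by repeatedly folding
 * the vector in half with __msa_sldi_b. */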
void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp,
                        int *min, int *max) {
  v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7;
  v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1;

  LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
  LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7);
  PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3);
  PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3);

  diff0 = __msa_asub_u_b(s0, d0);
  diff1 = __msa_asub_u_b(s1, d1);
  diff2 = __msa_asub_u_b(s2, d2);
  diff3 = __msa_asub_u_b(s3, d3);

  min0 = __msa_min_u_b(diff0, diff1);
  min1 = __msa_min_u_b(diff2, diff3);
  min0 = __msa_min_u_b(min0, min1);

  max0 = __msa_max_u_b(diff0, diff1);
  max1 = __msa_max_u_b(diff2, diff3);
  max0 = __msa_max_u_b(max0, max1);

  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8);
  min0 = __msa_min_u_b(min0, min1);
  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8);
  max0 = __msa_max_u_b(max0, max1);

  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4);
  min0 = __msa_min_u_b(min0, min1);
  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4);
  max0 = __msa_max_u_b(max0, max1);

  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2);
  min0 = __msa_min_u_b(min0, min1);
  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2);
  max0 = __msa_max_u_b(max0, max1);

  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1);
  min0 = __msa_min_u_b(min0, min1);
  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1);
  max0 = __msa_max_u_b(max0, max1);

  *min = min0[0];
  *max = max0[0];
}