src/third_party/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c - cobalt - Git at Google

 /*
  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include "./vp8_rtcd.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"

 static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
                                              uint32_t stride,
                                              uint8_t *frame2_ptr,
                                              int32_t strength_in,
                                              int32_t filter_wt_in,
                                              uint32_t *acc, uint16_t *cnt)
 {
     uint32_t row;
     v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
     v16u8 frame_l, frame_h;
     v16i8 zero = { 0 };
     v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
     v8i16 diff0, diff1, cnt0, cnt1;
     v4i32 const3, const16, filter_wt, strength;
     v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
     v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
     v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
     v4i32 acc0, acc1, acc2, acc3;

     filter_wt = __msa_fill_w(filter_wt_in);
     strength = __msa_fill_w(strength_in);
     const3 = __msa_ldi_w(3);
     const16 = __msa_ldi_w(16);

     for (row = 8; row--;)
     {
         frame1_0_b = LD_SB(frame1_ptr);
         frame2_0_b = LD_SB(frame2_ptr);
         frame1_ptr += stride;
         frame2_ptr += 16;
         frame1_1_b = LD_SB(frame1_ptr);
         frame2_1_b = LD_SB(frame2_ptr);
         LD_SW2(acc, 4, acc0, acc1);
         LD_SW2(acc + 8, 4, acc2, acc3);
         LD_SH2(cnt, 8, cnt0, cnt1);
         ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
         HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
         UNPCK_SH_SW(diff0, diff0_r, diff0_l);
         UNPCK_SH_SW(diff1, diff1_r, diff1_l);
         MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
              diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
         MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
              mod0_w, mod1_w, mod2_w, mod3_w);
         SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
         diff0_r = (mod0_w < const16);
         diff0_l = (mod1_w < const16);
         diff1_r = (mod2_w < const16);
         diff1_l = (mod3_w < const16);
         SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
              mod0_w, mod1_w, mod2_w, mod3_w);
         mod0_w = diff0_r & mod0_w;
         mod1_w = diff0_l & mod1_w;
         mod2_w = diff1_r & mod2_w;
         mod3_w = diff1_l & mod3_w;
         MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
              filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
         PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h)
         ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
         ST_SH2(mod0_h, mod1_h, cnt, 8);
         cnt += 16;
         ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
         UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
         UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
         MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
              frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
         ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
              mod0_w, mod1_w, mod2_w, mod3_w);
         ST_SW2(mod0_w, mod1_w, acc, 4);
         ST_SW2(mod2_w, mod3_w, acc + 8, 4);
         acc += 16;
         LD_SW2(acc, 4, acc0, acc1);
         LD_SW2(acc + 8, 4, acc2, acc3);
         LD_SH2(cnt, 8, cnt0, cnt1);
         ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
         HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
         UNPCK_SH_SW(diff0, diff0_r, diff0_l);
         UNPCK_SH_SW(diff1, diff1_r, diff1_l);
         MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
              diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
         MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
              mod0_w, mod1_w, mod2_w, mod3_w);
         SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
         diff0_r = (mod0_w < const16);
         diff0_l = (mod1_w < const16);
         diff1_r = (mod2_w < const16);
         diff1_l = (mod3_w < const16);
         SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
              mod0_w, mod1_w, mod2_w, mod3_w);
         mod0_w = diff0_r & mod0_w;
         mod1_w = diff0_l & mod1_w;
         mod2_w = diff1_r & mod2_w;
         mod3_w = diff1_l & mod3_w;
         MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
              filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
         PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
         ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
         ST_SH2(mod0_h, mod1_h, cnt, 8);
         cnt += 16;

         UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
         UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
         UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
         MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
              frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
         ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
              mod0_w, mod1_w, mod2_w, mod3_w);
         ST_SW2(mod0_w, mod1_w, acc, 4);
         ST_SW2(mod2_w, mod3_w, acc + 8, 4);
         acc += 16;
         frame1_ptr += stride;
         frame2_ptr += 16;
     }
 }

 static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
                                             uint32_t stride,
                                             uint8_t *frame2_ptr,
                                             int32_t strength_in,
                                             int32_t filter_wt_in,
                                             uint32_t *acc, uint16_t *cnt)
 {
     uint32_t row;
     uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
     v16i8 frame1 = { 0 };
     v16i8 frame2 = { 0 };
     v16i8 frame3 = { 0 };
     v16i8 frame4 = { 0 };
     v16u8 frame_l, frame_h;
     v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
     v8i16 diff0, diff1, cnt0, cnt1;
     v4i32 const3, const16;
     v4i32 filter_wt, strength;
     v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
     v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
     v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
     v4i32 acc0, acc1, acc2, acc3;

     filter_wt = __msa_fill_w(filter_wt_in);
     strength = __msa_fill_w(strength_in);
     const3 = __msa_ldi_w(3);
     const16 = __msa_ldi_w(16);

     for (row = 2; row--;)
     {
         LD2(frame1_ptr, stride, f0, f1);
         frame1_ptr += (2 * stride);
         LD2(frame2_ptr, 8, f2, f3);
         frame2_ptr += 16;
         LD2(frame1_ptr, stride, f4, f5);
         frame1_ptr += (2 * stride);
         LD2(frame2_ptr, 8, f6, f7);
         frame2_ptr += 16;

         LD_SW2(acc, 4, acc0, acc1);
         LD_SW2(acc + 8, 4, acc2, acc3);
         LD_SH2(cnt, 8, cnt0, cnt1);
         INSERT_D2_SB(f0, f1, frame1);
         INSERT_D2_SB(f2, f3, frame2);
         INSERT_D2_SB(f4, f5, frame3);
         INSERT_D2_SB(f6, f7, frame4);
         ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
         HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
         UNPCK_SH_SW(diff0, diff0_r, diff0_l);
         UNPCK_SH_SW(diff1, diff1_r, diff1_l);
         MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
              diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
         MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
              mod0_w, mod1_w, mod2_w, mod3_w);
         SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
         diff0_r = (mod0_w < const16);
         diff0_l = (mod1_w < const16);
         diff1_r = (mod2_w < const16);
         diff1_l = (mod3_w < const16);
         SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
              mod0_w, mod1_w, mod2_w, mod3_w);
         mod0_w = diff0_r & mod0_w;
         mod1_w = diff0_l & mod1_w;
         mod2_w = diff1_r & mod2_w;
         mod3_w = diff1_l & mod3_w;
         MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
              filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
         PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
         ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
         ST_SH2(mod0_h, mod1_h, cnt, 8);
         cnt += 16;

         UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
         UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
         UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
         MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
              frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
         ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
              mod0_w, mod1_w, mod2_w, mod3_w);
         ST_SW2(mod0_w, mod1_w, acc, 4);
         ST_SW2(mod2_w, mod3_w, acc + 8, 4);
         acc += 16;

         LD_SW2(acc, 4, acc0, acc1);
         LD_SW2(acc + 8, 4, acc2, acc3);
         LD_SH2(cnt, 8, cnt0, cnt1);
         ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
         HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
         UNPCK_SH_SW(diff0, diff0_r, diff0_l);
         UNPCK_SH_SW(diff1, diff1_r, diff1_l);
         MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
              diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
         MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
              mod0_w, mod1_w, mod2_w, mod3_w);
         SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
         diff0_r = (mod0_w < const16);
         diff0_l = (mod1_w < const16);
         diff1_r = (mod2_w < const16);
         diff1_l = (mod3_w < const16);
         SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
              mod0_w, mod1_w, mod2_w, mod3_w);
         mod0_w = diff0_r & mod0_w;
         mod1_w = diff0_l & mod1_w;
         mod2_w = diff1_r & mod2_w;
         mod3_w = diff1_l & mod3_w;
         MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
              filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
         PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
         ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
         ST_SH2(mod0_h, mod1_h, cnt, 8);
         cnt += 16;

         UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
         UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
         UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
         MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
              frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
         ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
              mod0_w, mod1_w, mod2_w, mod3_w);
         ST_SW2(mod0_w, mod1_w, acc, 4);
         ST_SW2(mod2_w, mod3_w, acc + 8, 4);
         acc += 16;
     }
 }

 void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
                                    uint8_t *frame2, uint32_t block_size,
                                    int32_t strength,  int32_t filter_weight,
                                    uint32_t *accumulator, uint16_t *count)
 {
     if (8 == block_size)
     {
         temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
                                         filter_weight, accumulator, count);
     }
     else if (16 == block_size)
     {
         temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
                                          filter_weight, accumulator, count);
     }
     else
     {
         uint32_t i, j, k;
         int32_t modifier;
         int32_t byte = 0;
         const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;

         for (i = 0, k = 0; i < block_size; ++i)
         {
             for (j = 0; j < block_size; ++j, ++k)
             {
                 int src_byte = frame1[byte];
                 int pixel_value = *frame2++;

                 modifier = src_byte - pixel_value;
                 modifier *= modifier;
                 modifier *= 3;
                 modifier += rounding;
                 modifier >>= strength;

                 if (modifier > 16)
                     modifier = 16;

                 modifier = 16 - modifier;
                 modifier *= filter_weight;

                 count[k] += modifier;
                 accumulator[k] += modifier * pixel_value;

                 byte++;
             }

             byte += stride - block_size;
         }
     }
 }
	/*
	* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/

	#include "./vp8_rtcd.h"
	#include "vp8/common/mips/msa/vp8_macros_msa.h"

	static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
	uint32_t stride,
	uint8_t *frame2_ptr,
	int32_t strength_in,
	int32_t filter_wt_in,
	uint32_t acc, uint16_t cnt)
	{
	uint32_t row;
	v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
	v16u8 frame_l, frame_h;
	v16i8 zero = { 0 };
	v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
	v8i16 diff0, diff1, cnt0, cnt1;
	v4i32 const3, const16, filter_wt, strength;
	v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
	v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
	v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
	v4i32 acc0, acc1, acc2, acc3;

	filter_wt = __msa_fill_w(filter_wt_in);
	strength = __msa_fill_w(strength_in);
	const3 = __msa_ldi_w(3);
	const16 = __msa_ldi_w(16);

	for (row = 8; row--;)
	{
	frame1_0_b = LD_SB(frame1_ptr);
	frame2_0_b = LD_SB(frame2_ptr);
	frame1_ptr += stride;
	frame2_ptr += 16;
	frame1_1_b = LD_SB(frame1_ptr);
	frame2_1_b = LD_SB(frame2_ptr);
	LD_SW2(acc, 4, acc0, acc1);
	LD_SW2(acc + 8, 4, acc2, acc3);
	LD_SH2(cnt, 8, cnt0, cnt1);
	ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
	HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
	UNPCK_SH_SW(diff0, diff0_r, diff0_l);
	UNPCK_SH_SW(diff1, diff1_r, diff1_l);
	MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
	diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
	MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
	mod0_w, mod1_w, mod2_w, mod3_w);
	SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
	diff0_r = (mod0_w < const16);
	diff0_l = (mod1_w < const16);
	diff1_r = (mod2_w < const16);
	diff1_l = (mod3_w < const16);
	SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
	mod0_w, mod1_w, mod2_w, mod3_w);
	mod0_w = diff0_r & mod0_w;
	mod1_w = diff0_l & mod1_w;
	mod2_w = diff1_r & mod2_w;
	mod3_w = diff1_l & mod3_w;
	MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
	filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
	PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h)
	ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
	ST_SH2(mod0_h, mod1_h, cnt, 8);
	cnt += 16;
	ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
	UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
	UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
	MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
	frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
	ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
	mod0_w, mod1_w, mod2_w, mod3_w);
	ST_SW2(mod0_w, mod1_w, acc, 4);
	ST_SW2(mod2_w, mod3_w, acc + 8, 4);
	acc += 16;
	LD_SW2(acc, 4, acc0, acc1);
	LD_SW2(acc + 8, 4, acc2, acc3);
	LD_SH2(cnt, 8, cnt0, cnt1);
	ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
	HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
	UNPCK_SH_SW(diff0, diff0_r, diff0_l);
	UNPCK_SH_SW(diff1, diff1_r, diff1_l);
	MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
	diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
	MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
	mod0_w, mod1_w, mod2_w, mod3_w);
	SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
	diff0_r = (mod0_w < const16);
	diff0_l = (mod1_w < const16);
	diff1_r = (mod2_w < const16);
	diff1_l = (mod3_w < const16);
	SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
	mod0_w, mod1_w, mod2_w, mod3_w);
	mod0_w = diff0_r & mod0_w;
	mod1_w = diff0_l & mod1_w;
	mod2_w = diff1_r & mod2_w;
	mod3_w = diff1_l & mod3_w;
	MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
	filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
	PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
	ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
	ST_SH2(mod0_h, mod1_h, cnt, 8);
	cnt += 16;

	UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
	UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
	UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
	MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
	frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
	ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
	mod0_w, mod1_w, mod2_w, mod3_w);
	ST_SW2(mod0_w, mod1_w, acc, 4);
	ST_SW2(mod2_w, mod3_w, acc + 8, 4);
	acc += 16;
	frame1_ptr += stride;
	frame2_ptr += 16;
	}
	}

	static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
	uint32_t stride,
	uint8_t *frame2_ptr,
	int32_t strength_in,
	int32_t filter_wt_in,
	uint32_t acc, uint16_t cnt)
	{
	uint32_t row;
	uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
	v16i8 frame1 = { 0 };
	v16i8 frame2 = { 0 };
	v16i8 frame3 = { 0 };
	v16i8 frame4 = { 0 };
	v16u8 frame_l, frame_h;
	v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
	v8i16 diff0, diff1, cnt0, cnt1;
	v4i32 const3, const16;
	v4i32 filter_wt, strength;
	v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
	v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
	v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
	v4i32 acc0, acc1, acc2, acc3;

	filter_wt = __msa_fill_w(filter_wt_in);
	strength = __msa_fill_w(strength_in);
	const3 = __msa_ldi_w(3);
	const16 = __msa_ldi_w(16);

	for (row = 2; row--;)
	{
	LD2(frame1_ptr, stride, f0, f1);
	frame1_ptr += (2 * stride);
	LD2(frame2_ptr, 8, f2, f3);
	frame2_ptr += 16;
	LD2(frame1_ptr, stride, f4, f5);
	frame1_ptr += (2 * stride);
	LD2(frame2_ptr, 8, f6, f7);
	frame2_ptr += 16;

	LD_SW2(acc, 4, acc0, acc1);
	LD_SW2(acc + 8, 4, acc2, acc3);
	LD_SH2(cnt, 8, cnt0, cnt1);
	INSERT_D2_SB(f0, f1, frame1);
	INSERT_D2_SB(f2, f3, frame2);
	INSERT_D2_SB(f4, f5, frame3);
	INSERT_D2_SB(f6, f7, frame4);
	ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
	HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
	UNPCK_SH_SW(diff0, diff0_r, diff0_l);
	UNPCK_SH_SW(diff1, diff1_r, diff1_l);
	MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
	diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
	MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
	mod0_w, mod1_w, mod2_w, mod3_w);
	SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
	diff0_r = (mod0_w < const16);
	diff0_l = (mod1_w < const16);
	diff1_r = (mod2_w < const16);
	diff1_l = (mod3_w < const16);
	SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
	mod0_w, mod1_w, mod2_w, mod3_w);
	mod0_w = diff0_r & mod0_w;
	mod1_w = diff0_l & mod1_w;
	mod2_w = diff1_r & mod2_w;
	mod3_w = diff1_l & mod3_w;
	MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
	filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
	PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
	ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
	ST_SH2(mod0_h, mod1_h, cnt, 8);
	cnt += 16;

	UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
	UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
	UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
	MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
	frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
	ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
	mod0_w, mod1_w, mod2_w, mod3_w);
	ST_SW2(mod0_w, mod1_w, acc, 4);
	ST_SW2(mod2_w, mod3_w, acc + 8, 4);
	acc += 16;

	LD_SW2(acc, 4, acc0, acc1);
	LD_SW2(acc + 8, 4, acc2, acc3);
	LD_SH2(cnt, 8, cnt0, cnt1);
	ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
	HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
	UNPCK_SH_SW(diff0, diff0_r, diff0_l);
	UNPCK_SH_SW(diff1, diff1_r, diff1_l);
	MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
	diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
	MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
	mod0_w, mod1_w, mod2_w, mod3_w);
	SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
	diff0_r = (mod0_w < const16);
	diff0_l = (mod1_w < const16);
	diff1_r = (mod2_w < const16);
	diff1_l = (mod3_w < const16);
	SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
	mod0_w, mod1_w, mod2_w, mod3_w);
	mod0_w = diff0_r & mod0_w;
	mod1_w = diff0_l & mod1_w;
	mod2_w = diff1_r & mod2_w;
	mod3_w = diff1_l & mod3_w;
	MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
	filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
	PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
	ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
	ST_SH2(mod0_h, mod1_h, cnt, 8);
	cnt += 16;

	UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
	UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
	UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
	MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
	frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
	ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
	mod0_w, mod1_w, mod2_w, mod3_w);
	ST_SW2(mod0_w, mod1_w, acc, 4);
	ST_SW2(mod2_w, mod3_w, acc + 8, 4);
	acc += 16;
	}
	}

	void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
	uint8_t *frame2, uint32_t block_size,
	int32_t strength, int32_t filter_weight,
	uint32_t accumulator, uint16_t count)
	{
	if (8 == block_size)
	{
	temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
	filter_weight, accumulator, count);
	}
	else if (16 == block_size)
	{
	temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
	filter_weight, accumulator, count);
	}
	else
	{
	uint32_t i, j, k;
	int32_t modifier;
	int32_t byte = 0;
	const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;

	for (i = 0, k = 0; i < block_size; ++i)
	{
	for (j = 0; j < block_size; ++j, ++k)
	{
	int src_byte = frame1[byte];
	int pixel_value = *frame2++;

	modifier = src_byte - pixel_value;
	modifier *= modifier;
	modifier *= 3;
	modifier += rounding;
	modifier >>= strength;

	if (modifier > 16)
	modifier = 16;

	modifier = 16 - modifier;
	modifier *= filter_weight;

	count[k] += modifier;
	accumulator[k] += modifier * pixel_value;

	byte++;
	}

	byte += stride - block_size;
	}
	}
	}