third_party/libwebp/src/dsp/rescaler_msa.c - cobalt - Git at Google

 // Copyright 2016 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // MSA version of rescaling functions
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)

 #include "src/dsp/dsp.h"

 #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)

 #include <assert.h>

 #include "src/utils/rescaler_utils.h"
 #include "src/dsp/msa_macro.h"

 // Half of WEBP_RESCALER_ONE; added before the right shift in MULT_FIX.
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 // Fixed-point multiply: the product of x and y is formed in 64 bits, ROUNDER
 // is added and the sum is shifted right by WEBP_RESCALER_RFIX bits.
 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
 // Same as MULT_FIX but without the ROUNDER term (plain truncating shift).
 #define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)

 // Scales the sixteen 32-bit values held in the four vectors in0..in3 by
 // 'scale', shifts them by 'shift' and packs the results into the 16 bytes of
 // 'dst'.
 // Requires a variable named 'zero' in the enclosing scope. Declares its own
 // locals tmp0-3, t0-5 and out0-3, so arguments must not use those names.
 // NOTE(review): the lane semantics of ILVRL_W2_UW / DOTP_UW2_UD / SRAR_D4_UD /
 // PCKEV_B2_UB live in msa_macro.h, which is not visible here. Presumably the
 // interleave with 'zero' widens each value to 64 bits, DOTP forms the product,
 // SRAR is a rounding right shift (matching MULT_FIX) and the PCKEV steps keep
 // only the low byte of each result, i.e. no clamp to 255 -- confirm.
 #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
   v4u32 tmp0, tmp1, tmp2, tmp3;                                       \
   v16u8 t0, t1, t2, t3, t4, t5;                                       \
   v2u64 out0, out1, out2, out3;                                       \
   ILVRL_W2_UW(zero, in0, tmp0, tmp1);                                 \
   ILVRL_W2_UW(zero, in1, tmp2, tmp3);                                 \
   DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
   DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
   SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
   PCKEV_B2_UB(out1, out0, out3, out2, t0, t1);                        \
   ILVRL_W2_UW(zero, in2, tmp0, tmp1);                                 \
   ILVRL_W2_UW(zero, in3, tmp2, tmp3);                                 \
   DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
   DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
   SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
   PCKEV_B2_UB(out1, out0, out3, out2, t2, t3);                        \
   PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);                                \
   dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);                   \
 } while (0)

 // Four-value variant of CALC_MULT_FIX_16: scales the four 32-bit values of
 // vector in0 by 'scale', shifts by 'shift', and leaves the packed result in
 // the 32-bit scalar 'dst' (word 0 of the packed vector).
 // Requires a variable named 'zero' in the enclosing scope. Declares locals
 // tmp0-1, t0-1 and out0-1, so arguments must not use those names.
 // NOTE(review): presumably each of the four bytes of 'dst' is the low byte of
 // one scaled value, without saturation -- confirm against msa_macro.h.
 #define CALC_MULT_FIX_4(in0, scale, shift, dst) do {  \
   v4u32 tmp0, tmp1;                                   \
   v16i8 t0, t1;                                       \
   v2u64 out0, out1;                                   \
   ILVRL_W2_UW(zero, in0, tmp0, tmp1);                 \
   DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);  \
   SRAR_D2_UD(out0, out1, shift);                      \
   t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);       \
   t1 = __msa_pckev_b(t0, t0);                         \
   t0 = __msa_pckev_b(t1, t1);                         \
   dst = __msa_copy_s_w((v4i32)t0, 0);                 \
 } while (0)

 // Scales the sixteen 32-bit values in in0..in3 by 'fyscale', shifts by 'shift'
 // and packs the results back into four word vectors dst0..dst3 (results stay
 // 32 bits wide instead of being narrowed to bytes).
 // Requires a variable named 'zero' in the enclosing scope. Declares locals
 // tmp0-3 and out0-3, so arguments must not use those names.
 // NOTE(review): SRAR_D4_UD is presumably a *rounding* shift, whereas the
 // scalar code paired with this macro uses MULT_FIX_FLOOR -- confirm against
 // msa_macro.h.
 #define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
                           dst0, dst1, dst2, dst3) do {         \
   v4u32 tmp0, tmp1, tmp2, tmp3;                                \
   v2u64 out0, out1, out2, out3;                                \
   ILVRL_W2_UW(zero, in0, tmp0, tmp1);                          \
   ILVRL_W2_UW(zero, in1, tmp2, tmp3);                          \
   DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
   DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
   SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
   PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1);             \
   ILVRL_W2_UW(zero, in2, tmp0, tmp1);                          \
   ILVRL_W2_UW(zero, in3, tmp2, tmp3);                          \
   DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
   DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
   SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
   PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3);             \
 } while (0)

 // Four-value variant of CALC_MULT_FIX1_16: scales the four 32-bit values of
 // in0 by 'scale', shifts by 'shift' and packs the results into the word
 // vector 'dst'.
 // Requires a variable named 'zero' in the enclosing scope. Declares locals
 // tmp0-1 and out0-1, so arguments must not use those names.
 #define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {    \
   v4u32 tmp0, tmp1;                                      \
   v2u64 out0, out1;                                      \
   ILVRL_W2_UW(zero, in0, tmp0, tmp1);                    \
   DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);     \
   SRAR_D2_UD(out0, out1, shift);                         \
   dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0);  \
 } while (0)

 // Two-stage fixed-point computation on eight value pairs: in0/in1 are
 // interleaved with in2/in3, combined with the weight vector 'mult' through a
 // dot product and shifted by 'shift'; the intermediate is then multiplied by
 // 'scale', shifted again and packed into dst0/dst1.
 // Unlike CALC_MULT_FIX_16 this macro does not reference 'zero'. Declares
 // locals tmp0-3 and out0-3, so arguments must not use those names.
 // NOTE(review): presumably the vector counterpart of
 //   J = (A * frow + B * irow + ROUNDER) >> RFIX;  v = MULT_FIX(J, scale)
 // with 'mult' holding the interleaved A/B weights -- lane order depends on
 // ILVRL_W2_UW / DOTP_UW2_UD in msa_macro.h; confirm.
 #define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
                           dst0, dst1) do {                         \
   v4u32 tmp0, tmp1, tmp2, tmp3;                                    \
   v2u64 out0, out1, out2, out3;                                    \
   ILVRL_W2_UW(in0, in2, tmp0, tmp1);                               \
   ILVRL_W2_UW(in1, in3, tmp2, tmp3);                               \
   DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                 \
   DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3);                 \
   SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
   DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);               \
   DOTP_UW2_UD(out2, out3, scale, scale, out2, out3);               \
   SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
   PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1);                 \
 } while (0)

 // Four-pair variant of CALC_MULT_FIX2_16: interleaves in0 with in1, applies
 // the weight vector 'mult', shifts, multiplies by 'scale', shifts again and
 // leaves the packed result in the 32-bit scalar 'dst' (word 0 of the packed
 // vector).
 // Does not reference 'zero'. Declares locals tmp0-1, out0-1 and t0-1, so
 // arguments must not use those names.
 // NOTE(review): presumably the four bytes of 'dst' are the low bytes of the
 // four results, without saturation -- confirm against msa_macro.h.
 #define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
   v4u32 tmp0, tmp1;                                               \
   v2u64 out0, out1;                                               \
   v16i8 t0, t1;                                                   \
   ILVRL_W2_UW(in0, in1, tmp0, tmp1);                              \
   DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                \
   SRAR_D2_UD(out0, out1, shift);                                  \
   DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);              \
   SRAR_D2_UD(out0, out1, shift);                                  \
   t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);                   \
   t1 = __msa_pckev_b(t0, t0);                                     \
   t0 = __msa_pckev_b(t1, t1);                                     \
   dst = __msa_copy_s_w((v4i32)t0, 0);                             \
 } while (0)

 // Converts 'length' values of 'frow' to bytes in 'dst', scaling each one by
 // wrk->fy_scale in fixed point.
 // Whole groups of 16 and then 4 values go through the MSA macros; the last
 // 0..3 values use the scalar MULT_FIX with an explicit clamp to 255.
 static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
                                           int length,
                                           WebPRescaler* const wrk) {
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   const v4i32 zero = { 0 };  // read by name inside the CALC_MULT_FIX_* macros
   int i;

   // Bulk of the row: 16 values per iteration.
   for (; length >= 16; length -= 16, frow += 16, dst += 16) {
     v4u32 src0, src1, src2, src3;
     v16u8 out;
     LD_UW4(frow, 4, src0, src1, src2, src3);
     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
     ST_UB(out, dst);
   }
   if (length <= 0) return;

   // 1..15 values are left; (length >> 2) is the number of whole groups of 4.
   switch (length >> 2) {
     case 3: {
       uint32_t val0_m, val1_m, val2_m;
       v4u32 src0, src1, src2;
       LD_UW3(frow, 4, src0, src1, src2);
       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
       SW3(val0_m, val1_m, val2_m, dst, 4);
       length -= 12;
       frow   += 12;
       dst    += 12;
       break;
     }
     case 2: {
       uint32_t val0_m, val1_m;
       v4u32 src0, src1;
       LD_UW2(frow, 4, src0, src1);
       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
       SW2(val0_m, val1_m, dst, 4);
       length -= 8;
       frow   += 8;
       dst    += 8;
       break;
     }
     case 1: {
       uint32_t val0_m;
       const v4u32 src0 = LD_UW(frow);
       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
       SW(val0_m, dst);
       length -= 4;
       frow   += 4;
       dst    += 4;
       break;
     }
     default:
       break;
   }

   // Scalar tail for the final 0..3 values.
   for (i = 0; i < length; ++i) {
     const int v = (int)MULT_FIX(frow[i], wrk->fy_scale);
     dst[i] = (v > 255) ? 255u : (uint8_t)v;
   }
 }

 // Converts 'length' value pairs from 'frow' and 'irow' to bytes in 'dst'.
 // Each output is a weighted blend of the two rows followed by a scaling step;
 // in scalar form (see the tail loop):
 //   I = A * frow[x] + B * irow[x]
 //   J = (I + ROUNDER) >> WEBP_RESCALER_RFIX
 //   dst[x] = min(MULT_FIX(J, wrk->fy_scale), 255)
 // with B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub) and
 // A = WEBP_RESCALER_ONE - B.
 // Whole groups of 16 and then 4 pairs go through the MSA macros; the last
 // 0..3 pairs use the scalar code.
 // NOTE(review): the vector paths presumably keep only the low byte of each
 // result (no clamp to 255), unlike the scalar tail -- confirm against
 // msa_macro.h.
 static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
                                           uint8_t* dst, int length,
                                           WebPRescaler* const wrk) {
   const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
   const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
   const v4i32 B1 = __msa_fill_w(B);
   const v4i32 A1 = __msa_fill_w(A);
   // Interleaved A/B weights consumed by the dot products in CALC_MULT_FIX2_*.
   const v4i32 AB = __msa_ilvr_w(A1, B1);
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);

   while (length >= 16) {
     v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
     v16u8 t0, t1, t2, t3, t4, t5;
     LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
     LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
     CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
     CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
     PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
     t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
     ST_UB(t0, dst);
     frow   += 16;
     irow   += 16;
     dst    += 16;
     length -= 16;
   }
   if (length > 0) {
     int x_out;
     if (length >= 12) {
       uint32_t val0_m, val1_m, val2_m;
       v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
       LD_UW3(frow, 4, frow0, frow1, frow2);
       LD_UW3(irow, 4, irow0, irow1, irow2);
       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
       CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
       SW3(val0_m, val1_m, val2_m, dst, 4);
       frow   += 12;
       irow   += 12;
       dst    += 12;
       length -= 12;
     } else if (length >= 8) {
       uint32_t val0_m, val1_m;
       v4u32 frow0, frow1, irow0, irow1;
       LD_UW2(frow, 4, frow0, frow1);
       LD_UW2(irow, 4, irow0, irow1);
       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
       SW2(val0_m, val1_m, dst, 4);
       // Eight values were converted and stored above, so step past all eight.
       // Advancing by only four made the scalar loop below redo values 4..7.
       frow   += 8;
       irow   += 8;
       dst    += 8;
       length -= 8;
     } else if (length >= 4) {
       uint32_t val0_m;
       const v4u32 frow0 = LD_UW(frow + 0);
       const v4u32 irow0 = LD_UW(irow + 0);
       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
       SW(val0_m, dst);
       frow   += 4;
       irow   += 4;
       dst    += 4;
       length -= 4;
     }
     // Scalar tail for the final 0..3 pairs.
     for (x_out = 0; x_out < length; ++x_out) {
       const uint64_t I = (uint64_t)A * frow[x_out]
                        + (uint64_t)B * irow[x_out];
       const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
       const int v = (int)MULT_FIX(J, wrk->fy_scale);
       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
     }
   }
 }

 // Row-export entry for the expanding case. Processes
 // dst_width * num_channels values and picks the helper from wrk->y_accum:
 // zero uses the frow-only path, anything else the frow/irow blend.
 static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
   const int num_values = wrk->dst_width * wrk->num_channels;
   assert(!WebPRescalerOutputDone(wrk));
   assert(wrk->y_accum <= 0);
   assert(wrk->y_expand);
   assert(wrk->y_sub != 0);
   if (wrk->y_accum != 0) {
     ExportRowExpand_1(wrk->frow, wrk->irow, wrk->dst, num_values, wrk);
     return;
   }
   ExportRowExpand_0(wrk->frow, wrk->dst, num_values, wrk);
 }

 #if 0  // disabled for now. TODO(skal): make match the C-code
 // Shrink export with a non-zero 'yscale'. For each of the 'length' values,
 // in scalar form (see the tail loop):
 //   frac    = MULT_FIX_FLOOR(frow[x], yscale)
 //   dst[x]  = min(MULT_FIX(irow[x] - frac, wrk->fxy_scale), 255)
 //   irow[x] = frac
 // Whole groups of 16 and then 4 values go through the MSA macros.
 // NOTE(review): the vector paths compute 'frac' with CALC_MULT_FIX1_*, which
 // goes through SRAR_*; if that is a rounding shift it differs from the
 // truncating MULT_FIX_FLOOR used in the tail loop. Possibly related to the
 // TODO on the enclosing '#if 0' -- confirm before enabling.
 static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
                                           uint8_t* dst, int length,
                                           const uint32_t yscale,
                                           WebPRescaler* const wrk) {
   const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
   const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
   const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   // Read by name inside the CALC_MULT_FIX_* / CALC_MULT_FIX1_* macros.
   const v4i32 zero = { 0 };

   while (length >= 16) {
     v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
     v16u8 out;
     LD_UW4(frow, 4, src0, src1, src2, src3);
     CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
                       frac0, frac1, frac2, frac3);
     // src0..3 are reused for the irow values from here on.
     LD_UW4(irow, 4, src0, src1, src2, src3);
     SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
          src0, src1, src2, src3);
     CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
     ST_UB(out, dst);
     // irow is overwritten with the fractions, as in the scalar tail.
     ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
     frow   += 16;
     irow   += 16;
     dst    += 16;
     length -= 16;
   }
   if (length > 0) {
     int x_out;
     if (length >= 12) {
       uint32_t val0_m, val1_m, val2_m;
       v4u32 src0, src1, src2, frac0, frac1, frac2;
       LD_UW3(frow, 4, src0, src1, src2);
       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
       CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
       LD_UW3(irow, 4, src0, src1, src2);
       SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
       CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
       SW3(val0_m, val1_m, val2_m, dst, 4);
       ST_UW3(frac0, frac1, frac2, irow, 4);
       frow   += 12;
       irow   += 12;
       dst    += 12;
       length -= 12;
     } else if (length >= 8) {
       uint32_t val0_m, val1_m;
       v4u32 src0, src1, frac0, frac1;
       LD_UW2(frow, 4, src0, src1);
       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
       LD_UW2(irow, 4, src0, src1);
       SUB2(src0, frac0, src1, frac1, src0, src1);
       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
       SW2(val0_m, val1_m, dst, 4);
       ST_UW2(frac0, frac1, irow, 4);
       frow   += 8;
       irow   += 8;
       dst    += 8;
       length -= 8;
     } else if (length >= 4) {
       uint32_t val0_m;
       v4u32 frac0;
       v4u32 src0 = LD_UW(frow);
       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
       src0 = LD_UW(irow);
       src0 = src0 - frac0;
       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
       SW(val0_m, dst);
       ST_UW(frac0, irow);
       frow   += 4;
       irow   += 4;
       dst    += 4;
       length -= 4;
     }
     // Scalar tail for the final 0..3 values.
     for (x_out = 0; x_out < length; ++x_out) {
       const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
       const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
       irow[x_out] = frac;
     }
   }
 }

 // Shrink export used when the caller's 'yscale' is zero. For each of the
 // 'length' values, in scalar form (see the tail loop):
 //   dst[x]  = min(MULT_FIX(irow[x], wrk->fxy_scale), 255)
 //   irow[x] = 0
 // Whole groups of 16 and then 4 values go through the MSA macros, which also
 // store zeros back to irow.
 // NOTE(review): the vector paths presumably keep only the low byte of each
 // result (no clamp to 255) -- confirm against msa_macro.h.
 static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
                                           int length,
                                           WebPRescaler* const wrk) {
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   // Used both by the CALC_MULT_FIX_* macros and to clear irow.
   const v4i32 zero = { 0 };

   while (length >= 16) {
     v4u32 src0, src1, src2, src3;
     v16u8 dst0;
     LD_UW4(irow, 4, src0, src1, src2, src3);
     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
     ST_UB(dst0, dst);
     ST_SW4(zero, zero, zero, zero, irow, 4);
     length -= 16;
     irow   += 16;
     dst    += 16;
   }
   if (length > 0) {
     int x_out;
     if (length >= 12) {
       uint32_t val0_m, val1_m, val2_m;
       v4u32 src0, src1, src2;
       LD_UW3(irow, 4, src0, src1, src2);
       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
       SW3(val0_m, val1_m, val2_m, dst, 4);
       ST_SW3(zero, zero, zero, irow, 4);
       length -= 12;
       irow   += 12;
       dst    += 12;
     } else if (length >= 8) {
       uint32_t val0_m, val1_m;
       v4u32 src0, src1;
       LD_UW2(irow, 4, src0, src1);
       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
       SW2(val0_m, val1_m, dst, 4);
       ST_SW2(zero, zero, irow, 4);
       length -= 8;
       irow   += 8;
       dst    += 8;
     } else if (length >= 4) {
       uint32_t val0_m;
       const v4u32 src0 = LD_UW(irow + 0);
       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
       SW(val0_m, dst);
       ST_SW(zero, irow);
       length -= 4;
       irow   += 4;
       dst    += 4;
     }
     // Scalar tail for the final 0..3 values.
     for (x_out = 0; x_out < length; ++x_out) {
       const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
       irow[x_out] = 0;
     }
   }
 }

 // Row-export entry for the shrinking case. Processes
 // dst_width * num_channels values; yscale = fy_scale * (-y_accum) selects the
 // helper: zero uses the irow-only path, anything else the frow/irow path.
 static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
   const int num_values = wrk->dst_width * wrk->num_channels;
   const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
   assert(!WebPRescalerOutputDone(wrk));
   assert(wrk->y_accum <= 0);
   assert(!wrk->y_expand);
   if (yscale == 0) {
     ExportRowShrink_1(wrk->irow, wrk->dst, num_values, wrk);
   } else {
     ExportRowShrink_0(wrk->frow, wrk->irow, wrk->dst, num_values, yscale, wrk);
   }
 }
 #endif  // 0

 //------------------------------------------------------------------------------
 // Entry point

 extern void WebPRescalerDspInitMSA(void);

 // Installs the MSA row-export routine into the WebPRescalerExportRowExpand
 // function pointer. The shrink pointer is deliberately left untouched: its
 // MSA implementation is compiled out, so the assignment below stays
 // commented.
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
   WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
 //  WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
 }

 #else     // !WEBP_USE_MSA

 // Fallback when WEBP_USE_MSA is undefined or WEBP_REDUCE_SIZE is defined.
 // NOTE(review): WEBP_DSP_INIT_STUB is declared outside this file; presumably
 // it emits a do-nothing WebPRescalerDspInitMSA() -- confirm in dsp.h.
 WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)

 #endif    // WEBP_USE_MSA
	// Copyright 2016 Google Inc. All Rights Reserved.
	//
	// Use of this source code is governed by a BSD-style license
	// that can be found in the COPYING file in the root of the source
	// tree. An additional intellectual property rights grant can be found
	// in the file PATENTS. All contributing project authors may
	// be found in the AUTHORS file in the root of the source tree.
	// -----------------------------------------------------------------------------
	//
	// MSA version of rescaling functions
	//
	// Author: Prashant Patil (prashant.patil@imgtec.com)

	#include "src/dsp/dsp.h"

	#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)

	#include <assert.h>

	#include "src/utils/rescaler_utils.h"
	#include "src/dsp/msa_macro.h"

	// Half of WEBP_RESCALER_ONE; added before the right shift in MULT_FIX.
	#define ROUNDER (WEBP_RESCALER_ONE >> 1)
	// Fixed-point multiply: the product of x and y is formed in 64 bits, ROUNDER
	// is added and the sum is shifted right by WEBP_RESCALER_RFIX bits.
	#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
	// Same as MULT_FIX but without the ROUNDER term (plain truncating shift).
	#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)

	// Scales the sixteen 32-bit values held in the four vectors in0..in3 by
	// 'scale', shifts them by 'shift' and packs the results into the 16 bytes of
	// 'dst'.
	// Requires a variable named 'zero' in the enclosing scope. Declares its own
	// locals tmp0-3, t0-5 and out0-3, so arguments must not use those names.
	// NOTE(review): the lane semantics of ILVRL_W2_UW / DOTP_UW2_UD / SRAR_D4_UD /
	// PCKEV_B2_UB live in msa_macro.h, which is not visible here. Presumably the
	// interleave with 'zero' widens each value to 64 bits, DOTP forms the product,
	// SRAR is a rounding right shift (matching MULT_FIX) and the PCKEV steps keep
	// only the low byte of each result, i.e. no clamp to 255 -- confirm.
	#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \
	v4u32 tmp0, tmp1, tmp2, tmp3; \
	v16u8 t0, t1, t2, t3, t4, t5; \
	v2u64 out0, out1, out2, out3; \
	ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
	ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
	DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
	DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
	SRAR_D4_UD(out0, out1, out2, out3, shift); \
	PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \
	ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
	ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
	DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
	DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
	SRAR_D4_UD(out0, out1, out2, out3, shift); \
	PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \
	PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \
	dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \
	} while (0)

	// Four-value variant of CALC_MULT_FIX_16: scales the four 32-bit values of
	// vector in0 by 'scale', shifts by 'shift', and leaves the packed result in
	// the 32-bit scalar 'dst' (word 0 of the packed vector).
	// Requires a variable named 'zero' in the enclosing scope. Declares locals
	// tmp0-1, t0-1 and out0-1, so arguments must not use those names.
	// NOTE(review): presumably each of the four bytes of 'dst' is the low byte of
	// one scaled value, without saturation -- confirm against msa_macro.h.
	#define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \
	v4u32 tmp0, tmp1; \
	v16i8 t0, t1; \
	v2u64 out0, out1; \
	ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
	DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
	SRAR_D2_UD(out0, out1, shift); \
	t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
	t1 = __msa_pckev_b(t0, t0); \
	t0 = __msa_pckev_b(t1, t1); \
	dst = __msa_copy_s_w((v4i32)t0, 0); \
	} while (0)

	// Scales the sixteen 32-bit values in in0..in3 by 'fyscale', shifts by 'shift'
	// and packs the results back into four word vectors dst0..dst3 (results stay
	// 32 bits wide instead of being narrowed to bytes).
	// Requires a variable named 'zero' in the enclosing scope. Declares locals
	// tmp0-3 and out0-3, so arguments must not use those names.
	// NOTE(review): SRAR_D4_UD is presumably a *rounding* shift, whereas the
	// scalar code paired with this macro uses MULT_FIX_FLOOR -- confirm against
	// msa_macro.h.
	#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \
	dst0, dst1, dst2, dst3) do { \
	v4u32 tmp0, tmp1, tmp2, tmp3; \
	v2u64 out0, out1, out2, out3; \
	ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
	ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
	DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
	DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
	SRAR_D4_UD(out0, out1, out2, out3, shift); \
	PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \
	ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
	ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
	DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
	DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
	SRAR_D4_UD(out0, out1, out2, out3, shift); \
	PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \
	} while (0)

	// Four-value variant of CALC_MULT_FIX1_16: scales the four 32-bit values of
	// in0 by 'scale', shifts by 'shift' and packs the results into the word
	// vector 'dst'.
	// Requires a variable named 'zero' in the enclosing scope. Declares locals
	// tmp0-1 and out0-1, so arguments must not use those names.
	#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \
	v4u32 tmp0, tmp1; \
	v2u64 out0, out1; \
	ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
	DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
	SRAR_D2_UD(out0, out1, shift); \
	dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \
	} while (0)

	// Two-stage fixed-point computation on eight value pairs: in0/in1 are
	// interleaved with in2/in3, combined with the weight vector 'mult' through a
	// dot product and shifted by 'shift'; the intermediate is then multiplied by
	// 'scale', shifted again and packed into dst0/dst1.
	// Unlike CALC_MULT_FIX_16 this macro does not reference 'zero'. Declares
	// locals tmp0-3 and out0-3, so arguments must not use those names.
	// NOTE(review): presumably the vector counterpart of
	//   J = (A * frow + B * irow + ROUNDER) >> RFIX;  v = MULT_FIX(J, scale)
	// with 'mult' holding the interleaved A/B weights -- lane order depends on
	// ILVRL_W2_UW / DOTP_UW2_UD in msa_macro.h; confirm.
	#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \
	dst0, dst1) do { \
	v4u32 tmp0, tmp1, tmp2, tmp3; \
	v2u64 out0, out1, out2, out3; \
	ILVRL_W2_UW(in0, in2, tmp0, tmp1); \
	ILVRL_W2_UW(in1, in3, tmp2, tmp3); \
	DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
	DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \
	SRAR_D4_UD(out0, out1, out2, out3, shift); \
	DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
	DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \
	SRAR_D4_UD(out0, out1, out2, out3, shift); \
	PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \
	} while (0)

	// Four-pair variant of CALC_MULT_FIX2_16: interleaves in0 with in1, applies
	// the weight vector 'mult', shifts, multiplies by 'scale', shifts again and
	// leaves the packed result in the 32-bit scalar 'dst' (word 0 of the packed
	// vector).
	// Does not reference 'zero'. Declares locals tmp0-1, out0-1 and t0-1, so
	// arguments must not use those names.
	// NOTE(review): presumably the four bytes of 'dst' are the low bytes of the
	// four results, without saturation -- confirm against msa_macro.h.
	#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \
	v4u32 tmp0, tmp1; \
	v2u64 out0, out1; \
	v16i8 t0, t1; \
	ILVRL_W2_UW(in0, in1, tmp0, tmp1); \
	DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
	SRAR_D2_UD(out0, out1, shift); \
	DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
	SRAR_D2_UD(out0, out1, shift); \
	t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
	t1 = __msa_pckev_b(t0, t0); \
	t0 = __msa_pckev_b(t1, t1); \
	dst = __msa_copy_s_w((v4i32)t0, 0); \
	} while (0)

	// Converts 'length' values of 'frow' to bytes in 'dst', scaling each one by
	// wrk->fy_scale in fixed point.
	// Whole groups of 16 and then 4 values go through the MSA macros; the last
	// 0..3 values use the scalar MULT_FIX with an explicit clamp to 255.
	static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
	                                          int length,
	                                          WebPRescaler* const wrk) {
	  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
	  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
	  const v4i32 zero = { 0 };  // read by name inside the CALC_MULT_FIX_* macros
	  int i;

	  // Bulk of the row: 16 values per iteration.
	  for (; length >= 16; length -= 16, frow += 16, dst += 16) {
	    v4u32 src0, src1, src2, src3;
	    v16u8 out;
	    LD_UW4(frow, 4, src0, src1, src2, src3);
	    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
	    ST_UB(out, dst);
	  }
	  if (length <= 0) return;

	  // 1..15 values are left; (length >> 2) is the number of whole groups of 4.
	  switch (length >> 2) {
	    case 3: {
	      uint32_t val0_m, val1_m, val2_m;
	      v4u32 src0, src1, src2;
	      LD_UW3(frow, 4, src0, src1, src2);
	      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
	      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
	      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
	      SW3(val0_m, val1_m, val2_m, dst, 4);
	      length -= 12;
	      frow += 12;
	      dst += 12;
	      break;
	    }
	    case 2: {
	      uint32_t val0_m, val1_m;
	      v4u32 src0, src1;
	      LD_UW2(frow, 4, src0, src1);
	      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
	      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
	      SW2(val0_m, val1_m, dst, 4);
	      length -= 8;
	      frow += 8;
	      dst += 8;
	      break;
	    }
	    case 1: {
	      uint32_t val0_m;
	      const v4u32 src0 = LD_UW(frow);
	      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
	      SW(val0_m, dst);
	      length -= 4;
	      frow += 4;
	      dst += 4;
	      break;
	    }
	    default:
	      break;
	  }

	  // Scalar tail for the final 0..3 values.
	  for (i = 0; i < length; ++i) {
	    const int v = (int)MULT_FIX(frow[i], wrk->fy_scale);
	    dst[i] = (v > 255) ? 255u : (uint8_t)v;
	  }
	}

	// Converts 'length' value pairs from 'frow' and 'irow' to bytes in 'dst'.
	// Each output is a weighted blend of the two rows followed by a scaling step;
	// in scalar form (see the tail loop):
	//   I = A * frow[x] + B * irow[x]
	//   J = (I + ROUNDER) >> WEBP_RESCALER_RFIX
	//   dst[x] = min(MULT_FIX(J, wrk->fy_scale), 255)
	// with B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub) and
	// A = WEBP_RESCALER_ONE - B.
	// Whole groups of 16 and then 4 pairs go through the MSA macros; the last
	// 0..3 pairs use the scalar code.
	// NOTE(review): the vector paths presumably keep only the low byte of each
	// result (no clamp to 255), unlike the scalar tail -- confirm against
	// msa_macro.h.
	static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
	                                          uint8_t* dst, int length,
	                                          WebPRescaler* const wrk) {
	  const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
	  const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
	  const v4i32 B1 = __msa_fill_w(B);
	  const v4i32 A1 = __msa_fill_w(A);
	  // Interleaved A/B weights consumed by the dot products in CALC_MULT_FIX2_*.
	  const v4i32 AB = __msa_ilvr_w(A1, B1);
	  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
	  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);

	  while (length >= 16) {
	    v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
	    v16u8 t0, t1, t2, t3, t4, t5;
	    LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
	    LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
	    CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
	    CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
	    PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
	    t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
	    ST_UB(t0, dst);
	    frow += 16;
	    irow += 16;
	    dst += 16;
	    length -= 16;
	  }
	  if (length > 0) {
	    int x_out;
	    if (length >= 12) {
	      uint32_t val0_m, val1_m, val2_m;
	      v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
	      LD_UW3(frow, 4, frow0, frow1, frow2);
	      LD_UW3(irow, 4, irow0, irow1, irow2);
	      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
	      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
	      CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
	      SW3(val0_m, val1_m, val2_m, dst, 4);
	      frow += 12;
	      irow += 12;
	      dst += 12;
	      length -= 12;
	    } else if (length >= 8) {
	      uint32_t val0_m, val1_m;
	      v4u32 frow0, frow1, irow0, irow1;
	      LD_UW2(frow, 4, frow0, frow1);
	      LD_UW2(irow, 4, irow0, irow1);
	      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
	      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
	      SW2(val0_m, val1_m, dst, 4);
	      // Eight values were converted and stored above, so step past all
	      // eight. Advancing by only four made the scalar loop below redo
	      // values 4..7.
	      frow += 8;
	      irow += 8;
	      dst += 8;
	      length -= 8;
	    } else if (length >= 4) {
	      uint32_t val0_m;
	      const v4u32 frow0 = LD_UW(frow + 0);
	      const v4u32 irow0 = LD_UW(irow + 0);
	      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
	      SW(val0_m, dst);
	      frow += 4;
	      irow += 4;
	      dst += 4;
	      length -= 4;
	    }
	    // Scalar tail for the final 0..3 pairs.
	    for (x_out = 0; x_out < length; ++x_out) {
	      const uint64_t I = (uint64_t)A * frow[x_out]
	                       + (uint64_t)B * irow[x_out];
	      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
	      const int v = (int)MULT_FIX(J, wrk->fy_scale);
	      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
	    }
	  }
	}

	// Row-export entry for the expanding case. Processes
	// dst_width * num_channels values and picks the helper from wrk->y_accum:
	// zero uses the frow-only path, anything else the frow/irow blend.
	static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
	  const int num_values = wrk->dst_width * wrk->num_channels;
	  assert(!WebPRescalerOutputDone(wrk));
	  assert(wrk->y_accum <= 0);
	  assert(wrk->y_expand);
	  assert(wrk->y_sub != 0);
	  if (wrk->y_accum != 0) {
	    ExportRowExpand_1(wrk->frow, wrk->irow, wrk->dst, num_values, wrk);
	    return;
	  }
	  ExportRowExpand_0(wrk->frow, wrk->dst, num_values, wrk);
	}

	#if 0 // disabled for now. TODO(skal): make match the C-code
	// Shrink export with a non-zero 'yscale'. For each of the 'length' values,
	// in scalar form (see the tail loop):
	//   frac    = MULT_FIX_FLOOR(frow[x], yscale)
	//   dst[x]  = min(MULT_FIX(irow[x] - frac, wrk->fxy_scale), 255)
	//   irow[x] = frac
	// Whole groups of 16 and then 4 values go through the MSA macros.
	// NOTE(review): the vector paths compute 'frac' with CALC_MULT_FIX1_*, which
	// goes through SRAR_*; if that is a rounding shift it differs from the
	// truncating MULT_FIX_FLOOR used in the tail loop. Possibly related to the
	// TODO on the enclosing '#if 0' -- confirm before enabling.
	static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
	uint8_t* dst, int length,
	const uint32_t yscale,
	WebPRescaler* const wrk) {
	const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
	const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
	const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
	// Read by name inside the CALC_MULT_FIX_* / CALC_MULT_FIX1_* macros.
	const v4i32 zero = { 0 };

	while (length >= 16) {
	v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
	v16u8 out;
	LD_UW4(frow, 4, src0, src1, src2, src3);
	CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
	frac0, frac1, frac2, frac3);
	// src0..3 are reused for the irow values from here on.
	LD_UW4(irow, 4, src0, src1, src2, src3);
	SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
	src0, src1, src2, src3);
	CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
	ST_UB(out, dst);
	// irow is overwritten with the fractions, as in the scalar tail.
	ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
	frow += 16;
	irow += 16;
	dst += 16;
	length -= 16;
	}
	if (length > 0) {
	int x_out;
	if (length >= 12) {
	uint32_t val0_m, val1_m, val2_m;
	v4u32 src0, src1, src2, frac0, frac1, frac2;
	LD_UW3(frow, 4, src0, src1, src2);
	CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
	CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
	CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
	LD_UW3(irow, 4, src0, src1, src2);
	SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
	CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
	CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
	CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
	SW3(val0_m, val1_m, val2_m, dst, 4);
	ST_UW3(frac0, frac1, frac2, irow, 4);
	frow += 12;
	irow += 12;
	dst += 12;
	length -= 12;
	} else if (length >= 8) {
	uint32_t val0_m, val1_m;
	v4u32 src0, src1, frac0, frac1;
	LD_UW2(frow, 4, src0, src1);
	CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
	CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
	LD_UW2(irow, 4, src0, src1);
	SUB2(src0, frac0, src1, frac1, src0, src1);
	CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
	CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
	SW2(val0_m, val1_m, dst, 4);
	ST_UW2(frac0, frac1, irow, 4);
	frow += 8;
	irow += 8;
	dst += 8;
	length -= 8;
	} else if (length >= 4) {
	uint32_t val0_m;
	v4u32 frac0;
	v4u32 src0 = LD_UW(frow);
	CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
	src0 = LD_UW(irow);
	src0 = src0 - frac0;
	CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
	SW(val0_m, dst);
	ST_UW(frac0, irow);
	frow += 4;
	irow += 4;
	dst += 4;
	length -= 4;
	}
	// Scalar tail for the final 0..3 values.
	for (x_out = 0; x_out < length; ++x_out) {
	const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
	const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
	dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
	irow[x_out] = frac;
	}
	}
	}

	// Shrink export used when the caller's 'yscale' is zero. For each of the
	// 'length' values, in scalar form (see the tail loop):
	//   dst[x]  = min(MULT_FIX(irow[x], wrk->fxy_scale), 255)
	//   irow[x] = 0
	// Whole groups of 16 and then 4 values go through the MSA macros, which also
	// store zeros back to irow.
	// NOTE(review): the vector paths presumably keep only the low byte of each
	// result (no clamp to 255) -- confirm against msa_macro.h.
	static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
	int length,
	WebPRescaler* const wrk) {
	const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
	const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
	// Used both by the CALC_MULT_FIX_* macros and to clear irow.
	const v4i32 zero = { 0 };

	while (length >= 16) {
	v4u32 src0, src1, src2, src3;
	v16u8 dst0;
	LD_UW4(irow, 4, src0, src1, src2, src3);
	CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
	ST_UB(dst0, dst);
	ST_SW4(zero, zero, zero, zero, irow, 4);
	length -= 16;
	irow += 16;
	dst += 16;
	}
	if (length > 0) {
	int x_out;
	if (length >= 12) {
	uint32_t val0_m, val1_m, val2_m;
	v4u32 src0, src1, src2;
	LD_UW3(irow, 4, src0, src1, src2);
	CALC_MULT_FIX_4(src0, scale, shift, val0_m);
	CALC_MULT_FIX_4(src1, scale, shift, val1_m);
	CALC_MULT_FIX_4(src2, scale, shift, val2_m);
	SW3(val0_m, val1_m, val2_m, dst, 4);
	ST_SW3(zero, zero, zero, irow, 4);
	length -= 12;
	irow += 12;
	dst += 12;
	} else if (length >= 8) {
	uint32_t val0_m, val1_m;
	v4u32 src0, src1;
	LD_UW2(irow, 4, src0, src1);
	CALC_MULT_FIX_4(src0, scale, shift, val0_m);
	CALC_MULT_FIX_4(src1, scale, shift, val1_m);
	SW2(val0_m, val1_m, dst, 4);
	ST_SW2(zero, zero, irow, 4);
	length -= 8;
	irow += 8;
	dst += 8;
	} else if (length >= 4) {
	uint32_t val0_m;
	const v4u32 src0 = LD_UW(irow + 0);
	CALC_MULT_FIX_4(src0, scale, shift, val0_m);
	SW(val0_m, dst);
	ST_SW(zero, irow);
	length -= 4;
	irow += 4;
	dst += 4;
	}
	// Scalar tail for the final 0..3 values.
	for (x_out = 0; x_out < length; ++x_out) {
	const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
	dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
	irow[x_out] = 0;
	}
	}
	}

	// Row-export entry for the shrinking case. Processes
	// dst_width * num_channels values; yscale = fy_scale * (-y_accum) selects the
	// helper: zero uses the irow-only path, anything else the frow/irow path.
	static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
	  const int num_values = wrk->dst_width * wrk->num_channels;
	  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
	  assert(!WebPRescalerOutputDone(wrk));
	  assert(wrk->y_accum <= 0);
	  assert(!wrk->y_expand);
	  if (yscale == 0) {
	    ExportRowShrink_1(wrk->irow, wrk->dst, num_values, wrk);
	  } else {
	    ExportRowShrink_0(wrk->frow, wrk->irow, wrk->dst, num_values, yscale, wrk);
	  }
	}
	#endif // 0

	//------------------------------------------------------------------------------
	// Entry point

	extern void WebPRescalerDspInitMSA(void);

	// Installs the MSA row-export routine into the WebPRescalerExportRowExpand
	// function pointer. The shrink pointer is deliberately left untouched: its
	// MSA implementation is compiled out, so the assignment below stays
	// commented.
	WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
	  WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
	// WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
	}

	#else // !WEBP_USE_MSA

	// Fallback when WEBP_USE_MSA is undefined or WEBP_REDUCE_SIZE is defined.
	// NOTE(review): WEBP_DSP_INIT_STUB is declared outside this file; presumably
	// it emits a do-nothing WebPRescalerDspInitMSA() -- confirm in dsp.h.
	WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)

	#endif // WEBP_USE_MSA