src/third_party/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c - cobalt - Git at Google

 /*
  *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include <assert.h>
 #include <string.h>

 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/asmdefs_mmi.h"
 #include "vpx_ports/mem.h"

 /*
  * Horizontal 8-tap multiply-accumulate for four output pixels.
  *
  * Inputs (16-bit lanes): the register pairs ftmp4/ftmp5, ftmp6/ftmp7,
  * ftmp8/ftmp9 and ftmp10/ftmp11 each hold the low/high four samples of one
  * 8-sample window; filter1/filter2 hold the first/last four filter taps and
  * ftmp0 must be zero (the callers in this file set these up that way).
  * For every window the pmaddhw partial products are folded into one 32-bit
  * sum. Outputs: srcl packs the sums of windows 0 and 1, srch packs the sums
  * of windows 2 and 3. ftmp4..ftmp11 are overwritten.
  * NOTE(review): lane behavior is described assuming the usual Loongson MMI
  * definitions of pmaddhw/paddw/punpck*; confirm against the ISA manual.
  */
 #define GET_DATA_H_MMI                                     \
   "pmaddhw    %[ftmp4],    %[ftmp4],   %[filter1]    \n\t" \
   "pmaddhw    %[ftmp5],    %[ftmp5],   %[filter2]    \n\t" \
   "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
   "punpckhwd  %[ftmp5],    %[ftmp4],   %[ftmp0]      \n\t" \
   "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
   "pmaddhw    %[ftmp6],    %[ftmp6],   %[filter1]    \n\t" \
   "pmaddhw    %[ftmp7],    %[ftmp7],   %[filter2]    \n\t" \
   "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
   "punpckhwd  %[ftmp7],    %[ftmp6],   %[ftmp0]      \n\t" \
   "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
   "punpcklwd  %[srcl],     %[ftmp4],   %[ftmp6]      \n\t" \
   "pmaddhw    %[ftmp8],    %[ftmp8],   %[filter1]    \n\t" \
   "pmaddhw    %[ftmp9],    %[ftmp9],   %[filter2]    \n\t" \
   "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
   "punpckhwd  %[ftmp9],    %[ftmp8],   %[ftmp0]      \n\t" \
   "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
   "pmaddhw    %[ftmp10],   %[ftmp10],  %[filter1]    \n\t" \
   "pmaddhw    %[ftmp11],   %[ftmp11],  %[filter2]    \n\t" \
   "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
   "punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]      \n\t" \
   "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
   "punpcklwd  %[srch],     %[ftmp8],   %[ftmp10]     \n\t"

 /*
  * Vertical 8-tap multiply-accumulate for four output pixels.
  *
  * Inputs: ftmp4..ftmp11 hold eight consecutive rows, already widened to
  * 16-bit samples by the callers; filter10/filter32/filter54/filter76 each
  * hold one pair of adjacent taps replicated across the register (built by
  * the callers with punpcklwd/punpckhwd).
  * Rows are interleaved pairwise (punpcklhw for the first half of the
  * columns, punpckhhw for the second half) so that each pmaddhw produces a
  * per-column tap-pair sum; the four pair sums are accumulated into srcl and
  * srch as 32-bit values. ftmp12 is used as scratch and is overwritten.
  * NOTE(review): lane behavior assumes the usual Loongson MMI definitions of
  * punpck*hw/pmaddhw; confirm against the ISA manual.
  */
 #define GET_DATA_V_MMI                                     \
   "punpcklhw  %[srcl],     %[ftmp4],   %[ftmp5]      \n\t" \
   "pmaddhw    %[srcl],     %[srcl],    %[filter10]   \n\t" \
   "punpcklhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
   "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
   "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
   "punpcklhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
   "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
   "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
   "punpcklhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
   "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
   "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
   "punpckhhw  %[srch],     %[ftmp4],   %[ftmp5]      \n\t" \
   "pmaddhw    %[srch],     %[srch],    %[filter10]   \n\t" \
   "punpckhhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
   "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
   "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
   "punpckhhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
   "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
   "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
   "punpckhhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
   "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
   "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t"

 /*
  * Rounding right shift of the 32-bit sums in srcl and srch.
  *
  * Reads two 32-bit words through the para pointer: para[0] is added to
  * every 32-bit lane and para[1] is the arithmetic shift count (the callers
  * in this file pass 1 << (FILTER_BITS - 1) and FILTER_BITS).
  * Overwrites tmp0, ftmp5 and ftmp6.
  */
 /* clang-format off */
 #define ROUND_POWER_OF_TWO_MMI                             \
   /* Add para[0] */                                        \
   "lw         %[tmp0],     0x00(%[para])             \n\t" \
   MMI_MTC1(%[tmp0],     %[ftmp6])                          \
   "punpcklwd  %[ftmp6],    %[ftmp6],    %[ftmp6]     \n\t" \
   "paddw      %[srcl],     %[srcl],     %[ftmp6]     \n\t" \
   "paddw      %[srch],     %[srch],     %[ftmp6]     \n\t" \
   /* Arithmetic right shift para[1] bits */                \
   "lw         %[tmp0],     0x04(%[para])             \n\t" \
   MMI_MTC1(%[tmp0],     %[ftmp5])                          \
   "psraw      %[srcl],     %[srcl],     %[ftmp5]     \n\t" \
   "psraw      %[srch],     %[srch],     %[ftmp5]     \n\t"
 /* clang-format on */

 /*
  * Narrows the four 32-bit results in srcl/srch down to four bytes in ftmp12:
  * first a signed-saturating pack to 16 bit (result left in srcl), then an
  * unsigned-saturating pack to 8 bit against the zero register ftmp0.
  * NOTE(review): saturation behavior assumes the usual MMI semantics of
  * packsswh/packushb; confirm against the ISA manual.
  */
 #define CLIP_PIXEL_MMI                                     \
   /* Saturating pack */                                    \
   "packsswh   %[srcl],     %[srcl],     %[srch]      \n\t" \
   "packushb   %[ftmp12],   %[srcl],     %[ftmp0]     \n\t"

 /*
  * Horizontal 8-tap filter implemented with MMI inline assembly.
  *
  * One kernel, filter[x0_q4], is applied to every output pixel; x_step_q4 is
  * ignored, so the source position always advances by exactly one pixel per
  * output pixel. Four output pixels are produced per inner-loop iteration
  * and the width counter is decremented by 4 until it reaches zero, so w
  * must be a non-zero multiple of 4 and h must be non-zero.
  *
  * src/dst point at the top-left pixel; src is moved left by
  * SUBPEL_TAPS / 2 - 1 so the filter window is centered. The strides are
  * reduced by w because the inner loop itself advances both pointers by w
  * bytes per row.
  */
 static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int32_t w, int32_t h) {
   const int16_t *filter_x = filter[x0_q4];
   double ftmp[14];
   uint32_t tmp[2];
   /* NOTE(review): only para[0] and para[1] are set and read; the remaining
    * three entries appear unused. */
   uint32_t para[5];
   para[0] = (1 << ((FILTER_BITS)-1));
   para[1] = FILTER_BITS;
   src -= SUBPEL_TAPS / 2 - 1;
   src_stride -= w;
   dst_stride -= w;
   (void)x_step_q4;

   /* clang-format off */
   __asm__ volatile(
     /* tmp1 keeps the row width so it can be restored after each row. */
     "move       %[tmp1],    %[width]                   \n\t"
     "pxor       %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
     /* filter1 = taps 0..3, filter2 = taps 4..7 (unaligned 8-byte loads). */
     "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
     "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
     "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
     "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
     /* Label 1 is the target of both the column loop and the row loop. */
     "1:                                                \n\t"
     /* Get 8 data per row */
     /* Four 8-byte windows starting at src+0, src+1, src+2 and src+3. */
     "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
     "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
     "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
     "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
     "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
     "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
     "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
     "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
     /* Widen each window from bytes to halfwords (low half / high half). */
     "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
     "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
     "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
     "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
     "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
     MMI_ADDIU(%[width],   %[width],    -0x04)
     /* Get raw data */
     GET_DATA_H_MMI
     ROUND_POWER_OF_TWO_MMI
     CLIP_PIXEL_MMI
     /* Store the four filtered pixels. */
     "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
     MMI_ADDIU(%[dst],     %[dst],      0x04)
     MMI_ADDIU(%[src],     %[src],      0x04)
     /* Loop count */
     "bnez       %[width],   1b                         \n\t"
     /* End of row: restore width and step both pointers to the next row. */
     "move       %[width],   %[tmp1]                    \n\t"
     MMI_ADDU(%[src],      %[src],      %[src_stride])
     MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
     MMI_ADDIU(%[height],  %[height],   -0x01)
     "bnez       %[height],  1b                         \n\t"
     : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
       [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
       [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
       [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
       [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
       [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
       [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
       [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
       [src]"+&r"(src),          [width]"+&r"(w),
       [dst]"+&r"(dst),          [height]"+&r"(h)
     : [filter]"r"(filter_x),    [para]"r"(para),
       [src_stride]"r"((mips_reg)src_stride),
       [dst_stride]"r"((mips_reg)dst_stride)
     : "memory"
   );
   /* clang-format on */
 }

 /*
  * Vertical 8-tap filter implemented with MMI inline assembly.
  *
  * One kernel, filter[y0_q4], is applied to every output pixel; y_step_q4 is
  * ignored, so the source position always advances by exactly one row per
  * output row. Four output pixels are produced per inner-loop iteration and
  * the width counter is decremented by 4 until it reaches zero, so w must be
  * a non-zero multiple of 4 and h must be non-zero.
  *
  * addr keeps the original source stride (used to step through the eight
  * rows of the filter window); src_stride/dst_stride are reduced by w because
  * the inner loop itself advances both pointers by w bytes per row.
  * Each row of the window is fetched with an 8-byte load although only four
  * samples are consumed (only punpcklbh is applied), so up to 4 bytes beyond
  * the processed columns are read from every row.
  */
 static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int y0_q4,
                               int y_step_q4, int32_t w, int32_t h) {
   const int16_t *filter_y = filter[y0_q4];
   double ftmp[16];
   uint32_t tmp[1];
   uint32_t para[2];
   ptrdiff_t addr = src_stride;
   para[0] = (1 << ((FILTER_BITS)-1));
   para[1] = FILTER_BITS;
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
   src_stride -= w;
   dst_stride -= w;
   (void)y_step_q4;

   __asm__ volatile(
     "pxor       %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
     /* Load taps 0..3 and 4..7, then replicate each adjacent tap pair. */
     "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
     "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
     "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
     "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
     "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
     "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
     "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
     "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
     /* Label 1 is the target of both the column loop and the row loop. */
     "1:                                                \n\t"
     /* Get 8 data per column */
     /* tmp0 walks down the eight source rows, addr bytes apart. */
     "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
     "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
     MMI_ADDU(%[tmp0],     %[src],     %[addr])
     "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
     /* Widen the low four bytes of every row to halfwords. */
     "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
     MMI_ADDIU(%[width],   %[width],   -0x04)
     /* Get raw data */
     GET_DATA_V_MMI
     ROUND_POWER_OF_TWO_MMI
     CLIP_PIXEL_MMI
     /* Store the four filtered pixels. */
     "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
     MMI_ADDIU(%[dst],     %[dst],      0x04)
     MMI_ADDIU(%[src],     %[src],      0x04)
     /* Loop count */
     "bnez       %[width],    1b                        \n\t"
     /* End of row: addr - src_stride equals the original w (src_stride was
      * reduced by w above), which restores the width counter. */
     MMI_SUBU(%[width],    %[addr],     %[src_stride])
     MMI_ADDU(%[src],      %[src],      %[src_stride])
     MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
     MMI_ADDIU(%[height],  %[height],   -0x01)
     "bnez       %[height],   1b                        \n\t"
     : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
       [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
       [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
       [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
       [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
       [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
       [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
       [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
       [src]"+&r"(src),          [dst]"+&r"(dst),
       [width]"+&r"(w),          [height]"+&r"(h),
       [tmp0]"=&r"(tmp[0])
     : [filter]"r"(filter_y),    [para]"r"(para),
       [src_stride]"r"((mips_reg)src_stride),
       [dst_stride]"r"((mips_reg)dst_stride),
       [addr]"r"((mips_reg)addr)
     : "memory"
   );
 }

 /*
  * Horizontal 8-tap filter followed by a rounded average with the pixels
  * already present in dst, implemented with MMI inline assembly.
  *
  * One kernel, filter[x0_q4], is applied to every output pixel; x_step_q4 is
  * ignored. Four output pixels are produced per inner-loop iteration and the
  * width counter is decremented by 4 until it reaches zero, so w must be a
  * non-zero multiple of 4 and h must be non-zero.
  *
  * The strides are reduced by w because the inner loop itself advances both
  * pointers by w bytes per row. dst is fetched with an 8-byte load although
  * only four pixels are used.
  */
 static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int32_t w, int32_t h) {
   const int16_t *filter_x = filter[x0_q4];
   double ftmp[14];
   uint32_t tmp[2];
   uint32_t para[2];
   para[0] = (1 << ((FILTER_BITS)-1));
   para[1] = FILTER_BITS;
   src -= SUBPEL_TAPS / 2 - 1;
   src_stride -= w;
   dst_stride -= w;
   (void)x_step_q4;

   __asm__ volatile(
     /* tmp1 keeps the row width so it can be restored after each row. */
     "move       %[tmp1],    %[width]                   \n\t"
     "pxor       %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
     /* filter1 = taps 0..3, filter2 = taps 4..7 (unaligned 8-byte loads). */
     "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
     "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
     "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
     "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
     /* Label 1 is the target of both the column loop and the row loop. */
     "1:                                                \n\t"
     /* Get 8 data per row */
     /* Four 8-byte windows starting at src+0, src+1, src+2 and src+3. */
     "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
     "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
     "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
     "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
     "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
     "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
     "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
     "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
     /* Widen each window from bytes to halfwords (low half / high half). */
     "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
     "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
     "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
     "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
     "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
     MMI_ADDIU(%[width],   %[width],    -0x04)
     /* Get raw data */
     GET_DATA_H_MMI
     ROUND_POWER_OF_TWO_MMI
     CLIP_PIXEL_MMI
     /* Average with dst: widen both to halfwords, add, add 1, shift by 1.
      * ftmp5 (0x0001 in every halfword) serves as both the rounding term
      * and the shift-count operand; presumably psrah only honors the low
      * bits of that operand - confirm against the MMI manual. */
     "punpcklbh  %[ftmp12],  %[ftmp12],   %[ftmp0]      \n\t"
     "gsldlc1    %[ftmp4],   0x07(%[dst])               \n\t"
     "gsldrc1    %[ftmp4],   0x00(%[dst])               \n\t"
     "punpcklbh  %[ftmp4],   %[ftmp4],    %[ftmp0]      \n\t"
     "paddh      %[ftmp12],  %[ftmp12],   %[ftmp4]      \n\t"
     "li         %[tmp0],    0x10001                    \n\t"
     MMI_MTC1(%[tmp0],     %[ftmp5])
     "punpcklhw  %[ftmp5],   %[ftmp5],    %[ftmp5]      \n\t"
     "paddh      %[ftmp12],  %[ftmp12],   %[ftmp5]      \n\t"
     "psrah      %[ftmp12],  %[ftmp12],   %[ftmp5]      \n\t"
     "packushb   %[ftmp12],  %[ftmp12],   %[ftmp0]      \n\t"
     "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
     MMI_ADDIU(%[dst],     %[dst],      0x04)
     MMI_ADDIU(%[src],     %[src],      0x04)
     /* Loop count */
     "bnez       %[width],   1b                         \n\t"
     /* End of row: restore width and step both pointers to the next row. */
     "move       %[width],   %[tmp1]                    \n\t"
     MMI_ADDU(%[src],      %[src],      %[src_stride])
     MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
     MMI_ADDIU(%[height],  %[height],   -0x01)
     "bnez       %[height],  1b                         \n\t"
     : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
       [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
       [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
       [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
       [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
       [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
       [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
       [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
       [src]"+&r"(src),          [width]"+&r"(w),
       [dst]"+&r"(dst),          [height]"+&r"(h)
     : [filter]"r"(filter_x),    [para]"r"(para),
       [src_stride]"r"((mips_reg)src_stride),
       [dst_stride]"r"((mips_reg)dst_stride)
     : "memory"
   );
 }

 /*
  * Vertical 8-tap filter followed by a rounded average with the pixels
  * already present in dst, implemented with MMI inline assembly.
  *
  * One kernel, filter[y0_q4], is applied to every output pixel; y_step_q4 is
  * ignored. Four output pixels are produced per inner-loop iteration and the
  * width counter is decremented by 4 until it reaches zero, so w must be a
  * non-zero multiple of 4 and h must be non-zero.
  *
  * addr keeps the original source stride (used to step through the eight
  * rows of the filter window); src_stride/dst_stride are reduced by w because
  * the inner loop itself advances both pointers by w bytes per row.
  * Source rows and dst are fetched with 8-byte loads although only four
  * samples of each are used.
  */
 static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int y0_q4,
                                   int y_step_q4, int32_t w, int32_t h) {
   const int16_t *filter_y = filter[y0_q4];
   double ftmp[16];
   uint32_t tmp[1];
   uint32_t para[2];
   ptrdiff_t addr = src_stride;
   para[0] = (1 << ((FILTER_BITS)-1));
   para[1] = FILTER_BITS;
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
   src_stride -= w;
   dst_stride -= w;
   (void)y_step_q4;

   __asm__ volatile(
     "pxor       %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
     /* Load taps 0..3 and 4..7, then replicate each adjacent tap pair. */
     "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
     "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
     "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
     "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
     "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
     "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
     "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
     "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
     /* Label 1 is the target of both the column loop and the row loop. */
     "1:                                                \n\t"
     /* Get 8 data per column */
     /* tmp0 walks down the eight source rows, addr bytes apart. */
     "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
     "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
     MMI_ADDU(%[tmp0],     %[src],     %[addr])
     "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
     MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
     "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
     "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
     /* Widen the low four bytes of every row to halfwords. */
     "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
     "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
     MMI_ADDIU(%[width],   %[width],   -0x04)
     /* Get raw data */
     GET_DATA_V_MMI
     ROUND_POWER_OF_TWO_MMI
     CLIP_PIXEL_MMI
     /* Average with dst: widen both to halfwords, add, add 1, shift by 1.
      * ftmp5 (0x0001 in every halfword) serves as both the rounding term
      * and the shift-count operand; presumably psrah only honors the low
      * bits of that operand - confirm against the MMI manual. */
     "punpcklbh  %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
     "gsldlc1    %[ftmp4],    0x07(%[dst])              \n\t"
     "gsldrc1    %[ftmp4],    0x00(%[dst])              \n\t"
     "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
     "paddh      %[ftmp12],   %[ftmp12],  %[ftmp4]      \n\t"
     "li         %[tmp0],     0x10001                   \n\t"
     MMI_MTC1(%[tmp0],     %[ftmp5])
     "punpcklhw  %[ftmp5],    %[ftmp5],   %[ftmp5]      \n\t"
     "paddh      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
     "psrah      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
     "packushb   %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
     "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
     MMI_ADDIU(%[dst],     %[dst],      0x04)
     MMI_ADDIU(%[src],     %[src],      0x04)
     /* Loop count */
     "bnez       %[width],    1b                        \n\t"
     /* End of row: addr - src_stride equals the original w (src_stride was
      * reduced by w above), which restores the width counter. */
     MMI_SUBU(%[width],    %[addr],     %[src_stride])
     MMI_ADDU(%[src],      %[src],      %[src_stride])
     MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
     MMI_ADDIU(%[height],  %[height],   -0x01)
     "bnez       %[height],   1b                        \n\t"
     : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
       [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
       [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
       [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
       [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
       [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
       [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
       [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
       [src]"+&r"(src),          [dst]"+&r"(dst),
       [width]"+&r"(w),          [height]"+&r"(h),
       [tmp0]"=&r"(tmp[0])
     : [filter]"r"(filter_y),    [para]"r"(para),
       [src_stride]"r"((mips_reg)src_stride),
       [dst_stride]"r"((mips_reg)dst_stride),
       [addr]"r"((mips_reg)addr)
     : "memory"
   );
 }

 /*
  * Replaces every pixel of the w x h block at dst with the rounded average
  * of itself and the co-located pixel of src: (dst + src + 1) >> 1.
  *
  * The filter and sub-pixel arguments are unused. Widths that are not a
  * multiple of 4 take the plain C loop; all other widths take the MMI path,
  * which handles four pixels per iteration and counts the width down by 4 to
  * zero (so w and h are expected to be non-zero there). The MMI path fetches
  * src and dst with 8-byte loads although only four pixels of each are used.
  */
 void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
   int x, y;

   (void)filter;
   (void)x0_q4;
   (void)x_step_q4;
   (void)y0_q4;
   (void)y_step_q4;

   if (w & 0x03) {
     for (y = 0; y < h; ++y) {
       for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
       src += src_stride;
       dst += dst_stride;
     }
   } else {
     double ftmp[4];
     uint32_t tmp[2];
     /* The asm loop advances both pointers by w per row on its own. */
     src_stride -= w;
     dst_stride -= w;

     __asm__ volatile(
       /* tmp1 keeps the row width so it can be restored after each row. */
       "move       %[tmp1],    %[width]                  \n\t"
       "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]      \n\t"
       /* ftmp3 = 0x0001 in every halfword: rounding term and shift-count
        * operand (presumably only its low bits are honored by psrah -
        * confirm against the MMI manual). */
       "li         %[tmp0],    0x10001                   \n\t"
       MMI_MTC1(%[tmp0],    %[ftmp3])
       "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]      \n\t"
       /* Label 1 is the target of both the column loop and the row loop. */
       "1:                                               \n\t"
       "gsldlc1    %[ftmp1],   0x07(%[src])              \n\t"
       "gsldrc1    %[ftmp1],   0x00(%[src])              \n\t"
       "gsldlc1    %[ftmp2],   0x07(%[dst])              \n\t"
       "gsldrc1    %[ftmp2],   0x00(%[dst])              \n\t"
       /* Widen the low four bytes of each to halfwords, add, round, shift. */
       "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
       "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]      \n\t"
       "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]      \n\t"
       "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
       "psrah      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
       "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
       "swc1       %[ftmp1],   0x00(%[dst])              \n\t"
       MMI_ADDIU(%[width],  %[width],   -0x04)
       MMI_ADDIU(%[dst],    %[dst],     0x04)
       MMI_ADDIU(%[src],    %[src],     0x04)
       "bnez       %[width],   1b                        \n\t"
       /* End of row: restore width and step both pointers to the next row. */
       "move       %[width],   %[tmp1]                   \n\t"
       MMI_ADDU(%[dst],     %[dst],     %[dst_stride])
       MMI_ADDU(%[src],     %[src],     %[src_stride])
       MMI_ADDIU(%[height], %[height],  -0x01)
       "bnez       %[height],  1b                        \n\t"
       : [ftmp0]"=&f"(ftmp[0]),  [ftmp1]"=&f"(ftmp[1]),
         [ftmp2]"=&f"(ftmp[2]),  [ftmp3]"=&f"(ftmp[3]),
         [tmp0]"=&r"(tmp[0]),    [tmp1]"=&r"(tmp[1]),
         [src]"+&r"(src),        [dst]"+&r"(dst),
         [width]"+&r"(w),        [height]"+&r"(h)
       : [src_stride]"r"((mips_reg)src_stride),
         [dst_stride]"r"((mips_reg)dst_stride)
       : "memory"
     );
   }
 }

 /*
  * Portable horizontal 8-tap filter.
  *
  * For every output pixel the sub-pixel position (in 1/16 pel units) starts
  * at x0_q4 and advances by x_step_q4; its integer part selects the source
  * window and its fractional part selects the kernel from x_filters.
  */
 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *x_filters, int x0_q4,
                            int x_step_q4, int w, int h) {
   int row;

   /* Shift left so that the filter window is centered on the pixel. */
   src -= SUBPEL_TAPS / 2 - 1;

   for (row = 0; row < h; ++row) {
     int pos_q4 = x0_q4;
     int col;

     for (col = 0; col < w; ++col) {
       const uint8_t *const window = &src[pos_q4 >> SUBPEL_BITS];
       const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
       int acc = 0;
       int tap;

       for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
         acc += window[tap] * kernel[tap];
       }
       dst[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
       pos_q4 += x_step_q4;
     }

     src += src_stride;
     dst += dst_stride;
   }
 }

 /*
  * Portable vertical 8-tap filter.
  *
  * Processes the block column by column. For every output pixel the
  * sub-pixel position (in 1/16 pel units) starts at y0_q4 and advances by
  * y_step_q4; its integer part selects the first source row and its
  * fractional part selects the kernel from y_filters.
  */
 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *y_filters, int y0_q4,
                           int y_step_q4, int w, int h) {
   int col;

   /* Shift up so that the filter window is centered on the pixel. */
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);

   for (col = 0; col < w; ++col) {
     int pos_q4 = y0_q4;
     int row;

     for (row = 0; row < h; ++row) {
       const uint8_t *window = &src[(pos_q4 >> SUBPEL_BITS) * src_stride];
       const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
       int acc = 0;
       int tap;

       for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
         acc += window[tap * src_stride] * kernel[tap];
       }
       dst[row * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
       pos_q4 += y_step_q4;
     }

     ++src;
     ++dst;
   }
 }

 /*
  * Portable vertical 8-tap filter whose result is averaged (with rounding)
  * into the pixels already stored in dst.
  *
  * Processes the block column by column; the sub-pixel position starts at
  * y0_q4 and advances by y_step_q4 (1/16 pel units) per output row.
  */
 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *y_filters, int y0_q4,
                               int y_step_q4, int w, int h) {
   int col;

   /* Shift up so that the filter window is centered on the pixel. */
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);

   for (col = 0; col < w; ++col) {
     int pos_q4 = y0_q4;
     int row;

     for (row = 0; row < h; ++row) {
       const uint8_t *window = &src[(pos_q4 >> SUBPEL_BITS) * src_stride];
       const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
       int acc = 0;
       int tap;

       for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
         acc += window[tap * src_stride] * kernel[tap];
       }
       /* Blend the filtered sample with the existing destination pixel. */
       dst[row * dst_stride] = ROUND_POWER_OF_TWO(
           dst[row * dst_stride] +
               clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS)),
           1);
       pos_q4 += y_step_q4;
     }

     ++src;
     ++dst;
   }
 }

 /*
  * Portable horizontal 8-tap filter whose result is averaged (with rounding)
  * into the pixels already stored in dst.
  *
  * The sub-pixel position starts at x0_q4 for every row and advances by
  * x_step_q4 (1/16 pel units) per output pixel.
  */
 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *x_filters, int x0_q4,
                                int x_step_q4, int w, int h) {
   int row;

   /* Shift left so that the filter window is centered on the pixel. */
   src -= SUBPEL_TAPS / 2 - 1;

   for (row = 0; row < h; ++row) {
     int pos_q4 = x0_q4;
     int col;

     for (col = 0; col < w; ++col) {
       const uint8_t *const window = &src[pos_q4 >> SUBPEL_BITS];
       const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
       int acc = 0;
       int tap;

       for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
         acc += window[tap] * kernel[tap];
       }
       /* Blend the filtered sample with the existing destination pixel. */
       dst[col] = ROUND_POWER_OF_TWO(
           dst[col] + clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS)), 1);
       pos_q4 += x_step_q4;
     }

     src += src_stride;
     dst += dst_stride;
   }
 }

 /*
  * 2-D 8-tap sub-pixel filter: horizontal pass into an intermediate buffer,
  * then vertical pass into dst.
  *
  * src/dst:            top-left pixel of the source / destination block.
  * filter:             kernel table indexed by the 1/16 pel phase.
  * x0_q4, y0_q4:       initial sub-pixel position in 1/16 pel units.
  * x_step_q4/y_step_q4: position increment per output pixel (16 = unscaled).
  * w, h:               block size, at most 64x64.
  *
  * The MMI kernels handle four pixels per iteration and ignore the step
  * arguments (they always advance by one full pixel), so each pass only
  * takes the MMI kernel when w is a multiple of 4 AND that pass's step is
  * exactly one pixel; otherwise the C implementation, which honors the
  * step, is used for that pass.
  */
 void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const InterpKernel *filter,
                        int x0_q4, int32_t x_step_q4, int y0_q4,
                        int32_t y_step_q4, int32_t w, int32_t h) {
   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
   // 2d filtering proceeds in 2 steps:
   //   (1) Interpolate horizontally into an intermediate buffer, temp.
   //   (2) Interpolate temp vertically to derive the sub-pixel result.
   // Deriving the maximum number of rows in the temp buffer (135):
   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
   // --Largest block size is 64x64 pixels.
   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
   //   original frame (in 1/16th pixel units).
   // --Must round-up because block may be located at sub-pixel position.
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
   // When calling in frame scaling function, the smallest scaling factor is x1/4
   // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
   // big enough.
   uint8_t temp[64 * 135];
   const int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
   // One full pixel expressed in 1/16 pel units.
   const int unit_step_q4 = 1 << SUBPEL_BITS;
   const int width_is_mult_of_4 = !(w & 0x03);
   const int use_mmi_horiz = width_is_mult_of_4 && x_step_q4 == unit_step_q4;
   const int use_mmi_vert = width_is_mult_of_4 && y_step_q4 == unit_step_q4;
   // The horizontal pass starts SUBPEL_TAPS / 2 - 1 rows above the block so
   // that the vertical pass finds its filter tails in temp.
   const uint8_t *const src_top = src - src_stride * (SUBPEL_TAPS / 2 - 1);
   uint8_t *const temp_block = temp + 64 * (SUBPEL_TAPS / 2 - 1);

   assert(w <= 64);
   assert(h <= 64);
   assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
   assert(x_step_q4 <= 64);

   if (use_mmi_horiz) {
     convolve_horiz_mmi(src_top, src_stride, temp, 64, filter, x0_q4,
                        x_step_q4, w, intermediate_height);
   } else {
     convolve_horiz(src_top, src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                    intermediate_height);
   }

   if (use_mmi_vert) {
     convolve_vert_mmi(temp_block, 64, dst, dst_stride, filter, y0_q4,
                       y_step_q4, w, h);
   } else {
     convolve_vert(temp_block, 64, dst, dst_stride, filter, y0_q4, y_step_q4,
                   w, h);
   }
 }

 void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                              int32_t w, int32_t h) {
   (void)y0_q4;
   (void)y_step_q4;
   if (w & 0x03)
     convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                    w, h);
   else
     convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, w, h);
 }

 void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
   (void)x0_q4;
   (void)x_step_q4;
   if (w & 0x03)
     convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
                   h);
   else
     convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
                       y_step_q4, w, h);
 }

 void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
   (void)y0_q4;
   (void)y_step_q4;
   if (w & 0x03)
     convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, w, h);
   else
     convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
                            x_step_q4, w, h);
 }

 void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int32_t x_step_q4, int y0_q4, int y_step_q4,
                                 int w, int h) {
   (void)x0_q4;
   (void)x_step_q4;
   if (w & 0x03)
     convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                       y_step_q4, w, h);
   else
     convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
                           y_step_q4, w, h);
 }

 /*
  * 2-D 8-tap sub-pixel filter whose result is averaged into the existing
  * contents of dst.
  *
  * The block is first filtered into a 64-byte-stride scratch buffer with
  * vpx_convolve8_mmi and then blended into dst with vpx_convolve_avg_mmi.
  * w and h must not exceed 64.
  */
 void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                            int32_t w, int32_t h) {
   // Fixed size intermediate buffer places limits on parameters.
   // vpx_convolve_avg_mmi fetches its source with 8-byte loads for every group
   // of 4 pixels, so for w == h == 64 the final load extends 4 bytes past a
   // 64x64 block. The tail padding keeps that load inside the buffer; the
   // padding bytes are loaded but never contribute to the output.
   DECLARE_ALIGNED(16, uint8_t, temp[64 * 64 + 16]);
   assert(w <= 64);
   assert(h <= 64);

   vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                     y_step_q4, w, h);
   vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
 }