/* blob: a07a7e3b41d3362fbd4f4d06087f3aaeee4d3fb0 [file] [log] [blame] */
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/onyxc_int.h"
#include "vpx_ports/asmdefs_mmi.h"
/*
 * VP8 loop filter across a horizontal edge, Loongson MMI implementation.
 *
 * src_ptr addresses the first row below the edge (q0); consecutive rows are
 * src_pixel_step bytes apart. Each loop iteration reads the four rows above
 * src_ptr (p3..p0) and the four rows starting at src_ptr (q0..q3), 8 pixels
 * wide, rewrites p1, p0, q0 and q1 in place, and then moves 8 bytes to the
 * right. count is the number of 8-pixel groups to process; the loop body is
 * entered before count is tested, so count must be at least 1.
 *
 * blimit, limit and thresh are each read as 8 bytes on every iteration.
 */
void vp8_loop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
uint64_t tmp[1];
mips_reg addr[2];
double ftmp[12];
double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
/* clang-format off */
__asm__ volatile (
/* Materialise the packed constants used below. */
"dli %[tmp0], 0x0001000100010001 \n\t"
"dmtc1 %[tmp0], %[ff_ph_01] \n\t"
"dli %[tmp0], 0xfefefefefefefefe \n\t"
"dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x8080808080808080 \n\t"
"dmtc1 %[tmp0], %[ff_pb_80] \n\t"
"dli %[tmp0], 0x0404040404040404 \n\t"
"dmtc1 %[tmp0], %[ff_pb_04] \n\t"
"dli %[tmp0], 0x0303030303030303 \n\t"
"dmtc1 %[tmp0], %[ff_pb_03] \n\t"
"1: \n\t"
/* ftmp10: limit */
"gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t"
/* addr0: row q1 (src_ptr + step) */
MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
/* ftmp1: p3 */
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
"gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
/* ftmp3: p2 */
MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
"gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
/* ftmp0 accumulates (abs diff - limit) with unsigned saturation; it stays
 * zero only in lanes where every difference is within limit. */
/* abs (p3-p2) */
"pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
"psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
/* ftmp4: p1 */
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
"gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
/* abs (p2-p1) */
"pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* ftmp5: p0 */
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
/* ftmp9: abs(p1-p0), kept for the hev test */
"pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t"
"psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* ftmp6: q0  ftmp7: q1 */
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
"gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
/* ftmp11: abs(q1-q0), kept for the hev test */
"pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
"psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* ftmp8: q2 */
MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
"gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
/* abs (q2-q1) */
"pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* ftmp2: q3 */
MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
"gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
/* abs (q3-q2) */
"pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* abs (p0-q0) * 2 */
"pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* abs (p1-q1) / 2: the low bit of every byte is cleared first so the
 * halfword shift cannot move a bit into the neighbouring byte. */
"pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x01 \n\t"
"dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
/* ... compared against blimit */
"gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
/* ftmp0: mask (0xff in lanes where every check passed) */
"pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
"gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t"
/* abs(p1-p0) - thresh and abs(q1-q0) - thresh, saturated at 0 */
"psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
"psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
/* Combine with OR, not a byte add: two non-zero excesses that sum to 256
 * would wrap to 0 and make the lane look like it has no edge variance. */
"por %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
/* ftmp1: hev (0xff where either difference exceeds thresh) */
"pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
/* Flip the top bit to treat the pixels as signed bytes:
 * ftmp4:ps1 ftmp5:ps0 ftmp6:qs0 ftmp7:qs1 */
"pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
"pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
/* filter_value = ((ps1 - qs1) & hev) + 3 * (qs0 - ps0), saturating */
"psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
"psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
/* filter_value &= mask */
"pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
/* ftmp8: filter_value + 3   ftmp9: filter_value + 4 */
"paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t"
"paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t"
/* Signed byte >> 3: put each byte in the high half of a halfword, shift the
 * halfword right arithmetically by 8 + 3, then pack back to bytes. */
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t"
"dli %[tmp0], 0x0b \n\t"
"dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
"psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
/* ftmp8: (filter_value + 3) >> 3 */
"packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
"pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
"punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t"
"psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
/* The halfword form of (filter_value + 4) >> 3 is reused, with 1 added,
 * for the outer-tap term; ftmp0 receives the packed byte form. */
"paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t"
"packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t"
"dli %[tmp0], 0x01 \n\t"
"dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
"psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
/* ftmp11: (((filter_value + 4) >> 3) + 1) >> 1 */
"packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
/* ftmp1 = ~hev & ftmp11: the outer-tap term is dropped where hev is set */
"pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
/* p0: add (filter_value + 3) >> 3, convert back to unsigned, store */
"paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
"pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
/* p1: add the outer-tap term, store */
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
"paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
"pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
"gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
/* q0: subtract (filter_value + 4) >> 3, store */
"psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
"pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
/* q1: subtract the outer-tap term, store */
"psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
"pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
"gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
/* Next group of 8 pixels along the edge. */
"addiu %[count], %[count], -0x01 \n\t"
MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
"bnez %[count], 1b \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
[src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
[ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe),
[ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04),
[ff_pb_03]"=&f"(ff_pb_03)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
[src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
[src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
: "memory"
);
/* clang-format on */
}
/*
 * VP8 loop filter across a vertical edge, Loongson MMI implementation.
 *
 * src_ptr addresses the first pixel to the right of the edge in the top row;
 * rows are src_pixel_step bytes apart. Each loop iteration loads an 8x8 tile
 * (8 rows, starting 4 bytes left of the edge), transposes it so that every
 * register holds one pixel column (p3..p0, q0..q3), filters, transposes the
 * four modified columns (p1, p0, q0, q1) back and writes 4 bytes into each of
 * the 8 rows. It then moves down 8 rows. count is the number of 8-row groups;
 * the loop body is entered before count is tested, so count must be at
 * least 1.
 *
 * blimit, limit and thresh are each read as 8 bytes on every iteration.
 */
void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
int src_pixel_step,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh, int count) {
uint64_t tmp[1];
mips_reg addr[2];
double ftmp[13];
double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
/* clang-format off */
__asm__ volatile (
/* Materialise the packed constants used below. */
"dli %[tmp0], 0xfefefefefefefefe \n\t"
"dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x0001000100010001 \n\t"
"dmtc1 %[tmp0], %[ff_ph_01] \n\t"
"dli %[tmp0], 0x0303030303030303 \n\t"
"dmtc1 %[tmp0], %[ff_pb_03] \n\t"
"dli %[tmp0], 0x0404040404040404 \n\t"
"dmtc1 %[tmp0], %[ff_pb_04] \n\t"
"dli %[tmp0], 0x8080808080808080 \n\t"
"dmtc1 %[tmp0], %[ff_pb_80] \n\t"
/* Re-base src_ptr to row 4 of the tile, 4 bytes left of the edge, so the
 * 8 rows are reachable as src_ptr - 4*step .. src_ptr + 3*step. */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
"1: \n\t"
/* addr0: row 5 (src_ptr + step) */
MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
/* Rows 6 and 7, byte-interleaved. */
MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
MMI_ADDU(%[addr1], %[addr0], %[tmp0])
"gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
"punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
/* Rows 4 and 5, byte-interleaved, then merged with rows 6/7 by halfword. */
"gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t"
"gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
"punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t"
"punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t"
"punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t"
"punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
"punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
/* Rows 2 and 3, byte-interleaved. */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
"punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
/* Rows 0 and 1, byte-interleaved, then merged with rows 2/3 by halfword. */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[addr1], %[addr0], %[tmp0])
"gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
"punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
"punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
"punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
"punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t"
"punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t"
/* Final word-level merge: each register now holds one pixel column. */
/* ftmp9:q0 ftmp10:q1 */
"punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
"punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
/* ftmp11:q2 ftmp12:q3 */
"punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t"
"punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t"
/* ftmp1:p3 ftmp2:p2 */
"punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t"
"punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
/* ftmp5:p1 ftmp6:p0 */
"punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
/* ftmp8: limit. ftmp0 accumulates (abs diff - limit) with unsigned
 * saturation; it stays zero only where every difference is within limit. */
"gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t"
/* abs (q3-q2) */
"pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
"psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
/* abs (q2-q1) */
"pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp3: abs(q1-q0) */
"pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
"psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp4: abs(p1-p0) */
"pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
"psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p2-p1) */
"pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p3-p2) */
"pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp8: blimit */
"gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t"
/* abs (p0-q0), doubled with unsigned saturation */
"pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t"
"paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
/* abs (p1-q1), halved: the low bit of every byte is cleared first so the
 * halfword shift cannot move a bit into the neighbouring byte */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x01 \n\t"
"dmtc1 %[tmp0], %[ftmp1] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t"
"paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"pxor %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* ftmp0:mask */
"pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t"
/* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */
"psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
"psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
"por %[ftmp2], %[ftmp4], %[ftmp3] \n\t"
"pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* ftmp1:hev */
"pxor %[ftmp1], %[ftmp2], %[ftmp1] \n\t"
/* Flip the top bit to treat the pixels as signed bytes:
 * ftmp10:qs1 ftmp9:qs0 ftmp6:ps0 ftmp5:ps1 */
"pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
"pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
"pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
/* ((ps1 - qs1) & hev) + 3 * (qs0 - ps0), saturating */
"psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t"
"pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
"psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
/* ftmp2:filter_value */
"pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
/* ftmp11: filter_value + 4   ftmp12: filter_value + 3 */
"paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t"
"paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t"
/* Signed byte >> 3: put each byte in the high half of a halfword, shift the
 * halfword right arithmetically by 8 + 3, then pack back to bytes. */
"dli %[tmp0], 0x0b \n\t"
"dmtc1 %[tmp0], %[ftmp7] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
/* ftmp12: (filter_value + 3) >> 3 */
"packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
/* ftmp11: (filter_value + 4) >> 3; ftmp0/ftmp8 keep the halfword form */
"packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t"
/* q0 -= ftmp11, p0 += ftmp12, both converted back to unsigned */
"psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
"pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
"paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
"pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
/* Outer-tap term: (((filter_value + 4) >> 3) + 1) >> 1 */
"paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t"
"dli %[tmp0], 0x01 \n\t"
"dmtc1 %[tmp0], %[ftmp7] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
"packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t"
/* ftmp2 = ~hev & ftmp2: the outer-tap term is dropped where hev is set */
"pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t"
"psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t"
"pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
"pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
/* Transpose the four modified columns back into per-row 4-byte groups. */
/* ftmp5: *op1 ; ftmp6: *op0 */
"punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
"punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
/* ftmp9: *oq0 ; ftmp10: *oq1 */
"punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
"punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t"
"punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
"punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
"punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
"punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
/* Each register holds two rows: the low word is stored at byte offsets
 * 2..5 of one row, then the register is shifted right by 32 bits (ftmp9)
 * and the former high word is stored to the next row. */
/* rows 0 and 1 */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t"
"gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t"
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp9] \n\t"
"ssrld %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[addr1], %[addr0], %[tmp0])
"gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t"
"gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t"
/* rows 2 and 3 */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
"gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t"
"gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t"
"ssrld %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t"
"gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t"
/* rows 4 and 5 */
"gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t"
"gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t"
"ssrld %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t"
"gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t"
/* rows 6 and 7 (tmp0 still holds 2 * src_pixel_step here) */
MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
"gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t"
"gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t"
"ssrld %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
MMI_ADDU(%[addr1], %[addr0], %[tmp0])
"gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t"
"gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t"
/* Next group of 8 rows down the edge. */
MMI_ADDIU(%[count], %[count], -0x01)
MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
"bnez %[count], 1b \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
[src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
[ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03),
[ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80),
[ff_pb_fe]"=&f"(ff_pb_fe)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
/* clang-format on */
}
/* clang-format off */
/*
 * Asm fragment: per-byte arithmetic right shift of ftmp0.
 * Each byte of ftmp0 is placed in the high half of a halfword (the low half
 * is whatever ftmp10/ftmp11 held and is expected to be shifted out), the
 * halfwords are shifted right arithmetically by the count in ftmp9, and the
 * result is packed back into ftmp0 with signed saturation.
 * Reads ftmp0, ftmp9; writes ftmp0; clobbers ftmp10, ftmp11.
 */
#define VP8_MBLOOP_HPSRAB \
"punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \
"psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \
"psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \
"packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
/*
 * Asm fragment: scaled, rounded tap value from ftmp12.
 * Bytes of ftmp0 (low half) and ftmp12 (high half) are interleaved into
 * halfwords, multiplied by `reg` keeping the high 16 bits (pmulhh), biased
 * by ff_ph_003f, shifted right arithmetically by the count in ftmp9 and
 * packed into ftmp1 with signed saturation.
 * Reads ftmp0, ftmp9, ftmp12, ff_ph_003f; writes ftmp1; clobbers ftmp2.
 */
#define VP8_MBLOOP_HPSRAB_ADD(reg) \
"punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \
"punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \
"pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \
"pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \
"paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \
"paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \
"psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
"psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \
"packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
/* clang-format on */
/*
 * VP8 macroblock-edge loop filter across a horizontal edge, Loongson MMI
 * implementation.
 *
 * src_ptr addresses the first row below the edge (q0); consecutive rows are
 * src_pixel_step bytes apart. Each loop iteration reads rows p3..q3, 8 pixels
 * wide, rewrites p2, p1, p0, q0, q1 and q2 in place, and then moves 8 bytes
 * to the right. count is the number of 8-pixel groups to process; the loop
 * body is entered before count is tested, so count must be at least 1.
 *
 * blimit, limit and thresh are each read as 8 bytes on every iteration.
 */
void vp8_mbloop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
uint64_t tmp[1];
double ftmp[13];
double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
ff_ph_1200, ff_ph_1b00;
/* clang-format off */
__asm__ volatile (
/* Materialise the packed constants used below. */
"dli %[tmp0], 0xfefefefefefefefe \n\t"
"dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x8080808080808080 \n\t"
"dmtc1 %[tmp0], %[ff_pb_80] \n\t"
"dli %[tmp0], 0x0404040404040404 \n\t"
"dmtc1 %[tmp0], %[ff_pb_04] \n\t"
"dli %[tmp0], 0x0303030303030303 \n\t"
"dmtc1 %[tmp0], %[ff_pb_03] \n\t"
"dli %[tmp0], 0x003f003f003f003f \n\t"
"dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
"dli %[tmp0], 0x0900090009000900 \n\t"
"dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
"dli %[tmp0], 0x1200120012001200 \n\t"
"dmtc1 %[tmp0], %[ff_ph_1200] \n\t"
"dli %[tmp0], 0x1b001b001b001b00 \n\t"
"dmtc1 %[tmp0], %[ff_ph_1b00] \n\t"
/* Move src_ptr up four rows to p3; the loads below walk it down to q3. */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
"1: \n\t"
/* ftmp9: limit */
"gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t"
/* ftmp1: p3 */
"gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
/* ftmp3: p2 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
/* ftmp4: p1 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
/* ftmp5: p0 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
/* ftmp6: q0 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
/* ftmp7: q1 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
/* ftmp8: q2 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
/* ftmp2: q3 */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t"
/* ftmp12: blimit */
"gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t"
/* ftmp0 accumulates (abs diff - limit) with unsigned saturation; it stays
 * zero only in lanes where every difference is within limit. */
/* abs (p3-p2) */
"pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
"psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
/* abs (p2-p1) */
"pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* ftmp10: abs(p1-p0), kept for the hev test */
"pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t"
"psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* ftmp11: abs(q1-q0), kept for the hev test */
"pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
"psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* abs (q2-q1) */
"pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* abs (q3-q2) */
"pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* abs (p0-q0) * 2 */
"pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* abs (p1-q1) / 2: the low bit of every byte is cleared first so the
 * halfword shift cannot move a bit into the neighbouring byte. */
"pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x01 \n\t"
"dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
/* ftmp0: mask */
"pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t"
/* abs(p1-p0) - thresh and abs(q1-q0) - thresh, saturated at 0 */
"psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
"psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t"
/* Combine with OR, not a byte add: two non-zero excesses that sum to 256
 * would wrap to 0 and make the lane look like it has no edge variance. */
"por %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
/* ftmp1: hev */
"pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
/* Flip the top bit to treat the pixels as signed bytes:
 * ftmp4:ps1 ftmp5:ps0 ftmp6:qs0 ftmp7:qs1 */
"pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
"pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
/* filter_value = (ps1 - qs1) + 3 * (qs0 - ps0), saturating, then & mask */
"psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
/* ftmp12: filter_value & ~hev   ftmp2: filter_value & hev */
"pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
"pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
/* ftmp9 = 11: VP8_MBLOOP_HPSRAB then yields a signed byte >> 3 */
"dli %[tmp0], 0x0b \n\t"
"dmtc1 %[tmp0], %[ftmp9] \n\t"
/* ps0 += ((filter_value & hev) + 3) >> 3 */
"paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t"
VP8_MBLOOP_HPSRAB
"paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
/* qs0 -= ((filter_value & hev) + 4) >> 3 */
"paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t"
VP8_MBLOOP_HPSRAB
"psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
/* ftmp9 = 7 and ftmp0 = 0: VP8_MBLOOP_HPSRAB_ADD(k << 8) then yields
 * (k * ftmp12 + 63) >> 7 per byte in ftmp1. */
"dli %[tmp0], 0x07 \n\t"
"dmtc1 %[tmp0], %[ftmp9] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
/* k = 27: applied to qs0 / ps0, then converted back to unsigned */
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
"psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
"pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
/* src_ptr is at row q3; step back four rows to p0 and store p0, then q0. */
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
"gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
/* k = 18: applied to ps1 / qs1, then converted back to unsigned */
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
"paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
"psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
"pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
"pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
/* Store q1 one row down, then p1 three rows up from there
 * (tmp0 still holds 4 * src_pixel_step). */
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
/* k = 9: p2 / q2 are converted to signed, adjusted, and converted back */
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
"pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
"pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
"psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
"pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
"pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
/* Store q2 four rows down from p1, then p2 five rows up from there. */
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
"gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
/* Back to row p3, then on to the next group of 8 pixels. */
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
"addiu %[count], %[count], -0x01 \n\t"
"bnez %[count], 1b \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
[src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
[ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
[ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03),
[ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00),
[ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
/* clang-format on */
}
/* clang-format off */
/*
 * Asm fragment: widen ftmp0 for a halfword multiply.
 * Clears ftmp7/ftmp8, then interleaves so each byte of ftmp0 sits in the high
 * half of a halfword (low bytes in ftmp7, high bytes in ftmp8).
 * Reads ftmp0; writes ftmp7, ftmp8.
 */
#define VP8_MBLOOP_VPSRAB_ADDH \
"pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
"pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
/*
 * Asm fragment: round, shift and narrow the halfwords in ftmp7/ftmp8.
 * Adds ff_ph_003f to each halfword, shifts right arithmetically by the count
 * in ftmp12 and packs both registers into ftmp3 with signed saturation.
 * Reads ftmp7, ftmp8, ftmp12, ff_ph_003f; writes ftmp3; clobbers ftmp7, ftmp8.
 */
#define VP8_MBLOOP_VPSRAB_ADDT \
"paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \
"paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \
"packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
/* clang-format on */
void vp8_mbloop_filter_vertical_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
mips_reg tmp[1];
DECLARE_ALIGNED(8, const uint64_t, srct[2]);
double ftmp[14];
double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
/* clang-format off */
__asm__ volatile (
"dli %[tmp0], 0x003f003f003f003f \n\t"
"dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
"dli %[tmp0], 0x0900090009000900 \n\t"
"dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
"dli %[tmp0], 0xfefefefefefefefe \n\t"
"dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x8080808080808080 \n\t"
"dmtc1 %[tmp0], %[ff_pb_80] \n\t"
"dli %[tmp0], 0x0404040404040404 \n\t"
"dmtc1 %[tmp0], %[ff_pb_04] \n\t"
"dli %[tmp0], 0x0303030303030303 \n\t"
"dmtc1 %[tmp0], %[ff_pb_03] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
"1: \n\t"
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
"punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
"punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
"punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
"punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
"punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t"
"punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t"
"punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t"
"punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
"gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
"punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
"punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
"punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
"punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
"punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t"
"punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t"
"punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t"
"punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t"
"gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t"
/* ftmp9:q0 ftmp10:q1 */
"punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
"punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
/* ftmp11:q2 ftmp12:q3 */
"punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t"
"punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t"
/* srct[0x00]: q3 */
"sdc1 %[ftmp12], 0x00(%[srct]) \n\t"
/* ftmp1:p3 ftmp2:p2 */
"punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t"
"punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
/* srct[0x08]: p3 */
"sdc1 %[ftmp1], 0x08(%[srct]) \n\t"
/* ftmp5:p1 ftmp6:p0 */
"punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
/* abs (q3-q2) */
"pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
"psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t"
/* abs (q2-q1) */
"pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp3: abs(q1-q0) */
"pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
"psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* ftmp4: abs(p1-p0) */
"pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
"psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p2-p1) */
"pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
/* abs (p3-p2) */
"pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t"
"gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t"
/* abs (p0-q0) * 2 */
"pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* abs (p1-q1) / 2 */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x01 \n\t"
"dmtc1 %[tmp0], %[ftmp8] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
"psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
"pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
/* ftmp0: mask */
"pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
/* abs(p1-p0) - thresh */
"psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
/* abs(q1-q0) - thresh */
"psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
"por %[ftmp3], %[ftmp4], %[ftmp3] \n\t"
"pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t"
"pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
/* ftmp1: hev */
"pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
/* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
"pxor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t"
"pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
"pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
"pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
"pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t"
"psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t"
"psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* filter_value &= mask */
"pand %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
/* Filter2 = filter_value & hev */
"pand %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
/* filter_value &= ~hev */
"pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
"paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
"dli %[tmp0], 0x0b \n\t"
"dmtc1 %[tmp0], %[ftmp12] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
/* ftmp9: qs0 */
"psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
"paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
"packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
/* ftmp6: ps0 */
"paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
"dli %[tmp0], 0x07 \n\t"
"dmtc1 %[tmp0], %[ftmp12] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
"paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
"paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t"
"pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
"pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t"
/* ftmp9: oq0 */
"pxor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t"
/* ftmp6: op0 */
"pxor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
"paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
"pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
"pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t"
/* ftmp10: oq1 */
"pxor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
/* ftmp5: op1 */
"pxor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
"pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t"
"pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t"
VP8_MBLOOP_VPSRAB_ADDT
"psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t"
/* ftmp11: oq2 */
"pxor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t"
"paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t"
/* ftmp2: op2 */
"pxor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t"
"ldc1 %[ftmp12], 0x00(%[srct]) \n\t"
"ldc1 %[ftmp8], 0x08(%[srct]) \n\t"
"punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t"
"punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t"
"punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
"punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t"
"punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t"
"punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t"
"punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
"punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
"punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t"
"punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
"punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
"punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
"punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t"
"punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
"punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t"
"punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
"punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
"punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
"punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t"
"punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
"punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t"
"punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
"punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t"
"punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
"gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
"gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
"addiu %[count], %[count], -0x01 \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
"bnez %[count], 1b \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
[tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
[count]"+&r"(count),
[ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900),
[ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04),
[ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe)
: [limit]"r"(limit), [blimit]"r"(blimit),
[srct]"r"(srct), [thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
/* clang-format on */
}
/* clang-format off */
/*
 * Byte-wise arithmetic right shift assembled from halfword shifts.
 *
 * Input : %[ftmp5]  - eight signed bytes.
 * Output: %[ftmp0]  - the shifted bytes.
 * Scratch: %[ftmp1] is overwritten.
 *
 * The low byte of every halfword is moved to the top (shift left by
 * %[ftmp8]), shifted arithmetically by %[ftmp9], and moved back down with a
 * logical shift by %[ftmp8]. The high byte is shifted arithmetically by
 * %[ftmp10] and moved back up by %[ftmp8]. The two halves are then OR-ed.
 *
 * The shift counts are not set here. The caller in this file,
 * vp8_loop_filter_simple_horizontal_edge_mmi, loads 8 into %[ftmp8], 3 into
 * %[ftmp9] and 0x0b into %[ftmp10], which makes this a signed ">> 3" on each
 * byte.
 */
#define VP8_SIMPLE_HPSRAB \
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \
"psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \
"psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \
"psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* clang-format on */
/*
 * VP8 "simple" loop filter applied across a horizontal edge.
 *
 * src_ptr        - first pixel of the row directly below the edge (q0). The
 *                  rows at -2, -1 and +1 strides are read as p1, p0 and q1.
 * src_pixel_step - distance in bytes between vertically adjacent pixels.
 * blimit         - eight bytes of edge limit, re-read for every group.
 *
 * Two groups of eight pixels are handled (count starts at 2 and src_ptr
 * advances by 8 per iteration). Only the p0 and q0 rows are written back.
 */
void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
                                                int src_pixel_step,
                                                const unsigned char *blimit) {
  uint64_t tmp[1], count = 2;
  mips_reg addr[2];
  double ftmp[12];
  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;

  /* clang-format off */
  __asm__ volatile (
    /* Shift counts: ftmp10 = 11, ftmp11 = 1, ftmp8 = 8, ftmp9 = 3.
     * ftmp8/9/10 are consumed by VP8_SIMPLE_HPSRAB. Each one is loaded
     * exactly once; none of them is modified inside the loop. */
    "dli %[tmp0], 0x0b \n\t"
    "dmtc1 %[tmp0], %[ftmp10] \n\t"
    "dli %[tmp0], 0x01 \n\t"
    "dmtc1 %[tmp0], %[ftmp11] \n\t"
    "dli %[tmp0], 0x08 \n\t"
    "dmtc1 %[tmp0], %[ftmp8] \n\t"
    "dli %[tmp0], 0x03 \n\t"
    "dmtc1 %[tmp0], %[ftmp9] \n\t"
    /* Byte-replicated constants. */
    "dli %[tmp0], 0xfefefefefefefefe \n\t"
    "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
    "dli %[tmp0], 0x8080808080808080 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
    "dli %[tmp0], 0x0404040404040404 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
    "dli %[tmp0], 0x0101010101010101 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_01] \n\t"

    "1: \n\t"
    /* ftmp3: blimit */
    "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t"
    "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t"

    /* addr0 -> q1 row (src_ptr + step) */
    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])

    /* ftmp2: p1 (src_ptr - 2 * step), ftmp7: q1 */
    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
    "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
    "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
    "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
    /* ftmp1: abs(p1 - q1) / 2; the low bit is cleared first so the halfword
     * shift cannot leak a bit into the neighbouring byte. */
    "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t"
    "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t"
    "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"

    /* ftmp6: p0 (src_ptr - step), ftmp0: q0 (src_ptr) */
    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
    "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
    "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
    /* ftmp5: mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) */
    "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t"
    "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
    "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
    "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
    "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
    "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"

    /* Flip the top bit of each byte so the saturating signed byte
     * instructions can be used. ftmp2 = p1 - q1. */
    "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t"
    "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
    "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
    /* ftmp6: signed p0, ftmp3: signed q0, ftmp0 = q0 - p0 */
    "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
    "pxor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t"
    "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t"
    /* filter_value = (p1 - q1) + 3 * (q0 - p0), then masked */
    "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
    "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
    "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
    "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t"

    /* q0 -= (filter_value + 4) >> 3, written back to src_ptr */
    "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
    VP8_SIMPLE_HPSRAB
    "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
    "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
    "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
    "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"

    /* p0 += (filter_value + 3) >> 3, written back to src_ptr - step.
     * ftmp5 already holds filter_value + 4, hence the subtraction of 1. */
    "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
    VP8_SIMPLE_HPSRAB
    "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
    "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
    "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t"
    "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t"

    /* Next group of eight pixels along the edge. */
    "addiu %[count], %[count], -0x01 \n\t"
    MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
    "bnez %[count], 1b \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]),
      [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
      [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
      [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
      [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
    : [blimit]"r"(blimit),
      [src_pixel_step]"r"((mips_reg)src_pixel_step),
      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
    : "memory"
  );
  /* clang-format on */
}
void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
int src_pixel_step,
const unsigned char *blimit) {
uint64_t tmp[1], count = 2;
mips_reg addr[2];
DECLARE_ALIGNED(8, const uint64_t, srct[2]);
double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
/* clang-format off */
__asm__ volatile (
"dli %[tmp0], 0x08 \n\t"
"dmtc1 %[tmp0], %[ftmp8] \n\t"
"dli %[tmp0], 0x20 \n\t"
"dmtc1 %[tmp0], %[ftmp10] \n\t"
"dli %[tmp0], 0x08 \n\t"
"dmtc1 %[tmp0], %[ftmp8] \n\t"
"dli %[tmp0], 0x20 \n\t"
"dmtc1 %[tmp0], %[ftmp10] \n\t"
"dli %[tmp0], 0xfefefefefefefefe \n\t"
"dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
"dli %[tmp0], 0x8080808080808080 \n\t"
"dmtc1 %[tmp0], %[ff_pb_80] \n\t"
"dli %[tmp0], 0x0404040404040404 \n\t"
"dmtc1 %[tmp0], %[ff_pb_04] \n\t"
"dli %[tmp0], 0x0101010101010101 \n\t"
"dmtc1 %[tmp0], %[ff_pb_01] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
"1: \n\t"
MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
"gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
"gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
"gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
"gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
"punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step])
"gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
"gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
"gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t"
"gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
"punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
"punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
"punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t"
"gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
"gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
"gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
"punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
"gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t"
"gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
"gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
"gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
"punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t"
"punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
"punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
"punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
"punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t"
"punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
"dli %[tmp0], 0x01 \n\t"
"dmtc1 %[tmp0], %[ftmp9] \n\t"
"pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
"pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t"
"psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
"pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
"paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"paddusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
"gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t"
"psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
"pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
"pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
"sdc1 %[ftmp0], 0x00(%[srct]) \n\t"
"sdc1 %[ftmp3], 0x08(%[srct]) \n\t"
"pxor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t"
"pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
"psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
"pxor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t"
"pxor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t"
"psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t"
"paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
"dli %[tmp0], 0x03 \n\t"
"dmtc1 %[tmp0], %[ftmp9] \n\t"
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
"dli %[tmp0], 0x0b \n\t"
"dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t"
"psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
"pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
"psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
"dli %[tmp0], 0x03 \n\t"
"dmtc1 %[tmp0], %[ftmp9] \n\t"
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
"dli %[tmp0], 0x0b \n\t"
"dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
"psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
"paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
"pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
"ldc1 %[ftmp0], 0x00(%[srct]) \n\t"
"ldc1 %[ftmp4], 0x08(%[srct]) \n\t"
"punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t"
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
"punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
"punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
"punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t"
"punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
"gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
"gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
"punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
"punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
"ssrld %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
"gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
"gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
"gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
"gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
"ssrld %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
"gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t"
"gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
"gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
"gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
"gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t"
"gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
"ssrld %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
"gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t"
"gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
"ssrld %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
"gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t"
"gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8])
"addiu %[count], %[count], -0x01 \n\t"
"bnez %[count], 1b \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
[src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
[ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
[ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
: [blimit]"r"(blimit), [srct]"r"(srct),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
[src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
[src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
[src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
: "memory"
);
/* clang-format on */
}
/*
 * Horizontal macroblock-edge filtering.
 *
 * Runs the MB horizontal-edge filter on the luma plane with a count of 2 and
 * on each chroma plane whose pointer is non-null with a count of 1, using
 * the mblim / lim / hev_thr limits taken from lfi.
 */
void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
                             unsigned char *v_ptr, int y_stride, int uv_stride,
                             loop_filter_info *lfi) {
  const unsigned char *const mb_edge_limit = lfi->mblim;
  const unsigned char *const interior_limit = lfi->lim;
  const unsigned char *const hev_threshold = lfi->hev_thr;

  /* Luma */
  vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, mb_edge_limit,
                                        interior_limit, hev_threshold, 2);

  /* Chroma U */
  if (u_ptr) {
    vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, mb_edge_limit,
                                          interior_limit, hev_threshold, 1);
  }

  /* Chroma V */
  if (v_ptr) {
    vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, mb_edge_limit,
                                          interior_limit, hev_threshold, 1);
  }
}
/*
 * Vertical macroblock-edge filtering.
 *
 * Runs the MB vertical-edge filter on the luma plane with a count of 2 and
 * on each chroma plane whose pointer is non-null with a count of 1, using
 * the mblim / lim / hev_thr limits taken from lfi.
 */
void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
                             unsigned char *v_ptr, int y_stride, int uv_stride,
                             loop_filter_info *lfi) {
  const unsigned char *const mb_edge_limit = lfi->mblim;
  const unsigned char *const interior_limit = lfi->lim;
  const unsigned char *const hev_threshold = lfi->hev_thr;

  /* Luma */
  vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, mb_edge_limit,
                                      interior_limit, hev_threshold, 2);

  /* Chroma U */
  if (u_ptr) {
    vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, mb_edge_limit,
                                        interior_limit, hev_threshold, 1);
  }

  /* Chroma V */
  if (v_ptr) {
    vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, mb_edge_limit,
                                        interior_limit, hev_threshold, 1);
  }
}
/*
 * Horizontal inner-block filtering.
 *
 * Filters the horizontal edges at luma rows 4, 8 and 12 (count 2) and, for
 * each chroma plane whose pointer is non-null, the edge at row 4 (count 1),
 * using the blim / lim / hev_thr limits taken from lfi.
 */
void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
                            unsigned char *v_ptr, int y_stride, int uv_stride,
                            loop_filter_info *lfi) {
  const unsigned char *const block_edge_limit = lfi->blim;
  const unsigned char *const interior_limit = lfi->lim;
  const unsigned char *const hev_threshold = lfi->hev_thr;
  int row;

  /* Luma: one edge every four rows. */
  for (row = 4; row <= 12; row += 4) {
    vp8_loop_filter_horizontal_edge_mmi(y_ptr + row * y_stride, y_stride,
                                        block_edge_limit, interior_limit,
                                        hev_threshold, 2);
  }

  /* Chroma U */
  if (u_ptr) {
    vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride,
                                        block_edge_limit, interior_limit,
                                        hev_threshold, 1);
  }

  /* Chroma V */
  if (v_ptr) {
    vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride,
                                        block_edge_limit, interior_limit,
                                        hev_threshold, 1);
  }
}
/*
 * Vertical inner-block filtering.
 *
 * Filters the vertical edges at luma columns 4, 8 and 12 (count 2) and, for
 * each chroma plane whose pointer is non-null, the edge at column 4
 * (count 1), using the blim / lim / hev_thr limits taken from lfi.
 */
void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
                            unsigned char *v_ptr, int y_stride, int uv_stride,
                            loop_filter_info *lfi) {
  const unsigned char *const block_edge_limit = lfi->blim;
  const unsigned char *const interior_limit = lfi->lim;
  const unsigned char *const hev_threshold = lfi->hev_thr;
  int col;

  /* Luma: one edge every four columns. */
  for (col = 4; col <= 12; col += 4) {
    vp8_loop_filter_vertical_edge_mmi(y_ptr + col, y_stride, block_edge_limit,
                                      interior_limit, hev_threshold, 2);
  }

  /* Chroma U */
  if (u_ptr) {
    vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, block_edge_limit,
                                      interior_limit, hev_threshold, 1);
  }

  /* Chroma V */
  if (v_ptr) {
    vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, block_edge_limit,
                                      interior_limit, hev_threshold, 1);
  }
}
/*
 * Simple-filter variant of horizontal inner-block filtering: applies the
 * simple horizontal-edge filter at luma rows 4, 8 and 12.
 */
void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int row;

  for (row = 4; row <= 12; row += 4) {
    vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + row * y_stride,
                                               y_stride, blimit);
  }
}
/*
 * Simple-filter variant of vertical inner-block filtering: applies the
 * simple vertical-edge filter at luma columns 4, 8 and 12.
 */
void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int col;

  for (col = 4; col <= 12; col += 4) {
    vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + col, y_stride, blimit);
  }
}