| /* |
| * Copyright (c) 2017 The WebM project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include "./vpx_dsp_rtcd.h" |
| #include "vpx_dsp/variance.h" |
| #include "vpx_ports/mem.h" |
| #include "vpx/vpx_integer.h" |
| #include "vpx_ports/asmdefs_mmi.h" |
| |
| static const uint8_t bilinear_filters[8][2] = { |
| { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, |
| { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, |
| }; |
| |
| /* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32, |
| vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. */ |
| #define VARIANCE_SSE_SUM_8_FOR_W64 \ |
| /* sse */ \ |
| "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ |
| "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ |
| "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ |
| "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \ |
| "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \ |
| \ |
| /* sum */ \ |
| "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ |
| "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ |
| "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \ |
| "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ |
| "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ |
| "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \ |
| "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \ |
| "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \ |
| "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \ |
| "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ |
| "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ |
| "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \ |
| "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \ |
| "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \ |
| "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \ |
| "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \ |
| "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" |
| |
| #define VARIANCE_SSE_SUM_4 \ |
| /* sse */ \ |
| "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ |
| "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \ |
| "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \ |
| \ |
| /* sum */ \ |
| "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ |
| "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \ |
| "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" |
| |
| #define VARIANCE_SSE_SUM_8 \ |
| /* sse */ \ |
| "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ |
| "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ |
| "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ |
| "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ |
| "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \ |
| \ |
| /* sum */ \ |
| "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ |
| "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \ |
| "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \ |
| "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \ |
| "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" |
| |
| #define VARIANCE_SSE_8 \ |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ |
| "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \ |
| "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ |
| "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ |
| "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ |
| "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ |
| "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" |
| |
| #define VARIANCE_SSE_16 \ |
| VARIANCE_SSE_8 \ |
| "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ |
| "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \ |
| "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ |
| "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ |
| "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ |
| "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ |
| "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ |
| /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ |
| "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ |
| "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ |
| "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \ |
| "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ |
| "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ |
| /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ |
| "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ |
| "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ |
| "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ |
| "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ |
| "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \ |
| /* calculate: temp2[0] ~ temp2[3] */ \ |
| "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ |
| "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ |
| "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ |
| \ |
| /* store: temp2[0] ~ temp2[3] */ \ |
| "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \ |
| "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ |
| "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \ |
| /* calculate: temp2[0] ~ temp2[3] */ \ |
| "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ |
| "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ |
| "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ |
| \ |
| /* store: temp2[0] ~ temp2[3] */ \ |
| "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \ |
| "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ |
| "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ |
| /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ |
| "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ |
| "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ |
| "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \ |
| "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ |
| "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \ |
| "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ |
| "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ |
| "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ |
| "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ |
| "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ |
| /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ |
| "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ |
| "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ |
| "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \ |
| "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ |
| "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \ |
| "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \ |
| "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \ |
| "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \ |
| "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ |
| "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ |
| /* calculate: temp2[0] ~ temp2[3] */ \ |
| "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ |
| "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \ |
| "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ |
| \ |
| /* calculate: temp2[4] ~ temp2[7] */ \ |
| "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \ |
| "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \ |
| "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ |
| \ |
| /* store: temp2[0] ~ temp2[7] */ \ |
| "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \ |
| "pand %[ftmp3], %[ftmp3], %[mask] \n\t" \ |
| "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ |
| "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ |
| "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ |
| /* calculate: temp2[0] ~ temp2[3] */ \ |
| "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \ |
| "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ |
| "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ |
| \ |
| /* calculate: temp2[4] ~ temp2[7] */ \ |
| "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \ |
| "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \ |
| "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ |
| \ |
| /* store: temp2[0] ~ temp2[7] */ \ |
| "pand %[ftmp8], %[ftmp8], %[mask] \n\t" \ |
| "pand %[ftmp9], %[ftmp9], %[mask] \n\t" \ |
| "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ |
| "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ |
| "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \ |
| /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ |
| \ |
| /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ |
| "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ |
| "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ |
| "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ |
| "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \ |
| "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ |
| "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \ |
| "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \ |
| "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ |
| "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ |
| "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ |
| "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \ |
| /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ |
| \ |
| /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ |
| "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ |
| "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ |
| "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ |
| "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ |
| "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ |
| "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ |
| "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \ |
| "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ |
| "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \ |
| "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \ |
| "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \ |
| "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \ |
| "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ |
| "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \ |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ |
| \ |
| /* calculate: temp2[8] ~ temp2[11] */ \ |
| "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ |
| "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \ |
| "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ |
| \ |
| /* calculate: temp2[12] ~ temp2[15] */ \ |
| "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \ |
| "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \ |
| "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ |
| \ |
| /* store: temp2[8] ~ temp2[15] */ \ |
| "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \ |
| "pand %[ftmp5], %[ftmp5], %[mask] \n\t" \ |
| "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ |
| "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ |
| "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" |
| |
| #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \ |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ |
| \ |
| /* calculate: temp2[8] ~ temp2[11] */ \ |
| "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \ |
| "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ |
| "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ |
| \ |
| /* calculate: temp2[12] ~ temp2[15] */ \ |
| "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \ |
| "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ |
| "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \ |
| "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \ |
| "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ |
| \ |
| /* store: temp2[8] ~ temp2[15] */ \ |
| "pand %[ftmp10], %[ftmp10], %[mask] \n\t" \ |
| "pand %[ftmp11], %[ftmp11], %[mask] \n\t" \ |
| "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ |
| "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ |
| "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" |
| |
| // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal |
| // or vertical direction to produce the filtered output block. Used to implement |
| // the first-pass of 2-D separable filter. |
| // |
| // Produces int16_t output to retain precision for the next pass. Two filter |
| // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is |
| // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). |
| // It defines the offset required to move from one input to the next. |
| static void var_filter_block2d_bil_first_pass( |
| const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, |
| int pixel_step, unsigned int output_height, unsigned int output_width, |
| const uint8_t *filter) { |
| unsigned int i, j; |
| |
| for (i = 0; i < output_height; ++i) { |
| for (j = 0; j < output_width; ++j) { |
| ref_ptr[j] = ROUND_POWER_OF_TWO( |
| (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], |
| FILTER_BITS); |
| |
| ++src_ptr; |
| } |
| |
| src_ptr += src_pixels_per_line - output_width; |
| ref_ptr += output_width; |
| } |
| } |
| |
| // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal |
| // or vertical direction to produce the filtered output block. Used to implement |
| // the second-pass of 2-D separable filter. |
| // |
| // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two |
| // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the |
| // filter is applied horizontally (pixel_step = 1) or vertically |
| // (pixel_step = stride). It defines the offset required to move from one input |
| // to the next. Output is 8-bit. |
| static void var_filter_block2d_bil_second_pass( |
| const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, |
| unsigned int pixel_step, unsigned int output_height, |
| unsigned int output_width, const uint8_t *filter) { |
| unsigned int i, j; |
| |
| for (i = 0; i < output_height; ++i) { |
| for (j = 0; j < output_width; ++j) { |
| ref_ptr[j] = ROUND_POWER_OF_TWO( |
| (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], |
| FILTER_BITS); |
| ++src_ptr; |
| } |
| |
| src_ptr += src_pixels_per_line - output_width; |
| ref_ptr += output_width; |
| } |
| } |
| |
| static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, |
| const uint8_t *ref_ptr, int ref_stride, |
| uint32_t *sse, int high) { |
| int sum; |
| double ftmp[12]; |
| uint32_t tmp[3]; |
| |
| *sse = 0; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "li %[tmp0], 0x20 \n\t" |
| "mtc1 %[tmp0], %[ftmp11] \n\t" |
| MMI_L(%[tmp0], %[high], 0x00) |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" |
| "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" |
| "1: \n\t" |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "addiu %[tmp0], %[tmp0], -0x01 \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) |
| "bnez %[tmp0], 1b \n\t" |
| |
| "mfc1 %[tmp1], %[ftmp9] \n\t" |
| "mfhc1 %[tmp2], %[ftmp9] \n\t" |
| "addu %[sum], %[tmp1], %[tmp2] \n\t" |
| "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t" |
| "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" |
| "swc1 %[ftmp1], 0x00(%[sse]) \n\t" |
| : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), |
| [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), |
| [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), |
| [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), |
| [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), |
| [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), |
| [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), |
| [tmp2]"=&r"(tmp[2]), |
| [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), |
| [sum]"=&r"(sum) |
| : [src_stride]"r"((mips_reg)src_stride), |
| [ref_stride]"r"((mips_reg)ref_stride), |
| [high]"r"(&high), [sse]"r"(sse) |
| : "memory" |
| ); |
| /* clang-format on */ |
| |
| return *sse - (((int64_t)sum * sum) / (64 * high)); |
| } |
| |
| #define VPX_VARIANCE64XN(n) \ |
| uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ |
| const uint8_t *ref_ptr, int ref_stride, \ |
| uint32_t *sse) { \ |
| return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ |
| } |
| |
| VPX_VARIANCE64XN(64) |
| VPX_VARIANCE64XN(32) |
| |
| uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, |
| const uint8_t *ref_ptr, int ref_stride, |
| uint32_t *sse) { |
| int sum; |
| double ftmp[12]; |
| uint32_t tmp[3]; |
| |
| *sse = 0; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "li %[tmp0], 0x20 \n\t" |
| "mtc1 %[tmp0], %[ftmp11] \n\t" |
| "li %[tmp0], 0x40 \n\t" |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" |
| "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" |
| "1: \n\t" |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8_FOR_W64 |
| |
| "addiu %[tmp0], %[tmp0], -0x01 \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) |
| "bnez %[tmp0], 1b \n\t" |
| |
| "mfc1 %[tmp1], %[ftmp9] \n\t" |
| "mfhc1 %[tmp2], %[ftmp9] \n\t" |
| "addu %[sum], %[tmp1], %[tmp2] \n\t" |
| "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t" |
| "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" |
| "swc1 %[ftmp1], 0x00(%[sse]) \n\t" |
| : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), |
| [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), |
| [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), |
| [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), |
| [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), |
| [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), |
| [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), |
| [tmp2]"=&r"(tmp[2]), |
| [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), |
| [sum]"=&r"(sum) |
| : [src_stride]"r"((mips_reg)src_stride), |
| [ref_stride]"r"((mips_reg)ref_stride), |
| [sse]"r"(sse) |
| : "memory" |
| ); |
| /* clang-format on */ |
| |
| return *sse - (((int64_t)sum * sum) / 2048); |
| } |
| |
| static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, |
| const uint8_t *ref_ptr, int ref_stride, |
| uint32_t *sse, int high) { |
| int sum; |
| double ftmp[13]; |
| uint32_t tmp[3]; |
| |
| *sse = 0; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "li %[tmp0], 0x20 \n\t" |
| "mtc1 %[tmp0], %[ftmp11] \n\t" |
| MMI_L(%[tmp0], %[high], 0x00) |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" |
| "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" |
| "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" |
| "1: \n\t" |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8 |
| "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8 |
| "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8 |
| "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8 |
| |
| "addiu %[tmp0], %[tmp0], -0x01 \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) |
| "bnez %[tmp0], 1b \n\t" |
| |
| "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" |
| "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" |
| "swc1 %[ftmp9], 0x00(%[sse]) \n\t" |
| |
| "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" |
| "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" |
| "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" |
| "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" |
| "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" |
| "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" |
| "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" |
| "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" |
| "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" |
| "swc1 %[ftmp0], 0x00(%[sum]) \n\t" |
| |
| : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), |
| [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), |
| [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), |
| [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), |
| [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), |
| [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), |
| [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), |
| [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) |
| : [src_stride]"r"((mips_reg)src_stride), |
| [ref_stride]"r"((mips_reg)ref_stride), |
| [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) |
| : "memory" |
| ); |
| /* clang-format on */ |
| |
| return *sse - (((int64_t)sum * sum) / (32 * high)); |
| } |
| |
| #define VPX_VARIANCE32XN(n) \ |
| uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ |
| const uint8_t *ref_ptr, int ref_stride, \ |
| uint32_t *sse) { \ |
| return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ |
| } |
| |
| VPX_VARIANCE32XN(32) |
| VPX_VARIANCE32XN(16) |
| |
| static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, |
| const uint8_t *ref_ptr, int ref_stride, |
| uint32_t *sse, int high) { |
| int sum; |
| double ftmp[13]; |
| uint32_t tmp[3]; |
| |
| *sse = 0; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "li %[tmp0], 0x20 \n\t" |
| "mtc1 %[tmp0], %[ftmp11] \n\t" |
| MMI_L(%[tmp0], %[high], 0x00) |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" |
| "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" |
| "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" |
| "1: \n\t" |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8 |
| "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8 |
| |
| "addiu %[tmp0], %[tmp0], -0x01 \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) |
| "bnez %[tmp0], 1b \n\t" |
| |
| "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" |
| "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" |
| "swc1 %[ftmp9], 0x00(%[sse]) \n\t" |
| |
| "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" |
| "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" |
| "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" |
| "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" |
| "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" |
| "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" |
| "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" |
| "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" |
| "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" |
| "swc1 %[ftmp0], 0x00(%[sum]) \n\t" |
| |
| : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), |
| [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), |
| [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), |
| [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), |
| [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), |
| [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), |
| [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), |
| [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) |
| : [src_stride]"r"((mips_reg)src_stride), |
| [ref_stride]"r"((mips_reg)ref_stride), |
| [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) |
| : "memory" |
| ); |
| /* clang-format on */ |
| |
| return *sse - (((int64_t)sum * sum) / (16 * high)); |
| } |
| |
| #define VPX_VARIANCE16XN(n) \ |
| uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ |
| const uint8_t *ref_ptr, int ref_stride, \ |
| uint32_t *sse) { \ |
| return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ |
| } |
| |
| VPX_VARIANCE16XN(32) |
| VPX_VARIANCE16XN(16) |
| VPX_VARIANCE16XN(8) |
| |
| static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, |
| const uint8_t *ref_ptr, int ref_stride, |
| uint32_t *sse, int high) { |
| int sum; |
| double ftmp[13]; |
| uint32_t tmp[3]; |
| |
| *sse = 0; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "li %[tmp0], 0x20 \n\t" |
| "mtc1 %[tmp0], %[ftmp11] \n\t" |
| MMI_L(%[tmp0], %[high], 0x00) |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" |
| "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" |
| "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" |
| "1: \n\t" |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_8 |
| |
| "addiu %[tmp0], %[tmp0], -0x01 \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) |
| "bnez %[tmp0], 1b \n\t" |
| |
| "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" |
| "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" |
| "swc1 %[ftmp9], 0x00(%[sse]) \n\t" |
| |
| "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" |
| "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" |
| "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" |
| "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" |
| "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" |
| "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" |
| "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" |
| "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" |
| "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" |
| "swc1 %[ftmp0], 0x00(%[sum]) \n\t" |
| |
| : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), |
| [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), |
| [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), |
| [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), |
| [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), |
| [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), |
| [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), |
| [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) |
| : [src_stride]"r"((mips_reg)src_stride), |
| [ref_stride]"r"((mips_reg)ref_stride), |
| [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) |
| : "memory" |
| ); |
| /* clang-format on */ |
| |
| return *sse - (((int64_t)sum * sum) / (8 * high)); |
| } |
| |
| #define VPX_VARIANCE8XN(n) \ |
| uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ |
| const uint8_t *ref_ptr, int ref_stride, \ |
| uint32_t *sse) { \ |
| return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ |
| } |
| |
| VPX_VARIANCE8XN(16) |
| VPX_VARIANCE8XN(8) |
| VPX_VARIANCE8XN(4) |
| |
| static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, |
| const uint8_t *ref_ptr, int ref_stride, |
| uint32_t *sse, int high) { |
| int sum; |
| double ftmp[12]; |
| uint32_t tmp[3]; |
| |
| *sse = 0; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "li %[tmp0], 0x20 \n\t" |
| "mtc1 %[tmp0], %[ftmp10] \n\t" |
| MMI_L(%[tmp0], %[high], 0x00) |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| "pxor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" |
| "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" |
| "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" |
| "1: \n\t" |
| "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" |
| "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" |
| "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" |
| "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" |
| VARIANCE_SSE_SUM_4 |
| |
| "addiu %[tmp0], %[tmp0], -0x01 \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) |
| "bnez %[tmp0], 1b \n\t" |
| |
| "ssrld %[ftmp9], %[ftmp6], %[ftmp10] \n\t" |
| "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" |
| "swc1 %[ftmp9], 0x00(%[sse]) \n\t" |
| |
| "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t" |
| "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t" |
| "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t" |
| "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t" |
| "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" |
| "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" |
| "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" |
| "ssrld %[ftmp0], %[ftmp3], %[ftmp10] \n\t" |
| "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" |
| "swc1 %[ftmp0], 0x00(%[sum]) \n\t" |
| : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), |
| [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), |
| [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), |
| [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), |
| [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), |
| [ftmp10]"=&f"(ftmp[10]), |
| [tmp0]"=&r"(tmp[0]), |
| [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) |
| : [src_stride]"r"((mips_reg)src_stride), |
| [ref_stride]"r"((mips_reg)ref_stride), |
| [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) |
| : "memory" |
| ); |
| /* clang-format on */ |
| |
| return *sse - (((int64_t)sum * sum) / (4 * high)); |
| } |
| |
| #define VPX_VARIANCE4XN(n) \ |
| uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ |
| const uint8_t *ref_ptr, int ref_stride, \ |
| uint32_t *sse) { \ |
| return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ |
| } |
| |
| VPX_VARIANCE4XN(8) |
| VPX_VARIANCE4XN(4) |
| |
| static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, |
| const uint8_t *ref_ptr, int ref_stride, |
| uint32_t *sse, uint64_t high) { |
| double ftmp[12]; |
| uint32_t tmp[1]; |
| |
| *sse = 0; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "li %[tmp0], 0x20 \n\t" |
| "mtc1 %[tmp0], %[ftmp11] \n\t" |
| MMI_L(%[tmp0], %[high], 0x00) |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" |
| |
| "1: \n\t" |
| VARIANCE_SSE_16 |
| |
| "addiu %[tmp0], %[tmp0], -0x01 \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) |
| "bnez %[tmp0], 1b \n\t" |
| |
| "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" |
| "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" |
| "swc1 %[ftmp9], 0x00(%[sse]) \n\t" |
| : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), |
| [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), |
| [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), |
| [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), |
| [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), |
| [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), |
| [tmp0]"=&r"(tmp[0]), |
| [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) |
| : [src_stride]"r"((mips_reg)src_stride), |
| [ref_stride]"r"((mips_reg)ref_stride), |
| [high]"r"(&high), [sse]"r"(sse) |
| : "memory" |
| ); |
| /* clang-format on */ |
| |
| return *sse; |
| } |
| |
| #define vpx_mse16xN(n) \ |
| uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ |
| const uint8_t *ref_ptr, int ref_stride, \ |
| uint32_t *sse) { \ |
| return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ |
| } |
| |
| vpx_mse16xN(16); |
| vpx_mse16xN(8); |
| |
| static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, |
| const uint8_t *ref_ptr, int ref_stride, |
| uint32_t *sse, uint64_t high) { |
| double ftmp[12]; |
| uint32_t tmp[1]; |
| |
| *sse = 0; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "li %[tmp0], 0x20 \n\t" |
| "mtc1 %[tmp0], %[ftmp11] \n\t" |
| MMI_L(%[tmp0], %[high], 0x00) |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" |
| |
| "1: \n\t" |
| VARIANCE_SSE_8 |
| |
| "addiu %[tmp0], %[tmp0], -0x01 \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) |
| "bnez %[tmp0], 1b \n\t" |
| |
| "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" |
| "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" |
| "swc1 %[ftmp9], 0x00(%[sse]) \n\t" |
| : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), |
| [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), |
| [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), |
| [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), |
| [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), |
| [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), |
| [tmp0]"=&r"(tmp[0]), |
| [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) |
| : [src_stride]"r"((mips_reg)src_stride), |
| [ref_stride]"r"((mips_reg)ref_stride), |
| [high]"r"(&high), [sse]"r"(sse) |
| : "memory" |
| ); |
| /* clang-format on */ |
| |
| return *sse; |
| } |
| |
| #define vpx_mse8xN(n) \ |
| uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ |
| const uint8_t *ref_ptr, int ref_stride, \ |
| uint32_t *sse) { \ |
| return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ |
| } |
| |
| vpx_mse8xN(16); |
| vpx_mse8xN(8); |
| |
| #define SUBPIX_VAR(W, H) \ |
| uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ |
| const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ |
| const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ |
| uint16_t fdata3[((H) + 1) * (W)]; \ |
| uint8_t temp2[(H) * (W)]; \ |
| \ |
| var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ |
| W, bilinear_filters[x_offset]); \ |
| var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ |
| bilinear_filters[y_offset]); \ |
| \ |
| return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \ |
| } |
| |
| SUBPIX_VAR(64, 64) |
| SUBPIX_VAR(64, 32) |
| SUBPIX_VAR(32, 64) |
| SUBPIX_VAR(32, 32) |
| SUBPIX_VAR(32, 16) |
| SUBPIX_VAR(16, 32) |
| |
| static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, |
| int src_stride, int x_offset, |
| int y_offset, uint8_t *temp2, |
| int counter) { |
| uint8_t *temp2_ptr = temp2; |
| mips_reg l_counter = counter; |
| double ftmp[15]; |
| double ff_ph_40, mask; |
| double filter_x0, filter_x1, filter_y0, filter_y1; |
| mips_reg tmp[2]; |
| uint64_t x0, x1, y0, y1, all; |
| |
| const uint8_t *filter_x = bilinear_filters[x_offset]; |
| const uint8_t *filter_y = bilinear_filters[y_offset]; |
| x0 = (uint64_t)filter_x[0]; |
| x1 = (uint64_t)filter_x[1]; |
| y0 = (uint64_t)filter_y[0]; |
| y1 = (uint64_t)filter_y[1]; |
| all = x0 | x1 << 8 | y0 << 16 | y1 << 24; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| MMI_MTC1(%[all], %[ftmp14]) |
| "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" |
| "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" |
| MMI_LI(%[tmp0], 0x10) |
| MMI_MTC1(%[tmp0], %[mask]) |
| "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" |
| "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" |
| "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" |
| "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" |
| "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" |
| "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" |
| MMI_LI(%[tmp0], 0x07) |
| MMI_MTC1(%[tmp0], %[ftmp14]) |
| MMI_LI(%[tmp0], 0x0040004000400040) |
| MMI_MTC1(%[tmp0], %[ff_ph_40]) |
| MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) |
| MMI_MTC1(%[tmp0], %[mask]) |
| // fdata3: fdata3[0] ~ fdata3[15] |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A |
| |
| // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15] |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B |
| // temp2: temp2[0] ~ temp2[15] |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A |
| |
| // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15] |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A |
| // temp2+16*1: temp2[0] ~ temp2[15] |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B |
| |
| "1: \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A |
| |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B |
| "addiu %[counter], %[counter], -0x01 \n\t" |
| "bnez %[counter], 1b \n\t" |
| : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), |
| [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), |
| [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), |
| [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), |
| [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), |
| [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), |
| [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), |
| [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), |
| [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), |
| [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) |
| : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) |
| : "memory" |
| ); |
| /* clang-format on */ |
| } |
| |
| #define SUBPIX_VAR16XN(H) \ |
| uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ |
| const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ |
| const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ |
| uint8_t temp2[16 * (H)]; \ |
| var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \ |
| ((H)-2) / 2); \ |
| \ |
| return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \ |
| } |
| |
| SUBPIX_VAR16XN(16) |
| SUBPIX_VAR16XN(8) |
| |
| static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, |
| int src_stride, int x_offset, |
| int y_offset, uint8_t *temp2, |
| int counter) { |
| uint8_t *temp2_ptr = temp2; |
| mips_reg l_counter = counter; |
| double ftmp[15]; |
| mips_reg tmp[2]; |
| double ff_ph_40, mask; |
| uint64_t x0, x1, y0, y1, all; |
| double filter_x0, filter_x1, filter_y0, filter_y1; |
| const uint8_t *filter_x = bilinear_filters[x_offset]; |
| const uint8_t *filter_y = bilinear_filters[y_offset]; |
| x0 = (uint64_t)filter_x[0]; |
| x1 = (uint64_t)filter_x[1]; |
| y0 = (uint64_t)filter_y[0]; |
| y1 = (uint64_t)filter_y[1]; |
| all = x0 | x1 << 8 | y0 << 16 | y1 << 24; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| MMI_MTC1(%[all], %[ftmp14]) |
| "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" |
| "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" |
| MMI_LI(%[tmp0], 0x10) |
| MMI_MTC1(%[tmp0], %[mask]) |
| "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" |
| "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" |
| "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" |
| "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" |
| "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" |
| "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| MMI_LI(%[tmp0], 0x07) |
| MMI_MTC1(%[tmp0], %[ftmp14]) |
| MMI_LI(%[tmp0], 0x0040004000400040) |
| MMI_MTC1(%[tmp0], %[ff_ph_40]) |
| MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) |
| MMI_MTC1(%[tmp0], %[mask]) |
| |
| // fdata3: fdata3[0] ~ fdata3[7] |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A |
| |
| // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7] |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B |
| // temp2: temp2[0] ~ temp2[7] |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A |
| |
| // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7] |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A |
| // temp2+8*1: temp2[0] ~ temp2[7] |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B |
| |
| "1: \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A |
| |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B |
| "addiu %[counter], %[counter], -0x01 \n\t" |
| "bnez %[counter], 1b \n\t" |
| : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), |
| [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), |
| [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), |
| [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), |
| [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), |
| [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), |
| [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), |
| [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), |
| [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), |
| [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) |
| : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) |
| : "memory" |
| ); |
| /* clang-format on */ |
| } |
| |
| #define SUBPIX_VAR8XN(H) \ |
| uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ |
| const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ |
| const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ |
| uint8_t temp2[8 * (H)]; \ |
| var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \ |
| ((H)-2) / 2); \ |
| \ |
| return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \ |
| } |
| |
| SUBPIX_VAR8XN(16) |
| SUBPIX_VAR8XN(8) |
| SUBPIX_VAR8XN(4) |
| |
| static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, |
| int src_stride, int x_offset, |
| int y_offset, uint8_t *temp2, |
| int counter) { |
| uint8_t *temp2_ptr = temp2; |
| mips_reg l_counter = counter; |
| double ftmp[7]; |
| mips_reg tmp[2]; |
| double ff_ph_40, mask; |
| uint64_t x0, x1, y0, y1, all; |
| double filter_x0, filter_x1, filter_y0, filter_y1; |
| const uint8_t *filter_x = bilinear_filters[x_offset]; |
| const uint8_t *filter_y = bilinear_filters[y_offset]; |
| x0 = (uint64_t)filter_x[0]; |
| x1 = (uint64_t)filter_x[1]; |
| y0 = (uint64_t)filter_y[0]; |
| y1 = (uint64_t)filter_y[1]; |
| all = x0 | x1 << 8 | y0 << 16 | y1 << 24; |
| |
| /* clang-format off */ |
| __asm__ volatile ( |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| MMI_MTC1(%[all], %[ftmp6]) |
| "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" |
| "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t" |
| MMI_LI(%[tmp0], 0x10) |
| MMI_MTC1(%[tmp0], %[mask]) |
| "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" |
| "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t" |
| "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" |
| "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t" |
| "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" |
| "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t" |
| "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" |
| MMI_LI(%[tmp0], 0x07) |
| MMI_MTC1(%[tmp0], %[ftmp6]) |
| MMI_LI(%[tmp0], 0x0040004000400040) |
| MMI_MTC1(%[tmp0], %[ff_ph_40]) |
| MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) |
| MMI_MTC1(%[tmp0], %[mask]) |
| // fdata3: fdata3[0] ~ fdata3[3] |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A |
| |
| // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3] |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B |
| // temp2: temp2[0] ~ temp2[7] |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A |
| |
| // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3] |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A |
| // temp2+4*1: temp2[0] ~ temp2[7] |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B |
| |
| "1: \n\t" |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A |
| |
| MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) |
| VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A |
| MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) |
| VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B |
| "addiu %[counter], %[counter], -0x01 \n\t" |
| "bnez %[counter], 1b \n\t" |
| : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), |
| [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), |
| [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), |
| [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter), |
| [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), |
| [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), |
| [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) |
| : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) |
| : "memory" |
| ); |
| /* clang-format on */ |
| } |
| |
| #define SUBPIX_VAR4XN(H) \ |
| uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ |
| const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ |
| const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ |
| uint8_t temp2[4 * (H)]; \ |
| var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \ |
| ((H)-2) / 2); \ |
| \ |
| return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \ |
| } |
| |
| SUBPIX_VAR4XN(8) |
| SUBPIX_VAR4XN(4) |
| |
| #define SUBPIX_AVG_VAR(W, H) \ |
| uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ |
| const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ |
| const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ |
| const uint8_t *second_pred) { \ |
| uint16_t fdata3[((H) + 1) * (W)]; \ |
| uint8_t temp2[(H) * (W)]; \ |
| DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \ |
| \ |
| var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ |
| W, bilinear_filters[x_offset]); \ |
| var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ |
| bilinear_filters[y_offset]); \ |
| \ |
| vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ |
| \ |
| return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \ |
| } |
| |
| SUBPIX_AVG_VAR(64, 64) |
| SUBPIX_AVG_VAR(64, 32) |
| SUBPIX_AVG_VAR(32, 64) |
| SUBPIX_AVG_VAR(32, 32) |
| SUBPIX_AVG_VAR(32, 16) |
| SUBPIX_AVG_VAR(16, 32) |
| SUBPIX_AVG_VAR(16, 16) |
| SUBPIX_AVG_VAR(16, 8) |
| SUBPIX_AVG_VAR(8, 16) |
| SUBPIX_AVG_VAR(8, 8) |
| SUBPIX_AVG_VAR(8, 4) |
| SUBPIX_AVG_VAR(4, 8) |
| SUBPIX_AVG_VAR(4, 4) |