| /* |
| * Copyright © 2018, VideoLAN and dav1d authors |
| * Copyright © 2020, Martin Storsjo |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, this |
| * list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "src/arm/asm.S" |
| #include "util.S" |
| |
| const right_ext_mask_buf |
| .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| right_ext_mask: |
| .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
| .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
| .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
| .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
| .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
| .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
| endconst |
| |
| // void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, |
| // const pixel (*left)[4], const pixel *lpf, |
| // const int w, int h, |
| // const int16_t filter[2][8], |
| // const enum LrEdgeFlags edges, |
| // const int bitdepth_max); |
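| // The work is split between three helpers defined below: wiener_filter7_h |
| // (horizontal pass into a temporary row), wiener_filter7_v (vertical pass |
| // over the temporary rows into p) and wiener_filter7_hv (both passes fused, |
| // used for the main loop). |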
| function wiener_filter7_16bpc_neon, export=1 |
| ldr w8, [sp] |
| AARCH64_SIGN_LINK_REGISTER |
| stp x29, x30, [sp, #-32]! |
| stp d8, d9, [sp, #16] |
| mov x29, sp |
| ld1 {v0.8h, v1.8h}, [x6] |
| tst w7, #4 // LR_HAVE_TOP |
| sub_sp 384*2*6 |
| |
| dup v28.8h, w8 // bitdepth_max |
| clz w8, w8 |
| movi v30.4s, #1 |
| sub w10, w8, #38 // -(bitdepth + 6) |
| sub w11, w8, #11 // round_bits_v |
| sub w8, w8, #25 // -round_bits_h |
| neg w10, w10 // bitdepth + 6 |
| neg w11, w11 // -round_bits_v |
| dup v2.4s, w10 |
| dup v29.4s, w8 // -round_bits_h |
| dup v27.4s, w11 // -round_bits_v |
| movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 |
| ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) |
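| // E.g. for 10 bpc: bitdepth_max = 0x3ff, clz = 22, so v30 = 1 << 16, |
| // round_bits_h = 3 and round_bits_v = 11; for 12 bpc this becomes |
| // 1 << 18, 5 and 9 respectively. |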
| |
| zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 |
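| // The 7-tap filters are symmetric, so only the first four taps of each are |
| // kept: v0.h[0-3] for the horizontal filter and v0.h[4-7] for the vertical |
| // one (h[3]/h[7] being the center taps). Mirrored sample pairs are added |
| // before multiplying in the horizontal pass, and mirrored rows reuse the |
| // same coefficient in the vertical pass. |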
| |
| // x9 - t6 |
| // x10 - t5 |
| // x11 - t4 |
| // x12 - t3 |
| // x13 - t2 |
| // x14 - t1 |
| // x15 - t0 |
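| // The t* pointers address six temporary rows of 384 int16_t each in the |
| // stack space reserved by sub_sp above; after each hv call they are |
| // rotated (x9 takes x10's value, etc.) so the row window moves down by one. |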
| mov x14, sp // t1 |
| b.eq L(no_top_7) |
| |
| mov x16, x2 // backup left |
| mov x2, #0 |
| bl wiener_filter7_h_16bpc_neon |
| add x3, x3, x1 // lpf += stride |
| mov x9, x14 // t6 |
| mov x10, x14 // t5 |
| add x14, x14, #384*2 // t1 += 384*2 |
| bl wiener_filter7_h_16bpc_neon |
| add x3, x3, x1, lsl #2 |
| add x3, x3, x1 // lpf += stride*5 |
| mov x11, x14 // t4 |
| add x14, x14, #384*2 // t1 += 384*2 |
| mov x2, x16 // left |
| mov x16, x3 // backup lpf |
| mov x3, x0 // lpf = p |
| bl wiener_filter7_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| mov x12, x14 // t3 |
| mov x13, x14 // t2 |
| b.eq L(v1_7) |
| add x3, x3, x1 // src += stride |
| add x14, x14, #384*2 // t1 += 384*2 |
| bl wiener_filter7_h_16bpc_neon |
| mov x13, x14 // t2 |
| subs w5, w5, #1 // h-- |
| b.eq L(v2_7) |
| add x3, x3, x1 // src += stride |
| add x14, x14, #384*2 // t1 += 384*2 |
| bl wiener_filter7_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.eq L(v3_7) |
| add x3, x3, x1 // src += stride |
| |
| L(main_7): |
| add x15, x14, #384*2 // t0 = t1 + 384*2 |
| L(main_loop_7): |
| bl wiener_filter7_hv_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.ne L(main_loop_7) |
| tst w7, #8 // LR_HAVE_BOTTOM |
| b.eq L(v3_7) |
| |
| mov x3, x16 // restore lpf |
| mov x2, #0 // left = NULL |
| bl wiener_filter7_hv_16bpc_neon |
| bl wiener_filter7_hv_16bpc_neon |
| L(v1_7): |
| bl wiener_filter7_v_16bpc_neon |
| |
| mov sp, x29 |
| ldp d8, d9, [sp, #16] |
| ldp x29, x30, [sp], #32 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(no_top_7): |
| add x3, x3, x1, lsl #2 |
| add x16, x3, x1, lsl #1 // lpf += stride*6, backup |
| mov x3, x0 // lpf = p |
| |
| bl wiener_filter7_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| mov x9, x14 // t6 |
| mov x10, x14 // t5 |
| mov x11, x14 // t4 |
| mov x12, x14 // t3 |
| mov x13, x14 // t2 |
| b.eq L(v1_7) |
| add x3, x3, x1 // src += p_stride |
| add x14, x14, #384*2 // t1 += 384*2 |
| bl wiener_filter7_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| mov x13, x14 // t2 |
| b.eq L(v2_7) |
| add x3, x3, x1 // src += p_stride |
| add x14, x14, #384*2 // t1 += 384*2 |
| bl wiener_filter7_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.eq L(v3_7) |
| add x3, x3, x1 // src += p_stride |
| add x15, x14, #384*2 // t0 = t1 + 384*2 |
| bl wiener_filter7_hv_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.eq L(v3_7) |
| add x15, x15, #384*2*4 // t0 += 384*2*4 |
| bl wiener_filter7_hv_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.ne L(main_7) |
| L(v3_7): |
| bl wiener_filter7_v_16bpc_neon |
| L(v2_7): |
| bl wiener_filter7_v_16bpc_neon |
| b L(v1_7) |
| endfunc |
| |
| |
| function wiener_filter7_h_16bpc_neon |
| stp x3, x4, [sp, #-32]! |
| str x14, [sp, #16] |
| |
| // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case |
| tst w7, #1 // LR_HAVE_LEFT |
| b.eq 1f |
| // LR_HAVE_LEFT |
| cbnz x2, 0f |
| // left == NULL |
| sub x3, x3, #6 |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| b 2f |
| |
| 0: |
| // LR_HAVE_LEFT, left != NULL |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| ld1 {v4.d}[1], [x2], #8 |
| // Move x3 back to account for the last 3 pixels we loaded earlier, |
| // which we'll shift out. |
| sub x3, x3, #6 |
| ext v3.16b, v2.16b, v3.16b, #10 |
| ext v2.16b, v4.16b, v2.16b, #10 |
| b 2f |
| |
| 1: |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| // !LR_HAVE_LEFT, fill v4 with the leftmost pixel |
| // and shift v2/v3 so that v2 has 3x the first pixel at the front. |
| dup v4.8h, v2.h[0] |
| // Move x3 back to account for the last 3 pixels we loaded before, |
| // which we shifted out. |
| sub x3, x3, #6 |
| ext v3.16b, v2.16b, v3.16b, #10 |
| ext v2.16b, v4.16b, v2.16b, #10 |
| |
| 2: |
| ld1 {v4.8h}, [x3], #16 |
| |
| tst w7, #2 // LR_HAVE_RIGHT |
| b.ne 4f |
| |
| 3: // !LR_HAVE_RIGHT |
| |
| // Check whether we need to pad the right edge |
| cmp w4, #19 |
| b.ge 4f // If w >= 19, all used input pixels are valid |
| |
| // 1 <= w < 19, w+3 pixels valid in v2-v4. For w == 17 or 18, |
| // this ends up called again; it's not strictly needed in those |
| // cases (we pad enough here), but keeping the code as simple as possible. |
| |
| // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie |
| // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. |
| sub w17, w4, #22 |
| // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the |
| // buffer pointer. |
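| // For example, with w == 5 the last valid pixel is v2.h[7]: w17 becomes |
| // 5 - 22 = -17, so the ldr below reads h[24 - 17] == h[7], and the mask |
| // load starts 2*(5+3) bytes before right_ext_mask, keeping all of v2 and |
| // replacing v3/v4 entirely with the padding pixel. |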
| movrel x6, right_ext_mask, -6 |
| ldr h26, [x3, w17, sxtw #1] |
| sub x6, x6, w4, uxtw #1 |
| dup v26.8h, v26.h[0] |
| ld1 {v23.16b, v24.16b, v25.16b}, [x6] |
| |
| bit v2.16b, v26.16b, v23.16b |
| bit v3.16b, v26.16b, v24.16b |
| bit v4.16b, v26.16b, v25.16b |
| |
| 4: // Loop horizontally |
| // Interleaving the mul/mla chains actually hurts performance |
| // significantly on Cortex A53, thus keeping mul/mla tightly |
| // chained like this. |
| ext v17.16b, v2.16b, v3.16b, #4 |
| ext v19.16b, v2.16b, v3.16b, #8 |
| ext v16.16b, v2.16b, v3.16b, #2 |
| ext v20.16b, v2.16b, v3.16b, #10 |
| ext v21.16b, v2.16b, v3.16b, #12 |
| ext v18.16b, v2.16b, v3.16b, #6 |
| add v19.8h, v19.8h, v17.8h |
| add v20.8h, v20.8h, v16.8h |
| add v21.8h, v21.8h, v2.8h |
| smull v6.4s, v18.4h, v0.h[3] |
| smlal v6.4s, v19.4h, v0.h[2] |
| smlal v6.4s, v20.4h, v0.h[1] |
| smlal v6.4s, v21.4h, v0.h[0] |
| smull2 v7.4s, v18.8h, v0.h[3] |
| smlal2 v7.4s, v19.8h, v0.h[2] |
| smlal2 v7.4s, v20.8h, v0.h[1] |
| smlal2 v7.4s, v21.8h, v0.h[0] |
| |
| ext v17.16b, v3.16b, v4.16b, #4 |
| ext v19.16b, v3.16b, v4.16b, #8 |
| ext v16.16b, v3.16b, v4.16b, #2 |
| ext v20.16b, v3.16b, v4.16b, #10 |
| ext v21.16b, v3.16b, v4.16b, #12 |
| ext v18.16b, v3.16b, v4.16b, #6 |
| |
| add v19.8h, v19.8h, v17.8h |
| add v20.8h, v20.8h, v16.8h |
| add v21.8h, v21.8h, v3.8h |
| smull v16.4s, v18.4h, v0.h[3] |
| smlal v16.4s, v19.4h, v0.h[2] |
| smlal v16.4s, v20.4h, v0.h[1] |
| smlal v16.4s, v21.4h, v0.h[0] |
| smull2 v17.4s, v18.8h, v0.h[3] |
| smlal2 v17.4s, v19.8h, v0.h[2] |
| smlal2 v17.4s, v20.8h, v0.h[1] |
| smlal2 v17.4s, v21.8h, v0.h[0] |
| |
| mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 |
| add v6.4s, v6.4s, v30.4s |
| add v7.4s, v7.4s, v30.4s |
| add v16.4s, v16.4s, v30.4s |
| add v17.4s, v17.4s, v30.4s |
| srshl v6.4s, v6.4s, v29.4s |
| srshl v7.4s, v7.4s, v29.4s |
| srshl v16.4s, v16.4s, v29.4s |
| srshl v17.4s, v17.4s, v29.4s |
| sqxtun v6.4h, v6.4s |
| sqxtun2 v6.8h, v7.4s |
| sqxtun v7.4h, v16.4s |
| sqxtun2 v7.8h, v17.4s |
| umin v6.8h, v6.8h, v24.8h |
| umin v7.8h, v7.8h, v24.8h |
| sub v6.8h, v6.8h, v31.8h |
| sub v7.8h, v7.8h, v31.8h |
| |
| subs w4, w4, #16 |
| |
| st1 {v6.8h, v7.8h}, [x14], #32 |
| |
| b.le 0f |
| mov v2.16b, v4.16b |
| tst w7, #2 // LR_HAVE_RIGHT |
| ld1 {v3.8h, v4.8h}, [x3], #32 |
| b.ne 4b // If we don't need to pad, just keep filtering. |
| b 3b // If we need to pad, check how many pixels we have left. |
| |
| 0: |
| ldr x14, [sp, #16] |
| ldp x3, x4, [sp], #32 |
| ret |
| endfunc |
| |
| function wiener_filter7_v_16bpc_neon |
| // Backing up/restoring registers shifted, so that x9 gets the value |
| // of x10, etc, afterwards. |
| stp x10, x11, [sp, #-64]! |
| stp x12, x13, [sp, #16] |
| stp x14, x14, [sp, #32] |
| stp x0, x4, [sp, #48] |
| 1: |
| ld1 {v16.8h, v17.8h}, [x9], #32 |
| ld1 {v18.8h, v19.8h}, [x10], #32 |
| ld1 {v20.8h, v21.8h}, [x11], #32 |
| ld1 {v22.8h, v23.8h}, [x12], #32 |
| ld1 {v24.8h, v25.8h}, [x13], #32 |
| ld1 {v6.8h, v7.8h}, [x14], #32 |
| |
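| // Only six rows are loaded; the x14 row (v6/v7) is multiplied by both |
| // v0.h[5] and v0.h[4], which effectively repeats the bottommost available |
| // row in place of the missing seventh row. |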
| smull v2.4s, v16.4h, v0.h[4] |
| smlal v2.4s, v18.4h, v0.h[5] |
| smlal v2.4s, v20.4h, v0.h[6] |
| smlal v2.4s, v22.4h, v0.h[7] |
| smlal v2.4s, v24.4h, v0.h[6] |
| smlal v2.4s, v6.4h, v0.h[5] |
| smlal v2.4s, v6.4h, v0.h[4] |
| smull2 v3.4s, v16.8h, v0.h[4] |
| smlal2 v3.4s, v18.8h, v0.h[5] |
| smlal2 v3.4s, v20.8h, v0.h[6] |
| smlal2 v3.4s, v22.8h, v0.h[7] |
| smlal2 v3.4s, v24.8h, v0.h[6] |
| smlal2 v3.4s, v6.8h, v0.h[5] |
| smlal2 v3.4s, v6.8h, v0.h[4] |
| smull v4.4s, v17.4h, v0.h[4] |
| smlal v4.4s, v19.4h, v0.h[5] |
| smlal v4.4s, v21.4h, v0.h[6] |
| smlal v4.4s, v23.4h, v0.h[7] |
| smlal v4.4s, v25.4h, v0.h[6] |
| smlal v4.4s, v7.4h, v0.h[5] |
| smlal v4.4s, v7.4h, v0.h[4] |
| smull2 v5.4s, v17.8h, v0.h[4] |
| smlal2 v5.4s, v19.8h, v0.h[5] |
| smlal2 v5.4s, v21.8h, v0.h[6] |
| smlal2 v5.4s, v23.8h, v0.h[7] |
| smlal2 v5.4s, v25.8h, v0.h[6] |
| smlal2 v5.4s, v7.8h, v0.h[5] |
| smlal2 v5.4s, v7.8h, v0.h[4] |
| srshl v2.4s, v2.4s, v27.4s // -round_bits_v |
| srshl v3.4s, v3.4s, v27.4s |
| srshl v4.4s, v4.4s, v27.4s |
| srshl v5.4s, v5.4s, v27.4s |
| sqxtun v2.4h, v2.4s |
| sqxtun2 v2.8h, v3.4s |
| sqxtun v3.4h, v4.4s |
| sqxtun2 v3.8h, v5.4s |
| umin v2.8h, v2.8h, v28.8h // bitdepth_max |
| umin v3.8h, v3.8h, v28.8h |
| subs w4, w4, #16 |
| st1 {v2.8h, v3.8h}, [x0], #32 |
| b.gt 1b |
| |
| ldp x0, x4, [sp, #48] |
| ldp x13, x14, [sp, #32] |
| ldp x11, x12, [sp, #16] |
| ldp x9, x10, [sp], #64 |
| |
| add x0, x0, x1 |
| ret |
| endfunc |
| |
| function wiener_filter7_hv_16bpc_neon |
| // Backing up/restoring registers shifted, so that x9 gets the value |
| // of x10, etc, and x15==x9, afterwards. |
| stp x10, x11, [sp, #-80]! |
| stp x12, x13, [sp, #16] |
| stp x14, x15, [sp, #32] |
| stp x10, x0, [sp, #48] |
| stp x3, x4, [sp, #64] |
| |
| // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case |
| tst w7, #1 // LR_HAVE_LEFT |
| b.eq 1f |
| // LR_HAVE_LEFT |
| cbnz x2, 0f |
| // left == NULL |
| sub x3, x3, #6 |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| b 2f |
| |
| 0: |
| // LR_HAVE_LEFT, left != NULL |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| ld1 {v4.d}[1], [x2], #8 |
| // Move x3 back to account for the last 3 pixels we loaded earlier, |
| // which we'll shift out. |
| sub x3, x3, #6 |
| ext v3.16b, v2.16b, v3.16b, #10 |
| ext v2.16b, v4.16b, v2.16b, #10 |
| b 2f |
| 1: |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| // !LR_HAVE_LEFT, fill v4 with the leftmost pixel |
| // and shift v2/v3 so that v2 has 3x the first pixel at the front. |
| dup v4.8h, v2.h[0] |
| // Move x3 back to account for the last 3 pixels we loaded before, |
| // which we shifted out. |
| sub x3, x3, #6 |
| ext v3.16b, v2.16b, v3.16b, #10 |
| ext v2.16b, v4.16b, v2.16b, #10 |
| |
| 2: |
| ld1 {v4.8h}, [x3], #16 |
| |
| tst w7, #2 // LR_HAVE_RIGHT |
| b.ne 4f |
| |
| 3: // !LR_HAVE_RIGHT |
| |
| // Check whether we need to pad the right edge |
| cmp w4, #19 |
| b.ge 4f // If w >= 19, all used input pixels are valid |
| |
| // 1 <= w < 19, w+3 pixels valid in v2-v4. For w == 17 or 18, |
| // this ends up called again; it's not strictly needed in those |
| // cases (we pad enough here), but keeping the code as simple as possible. |
| |
| // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie |
| // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. |
| sub w17, w4, #22 |
| // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the |
| // buffer pointer. |
| movrel x6, right_ext_mask, -6 |
| ldr h26, [x3, w17, sxtw #1] |
| sub x6, x6, w4, uxtw #1 |
| dup v26.8h, v26.h[0] |
| ld1 {v23.16b, v24.16b, v25.16b}, [x6] |
| |
| bit v2.16b, v26.16b, v23.16b |
| bit v3.16b, v26.16b, v24.16b |
| bit v4.16b, v26.16b, v25.16b |
| |
| 4: // Loop horizontally |
| ext v17.16b, v2.16b, v3.16b, #4 |
| ext v19.16b, v2.16b, v3.16b, #8 |
| ext v16.16b, v2.16b, v3.16b, #2 |
| ext v20.16b, v2.16b, v3.16b, #10 |
| ext v21.16b, v2.16b, v3.16b, #12 |
| ext v18.16b, v2.16b, v3.16b, #6 |
| add v19.8h, v19.8h, v17.8h |
| add v20.8h, v20.8h, v16.8h |
| add v21.8h, v21.8h, v2.8h |
| smull v6.4s, v18.4h, v0.h[3] |
| smlal v6.4s, v19.4h, v0.h[2] |
| smlal v6.4s, v20.4h, v0.h[1] |
| smlal v6.4s, v21.4h, v0.h[0] |
| smull2 v7.4s, v18.8h, v0.h[3] |
| smlal2 v7.4s, v19.8h, v0.h[2] |
| smlal2 v7.4s, v20.8h, v0.h[1] |
| smlal2 v7.4s, v21.8h, v0.h[0] |
| |
| ext v17.16b, v3.16b, v4.16b, #4 |
| ext v19.16b, v3.16b, v4.16b, #8 |
| ext v16.16b, v3.16b, v4.16b, #2 |
| ext v20.16b, v3.16b, v4.16b, #10 |
| ext v21.16b, v3.16b, v4.16b, #12 |
| ext v18.16b, v3.16b, v4.16b, #6 |
| |
| add v19.8h, v19.8h, v17.8h |
| add v20.8h, v20.8h, v16.8h |
| add v21.8h, v21.8h, v3.8h |
| smull v24.4s, v18.4h, v0.h[3] |
| smlal v24.4s, v19.4h, v0.h[2] |
| smlal v24.4s, v20.4h, v0.h[1] |
| smlal v24.4s, v21.4h, v0.h[0] |
| smull2 v25.4s, v18.8h, v0.h[3] |
| smlal2 v25.4s, v19.8h, v0.h[2] |
| smlal2 v25.4s, v20.8h, v0.h[1] |
| smlal2 v25.4s, v21.8h, v0.h[0] |
| |
| ld1 {v16.8h, v17.8h}, [x9], #32 |
| |
| mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 |
| add v6.4s, v6.4s, v30.4s |
| add v7.4s, v7.4s, v30.4s |
| add v24.4s, v24.4s, v30.4s |
| add v25.4s, v25.4s, v30.4s |
| ld1 {v18.8h, v19.8h}, [x10], #32 |
| srshl v6.4s, v6.4s, v29.4s |
| srshl v7.4s, v7.4s, v29.4s |
| srshl v24.4s, v24.4s, v29.4s |
| srshl v25.4s, v25.4s, v29.4s |
| ld1 {v20.8h, v21.8h}, [x11], #32 |
| sqxtun v6.4h, v6.4s |
| sqxtun2 v6.8h, v7.4s |
| sqxtun v7.4h, v24.4s |
| sqxtun2 v7.8h, v25.4s |
| ld1 {v22.8h, v23.8h}, [x12], #32 |
| umin v6.8h, v6.8h, v26.8h |
| umin v7.8h, v7.8h, v26.8h |
| ld1 {v24.8h, v25.8h}, [x13], #32 |
| sub v6.8h, v6.8h, v31.8h |
| sub v7.8h, v7.8h, v31.8h |
| |
| ld1 {v8.8h, v9.8h}, [x14], #32 |
| |
| smull v1.4s, v16.4h, v0.h[4] |
| smlal v1.4s, v18.4h, v0.h[5] |
| smlal v1.4s, v20.4h, v0.h[6] |
| smlal v1.4s, v22.4h, v0.h[7] |
| smlal v1.4s, v24.4h, v0.h[6] |
| smlal v1.4s, v8.4h, v0.h[5] |
| smlal v1.4s, v6.4h, v0.h[4] |
| smull2 v5.4s, v16.8h, v0.h[4] |
| smlal2 v5.4s, v18.8h, v0.h[5] |
| smlal2 v5.4s, v20.8h, v0.h[6] |
| smlal2 v5.4s, v22.8h, v0.h[7] |
| smlal2 v5.4s, v24.8h, v0.h[6] |
| smlal2 v5.4s, v8.8h, v0.h[5] |
| smlal2 v5.4s, v6.8h, v0.h[4] |
| smull v26.4s, v17.4h, v0.h[4] |
| smlal v26.4s, v19.4h, v0.h[5] |
| smlal v26.4s, v21.4h, v0.h[6] |
| smlal v26.4s, v23.4h, v0.h[7] |
| smlal v26.4s, v25.4h, v0.h[6] |
| smlal v26.4s, v9.4h, v0.h[5] |
| smlal v26.4s, v7.4h, v0.h[4] |
| smull2 v16.4s, v17.8h, v0.h[4] |
| smlal2 v16.4s, v19.8h, v0.h[5] |
| smlal2 v16.4s, v21.8h, v0.h[6] |
| smlal2 v16.4s, v23.8h, v0.h[7] |
| smlal2 v16.4s, v25.8h, v0.h[6] |
| smlal2 v16.4s, v9.8h, v0.h[5] |
| smlal2 v16.4s, v7.8h, v0.h[4] |
| srshl v1.4s, v1.4s, v27.4s // -round_bits_v |
| srshl v5.4s, v5.4s, v27.4s |
| srshl v26.4s, v26.4s, v27.4s |
| srshl v16.4s, v16.4s, v27.4s |
| sqxtun v18.4h, v1.4s |
| sqxtun2 v18.8h, v5.4s |
| sqxtun v19.4h, v26.4s |
| sqxtun2 v19.8h, v16.4s |
| st1 {v6.8h, v7.8h}, [x15], #32 |
| umin v18.8h, v18.8h, v28.8h // bitdepth_max |
| umin v19.8h, v19.8h, v28.8h |
| subs w4, w4, #16 |
| |
| st1 {v18.8h, v19.8h}, [x0], #32 |
| |
| b.le 0f |
| mov v2.16b, v4.16b |
| tst w7, #2 // LR_HAVE_RIGHT |
| ld1 {v3.8h, v4.8h}, [x3], #32 |
| b.ne 4b // If we don't need to pad, just keep filtering. |
| b 3b // If we need to pad, check how many pixels we have left. |
| |
| 0: |
| ldp x3, x4, [sp, #64] |
| ldp x15, x0, [sp, #48] |
| ldp x13, x14, [sp, #32] |
| ldp x11, x12, [sp, #16] |
| ldp x9, x10, [sp], #80 |
| |
| add x3, x3, x1 |
| add x0, x0, x1 |
| |
| ret |
| endfunc |
| |
| // void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, |
| // const pixel (*left)[4], const pixel *lpf, |
| // const int w, int h, |
| // const int16_t filter[2][8], |
| // const enum LrEdgeFlags edges, |
| // const int bitdepth_max); |
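| // Same overall structure as wiener_filter7 above, with a 5-tap filter |
| // and a smaller (four row) temporary buffer. |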
| function wiener_filter5_16bpc_neon, export=1 |
| ldr w8, [sp] |
| AARCH64_SIGN_LINK_REGISTER |
| stp x29, x30, [sp, #-32]! |
| stp d8, d9, [sp, #16] |
| mov x29, sp |
| ld1 {v0.8h, v1.8h}, [x6] |
| tst w7, #4 // LR_HAVE_TOP |
| sub_sp 384*2*4 |
| |
| dup v28.8h, w8 // bitdepth_max |
| clz w8, w8 |
| movi v30.4s, #1 |
| sub w10, w8, #38 // -(bitdepth + 6) |
| sub w11, w8, #11 // round_bits_v |
| sub w8, w8, #25 // -round_bits_h |
| neg w10, w10 // bitdepth + 6 |
| neg w11, w11 // -round_bits_v |
| dup v2.4s, w10 |
| dup v29.4s, w8 // -round_bits_h |
| dup v27.4s, w11 // -round_bits_v |
| movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 |
| ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) |
| |
| zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 |
| |
| // x11 - t4 |
| // x12 - t3 |
| // x13 - t2 |
| // x14 - t1 |
| // x15 - t0 |
| mov x14, sp // t1 |
| b.eq L(no_top_5) |
| |
| mov x16, x2 // backup left |
| mov x2, #0 |
| bl wiener_filter5_h_16bpc_neon |
| add x3, x3, x1 // lpf += stride |
| mov x11, x14 // t4 |
| add x14, x14, #384*2 // t1 += 384*2 |
| bl wiener_filter5_h_16bpc_neon |
| add x3, x3, x1, lsl #2 |
| add x3, x3, x1 // lpf += stride*5 |
| mov x12, x14 // t3 |
| add x14, x14, #384*2 // t1 += 384*2 |
| mov x2, x16 // left |
| mov x16, x3 // backup lpf |
| mov x3, x0 // lpf = p |
| bl wiener_filter5_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| mov x13, x14 // t2 |
| b.eq L(v1_5) |
| add x3, x3, x1 // src += stride |
| add x14, x14, #384*2 // t1 += 384*2 |
| bl wiener_filter5_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.eq L(v2_5) |
| add x3, x3, x1 // src += stride |
| |
| L(main_5): |
| mov x15, x11 // t0 = t4 |
| L(main_loop_5): |
| bl wiener_filter5_hv_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.ne L(main_loop_5) |
| tst w7, #8 // LR_HAVE_BOTTOM |
| b.eq L(v2_5) |
| |
| mov x3, x16 // restore lpf |
| mov x2, #0 // left = NULL |
| bl wiener_filter5_hv_16bpc_neon |
| bl wiener_filter5_hv_16bpc_neon |
| L(end_5): |
| |
| mov sp, x29 |
| ldp d8, d9, [sp, #16] |
| ldp x29, x30, [sp], #32 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| L(no_top_5): |
| add x3, x3, x1, lsl #2 |
| add x16, x3, x1, lsl #1 // lpf += stride*6, backup |
| mov x3, x0 // lpf = p |
| |
| bl wiener_filter5_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| mov x11, x14 // t4 |
| mov x12, x14 // t3 |
| mov x13, x14 // t2 |
| b.eq L(v1_5) |
| add x3, x3, x1 // src += stride |
| add x14, x14, #384*2 // t1 += 384*2 |
| bl wiener_filter5_h_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.eq L(v2_5) |
| add x3, x3, x1 // src += stride |
| add x15, x14, #384*2 // t0 = t1 + 384*2 |
| bl wiener_filter5_hv_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.eq L(v2_5) |
| add x15, x15, #384*2*3 // t0 += 384*2*3 |
| bl wiener_filter5_hv_16bpc_neon |
| subs w5, w5, #1 // h-- |
| b.ne L(main_5) |
| L(v2_5): |
| bl wiener_filter5_v_16bpc_neon |
| add x0, x0, x1 |
| mov x11, x12 |
| mov x12, x13 |
| mov x13, x14 |
| L(v1_5): |
| bl wiener_filter5_v_16bpc_neon |
| b L(end_5) |
| endfunc |
| |
| |
| function wiener_filter5_h_16bpc_neon |
| stp x3, x4, [sp, #-32]! |
| str x14, [sp, #16] |
| |
| // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case |
| tst w7, #1 // LR_HAVE_LEFT |
| b.eq 1f |
| // LR_HAVE_LEFT |
| cbnz x2, 0f |
| // left == NULL |
| sub x3, x3, #4 |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| b 2f |
| |
| 0: |
| // LR_HAVE_LEFT, left != NULL |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| ld1 {v4.d}[1], [x2], #8 |
| // Move x3 back to account for the last 2 pixels we loaded earlier, |
| // which we'll shift out. |
| sub x3, x3, #4 |
| ext v3.16b, v2.16b, v3.16b, #12 |
| ext v2.16b, v4.16b, v2.16b, #12 |
| b 2f |
| |
| 1: |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| // !LR_HAVE_LEFT, fill v4 with the leftmost pixel |
| // and shift v2/v3 so that v2 has 2x the first pixel at the front. |
| dup v4.8h, v2.h[0] |
| // Move x3 back to account for the last 2 pixels we loaded before, |
| // which we shifted out. |
| sub x3, x3, #4 |
| ext v3.16b, v2.16b, v3.16b, #12 |
| ext v2.16b, v4.16b, v2.16b, #12 |
| |
| 2: |
| ld1 {v4.8h}, [x3], #16 |
| |
| tst w7, #2 // LR_HAVE_RIGHT |
| b.ne 4f |
| |
| 3: // !LR_HAVE_RIGHT |
| |
| // Check whether we need to pad the right edge |
| cmp w4, #18 |
| b.ge 4f // If w >= 18, all used input pixels are valid |
| |
| // 1 <= w < 18, w+2 pixels valid in v2-v4. For w == 17, |
| // this ends up called again; it's not strictly needed in that |
| // case (we pad enough here), but keeping the code as simple as possible. |
| |
| // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie |
| // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. |
| sub w17, w4, #23 |
| // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the |
| // buffer pointer. |
| movrel x6, right_ext_mask, -4 |
| ldr h26, [x3, w17, sxtw #1] |
| sub x6, x6, w4, uxtw #1 |
| dup v26.8h, v26.h[0] |
| ld1 {v23.16b, v24.16b, v25.16b}, [x6] |
| |
| bit v2.16b, v26.16b, v23.16b |
| bit v3.16b, v26.16b, v24.16b |
| bit v4.16b, v26.16b, v25.16b |
| |
| 4: // Loop horizontally |
| // Interleaving the mul/mla chains actually hurts performance |
| // significantly on Cortex A53, thus keeping mul/mla tightly |
| // chained like this. |
| ext v16.16b, v2.16b, v3.16b, #2 |
| ext v18.16b, v2.16b, v3.16b, #6 |
| ext v19.16b, v2.16b, v3.16b, #8 |
| ext v17.16b, v2.16b, v3.16b, #4 |
| add v18.8h, v18.8h, v16.8h |
| add v19.8h, v19.8h, v2.8h |
| smull v6.4s, v17.4h, v0.h[3] |
| smlal v6.4s, v18.4h, v0.h[2] |
| smlal v6.4s, v19.4h, v0.h[1] |
| smull2 v7.4s, v17.8h, v0.h[3] |
| smlal2 v7.4s, v18.8h, v0.h[2] |
| smlal2 v7.4s, v19.8h, v0.h[1] |
| |
| ext v16.16b, v3.16b, v4.16b, #2 |
| ext v18.16b, v3.16b, v4.16b, #6 |
| ext v19.16b, v3.16b, v4.16b, #8 |
| ext v17.16b, v3.16b, v4.16b, #4 |
| add v18.8h, v18.8h, v16.8h |
| add v19.8h, v19.8h, v3.8h |
| smull v16.4s, v17.4h, v0.h[3] |
| smlal v16.4s, v18.4h, v0.h[2] |
| smlal v16.4s, v19.4h, v0.h[1] |
| smull2 v17.4s, v17.8h, v0.h[3] |
| smlal2 v17.4s, v18.8h, v0.h[2] |
| smlal2 v17.4s, v19.8h, v0.h[1] |
| |
| mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 |
| add v6.4s, v6.4s, v30.4s |
| add v7.4s, v7.4s, v30.4s |
| add v16.4s, v16.4s, v30.4s |
| add v17.4s, v17.4s, v30.4s |
| srshl v6.4s, v6.4s, v29.4s |
| srshl v7.4s, v7.4s, v29.4s |
| srshl v16.4s, v16.4s, v29.4s |
| srshl v17.4s, v17.4s, v29.4s |
| sqxtun v6.4h, v6.4s |
| sqxtun2 v6.8h, v7.4s |
| sqxtun v7.4h, v16.4s |
| sqxtun2 v7.8h, v17.4s |
| umin v6.8h, v6.8h, v24.8h |
| umin v7.8h, v7.8h, v24.8h |
| sub v6.8h, v6.8h, v31.8h |
| sub v7.8h, v7.8h, v31.8h |
| |
| subs w4, w4, #16 |
| |
| st1 {v6.8h, v7.8h}, [x14], #32 |
| |
| b.le 0f |
| mov v2.16b, v4.16b |
| tst w7, #2 // LR_HAVE_RIGHT |
| ld1 {v3.8h, v4.8h}, [x3], #32 |
| b.ne 4b // If we don't need to pad, just keep filtering. |
| b 3b // If we need to pad, check how many pixels we have left. |
| |
| 0: |
| ldr x14, [sp, #16] |
| ldp x3, x4, [sp], #32 |
| ret |
| endfunc |
| |
| function wiener_filter5_v_16bpc_neon |
| stp x11, x12, [sp, #-48]! |
| stp x13, x14, [sp, #16] |
| stp x0, x4, [sp, #32] |
| 1: |
| ld1 {v16.8h, v17.8h}, [x11], #32 |
| ld1 {v18.8h, v19.8h}, [x12], #32 |
| ld1 {v20.8h, v21.8h}, [x13], #32 |
| ld1 {v22.8h, v23.8h}, [x14], #32 |
| |
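| // Only four rows are loaded; the x14 row (v22/v23) is multiplied by both |
| // v0.h[6] and v0.h[5], repeating the bottommost available row in place of |
| // the missing fifth row. |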
| smull v2.4s, v16.4h, v0.h[5] |
| smlal v2.4s, v18.4h, v0.h[6] |
| smlal v2.4s, v20.4h, v0.h[7] |
| smlal v2.4s, v22.4h, v0.h[6] |
| smlal v2.4s, v22.4h, v0.h[5] |
| smull2 v3.4s, v16.8h, v0.h[5] |
| smlal2 v3.4s, v18.8h, v0.h[6] |
| smlal2 v3.4s, v20.8h, v0.h[7] |
| smlal2 v3.4s, v22.8h, v0.h[6] |
| smlal2 v3.4s, v22.8h, v0.h[5] |
| smull v4.4s, v17.4h, v0.h[5] |
| smlal v4.4s, v19.4h, v0.h[6] |
| smlal v4.4s, v21.4h, v0.h[7] |
| smlal v4.4s, v23.4h, v0.h[6] |
| smlal v4.4s, v23.4h, v0.h[5] |
| smull2 v5.4s, v17.8h, v0.h[5] |
| smlal2 v5.4s, v19.8h, v0.h[6] |
| smlal2 v5.4s, v21.8h, v0.h[7] |
| smlal2 v5.4s, v23.8h, v0.h[6] |
| smlal2 v5.4s, v23.8h, v0.h[5] |
| srshl v2.4s, v2.4s, v27.4s // -round_bits_v |
| srshl v3.4s, v3.4s, v27.4s |
| srshl v4.4s, v4.4s, v27.4s |
| srshl v5.4s, v5.4s, v27.4s |
| sqxtun v2.4h, v2.4s |
| sqxtun2 v2.8h, v3.4s |
| sqxtun v3.4h, v4.4s |
| sqxtun2 v3.8h, v5.4s |
| umin v2.8h, v2.8h, v28.8h // bitdepth_max |
| umin v3.8h, v3.8h, v28.8h |
| |
| subs w4, w4, #16 |
| st1 {v2.8h, v3.8h}, [x0], #32 |
| b.gt 1b |
| |
| ldp x0, x4, [sp, #32] |
| ldp x13, x14, [sp, #16] |
| ldp x11, x12, [sp], #48 |
| |
| ret |
| endfunc |
| |
| function wiener_filter5_hv_16bpc_neon |
| // Backing up/restoring registers shifted, so that x11 gets the value |
| // of x12, etc, and x15==x11, afterwards. |
| stp x12, x13, [sp, #-64]! |
| stp x14, x15, [sp, #16] |
| stp x12, x0, [sp, #32] |
| stp x3, x4, [sp, #48] |
| |
| // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case |
| tst w7, #1 // LR_HAVE_LEFT |
| b.eq 1f |
| // LR_HAVE_LEFT |
| cbnz x2, 0f |
| // left == NULL |
| sub x3, x3, #4 |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| b 2f |
| |
| 0: |
| // LR_HAVE_LEFT, left != NULL |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| ld1 {v4.d}[1], [x2], #8 |
| // Move x3 back to account for the last 2 pixels we loaded earlier, |
| // which we'll shift out. |
| sub x3, x3, #4 |
| ext v3.16b, v2.16b, v3.16b, #12 |
| ext v2.16b, v4.16b, v2.16b, #12 |
| b 2f |
| 1: |
| ld1 {v2.8h, v3.8h}, [x3], #32 |
| // !LR_HAVE_LEFT, fill v4 with the leftmost pixel |
| // and shift v2/v3 so that v2 has 2x the first pixel at the front. |
| dup v4.8h, v2.h[0] |
| // Move x3 back to account for the last 2 pixels we loaded before, |
| // which we shifted out. |
| sub x3, x3, #4 |
| ext v3.16b, v2.16b, v3.16b, #12 |
| ext v2.16b, v4.16b, v2.16b, #12 |
| |
| 2: |
| ld1 {v4.8h}, [x3], #16 |
| |
| tst w7, #2 // LR_HAVE_RIGHT |
| b.ne 4f |
| |
| 3: // !LR_HAVE_RIGHT |
| |
| // Check whether we need to pad the right edge |
| cmp w4, #18 |
| b.ge 4f // If w >= 18, all used input pixels are valid |
| |
| // 1 <= w < 18, w+2 pixels valid in v2-v4. For w == 17, |
| // this ends up called again; it's not strictly needed in that |
| // case (we pad enough here), but keeping the code as simple as possible. |
| |
| // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie |
| // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. |
| sub w17, w4, #23 |
| // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the |
| // buffer pointer. |
| movrel x6, right_ext_mask, -4 |
| ldr h26, [x3, w17, sxtw #1] |
| sub x6, x6, w4, uxtw #1 |
| dup v26.8h, v26.h[0] |
| ld1 {v23.16b, v24.16b, v25.16b}, [x6] |
| |
| bit v2.16b, v26.16b, v23.16b |
| bit v3.16b, v26.16b, v24.16b |
| bit v4.16b, v26.16b, v25.16b |
| |
| 4: // Loop horizontally |
| ext v16.16b, v2.16b, v3.16b, #2 |
| ext v18.16b, v2.16b, v3.16b, #6 |
| ext v19.16b, v2.16b, v3.16b, #8 |
| ext v17.16b, v2.16b, v3.16b, #4 |
| add v18.8h, v18.8h, v16.8h |
| add v19.8h, v19.8h, v2.8h |
| smull v6.4s, v17.4h, v0.h[3] |
| smlal v6.4s, v18.4h, v0.h[2] |
| smlal v6.4s, v19.4h, v0.h[1] |
| smull2 v7.4s, v17.8h, v0.h[3] |
| smlal2 v7.4s, v18.8h, v0.h[2] |
| smlal2 v7.4s, v19.8h, v0.h[1] |
| |
| ext v16.16b, v3.16b, v4.16b, #2 |
| ext v18.16b, v3.16b, v4.16b, #6 |
| ext v19.16b, v3.16b, v4.16b, #8 |
| ext v17.16b, v3.16b, v4.16b, #4 |
| add v18.8h, v18.8h, v16.8h |
| add v19.8h, v19.8h, v3.8h |
| smull v24.4s, v17.4h, v0.h[3] |
| smlal v24.4s, v18.4h, v0.h[2] |
| smlal v24.4s, v19.4h, v0.h[1] |
| smull2 v25.4s, v17.8h, v0.h[3] |
| smlal2 v25.4s, v18.8h, v0.h[2] |
| smlal2 v25.4s, v19.8h, v0.h[1] |
| |
| ld1 {v16.8h, v17.8h}, [x11], #32 |
| mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 |
| add v6.4s, v6.4s, v30.4s |
| add v7.4s, v7.4s, v30.4s |
| add v24.4s, v24.4s, v30.4s |
| add v25.4s, v25.4s, v30.4s |
| ld1 {v18.8h, v19.8h}, [x12], #32 |
| srshl v6.4s, v6.4s, v29.4s |
| srshl v7.4s, v7.4s, v29.4s |
| srshl v24.4s, v24.4s, v29.4s |
| srshl v25.4s, v25.4s, v29.4s |
| ld1 {v20.8h, v21.8h}, [x13], #32 |
| sqxtun v6.4h, v6.4s |
| sqxtun2 v6.8h, v7.4s |
| sqxtun v7.4h, v24.4s |
| sqxtun2 v7.8h, v25.4s |
| ld1 {v22.8h, v23.8h}, [x14], #32 |
| umin v6.8h, v6.8h, v26.8h |
| umin v7.8h, v7.8h, v26.8h |
| sub v6.8h, v6.8h, v31.8h |
| sub v7.8h, v7.8h, v31.8h |
| |
| smull v8.4s, v16.4h, v0.h[5] |
| smlal v8.4s, v18.4h, v0.h[6] |
| smlal v8.4s, v20.4h, v0.h[7] |
| smlal v8.4s, v22.4h, v0.h[6] |
| smlal v8.4s, v6.4h, v0.h[5] |
| smull2 v9.4s, v16.8h, v0.h[5] |
| smlal2 v9.4s, v18.8h, v0.h[6] |
| smlal2 v9.4s, v20.8h, v0.h[7] |
| smlal2 v9.4s, v22.8h, v0.h[6] |
| smlal2 v9.4s, v6.8h, v0.h[5] |
| smull v1.4s, v17.4h, v0.h[5] |
| smlal v1.4s, v19.4h, v0.h[6] |
| smlal v1.4s, v21.4h, v0.h[7] |
| smlal v1.4s, v23.4h, v0.h[6] |
| smlal v1.4s, v7.4h, v0.h[5] |
| smull2 v5.4s, v17.8h, v0.h[5] |
| smlal2 v5.4s, v19.8h, v0.h[6] |
| smlal2 v5.4s, v21.8h, v0.h[7] |
| smlal2 v5.4s, v23.8h, v0.h[6] |
| smlal2 v5.4s, v7.8h, v0.h[5] |
| srshl v8.4s, v8.4s, v27.4s // -round_bits_v |
| srshl v9.4s, v9.4s, v27.4s |
| srshl v1.4s, v1.4s, v27.4s |
| srshl v5.4s, v5.4s, v27.4s |
| sqxtun v8.4h, v8.4s |
| sqxtun2 v8.8h, v9.4s |
| sqxtun v9.4h, v1.4s |
| sqxtun2 v9.8h, v5.4s |
| st1 {v6.8h, v7.8h}, [x15], #32 |
| umin v8.8h, v8.8h, v28.8h // bitdepth_max |
| umin v9.8h, v9.8h, v28.8h |
| |
| subs w4, w4, #16 |
| |
| st1 {v8.8h, v9.8h}, [x0], #32 |
| |
| b.le 0f |
| mov v2.16b, v4.16b |
| tst w7, #2 // LR_HAVE_RIGHT |
| ld1 {v3.8h, v4.8h}, [x3], #32 |
| b.ne 4b // If we don't need to pad, just keep filtering. |
| b 3b // If we need to pad, check how many pixels we have left. |
| |
| 0: |
| ldp x3, x4, [sp, #48] |
| ldp x15, x0, [sp, #32] |
| ldp x13, x14, [sp, #16] |
| ldp x11, x12, [sp], #64 |
| |
| add x3, x3, x1 |
| add x0, x0, x1 |
| |
| ret |
| endfunc |
| |
| #define SUM_STRIDE (384+16) |
| |
| #include "looprestoration_tmpl.S" |
| |
| // void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, |
| // const pixel (*left)[4], |
| // const pixel *src, const ptrdiff_t stride, |
| // const int w, const int h, |
| // const enum LrEdgeFlags edges); |
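| // Computes the horizontal part of the 3x3 box sums: for two rows at a |
| // time, each output entry of sum/sumsq holds the sum (and sum of squares) |
| // of 3 consecutive input pixels. |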
| function sgr_box3_h_16bpc_neon, export=1 |
| add w5, w5, #2 // w += 2 |
| |
| // Set up pointers for reading/writing alternate rows |
| add x10, x0, #(4*SUM_STRIDE) // sumsq |
| add x11, x1, #(2*SUM_STRIDE) // sum |
| add x12, x3, x4 // src |
| lsl x4, x4, #1 |
| mov x9, #(2*2*SUM_STRIDE) // double sum stride |
| |
| // Subtract the aligned width from the output stride. |
| add w13, w5, #7 |
| bic w13, w13, #7 |
| sub x9, x9, w13, uxtw #1 |
| |
| // Store the width for the vertical loop |
| mov w8, w5 |
| |
| // Subtract the number of pixels read from the input from the stride |
| add w13, w13, #8 |
| sub x4, x4, w13, uxtw #1 |
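| // (The horizontal loop below writes the aligned width worth of sum/sumsq |
| // entries per row and reads 8 pixels more than that; the adjustments above |
| // compensate for how far the pointers advance before stepping to the next |
| // row pair.) |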
| |
| // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case |
| tst w7, #1 // LR_HAVE_LEFT |
| b.eq 2f |
| // LR_HAVE_LEFT |
| cbnz x2, 0f |
| // left == NULL |
| sub x3, x3, #4 |
| sub x12, x12, #4 |
| b 1f |
| 0: // LR_HAVE_LEFT, left != NULL |
| 2: // !LR_HAVE_LEFT, increase the stride. |
| // For this case we don't read the left 2 pixels from the src pointer, |
| // but adjust the stride as if we had done that. |
| add x4, x4, #4 |
| |
| |
| 1: // Loop vertically |
| ld1 {v0.8h, v1.8h}, [x3], #32 |
| ld1 {v16.8h, v17.8h}, [x12], #32 |
| |
| tst w7, #1 // LR_HAVE_LEFT |
| b.eq 0f |
| cbz x2, 2f |
| // LR_HAVE_LEFT, left != NULL |
| ld1 {v2.d}[1], [x2], #8 |
| // Move x3/x12 back to account for the last 2 pixels we loaded earlier, |
| // which we'll shift out. |
| sub x3, x3, #4 |
| sub x12, x12, #4 |
| ld1 {v18.d}[1], [x2], #8 |
| ext v1.16b, v0.16b, v1.16b, #12 |
| ext v0.16b, v2.16b, v0.16b, #12 |
| ext v17.16b, v16.16b, v17.16b, #12 |
| ext v16.16b, v18.16b, v16.16b, #12 |
| b 2f |
| 0: |
| // !LR_HAVE_LEFT, fill v2/v18 with the leftmost pixel of each row |
| // and shift v0/v16 to have 2x the first pixel at the front. |
| dup v2.8h, v0.h[0] |
| dup v18.8h, v16.h[0] |
| // Move x3/x12 back to account for the last 2 pixels we loaded before, |
| // which we shifted out. |
| sub x3, x3, #4 |
| sub x12, x12, #4 |
| ext v1.16b, v0.16b, v1.16b, #12 |
| ext v0.16b, v2.16b, v0.16b, #12 |
| ext v17.16b, v16.16b, v17.16b, #12 |
| ext v16.16b, v18.16b, v16.16b, #12 |
| |
| 2: |
| tst w7, #2 // LR_HAVE_RIGHT |
| b.ne 4f |
| // If we'll need to pad the right edge, load that pixel to pad with |
| // here since we can find it pretty easily from here. |
| sub w13, w5, #(2 + 16 - 2 + 1) |
| ldr h30, [x3, w13, sxtw #1] |
| ldr h31, [x12, w13, sxtw #1] |
| // Fill v30/v31 with the right padding pixel |
| dup v30.8h, v30.h[0] |
| dup v31.8h, v31.h[0] |
| 3: // !LR_HAVE_RIGHT |
| |
| // Check whether we need to pad the right edge |
| cmp w5, #10 |
| b.ge 4f // If w >= 10, all used input pixels are valid |
| |
| // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called |
| // again; it's not strictly needed in those cases (we pad enough here), |
| // but keeping the code as simple as possible. |
| |
| // Insert padding in v0/1.h[w] onwards |
| movrel x13, right_ext_mask |
| sub x13, x13, w5, uxtw #1 |
| ld1 {v28.16b, v29.16b}, [x13] |
| |
| bit v0.16b, v30.16b, v28.16b |
| bit v1.16b, v30.16b, v29.16b |
| bit v16.16b, v31.16b, v28.16b |
| bit v17.16b, v31.16b, v29.16b |
| |
| 4: // Loop horizontally |
| ext v26.16b, v0.16b, v1.16b, #2 |
| ext v28.16b, v16.16b, v17.16b, #2 |
| ext v27.16b, v0.16b, v1.16b, #4 |
| ext v29.16b, v16.16b, v17.16b, #4 |
| |
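| // v6/v7: horizontal sums of each pixel and its two right neighbours; |
| // v22-v25: the corresponding sums of squares, widened to 32 bit. |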
| add v6.8h, v0.8h, v26.8h |
| umull v22.4s, v0.4h, v0.4h |
| umlal v22.4s, v26.4h, v26.4h |
| umlal v22.4s, v27.4h, v27.4h |
| add v7.8h, v16.8h, v28.8h |
| umull v24.4s, v16.4h, v16.4h |
| umlal v24.4s, v28.4h, v28.4h |
| umlal v24.4s, v29.4h, v29.4h |
| add v6.8h, v6.8h, v27.8h |
| umull2 v23.4s, v0.8h, v0.8h |
| umlal2 v23.4s, v26.8h, v26.8h |
| umlal2 v23.4s, v27.8h, v27.8h |
| add v7.8h, v7.8h, v29.8h |
| umull2 v25.4s, v16.8h, v16.8h |
| umlal2 v25.4s, v28.8h, v28.8h |
| umlal2 v25.4s, v29.8h, v29.8h |
| |
| subs w5, w5, #8 |
| |
| st1 {v6.8h}, [x1], #16 |
| st1 {v7.8h}, [x11], #16 |
| st1 {v22.4s,v23.4s}, [x0], #32 |
| st1 {v24.4s,v25.4s}, [x10], #32 |
| |
| b.le 9f |
| tst w7, #2 // LR_HAVE_RIGHT |
| mov v0.16b, v1.16b |
| mov v16.16b, v17.16b |
| ld1 {v1.8h}, [x3], #16 |
| ld1 {v17.8h}, [x12], #16 |
| |
| b.ne 4b // If we don't need to pad, just keep summing. |
| b 3b // If we need to pad, check how many pixels we have left. |
| |
| 9: |
| subs w6, w6, #2 |
| b.le 0f |
| // Jump to the next row and loop horizontally |
| add x0, x0, x9, lsl #1 |
| add x10, x10, x9, lsl #1 |
| add x1, x1, x9 |
| add x11, x11, x9 |
| add x3, x3, x4 |
| add x12, x12, x4 |
| mov w5, w8 |
| b 1b |
| 0: |
| ret |
| endfunc |
| |
| // void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, |
| // const pixel (*left)[4], |
| // const pixel *src, const ptrdiff_t stride, |
| // const int w, const int h, |
| // const enum LrEdgeFlags edges); |
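| // As sgr_box3_h above, but computing the horizontal part of the 5x5 box |
| // sums, i.e. sums (and sums of squares) of 5 consecutive input pixels. |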
| function sgr_box5_h_16bpc_neon, export=1 |
| add w5, w5, #2 // w += 2 |
| |
| // Set up pointers for reading/writing alternate rows |
| add x10, x0, #(4*SUM_STRIDE) // sumsq |
| add x11, x1, #(2*SUM_STRIDE) // sum |
| add x12, x3, x4 // src |
| lsl x4, x4, #1 |
| mov x9, #(2*2*SUM_STRIDE) // double sum stride |
| |
| // Subtract the aligned width from the output stride. |
| add w13, w5, #7 |
| bic w13, w13, #7 |
| sub x9, x9, w13, uxtw #1 |
| add w13, w13, #8 |
| sub x4, x4, w13, uxtw #1 |
| |
| // Store the width for the vertical loop |
| mov w8, w5 |
| |
| // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT, left == NULL case |
| tst w7, #1 // LR_HAVE_LEFT |
| b.eq 2f |
| // LR_HAVE_LEFT |
| cbnz x2, 0f |
| // left == NULL |
| sub x3, x3, #6 |
| sub x12, x12, #6 |
| b 1f |
| 0: // LR_HAVE_LEFT, left != NULL |
| 2: // !LR_HAVE_LEFT, increase the stride. |
| // For this case we don't read the left 3 pixels from the src pointer, |
| // but adjust the stride as if we had done that. |
| add x4, x4, #6 |
| |
| 1: // Loop vertically |
| ld1 {v0.8h, v1.8h}, [x3], #32 |
| ld1 {v16.8h, v17.8h}, [x12], #32 |
| |
| tst w7, #1 // LR_HAVE_LEFT |
| b.eq 0f |
| cbz x2, 2f |
| // LR_HAVE_LEFT, left != NULL |
| ld1 {v2.d}[1], [x2], #8 |
| // Move x3/x12 back to account for the last 3 pixels we loaded earlier, |
| // which we'll shift out. |
| sub x3, x3, #6 |
| sub x12, x12, #6 |
| ld1 {v18.d}[1], [x2], #8 |
| ext v1.16b, v0.16b, v1.16b, #10 |
| ext v0.16b, v2.16b, v0.16b, #10 |
| ext v17.16b, v16.16b, v17.16b, #10 |
| ext v16.16b, v18.16b, v16.16b, #10 |
| b 2f |
| 0: |
| // !LR_HAVE_LEFT, fill v2/v18 with the leftmost pixel of each row |
| // and shift v0/v16 to have 3x the first pixel at the front. |
| dup v2.8h, v0.h[0] |
| dup v18.8h, v16.h[0] |
| // Move x3/x12 back to account for the last 3 pixels we loaded before, |
| // which we shifted out. |
| sub x3, x3, #6 |
| sub x12, x12, #6 |
| ext v1.16b, v0.16b, v1.16b, #10 |
| ext v0.16b, v2.16b, v0.16b, #10 |
| ext v17.16b, v16.16b, v17.16b, #10 |
| ext v16.16b, v18.16b, v16.16b, #10 |
| |
| 2: |
| tst w7, #2 // LR_HAVE_RIGHT |
| b.ne 4f |
| // If we'll need to pad the right edge, load that pixel to pad with |
| // here since we can find it pretty easily from here. |
| sub w13, w5, #(2 + 16 - 3 + 1) |
| ldr h30, [x3, w13, sxtw #1] |
| ldr h31, [x12, w13, sxtw #1] |
| // Fill v30/v31 with the right padding pixel |
| dup v30.8h, v30.h[0] |
| dup v31.8h, v31.h[0] |
| 3: // !LR_HAVE_RIGHT |
| |
| // Check whether we need to pad the right edge |
| cmp w5, #11 |
| b.ge 4f // If w >= 11, all used input pixels are valid |
| |
| // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10, |
| // this ends up called again; it's not strictly needed in those |
| // cases (we pad enough here), but keeping the code as simple as possible. |
| |
| // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the |
| // buffer pointer. |
| movrel x13, right_ext_mask, -2 |
| sub x13, x13, w5, uxtw #1 |
| ld1 {v28.16b, v29.16b}, [x13] |
| |
| bit v0.16b, v30.16b, v28.16b |
| bit v1.16b, v30.16b, v29.16b |
| bit v16.16b, v31.16b, v28.16b |
| bit v17.16b, v31.16b, v29.16b |
| |
| 4: // Loop horizontally |
| ext v26.16b, v0.16b, v1.16b, #2 |
| ext v28.16b, v16.16b, v17.16b, #2 |
| ext v27.16b, v0.16b, v1.16b, #4 |
| ext v29.16b, v16.16b, v17.16b, #4 |
| |
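| // First sum each pixel and its two right neighbours (as in box3); the |
| // remaining two taps (+3 and +4) are accumulated below. |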
| add v6.8h, v0.8h, v26.8h |
| umull v22.4s, v0.4h, v0.4h |
| umlal v22.4s, v26.4h, v26.4h |
| umlal v22.4s, v27.4h, v27.4h |
| add v7.8h, v16.8h, v28.8h |
| umull v24.4s, v16.4h, v16.4h |
| umlal v24.4s, v28.4h, v28.4h |
| umlal v24.4s, v29.4h, v29.4h |
| add v6.8h, v6.8h, v27.8h |
| umull2 v23.4s, v0.8h, v0.8h |
| umlal2 v23.4s, v26.8h, v26.8h |
| umlal2 v23.4s, v27.8h, v27.8h |
| add v7.8h, v7.8h, v29.8h |
| umull2 v25.4s, v16.8h, v16.8h |
| umlal2 v25.4s, v28.8h, v28.8h |
| umlal2 v25.4s, v29.8h, v29.8h |
| |
| ext v26.16b, v0.16b, v1.16b, #6 |
| ext v28.16b, v16.16b, v17.16b, #6 |
| ext v27.16b, v0.16b, v1.16b, #8 |
| ext v29.16b, v16.16b, v17.16b, #8 |
| |
| add v6.8h, v6.8h, v26.8h |
| umlal v22.4s, v26.4h, v26.4h |
| umlal v22.4s, v27.4h, v27.4h |
| add v7.8h, v7.8h, v28.8h |
| umlal v24.4s, v28.4h, v28.4h |
| umlal v24.4s, v29.4h, v29.4h |
| add v6.8h, v6.8h, v27.8h |
| umlal2 v23.4s, v26.8h, v26.8h |
| umlal2 v23.4s, v27.8h, v27.8h |
| add v7.8h, v7.8h, v29.8h |
| umlal2 v25.4s, v28.8h, v28.8h |
| umlal2 v25.4s, v29.8h, v29.8h |
| |
| subs w5, w5, #8 |
| |
| st1 {v6.8h}, [x1], #16 |
| st1 {v7.8h}, [x11], #16 |
| st1 {v22.4s,v23.4s}, [x0], #32 |
| st1 {v24.4s,v25.4s}, [x10], #32 |
| |
| b.le 9f |
| tst w7, #2 // LR_HAVE_RIGHT |
| mov v0.16b, v1.16b |
| mov v16.16b, v17.16b |
| ld1 {v1.8h}, [x3], #16 |
| ld1 {v17.8h}, [x12], #16 |
| |
| b.ne 4b // If we don't need to pad, just keep summing. |
| b 3b // If we need to pad, check how many pixels we have left. |
| |
| 9: |
| subs w6, w6, #2 |
| b.le 0f |
| // Jump to the next row and loop horizontally |
| add x0, x0, x9, lsl #1 |
| add x10, x10, x9, lsl #1 |
| add x1, x1, x9 |
| add x11, x11, x9 |
| add x3, x3, x4 |
| add x12, x12, x4 |
| mov w5, w8 |
| b 1b |
| 0: |
| ret |
| endfunc |
| |
| sgr_funcs 16 |