| /* |
| * Copyright © 2018, VideoLAN and dav1d authors |
| * Copyright © 2019, Martin Storsjo |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, this |
| * list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "src/arm/asm.S" |
| #include "util.S" |
| #include "cdef_tmpl.S" |
| |
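| // pad_top_bottom expands two rows of \w pixels from \s1/\s2 into the |
| // 16-bit tmp buffer at x0, widening with uxtl and filling whichever of |
| // the 2-pixel left/right borders is unavailable (per the edge flags in |
| // w7) with the 0x8000 sentinel held in v31. A rough C sketch of the |
| // effect (illustrative only; inside() is a hypothetical helper, not |
| // dav1d's actual C code): |
| // |
| //     for (int n = 0; n < 2; n++) {            // two padding rows |
| //         for (int x = -2; x < w + 2; x++) |
| //             tmp[x] = inside(x, w, have_left, have_right) |
| //                    ? row[n][x] : 0x8000;     // sentinel fill |
| //         tmp += stride; |
| //     } |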
| .macro pad_top_bottom s1, s2, w, stride, rn, rw, ret |
| tst w7, #1 // CDEF_HAVE_LEFT |
| b.eq 2f |
| // CDEF_HAVE_LEFT |
| sub \s1, \s1, #2 |
| sub \s2, \s2, #2 |
| tst w7, #2 // CDEF_HAVE_RIGHT |
| b.eq 1f |
| // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT |
| ldr \rn\()0, [\s1] |
| ldr s1, [\s1, #\w] |
| ldr \rn\()2, [\s2] |
| ldr s3, [\s2, #\w] |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| uxtl v2.8h, v2.8b |
| uxtl v3.8h, v3.8b |
| str \rw\()0, [x0] |
| str d1, [x0, #2*\w] |
| add x0, x0, #2*\stride |
| str \rw\()2, [x0] |
| str d3, [x0, #2*\w] |
| .if \ret |
| ret |
| .else |
| add x0, x0, #2*\stride |
| b 3f |
| .endif |
| |
| 1: |
| // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT |
| ldr \rn\()0, [\s1] |
| ldr h1, [\s1, #\w] |
| ldr \rn\()2, [\s2] |
| ldr h3, [\s2, #\w] |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| uxtl v2.8h, v2.8b |
| uxtl v3.8h, v3.8b |
| str \rw\()0, [x0] |
| str s1, [x0, #2*\w] |
| str s31, [x0, #2*\w+4] |
| add x0, x0, #2*\stride |
| str \rw\()2, [x0] |
| str s3, [x0, #2*\w] |
| str s31, [x0, #2*\w+4] |
| .if \ret |
| ret |
| .else |
| add x0, x0, #2*\stride |
| b 3f |
| .endif |
| |
| 2: |
| // !CDEF_HAVE_LEFT |
| tst w7, #2 // CDEF_HAVE_RIGHT |
| b.eq 1f |
| // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT |
| ldr \rn\()0, [\s1] |
| ldr h1, [\s1, #\w] |
| ldr \rn\()2, [\s2] |
| ldr h3, [\s2, #\w] |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| uxtl v2.8h, v2.8b |
| uxtl v3.8h, v3.8b |
| str s31, [x0] |
| stur \rw\()0, [x0, #4] |
| str s1, [x0, #4+2*\w] |
| add x0, x0, #2*\stride |
| str s31, [x0] |
| stur \rw\()2, [x0, #4] |
| str s3, [x0, #4+2*\w] |
| .if \ret |
| ret |
| .else |
| add x0, x0, #2*\stride |
| b 3f |
| .endif |
| |
| 1: |
| // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT |
| ldr \rn\()0, [\s1] |
| ldr \rn\()1, [\s2] |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| str s31, [x0] |
| stur \rw\()0, [x0, #4] |
| str s31, [x0, #4+2*\w] |
| add x0, x0, #2*\stride |
| str s31, [x0] |
| stur \rw\()1, [x0, #4] |
| str s31, [x0, #4+2*\w] |
| .if \ret |
| ret |
| .else |
| add x0, x0, #2*\stride |
| .endif |
| 3: |
| .endm |
| |
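| // load_n_incr loads one row of \w pixels (a single .s lane for w == 4, |
| // a full 8-byte vector for w == 8) from \src into \dst and |
| // post-increments \src by \incr (the source stride). |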
| .macro load_n_incr dst, src, incr, w |
| .if \w == 4 |
| ld1 {\dst\().s}[0], [\src], \incr |
| .else |
| ld1 {\dst\().8b}, [\src], \incr |
| .endif |
| .endm |
| |
| // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, |
| // ptrdiff_t src_stride, const pixel (*left)[2], |
| // const pixel *const top, |
| // const pixel *const bottom, int h, |
| // enum CdefEdgeFlags edges); |
| |
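| // padding_func emits the generic padding function: it writes a |
| // (w + 4) x (h + 4) block of 16-bit pixels, i.e. the w x h source block |
| // surrounded by a 2-pixel border on each side, with border pixels that |
| // lie outside the frame (per the edges bitmask in w7) set to the 0x8000 |
| // sentinel. A rough C sketch of the overall effect, with tmp pointing at |
| // the first real pixel as x0 does on entry (illustrative only; inside() |
| // and px() are hypothetical helpers, not dav1d's C code): |
| // |
| //     for (int y = -2; y < h + 2; y++) |
| //         for (int x = -2; x < w + 2; x++) |
| //             tmp[y * stride + x] = inside(x, y, edges) ? px(x, y) |
| //                                                       : 0x8000; |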
| .macro padding_func w, stride, rn, rw |
| function cdef_padding\w\()_8bpc_neon, export=1 |
| cmp w7, #0xf // fully edged |
| b.eq cdef_padding\w\()_edged_8bpc_neon |
| movi v30.8h, #0x80, lsl #8 // 0x8000 in each lane: sentinel for missing edges |
| mov v31.16b, v30.16b |
| sub x0, x0, #2*(2*\stride+2) |
| tst w7, #4 // CDEF_HAVE_TOP |
| b.ne 1f |
| // !CDEF_HAVE_TOP |
| st1 {v30.8h, v31.8h}, [x0], #32 |
| .if \w == 8 |
| st1 {v30.8h, v31.8h}, [x0], #32 |
| .endif |
| b 3f |
| 1: |
| // CDEF_HAVE_TOP |
| add x9, x4, x2 |
| pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0 |
| |
| // Middle section |
| 3: |
| tst w7, #1 // CDEF_HAVE_LEFT |
| b.eq 2f |
| // CDEF_HAVE_LEFT |
| tst w7, #2 // CDEF_HAVE_RIGHT |
| b.eq 1f |
| // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT |
| 0: |
| ld1 {v0.h}[0], [x3], #2 |
| ldr h2, [x1, #\w] |
| load_n_incr v1, x1, x2, \w |
| subs w6, w6, #1 |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| uxtl v2.8h, v2.8b |
| str s0, [x0] |
| stur \rw\()1, [x0, #4] |
| str s2, [x0, #4+2*\w] |
| add x0, x0, #2*\stride |
| b.gt 0b |
| b 3f |
| 1: |
| // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT |
| ld1 {v0.h}[0], [x3], #2 |
| load_n_incr v1, x1, x2, \w |
| subs w6, w6, #1 |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| str s0, [x0] |
| stur \rw\()1, [x0, #4] |
| str s31, [x0, #4+2*\w] |
| add x0, x0, #2*\stride |
| b.gt 1b |
| b 3f |
| 2: |
| tst w7, #2 // CDEF_HAVE_RIGHT |
| b.eq 1f |
| // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT |
| 0: |
| ldr h1, [x1, #\w] |
| load_n_incr v0, x1, x2, \w |
| subs w6, w6, #1 |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| str s31, [x0] |
| stur \rw\()0, [x0, #4] |
| str s1, [x0, #4+2*\w] |
| add x0, x0, #2*\stride |
| b.gt 0b |
| b 3f |
| 1: |
| // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT |
| load_n_incr v0, x1, x2, \w |
| subs w6, w6, #1 |
| uxtl v0.8h, v0.8b |
| str s31, [x0] |
| stur \rw\()0, [x0, #4] |
| str s31, [x0, #4+2*\w] |
| add x0, x0, #2*\stride |
| b.gt 1b |
| |
| 3: |
| tst w7, #8 // CDEF_HAVE_BOTTOM |
| b.ne 1f |
| // !CDEF_HAVE_BOTTOM |
| st1 {v30.8h, v31.8h}, [x0], #32 |
| .if \w == 8 |
| st1 {v30.8h, v31.8h}, [x0], #32 |
| .endif |
| ret |
| 1: |
| // CDEF_HAVE_BOTTOM |
| add x9, x5, x2 |
| pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1 |
| endfunc |
| .endm |
| |
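| // Instantiations: w == 8 uses a tmp stride of 16 halfwords, with d/q |
| // registers holding one row of narrow/widened pixels; w == 4 uses a |
| // stride of 8 with s/d registers. |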
| padding_func 8, 16, d, q |
| padding_func 4, 8, s, d |
| |
| // void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src, |
| // ptrdiff_t src_stride, const pixel (*left)[2], |
| // const pixel *const top, |
| // const pixel *const bottom, int h, |
| // enum CdefEdgeFlags edges); |
| |
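| // padding_func_edged emits the fast path taken when all four edges are |
| // present (edges == 0xf): no sentinel is needed, so the pixels are |
| // copied into an 8-bit tmp buffer without widening, and the filtering |
| // itself also runs on 8-bit data (see the _edged filter functions below). |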
| .macro padding_func_edged w, stride, reg |
| function cdef_padding\w\()_edged_8bpc_neon, export=1 |
| sub x4, x4, #2 |
| sub x5, x5, #2 |
| sub x0, x0, #(2*\stride+2) |
| |
| .if \w == 4 |
| ldr d0, [x4] |
| ldr d1, [x4, x2] |
| st1 {v0.8b, v1.8b}, [x0], #16 |
| .else |
| add x9, x4, x2 |
| ldr d0, [x4] |
| ldr s1, [x4, #8] |
| ldr d2, [x9] |
| ldr s3, [x9, #8] |
| str d0, [x0] |
| str s1, [x0, #8] |
| str d2, [x0, #\stride] |
| str s3, [x0, #\stride+8] |
| add x0, x0, #2*\stride |
| .endif |
| |
| 0: |
| ld1 {v0.h}[0], [x3], #2 |
| ldr h2, [x1, #\w] |
| load_n_incr v1, x1, x2, \w |
| subs w6, w6, #1 |
| str h0, [x0] |
| stur \reg\()1, [x0, #2] |
| str h2, [x0, #2+\w] |
| add x0, x0, #\stride |
| b.gt 0b |
| |
| .if \w == 4 |
| ldr d0, [x5] |
| ldr d1, [x5, x2] |
| st1 {v0.8b, v1.8b}, [x0], #16 |
| .else |
| add x9, x5, x2 |
| ldr d0, [x5] |
| ldr s1, [x5, #8] |
| ldr d2, [x9] |
| ldr s3, [x9, #8] |
| str d0, [x0] |
| str s1, [x0, #8] |
| str d2, [x0, #\stride] |
| str s3, [x0, #\stride+8] |
| .endif |
| ret |
| endfunc |
| .endm |
| |
| padding_func_edged 8, 16, d |
| padding_func_edged 4, 8, s |
| |
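| // Instantiate the shared macros from cdef_tmpl.S: the constant tables, |
| // the generic (16-bit tmp) filter functions and the direction search, |
| // for the 8 bpc case. |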
| tables |
| |
| filter 8, 8 |
| filter 4, 8 |
| |
| find_dir 8 |
| |
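| // load_px_8 gathers the two taps for one direction offset: \d1 receives |
| // the pixels at x + off and \d2 those at x - off, for every row handled |
| // per iteration (two 8-pixel rows for w == 8, four 4-pixel rows for |
| // w == 4; the 8-bit tmp stride is 16 resp. 8 bytes). |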
| .macro load_px_8 d1, d2, w |
| .if \w == 8 |
| add x6, x2, w9, sxtb // x + off |
| sub x9, x2, w9, sxtb // x - off |
| ld1 {\d1\().d}[0], [x6] // p0 |
| add x6, x6, #16 // += stride |
| ld1 {\d2\().d}[0], [x9] // p1 |
| add x9, x9, #16 // += stride |
| ld1 {\d1\().d}[1], [x6] // p0 |
| ld1 {\d2\().d}[1], [x9] // p1 |
| .else |
| add x6, x2, w9, sxtb // x + off |
| sub x9, x2, w9, sxtb // x - off |
| ld1 {\d1\().s}[0], [x6] // p0 |
| add x6, x6, #8 // += stride |
| ld1 {\d2\().s}[0], [x9] // p1 |
| add x9, x9, #8 // += stride |
| ld1 {\d1\().s}[1], [x6] // p0 |
| add x6, x6, #8 // += stride |
| ld1 {\d2\().s}[1], [x9] // p1 |
| add x9, x9, #8 // += stride |
| ld1 {\d1\().s}[2], [x6] // p0 |
| add x6, x6, #8 // += stride |
| ld1 {\d2\().s}[2], [x9] // p1 |
| add x9, x9, #8 // += stride |
| ld1 {\d1\().s}[3], [x6] // p0 |
| ld1 {\d2\().s}[3], [x9] // p1 |
| .endif |
| .endm |
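| // handle_pixel_8 applies one tap pair: it computes the CDEF constrain() |
| // of each tap's difference from the centre pixel px and accumulates |
| // taps[k] * constrain() into the split sum in v1/v2 (tracking the |
| // running min/max when \min is set). For reference, constrain() in C |
| // (a sketch of the bitstream definition, with imax/imin as plain |
| // integer max/min helpers; not necessarily dav1d's exact C code): |
| // |
| //     int constrain(int diff, int threshold, int shift) { |
| //         int adiff = abs(diff); |
| //         int clip = imax(0, threshold - (adiff >> shift)); |
| //         return diff < 0 ? -imin(adiff, clip) : imin(adiff, clip); |
| //     } |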
| .macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min |
| .if \min |
| umin v3.16b, v3.16b, \s1\().16b |
| umax v4.16b, v4.16b, \s1\().16b |
| umin v3.16b, v3.16b, \s2\().16b |
| umax v4.16b, v4.16b, \s2\().16b |
| .endif |
| uabd v16.16b, v0.16b, \s1\().16b // abs(diff) |
| uabd v20.16b, v0.16b, \s2\().16b // abs(diff) |
| ushl v17.16b, v16.16b, \shift // abs(diff) >> shift |
| ushl v21.16b, v20.16b, \shift // abs(diff) >> shift |
| uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift)) |
| uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift)) |
| cmhi v18.16b, v0.16b, \s1\().16b // px > p0 |
| cmhi v22.16b, v0.16b, \s2\().16b // px > p1 |
| umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip) |
| umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip) |
| dup v19.16b, \tap // taps[k] |
| neg v16.16b, v17.16b // -imin() |
| neg v20.16b, v21.16b // -imin() |
| bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() |
| bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() |
| mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain() |
| mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain() |
| .endm |
| |
| // void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, |
| // const uint8_t *tmp, int pri_strength, |
| // int sec_strength, int dir, int damping, |
| // int h); |
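| // filter_func_8 emits the 8-bit-tmp filter specialized per enabled |
| // pass: _pri (primary only), _sec (secondary only) or _pri_sec (both). |
| // The min/max clamp of the result is only generated for the _pri_sec |
| // variant (\min == 1). |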
| .macro filter_func_8 w, pri, sec, min, suffix |
| function cdef_filter\w\suffix\()_edged_8bpc_neon |
| .if \pri |
| movrel x8, pri_taps |
| and w9, w3, #1 |
| add x8, x8, w9, uxtw #1 |
| .endif |
| movrel x9, directions\w |
| add x5, x9, w5, uxtw #1 |
| movi v30.8b, #7 // for ulog2(x) = 7 - clz(x) below |
| dup v28.8b, w6 // damping |
| |
| .if \pri |
| dup v25.16b, w3 // threshold |
| .endif |
| .if \sec |
| dup v27.16b, w4 // threshold |
| .endif |
| trn1 v24.8b, v25.8b, v27.8b // interleave pri/sec thresholds |
| clz v24.8b, v24.8b // clz(threshold) |
| sub v24.8b, v30.8b, v24.8b // ulog2(threshold) |
| uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold)) |
| neg v24.8b, v24.8b // -shift |
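| // (ushl with a negative per-lane shift amount performs a right shift) |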
| .if \sec |
| dup v26.16b, v24.b[1] |
| .endif |
| .if \pri |
| dup v24.16b, v24.b[0] |
| .endif |
| |
| 1: |
| .if \w == 8 |
| add x12, x2, #16 |
| ld1 {v0.d}[0], [x2] // px |
| ld1 {v0.d}[1], [x12] // px |
| .else |
| add x12, x2, #1*8 |
| add x13, x2, #2*8 |
| add x14, x2, #3*8 |
| ld1 {v0.s}[0], [x2] // px |
| ld1 {v0.s}[1], [x12] // px |
| ld1 {v0.s}[2], [x13] // px |
| ld1 {v0.s}[3], [x14] // px |
| .endif |
| |
| // We need 9 bits or two 8-bit accumulators to fit the sum. |
| // Max of |sum| is 15*2*6(pri) + 4*4*3(sec) = 228. |
| // Start sum at -1 instead of 0 to help handle rounding later. |
| movi v1.16b, #255 // sum |
| movi v2.16b, #0 // sum |
| .if \min |
| mov v3.16b, v0.16b // min |
| mov v4.16b, v0.16b // max |
| .endif |
| |
| // Instead of loading sec_taps 2, 1 from memory, just set the value |
| // to 2 initially and decrement it for the second round. |
| // It also serves as the loop counter. |
| mov w11, #2 // sec_taps[0] |
| |
| 2: |
| .if \pri |
| ldrb w9, [x5] // off1 |
| |
| load_px_8 v5, v6, \w |
| .endif |
| |
| .if \sec |
| add x5, x5, #4 // +2*2 |
| ldrb w9, [x5] // off2 |
| load_px_8 v28, v29, \w |
| .endif |
| |
| .if \pri |
| ldrb w10, [x8] // *pri_taps |
| |
| handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min |
| .endif |
| |
| .if \sec |
| add x5, x5, #8 // +2*4 |
| ldrb w9, [x5] // off3 |
| load_px_8 v5, v6, \w |
| |
| handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min |
| |
| handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min |
| |
| sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; |
| .else |
| add x5, x5, #1 // x5 += 1 |
| .endif |
| subs w11, w11, #1 // sec_tap-- (value) |
| .if \pri |
| add x8, x8, #1 // pri_taps++ (pointer) |
| .endif |
| b.ne 2b |
| |
| // Perform halving adds, since the sum would not fit in 8 bits otherwise. |
| // To get the rounding right for negative sums, compute the halving add |
| // both with and without rounding and select between the two results. |
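| // v1 was started at -1, so v1 + v2 == sum - 1; hence |
| // srhadd = (v1 + v2 + 1) >> 1 == sum >> 1 and |
| // shadd = (v1 + v2) >> 1 == (sum - 1) >> 1. |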
| srhadd v5.16b, v1.16b, v2.16b // sum >> 1 |
| shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1 |
| cmlt v1.16b, v5.16b, #0 // sum < 0 |
| bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1 |
| |
| srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4, combined with the >> 1 above |
| |
| usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4 |
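| // (usqadd adds the signed adjustment in v1 to the unsigned pixels in v0 |
| // with unsigned saturation, keeping the result within 0..255) |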
| .if \min |
| umin v0.16b, v0.16b, v4.16b |
| umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max) |
| .endif |
| .if \w == 8 |
| st1 {v0.d}[0], [x0], x1 |
| add x2, x2, #2*16 // tmp += 2*tmp_stride |
| subs w7, w7, #2 // h -= 2 |
| st1 {v0.d}[1], [x0], x1 |
| .else |
| st1 {v0.s}[0], [x0], x1 |
| add x2, x2, #4*8 // tmp += 4*tmp_stride |
| st1 {v0.s}[1], [x0], x1 |
| subs w7, w7, #4 // h -= 4 |
| st1 {v0.s}[2], [x0], x1 |
| st1 {v0.s}[3], [x0], x1 |
| .endif |
| |
| // Reset the directions (x5) and, when used, pri_taps (x8) pointers back to the start |
| sub x5, x5, #2 |
| .if \pri |
| sub x8, x8, #2 |
| .endif |
| |
| b.gt 1b |
| ret |
| endfunc |
| .endm |
| |
| .macro filter_8 w |
| filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri |
| filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec |
| filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec |
| .endm |
| |
| filter_8 8 |
| filter_8 4 |