blob: 32b258aba8a3cfb4dffebf95cb4f414ea858fc32 [file] [log] [blame]
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"
// pad_top_bottom: copy two rows of 8-bit pixels into the 16-bit tmp buffer
// at x0, widening every byte to a halfword, and write the halfword held in
// v31 into the two columns of any side (left/right) that is not available.
//
// Macro parameters:
//   s1, s2  x registers pointing at the first and the second source row;
//           both are moved back by 2 bytes here when CDEF_HAVE_LEFT is set
//   w       block width in pixels (this file instantiates 4 and 8)
//   stride  row stride of the tmp buffer, counted in 16-bit elements
//   rn      register prefix that loads w source bytes (s or d)
//   rw      register prefix that stores w widened halfwords (d or q)
//   ret     nonzero: emit "ret" once the second row has been written;
//           zero: step x0 past the second row and continue at label 3
//
// Register inputs: x0 = destination, w7 = edge flags, v31 = fill value
// (the low 32 bits, s31, are what gets stored).
// Clobbers v0-v3 and the condition flags; x0 is advanced by one row when
// ret is nonzero (before returning) and by two rows otherwise.
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
tst w7, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
// Start both rows two pixels to the left of the block.
sub \s1, \s1, #2
sub \s2, \s2, #2
tst w7, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
// Each row is w+4 source bytes: w bytes via rn, then 4 more bytes at offset w.
ldr \rn\()0, [\s1]
ldr s1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr s3, [\s2, #\w]
// Zero-extend the loaded bytes to halfwords.
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
// First row: w halfwords followed by the remaining 4 halfwords.
str \rw\()0, [x0]
str d1, [x0, #2*\w]
add x0, x0, #2*\stride
// Second row.
str \rw\()2, [x0]
str d3, [x0, #2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Each row is w+2 source bytes (2 left pixels + w block pixels); the two
// right-hand columns are written from s31 instead of being loaded.
ldr \rn\()0, [\s1]
ldr h1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr h3, [\s2, #\w]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
str \rw\()0, [x0]
str s1, [x0, #2*\w]
str s31, [x0, #2*\w+4]
add x0, x0, #2*\stride
str \rw\()2, [x0]
str s3, [x0, #2*\w]
str s31, [x0, #2*\w+4]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
2:
// !CDEF_HAVE_LEFT
tst w7, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
// The row pointers were not moved back: load w block pixels plus the two
// pixels to their right; the two left-hand columns are written from s31.
ldr \rn\()0, [\s1]
ldr h1, [\s1, #\w]
ldr \rn\()2, [\s2]
ldr h3, [\s2, #\w]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
uxtl v3.8h, v3.8b
str s31, [x0]
// stur: byte offset 4 is not a multiple of the d/q access size, so the
// unscaled-offset store is used.
stur \rw\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \rw\()2, [x0, #4]
str s3, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Only the w block pixels of each row are loaded (rows in v0 and v1);
// both the left and the right column pairs are written from s31.
ldr \rn\()0, [\s1]
ldr \rn\()1, [\s2]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \rw\()1, [x0, #4]
str s31, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
.endif
// Common continuation point for the ret == 0 variants of all four paths.
3:
.endm
// load_n_incr: fetch one row of 8-bit pixels and step the row pointer.
//   dst   vector register receiving the pixels (name without arrangement)
//   src   x register holding the row address; post-incremented by incr
//   incr  post-increment applied to src (register or immediate)
//   w     row width: 4 loads 4 bytes into 32-bit lane 0 of dst and leaves
//         the other lanes untouched; any other value loads 8 bytes into
//         the low half of dst
.macro load_n_incr dst, src, incr, w
.if \w != 4
        ld1             {\dst\().8b},   [\src], \incr
.else
        ld1             {\dst\().s}[0], [\src], \incr
.endif
.endm
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top,
// const pixel *const bottom, int h,
// enum CdefEdgeFlags edges);
//
// Register assignment on entry (AAPCS64, in prototype order):
//   x0 = tmp, x1 = src, x2 = src_stride, x3 = left, x4 = top,
//   x5 = bottom, w6 = h, w7 = edges
//
// Fills a (w+4) x (h+4) region of the 16-bit tmp buffer, starting two rows
// above and two columns left of tmp: pixels are zero-extended from 8 to 16
// bits, and positions on a side whose CDEF_HAVE_* flag is clear are set to
// 0x8000. When all four flags are set (edges == 0xf) control is handed to
// cdef_paddingX_edged_8bpc_neon instead.
//
// Macro parameters:
//   w       block width in pixels
//   stride  tmp row stride in 16-bit elements
//   rn      register prefix loading w source bytes (s or d)
//   rw      register prefix storing w widened halfwords (d or q)
//
// The row loops are do-while loops: at least one row is processed before
// h is tested.
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_8bpc_neon, export=1
cmp w7, #0xf // fully edged
b.eq cdef_padding\w\()_edged_8bpc_neon
// v30 and v31 both hold 0x8000 in every halfword: the fill value.
movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
// Move tmp back by two rows and two columns (2 bytes per element).
sub x0, x0, #2*(2*\stride+2)
tst w7, #4 // CDEF_HAVE_TOP
b.ne 1f
// !CDEF_HAVE_TOP
// Fill both top rows: each st1 writes 16 halfwords, i.e. 2*stride elements
// for w == 4 (stride 8); w == 8 (stride 16) needs a second store.
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
b 3f
1:
// CDEF_HAVE_TOP
// x9 = second top row (top + src_stride).
add x9, x4, x2
pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
// Middle section
3:
tst w7, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
tst w7, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
// Per row: 2 left pixels from x3 (post-incremented), 2 pixels right of the
// row from src+w, then the w row pixels (src advances by src_stride).
ld1 {v0.h}[0], [x3], #2
ldr h2, [x1, #\w]
load_n_incr v1, x1, x2, \w
// The flags set here are consumed by the b.gt below; nothing in between
// modifies them.
subs w6, w6, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
uxtl v2.8h, v2.8b
str s0, [x0]
stur \rw\()1, [x0, #4]
str s2, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Left pixels come from x3; the two right columns get the fill value.
ld1 {v0.h}[0], [x3], #2
load_n_incr v1, x1, x2, \w
subs w6, w6, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s0, [x0]
stur \rw\()1, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
b 3f
2:
tst w7, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
// The two left columns get the fill value; right pixels come from src+w.
ldr h1, [x1, #\w]
load_n_incr v0, x1, x2, \w
subs w6, w6, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Only the row itself is loaded; both sides get the fill value.
load_n_incr v0, x1, x2, \w
subs w6, w6, #1
uxtl v0.8h, v0.8b
str s31, [x0]
stur \rw\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
3:
tst w7, #8 // CDEF_HAVE_BOTTOM
b.ne 1f
// !CDEF_HAVE_BOTTOM
// Fill both bottom rows, same store pattern as the missing-top case.
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
ret
1:
// CDEF_HAVE_BOTTOM
// x9 = second bottom row (bottom + src_stride); the macro emits the ret.
add x9, x5, x2
pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1
endfunc
.endm
// Width 8: tmp stride 16 elements, rows loaded with d and stored with q.
padding_func 8, 16, d, q
// Width 4: tmp stride 8 elements, rows loaded with s and stored with d.
padding_func 4, 8, s, d
// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top,
// const pixel *const bottom, int h,
// enum CdefEdgeFlags edges);
//
// Register assignment on entry (AAPCS64, in prototype order):
//   x0 = tmp, x1 = src, x2 = src_stride, x3 = left, x4 = top,
//   x5 = bottom, w6 = h (w7 = edges is not read here)
//
// Variant for the case where every edge is available: pixels are copied
// into an 8-bit tmp buffer without widening and without any fill value.
// It is the branch target of cdef_paddingX_8bpc_neon when edges == 0xf.
// Writes w+4 bytes on each of h+4 rows, starting two rows above and two
// columns left of tmp.
//
// Macro parameters:
//   w       block width in pixels
//   stride  tmp row stride in bytes
//   reg     register prefix holding one row of w pixels (s or d)
.macro padding_func_edged w, stride, reg
function cdef_padding\w\()_edged_8bpc_neon, export=1
// Include the two pixels left of the top and bottom rows.
sub x4, x4, #2
sub x5, x5, #2
// Move tmp back by two rows and two columns (1 byte per element).
sub x0, x0, #(2*\stride+2)
.if \w == 4
// Each top row is 8 bytes (2 + 4 + 2) and the stride is 8, so both rows
// are written with one 16-byte store.
ldr d0, [x4]
ldr d1, [x4, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
// Each top row is 12 bytes (2 + 8 + 2), copied as 8 + 4 bytes; the last
// 4 bytes of each 16-byte tmp row are not written.
add x9, x4, x2
ldr d0, [x4]
ldr s1, [x4, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
add x0, x0, #2*\stride
.endif
0:
// Per row: 2 left pixels from x3 (post-incremented), 2 pixels right of the
// row from src+w, then the w row pixels (src advances by src_stride).
ld1 {v0.h}[0], [x3], #2
ldr h2, [x1, #\w]
load_n_incr v1, x1, x2, \w
// The flags set here are consumed by the b.gt below; the stores and the
// add in between leave them unchanged. At least one row is processed.
subs w6, w6, #1
str h0, [x0]
// stur: byte offset 2 is not a multiple of the s/d access size.
stur \reg\()1, [x0, #2]
str h2, [x0, #2+\w]
add x0, x0, #\stride
b.gt 0b
// Two bottom rows, copied the same way as the two top rows.
.if \w == 4
ldr d0, [x5]
ldr d1, [x5, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
add x9, x5, x2
ldr d0, [x5]
ldr s1, [x5, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
.endif
ret
endfunc
.endm
// Width 8: tmp stride 16 bytes, rows held in d registers.
padding_func_edged 8, 16, d
// Width 4: tmp stride 8 bytes, rows held in s registers.
padding_func_edged 4, 8, s
// The macros invoked below are not defined in this file.
// NOTE(review): presumably they come from cdef_tmpl.S (included above) and
// emit the shared lookup tables (pri_taps and directionsN are referenced
// further down), the generic filter functions and the direction search;
// the meaning of the numeric arguments should be confirmed there.
tables
filter 8, 8
filter 4, 8
find_dir 8
// load_px_8: gather the pixel pair at +off and -off around the current
// position, for every row handled in one filter iteration.
//
// Macro parameters:
//   d1  vector register receiving p0 (pixels at x + off)
//   d2  vector register receiving p1 (pixels at x - off)
//   w   block width: 8 loads 2 rows of 8 bytes, 16 bytes apart;
//       otherwise 4 rows of 4 bytes, 8 bytes apart
//
// Register inputs: x2 = address of the current pixels in the 8-bit tmp
// buffer, w9 = offset (only the low byte is used, sign-extended).
// Clobbers x6 and x9 (the offset in w9 is overwritten by the p1 address).
.macro load_px_8 d1, d2, w
.if \w == 8
add x6, x2, w9, sxtb // x + off
sub x9, x2, w9, sxtb // x - off
ld1 {\d1\().d}[0], [x6] // p0
add x6, x6, #16 // += stride
ld1 {\d2\().d}[0], [x9] // p1
add x9, x9, #16 // += stride
ld1 {\d1\().d}[1], [x6] // p0
ld1 {\d2\().d}[1], [x9] // p1
.else
add x6, x2, w9, sxtb // x + off
sub x9, x2, w9, sxtb // x - off
ld1 {\d1\().s}[0], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[0], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[1], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[1], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[2], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[2], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[3], [x6] // p0
ld1 {\d2\().s}[3], [x9] // p1
.endif
.endm
// handle_pixel_8: accumulate one tap for a pair of neighbour vectors,
// working on 16 pixels at a time in 8-bit lanes.
//
// Macro parameters:
//   s1, s2      vector registers holding p0 and p1 (see load_px_8)
//   thresh_vec  vector with the threshold in every byte
//   shift       vector with the negated shift in every byte; ushl with a
//               negative count shifts right
//   tap         w register holding the tap weight (low byte is broadcast)
//   min         nonzero: also fold p0 and p1 into the running min/max
//
// Register inputs: v0 = px; v1 and v2 = the two partial sums (p0
// contributions go to v1, p1 contributions to v2); v3/v4 = running
// min/max when min is nonzero.
// Clobbers v16-v22.
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
umin v3.16b, v3.16b, \s1\().16b
umax v4.16b, v4.16b, \s1\().16b
umin v3.16b, v3.16b, \s2\().16b
umax v4.16b, v4.16b, \s2\().16b
.endif
uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
// Lane mask, all ones where px is greater than the neighbour; used by
// the bsl below to pick the negated magnitude.
cmhi v18.16b, v0.16b, \s1\().16b // px > p0
cmhi v22.16b, v0.16b, \s2\().16b // px > p1
umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
dup v19.16b, \tap // taps[k]
neg v16.16b, v17.16b // -imin()
neg v20.16b, v21.16b // -imin()
// bsl keeps -imin() where the mask is set and +imin() elsewhere.
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain()
mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain()
.endm
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint8_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h);
//
// Register assignment on entry (AAPCS64, in prototype order):
//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
//   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h
//
// Filters a block read from the 8-bit tmp buffer (row stride 16 bytes for
// w == 8, 8 bytes for w == 4) and writes the result to dst. All arithmetic
// is done in 8-bit lanes on 16 pixels per iteration: 2 rows for w == 8,
// 4 rows for w == 4. The function is declared without export=1.
// NOTE(review): callers are presumably the filter functions generated from
// cdef_tmpl.S - confirm there.
//
// Macro parameters:
//   w       block width (8 or 4)
//   pri     nonzero: apply the primary taps (pri_strength, pri_taps table)
//   sec     nonzero: apply the secondary taps (sec_strength)
//   min     nonzero: clamp the result to the min/max of px and every
//           neighbour that was read
//   suffix  text inserted into the function name
//
// Long-lived registers: x8 = pri_taps pointer, x5 = directions pointer,
// v25/v27 = pri/sec threshold, v24/v26 = negated pri/sec shift.
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_8bpc_neon
.if \pri
// x8 = pri_taps + 2 * (pri_strength & 1)
movrel x8, pri_taps
and w9, w3, #1
add x8, x8, w9, uxtw #1
.endif
// x5 = directionsN + 2 * dir
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.8b, #7
dup v28.8b, w6 // damping
.if \pri
dup v25.16b, w3 // threshold
.endif
.if \sec
dup v27.16b, w4 // threshold
.endif
// Interleave so that byte 0 holds the pri threshold and byte 1 the sec
// threshold, then derive both shifts with one instruction sequence.
// When only one of pri/sec is enabled, the other source register has not
// been written by this function; its bytes end up in a lane that is not
// broadcast below.
trn1 v24.8b, v25.8b, v27.8b
clz v24.8b, v24.8b // clz(threshold)
sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
neg v24.8b, v24.8b // -shift
.if \sec
dup v26.16b, v24.b[1]
.endif
.if \pri
dup v24.16b, v24.b[0]
.endif
1:
// Row loop: gather the current pixels of all rows handled in this
// iteration into v0.
.if \w == 8
add x12, x2, #16
ld1 {v0.d}[0], [x2] // px
ld1 {v0.d}[1], [x12] // px
.else
add x12, x2, #1*8
add x13, x2, #2*8
add x14, x2, #3*8
ld1 {v0.s}[0], [x2] // px
ld1 {v0.s}[1], [x12] // px
ld1 {v0.s}[2], [x13] // px
ld1 {v0.s}[3], [x14] // px
.endif
// We need 9-bits or two 8-bit accumulators to fit the sum.
// Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
// Start sum at -1 instead of 0 to help handle rounding later.
movi v1.16b, #255 // sum
movi v2.16b, #0 // sum
.if \min
mov v3.16b, v0.16b // min
mov v4.16b, v0.16b // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov w11, #2 // sec_taps[0]
2:
// Tap loop, executed twice (w11 = 2, then 1). Loads for the next
// handle_pixel_8 are issued ahead of the arithmetic that uses them.
.if \pri
ldrb w9, [x5] // off1
load_px_8 v5, v6, \w
.endif
.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px_8 v28, v29, \w
.endif
.if \pri
ldrb w10, [x8] // *pri_taps
handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
.endif
.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px_8 v5, v6, \w
handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
.else
add x5, x5, #1 // x5 += 1
.endif
// The flags set here are consumed by the b.ne below; the add in between
// does not modify them.
subs w11, w11, #1 // sec_tap-- (value)
.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
.endif
b.ne 2b
// Perform halving adds since the value won't fit otherwise.
// To handle the offset for negative values, use both halving w/ and w/o rounding.
srhadd v5.16b, v1.16b, v2.16b // sum >> 1
shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1
cmlt v1.16b, v5.16b, #0 // sum < 0
bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1
srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4
// Add the signed correction to the unsigned pixel with unsigned saturation.
usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4
.if \min
umin v0.16b, v0.16b, v4.16b
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
.endif
.if \w == 8
st1 {v0.d}[0], [x0], x1
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
st1 {v0.d}[1], [x0], x1
.else
st1 {v0.s}[0], [x0], x1
add x2, x2, #4*8 // tmp += 4*tmp_stride
st1 {v0.s}[1], [x0], x1
subs w7, w7, #4 // h -= 4
st1 {v0.s}[2], [x0], x1
st1 {v0.s}[3], [x0], x1
.endif
// Reset pri_taps and directions back to the original point
// (both were advanced by 1 in each of the two tap iterations; plain sub
// leaves the flags from the h update intact for the b.gt below).
sub x5, x5, #2
.if \pri
sub x8, x8, #2
.endif
b.gt 1b
ret
endfunc
.endm
// filter_8: emit the three edged 8 bpc filter functions for block width w.
// Arguments to filter_func_8 are, in order: w, pri, sec, min, suffix.
//   _pri      primary taps only, no clamping
//   _sec      secondary taps only, no clamping
//   _pri_sec  primary and secondary taps, result clamped to min/max
.macro filter_8 w
filter_func_8 \w, 1, 0, 0, _pri
filter_func_8 \w, 0, 1, 0, _sec
filter_func_8 \w, 1, 1, 1, _pri_sec
.endm

// Instantiate for width 8 first, then width 4.
.irp width, 8, 4
filter_8 \width
.endr