/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// depending on how many pixels need to be stored, returns:
// x14 = (1 << 0) : 0 pixels
// x14 = (1 << 4) : inner 4 pixels
// x14 = (1 << 6) : inner 6 pixels
// x14 = 0 : all pixels
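// The lpf_8_wd* wrapper macros further down dispatch on this value: x14 == 0
// falls through to the full store path, bit 6 / bit 4 branch to the 7: / 8:
// labels in the calling function for the shorter store epilogues, and
// (1 << 0) returns via x15 without storing anything.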
.macro loop_filter wd
function lpf_8_wd\wd\()_neon
uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0)
uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1)
.if \wd >= 6
uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1)
uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1)
.endif
.if \wd >= 8
uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2)
uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2)
.endif
.if \wd >= 6
umax v4.8h, v4.8h, v5.8h
.endif
uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2
.if \wd >= 8
umax v6.8h, v6.8h, v7.8h
.endif
ushr v3.8h, v3.8h, #1
.if \wd >= 8
umax v4.8h, v4.8h, v6.8h
.endif
.if \wd >= 6
and v4.16b, v4.16b, v14.16b
.endif
umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
.if \wd >= 6
umax v4.8h, v0.8h, v4.8h
cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
.else
cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
and v1.16b, v1.16b, v2.16b // fm
and v1.16b, v1.16b, v13.16b // fm && wd >= 4
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd >= 16
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
b.ne 9f // if (!fm || wd < 4) return;
mov x14, #(1 << 0)
ret
9:
.if \wd >= 6
movi v10.8h, #1
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0)
uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0)
dup v9.8h, w9 // bitdepth_min_8
.if \wd >= 8
uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0)
.endif
umax v2.8h, v2.8h, v3.8h
umax v4.8h, v4.8h, v5.8h
.if \wd >= 8
umax v6.8h, v6.8h, v7.8h
.endif
umax v2.8h, v2.8h, v4.8h
ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8
.if \wd >= 8
umax v2.8h, v2.8h, v6.8h
.endif
.if \wd == 16
uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0)
uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0)
uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0)
.endif
cmhs v2.8h, v10.8h, v2.8h // flat8in
.if \wd == 16
uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0)
uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0)
uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0)
.endif
and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
.if \wd == 16
umax v3.8h, v3.8h, v4.8h
umax v5.8h, v5.8h, v6.8h
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
.if \wd == 16
umax v7.8h, v7.8h, v8.8h
umax v3.8h, v3.8h, v5.8h
umax v3.8h, v3.8h, v7.8h
cmhs v3.8h, v10.8h, v3.8h // flat8out
.endif
adds x16, x16, x17
.if \wd == 16
and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
.endif
b.eq 1f // skip wd == 4 case
.endif
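// From here on, v1 selects the lanes that take the narrow (wd4) filter,
// v14 (for wd >= 6) the lanes that take the flat8in filter, and v15 (for
// wd == 16) the lanes that take the flat8out filter.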
dup v3.8h, w8 // bitdepth_max
sub v2.8h, v22.8h, v25.8h // p1 - q1
ushr v3.8h, v3.8h, #1 // (128 << bitdepth_min_8) - 1
cmhi v0.8h, v0.8h, v12.8h // hev
not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8)
smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1)
smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1)
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
sub v2.8h, v24.8h, v23.8h // q0 - p0
movi v5.8h, #3
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
mul v2.8h, v2.8h, v5.8h // 3 * (q0 - p0)
movi v6.8h, #4
add v2.8h, v2.8h, v4.8h // 3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0)
smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
sqadd v4.8h, v6.8h, v2.8h // f + 4
sqadd v5.8h, v5.8h, v2.8h // f + 3
smin v4.8h, v4.8h, v3.8h // imin(f + 4, (128 << bitdepth_min_8) - 1)
smin v5.8h, v5.8h, v3.8h // imin(f + 3, (128 << bitdepth_min_8) - 1)
sshr v4.8h, v4.8h, #3 // f1
sshr v5.8h, v5.8h, #3 // f2
movi v9.8h, #0
dup v3.8h, w8 // bitdepth_max
sqadd v2.8h, v23.8h, v5.8h // p0 + f2
sqsub v6.8h, v24.8h, v4.8h // q0 - f1
srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1
smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel()
smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel()
smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel()
smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel()
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
sqadd v2.8h, v22.8h, v4.8h // p1 + ((f1 + 1) >> 1)
sqsub v6.8h, v25.8h, v4.8h // q1 - ((f1 + 1) >> 1)
smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel()
smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel()
smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel()
smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel()
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 2f // skip if there's no flat8in
add v0.8h, v21.8h, v21.8h // p2 * 2
add v2.8h, v21.8h, v22.8h // p2 + p1
add v4.8h, v22.8h, v23.8h // p1 + p0
add v6.8h, v23.8h, v24.8h // p0 + q0
add v8.8h, v0.8h, v2.8h
add v10.8h, v4.8h, v6.8h
add v12.8h, v24.8h, v25.8h // q0 + q1
add v8.8h, v8.8h, v10.8h
sub v12.8h, v12.8h, v0.8h
add v10.8h, v25.8h, v26.8h // q1 + q2
urshr v0.8h, v8.8h, #3 // out p1
add v8.8h, v8.8h, v12.8h
sub v10.8h, v10.8h, v2.8h
add v12.8h, v26.8h, v26.8h // q2 + q2
urshr v1.8h, v8.8h, #3 // out p0
add v8.8h, v8.8h, v10.8h
sub v12.8h, v12.8h, v4.8h
urshr v2.8h, v8.8h, #3 // out q0
bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
add v8.8h, v8.8h, v12.8h
bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
urshr v3.8h, v8.8h, #3 // out q1
bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
.elseif \wd >= 8
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
.if \wd == 8
b.eq 8f // skip if there's no flat8in
.else
b.eq 2f // skip if there's no flat8in
.endif
add v0.8h, v20.8h, v21.8h // p3 + p2
add v2.8h, v22.8h, v25.8h // p1 + q1
add v4.8h, v20.8h, v22.8h // p3 + p1
add v6.8h, v23.8h, v26.8h // p0 + q2
add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
add v9.8h, v23.8h, v24.8h // p0 + q0
add v8.8h, v8.8h, v4.8h // + p3 + p1
sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
add v8.8h, v8.8h, v9.8h // + p0 + q0
sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
urshr v10.8h, v8.8h, #3 // out p2
add v8.8h, v8.8h, v2.8h
add v0.8h, v20.8h, v23.8h // p3 + p0
add v2.8h, v24.8h, v27.8h // q0 + q3
urshr v11.8h, v8.8h, #3 // out p1
add v8.8h, v8.8h, v6.8h
sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
add v4.8h, v21.8h, v24.8h // p2 + q0
add v6.8h, v25.8h, v27.8h // q1 + q3
urshr v12.8h, v8.8h, #3 // out p0
add v8.8h, v8.8h, v2.8h
sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
add v0.8h, v22.8h, v25.8h // p1 + q1
add v2.8h, v26.8h, v27.8h // q2 + q3
urshr v13.8h, v8.8h, #3 // out q0
add v8.8h, v8.8h, v6.8h
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
urshr v0.8h, v8.8h, #3 // out q1
add v8.8h, v8.8h, v2.8h
bit v21.16b, v10.16b, v14.16b
bit v22.16b, v11.16b, v14.16b
bit v23.16b, v12.16b, v14.16b
urshr v1.8h, v8.8h, #3 // out q2
bit v24.16b, v13.16b, v14.16b
bit v25.16b, v0.16b, v14.16b
bit v26.16b, v1.16b, v14.16b
.endif
2:
.if \wd == 16
mov x16, v15.d[0]
mov x17, v15.d[1]
adds x16, x16, x17
b.ne 1f // check if flat8out is needed
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 8f // if there was no flat8in, just write the inner 4 pixels
b 7f // if flat8in was used, write the inner 6 pixels
1:
add v2.8h, v17.8h, v17.8h // p6 + p6
add v4.8h, v17.8h, v18.8h // p6 + p5
add v6.8h, v17.8h, v19.8h // p6 + p4
add v8.8h, v17.8h, v20.8h // p6 + p3
add v12.8h, v2.8h, v4.8h
add v10.8h, v6.8h, v8.8h
add v6.8h, v17.8h, v21.8h // p6 + p2
add v12.8h, v12.8h, v10.8h
add v8.8h, v17.8h, v22.8h // p6 + p1
add v10.8h, v18.8h, v23.8h // p5 + p0
add v6.8h, v6.8h, v8.8h
add v8.8h, v19.8h, v24.8h // p4 + q0
add v12.8h, v12.8h, v6.8h
add v10.8h, v10.8h, v8.8h
add v6.8h, v20.8h, v25.8h // p3 + q1
add v12.8h, v12.8h, v10.8h
sub v6.8h, v6.8h, v2.8h
add v2.8h, v21.8h, v26.8h // p2 + q2
urshr v0.8h, v12.8h, #4 // out p5
add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
sub v2.8h, v2.8h, v4.8h
add v4.8h, v22.8h, v27.8h // p1 + q3
add v6.8h, v17.8h, v19.8h // p6 + p4
urshr v1.8h, v12.8h, #4 // out p4
add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
sub v4.8h, v4.8h, v6.8h
add v6.8h, v23.8h, v28.8h // p0 + q4
add v8.8h, v17.8h, v20.8h // p6 + p3
urshr v2.8h, v12.8h, #4 // out p3
add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
sub v6.8h, v6.8h, v8.8h
add v8.8h, v24.8h, v29.8h // q0 + q5
add v4.8h, v17.8h, v21.8h // p6 + p2
urshr v3.8h, v12.8h, #4 // out p2
add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
sub v8.8h, v8.8h, v4.8h
add v6.8h, v25.8h, v30.8h // q1 + q6
add v10.8h, v17.8h, v22.8h // p6 + p1
urshr v4.8h, v12.8h, #4 // out p1
add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
sub v6.8h, v6.8h, v10.8h
add v8.8h, v26.8h, v30.8h // q2 + q6
bif v0.16b, v18.16b, v15.16b // out p5
add v10.8h, v18.8h, v23.8h // p5 + p0
urshr v5.8h, v12.8h, #4 // out p0
add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
sub v8.8h, v8.8h, v10.8h
add v10.8h, v27.8h, v30.8h // q3 + q6
bif v1.16b, v19.16b, v15.16b // out p4
add v18.8h, v19.8h, v24.8h // p4 + q0
urshr v6.8h, v12.8h, #4 // out q0
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
sub v10.8h, v10.8h, v18.8h
add v8.8h, v28.8h, v30.8h // q4 + q6
bif v2.16b, v20.16b, v15.16b // out p3
add v18.8h, v20.8h, v25.8h // p3 + q1
urshr v7.8h, v12.8h, #4 // out q1
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
sub v18.8h, v8.8h, v18.8h
add v10.8h, v29.8h, v30.8h // q5 + q6
bif v3.16b, v21.16b, v15.16b // out p2
add v20.8h, v21.8h, v26.8h // p2 + q2
urshr v8.8h, v12.8h, #4 // out q2
add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
sub v10.8h, v10.8h, v20.8h
add v18.8h, v30.8h, v30.8h // q6 + q6
bif v4.16b, v22.16b, v15.16b // out p1
add v20.8h, v22.8h, v27.8h // p1 + q3
urshr v9.8h, v12.8h, #4 // out q3
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
sub v18.8h, v18.8h, v20.8h
bif v5.16b, v23.16b, v15.16b // out p0
urshr v10.8h, v12.8h, #4 // out q4
add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
urshr v11.8h, v12.8h, #4 // out q5
bif v6.16b, v24.16b, v15.16b // out q0
bif v7.16b, v25.16b, v15.16b // out q1
bif v8.16b, v26.16b, v15.16b // out q2
bif v9.16b, v27.16b, v15.16b // out q3
bif v10.16b, v28.16b, v15.16b // out q4
bif v11.16b, v29.16b, v15.16b // out q5
.endif
mov x14, #0
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
mov x14, #(1 << 6)
ret
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
mov x14, #(1 << 4)
ret
.endif
endfunc
.endm
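// Rough C-style sketch of the narrow (wd4) filter path above, pieced together
// from the inline comments; the names are illustrative only:
//   hev = max(abs(p1 - p0), abs(q1 - q0)) > H;
//   f   = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0));
//   f1  = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
//   f2  = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
//   p0  = iclip_pixel(p0 + f2);    q0 = iclip_pixel(q0 - f1);
//   if (!hev) {
//       p1 = iclip_pixel(p1 + ((f1 + 1) >> 1));
//       q1 = iclip_pixel(q1 - ((f1 + 1) >> 1));
//   }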
loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4
.macro lpf_8_wd16
bl lpf_8_wd16_neon
cbz x14, 1f
tbnz x14, #6, 7f
tbnz x14, #4, 8f
ret x15
1:
.endm
.macro lpf_8_wd8
bl lpf_8_wd8_neon
cbz x14, 1f
tbnz x14, #4, 8f
ret x15
1:
.endm
.macro lpf_8_wd6
bl lpf_8_wd6_neon
cbz x14, 1f
ret x15
1:
.endm
.macro lpf_8_wd4
bl lpf_8_wd4_neon
cbz x14, 1f
ret x15
1:
.endm
function lpf_v_4_8_neon
mov x15, x30
sub x16, x0, x1, lsl #1
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
lpf_8_wd4
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
ret x15
endfunc
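// The lpf_h_* variants below filter a vertical edge: they load 8 rows of
// pixels straddling the edge, transpose so that each vector holds one pixel
// position across the 8 rows, run the same filter as the vertical case, then
// transpose back before storing.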
function lpf_h_4_8_neon
mov x15, x30
sub x16, x0, #4
add x0, x16, x1, lsl #2
ld1 {v22.d}[0], [x16], x1
ld1 {v22.d}[1], [x0], x1
ld1 {v23.d}[0], [x16], x1
ld1 {v23.d}[1], [x0], x1
ld1 {v24.d}[0], [x16], x1
ld1 {v24.d}[1], [x0], x1
ld1 {v25.d}[0], [x16], x1
ld1 {v25.d}[1], [x0], x1
add x0, x0, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd4
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
ret x15
endfunc
function lpf_v_6_8_neon
mov x15, x30
sub x16, x0, x1, lsl #1
sub x16, x16, x1
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
lpf_8_wd6
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
ret x15
endfunc
function lpf_h_6_8_neon
mov x15, x30
sub x16, x0, #8
add x0, x16, x1, lsl #2
ld1 {v20.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
add x0, x0, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd6
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
ret x15
endfunc
function lpf_v_8_8_neon
mov x15, x30
sub x16, x0, x1, lsl #2
ld1 {v20.8h}, [x16], x1 // p3
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v27.8h}, [x0], x1 // q3
sub x0, x0, x1, lsl #2
lpf_8_wd8
sub x16, x0, x1, lsl #1
sub x16, x16, x1
st1 {v21.8h}, [x16], x1 // p2
st1 {v24.8h}, [x0], x1 // q0
st1 {v22.8h}, [x16], x1 // p1
st1 {v25.8h}, [x0], x1 // q1
st1 {v23.8h}, [x16], x1 // p0
st1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
ret x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
ret x15
endfunc
function lpf_h_8_8_neon
mov x15, x30
sub x16, x0, #8
add x0, x16, x1, lsl #2
ld1 {v20.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
add x0, x0, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd8
sub x16, x0, x1, lsl #3
sub x16, x16, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v20.8h}, [x16], x1
st1 {v24.8h}, [x0], x1
st1 {v21.8h}, [x16], x1
st1 {v25.8h}, [x0], x1
st1 {v22.8h}, [x16], x1
st1 {v26.8h}, [x0], x1
st1 {v23.8h}, [x16], x1
st1 {v27.8h}, [x0], x1
add x0, x0, #8
ret x15
8:
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
ret x15
endfunc
function lpf_v_16_8_neon
mov x15, x30
sub x16, x0, x1, lsl #3
add x16, x16, x1
ld1 {v17.8h}, [x16], x1 // p6
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v18.8h}, [x16], x1 // p5
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v19.8h}, [x16], x1 // p4
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v20.8h}, [x16], x1 // p3
ld1 {v27.8h}, [x0], x1 // q3
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v28.8h}, [x0], x1 // q4
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v29.8h}, [x0], x1 // q5
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v30.8h}, [x0], x1 // q6
sub x0, x0, x1, lsl #3
add x0, x0, x1
lpf_8_wd16
sub x16, x0, x1, lsl #2
sub x16, x16, x1, lsl #1
st1 {v0.8h}, [x16], x1 // p5
st1 {v6.8h}, [x0], x1 // q0
st1 {v1.8h}, [x16], x1 // p4
st1 {v7.8h}, [x0], x1 // q1
st1 {v2.8h}, [x16], x1 // p3
st1 {v8.8h}, [x0], x1 // q2
st1 {v3.8h}, [x16], x1 // p2
st1 {v9.8h}, [x0], x1 // q3
st1 {v4.8h}, [x16], x1 // p1
st1 {v10.8h}, [x0], x1 // q4
st1 {v5.8h}, [x16], x1 // p0
st1 {v11.8h}, [x0], x1 // q5
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
ret x15
7:
sub x16, x0, x1
sub x16, x16, x1, lsl #1
st1 {v21.8h}, [x16], x1 // p2
st1 {v24.8h}, [x0], x1 // q0
st1 {v22.8h}, [x16], x1 // p1
st1 {v25.8h}, [x0], x1 // q1
st1 {v23.8h}, [x16], x1 // p0
st1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
ret x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
ret x15
endfunc
function lpf_h_16_8_neon
mov x15, x30
sub x16, x0, #16
ld1 {v16.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v17.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v18.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v19.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
ld1 {v20.8h}, [x16], x1
ld1 {v28.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v29.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v30.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v31.8h}, [x0], x1
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
lpf_8_wd16
sub x0, x0, x1, lsl #3
sub x16, x0, #16
transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
st1 {v16.8h}, [x16], x1
st1 {v6.8h}, [x0], x1
st1 {v17.8h}, [x16], x1
st1 {v7.8h}, [x0], x1
st1 {v0.8h}, [x16], x1
st1 {v8.8h}, [x0], x1
st1 {v1.8h}, [x16], x1
st1 {v9.8h}, [x0], x1
st1 {v2.8h}, [x16], x1
st1 {v10.8h}, [x0], x1
st1 {v3.8h}, [x16], x1
st1 {v11.8h}, [x0], x1
st1 {v4.8h}, [x16], x1
st1 {v30.8h}, [x0], x1
st1 {v5.8h}, [x16], x1
st1 {v31.8h}, [x0], x1
ret x15
7:
sub x16, x0, x1, lsl #3
sub x16, x16, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v20.8h}, [x16], x1
st1 {v24.8h}, [x0], x1
st1 {v21.8h}, [x16], x1
st1 {v25.8h}, [x0], x1
st1 {v22.8h}, [x16], x1
st1 {v26.8h}, [x0], x1
st1 {v23.8h}, [x16], x1
st1 {v27.8h}, [x0], x1
add x0, x0, #8
ret x15
8:
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
ret x15
endfunc
// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w,
// const int bitdepth_max)
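// Each bit in vmask[0]/[1]/[2] selects the wd4, wd8 (wd6 for uv) and wd16
// filter respectively for one 4-pixel edge; the loop below handles two such
// edges (8 pixels) per iteration, hence the tst #0x03 tests and the
// vmask >>= 2 shifts at the end of the loop body.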
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
mov x11, x30
mov w8, w7 // bitdepth_max
clz w9, w8
mov w10, #24
sub w9, w10, w9 // bitdepth_min_8
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp w6, w7, [x2] // vmask[0], vmask[1]
.ifc \type, y
ldr w2, [x2, #8] // vmask[2]
.endif
add x5, x5, #128 // Move to sharp part of lut
.ifc \type, y
orr w7, w7, w2 // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
sub x4, x3, x4, lsl #2
.else
sub x3, x3, #4
lsl x4, x4, #2
.endif
orr w6, w6, w7 // vmask[0] |= vmask[1]
1:
tst w6, #0x03
.ifc \dir, v
ld1 {v0.8b}, [x4], #8
ld1 {v1.8b}, [x3], #8
.else
ld2 {v0.s,v1.s}[0], [x3], x4
ld2 {v0.s,v1.s}[1], [x3], x4
.endif
b.eq 7f // if (!(vm & bits)) continue;
ld1r {v5.8b}, [x5] // sharp[0]
add x5, x5, #8
movi v2.2s, #0xff
dup v13.2s, w6 // vmask[0]
dup v31.8h, w9 // bitdepth_min_8
and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word
and v1.8b, v1.8b, v2.8b
cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0]
movi v4.8b, #1
ld1r {v6.8b}, [x5] // sharp[1]
sub x5, x5, #8
bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
cmtst v2.2s, v1.2s, v2.2s // L != 0
mul v1.2s, v1.2s, v4.2s // L
.ifc \type, y
dup v15.2s, w2 // vmask[2]
.endif
dup v14.2s, w7 // vmask[1]
mov x16, v2.d[0]
cmp x16, #0
b.eq 7f // if (!L) continue;
neg v5.8b, v5.8b // -sharp[0]
movrel x16, word_12
ushr v12.8b, v1.8b, #4 // H
ld1 {v16.2s}, [x16]
sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
.ifc \type, y
cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits)
.endif
movi v7.8b, #2
umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1])
add v0.8b, v1.8b, v7.8b // L + 2
umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I
add v0.8b, v0.8b, v0.8b // 2*(L + 2)
cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits)
uxtl v12.8h, v12.8b
add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E
cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits)
uxtl v11.8h, v11.8b
uxtl v10.8h, v10.8b
and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0
sxtl v14.8h, v14.8b
sxtl v13.8h, v13.8b
.ifc \type, y
sxtl v15.8h, v15.8b
.endif
ushl v12.8h, v12.8h, v31.8h
ushl v11.8h, v11.8h, v31.8h
ushl v10.8h, v10.8h, v31.8h
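// The filter thresholds are now set up (each scaled by << bitdepth_min_8):
//   I (v11) = imax(imin(L >> sharp[0], sharp[1]), 1)
//   E (v10) = 2 * (L + 2) + I
//   H (v12) = L >> 4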
.ifc \type, y
tst w2, #0x03
b.eq 2f
// wd16
bl lpf_\dir\()_16_8_neon
b 8f
2:
.endif
tst w7, #0x03
b.eq 3f
.ifc \type, y
// wd8
bl lpf_\dir\()_8_8_neon
.else
// wd6
bl lpf_\dir\()_6_8_neon
.endif
b 8f
3:
// wd4
bl lpf_\dir\()_4_8_neon
.ifc \dir, h
b 8f
7:
// For dir h, the functions above increment x0.
// If the whole function is skipped, increment it here instead.
add x0, x0, x1, lsl #3
.else
7:
.endif
8:
lsr w6, w6, #2 // vmask[0] >>= 2
lsr w7, w7, #2 // vmask[1] >>= 2
.ifc \type, y
lsr w2, w2, #2 // vmask[2] >>= 2
.endif
.ifc \dir, v
add x0, x0, #16
.else
// For dir h, x0 is returned incremented
.endif
cbnz w6, 1b
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret x11
endfunc
.endm
lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv
const word_12
.word 1, 2
endconst