// blob: f26e55f77392a047226049797affe2506d95a962
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* Copyright © 2019, B Krishnan Iyer
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC-128 prediction: fill a width x height block with the constant 128
// (the midpoint value used when neither top nor left neighbours exist).
// In:  r0 = dst, r1 = stride (bytes), r2 = topleft (unused here),
//      r3 = width (4..64), height on the stack; remaining args unused.
function ipred_dc_128_neon, export=1
push {r4, lr}
ldr r4, [sp, #8]                        // r4 = height (arg 4; sp moved by the push)
clz r3, r3                              // clz(width): 25 (w=64) .. 29 (w=4)
adr r2, L(ipred_dc_128_tbl)
sub r3, r3, #25                         // table index: 0 (w=64) .. 4 (w=4)
ldr r3, [r2, r3, lsl #2]                // load table-relative offset of width case
mov lr, #128
vdup.8 q0, lr                           // q0 = 16 bytes of 128
add r2, r2, r3                          // resolve to absolute branch target
add r12, r0, r1                         // r12 = second output row
lsl r1, r1, #1                          // r0/r12 each advance two rows per store
bx r2
.align 2
L(ipred_dc_128_tbl):
// Offsets are relative to the table; CONFIG_THUMB keeps the Thumb bit for bx.
.word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
4: // width 4: each iteration writes 4 rows of 4 bytes
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4, pc}
8: // width 8: 4 rows of 8 bytes per iteration
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4, pc}
16: // width 16: 4 rows of 16 bytes per iteration
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4, pc}
320: // width 32: replicate into q1 so one store covers a whole row
vdup.8 q1, lr
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4, pc}
640: // width 64: each row is two 32-byte stores (q0-q1, then q2-q3)
vdup.8 q1, lr
vdup.8 q2, lr
vdup.8 q3, lr
sub r1, r1, #32                         // stride minus the 32 bytes of writeback
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4, pc}
endfunc
// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Vertical prediction: replicate the row of `width` pixels directly above
// the block (topleft + 1) into every output row.
// In:  r0 = dst, r1 = stride, r2 = topleft, r3 = width, stack = height.
function ipred_v_neon, export=1
push {r4, lr}
ldr lr, [sp, #8]                        // lr = height
clz r3, r3
adr r4, L(ipred_v_tbl)
sub r3, r3, #25                         // table index: 0 (w=64) .. 4 (w=4)
ldr r3, [r4, r3, lsl #2]
add r2, r2, #1                          // skip corner: r2 = first top pixel
add r4, r4, r3
add r12, r0, r1                         // second row pointer
lsl r1, r1, #1                          // two-row stride
bx r4
.align 2
L(ipred_v_tbl):
.word 640f - L(ipred_v_tbl) + CONFIG_THUMB
.word 320f - L(ipred_v_tbl) + CONFIG_THUMB
.word 160f - L(ipred_v_tbl) + CONFIG_THUMB
.word 80f - L(ipred_v_tbl) + CONFIG_THUMB
.word 40f - L(ipred_v_tbl) + CONFIG_THUMB
40: // width 4: load the 4 top pixels once, then store them per row
vld1.32 {d0[0]}, [r2]
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs lr, lr, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4, pc}
80: // width 8
vld1.8 {d0}, [r2]
8:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs lr, lr, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4, pc}
160: // width 16
vld1.8 {q0}, [r2]
16:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4, pc}
320: // width 32: top row held in q0-q1
vld1.8 {q0, q1}, [r2]
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4, pc}
640: // width 64: top row held in q0-q3; rows written as two 32-byte stores
vld1.8 {q0, q1}, [r2]!
sub r1, r1, #32                         // stride minus the 32 bytes of writeback
vld1.8 {q2, q3}, [r2]
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4, pc}
endfunc
// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Horizontal prediction: fill each row with that row's left-neighbour
// pixel.  The left pixels are stored immediately below topleft, so they
// are read walking backwards in memory starting at topleft - 1.
// In:  r0 = dst, r1 = stride, r2 = topleft, r3 = width, stack = height.
function ipred_h_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]                       // r4 = height
clz r3, r3
adr r5, L(ipred_h_tbl)
sub r3, r3, #25                         // table index: 0 (w=64) .. 4 (w=4)
ldr r3, [r5, r3, lsl #2]
sub r2, r2, #4                          // point at a group of 4 left pixels
mov lr, #-4                             // step back 4 pixels per vld4
add r5, r5, r3
add r12, r0, r1
lsl r1, r1, #1                          // two-row stride
bx r5
.align 2
L(ipred_h_tbl):
.word 640f - L(ipred_h_tbl) + CONFIG_THUMB
.word 320f - L(ipred_h_tbl) + CONFIG_THUMB
.word 160f - L(ipred_h_tbl) + CONFIG_THUMB
.word 8f - L(ipred_h_tbl) + CONFIG_THUMB
.word 4f - L(ipred_h_tbl) + CONFIG_THUMB
4: // width 4: broadcast-load 4 left pixels; d3 is the topmost row's pixel
vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
vst1.32 {d3[0]}, [r0, :32], r1
vst1.32 {d2[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4-r5, pc}
8: // width 8: same scheme, 8-byte rows
vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
vst1.8 {d3}, [r0, :64], r1
vst1.8 {d2}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d1}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4-r5, pc}
160: // width 16: load one left pixel at a time, broadcast to a q register
add r2, r2, #3                          // r2 = topleft - 1 (first left pixel)
mov lr, #-1
16:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128], r1
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128], r1
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 16b
pop {r4-r5, pc}
320: // width 32: each row is two 16-byte stores of the broadcast pixel
add r2, r2, #3                          // r2 = topleft - 1
mov lr, #-1
sub r1, r1, #16                         // stride minus the 16 bytes of writeback
32:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128]!
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q0}, [r0, :128], r1
vst1.8 {q1}, [r12, :128], r1
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 32b
pop {r4-r5, pc}
640: // width 64: each row is four 16-byte stores of the broadcast pixel
add r2, r2, #3                          // r2 = topleft - 1
mov lr, #-1
sub r1, r1, #48                         // stride minus the 48 bytes of writeback
64:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128]!
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vst1.8 {q1}, [r12, :128]!
vst1.8 {q0}, [r0, :128]!
vst1.8 {q1}, [r12, :128]!
vst1.8 {q0}, [r0, :128], r1
vst1.8 {q1}, [r12, :128], r1
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 64b
pop {r4-r5, pc}
endfunc
// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC-top prediction: fill the block with the rounded average of the
// `width` pixels above it (left neighbours unavailable).
// In:  r0 = dst, r1 = stride, r2 = topleft, r3 = width, stack = height.
function ipred_dc_top_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]                       // r4 = height
clz r3, r3
adr r5, L(ipred_dc_top_tbl)
sub r3, r3, #25                         // table index: 0 (w=64) .. 4 (w=4)
ldr r3, [r5, r3, lsl #2]
add r2, r2, #1                          // skip corner: r2 = first top pixel
add r5, r5, r3
add r12, r0, r1                         // second row pointer
lsl r1, r1, #1                          // two-row stride
bx r5
.align 2
L(ipred_dc_top_tbl):
.word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
40: // width 4
vld1.32 {d0[]}, [r2]
vpaddl.u8 d0, d0                        // pairwise widen to u16
vpadd.u16 d0, d0                        // lane 0 = sum of the 4 top pixels
vrshrn.u16 d0, q0, #2                   // (sum + 2) >> 2: rounded average
vdup.8 d0, d0[0]                        // broadcast the DC value
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4-r5, pc}
80: // width 8
vld1.8 {d0}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 8 pixels
vrshrn.u16 d0, q0, #3                   // rounded average of 8
vdup.8 d0, d0[0]
8:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4-r5, pc}
160: // width 16
vld1.8 {d0, d1}, [r2]
vaddl.u8 q0, d0, d1                     // widen and add the two halves
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 16 pixels
vrshrn.u16 d0, q0, #4                   // rounded average of 16
vdup.8 q0, d0[0]
16:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4-r5, pc}
320: // width 32
vld1.8 {d0, d1, d2, d3}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 32 pixels
vrshrn.u16 d4, q0, #5                   // rounded average of 32
vdup.8 q0, d4[0]
vdup.8 q1, d4[0]
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4-r5, pc}
640: // width 64
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vld1.8 {d4, d5, d6, d7}, [r2]
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 64 pixels
vrshrn.u16 d18, q0, #6                  // rounded average of 64
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
sub r1, r1, #32                         // stride minus the 32 bytes of writeback
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4-r5, pc}
endfunc
// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC-left prediction: fill the block with the rounded average of the
// `height` pixels to its left (top neighbours unavailable).
// Uses a split jump table: bx r5 runs the height-specific summing code,
// which finishes with bx r3 into the width-specific store loop.
// In:  r0 = dst, r1 = stride, r2 = topleft, r3 = width, stack = height.
function ipred_dc_left_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]                       // r4 = height
sub r2, r2, r4                          // left pixels: the height bytes below topleft
clz r3, r3
clz lr, r4
sub lr, lr, #25                         // height index: 0 (h=64) .. 4 (h=4)
adr r5, L(ipred_dc_left_tbl)
sub r3, r3, #20                         // width index + 5 (second table half)
ldr r3, [r5, r3, lsl #2]
ldr lr, [r5, lr, lsl #2]
add r3, r5, r3                          // r3 = store loop for this width
add r5, r5, lr                          // r5 = summing code for this height
add r12, r0, r1
lsl r1, r1, #1                          // two-row stride
bx r5
.align 2
L(ipred_dc_left_tbl):
.word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
L(ipred_dc_left_h4):
vld1.32 {d0[]}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 4 left pixels
vrshrn.u16 d0, q0, #2                   // rounded average of 4
vdup.8 q0, d0[0]                        // broadcast DC value
bx r3
L(ipred_dc_left_w4):
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt L(ipred_dc_left_w4)
pop {r4-r5, pc}
L(ipred_dc_left_h8):
vld1.8 {d0}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 8 left pixels
vrshrn.u16 d0, q0, #3                   // rounded average of 8
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w8):
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt L(ipred_dc_left_w8)
pop {r4-r5, pc}
L(ipred_dc_left_h16):
vld1.8 {d0, d1}, [r2]
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 16 left pixels
vrshrn.u16 d0, q0, #4                   // rounded average of 16
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w16):
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt L(ipred_dc_left_w16)
pop {r4-r5, pc}
L(ipred_dc_left_h32):
vld1.8 {d0, d1, d2, d3}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 32 left pixels
vrshrn.u16 d0, q0, #5                   // rounded average of 32
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w32):
vmov.8 q1, q0                           // second half of the 32-byte row
1:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 1b
pop {r4-r5, pc}
L(ipred_dc_left_h64):
vld1.8 {d0, d1, d2, d3}, [r2]!
vld1.8 {d4, d5, d6, d7}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // lane 0 = sum of 64 left pixels
vrshrn.u16 d0, q0, #6                   // rounded average of 64
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w64):
sub r1, r1, #32                         // stride minus the 32 bytes of writeback
vmov.8 q1, q0                           // rows written as two 32-byte stores
vmov.8 q2, q0
vmov.8 q3, q0
1:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 1b
pop {r4-r5, pc}
endfunc
// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC prediction: fill the block with the rounded average of the `height`
// left neighbours and the `width` top neighbours:
//     dc = (sum(left) + sum(top) + ((w + h) >> 1)) >> ctz(w + h)
// For rectangular blocks w + h is not a power of two; the shift divides
// only by the power-of-two part, so the result is fixed up by a
// vqdmulh multiply with ~1/3 (0x5556/2, ratio 2:1) or ~1/5 (0x3334/2,
// ratio 4:1).
// Split jump table as in ipred_dc_left: bx r5 runs the height-specific
// left-pixel sum, which ends with bx r3 into the width-specific code
// that adds the top-pixel sum and stores.
// In:  r0 = dst, r1 = stride, r2 = topleft, r3 = width, stack = height.
function ipred_dc_neon, export=1
push {r4-r6, lr}
ldr r4, [sp, #16]                       // r4 = height
sub r2, r2, r4                          // r2 = bottom-most left pixel
add lr, r3, r4 // width + height
clz r3, r3
clz r12, r4
vdup.16 q15, lr // width + height
mov r6, #0                              // zero, for clearing stale lanes (w4 case)
adr r5, L(ipred_dc_tbl)
rbit lr, lr // rbit(width + height)
sub r3, r3, #20 // 25 leading bits, minus table offset 5
sub r12, r12, #25
clz lr, lr // ctz(width + height)
ldr r3, [r5, r3, lsl #2]                // width case: top sum + store loop
ldr r12, [r5, r12, lsl #2]              // height case: left-pixel sum
neg lr, lr // -ctz(width + height)
add r3, r5, r3
add r5, r5, r12
vshr.u16 q15, q15, #1 // (width + height) >> 1
vdup.16 q14, lr // -ctz(width + height)
add r12, r0, r1                         // second row pointer
lsl r1, r1, #1                          // two-row stride
bx r5
.align 2
L(ipred_dc_tbl):
.word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
L(ipred_dc_h4):
vld1.32 {d0[0]}, [r2]!                  // writeback leaves r2 = topleft
vpaddl.u8 d0, d0
vpadd.u16 d0, d0                        // d0[0] = sum of 4 left pixels
bx r3
L(ipred_dc_w4):
add r2, r2, #1                          // skip corner: r2 = first top pixel
vld1.32 {d1[0]}, [r2]
vmov.32 d1[1], r6                       // clear stale upper half before vpaddl
vadd.s16 d0, d0, d30                    // left sum + ((w + h) >> 1) bias
vpaddl.u8 d1, d1
vpadd.u16 d1, d1
vpadd.u16 d1, d1                        // d1[0] = sum of 4 top pixels
cmp r4, #4
vadd.s16 d0, d0, d1                     // total sum + bias
vshl.u16 d0, d0, d28                    // >> ctz(w + h)
beq 1f // h = 8/16
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
cmp r4, #16                             // h=16: x1/5 stays; h=8: x1/3
it ne
movne lr, r5
vdup.16 d30, lr
vqdmulh.s16 d0, d0, d30                 // rectangle fixup multiply
1:
vdup.8 d0, d0[0]                        // broadcast DC value
2:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h8):
vld1.8 {d0}, [r2]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // d0[0] = sum of 8 left pixels
bx r3
L(ipred_dc_w8):
add r2, r2, #1
vld1.8 {d2}, [r2]
vadd.s16 d0, d0, d30
vpaddl.u8 d2, d2
vpadd.u16 d2, d2
vpadd.u16 d2, d2                        // d2[0] = sum of 8 top pixels
cmp r4, #8
vadd.s16 d0, d0, d2
vshl.u16 d0, d0, d28
beq 1f // h = 4/16/32
cmp r4, #32                             // h=32: x1/5 stays; h=4/16: x1/3
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d0, d0, d24
1:
vdup.8 d0, d0[0]
2:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h16):
vld1.8 {d0, d1}, [r2]!
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // d0[0] = sum of 16 left pixels
bx r3
L(ipred_dc_w16):
add r2, r2, #1
vld1.8 {d2, d3}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q1, d2, d3
vadd.u16 d2, d2, d3
vpadd.u16 d2, d2
vpadd.u16 d2, d2                        // d2[0] = sum of 16 top pixels
cmp r4, #16
vadd.s16 d0, d0, d2
vshl.u16 d0, d0, d28
beq 1f // h = 4/8/32/64
tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne                                   // Z: h=4/64 keep x1/5; h=8/32 take x1/3
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d0, d0, d24
1:
vdup.8 q0, d0[0]
2:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h32):
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // d0[0] = sum of 32 left pixels
bx r3
L(ipred_dc_w32):
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q2, d4, d5
vadd.u16 d4, d4, d5
vaddl.u8 q1, d2, d3
vadd.u16 d2, d2, d3
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vpadd.u16 d4, d4
vpadd.u16 d2, d2                        // d2[0]/d4[0] = top halves' sums
cmp r4, #32
vadd.s16 d0, d0, d4
vadd.s16 d0, d0, d2
vshl.u16 d4, d0, d28
beq 1f // h = 8/16/64
cmp r4, #8                              // h=8: x1/5 stays; h=16/64: x1/3
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d4, d4, d24
1:
vdup.8 q0, d4[0]
vdup.8 q1, d4[0]
2:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h64):
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vld1.8 {d4, d5, d6, d7}, [r2]!
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0                        // d0[0] = sum of 64 left pixels
bx r3
L(ipred_dc_w64):
// NOTE: removed three dead vmov q1-q3 copies and a stray duplicate "2:"
// label that used to sit here; the copies were fully overwritten by the
// loads/vdups below and nothing branched to the label.
add r2, r2, #1                          // skip corner: r2 = first top pixel
vld1.8 {d2, d3, d4, d5}, [r2]!
vadd.s16 d0, d0, d30                    // left sum + ((w + h) >> 1) bias
vaddl.u8 q2, d4, d5
vaddl.u8 q1, d2, d3
vadd.u16 d4, d4, d5
vadd.u16 d2, d2, d3
vld1.8 {d16, d17, d18, d19}, [r2]
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vaddl.u8 q8, d16, d17
vaddl.u8 q9, d18, d19
vadd.u16 d16, d16, d17
vadd.u16 d18, d18, d19
vpadd.u16 d16, d16
vpadd.u16 d18, d18
vpadd.u16 d16, d16
vpadd.u16 d18, d18
vadd.u16 d2, d2, d4
vadd.u16 d3, d16, d18                   // partial sums of the 64 top pixels
cmp r4, #64
vadd.s16 d0, d0, d2
vadd.s16 d0, d0, d3                     // total sum + bias
vshl.u16 d18, d0, d28                   // >> ctz(w + h)
beq 1f // h = 16/32
// Pack both factors in one register: low half 1/3 (h=32), high half
// 1/5 (h=16); h & 31 is 16 for h=16 and 0 for h=32, selecting via lsr.
movw lr, #(0x5556/2)
movt lr, #(0x3334/2)
and r5, r4, #31
lsr lr, lr, r5
vdup.16 d30, lr
vqdmulh.s16 d18, d18, d30               // rectangle fixup multiply
1:
sub r1, r1, #32                         // stride minus the 32 bytes of writeback
vdup.8 q0, d18[0]                       // broadcast DC value across q0-q3
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
2:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
endfunc